NAME: RUKAYAT ADEOSUN

TITLE: HIERARCHICAL CLUSTERING

In [None]:
# Importing libraries
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Load the data set.
# The location can be overridden with the HC_DATA_PATH environment variable so
# the notebook also runs on machines other than the author's; the original
# absolute path is kept as the default.
DATA_PATH = os.environ.get('HC_DATA_PATH', '/Users/rukayatadeosun/Downloads/data1.xlsx')
df = pd.read_excel(DATA_PATH)

In [None]:
# Flag each row as responder / non-responder from the `response` column:
# values <= 0 become "__NORES", everything else becomes "__RES" (NaN fails the
# comparison and is therefore labelled "__RES", exactly as in the former
# row-by-row loop). np.where does this in one vectorised step.
new_label = np.where(df["response"] <= 0, "__NORES", "__RES").tolist()
df["New_label"] = new_label

# Show up to 500 columns / rows when a DataFrame is displayed.
pd.set_option('display.max_columns', 500, 'display.max_rows', 500)

In [None]:
# Display the column labels of df (including the New_label column added above).
df.columns

In [None]:
# Build the New_ID column (used later as dendrogram leaf label) by
# concatenating ID and New_label. Vectorised concatenation replaces the
# row-wise apply/join; astype(str) additionally lets non-string IDs
# (e.g. integers) be concatenated instead of raising TypeError.
df['New_ID'] = df['ID'].astype(str) + df['New_label']
df

In [None]:
# Display the New_label column ("__NORES" / "__RES" per row).
df['New_label']

In [None]:
# Display the New_ID column (ID concatenated with New_label).
df['New_ID']

In [None]:
# Remove the columns that must not enter the clustering:
#   - ID and New_label (both are folded into New_ID) and the raw response value
#   - Echo_pre_LVEF, height and weight, which the analysis treats as unimportant
# A single drop() call is atomic: if any name is missing it raises KeyError and
# df is left unchanged, whereas a sequence of `del` statements would leave df
# half-modified.
df = df.drop(columns=['ID', 'response', 'New_label',
                      'Echo_pre_LVEF', 'height', 'weight'])

In [None]:
# Index the data by New_ID so every sample carries its label.
# set_index returns a new frame; df itself keeps New_ID as a regular column.
df1 = df.set_index(keys='New_ID')
print(df1.shape)
print(df1)

In [None]:
# Frequency table (absolute counts and percentages) for each categorical
# variable in the full data set df1.
categorical_columns = ['ACEI_or_ARB', 'CAD', 'Concordance', 'DM', 'Gender', 'HTN',
                       'LBBB', 'MI', 'NYHA', 'Race', 'Smoking']
for column in categorical_columns:
    column_values = df1[column]
    absolute = column_values.value_counts()
    relative = column_values.value_counts(normalize=True) * 100
    print('Column Name : ', column)
    print('Column Count : ', absolute)
    print('Column Percentage : ', relative)
    print('\n')

In [None]:
# Column-wise mean of df1 (one value per column).
df1.mean()

In [None]:
# Column-wise standard deviation of df1 (one value per column).
df1.std()

In [None]:
# Show the New_ID index of df1 as a plain Python list — the same list that is
# passed as `labels` to dendrogram() further down.
df1.index.tolist()

In [None]:
#from sklearn.preprocessing import normalize
#df1_scaled = normalize(df1)
#df1_scaled = pd.DataFrame(df1_scaled, columns=df1.columns)
#df1_scaled.head()

In [None]:
# Hierarchical (agglomerative) clustering of the samples in df1.
# The linkage method defines how the distance between clusters is measured:
#   - complete linkage: distance between the furthest points of the clusters
#   - single linkage:   distance between the closest points of the clusters
# linkage, dendrogram and plt are already imported in the first cell, so they
# are not imported again here.

# Calculate the linkage: mergings
mergings = linkage(df1, method='complete')

# Plot the dendrogram, using the New_ID index values as leaf labels.
# NOTE(review): branches are coloured below a height of 240 while the tree is
# later cut at 400 — confirm the two thresholds are meant to differ.
fig, ax = plt.subplots(figsize=(35, 15))
dendrogram(mergings,
           labels=df1.index.tolist(),
           leaf_rotation=90,
           leaf_font_size=8,
           color_threshold=240,
           ax=ax)
ax.set_title('Complete-linkage dendrogram (df1)')
ax.set_ylabel('Distance')
plt.show()

In [None]:
# Redraw the df1 dendrogram and add a dashed horizontal line at height 400.
fig, ax = plt.subplots(figsize=(35, 15))
dendrogram(mergings, labels=df1.index.tolist(), leaf_rotation=90,
           leaf_font_size=8, color_threshold=240, ax=ax)
ax.axhline(y=400, c='black', lw=1, linestyle='dashed')
plt.show()

In [None]:
# Shuffle the rows of df1 to check that the clustering does not depend on row
# order. random_state pins the permutation so that Restart & Run All
# reproduces the same shuffled frame.
df_shuf = df1.sample(frac=1, random_state=0)
df_shuf

In [None]:
# Second, independent shuffle of df1 for the same order-independence check.
# sample() keeps the New_ID index, so the labels stay attached to their rows.
# Previously the shuffled frame was immediately overwritten with
# df.set_index('New_ID') — i.e. the *unshuffled* data — so the check built on
# df_shuf1 compared the original row order with itself.
df_shuf1 = df1.sample(frac=1, random_state=1)
df_shuf1

In [None]:
# Show the index of df_shuf as a list (used as dendrogram labels below).
df_shuf.index.tolist()

In [None]:
# Show the index of df_shuf1 as a list (used as dendrogram labels below).
df_shuf1.index.tolist()

In [None]:
# Repeat the complete-linkage clustering on the shuffled frame df_shuf.
# linkage, dendrogram and plt are already imported in the first cell.

# Calculate the linkage: mergings1
mergings1 = linkage(df_shuf, method='complete')

# Plot the dendrogram, using the index values of df_shuf as leaf labels.
fig, ax = plt.subplots(figsize=(35, 15))
dendrogram(mergings1,
           labels=df_shuf.index.tolist(),
           leaf_rotation=90,
           leaf_font_size=8,
           color_threshold=240,
           ax=ax)
ax.set_title('Complete-linkage dendrogram (df_shuf)')
ax.set_ylabel('Distance')
plt.show()

In [None]:
# Redraw the df_shuf dendrogram and add a dashed horizontal line at height 400.
fig, ax = plt.subplots(figsize=(35, 15))
dendrogram(mergings1, labels=df_shuf.index.tolist(), leaf_rotation=90,
           leaf_font_size=8, color_threshold=240, ax=ax)
ax.axhline(y=400, c='black', lw=1, linestyle='dashed')
plt.show()

In [None]:
# Repeat the complete-linkage clustering on the frame df_shuf1.
# linkage, dendrogram and plt are already imported in the first cell.

# Calculate the linkage: mergings2
mergings2 = linkage(df_shuf1, method='complete')

# Plot the dendrogram, using the index values of df_shuf1 as leaf labels.
fig, ax = plt.subplots(figsize=(35, 15))
dendrogram(mergings2,
           labels=df_shuf1.index.tolist(),
           leaf_rotation=90,
           leaf_font_size=8,
           color_threshold=240,
           ax=ax)
ax.set_title('Complete-linkage dendrogram (df_shuf1)')
ax.set_ylabel('Distance')
plt.show()

In [None]:
# Redraw the df_shuf1 dendrogram and add a dashed horizontal line at height 400.
fig, ax = plt.subplots(figsize=(35, 15))
dendrogram(mergings2, labels=df_shuf1.index.tolist(), leaf_rotation=90,
           leaf_font_size=8, color_threshold=240, ax=ax)
ax.axhline(y=400, c='black', lw=1, linestyle='dashed')
plt.show()

Since shuffling the dataset gives the same dendrogram, we will go ahead and analyze the clusters.

In [None]:
# Imports needed by this cell
from scipy.cluster.hierarchy import fcluster
import pandas as pd

# Cut the tree stored in `mergings` at a distance threshold of 400 and get one
# flat-cluster number per sample.
labels = fcluster(mergings, t=400, criterion='distance')
print(labels)
print(labels.shape)

In [None]:
# Pair every cluster number with the sample label (New_ID index of df1), then
# keep a copy sorted by cluster number.
cluster_columns = {'LABELS': labels, 'Res/No_Res': df1.index.tolist()}
df2 = pd.DataFrame(cluster_columns)
arranged_labels = df2.sort_values(by='LABELS')

arranged_labels


In [None]:
# Cross-tabulate cluster number against response status.
# 'Res/No_Res' holds the full sample label (ID + "__NORES" / "__RES"), so
# tabulating against it directly yields one column per sample. Only the part
# after the last "__" (NORES / RES) is used here, giving a clusters x response
# table.
response_status = df2['Res/No_Res'].str.rsplit('__', n=1).str[-1]
ct = pd.crosstab(df2['LABELS'], response_status)
# Display ct
ct

In [None]:
# Summary statistics of arranged_labels as produced by DataFrame.describe().
arranged_labels.describe()

In [None]:
# Print the dtypes and non-null counts of arranged_labels.
arranged_labels.info()

In [None]:
# Rows of arranged_labels that were assigned to cluster 1, followed by the
# non-null count of each column.
in_cluster_1 = arranged_labels['LABELS'] == 1
lab1 = arranged_labels.loc[in_cluster_1]
print(lab1)
print(lab1.count())

In [None]:
# Split phenogroup 1 into non-responders and responders by the suffix of the
# sample label.
search1, search2 = "_NORES", "_RES"

# Boolean masks; na=False makes missing labels count as "no match"
NORES_df1 = lab1["Res/No_Res"].str.endswith(search1, na=False)
RES_df1 = lab1["Res/No_Res"].str.endswith(search2, na=False)

# Show each subgroup followed by its per-column counts
non_responders_1 = lab1[NORES_df1]
responders_1 = lab1[RES_df1]
print(non_responders_1)
print(non_responders_1.count())
print('\n')
print(responders_1)
print(responders_1.count())


In [None]:
# Re-index cluster 1 by sample label and collect those labels in a list.
# NOTE: lab1 is overwritten, so running this cell a second time raises KeyError
# ('Res/No_Res' is no longer a column after the first run).
print(lab1.shape)
lab1 = lab1.set_index(keys='Res/No_Res')
print(lab1.shape)
clus1 = lab1.index.to_list()

clus1

In [None]:
# Rows of df1 whose label belongs to cluster 1.
# The explicit .copy() makes clus_1 an independent frame, so adding a column
# to it afterwards does not trigger pandas' SettingWithCopyWarning.
clus_1 = df1.loc[df1.index.isin(clus1)].copy()
print(clus_1.shape)
print(clus_1)

In [None]:
# Tag every row of cluster 1 with its cluster number.
# assign() builds a new frame rather than writing into clus_1 in place, which
# avoids SettingWithCopyWarning when clus_1 is a slice of df1 and gives the
# same result when the cell is re-run.
clus_1 = clus_1.assign(New_group=1)
print(clus_1.shape)
print(clus_1)

In [None]:
# Frequency table (absolute counts and percentages) for each categorical
# variable within cluster 1.
categorical_columns = ['ACEI_or_ARB', 'CAD', 'Concordance', 'DM', 'Gender', 'HTN',
                       'LBBB', 'MI', 'NYHA', 'Race', 'Smoking']
for column in categorical_columns:
    column_values = clus_1[column]
    absolute = column_values.value_counts()
    relative = column_values.value_counts(normalize=True) * 100
    print('Column Name : ', column)
    print('Column Count : ', absolute)
    print('Column Percentage : ', relative)
    print('\n')

In [None]:
# Rows of arranged_labels that were assigned to cluster 2; prints the
# per-column counts, the shape and then the rows themselves.
in_cluster_2 = arranged_labels['LABELS'] == 2
lab2 = arranged_labels.loc[in_cluster_2]
print(lab2.count())
print(lab2.shape)
print(lab2)


In [None]:
# Split phenogroup 2 into non-responders and responders by the suffix of the
# sample label.
search1, search2 = "_NORES", "_RES"

# Boolean masks; na=False makes missing labels count as "no match"
NORES_df2 = lab2["Res/No_Res"].str.endswith(search1, na=False)
RES_df2 = lab2["Res/No_Res"].str.endswith(search2, na=False)

# Show each subgroup followed by its per-column counts
non_responders_2 = lab2[NORES_df2]
responders_2 = lab2[RES_df2]
print(non_responders_2)
print(non_responders_2.count())
print('\n')
print(responders_2)
print(responders_2.count())

In [None]:
# Re-index cluster 2 by sample label and collect those labels in a list.
# NOTE: lab2 is overwritten, so running this cell a second time raises KeyError
# ('Res/No_Res' is no longer a column after the first run).
print(lab2.shape)
lab2 = lab2.set_index(keys="Res/No_Res")
print(lab2.shape)
clus2 = lab2.index.to_list()

clus2

In [None]:
# Rows of df1 whose label belongs to cluster 2.
# The explicit .copy() makes clus_2 an independent frame, so adding a column
# to it afterwards does not trigger pandas' SettingWithCopyWarning.
clus_2 = df1.loc[df1.index.isin(clus2)].copy()
print(clus_2.shape)
print(clus_2)

In [None]:
# Tag every row of cluster 2 with its cluster number.
# assign() builds a new frame rather than writing into clus_2 in place, which
# avoids SettingWithCopyWarning when clus_2 is a slice of df1 and gives the
# same result when the cell is re-run.
clus_2 = clus_2.assign(New_group=2)
print(clus_2.shape)
print(clus_2)

In [None]:
# Frequency table (absolute counts and percentages) for each categorical
# variable within cluster 2.
categorical_columns = ['ACEI_or_ARB', 'CAD', 'Concordance', 'DM', 'Gender', 'HTN',
                       'LBBB', 'MI', 'NYHA', 'Race', 'Smoking']
for column in categorical_columns:
    column_values = clus_2[column]
    absolute = column_values.value_counts()
    relative = column_values.value_counts(normalize=True) * 100
    print('Column Name : ', column)
    print('Column Count : ', absolute)
    print('Column Percentage : ', relative)
    print('\n')

In [None]:
# Rows of arranged_labels that were assigned to cluster 3, followed by the
# non-null count of each column.
in_cluster_3 = arranged_labels['LABELS'] == 3
lab3 = arranged_labels.loc[in_cluster_3]
print(lab3)
print(lab3.count())

In [None]:
# Split phenogroup 3 into non-responders and responders by the suffix of the
# sample label.
search1, search2 = "_NORES", "_RES"

# Boolean masks; na=False makes missing labels count as "no match"
NORES_df3 = lab3["Res/No_Res"].str.endswith(search1, na=False)
RES_df3 = lab3["Res/No_Res"].str.endswith(search2, na=False)

# Show each subgroup followed by its per-column counts
non_responders_3 = lab3[NORES_df3]
responders_3 = lab3[RES_df3]
print(non_responders_3)
print(non_responders_3.count())
print('\n')
print(responders_3)
print(responders_3.count())

In [None]:
# Re-index cluster 3 by sample label and collect those labels in a list.
# NOTE: lab3 is overwritten, so running this cell a second time raises KeyError
# ('Res/No_Res' is no longer a column after the first run).
print(lab3.shape)
lab3 = lab3.set_index(keys='Res/No_Res')
print(lab3.shape)
clus3 = lab3.index.to_list()

clus3

In [None]:
# Rows of df1 whose label belongs to cluster 3.
# The explicit .copy() makes clus_3 an independent frame, so adding a column
# to it afterwards does not trigger pandas' SettingWithCopyWarning.
clus_3 = df1.loc[df1.index.isin(clus3)].copy()
print(clus_3.shape)
print(clus_3)

In [None]:
# Tag every row of cluster 3 with its cluster number.
# assign() builds a new frame rather than writing into clus_3 in place, which
# avoids SettingWithCopyWarning when clus_3 is a slice of df1 and gives the
# same result when the cell is re-run.
clus_3 = clus_3.assign(New_group=3)
print(clus_3.shape)
print(clus_3)

In [None]:
# Frequency table (absolute counts and percentages) for each categorical
# variable within cluster 3.
categorical_columns = ['ACEI_or_ARB', 'CAD', 'Concordance', 'DM', 'Gender', 'HTN',
                       'LBBB', 'MI', 'NYHA', 'Race', 'Smoking']
for column in categorical_columns:
    column_values = clus_3[column]
    absolute = column_values.value_counts()
    relative = column_values.value_counts(normalize=True) * 100
    print('Column Name : ', column)
    print('Column Count : ', absolute)
    print('Column Percentage : ', relative)
    print('\n')

In [None]:
# Rows of arranged_labels that were assigned to cluster 4, followed by the
# non-null count of each column.
in_cluster_4 = arranged_labels['LABELS'] == 4
lab4 = arranged_labels.loc[in_cluster_4]
print(lab4)
print(lab4.count())

In [None]:
# Split phenogroup 4 into non-responders and responders by the suffix of the
# sample label.
search1, search2 = "_NORES", "_RES"

# Boolean masks; na=False makes missing labels count as "no match"
NORES_df4 = lab4["Res/No_Res"].str.endswith(search1, na=False)
RES_df4 = lab4["Res/No_Res"].str.endswith(search2, na=False)

# Show each subgroup followed by its per-column counts
non_responders_4 = lab4[NORES_df4]
responders_4 = lab4[RES_df4]
print(non_responders_4)
print(non_responders_4.count())
print('\n')
print(responders_4)
print(responders_4.count())

In [None]:
# Re-index cluster 4 by sample label and collect those labels in a list.
# NOTE: lab4 is overwritten, so running this cell a second time raises KeyError
# ('Res/No_Res' is no longer a column after the first run).
print(lab4.shape)
lab4 = lab4.set_index(keys="Res/No_Res")
print(lab4.shape)
clus4 = lab4.index.to_list()

clus4

In [None]:
# Rows of df1 whose label belongs to cluster 4.
# The explicit .copy() makes clus_4 an independent frame, so adding a column
# to it afterwards does not trigger pandas' SettingWithCopyWarning.
clus_4 = df1.loc[df1.index.isin(clus4)].copy()
print(clus_4.shape)
print(clus_4)

In [None]:
# Tag every row of cluster 4 with its cluster number.
# assign() builds a new frame rather than writing into clus_4 in place, which
# avoids SettingWithCopyWarning when clus_4 is a slice of df1 and gives the
# same result when the cell is re-run.
clus_4 = clus_4.assign(New_group=4)
print(clus_4.shape)
print(clus_4)

In [None]:
# Frequency table (absolute counts and percentages) for each categorical
# variable within cluster 4.
categorical_columns = ['ACEI_or_ARB', 'CAD', 'Concordance', 'DM', 'Gender', 'HTN',
                       'LBBB', 'MI', 'NYHA', 'Race', 'Smoking']
for column in categorical_columns:
    column_values = clus_4[column]
    absolute = column_values.value_counts()
    relative = column_values.value_counts(normalize=True) * 100
    print('Column Name : ', column)
    print('Column Count : ', absolute)
    print('Column Percentage : ', relative)
    print('\n')

In [None]:
# Column-wise mean for phenogroup 1 (clus_1); the constant New_group tag is
# included in the output.
clus_1.mean()

In [None]:
# Column-wise standard deviation for phenogroup 1 (clus_1).
clus_1.std()

In [None]:
# Column-wise mean for phenogroup 2 (clus_2); the constant New_group tag is
# included in the output.
clus_2.mean()

In [None]:
# Column-wise standard deviation for phenogroup 2 (clus_2).
clus_2.std()

In [None]:
# Column-wise mean for phenogroup 3 (clus_3); the constant New_group tag is
# included in the output.
clus_3.mean()

In [None]:
# Column-wise standard deviation for phenogroup 3 (clus_3).
clus_3.std()

In [None]:
# Column-wise mean for phenogroup 4 (clus_4); the constant New_group tag is
# included in the output.
clus_4.mean()

In [None]:
# Column-wise standard deviation for phenogroup 4 (clus_4).
clus_4.std()

In [None]:
# Stack the four phenogroup frames (each carrying its New_group tag) into a
# single DataFrame.
phenogroups = [clus_1, clus_2, clus_3, clus_4]
new_df1 = pd.concat(phenogroups)
print(new_df1)

In [None]:
# import packages
import statsmodels.api as sm
from statsmodels.formula.api import ols  #ordinary squares

In [None]:
# Box plot of Age, one box per phenogroup (New_group).
new_df1.boxplot(column='Age', by='New_group')

In [None]:
# One-way ANOVA of Age across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('Age ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of ECG_pre_QRSd, one box per phenogroup (New_group).
new_df1.boxplot(column='ECG_pre_QRSd', by='New_group')

In [None]:
# One-way ANOVA of ECG_pre_QRSd across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('ECG_pre_QRSd ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of SPECT_pre_EDE, one box per phenogroup (New_group).
new_df1.boxplot(column='SPECT_pre_EDE', by='New_group')

In [None]:
# One-way ANOVA of SPECT_pre_EDE across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('SPECT_pre_EDE ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of SPECT_pre_EDSI, one box per phenogroup (New_group).
new_df1.boxplot(column="SPECT_pre_EDSI", by="New_group")

In [None]:
# One-way ANOVA of SPECT_pre_EDSI across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('SPECT_pre_EDSI ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of SPECT_pre_EDV, one box per phenogroup (New_group).
new_df1.boxplot(column='SPECT_pre_EDV', by='New_group')

In [None]:
# One-way ANOVA of SPECT_pre_EDV across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('SPECT_pre_EDV ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of SPECT_pre_ESE, one box per phenogroup (New_group).
new_df1.boxplot(column='SPECT_pre_ESE', by='New_group')

In [None]:
# One-way ANOVA of SPECT_pre_ESE across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('SPECT_pre_ESE ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of SPECT_pre_ESSI, one box per phenogroup (New_group).
new_df1.boxplot(column='SPECT_pre_ESSI', by='New_group')

In [None]:
# One-way ANOVA of SPECT_pre_ESSI across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('SPECT_pre_ESSI ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of SPECT_pre_ESV, one box per phenogroup (New_group).
new_df1.boxplot(column='SPECT_pre_ESV', by='New_group')

In [None]:
# One-way ANOVA of SPECT_pre_ESV across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('SPECT_pre_ESV ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of SPECT_pre_LVEF, one box per phenogroup (New_group).
new_df1.boxplot(column='SPECT_pre_LVEF', by='New_group')

In [None]:
# One-way ANOVA of SPECT_pre_LVEF across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('SPECT_pre_LVEF ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of SPECT_pre_PBW, one box per phenogroup (New_group).
new_df1.boxplot(column='SPECT_pre_PBW', by='New_group')

In [None]:
# One-way ANOVA of SPECT_pre_PBW across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('SPECT_pre_PBW ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of SPECT_pre_PSD, one box per phenogroup (New_group).
new_df1.boxplot(column='SPECT_pre_PSD', by='New_group')

In [None]:
# One-way ANOVA of SPECT_pre_PSD across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('SPECT_pre_PSD ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of SPECT_pre_50scar, one box per phenogroup (New_group).
new_df1.boxplot(column='SPECT_pre_50scar', by='New_group')

In [None]:
# One-way ANOVA of SPECT_pre_50scar across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('SPECT_pre_50scar ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of Echo_pre_EDV, one box per phenogroup (New_group).
new_df1.boxplot(column='Echo_pre_EDV', by='New_group')

In [None]:
# One-way ANOVA of Echo_pre_EDV across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('Echo_pre_EDV ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of Echo_pre_ESV, one box per phenogroup (New_group).
new_df1.boxplot(column='Echo_pre_ESV', by='New_group')

In [None]:
# One-way ANOVA of Echo_pre_ESV across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('Echo_pre_ESV ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Box plot of LVEF, one box per phenogroup (New_group).
new_df1.boxplot(column='LVEF', by='New_group')

In [None]:
# One-way ANOVA of LVEF across phenogroups.
# New_group holds integer codes, so it is wrapped in C() to be treated as a
# categorical factor; without C() the model is a linear regression on the
# group number (1 degree of freedom), not a comparison of group means.
mod = ols('LVEF ~ C(New_group)', data=new_df1).fit()
aov_table = sm.stats.anova_lm(mod, typ=2)
print(aov_table)

In [None]:
# Contingency table: ACEI_or_ARB values (rows) by phenogroup (columns),
# without margin totals.
ACEI_crosstab = pd.crosstab(index=new_df1['ACEI_or_ARB'],
                            columns=new_df1['New_group'],
                            margins=False)
print(ACEI_crosstab)

In [None]:
# required imports
from scipy.stats import chi2_contingency

In [None]:
# Chi-square test of independence between ACEI_or_ARB and phenogroup.
# chi2_contingency returns the test statistic, the p-value, the degrees of
# freedom and the expected frequencies.
stat, p, dof, expected = chi2_contingency(ACEI_crosstab)

# select significance value
alpha = 0.05
# Determine whether to reject or keep the null hypothesis (no association).
# The reject-branch message now uses the same wording as the else branch and
# as the Concordance test ("Variables are associated").
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Variables are associated (reject H0)')
else:
    print('Variables are not associated(fail to reject H0)')


In [None]:
# Contingency table: CAD values (rows) by phenogroup (columns), without
# margin totals.
CAD_crosstab = pd.crosstab(index=new_df1['CAD'],
                           columns=new_df1['New_group'],
                           margins=False)
print(CAD_crosstab)

In [None]:
# Chi-square test of independence between CAD and phenogroup; prints the
# significance level next to the p-value.
alpha = 0.05  # significance level
stat, p, dof, expected = chi2_contingency(observed=CAD_crosstab)
print(f'significance={alpha:.3f}, p={p:.3f}')

In [None]:
# Contingency table: Concordance values (rows) by phenogroup (columns),
# without margin totals.
Con_crosstab = pd.crosstab(index=new_df1['Concordance'],
                           columns=new_df1['New_group'],
                           margins=False)
print(Con_crosstab)

In [None]:
# Chi-square test of independence between Concordance and phenogroup, with
# the decision at the chosen significance level.
alpha = 0.05  # significance level
stat, p, dof, expected = chi2_contingency(observed=Con_crosstab)
print(f'significance={alpha:.3f}, p={p:.3f}')
verdict = ('Variables are associated (reject H0)' if p <= alpha
           else 'Variables are not associated(fail to reject H0)')
print(verdict)

In [None]:
# Contingency table: DM values (rows) by phenogroup (columns), without
# margin totals.
DM_crosstab = pd.crosstab(index=new_df1['DM'],
                          columns=new_df1['New_group'],
                          margins=False)
print(DM_crosstab)

In [None]:
# Chi-square test of independence between DM and phenogroup; prints the
# significance level next to the p-value.
alpha = 0.05  # significance level
stat, p, dof, expected = chi2_contingency(observed=DM_crosstab)
print(f'significance={alpha:.3f}, p={p:.3f}')

In [None]:
# Contingency table: Gender values (rows) by phenogroup (columns), without
# margin totals.
Gender_crosstab = pd.crosstab(index=new_df1['Gender'],
                              columns=new_df1['New_group'],
                              margins=False)
print(Gender_crosstab)

In [None]:
# Chi-square test of independence between Gender and phenogroup; prints the
# significance level next to the p-value.
alpha = 0.05  # significance level
stat, p, dof, expected = chi2_contingency(observed=Gender_crosstab)
print(f'significance={alpha:.3f}, p={p:.3f}')

In [None]:
# Contingency table: HTN values (rows) by phenogroup (columns), without
# margin totals.
HTN_crosstab = pd.crosstab(index=new_df1['HTN'],
                           columns=new_df1['New_group'],
                           margins=False)
print(HTN_crosstab)

In [None]:
# Chi-square test of independence between HTN and phenogroup; prints the
# significance level next to the p-value.
alpha = 0.05  # significance level
stat, p, dof, expected = chi2_contingency(observed=HTN_crosstab)
print(f'significance={alpha:.3f}, p={p:.3f}')

In [None]:
# Contingency table: LBBB values (rows) by phenogroup (columns), without
# margin totals.
LBBB_crosstab = pd.crosstab(index=new_df1['LBBB'],
                            columns=new_df1['New_group'],
                            margins=False)
print(LBBB_crosstab)

In [None]:
# Chi-square test of independence between LBBB and phenogroup; prints the
# significance level next to the p-value.
alpha = 0.05  # significance level
stat, p, dof, expected = chi2_contingency(observed=LBBB_crosstab)
print(f'significance={alpha:.3f}, p={p:.3f}')

In [None]:
# Contingency table: MI values (rows) by phenogroup (columns), without
# margin totals.
MI_crosstab = pd.crosstab(index=new_df1['MI'],
                          columns=new_df1['New_group'],
                          margins=False)
print(MI_crosstab)

In [None]:
# Chi-square test of independence between MI and phenogroup; prints the
# significance level next to the p-value.
alpha = 0.05  # significance level
stat, p, dof, expected = chi2_contingency(observed=MI_crosstab)
print(f'significance={alpha:.3f}, p={p:.3f}')

In [None]:
# Contingency table: NYHA values (rows) by phenogroup (columns), without
# margin totals.
NYHA_crosstab = pd.crosstab(index=new_df1['NYHA'],
                            columns=new_df1['New_group'],
                            margins=False)
print(NYHA_crosstab)

In [None]:
# Chi-square test of independence between NYHA and phenogroup; prints the
# significance level next to the p-value.
alpha = 0.05  # significance level
stat, p, dof, expected = chi2_contingency(observed=NYHA_crosstab)
print(f'significance={alpha:.3f}, p={p:.3f}')

In [None]:
# Contingency table: Race values (rows) by phenogroup (columns), without
# margin totals.
Race_crosstab = pd.crosstab(index=new_df1['Race'],
                            columns=new_df1['New_group'],
                            margins=False)
print(Race_crosstab)

In [None]:
# Chi-square test of independence between Race and phenogroup; prints the
# significance level next to the p-value.
alpha = 0.05  # significance level
stat, p, dof, expected = chi2_contingency(observed=Race_crosstab)
print(f'significance={alpha:.3f}, p={p:.3f}')

In [None]:
# Contingency table: Smoking values (rows) by phenogroup (columns), without
# margin totals.
Smoking_crosstab = pd.crosstab(index=new_df1['Smoking'],
                               columns=new_df1['New_group'],
                               margins=False)
print(Smoking_crosstab)

In [None]:
# Chi-square test of independence between Smoking and phenogroup; prints the
# significance level next to the p-value.
alpha = 0.05  # significance level
stat, p, dof, expected = chi2_contingency(observed=Smoking_crosstab)
print(f'significance={alpha:.3f}, p={p:.3f}')