In [4]:
import numpy as np
import pandas as pd
from scipy import stats
from pathlib import Path
import os

# NOTE(review): presumably the random seed for stochastic steps; none of the
# cells shown here reference it -- confirm it is used elsewhere or drop it.
SEED = 123

In [5]:
# Load the merged source table.
df = pd.read_csv(
    Path('../data/Merged Pu_Wilson_.csv')
)

# There are identical columns Gender and gender; drop the lowercase one so
# that lower-casing the headers below does not create a duplicate label.
df = df.drop(columns='gender')

# Clean column names: lower-case, spaces and slashes -> underscore, strip
# parentheses and hyphens, spell out percent signs. '%_' is handled before
# the bare '%' so an existing underscore is not doubled.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(r' +|/', r'_', regex=True)
    .str.replace(r'\(|\)|-', r'', regex=True)
    .str.replace('%_', 'percent_', regex=True)
    .str.replace('%', 'percent_', regex=True)
)

# Calculate age in years (one decimal) at the time of the study
df['study_date_dt'] = pd.to_datetime(df['study_date'], format='%Y%m%d')
df['dob_dt'] = pd.to_datetime(df['dob'], format='%Y%m%d')
df['age_years'] = np.round((df['study_date_dt'] - df['dob_dt']).dt.days / 365.25, 1)

# Keep subset: identifiers, demographics, and the artery/vein features
columns_to_keep = [
    'pid',
    'study',
    'sample_id',
    'class',
    'age_years',
    'gender',
    'smoking_status',
    'packyears',
    'artery_number_5',
    'artery_volume_5',
    'artery_tortuosity_5',
    'artery_number_10',
    'artery_volume_10',
    'artery_tortuosity_10',
    'artery_number_15',
    'artery_volume_15',
    'artery_tortuosity_15',
    'vein_number_5',
    'vein_volume_5',
    'vein_tortuosity_5',
    'vein_number_10',
    'vein_volume_10',
    'vein_tortuosity_10',
    'vein_number_15',
    'vein_volume_15',
    'vein_tortuosity_15',
]
df = df.loc[:, columns_to_keep]

# Convert categorical columns (class codes 0/1 become readable labels)
df['class'] = df['class'].map({0: 'Benign', 1: 'Malignant'}).astype('category')
df['gender'] = df['gender'].astype('category')
df['smoking_status'] = df['smoking_status'].astype('category')

# Export
df.to_pickle('../data/df.pkl')

# View
print(f"DF shape: {df.shape}")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
df.head()

DF shape: (148, 26)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   pid                   148 non-null    object  
 1   study                 148 non-null    object  
 2   sample_id             148 non-null    object  
 3   class                 148 non-null    category
 4   age_years             148 non-null    float64 
 5   gender                148 non-null    category
 6   smoking_status        148 non-null    category
 7   packyears             148 non-null    float64 
 8   artery_number_5       148 non-null    int64   
 9   artery_volume_5       148 non-null    float64 
 10  artery_tortuosity_5   148 non-null    float64 
 11  artery_number_10      148 non-null    int64   
 12  artery_volume_10      148 non-null    float64 
 13  artery_tortuosity_10  148 non-null    float64 
 14  artery_number_15      148 non-null    

Unnamed: 0,pid,study,sample_id,class,age_years,gender,smoking_status,packyears,artery_number_5,artery_volume_5,...,artery_tortuosity_15,vein_number_5,vein_volume_5,vein_tortuosity_5,vein_number_10,vein_volume_10,vein_tortuosity_10,vein_number_15,vein_volume_15,vein_tortuosity_15
0,2008-420,Cooper,420-1,Malignant,65.0,M,former,5.0,1,0.005662,...,1.0,1,0.013388,1.0,1,0.021113,1.0,1,0.034588,1.02139
1,03745-2,PLuSS,LS15-0098,Benign,69.4,M,former,41.0,0,0.00125,...,1.02196,0,0.0,1.0,1,0.055875,1.0,3,0.111875,1.0
2,03533-3,PLuSS,LS15-0162,Benign,66.5,F,former,60.0,1,0.034375,...,1.23261,0,0.0,1.0,1,0.03275,1.0,2,0.091125,1.41931
3,03336-8,PLuSS,LS14-0337,Benign,69.6,M,former,75.0,1,0.0145,...,1.0373,2,0.03875,1.0358,4,0.106625,1.0385,9,0.262,1.04822
4,03244-8,PLuSS,LS15-0063,Benign,70.4,M,former,37.0,4,0.25575,...,1.17149,5,0.388542,1.13853,10,0.520625,1.10964,15,0.892933,1.08346


In [6]:
def add_significance(value):
    """
    Translate a p-value into a star marker.

    Returns '***' for value <= 0.001, '**' for value <= 0.01, '*' for
    value <= 0.05, and a single space otherwise (which includes NaN, since
    every comparison with NaN is False).
    """
    star_levels = (
        (0.001, '***'),
        (0.01, '**'),
        (0.05, '*'),
    )
    # Cutoffs are ordered from strictest to loosest; the first hit wins.
    for cutoff, marker in star_levels:
        if value <= cutoff:
            return marker
    return ' '

In [7]:
def two_group_num_tests(group0_data, group1_data):
    """
    Summarise two samples and, when meaningful, compare them statistically.

    Missing values are dropped from both inputs first. The returned dict
    always holds n / mean / median / sd / iqr / pct25 / pct75 for group 0,
    group 1 and the pooled data (keys such as ``mean_group0``, ``iqr_all``).

    P-values from Welch's t-test, Mann-Whitney U and the rank-sum test
    (``ttest_ind_pvalue``, ``mannwhitneyu_pvalue``, ``ranksums_pvalue``) are
    appended only when both groups have more than two observations and the
    two group means are not numerically equal.
    """
    first = group0_data.dropna()
    second = group1_data.dropna()
    pooled = pd.concat([first, second], axis=0)

    # Insertion order of both tables fixes the key order of the result:
    # statistic-major, then group0 / group1 / all.
    samples = {'group0': first, 'group1': second, 'all': pooled}
    reducers = {
        'n': len,
        'mean': lambda s: s.mean(),
        'median': lambda s: s.median(),
        'sd': lambda s: s.std(),
        'iqr': lambda s: s.quantile(0.75) - s.quantile(0.25),
        'pct25': lambda s: s.quantile(0.25),
        'pct75': lambda s: s.quantile(0.75),
    }
    result = {
        f'{stat}_{label}': reduce_fn(sample)
        for stat, reduce_fn in reducers.items()
        for label, sample in samples.items()
    }

    # Skip the tests for tiny groups or when the group means coincide.
    min_size = 2
    too_small = min(first.shape[-1], second.shape[-1]) <= min_size
    if too_small or np.isclose(first.mean(), second.mean(), equal_nan=True):
        return result

    welch = stats.ttest_ind(first, second, equal_var=False, nan_policy='omit')
    mann_whitney = stats.mannwhitneyu(first, second, nan_policy='omit')
    rank_sum = stats.ranksums(first, second, nan_policy='omit')
    result['ttest_ind_pvalue'] = welch.pvalue
    result['mannwhitneyu_pvalue'] = mann_whitney.pvalue
    result['ranksums_pvalue'] = rank_sum.pvalue
    return result

def run_num_tests(df, grouper, cols_to_exclude_from_tests=None):
    """
    Conduct two-group statistical tests across all numeric columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the grouping column(s) and the numeric features.
    grouper : label or list of labels
        Passed to ``df.groupby``. The first two groups, in groupby order,
        are compared as group 0 and group 1; their names are printed.
    cols_to_exclude_from_tests : list-like, optional
        Numeric columns to leave out. Defaults to no exclusions.

    Returns
    -------
    pandas.DataFrame
        One row per tested feature with the summary statistics and p-values
        from ``two_group_num_tests`` plus a ``*_signif`` star column for each
        test. Features for which the tests were skipped have NaN p-values
        and a blank significance marker.
    """
    # Split once: the grouping does not depend on the feature being tested.
    grouped = df.groupby(grouper)
    group_names = list(grouped.groups.keys())
    for i, n in enumerate(group_names):
        print(f"Group {i} = {n}")
    group_frames = [group_data for _, group_data in grouped]

    if cols_to_exclude_from_tests is None:
        cols_to_exclude_from_tests = []

    df_data = dict()
    for feat in df.select_dtypes(['int', 'float']).columns.difference(cols_to_exclude_from_tests):
        df_data[feat] = two_group_num_tests(group_frames[0][feat], group_frames[1][feat])

    num_tests = pd.DataFrame.from_dict(df_data, orient='index')

    # two_group_num_tests omits the p-value keys when it skips the tests, so
    # if no feature qualified the columns would be missing entirely and the
    # lookups below would raise KeyError. Guarantee they exist (as NaN).
    test_names = ('ttest_ind', 'mannwhitneyu', 'ranksums')
    for test_name in test_names:
        if f'{test_name}_pvalue' not in num_tests.columns:
            num_tests[f'{test_name}_pvalue'] = np.nan

    for test_name in test_names:
        num_tests[f'{test_name}_signif'] = num_tests[f'{test_name}_pvalue'].apply(add_significance)

    return num_tests

def single_cat_test(data, feature):
    """
    Cross-tabulate a categorical `feature` against the 'class' column.

    Rows with a missing `feature` are dropped and all values are cast to
    strings. The result has one row per (class, level) pair, indexed by
    (lookup, class, variable, levels) where lookup is
    '<class>_<feature>_<level>'. Columns:

    - counts: number of rows in that class with that level
    - percent: counts divided by the class total (a 0-1 proportion)
    - chi2_pvalue: chi-square contingency p-value, repeated on every row
    - chi2_signif: star marker for that p-value
    """
    observed = data.dropna(subset=feature).astype(str)

    # Counts per cell and per class, and each cell's within-class share
    cell_counts = observed.groupby(['class', feature])[feature].count()
    class_totals = observed.groupby(['class'])[feature].count()
    cell_shares = cell_counts / class_totals

    # Class-by-level table; combinations never observed count as zero
    contingency = cell_counts.unstack().fillna(0)
    pvalue = stats.chi2_contingency(contingency).pvalue

    # One index entry per row: (lookup key, class, variable name, level)
    class_labels = cell_shares.index.get_level_values(0)
    level_labels = cell_shares.index.get_level_values(1)
    index_rows = [
        ('_'.join([class_label, feature, level_label]), class_label, feature, level_label)
        for class_label, level_label in zip(class_labels, level_labels)
    ]

    # Assemble the output frame
    summary = pd.concat([cell_counts, cell_shares], axis=1)
    summary.index = pd.MultiIndex.from_tuples(index_rows)
    summary.index.names = ['lookup', 'class', 'variable', 'levels']
    summary.columns = ['counts', 'percent']
    summary['chi2_pvalue'] = pvalue
    summary['chi2_signif'] = summary['chi2_pvalue'].apply(add_significance)
    return summary

In [8]:
# Numeric features, plus 'class' so the grouping column travels with them
num_test_cols = (
    df
    .select_dtypes(include=['int', 'float'])
    .columns
    .union(['class'])
)
# Categorical features ('class' is itself categorical, so it is included)
cat_test_cols = df.select_dtypes(include=['category']).columns

print(f"Num Columns:\n{num_test_cols}")
print(f"Cat Columns:\n{cat_test_cols}")

Num Columns:
Index(['age_years', 'artery_number_10', 'artery_number_15', 'artery_number_5',
       'artery_tortuosity_10', 'artery_tortuosity_15', 'artery_tortuosity_5',
       'artery_volume_10', 'artery_volume_15', 'artery_volume_5', 'class',
       'packyears', 'vein_number_10', 'vein_number_15', 'vein_number_5',
       'vein_tortuosity_10', 'vein_tortuosity_15', 'vein_tortuosity_5',
       'vein_volume_10', 'vein_volume_15', 'vein_volume_5'],
      dtype='object')
Cat Columns:
Index(['class', 'gender', 'smoking_status'], dtype='object')


In [9]:
# Compare every numeric feature between the two 'class' groups.
num_tests = run_num_tests(df.loc[:, num_test_cols], grouper='class')

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (keeps Restart & Run All working on a fresh checkout).
Path('../output').mkdir(parents=True, exist_ok=True)
num_tests.to_csv('../output/num_tests.csv')
num_tests.head()

Group 0 = Benign
Group 1 = Malignant


Unnamed: 0,n_group0,n_group1,n_all,mean_group0,mean_group1,mean_all,median_group0,median_group1,median_all,sd_group0,...,pct25_all,pct75_group0,pct75_group1,pct75_all,ttest_ind_pvalue,mannwhitneyu_pvalue,ranksums_pvalue,ttest_ind_signif,mannwhitneyu_signif,ranksums_signif
age_years,79,69,148,66.106329,63.217391,64.759459,66.7,64.0,65.85,4.652414,...,63.0,69.9,67.0,68.85,0.001362021,0.0001083857,0.00011025,**,***,***
artery_number_10,79,69,148,4.860759,10.681159,7.574324,4.0,9.0,5.0,4.609728,...,2.0,7.0,16.0,11.0,1.49983e-05,9.419945e-05,9.870353e-05,***,***,***
artery_number_15,79,69,148,11.506329,18.695652,14.858108,9.0,13.0,9.5,10.780887,...,5.0,14.5,26.0,20.25,0.005047847,0.04007045,0.04011307,**,*,*
artery_number_5,79,69,148,1.670886,5.391304,3.405405,1.0,4.0,2.0,2.176338,...,0.0,2.0,8.0,6.0,4.58158e-09,3.225294e-10,5.576726e-10,***,***,***
artery_tortuosity_10,79,69,148,1.057518,1.066915,1.061899,1.04657,1.05751,1.04911,0.152876,...,1.013877,1.081095,1.10047,1.08532,0.6236336,0.6262307,0.6254358,,,


In [10]:
# Run the chi-square summary for every categorical feature except the
# grouping column itself.
cat_data = df.loc[:, cat_test_cols]
cat_results = []
for feature in cat_data.columns.difference(['class']):
    print(feature)
    cat_results.append(single_cat_test(cat_data, feature))

# Concatenate once instead of growing a frame inside the loop: repeated
# concat is quadratic, and seeding it with an empty DataFrame relies on
# behaviour that recent pandas versions deprecate.
cat_tests = pd.concat(cat_results, axis=0) if cat_results else pd.DataFrame()

# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
Path('../output').mkdir(parents=True, exist_ok=True)
cat_tests.to_csv('../output/cat_tests.csv')
cat_tests

gender
smoking_status


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,counts,percent,chi2_pvalue,chi2_signif
lookup,class,variable,levels,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benign_gender_F,Benign,gender,F,31,0.392405,0.478145,
Benign_gender_M,Benign,gender,M,48,0.607595,0.478145,
Malignant_gender_F,Malignant,gender,F,32,0.463768,0.478145,
Malignant_gender_M,Malignant,gender,M,37,0.536232,0.478145,
Benign_smoking_status_current,Benign,smoking_status,current,28,0.35443,1.0,
Benign_smoking_status_former,Benign,smoking_status,former,51,0.64557,1.0,
Malignant_smoking_status_current,Malignant,smoking_status,current,24,0.347826,1.0,
Malignant_smoking_status_former,Malignant,smoking_status,former,45,0.652174,1.0,
