In [16]:
# Phase 2: Step 1 - Feature Type Separation

import pandas as pd

# Read the cleaned CKD dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs on other machines.
data = pd.read_csv('/Users/shubhmehta/Desktop/programming/Data Science Projects/Chronic kidney disease EHRs Abu Dhabi/data/ckd_cleaned.csv')

# Feature groups, assigned by hand from domain understanding.
categorical_features = [
    'Sex',
    'HistorySmoking',
    'HistoryCHD',
    'HistoryDiabetes',
    'DMmeds',
    'HTNmeds',
    'HistoryObesity',
    'HistoryVascular',
    'HistoryDLD',
    'HistoryHTN',
    'DLDmeds',
    'ACEIARB',
]

continuous_features = [
    'AgeBaseline',
    'CholesterolBaseline',
    'CreatinineBaseline',
    'eGFRBaseline',
    'sBPBaseline',
    'dBPBaseline',
    'BMIBaseline',
]

# Sanity check: list every dataset column that is neither a classified
# feature nor the target column (EventCKD35).
all_features = [*categorical_features, *continuous_features, 'EventCKD35']
missing_features = set(data.columns).difference(all_features)

print("Categorical Features:", categorical_features)
print("\nContinuous Features:", continuous_features)
print("\nMissing or Unclassified Features:", missing_features)

Categorical Features: ['Sex', 'HistorySmoking', 'HistoryCHD', 'HistoryDiabetes', 'DMmeds', 'HTNmeds', 'HistoryObesity', 'HistoryVascular', 'HistoryDLD', 'HistoryHTN', 'DLDmeds', 'ACEIARB']

Continuous Features: ['AgeBaseline', 'CholesterolBaseline', 'CreatinineBaseline', 'eGFRBaseline', 'sBPBaseline', 'dBPBaseline', 'BMIBaseline']

Missing or Unclassified Features: {'TimeToEventMonths', 'TIME_YEAR'}


In [18]:
# Phase 2: Step 2 - Chi-Square Test for Corrected Categorical Features

from scipy.stats import chi2_contingency

def _chi2_record(feature):
    """Run a chi-square test of `feature` against EventCKD35 and return one result row."""
    observed = pd.crosstab(data[feature], data['EventCKD35'])
    statistic, p_value, _dof, _expected = chi2_contingency(observed)
    return {
        'Feature': feature,
        'Chi2 Statistic': statistic,
        'p-value': p_value,
    }


# One record per categorical feature.
chi2_results = [_chi2_record(feature) for feature in categorical_features]

# Tabulate the records, smallest p-value first.
chi2_df = pd.DataFrame(chi2_results).sort_values('p-value')

print(chi2_df)

            Feature  Chi2 Statistic       p-value
4            DMmeds       44.821205  2.158722e-11
3   HistoryDiabetes       39.556722  3.186676e-10
2        HistoryCHD       31.286623  2.226082e-08
11          ACEIARB       25.046181  5.597351e-07
9        HistoryHTN       14.049514  1.780598e-04
10          DLDmeds       12.922320  3.246872e-04
5           HTNmeds       12.165153  4.869066e-04
8        HistoryDLD        9.428392  2.136516e-03
0               Sex        3.936969  4.723581e-02
1    HistorySmoking        3.810007  5.094722e-02
7   HistoryVascular        3.696403  5.452990e-02
6    HistoryObesity        0.395553  5.293948e-01


In [19]:
# Phase 2: Step 3 - Cramer's V Calculation for Significant Features

import numpy as np

def cramers_v(x, y, correction=True):
    """Cramér's V effect size for the association between two categorical variables.

    Parameters
    ----------
    x, y : array-like
        Equal-length categorical observations; they are cross-tabulated with
        ``pd.crosstab``.
    correction : bool, default True
        Forwarded to ``chi2_contingency`` (Yates' continuity correction, which
        scipy applies only to tables with one degree of freedom). The default
        keeps the previous behaviour; pass ``False`` to use the uncorrected
        statistic, for which V reaches 1 under perfect association.

    Returns
    -------
    float
        ``sqrt(chi2 / (n * (min(rows, cols) - 1)))``, or ``nan`` when the
        contingency table has fewer than two rows or columns (the denominator
        would be zero and no association can be measured).
    """
    contingency_table = pd.crosstab(x, y)
    smaller_dim = min(contingency_table.shape)
    # Guard the degenerate table explicitly instead of relying on a 0/0
    # division that only yields nan through a numpy RuntimeWarning.
    if smaller_dim < 2:
        return np.nan
    chi2, _p, _dof, _expected = chi2_contingency(contingency_table, correction=correction)
    n = contingency_table.sum().sum()
    return np.sqrt(chi2 / (n * (smaller_dim - 1)))

# Significance level used to pick features for effect-size estimation.
ALPHA = 0.05

# Only for significant features (p < ALPHA). The list is derived from the
# chi-square results rather than transcribed by hand, so it cannot drift out
# of sync with chi2_df. Row order of chi2_df is preserved.
significant_features = chi2_df.loc[chi2_df['p-value'] < ALPHA, 'Feature'].tolist()

# Calculate Cramer's V of each significant feature against the outcome.
cramers_results = [
    {'Feature': feature, "Cramer's V": cramers_v(data[feature], data['EventCKD35'])}
    for feature in significant_features
]

# Naming the columns explicitly keeps the sort below valid even when no
# feature passes the significance threshold.
cramers_df = pd.DataFrame(cramers_results, columns=['Feature', "Cramer's V"])

# Sort by Cramer's V, strongest association first.
cramers_df = cramers_df.sort_values("Cramer's V", ascending=False)

print(cramers_df)

           Feature  Cramer's V
0           DMmeds    0.302135
1  HistoryDiabetes    0.283837
2       HistoryCHD    0.252429
3          ACEIARB    0.225855
4       HistoryHTN    0.169157
5          DLDmeds    0.162229
6          HTNmeds    0.157405
7       HistoryDLD    0.138573
8              Sex    0.089545


In [20]:
# Phase 2: Step 4 - Mann-Whitney U Test for Continuous Features

from scipy.stats import mannwhitneyu

def _mannwhitney_record(feature):
    """Two-sided Mann-Whitney U test of `feature` between EventCKD35 == 1 and == 0 rows."""
    with_event = data.loc[data['EventCKD35'] == 1, feature]
    without_event = data.loc[data['EventCKD35'] == 0, feature]
    u_statistic, p_value = mannwhitneyu(with_event, without_event, alternative='two-sided')
    return {
        'Feature': feature,
        'Mann-Whitney U Statistic': u_statistic,
        'p-value': p_value,
    }


# One record per continuous feature.
mannwhitney_results = [_mannwhitney_record(feature) for feature in continuous_features]

# Tabulate the records, smallest p-value first.
mannwhitney_df = pd.DataFrame(mannwhitney_results).sort_values('p-value')

print(mannwhitney_df)

               Feature  Mann-Whitney U Statistic       p-value
3         eGFRBaseline                    3970.5  2.136510e-16
2   CreatinineBaseline                   18581.0  1.497031e-10
0          AgeBaseline                   17439.0  1.416119e-07
1  CholesterolBaseline                    8877.5  9.485016e-04
4          sBPBaseline                   15027.5  4.378639e-03
5          dBPBaseline                   10575.5  1.083282e-01
6          BMIBaseline                   12289.0  9.133898e-01


In [21]:
# Phase 2: Step 5 - Cohen's d Calculation for Significant Continuous Features

def cohens_d(x, y):
    """Cohen's d: difference of the sample means divided by the pooled standard deviation.

    Parameters
    ----------
    x, y : array-like of numbers
        The two samples. NaN entries are dropped before anything is computed,
        so the sample sizes used as pooling weights match the values that
        actually enter the means and variances.

    Returns
    -------
    float
        ``(mean(x) - mean(y)) / pooled_std`` with sample variances (ddof=1).
        ``nan`` when either sample has fewer than two non-missing values or
        when the pooled variance is zero (the effect size is undefined).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Drop missing values up front: len() on a pandas Series counts NaNs while
    # Series mean/var skip them, which would skew the pooled variance weights.
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]
    nx = x.size
    ny = y.size
    # A sample variance with ddof=1 needs at least two observations per group.
    if nx < 2 or ny < 2:
        return np.nan
    pooled_var = ((nx - 1) * np.var(x, ddof=1) + (ny - 1) * np.var(y, ddof=1)) / (nx + ny - 2)
    # No within-group spread: avoid dividing by zero.
    if pooled_var == 0:
        return np.nan
    return (np.mean(x) - np.mean(y)) / np.sqrt(pooled_var)

# Significance level used to pick features for effect-size estimation.
ALPHA = 0.05

# Only for significant features (p < ALPHA). The list is derived from the
# Mann-Whitney results rather than transcribed by hand, so it cannot drift
# out of sync with mannwhitney_df. Row order of mannwhitney_df is preserved.
significant_continuous = mannwhitney_df.loc[
    mannwhitney_df['p-value'] < ALPHA, 'Feature'
].tolist()

# Row masks for the two outcome groups; computed once, reused for every feature.
event_rows = data['EventCKD35'] == 1
no_event_rows = data['EventCKD35'] == 0

# Calculate Cohen's d (event group minus no-event group) for each feature.
cohensd_results = [
    {
        'Feature': feature,
        "Cohen's d": cohens_d(data.loc[event_rows, feature], data.loc[no_event_rows, feature]),
    }
    for feature in significant_continuous
]

# Naming the columns explicitly keeps the sort below valid even when no
# feature passes the significance threshold.
cohensd_df = pd.DataFrame(cohensd_results, columns=['Feature', "Cohen's d"])

# Sort by absolute value of Cohen's d, largest effect first.
cohensd_df = cohensd_df.reindex(cohensd_df["Cohen's d"].abs().sort_values(ascending=False).index)

print(cohensd_df)

               Feature  Cohen's d
0         eGFRBaseline  -1.207690
1   CreatinineBaseline   0.987147
2          AgeBaseline   0.757721
3  CholesterolBaseline  -0.451293
4          sBPBaseline   0.387879
