# Feature Selection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

In [2]:
df = pd.read_csv('balanced_dataset.csv')
associations = {'Feature': [], 'Chi-squared': [], 'Cramer\'s V': []}

for col in df.columns:
    # Create a contingency table
    contingency_table = pd.crosstab(df["Diabetes_binary"], df[col])
    
    # Calculate the chi-squared statistic and p-value
    chi2 = chi2_contingency(contingency_table)[0]
    
    # Calculate Cramer's V
    n = contingency_table.sum().sum()
    phi2 = chi2 / n
    r, k = contingency_table.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1)**2) / (n - 1)
    kcorr = k - ((k - 1)**2) / (n - 1)
    cramers_v = np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
    
    associations['Feature'].append(col)
    associations['Chi-squared'].append(format(chi2, '.2f'))
    associations['Cramer\'s V'].append(format(cramers_v, '.2f'))

    # Print the results
    print(f'Association between Diabetes_binary and {col}:')
    print(f'Chi-squared statistic: {chi2:.2f}')
    print(f"Cramer's V: {cramers_v:.2f}")
    print('----------------------------------')

Association between Diabetes_binary and Diabetes_binary:
Chi-squared statistic: 78842.00
Cramer's V: 1.00
----------------------------------
Association between Diabetes_binary and HighBP:
Chi-squared statistic: 9034.16
Cramer's V: 0.34
----------------------------------
Association between Diabetes_binary and HighChol:
Chi-squared statistic: 5345.55
Cramer's V: 0.26
----------------------------------
Association between Diabetes_binary and BMI:
Chi-squared statistic: 7186.66
Cramer's V: 0.30
----------------------------------
Association between Diabetes_binary and Smoker:
Chi-squared statistic: 298.61
Cramer's V: 0.06
----------------------------------
Association between Diabetes_binary and Stroke:
Chi-squared statistic: 940.59
Cramer's V: 0.11
----------------------------------
Association between Diabetes_binary and HeartDiseaseorAttack:
Chi-squared statistic: 2777.73
Cramer's V: 0.19
----------------------------------
Association between Diabetes_binary and PhysActivity:
Chi-squa

## Sort features' association with Diabetes_binary by descending Cramer's V

In [3]:
features = pd.DataFrame(associations)
features = features.sort_values(by='Cramer\'s V', ascending=False)
features = features.reset_index(drop=True)
print(features)

                 Feature Chi-squared Cramer's V
0        Diabetes_binary    78842.00       1.00
1                GenHlth    10673.19       0.37
2                 HighBP     9034.16       0.34
3                    BMI     7186.66       0.30
4                    Age     6033.97       0.28
5               HighChol     5345.55       0.26
6               DiffWalk     4614.19       0.24
7               PhysHlth     3158.17       0.20
8                 Income     2927.59       0.19
9   HeartDiseaseorAttack     2777.73       0.19
10             Education     1564.96       0.14
11          PhysActivity     1362.28       0.13
12                Stroke      940.59       0.11
13              MentHlth      803.10       0.10
14     HvyAlcoholConsump      713.81       0.10
15                Smoker      298.61       0.06
16               Veggies      227.71       0.05
17           NoDocbcCost      120.86       0.04
18                   Sex      139.27       0.04
19                Fruits       68.66    

We will use all features with a Cramer's V value of 0.20 or larger.
The following 7 features will be used in Model Fitting: GenHlth, HighBP, BMI, Age, HighChol, DiffWalk, PhysHealth