In [69]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Lift pandas' truncation limits so rendered frames show every column and row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [70]:
# Location of the raw CSV, relative to the notebook's working directory.
dataset_path = "./data/raw/alzheimers_prediction_dataset.csv"

# Read the source table into memory, then preview its first five rows.
df_raw = pd.read_csv(filepath_or_buffer=dataset_path)
df_raw.head(5)

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimer’s,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
0,Spain,90,Male,1,33.0,Medium,Never,Occasionally,No,No,Normal,No,90,Low,Poor,Healthy,High,Retired,Single,No,Low,Medium,High,Urban,No
1,Argentina,72,Male,7,29.9,Medium,Former,Never,No,No,Normal,No,65,Low,Good,Healthy,Medium,Unemployed,Widowed,No,High,Low,High,Urban,No
2,South Africa,86,Female,19,22.9,High,Current,Occasionally,No,Yes,Normal,No,43,High,Good,Average,Medium,Employed,Single,No,Low,Medium,High,Rural,No
3,China,53,Male,17,31.2,Low,Never,Regularly,Yes,No,Normal,No,81,Medium,Average,Healthy,Medium,Retired,Single,No,High,Medium,Low,Rural,No
4,Sweden,58,Female,3,30.0,High,Former,Never,Yes,No,Normal,No,49,High,Poor,Unhealthy,High,Employed,Married,No,Low,Medium,High,Rural,No


In [71]:
# Columns whose pandas dtype is 'object'
object_columns = df_raw.select_dtypes(include=['object']).columns

# For each of those columns, the share of rows taken by every distinct value
# (value_counts(normalize=True) yields proportions rather than raw counts).
unique_values = {}
for col in object_columns:
    unique_values[col] = df_raw[col].value_counts(normalize=True)

print("Unique values for object columns:")
for col, values in unique_values.items():
    # "Country" is left out of the printed summary.
    if col != "Country":
        print(f"{col}: {values.to_dict()}")

Unique values for object columns:
Gender: {'Female': 0.5014471682619173, 'Male': 0.49855283173808274}
Physical Activity Level: {'High': 0.33457184012492763, 'Low': 0.3332390991209294, 'Medium': 0.33218906075414295}
Smoking Status: {'Current': 0.3354064860062195, 'Never': 0.33305063069612156, 'Former': 0.33154288329765896}
Alcohol Consumption: {'Never': 0.33473338448904866, 'Regularly': 0.33302370663543474, 'Occasionally': 0.3322429088755166}
Diabetes: {'No': 0.8013542802525477, 'Yes': 0.19864571974745232}
Hypertension: {'No': 0.7018294899236703, 'Yes': 0.2981705100763297}
Cholesterol Level: {'Normal': 0.6996621030383803, 'High': 0.3003378969616198}
Family History of Alzheimer’s: {'No': 0.7000794259790262, 'Yes': 0.29992057402097383}
Depression Level: {'Medium': 0.3344372198214935, 'Low': 0.3332929472423031, 'High': 0.33226983293620344}
Sleep Quality: {'Good': 0.3385027529852052, 'Average': 0.33271407993753616, 'Poor': 0.3287831670772586}
Dietary Habits: {'Average': 0.3354334100669063, 

In [72]:
def preprocess(df):
    """Return a numerically encoded copy of the raw survey table.

    The input frame is never modified: all work happens on a new frame, so the
    function can be called repeatedly on the same input and always yields the
    same result.

    Steps applied:
      * the "Country" column is dropped;
      * two-valued columns become 0/1 flags (1 marks the label listed in
        ``positive_label``, every other value becomes 0);
      * ordered categories are mapped to 1/2/3 via ``ordinal_maps``
        (a value missing from its mapping becomes NaN, as ``Series.map`` does);
      * "Smoking Status", "Employment Status" and "Marital Status" are each
        reduced to a 0/1 flag for one category, and a second category is
        exposed through a new flag column (``is_former_smoker``,
        ``is_retired``, ``is_widowed``) appended at the end of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing "Country" and every column referenced below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original column order (minus "Country") followed
        by ``is_former_smoker``, ``is_retired`` and ``is_widowed``.

    Raises
    ------
    KeyError
        If any of the referenced columns is absent from ``df``.
    """
    cols_remove_list = ["Country"]
    # drop() without inplace returns a new frame, leaving the caller's untouched.
    encoded = df.drop(columns=cols_remove_list)

    # Binary Encoding: column -> label that is encoded as 1
    positive_label = {
        'Gender': 'Male',
        'Diabetes': 'Yes',
        'Hypertension': 'Yes',
        'Cholesterol Level': 'High',
        'Family History of Alzheimer’s': 'Yes',
        'Genetic Risk Factor (APOE-ε4 allele)': 'Yes',
        'Urban vs Rural Living': 'Urban',
        'Alzheimer’s Diagnosis': 'Yes',
    }
    for col, label in positive_label.items():
        encoded[col] = (encoded[col] == label).astype(int)

    # Ordinal Encoding: column -> {category: rank}
    low_medium_high = {'Low': 1, 'Medium': 2, 'High': 3}
    ordinal_maps = {
        'Physical Activity Level': low_medium_high,
        'Alcohol Consumption': {'Never': 1, 'Occasionally': 2, 'Regularly': 3},
        'Depression Level': low_medium_high,
        'Sleep Quality': {'Poor': 1, 'Average': 2, 'Good': 3},
        'Dietary Habits': {'Unhealthy': 1, 'Average': 2, 'Healthy': 3},
        'Air Pollution Exposure': low_medium_high,
        'Social Engagement Level': low_medium_high,
        'Income Level': low_medium_high,
        'Stress Levels': low_medium_high,
    }
    for col, order in ordinal_maps.items():
        encoded[col] = encoded[col].map(order)

    # Custom Encoding: (source column, label kept in that column,
    #                   new flag column, label flagged by the new column)
    three_way_columns = [
        ('Smoking Status', 'Current', 'is_former_smoker', 'Former'),
        ('Employment Status', 'Employed', 'is_retired', 'Retired'),
        ('Marital Status', 'Married', 'is_widowed', 'Widowed'),
    ]
    for col, kept_label, flag_col, flag_label in three_way_columns:
        # The new flag must be derived first: the next line overwrites the
        # string labels it reads from.
        encoded[flag_col] = (encoded[col] == flag_label).astype(int)
        encoded[col] = (encoded[col] == kept_label).astype(int)

    return encoded

In [73]:
# Run the encoding step on the raw table; `df` is the frame the following cells work with.
df = preprocess(df_raw)

In [74]:
# First rows of the encoded frame.
display(df.head())
# info() prints its report itself and returns None, so it is called bare;
# passing it to display() would render a stray "None" after the report.
df.info()
# Summary statistics for every column.
display(df.describe())

Unnamed: 0,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimer’s,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis,is_former_smoker,is_retired,is_widowed
0,90,1,1,33.0,2,0,2,0,0,0,0,90,1,1,3,3,0,0,0,1,2,3,1,0,0,1,0
1,72,1,7,29.9,2,0,1,0,0,0,0,65,1,3,3,2,0,0,0,3,1,3,1,0,1,0,1
2,86,0,19,22.9,3,1,2,0,1,0,0,43,3,3,2,2,1,0,0,1,2,3,0,0,0,0,0
3,53,1,17,31.2,1,0,3,1,0,0,0,81,2,2,3,2,0,0,0,3,2,1,0,0,0,1,0
4,58,0,3,30.0,3,0,1,1,0,0,0,49,3,1,1,3,1,1,0,1,2,3,0,0,1,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74283 entries, 0 to 74282
Data columns (total 27 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Age                                   74283 non-null  int64  
 1   Gender                                74283 non-null  int64  
 2   Education Level                       74283 non-null  int64  
 3   BMI                                   74283 non-null  float64
 4   Physical Activity Level               74283 non-null  int64  
 5   Smoking Status                        74283 non-null  int64  
 6   Alcohol Consumption                   74283 non-null  int64  
 7   Diabetes                              74283 non-null  int64  
 8   Hypertension                          74283 non-null  int64  
 9   Cholesterol Level                     74283 non-null  int64  
 10  Family History of Alzheimer’s         74283 non-null  int64  
 11  Cognitive Test 

None

Unnamed: 0,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimer’s,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis,is_former_smoker,is_retired,is_widowed
count,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0
mean,71.964703,0.498553,9.487514,26.780639,2.001333,0.335406,1.99829,0.198646,0.298171,0.300338,0.299921,64.654241,1.998977,2.00972,2.000135,2.003366,0.333253,0.330802,0.198188,1.999596,1.999663,1.999152,0.499172,0.413459,0.331543,0.332876,0.330372
std,12.980748,0.500001,5.75702,4.764679,0.817201,0.472136,0.817168,0.398983,0.457458,0.458408,0.458226,20.153247,0.815825,0.816823,0.815215,0.816825,0.471379,0.470505,0.398637,0.815694,0.818141,0.815471,0.500003,0.492457,0.470771,0.471246,0.47035
min,50.0,0.0,0.0,18.5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,30.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,61.0,0.0,4.0,22.7,1.0,0.0,1.0,0.0,0.0,0.0,0.0,47.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,72.0,0.0,9.0,26.8,2.0,0.0,2.0,0.0,0.0,0.0,0.0,65.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0
75%,83.0,1.0,14.0,30.9,3.0,1.0,3.0,0.0,1.0,1.0,1.0,82.0,3.0,3.0,3.0,3.0,1.0,1.0,0.0,3.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0
max,94.0,1.0,19.0,35.0,3.0,1.0,3.0,1.0,1.0,1.0,1.0,99.0,3.0,3.0,3.0,3.0,1.0,1.0,1.0,3.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0


In [68]:
# Feature selection: score every feature against the target with the chi-squared
# statistic and keep the k best.
X, y = df.drop(columns=['Alzheimer’s Diagnosis']), df["Alzheimer’s Diagnosis"]
print("Number of feature columns:", len(X.columns))

k = 5
selector = SelectKBest(score_func=chi2, k=k)
X_selected = selector.fit_transform(X, y)

# Per-feature chi-squared scores, in the column order of X.
print(selector.scores_)

# Ask the fitted selector which columns it kept rather than re-deriving them
# from the raw scores: a plain argsort places NaN scores last, so reversing it
# would rank them first and disagree with the selector's own choice.
selected_indices = selector.get_support(indices=True)
# Present the kept columns from highest to lowest score.
selected_indices = selected_indices[np.argsort(selector.scores_[selected_indices])[::-1]]
selected_features = X.columns[selected_indices]

# Print the selected features
print("Selected Features:")
print(selected_features)



Number of feature columns: 26
[3.067e+04 6.929e-02 3.615e+00 1.825e-01 1.323e-01 3.254e-01 7.954e-02
 4.252e-01 7.233e-02 2.300e-01 1.032e+03 6.083e-01 5.777e-03 3.103e-02
 4.700e-01 3.948e-01 1.446e+00 1.140e-01 2.253e+03 2.229e-01 3.776e-03
 4.769e-01 6.267e-01 7.867e-02 1.105e+00 7.555e-03]
Selected Features:
Index(['Age', 'Genetic Risk Factor (APOE-ε4 allele)',
       'Family History of Alzheimer’s', 'Education Level',
       'Employment Status'],
      dtype='object')


In [60]:
# Columns to keep: the selected features, then the target, then BMI
# (BMI is kept whether or not the selector picked it).
selected_cols = selected_features.to_list()
for extra_col in ("Alzheimer’s Diagnosis", "BMI"):
    # Only append when absent, so a column that is already among the selected
    # features does not end up duplicated in the resulting frame.
    if extra_col not in selected_cols:
        selected_cols.append(extra_col)
df = df[selected_cols]

In [75]:
# Write the current `df` to disk in pickle format.
# NOTE: pickle files should only ever be loaded back from trusted sources.
df.to_pickle('./data/processed/alzheimers_prediction_dataset.pkl')