In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [2]:
# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = "data/alzheimers_prediction_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
0,Spain,90,Male,1,33.0,Medium,Never,Occasionally,No,No,...,Healthy,High,Retired,Single,No,Low,Medium,High,Urban,No
1,Argentina,72,Male,7,29.9,Medium,Former,Never,No,No,...,Healthy,Medium,Unemployed,Widowed,No,High,Low,High,Urban,No
2,South Africa,86,Female,19,22.9,High,Current,Occasionally,No,Yes,...,Average,Medium,Employed,Single,No,Low,Medium,High,Rural,No
3,China,53,Male,17,31.2,Low,Never,Regularly,Yes,No,...,Healthy,Medium,Retired,Single,No,High,Medium,Low,Rural,No
4,Sweden,58,Female,3,30.0,High,Former,Never,Yes,No,...,Unhealthy,High,Employed,Married,No,Low,Medium,High,Rural,No


In [4]:
# Number of rows (records) in the loaded frame.
size = len(df)

In [5]:
# Show each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74283 entries, 0 to 74282
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country                               74283 non-null  object 
 1   Age                                   74283 non-null  int64  
 2   Gender                                74283 non-null  object 
 3   Education Level                       74283 non-null  int64  
 4   BMI                                   74283 non-null  float64
 5   Physical Activity Level               74283 non-null  object 
 6   Smoking Status                        74283 non-null  object 
 7   Alcohol Consumption                   74283 non-null  object 
 8   Diabetes                              74283 non-null  object 
 9   Hypertension                          74283 non-null  object 
 10  Cholesterol Level                     74283 non-null  object 
 11  Family History 

None of the columns contain null values.

In [6]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Age,Education Level,BMI,Cognitive Test Score
count,74283.0,74283.0,74283.0,74283.0
mean,71.964703,9.487514,26.780639,64.654241
std,12.980748,5.75702,4.764679,20.153247
min,50.0,0.0,18.5,30.0
25%,61.0,4.0,22.7,47.0
50%,72.0,9.0,26.8,65.0
75%,83.0,14.0,30.9,82.0
max,94.0,19.0,35.0,99.0


The numerical features span different ranges, so they will be standardised with `StandardScaler`. There is no reason to prefer a different scaler here: the ranges are not large and the distributions are not highly skewed.

In [17]:
# Standardise the continuous features to zero mean and unit variance.
numerical_cols = ["Age", "Education Level", "BMI", "Cognitive Test Score"]
df_numerical = df[numerical_cols]

# Fit on the numeric sub-frame, then transform that same data.
scaler = StandardScaler()
scaler.fit(df_numerical)
scaled = scaler.transform(df_numerical)

# Write the scaled values back over the original columns and confirm the
# result (mean ~0, std ~1) with a fresh summary.
df[numerical_cols] = scaled
df.describe()

Unnamed: 0,Age,Education Level,BMI,Cognitive Test Score
count,74283.0,74283.0,74283.0,74283.0
mean,2.133073e-17,5.4522480000000006e-17,-2.49536e-16,-2.587427e-16
std,1.000007,1.000007,1.000007,1.000007
min,-1.69211,-1.648002,-1.737933,-1.719548
25%,-0.8446953,-0.9531929,-0.8564409,-0.8760057
50%,0.002719233,-0.08468222,0.004063489,0.0171566
75%,0.8501337,0.7838285,0.8645679,0.8606988
max,1.697548,1.652339,1.725072,1.704241


In [7]:
# Summary (count / unique / top / freq) for the object-dtype columns.
df.describe(include="object")

Unnamed: 0,Country,Gender,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimer’s,Depression Level,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
count,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,...,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283
unique,20,2,3,3,3,2,2,2,2,3,...,3,3,3,3,2,3,3,3,2,2
top,Brazil,Female,High,Current,Never,No,No,Normal,No,Medium,...,Average,High,Unemployed,Single,No,Medium,Low,Medium,Rural,No
freq,3839,37249,24853,24915,24865,59527,52134,51973,52004,24843,...,24917,24906,24801,25169,59561,24859,24873,24886,37203,43570


Country will be dropped because it has too many distinct values (20) to encode. One-hot encoding it would add many extra dimensions (curse of dimensionality), while label encoding it would lead the model to assume an ordinal relationship between countries where none exists. Country also should not contribute to the target variable.

In [8]:
# Remove the high-cardinality Country column from the working frame.
df = df.drop("Country", axis="columns")

In [9]:
# Object-dtype summary again, this time after Country has been dropped.
df.describe(include="object")

Unnamed: 0,Gender,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimer’s,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
count,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283
unique,2,3,3,3,2,2,2,2,3,3,3,3,3,3,2,3,3,3,2,2
top,Female,High,Current,Never,No,No,Normal,No,Medium,Good,Average,High,Unemployed,Single,No,Medium,Low,Medium,Rural,No
freq,37249,24853,24915,24865,59527,52134,51973,52004,24843,25145,24917,24906,24801,25169,59561,24859,24873,24886,37203,43570


All remaining categorical columns have only 2 or 3 distinct values, so one-hot encoding should work for most of them. However, for columns such as Physical Activity Level, Cholesterol Level, Depression Level, and Air Pollution Exposure, the labels are along the lines of low, medium, and high (note that Cholesterol Level has only two distinct values). Label encoding is suitable for these columns because there is an ordinal relationship between the values.

In [10]:
# Three-valued columns to inspect, paired with the encoding planned for each.
columns_to_inspect = [
    "Alcohol Consumption",      # one-hot
    "Smoking Status",           # one-hot
    "Depression Level",         # try label
    "Dietary Habits",           # one-hot
    "Employment Status",        # one-hot
    "Marital Status",           # one-hot
    "Social Engagement Level",  # try label
]

print("Unique values for:")
for column_name in columns_to_inspect:
    # Trailing space in the label plus print's separator keeps the two-space gap.
    print(f"{column_name}: ", df[column_name].unique())

Unique values for:
Alcohol Consumption:  ['Occasionally' 'Never' 'Regularly']
Smoking Status:  ['Never' 'Former' 'Current']
Depression Level:  ['Low' 'High' 'Medium']
Dietary Habits:  ['Healthy' 'Average' 'Unhealthy']
Employment Status:  ['Retired' 'Unemployed' 'Employed']
Marital Status:  ['Single' 'Widowed' 'Married']
Social Engagement Level:  ['Low' 'High' 'Medium']


In [11]:
# Column names must match df.columns exactly. The dataset spells "Alzheimer’s"
# with a typographic apostrophe (U+2019), not the ASCII "'", so the names below
# use the same character; otherwise selecting these columns raises KeyError.

# Categorical columns without a meaningful order (binary or nominal):
# candidates for one-hot encoding.
one_hot_cols = [
    "Gender",
    "Smoking Status",
    "Alcohol Consumption",
    "Diabetes",
    "Hypertension",
    "Cholesterol Level",
    "Family History of Alzheimer’s",
    "Dietary Habits",
    "Employment Status",
    "Marital Status",
    "Genetic Risk Factor (APOE-ε4 allele)",
    "Urban vs Rural Living",
    "Alzheimer’s Diagnosis",
]

# Categorical columns whose labels look ranked (e.g. Low / Medium / High):
# candidates for label encoding.
# NOTE: sklearn's LabelEncoder assigns codes in sorted (alphabetical) label
# order, e.g. High=0, Low=1, Medium=2, so the natural ranking is not preserved
# automatically. Supply an explicit order if the rank should be kept.
label_enc_cols = [
    "Physical Activity Level",
    "Depression Level",
    "Sleep Quality",
    "Air Pollution Exposure",
    "Social Engagement Level",
    "Income Level",
    "Stress Levels",
]
# try doing label encoding to see if it is better than one-hot for all

In [12]:
# Two independent working copies of the prepared frame:
#   df_one_hot - every categorical column will be one-hot encoded
#   df_label   - selected columns will be label encoded instead
df_one_hot, df_label = df.copy(), df.copy()