In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Load the dataset
file_path = 'alzheimers_prediction_dataset.csv'
df = pd.read_csv(file_path)
print(df.head())
print(df.shape)

        Country  Age  Gender  Education Level   BMI Physical Activity Level  \
0         Spain   90    Male                1  33.0                  Medium   
1     Argentina   72    Male                7  29.9                  Medium   
2  South Africa   86  Female               19  22.9                    High   
3         China   53    Male               17  31.2                     Low   
4        Sweden   58  Female                3  30.0                    High   

  Smoking Status Alcohol Consumption Diabetes Hypertension  ...  \
0          Never        Occasionally       No           No  ...   
1         Former               Never       No           No  ...   
2        Current        Occasionally       No          Yes  ...   
3          Never           Regularly      Yes           No  ...   
4         Former               Never      Yes           No  ...   

  Dietary Habits Air Pollution Exposure  Employment Status Marital Status  \
0        Healthy                   High      

In [5]:

# List of categorical features
categorical_features = ['Country', 'Gender', 'Education Level', 'Physical Activity Level', 'Smoking Status', 
                        'Alcohol Consumption', 'Employment Status', 'Marital Status', 'Genetic Risk Factor (APOE-ε4 allele)', 
                        'Social Engagement Level', 'Urban vs Rural Living']
features = df.columns

n=0
for feature in df.columns:
    unique_values = df[feature].unique()
    n+=1
    print(f"{n}. Unique values for {feature}: {unique_values}")

1. Unique values for Country: ['Spain' 'Argentina' 'South Africa' 'China' 'Sweden' 'South Korea'
 'Germany' 'UK' 'Canada' 'India' 'Italy' 'USA' 'Russia' 'Japan'
 'Australia' 'France' 'Norway' 'Saudi Arabia' 'Mexico' 'Brazil']
2. Unique values for Age: [90 72 86 53 58 55 91 67 84 80 69 56 61 64 51 60 82 63 76 87 89 73 68 57
 77 62 59 50 88 78 52 74 70 75 65 81 94 79 93 92 83 85 71 66 54]
3. Unique values for Gender: ['Male' 'Female']
4. Unique values for Education Level: [ 1  7 19 17  3  2 18 11 15 10  6 13 12  4 16  5 14  0  8  9]
5. Unique values for BMI: [33.  29.9 22.9 31.2 30.  34.  24.1 23.6 22.  32.4 31.6 19.8 35.  32.1
 29.1 33.1 27.9 31.7 28.  30.9 19.3 25.  20.5 30.7 31.3 26.  34.1 25.6
 27.  25.8 27.2 27.5 19.7 30.2 27.8 26.1 19.5 33.2 30.4 25.4 30.8 33.8
 28.6 27.7 26.3 28.4 29.5 29.  33.6 23.5 31.4 19.9 20.1 18.8 19.  23.1
 34.7 31.9 29.3 22.6 21.3 23.3 26.5 23.9 31.5 32.6 24.3 22.2 32.  29.6
 25.2 33.9 20.9 21.  26.6 34.6 25.7 21.5 19.2 20.7 18.6 24.  32.5 20.3
 20.2 18.9 

In [None]:
def grab_cols(df):
    num_cols = list(df.select_dtypes(include="number"))
    cat_cols = [col for col in df.columns if col not in num_cols]
    num_but_cat = [col for col in num_cols if df[col].nunique()<25]
    cat_but_car = [col for col in cat_cols if df[col].nunique() >25]
    cat_cols = cat_cols + num_but_cat 
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    print(f"cat_cols = {cat_cols}")
    print(f"num_cols = {num_cols)}")
    print(f"num_but_cat = {num_but_cat}")
    print(f"cat_but_car= {cat_but_car}")
    return cat_cols,num_cols,cat_but_car,num_but_cat

grab_cols(df)

cat_cols = ['Country', 'Gender', 'Physical Activity Level', 'Smoking Status', 'Alcohol Consumption', 'Diabetes', 'Hypertension', 'Cholesterol Level', 'Family History of Alzheimer’s', 'Depression Level', 'Sleep Quality', 'Dietary Habits', 'Air Pollution Exposure', 'Employment Status', 'Marital Status', 'Genetic Risk Factor (APOE-ε4 allele)', 'Social Engagement Level', 'Income Level', 'Stress Levels', 'Urban vs Rural Living', 'Alzheimer’s Diagnosis', 'Education Level']
num_cols = ['Age', 'BMI', 'Cognitive Test Score']
num_but_cat = ['Education Level']
cat_but_car= []


(['Country',
  'Gender',
  'Physical Activity Level',
  'Smoking Status',
  'Alcohol Consumption',
  'Diabetes',
  'Hypertension',
  'Cholesterol Level',
  'Family History of Alzheimer’s',
  'Depression Level',
  'Sleep Quality',
  'Dietary Habits',
  'Air Pollution Exposure',
  'Employment Status',
  'Marital Status',
  'Genetic Risk Factor (APOE-ε4 allele)',
  'Social Engagement Level',
  'Income Level',
  'Stress Levels',
  'Urban vs Rural Living',
  'Alzheimer’s Diagnosis',
  'Education Level'],
 ['Age', 'BMI', 'Cognitive Test Score'],
 [],
 ['Education Level'])

In [None]:

# Binary Encoding
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})
df['Genetic Risk Factor (APOE-ε4 allele)'] = df['Genetic Risk Factor (APOE-ε4 allele)'].map({'No': 0, 'Yes': 1})
df['Urban vs Rural Living'] = df['Urban vs Rural Living'].map({'Urban': 0, 'Rural': 1})
print("\nAfter Binary Encoding:")
print(df.head())

# Ordinal Encoding
ordinal_features = ['Physical Activity Level', 'Smoking Status', 'Alcohol Consumption', 'Social Engagement Level']
ordinal_mapping = {
    'Physical Activity Level': {'Low': 0, 'Medium': 1, 'High': 2},
    'Smoking Status': {'Never': 0, 'Former': 1, 'Current': 2},
    'Alcohol Consumption': {'Never': 0, 'Occasionally': 1, 'Regularly': 2},
    'Social Engagement Level': {'Low': 0, 'Medium': 1, 'High': 2},
    'Marital Status': {'Single': 0, 'Married': 1, 'Divorced': 2, 'Widowed': 3},
    'Employment Status': {'Unemployed': 0, 'Employed': 1, 'Retired': 2},
    'Marital Status': {'Single': 0, 'Married': 1, 'Widowed': 2},
}
for feature in ordinal_features:
    df[feature] = df[feature].map(ordinal_mapping[feature])
print("\nAfter Ordinal Encoding:")
print(df.head())
ohe = OneHotEncoder(drop='first', sparse_output=False)  # drop='first' to avoid dummy variable trap

# Fit and transform the categorical columns
encoded_array = ohe.fit_transform(df[['Country']])

# Get feature names for the new encoded columns
encoded_feature_names = ohe.get_feature_names_out(['Country'])

# Convert the encoded array into a DataFrame
df_encoded = pd.DataFrame(encoded_array, columns=encoded_feature_names)

# Concatenate the original DataFrame (without categorical columns) with the new encoded DataFrame
df_final = pd.concat([df.drop(columns=['Country']), df_encoded], axis=1)


missing_values = df_final.isnull().sum()
print("\nMissing Values in Final DataFrame:")
print(missing_values)


if missing_values.any() > 0:
    df_final = df_final.dropna()
    print("\nRows with Missing Values have been dropped.")
    print("\nUpdated Final DataFrame:")
    print(df_final)

print("\nOne-Hot Encoded DataFrame:")
print(df_final)




After Binary Encoding:
   Age  Gender  Education Level   BMI  Physical Activity Level  \
0   90     NaN                1  33.0                      NaN   
1   72     NaN                7  29.9                      NaN   
2   86     NaN               19  22.9                      NaN   
3   53     NaN               17  31.2                      NaN   
4   58     NaN                3  30.0                      NaN   

   Smoking Status  Alcohol Consumption  Diabetes  Hypertension  \
0             NaN                  NaN         0             0   
1             NaN                  NaN         0             0   
2             NaN                  NaN         0             1   
3             NaN                  NaN         1             0   
4             NaN                  NaN         1             0   

  Cholesterol Level  ... Country_South Africa  Country_South Korea  \
0            Normal  ...                False                False   
1            Normal  ...                Fa

KeyError: "None of [Index(['Country'], dtype='object')] are in the [columns]"