In [1]:
# Step 1: Import necessary libraries
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [11]:
# Step 2: Load the dataset and display first five rows
# The CSV location can be overridden with the ALZHEIMERS_DATASET_PATH
# environment variable so the notebook can run on other machines; when the
# variable is not set, the original local path is used.
file_path = os.environ.get(
    "ALZHEIMERS_DATASET_PATH",
    "C:\\MISC\\alzheimers_prediction_dataset.csv",
)
df = pd.read_csv(file_path)

# Display the first five rows (the default for DataFrame.head)
df.head()

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
0,Spain,90,Male,1,33.0,Medium,Never,Occasionally,No,No,...,Healthy,High,Retired,Single,No,Low,Medium,High,Urban,No
1,Argentina,72,Male,7,29.9,Medium,Former,Never,No,No,...,Healthy,Medium,Unemployed,Widowed,No,High,Low,High,Urban,No
2,South Africa,86,Female,19,22.9,High,Current,Occasionally,No,Yes,...,Average,Medium,Employed,Single,No,Low,Medium,High,Rural,No
3,China,53,Male,17,31.2,Low,Never,Regularly,Yes,No,...,Healthy,Medium,Retired,Single,No,High,Medium,Low,Rural,No
4,Sweden,58,Female,3,30.0,High,Former,Never,Yes,No,...,Unhealthy,High,Employed,Married,No,Low,Medium,High,Rural,No


In [13]:
# Step 3: Inspect the dataset structure
# Print the index range, the column names, their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74283 entries, 0 to 74282
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country                               74283 non-null  object 
 1   Age                                   74283 non-null  int64  
 2   Gender                                74283 non-null  object 
 3   Education Level                       74283 non-null  int64  
 4   BMI                                   74283 non-null  float64
 5   Physical Activity Level               74283 non-null  object 
 6   Smoking Status                        74283 non-null  object 
 7   Alcohol Consumption                   74283 non-null  object 
 8   Diabetes                              74283 non-null  object 
 9   Hypertension                          74283 non-null  object 
 10  Cholesterol Level                     74283 non-null  object 
 11  Family History 

In [15]:
# Step 4: Summary statistics of the dataset
# include="all" reports count/unique/top/freq for the text columns alongside
# mean/std/min/quartiles for the numeric ones; statistics that do not apply
# to a column are left empty (NaN).
df.describe(include="all")

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
count,74283,74283.0,74283,74283.0,74283.0,74283,74283,74283,74283,74283,...,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283
unique,20,,2,,,3,3,3,2,2,...,3,3,3,3,2,3,3,3,2,2
top,Brazil,,Female,,,High,Current,Never,No,No,...,Average,High,Unemployed,Single,No,Medium,Low,Medium,Rural,No
freq,3839,,37249,,,24853,24915,24865,59527,52134,...,24917,24906,24801,25169,59561,24859,24873,24886,37203,43570
mean,,71.964703,,9.487514,26.780639,,,,,,...,,,,,,,,,,
std,,12.980748,,5.75702,4.764679,,,,,,...,,,,,,,,,,
min,,50.0,,0.0,18.5,,,,,,...,,,,,,,,,,
25%,,61.0,,4.0,22.7,,,,,,...,,,,,,,,,,
50%,,72.0,,9.0,26.8,,,,,,...,,,,,,,,,,
75%,,83.0,,14.0,30.9,,,,,,...,,,,,,,,,,


In [17]:
# Step 4 (cont.): Identify missing values
# Count the null entries in each column.
df.isnull().sum()

Country                                 0
Age                                     0
Gender                                  0
Education Level                         0
BMI                                     0
Physical Activity Level                 0
Smoking Status                          0
Alcohol Consumption                     0
Diabetes                                0
Hypertension                            0
Cholesterol Level                       0
Family History of Alzheimer’s           0
Cognitive Test Score                    0
Depression Level                        0
Sleep Quality                           0
Dietary Habits                          0
Air Pollution Exposure                  0
Employment Status                       0
Marital Status                          0
Genetic Risk Factor (APOE-ε4 allele)    0
Social Engagement Level                 0
Income Level                            0
Stress Levels                           0
Urban vs Rural Living             

In [19]:
# Step 5: Convert binary categorical variables to 0/1
# Yes/No columns become integer flags (Yes -> 1, No -> 0).
binary_columns = ["Diabetes", "Hypertension", "Genetic Risk Factor (APOE-ε4 allele)", "Alzheimer’s Diagnosis"]
yes_no_to_int = {"Yes": 1, "No": 0}
for col in binary_columns:
    # Series.map turns every value that is missing from the mapping into NaN,
    # which includes the 0/1 produced by a previous run of this cell. Skip
    # columns that are already encoded so that re-running the cell is safe.
    if df[col].isin([0, 1]).all():
        continue
    encoded = df[col].map(yes_no_to_int)
    # Fail loudly on labels other than Yes/No instead of silently writing NaN
    # into the column; values that were already missing are left as NaN.
    unmapped = encoded.isna() & df[col].notna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, col].astype(str).unique())
        raise ValueError(
            f"Column {col!r} contains values other than 'Yes'/'No': {unexpected}"
        )
    df[col] = encoded

# Display first few rows to confirm changes
df.head()

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
0,Spain,90,Male,1,33.0,Medium,Never,Occasionally,0,0,...,Healthy,High,Retired,Single,0,Low,Medium,High,Urban,0
1,Argentina,72,Male,7,29.9,Medium,Former,Never,0,0,...,Healthy,Medium,Unemployed,Widowed,0,High,Low,High,Urban,0
2,South Africa,86,Female,19,22.9,High,Current,Occasionally,0,1,...,Average,Medium,Employed,Single,0,Low,Medium,High,Rural,0
3,China,53,Male,17,31.2,Low,Never,Regularly,1,0,...,Healthy,Medium,Retired,Single,0,High,Medium,Low,Rural,0
4,Sweden,58,Female,3,30.0,High,Former,Never,1,0,...,Unhealthy,High,Employed,Married,0,Low,Medium,High,Rural,0


In [21]:
# Step 6: Encode categorical variables
# Text-valued columns that are replaced by integer codes
categorical_columns = [
    "Country",
    "Gender",
    "Physical Activity Level",
    "Smoking Status",
    "Alcohol Consumption",
    "Cholesterol Level",
    "Family History of Alzheimer’s",
    "Depression Level",
    "Sleep Quality",
    "Dietary Habits",
    "Air Pollution Exposure",
    "Employment Status",
    "Marital Status",
    "Social Engagement Level",
    "Income Level",
    "Stress Levels",
    "Urban vs Rural Living",
]

# Fit one LabelEncoder per column and keep it, keyed by column name, so the
# fitted encoders remain available after the columns are overwritten.
# NOTE(review): LabelEncoder numbers the labels in sorted order, so ordinal
# scales do not come out in rank order (Physical Activity Level is encoded as
# High=0, Low=1, Medium=2) - confirm this is acceptable for the model.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_columns}
for col, encoder in label_encoders.items():
    df[col] = encoder.transform(df[col])

# Display first few rows to verify encoding
df.head()

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
0,16,90,1,1,33.0,2,2,1,0,0,...,1,0,1,1,0,1,2,0,1,0
1,0,72,1,7,29.9,2,1,0,0,0,...,1,2,2,2,0,0,1,0,1,0
2,14,86,0,19,22.9,0,0,1,0,1,...,0,2,0,1,0,1,2,0,0,0
3,4,53,1,17,31.2,1,2,2,1,0,...,1,2,1,1,0,0,2,1,0,0
4,17,58,0,3,30.0,0,1,0,1,0,...,2,0,0,0,0,1,2,0,0,0


In [25]:
# Step 7: Standardize numerical variables
# Numeric features that are rescaled
numerical_columns = ["Age", "Education Level", "BMI", "Cognitive Test Score"]

# Fit the scaler on these columns, then overwrite them with the standardized
# values; the fitted scaler stays available as `scaler`.
# NOTE(review): the scaling statistics are computed on the full dataset; if a
# train/test split is made later this leaks test-set information - confirm.
scaler = StandardScaler().fit(df[numerical_columns])
df[numerical_columns] = scaler.transform(df[numerical_columns])

# Display first few rows after scaling
df.head()

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
0,16,1.389398,1,-1.474299,1.305314,2,2,1,0,0,...,1,0,1,1,0,1,2,0,1,0
1,0,0.002719,1,-0.432087,0.654689,2,1,0,0,0,...,1,2,2,2,0,0,1,0,1,0
2,14,1.081247,0,1.652339,-0.814465,0,0,1,0,1,...,0,2,0,1,0,1,2,0,0,0
3,4,-1.460997,1,1.304935,0.927532,1,2,2,1,0,...,1,2,1,1,0,0,2,1,0,0
4,17,-1.075808,0,-1.126895,0.675677,0,1,0,1,0,...,2,0,0,0,0,1,2,0,0,0


In [29]:
# Step 8: Identify the target and predictor variables (columns)
# Name of the column to be predicted
target_variable = "Alzheimer’s Diagnosis"

# Every other column of df is treated as a predictor
predictor_variables = df.columns[df.columns != target_variable].tolist()

# Report the target variable (label and name on separate lines)
print("Target Variable:", target_variable, sep="\n")

# Report the predictor variables, one per line
print("\nPredictor Variables:")
for col in predictor_variables:
    print("-", col)

Target Variable:
Alzheimer’s Diagnosis

Predictor Variables:
- Country
- Age
- Gender
- Education Level
- BMI
- Physical Activity Level
- Smoking Status
- Alcohol Consumption
- Diabetes
- Hypertension
- Cholesterol Level
- Family History of Alzheimer’s
- Cognitive Test Score
- Depression Level
- Sleep Quality
- Dietary Habits
- Air Pollution Exposure
- Employment Status
- Marital Status
- Genetic Risk Factor (APOE-ε4 allele)
- Social Engagement Level
- Income Level
- Stress Levels
- Urban vs Rural Living


In [31]:
# Step 9: Save the cleaned dataset
# The file name is relative, so the CSV is written to the current working
# directory of the kernel.
cleaned_file_path = "cleaned_alzheimers_dataset.csv"  # Define the output file name
# index=False keeps the DataFrame index out of the CSV.
df.to_csv(cleaned_file_path, index=False)

# Confirm successful save
print(f"Cleaned dataset saved successfully as '{cleaned_file_path}'.")

Cleaned dataset saved successfully as 'cleaned_alzheimers_dataset.csv'.


In [33]:
# Show working directory
# `os` is imported with the other libraries in the first cell rather than
# in the middle of the notebook.
# Get and print the current working directory; relative file names used in
# this notebook are resolved against it.
cwd = os.getcwd()
print("Current Working Directory:", cwd)

Current Working Directory: C:\Users\rdarn
