In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

# Load the input dataset from a hard-coded path (edit the path below to match your environment).
data = pd.read_csv('/data/raw/cleaned_dataset.csv')

# Z-score standardization (StandardScaler defaults: zero mean, unit variance)
# for the features expressed in years or rates.
zscore_columns = [
    'HALE_Birth',
    # Target variable; standardized too so it can be compared with the features.
    'HALE_60',
    'infant mortality rate (between birth and 11 months per 1000 live births)',
    'Age-standardized suicide rates (per 100 000 population)',
    'Alcohol, total per capita (15+ years) consumption (in litres of pure alcohol) (SDG Indicator 3.5.2)',
    'Estimates of rate of homicides (per 100 000 population)',
    'Mean Total Cholesterol (crude estimate)',
]

# Fit on every row of the selected columns, then overwrite those columns with
# their standardized values. The fitted scaler stays available afterwards.
scaler_standard = StandardScaler()
standardized_values = scaler_standard.fit_transform(data[zscore_columns])
data[zscore_columns] = standardized_values

# Min-max scaling (MinMaxScaler default range, 0 to 1) for the percentage-based
# features.
# NOTE(review): the garbled characters in the obesity column name presumably
# mirror the header as stored in the CSV -- keep them byte-identical unless the
# column is renamed in the data itself.
percentage_columns = [
    'Prevalence of insufficient physical activity among adults aged 18+ years (age-standardized estimate) (%)',
    'Prevalence of obesity among adults, BMI Â³ 30 (age-standardized estimate) (%)',
    'Prevalence of raised blood pressure among adults aged 30-79 years',
    'Probability of dying between the exact ages 30 and 70 years from cardiovascular diseases, cancer, diabetes, or chronic respiratory diseases (SDG 3.4.1)',
]

# Fit on every row of the selected columns, then overwrite those columns with
# their rescaled values. The fitted scaler stays available afterwards.
scaler_minmax = MinMaxScaler()
rescaled_values = scaler_minmax.fit_transform(data[percentage_columns])
data[percentage_columns] = rescaled_values

# Encode 'Period' as a sine/cosine pair. The angle is each value's fraction of
# the largest 'Period' in the data, scaled to one full turn (2*pi radians).
# NOTE(review): if 'Period' holds calendar years, every value is close to the
# maximum, so the angles cover only a narrow arc just below 2*pi -- confirm
# that this is the intended encoding.
period_angle = 2 * np.pi * data['Period'] / data['Period'].max()
data['Period_sin'] = np.sin(period_angle)
data['Period_cos'] = np.cos(period_angle)

# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so k categories become k-1 indicator columns.
categorical_columns = ['ParentLocation', 'Location', 'Dim1']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Write the transformed frame to CSV without the index column, then report
# completion.
output_path = '/data/raw/normalized_dataset.csv'
data.to_csv(output_path, index=False)

print("Data normalization complete and saved to 'normalized_dataset.csv'")

Data normalization complete and saved to 'normalized_dataset.csv'
