In [13]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler 
import numpy as np
import joblib
import pandas as pd

In [2]:
import sys
import os
# Make the parent directory importable so `scripts.*` modules resolve.
# Guarded so that re-running this cell does not keep appending duplicate entries.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
# Importing functions
from scripts.data_preprocessing_script import preprocess_data

In [4]:
from scripts.model_training_and_evaluation import train_and_evaluate_model

In [5]:
# Dataset path, built relative to the parent of the working directory (the same
# directory that is put on sys.path for the `scripts` package) instead of a
# user-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes the notebook is run from a folder directly under the
# repository root, with the CSV at <root>/data/raw/ — confirm the layout.
data_source = os.path.join(
    os.path.abspath(".."), "data", "raw", "life_expectancy_data.csv"
)

# Run the project's preprocessing pipeline on the raw CSV.
df_processed = preprocess_data(data_source)

# Preview the first rows of the processed frame.
df_processed.head()

Data loaded: (2938, 22)
Step 1: Clean column names - done
Step 2: Convert column names to lowercase - done
Step 3: Encode categorical variables - done
Step 4: Impute missing values - done
Step 5: Transform skewed data - done
Step 6: Cap outliers - done
Step 7: Winsorize 'Percentage expenditure' - done
Preprocessing complete.


Unnamed: 0,country,year,life expectancy,adult mortality,infant deaths,alcohol,percentage expenditure,hepatitis b,measles,bmi,...,total expenditure,diphtheria,hiv/aids,gdp,population,thinness 1-19 years,thinness 5-9 years,income composition of resources,schooling,status_Developing
0,1.0,2015.0,65.0,263.0,7.874008,0.1,4.280542,3.555348,7.051856,19.1,...,8.16,65.0,0.09531,6.372055,17.334091,17.2,17.3,0.479,10.1,1.0
1,1.0,2014.0,59.9,271.0,8.0,0.1,4.311116,3.637586,6.200509,18.6,...,8.18,62.0,0.09531,6.419501,12.699497,17.5,17.5,0.476,10.0,1.0
2,1.0,2013.0,59.9,268.0,8.124038,0.1,4.307023,3.583519,6.066108,18.1,...,8.13,64.0,0.09531,6.450067,17.272826,17.7,17.7,0.47,9.9,1.0
3,1.0,2012.0,59.5,272.0,8.306624,0.1,4.371777,3.496508,7.93308,17.6,...,8.52,67.0,0.09531,6.508708,15.123021,17.9,18.0,0.463,9.8,1.0
4,1.0,2011.0,59.2,275.0,8.42615,0.1,2.091507,3.465736,8.011023,17.2,...,7.87,68.0,0.09531,4.167242,14.906964,18.2,18.2,0.454,9.5,1.0


In [6]:
# Fit and score a Lasso regressor on the preprocessed frame through the project
# helper, keeping everything it hands back: the fitted model, the scaler, the
# scaled train/test feature matrices, the train/test targets and the predictions.
# `alpha` is forwarded to the helper (presumably the Lasso regularisation
# strength — confirm in train_and_evaluate_model).
alpha = 0.01
(
    lasso_model,
    scaler,
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    y_pred,
) = train_and_evaluate_model(df_processed, alpha=alpha)

Mean Absolute Error: 2.5696467912344008
Mean Squared Error: 11.479217614119534
Root Mean Squared Error: 3.3880994103065416
R-squared: 0.8662565670031569


In [7]:
# Making predictions on the test set
# NOTE(review): this rebinds `y_pred`, which train_and_evaluate_model already
# returned in the training cell — presumably the same values; confirm, and drop
# one of the two if it is redundant.
y_pred = lasso_model.predict(X_test_scaled)

In [8]:
# Sanity check: report the shapes of the scaled train/test feature matrices
# and show the first five test-set predictions.
print(f"Training data (scaled) shape: {X_train_scaled.shape}")
print(f"Test data (scaled) shape: {X_test_scaled.shape}")
print(f"First 5 predictions: {y_pred[:5]}")

Training data (scaled) shape: (2350, 21)
Test data (scaled) shape: (588, 21)
First 5 predictions: [69.49437129 78.24730562 76.09156888 77.55545299 44.41897209]


In [10]:
# Score the test-set predictions against the held-out targets.
# RMSE is derived from MSE, so MSE is computed first.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report all four metrics, one per line.
print(
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"R-squared (R2): {r2}",
    sep="\n",
)



Mean Absolute Error (MAE): 2.5696467912344008
Mean Squared Error (MSE): 11.479217614119534
Root Mean Squared Error (RMSE): 3.3880994103065416
R-squared (R2): 0.8662565670031569


In [None]:
# Saving the trained model for future use
joblib.dump(lasso_model, 'lasso_model.pkl')
# The model is used with scaled features in this notebook, so the fitted scaler
# is saved next to it; without it the pickled model cannot be applied to new,
# unscaled data.
joblib.dump(scaler, 'lasso_scaler.pkl')

In [14]:
# Feature names: every processed column except the 'life expectancy' target.
columns = df_processed.columns.drop('life expectancy')

In [16]:
# Label the scaled training matrix with the feature names and preview it.
# NOTE(review): assumes the columns of X_train_scaled are in the same order as
# `columns` — confirm against train_and_evaluate_model.
df_scaled_train = pd.DataFrame(data=X_train_scaled, columns=columns)
print(df_scaled_train.head(n=5))

    country   year  adult mortality  infant deaths   alcohol  \
0 -0.683673 -0.125        -0.117647      -0.380256  0.155728   
1 -0.683673 -0.875         0.052288      -0.380256 -0.052882   
2  0.500000  0.375        -0.006536      -0.380256  0.717643   
3  0.826531  0.625        -0.856209       0.000000 -0.545089   
4 -0.724490 -0.750        -0.320261      -0.380256 -1.012620   

   percentage expenditure  hepatitis b   measles       bmi  under-five deaths  \
0                0.390537    -0.754932 -0.481968 -0.493151          -0.392232   
1                0.195000     0.245068 -0.481968 -0.586301          -0.392232   
2               -0.994490    -1.642857 -0.481968  0.027397          -0.392232   
3                0.518877    -1.198895  0.000000  0.432877          -0.052549   
4                0.627615    -1.642857 -0.481968 -0.424658          -0.392232   

   ...  total expenditure  diphtheria  hiv/aids       gdp  population  \
0  ...          -0.552044    0.263158  0.760835  0.2637