# Testing the models ✅

### Accurate Model 🤓


RMSE is a commonly used metric to measure the average magnitude of prediction errors.

*   RMSE (train) = 1.186
*   RMSE (test) = 1.243

MAPE measures the average absolute percentage difference between predicted and actual values.
*   MAPE (train) = 1.442 %


> All values rounded to 4 sf

###### Hidden work

In [49]:
import numpy as np  # For numerical computations and array manipulation
import pandas as pd  # For data manipulation and analysis using DataFrames

from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets

import statsmodels.api as sm  # For statistical modeling and analysis
import statsmodels.tools  # Provides additional utilities for statsmodels

import seaborn as sns  # For creating informative and aesthetic visualizations
import matplotlib.pyplot as plt  # For plotting and data visualization

from sklearn.preprocessing import StandardScaler  # For standardising features by scaling to unit variance
from sklearn import metrics  # For evaluating model performance using various metrics


In [50]:
# Define a function which applies standard scaling to the numerical columns:
def scaling(df, scaler=None):
    """Standardise the numeric model columns of ``df`` and return a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``scaled_col_names`` below
        (including the two ``*_log`` columns). It is copied, never modified.
    scaler : sklearn.preprocessing.StandardScaler, optional
        Scaler to use. If omitted, a fresh scaler is fitted on ``df`` itself,
        which is the original behaviour. If an *unfitted* scaler is passed it
        is fitted on ``df`` in place, so the caller keeps the fitted object.
        If an already *fitted* scaler is passed it is only used to transform.
        This lets the statistics learned on the training frame be applied to
        a test frame: ``s = StandardScaler(); scaling(train, s); scaling(test, s)``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose listed columns hold the standardised values; all
        other columns are left as they were.
    """
    df = df.copy()
    # List the columns to be scaled
    scaled_col_names = ["Year", "Infant_deaths", "Under_five_deaths", "Adult_mortality", "Alcohol_consumption",
                "Hepatitis_B", "Measles", "BMI", "Polio", "Diphtheria", "Incidents_HIV", "GDP_per_capita",
                "Population_mln", "Thinness_ten_nineteen_years", "Thinness_five_nine_years", "Schooling",
                "Economy_status_Developed", "Economy_status_Developing", 'GDP_per_capita_log', 'Incidents_HIV_log']
    features = df[scaled_col_names]

    if scaler is None:
        scaler = StandardScaler()
    # A StandardScaler only gains its ``scale_`` attribute once it has been
    # fitted, so its absence means the statistics still have to be learned.
    if not hasattr(scaler, "scale_"):
        scaler.fit(features)

    df[scaled_col_names] = scaler.transform(features)
    return df

In [51]:
# Read the raw life-expectancy table (path is relative to the working directory)
df = pd.read_csv("Life Expectancy Data.csv")

# Predictors are every column other than the target
feature_cols = df.columns.tolist()
feature_cols.remove('Life_expectancy')
X = df[feature_cols]
y = df['Life_expectancy']

# Hold out 20% of the rows. Stratifying on Country keeps each country's share of
# rows roughly equal across the two partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=104,
    stratify=X['Country'],
)

# Drop the original row labels so every partition is indexed from 0
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in (X_train, X_test, y_train, y_test)
)

# Define the feature engineering function
def feature_eng(df):
    """Turn a raw feature frame into the design matrix used by the OLS models.

    Steps, in order:
      1. add ``GDP_per_capita_log`` and ``Incidents_HIV_log`` columns,
      2. standardise the numeric columns via ``scaling``,
      3. one-hot encode ``Region`` (first category dropped, ``Region_`` prefix),
      4. add the ``const`` intercept column expected by statsmodels.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features; must contain ``GDP_per_capita``, ``Incidents_HIV``,
        ``Region`` and every column that ``scaling`` standardises.

    Returns
    -------
    pandas.DataFrame
        A new, feature-engineered frame; the input is not modified.

    Notes
    -----
    ``scaling(df)`` fits its scaler on whatever frame it receives, so calling
    this function on a test frame standardises it with the test frame's own
    mean and variance rather than the training ones.
    """
    df = df.copy()  # Operate on a copy to avoid modifying the original DataFrame

    # Log-transform the columns with a logarithmic relationship to the target.
    # np.log is applied to the whole column at once instead of element by
    # element through Series.apply; the resulting values are the same.
    # Non-positive inputs would yield -inf/nan here, exactly as before.
    df['GDP_per_capita_log'] = np.log(df['GDP_per_capita'])
    df['Incidents_HIV_log'] = -np.log(df['Incidents_HIV'])  # Negative log for inverse relationship

    # Standardise the numeric columns (including the two log columns just added)
    scaled_df = scaling(df)

    # Perform one-hot encoding on the 'Region' column, converting categories into binary columns
    scaled_df = pd.get_dummies(scaled_df, columns=['Region'], drop_first=True, prefix='Region', dtype=int)

    # Add a constant column for the statsmodels regression model
    scaled_df = sm.add_constant(scaled_df)

    # Return the feature-engineered DataFrame
    return scaled_df

# Apply feature engineering to the training data
X_train_fe = feature_eng(X_train)

# Columns that enter the "accurate" regression; the order here is the order
# in which the coefficients appear in the summary table
feature_cols = [
    'const',
    'Year',
    'Infant_deaths',
    'Under_five_deaths',
    'Adult_mortality',
    'Hepatitis_B',
    'BMI',
    'Polio',
    'Incidents_HIV_log',
    'GDP_per_capita_log',
    'Thinness_ten_nineteen_years',
    'Schooling',
    'Economy_status_Developed',
    'Region_Asia',
    'Region_Central America and Caribbean',
    'Region_European Union',
    'Region_Middle East',
    'Region_North America',
    'Region_Oceania',
    'Region_Rest of Europe',
    'Region_South America',
]

# Ordinary least squares: endog is the target, exog the selected design columns
lin_reg = sm.OLS(endog=y_train, exog=X_train_fe[feature_cols])
results = lin_reg.fit()

# Leave the summary as the cell's last expression so the notebook renders it
results.summary()



0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.984
Model:,OLS,Adj. R-squared:,0.984
Method:,Least Squares,F-statistic:,6991.0
Date:,"Mon, 09 Dec 2024",Prob (F-statistic):,0.0
Time:,11:53:29,Log-Likelihood:,-3640.7
No. Observations:,2291,AIC:,7323.0
Df Residuals:,2270,BIC:,7444.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,68.7448,0.077,894.340,0.000,68.594,68.896
Year,0.1920,0.027,7.147,0.000,0.139,0.245
Infant_deaths,-1.3676,0.175,-7.807,0.000,-1.711,-1.024
Under_five_deaths,-2.1568,0.175,-12.302,0.000,-2.501,-1.813
Adult_mortality,-4.9577,0.059,-84.704,0.000,-5.073,-4.843
Hepatitis_B,-0.1585,0.038,-4.182,0.000,-0.233,-0.084
BMI,-0.3186,0.048,-6.621,0.000,-0.413,-0.224
Polio,0.1352,0.049,2.737,0.006,0.038,0.232
Incidents_HIV_log,0.2124,0.044,4.792,0.000,0.125,0.299

0,1,2,3
Omnibus:,13.393,Durbin-Watson:,1.937
Prob(Omnibus):,0.001,Jarque-Bera (JB):,13.438
Skew:,0.182,Prob(JB):,0.00121
Kurtosis:,3.089,Cond. No.,33.0


#### Code used to test


In [52]:
# Generate predictions for the training data using the fitted model
y_train_pred = results.predict(exog=X_train_fe[feature_cols])

# Root mean squared error between the actual and fitted training targets
# (in-sample error; smaller is better)
rmse = statsmodels.tools.eval_measures.rmse(y_train, y_train_pred)

# Report the in-sample figure
print('TRAIN:', rmse)


TRAIN: 1.1855129546259728


In [53]:
# Run the held-out features through the same feature-engineering function.
# NOTE: feature_eng calls scaling(df), which fits its scaler on the frame it is
# given, so the test set is standardised with its own statistics here, not the
# training ones.
X_test_fe = feature_eng(X_test)

# Predict the held-out targets with the model fitted on the training data
y_test_pred = results.predict(exog=X_test_fe[feature_cols])

# Root mean squared error on data the model was not fitted on
rmse2 = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)

# Report the out-of-sample figure
print('TEST:', rmse2)


TEST: 1.2428440385066843


In [54]:
# Mean absolute percentage error of the training-set predictions;
# sklearn returns it as a fraction
mape = metrics.mean_absolute_percentage_error(y_true=y_train, y_pred=y_train_pred)

# Express the fraction as a percentage
mape_percentage = mape * 100

# Show the training MAPE in percent
print(mape_percentage)

1.4417444653395164


### Ethical Model 👍



RMSE is a commonly used metric to measure the average magnitude of prediction errors.

*   RMSE (train) = 2.213
*   RMSE (test) = 2.348

MAPE measures the average absolute percentage difference between predicted and actual values.
*   MAPE (train) = 2.570 %


> All values rounded to 4 sf

###### Hidden work

In [56]:
# Reduced column set for the "ethical" model: intercept plus four predictors
feature_cols = [
    'const',
    'Year',
    'Adult_mortality',
    'Alcohol_consumption',
    'GDP_per_capita_log',
]

# Ordinary least squares on the same engineered training frame;
# this rebinds lin_reg/results, replacing the previously fitted model
lin_reg = sm.OLS(endog=y_train, exog=X_train_fe[feature_cols])
results = lin_reg.fit()

# Leave the summary as the cell's last expression so the notebook renders it
results.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.944
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,9694.0
Date:,"Mon, 09 Dec 2024",Prob (F-statistic):,0.0
Time:,11:53:59,Log-Likelihood:,-5070.7
No. Observations:,2291,AIC:,10150.0
Df Residuals:,2286,BIC:,10180.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,68.9270,0.046,1489.107,0.000,68.836,69.018
Year,0.3543,0.047,7.549,0.000,0.262,0.446
Adult_mortality,-7.1839,0.065,-110.289,0.000,-7.312,-7.056
Alcohol_consumption,0.8304,0.057,14.499,0.000,0.718,0.943
GDP_per_capita_log,2.0945,0.075,27.756,0.000,1.947,2.243

0,1,2,3
Omnibus:,241.768,Durbin-Watson:,1.944
Prob(Omnibus):,0.0,Jarque-Bera (JB):,621.995
Skew:,-0.594,Prob(JB):,8.619999999999999e-136
Kurtosis:,5.26,Cond. No.,2.96


#### Code used to test

In [57]:
# Fitted values of the reduced model on the training design columns
y_train_pred = results.predict(exog=X_train_fe[feature_cols])

# Root mean squared error between the actual and fitted training targets
# (in-sample error; smaller is better)
rmse = statsmodels.tools.eval_measures.rmse(y_train, y_train_pred)

# Report the in-sample figure
print('TRAIN:', rmse)


TRAIN: 2.213102472568888


In [58]:
# Run the held-out features through the same feature-engineering function.
# NOTE: feature_eng calls scaling(df), which fits its scaler on the frame it is
# given, so the test set is standardised with its own statistics here, not the
# training ones.
X_test_fe = feature_eng(X_test)

# Predict the held-out targets with the reduced model fitted on the training data
y_test_pred = results.predict(exog=X_test_fe[feature_cols])

# Root mean squared error on data the model was not fitted on
rmse2 = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)

# Report the out-of-sample figure
print('TEST:', rmse2)

TEST: 2.3481105398444164


In [59]:
# Mean absolute percentage error of the training-set predictions;
# sklearn returns it as a fraction
mape = metrics.mean_absolute_percentage_error(y_true=y_train, y_pred=y_train_pred)

# Express the fraction as a percentage
mape_percentage = mape * 100

# Show the training MAPE in percent
print(mape_percentage)

2.5704587939261936
