In [24]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.tools

In [26]:
def linear_regression(data, minimal=False, custom=None):
    """Fit an OLS model for ``Life_expectancy`` and print train/test RMSE.

    The CSV is read, ``Life_expectancy`` is used as the target, ``Country`` is
    dropped, and the remaining columns are split 80/20 (``random_state=1``).
    Features are then engineered (one-hot ``Region``, GDP rescaled by 1/1000,
    intercept column ``const``) and an OLS model is fitted on the chosen columns.

    Parameters
    ----------
    data : str, path-like or file-like
        Passed straight to ``pd.read_csv``. Must contain ``Life_expectancy``,
        ``Country``, ``Region`` and ``GDP_per_capita`` plus every raw column
        referenced by the selected feature set.
    minimal : bool, default False
        When True (and ``custom`` is falsy) use the feature set without the
        private medical columns; otherwise use the set that includes them.
    custom : sequence of str, optional
        Explicit list of engineered column names to use. Takes precedence over
        ``minimal`` when non-empty.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(...).fit()``.

    Raises
    ------
    ValueError
        If ``Life_expectancy`` or ``Country`` is not a column of the CSV.
    KeyError
        If a requested column does not exist after feature engineering.
    """
    # Import the metric explicitly instead of relying on the `eval_measures`
    # submodule having been loaded as a side effect of another statsmodels import.
    from statsmodels.tools.eval_measures import rmse

    #Feature Engineering Function
    def feature_engineering(df, reference_columns=None):
        """Return an engineered copy of ``df``.

        With ``reference_columns=None`` (training data) the first Region level is
        dropped as the baseline. With ``reference_columns`` given (test data) all
        Region levels are encoded and the frame is then aligned to exactly those
        columns, so the encoding always matches the training design matrix even
        when a region is absent from this split. Region levels that were not seen
        in training are discarded by the alignment (treated like the baseline).
        """
        df = df.copy()
        is_training = reference_columns is None
        df = pd.get_dummies(df, columns=["Region"], drop_first=is_training,
                            prefix="region", dtype=int)                                                 #One Hot Encoding the region column
        df["GDP_per_capita"] = df["GDP_per_capita"] / 1000                                              #Convert GDP into easier to use units
        # has_constant="add": the default ("skip") silently omits `const` whenever the
        # frame already holds a constant column (e.g. a dummy that is all 0 in a split).
        df = sm.add_constant(df, has_constant="add")
        if not is_training:
            # Same columns, same order as training; dummies missing from this split become 0.
            df = df.reindex(columns=reference_columns, fill_value=0)
        return df

    #Reading the data
    df = pd.read_csv(data)

    #Setting up variables and target
    fcols = list(df.columns)
    fcols.remove("Life_expectancy")
    fcols.remove("Country")
    X = df[fcols]
    y = df.Life_expectancy

    #Train-test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

    #Applying Feature Engineering Function
    X_train_fe = feature_engineering(X_train)

    #Specifying columns to use in case minimal version is used
    if custom:
        cols = custom #Allows users to use a custom set of columns with the model
    elif minimal: #Without private medical data
        cols = ['const', 'GDP_per_capita', 'Schooling', 'region_Asia', 'region_Central America and Caribbean', 'region_European Union','region_Middle East', 'region_North America',
                'region_Oceania','region_Rest of Europe', 'region_South America']

    else: #With private medical data
        cols = ['const','Adult_mortality','Under_five_deaths',
                'GDP_per_capita','region_Central America and Caribbean', 'region_European Union']

    #Creating and fitting the model
    lin_reg = sm.OLS(y_train, X_train_fe[cols])
    results = lin_reg.fit()

    #Using model to make predictions on training data and report Root Mean Squared Error
    y_pred = results.predict(X_train_fe[cols])
    train_rmse = rmse(y_train, y_pred)
    print(f"Training data RMSE:\t{train_rmse}")

    #Feature Engineering on test data, aligned to the training design matrix
    X_test_fe = feature_engineering(X_test, reference_columns=X_train_fe.columns)

    #Predicting on testing data and reporting RMSE
    y_test_pred = results.predict(X_test_fe[cols])
    test_rmse = rmse(y_test, y_test_pred)
    print(f"Testing data RMSE:\t{test_rmse}")

    return results


In [29]:
# Fit the "minimal" specification (no private medical data) and display its OLS summary.
results = linear_regression(data="Life Expectancy Data.csv", minimal=True)
results.summary()

Training data RMSE:	4.971241899293658
Testing data RMSE:	4.724580403246928


0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.722
Model:,OLS,Adj. R-squared:,0.72
Method:,Least Squares,F-statistic:,591.2
Date:,"Wed, 29 May 2024",Prob (F-statistic):,0.0
Time:,13:50:53,Log-Likelihood:,-6924.8
No. Observations:,2291,AIC:,13870.0
Df Residuals:,2280,BIC:,13930.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,53.2959,0.305,174.811,0.000,52.698,53.894
GDP_per_capita,0.1224,0.008,15.351,0.000,0.107,0.138
Schooling,0.9281,0.053,17.668,0.000,0.825,1.031
region_Asia,8.4562,0.359,23.572,0.000,7.753,9.160
region_Central America and Caribbean,10.9388,0.408,26.802,0.000,10.138,11.739
region_European Union,10.6629,0.467,22.836,0.000,9.747,11.579
region_Middle East,11.1179,0.450,24.732,0.000,10.236,11.999
region_North America,10.4344,0.895,11.661,0.000,8.680,12.189
region_Oceania,7.7332,0.494,15.665,0.000,6.765,8.701

0,1,2,3
Omnibus:,82.768,Durbin-Watson:,1.959
Prob(Omnibus):,0.0,Jarque-Bera (JB):,198.519
Skew:,0.171,Prob(JB):,7.8e-44
Kurtosis:,4.401,Cond. No.,208.0


In [22]:
# Fitted coefficient values as a plain list (values only, parameter names are dropped).
# NOTE(review): depends on `results` from the linear_regression(...) cell having been run first.
list(results.params)

[53.29586723292625,
 0.12243668415386055,
 0.928052406587341,
 8.456219692528556,
 10.938777802504932,
 10.662916648072365,
 11.117855051281126,
 10.434409816790037,
 7.733166882339386,
 8.775122941806782,
 11.260807581924542]

In [20]:
# Candidate feature sets (engineered column names) that can be passed as
# `custom=` to linear_regression. Presumably the suffixes record how each set
# was selected (stepwise / VIF filtering) — confirm with the authors.

vif_cols = [
    "Under_five_deaths", "Incidents_HIV", "GDP_per_capita", "Population_mln",
    "Thinness_ten_nineteen_years",
    "region_Asia", "region_Central America and Caribbean", "region_European Union",
    "region_Middle East", "region_North America", "region_Oceania",
    "region_Rest of Europe", "region_South America",
    "alcohol_exp", "const",
]

cols_stepwise = [
    "Adult_mortality", "Economy_status_Developed",
    "region_Central America and Caribbean", "region_South America",
    "Under_five_deaths", "GDP_per_capita",
    "region_Oceania", "region_European Union",
    "Schooling", "BMI", "Year", "Incidents_HIV", "Hepatitis_B", "const",
]

cols_stepwise_vif = [
    "region_Central America and Caribbean", "region_South America",
    "Under_five_deaths", "GDP_per_capita",
    "region_Oceania", "region_European Union",
    "Schooling", "Incidents_HIV", "const",
]

# NOTE(review): "gdp_per_capita_exp" and "alcohol_exp" are not created by the
# feature_engineering step inside linear_regression — confirm where they come from.
cols = [
    "const", "Year", "Infant_deaths", "Under_five_deaths", "Adult_mortality",
    "Alcohol_consumption", "Hepatitis_B", "Measles", "BMI", "Polio", "Diphtheria",
    "Incidents_HIV", "GDP_per_capita", "Population_mln",
    "Thinness_ten_nineteen_years", "Thinness_five_nine_years",
    "Schooling", "Economy_status_Developed", "Economy_status_Developing",
    "region_Asia", "region_Central America and Caribbean", "region_European Union",
    "region_Middle East", "region_North America", "region_Oceania",
    "region_Rest of Europe", "region_South America",
    "gdp_per_capita_exp", "alcohol_exp",
]

cols_ethical = [
    "const", "Year", "Alcohol_consumption", "GDP_per_capita", "Population_mln",
    "Schooling", "Economy_status_Developed", "Economy_status_Developing",
    "region_Asia", "region_Central America and Caribbean", "region_European Union",
    "region_Middle East", "region_North America", "region_Oceania",
    "region_Rest of Europe", "region_South America",
]

# Same members as cols_ethical, in a different order.
cols_ethical_stepwise = [
    "Schooling", "const", "GDP_per_capita",
    "region_Central America and Caribbean", "region_Middle East",
    "region_South America", "region_Asia",
    "Economy_status_Developed", "Economy_status_Developing",
    "region_Rest of Europe", "region_Oceania", "Year",
    "region_European Union", "region_North America",
    "Alcohol_consumption", "Population_mln",
]

cols_ethical_vif = [
    "GDP_per_capita", "Population_mln", "Economy_status_Developing",
    "region_Asia", "region_Central America and Caribbean", "region_European Union",
    "region_Middle East", "region_North America", "region_Oceania",
    "region_Rest of Europe", "region_South America", "const",
]

# NOTE(review): unlike every other set here, this one has no "const" entry,
# so a model built from it has no intercept — confirm this is intended.
cols_ethical_stepwise_vif = [
    "GDP_per_capita",
    "region_Central America and Caribbean", "region_Middle East",
    "region_South America", "region_Asia",
    "Economy_status_Developing",
    "region_Rest of Europe", "region_Oceania",
    "region_European Union", "region_North America",
    "Population_mln",
]

cols_sumye = [
    "const",
    "Under_five_deaths",
    "Adult_mortality",
    "GDP_per_capita",
    "Population_mln",
]

# Same entries as the default (non-minimal) column set inside linear_regression.
cols_hanaa = [
    "const", "Adult_mortality", "Under_five_deaths", "GDP_per_capita",
    "region_Central America and Caribbean", "region_European Union",
]

# Same entries as the `minimal=True` column set inside linear_regression.
cols_ethical_sumye = [
    "const", "GDP_per_capita", "Schooling",
    "region_Asia", "region_Central America and Caribbean", "region_European Union",
    "region_Middle East", "region_North America", "region_Oceania",
    "region_Rest of Europe", "region_South America",
]