In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.tools

In [22]:
df.describe()

Unnamed: 0,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
count,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0
mean,2007.5,30.363792,42.938268,192.251775,4.820882,84.292598,77.344972,25.032926,86.499651,86.271648,0.894288,11540.92493,36.675915,4.865852,4.899825,7.632123,0.206704,0.793296,68.856075
std,4.610577,27.538117,44.569974,114.910281,3.981949,15.995511,18.659693,2.193905,15.080365,15.534225,2.381389,16934.788931,136.485867,4.438234,4.525217,3.171556,0.405012,0.405012,9.405608
min,2000.0,1.8,2.3,49.384,0.0,12.0,10.0,19.8,8.0,16.0,0.01,148.0,0.08,0.1,0.1,1.1,0.0,0.0,39.4
25%,2003.75,8.1,9.675,106.91025,1.2,78.0,64.0,23.2,81.0,81.0,0.08,1415.75,2.0975,1.6,1.6,5.1,0.0,1.0,62.7
50%,2007.5,19.6,23.1,163.8415,4.02,89.0,83.0,25.5,93.0,93.0,0.15,4217.0,7.85,3.3,3.4,7.8,0.0,1.0,71.4
75%,2011.25,47.35,66.0,246.791375,7.7775,96.0,93.0,26.4,97.0,97.0,0.46,12557.0,23.6875,7.2,7.3,10.3,0.0,1.0,75.4
max,2015.0,138.1,224.9,719.3605,17.87,99.0,99.0,32.1,99.0,99.0,21.68,112418.0,1379.86,27.7,28.6,14.1,1.0,1.0,83.8


In [24]:
def linear_regression(data, minimal=False):

    #Feature Engineering Function
    def feature_engineering(df):
        df = df.copy()
        df = pd.get_dummies(df, columns=["Region"], drop_first=True, prefix="region", dtype=int)        #One Hot Encoding the region column
        df["GDP_per_capita"] = df["GDP_per_capita"] / 1000                                              #Convert GDP into easier to use units
        df = sm.add_constant(df)
        return df
    
    #Reading the data
    df = pd.read_csv(data)

    #Setting up variables and target
    fcols = list(df.columns)
    fcols.remove("Life_expectancy")
    fcols.remove("Country")
    X = df[fcols]
    y = df.Life_expectancy

    #Train-test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

    #Applying Feature Engineering Function
    X_train_fe = feature_engineering(X_train)

    #Specifying columns to use in case minimal version is used
    if minimal:
        cols = ['GDP_per_capita', 'region_Central America and Caribbean',
       'region_Middle East', 'region_South America', 'region_Asia',
       'Economy_status_Developing', 'region_Rest of Europe', 'region_Oceania',
       'region_European Union', 'region_North America', 'Population_mln']
    else:
        cols = ['region_Central America and Caribbean', 'region_South America',
       'Under_five_deaths', 'GDP_per_capita', 'region_Oceania',
       'region_European Union', 'Schooling', 'Incidents_HIV', 'const']

    #Creating and fitting the model
    lin_reg = sm.OLS(y_train, X_train_fe[cols])
    results = lin_reg.fit()

    #Using model to make predictions on training data and report Root Mean Squared Error
    y_pred = results.predict(X_train_fe[cols])
    train_rmse = statsmodels.tools.eval_measures.meanabs(y_train, y_pred)
    print(f"Training data RMSE:\t{train_rmse}")

    #Feature Engineering on test data to match training
    X_test_fe = feature_engineering(X_test)

    #Predicting on testing data and reporting RMSE
    y_test_pred = results.predict(X_test_fe[cols])
    test_rmse = statsmodels.tools.eval_measures.meanabs(y_test, y_test_pred)
    print(f"Testing data RMSE:\t{test_rmse}")

    return results


In [25]:
results = linear_regression("Life Expectancy Data.csv", minimal=False)
results.summary()

Training data RMSE:	1.9221466986344546
Testing data RMSE:	1.89114542280585


0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.933
Model:,OLS,Adj. R-squared:,0.932
Method:,Least Squares,F-statistic:,3954.0
Date:,"Tue, 28 May 2024",Prob (F-statistic):,0.0
Time:,14:19:14,Log-Likelihood:,-5298.5
No. Observations:,2291,AIC:,10610.0
Df Residuals:,2282,BIC:,10670.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
region_Central America and Caribbean,1.1516,0.177,6.496,0.000,0.804,1.499
region_South America,1.2171,0.212,5.749,0.000,0.802,1.632
Under_five_deaths,-0.1520,0.002,-77.204,0.000,-0.156,-0.148
GDP_per_capita,0.1058,0.004,27.685,0.000,0.098,0.113
region_Oceania,-1.1615,0.220,-5.271,0.000,-1.594,-0.729
region_European Union,0.9009,0.172,5.240,0.000,0.564,1.238
Schooling,-8.837e-05,0.029,-0.003,0.998,-0.056,0.056
Incidents_HIV,-0.9510,0.023,-41.817,0.000,-0.996,-0.906
const,74.7730,0.279,267.998,0.000,74.226,75.320

0,1,2,3
Omnibus:,46.445,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,96.688
Skew:,0.019,Prob(JB):,1.0100000000000002e-21
Kurtosis:,4.006,Cond. No.,357.0
