In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import seaborn as sns
from sklearn.model_selection import train_test_split


In [12]:
import statsmodels.api as sm
import statsmodels.tools

In [3]:
df = pd.read_csv("Life Expectancy Data.csv")

In [5]:
df.dtypes

Country                         object
Region                          object
Year                             int64
Infant_deaths                  float64
Under_five_deaths              float64
Adult_mortality                float64
Alcohol_consumption            float64
Hepatitis_B                      int64
Measles                          int64
BMI                            float64
Polio                            int64
Diphtheria                       int64
Incidents_HIV                  float64
GDP_per_capita                   int64
Population_mln                 float64
Thinness_ten_nineteen_years    float64
Thinness_five_nine_years       float64
Schooling                      float64
Economy_status_Developed         int64
Economy_status_Developing        int64
Life_expectancy                float64
dtype: object

In [7]:
fcols = list(df.columns)
fcols.remove("Life_expectancy")
fcols.remove("Country")

In [8]:
X = df[fcols]
y = df.Life_expectancy

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [25]:
def feature_engineering(df):
    df = df.copy()
    df = pd.get_dummies(df, columns=["Region"], drop_first=True, prefix="region", dtype=int)
    #df["gdp_per_capita_exp"] = np.exp(df["GDP_per_capita"], dtype=np.float64)
    df["alcohol_exp"] = np.exp(df["Alcohol_consumption"])
    df = sm.add_constant(df)
    return df

In [26]:
X_train_fe = feature_engineering(X_train)

In [29]:
feature_cols = ['const', 'Year', 'Infant_deaths', 'Under_five_deaths',
       'Adult_mortality', 'Alcohol_consumption', 'Hepatitis_B', 'Measles',
       'BMI', 'Polio', 'Diphtheria', 'Incidents_HIV', 'GDP_per_capita',
       'Population_mln', 'Thinness_ten_nineteen_years',
       'Thinness_five_nine_years', 'Schooling', 'Economy_status_Developed',
       'Economy_status_Developing', 'region_Asia',
       'region_Central America and Caribbean', 'region_European Union',
       'region_Middle East', 'region_North America', 'region_Oceania',
       'region_Rest of Europe', 'region_South America', 'alcohol_exp']

In [30]:
lin_reg = sm.OLS(y_train, X_train_fe[feature_cols])

In [31]:
results = lin_reg.fit()
results.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.984
Model:,OLS,Adj. R-squared:,0.984
Method:,Least Squares,F-statistic:,5365.0
Date:,"Tue, 28 May 2024",Prob (F-statistic):,0.0
Time:,09:46:29,Log-Likelihood:,-3650.9
No. Observations:,2291,AIC:,7356.0
Df Residuals:,2264,BIC:,7511.0
Df Model:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.7503,7.663,1.142,0.254,-6.277,23.777
Year,0.0357,0.006,6.231,0.000,0.024,0.047
Infant_deaths,-0.0521,0.006,-8.198,0.000,-0.065,-0.040
Under_five_deaths,-0.0510,0.004,-12.674,0.000,-0.059,-0.043
Adult_mortality,-0.0468,0.001,-75.767,0.000,-0.048,-0.046
Alcohol_consumption,0.0051,0.012,0.433,0.665,-0.018,0.028
Hepatitis_B,-0.0075,0.003,-2.912,0.004,-0.013,-0.002
Measles,0.0028,0.002,1.578,0.115,-0.001,0.006
BMI,-0.1347,0.023,-5.946,0.000,-0.179,-0.090

0,1,2,3
Omnibus:,8.686,Durbin-Watson:,1.97
Prob(Omnibus):,0.013,Jarque-Bera (JB):,10.018
Skew:,0.077,Prob(JB):,0.00668
Kurtosis:,3.285,Cond. No.,2760000000000000.0


In [32]:
y_pred = results.predict(X_train_fe[feature_cols])
rmse = statsmodels.tools.eval_measures.meanabs(y_train, y_pred)
print(rmse)

0.9490168522001526


In [33]:
X_test_fe = feature_engineering(X_test)
X_test_fe = X_test_fe[feature_cols]
y_test_pred = results.predict(X_test_fe)
rmse = statsmodels.tools.eval_measures.meanabs(y_test, y_test_pred)
print(rmse)

0.9409325826359322
