## Utils

In [None]:
# use linear regression to fill in missing values among highly correlated columns in the dataset
import pandas as pd
from sklearn.linear_model import LinearRegression

def find_and_impute(x, y):
    """Fill the missing entries of ``y`` with a linear regression on ``x``.

    A single-feature ``LinearRegression`` is fitted on the rows where ``y``
    is present and is then used to predict ``y`` on the rows where it is
    missing.

    Parameters
    ----------
    x : pandas.Series or list-like
        Predictor values. ``LinearRegression`` does not accept NaN inputs,
        so ``x`` is expected to be free of missing values.
    y : pandas.Series or list-like
        Values to impute; rows are matched with ``x`` by index.

    Returns
    -------
    pandas.Series
        Series named ``'y'`` holding the known values followed by the
        imputed ones. Index labels are preserved, so assigning the result
        back to a DataFrame column realigns it with the original rows.
        When ``y`` has nothing missing, or nothing known to learn from,
        it is returned unchanged.
    """
    data = pd.DataFrame(columns=['x', 'y'])
    data['x'] = x
    data['y'] = y

    is_missing = data['y'].isnull()
    # Nothing to impute, or no observed value to fit on: the regression
    # cannot run on zero rows, so hand the column back untouched.
    if not is_missing.any() or is_missing.all():
        return data['y']

    known = data[~is_missing]
    # Explicit copy so the predictions are not written into a slice of `data`.
    missing = data[is_missing].copy()

    tiny_model = LinearRegression()
    tiny_model.fit(known[['x']], known['y'])
    missing['y'] = tiny_model.predict(missing[['x']])

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([known, missing])['y']

## Load data

In [None]:
# Read the raw CSV into a DataFrame
dataset = pd.read_csv('dataset/Life Expectancy Data.csv')

# Separate the target column from the feature columns
# (the column name is written with its trailing space)
target = ['Life expectancy ']
df_y = pd.DataFrame(dataset[target], columns=target)
df_x = dataset.drop(target, axis=1)
df_x.shape

## Data Processing

In [None]:
# Replace strings with integers: every distinct value gets the position of
# its first appearance in the column.
country_list = df_x['Country'].unique().tolist()
country_map = {name: code for code, name in enumerate(country_list)}

status_list = df_x['Status'].unique().tolist()
status_map = {name: code for code, name in enumerate(status_list)}

# Map each column through its own lookup table. A frame-wide replace()
# would apply both tables to every column and depends on pandas silently
# downcasting the resulting object columns to integers.
df_x = df_x.assign(
    Country=df_x['Country'].map(country_map),
    Status=df_x['Status'].map(status_map),
)

## Impute Null values

In [None]:
# Use each column's own median to fill in its missing values.
# fillna assigns the whole column back, avoiding chained assignment
# (df[col][mask] = value), which may write to a temporary copy.
for column in ['Schooling', 'Total expenditure', 'Polio', 'Diphtheria ']:
    df_x[column] = df_x[column].fillna(df_x[column].median())

In [None]:
# Impute each column from the column chosen as its predictor, using the
# linear-regression helper. Pairs are (column to fill, predictor column).
for incomplete, predictor in [
    ('Alcohol', 'Country'),
    ('Population', 'under-five deaths '),
    ('GDP', 'percentage expenditure'),
    ('Hepatitis B', 'Diphtheria '),
]:
    df_x[incomplete] = find_and_impute(df_x[predictor], df_x[incomplete])

## Remove Null Values

In [None]:
# Whatever is still missing falls back to the column median. The column is
# assigned back as a whole: chained assignment (df[col][mask] = value) can
# write to a temporary copy and leave the frame unchanged.
for column in df_x.columns:
    df_x[column] = df_x[column].fillna(df_x[column].median())

# Same median fill for the target column
df_y[target[0]] = df_y[target[0]].fillna(df_y[target[0]].median())

## Standardize Dataset

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the feature frame and scale it in one step
scalar = StandardScaler()
s = scalar.fit_transform(df_x)

# Wrap the scaled array back into a DataFrame with the original column names
std_df_x = pd.DataFrame(data=s, columns=df_x.columns)
std_df_x.head()

## Dimensionality Reduction

In [None]:
# # Importing PCA 
# from sklearn.decomposition import PCA 

# # Let's say, components = 20
# pca = PCA(n_components = 20) 
# pca.fit(std_df_x) 
# x_pca = pca.transform(std_df_x) 

# std_df_x=pd.DataFrame(x_pca)

## Train the model

In [None]:
# Split training and testing data: 25% of the rows are held out, and the
# fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    std_df_x, df_y[target[0]], test_size=0.25, random_state=55
)

# Use polynomial (higher order) features.
# Tune the degree parameter below; for me degree=2 worked the best without
# overfitting the data.
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2)
x_train = poly_features.fit_transform(x_train)
# Held-out data is only transformed: the expansion fitted on the training
# split is reused rather than refitted on the test split.
x_test = poly_features.transform(x_test)

# Linear regression on the polynomial features
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)

# Predict values, keeping the test index so predictions align with y_test
y_predict = pd.Series(model.predict(x_test), index=y_test.index)

## Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

# Score on the training split, then R2 and mean squared error on the test split
train_score = model.score(x_train, y_train)
test_r2 = r2_score(y_test, y_predict)
test_mse = MSE(y_test, y_predict)

print("Model Score: ", train_score)
print("R2 Score: ", test_r2)
print("Mean Squared error: ", test_mse)
# The test MSE expressed relative to the median of the test targets
print("Percentage Mean Squared error: ", 100*test_mse/y_test.median())

## Data Visualization

In [None]:
import matplotlib.pyplot as plt

# Figure defaults: 10x5 canvas on a dark theme
plt.rcParams['figure.figsize'] = [10, 5]
plt.style.use('dark_background')

# Squared error of every test prediction, plotted against its position
squared_error = (y_predict - y_test) ** 2
positions = range(y_predict.size)
plt.plot(positions, squared_error)
plt.xlabel("index")
plt.ylabel('squared error')
plt.title("MSE per index")
plt.show()

## Analyze large errors (deviation > 10)

In [None]:
# Flag the test rows whose absolute prediction error is larger than 10,
# then keep only the flagged ones.
high_error = (y_predict - y_test).abs() > 10
high_error = high_error[high_error]

# The index holds row labels, so look the rows up with .loc (not .iloc).
# The copy makes error_analysis its own frame, so adding columns does not
# write into a slice of dataset.
error_analysis = dataset.loc[high_error.index].copy()
error_analysis['error'] = high_error
error_analysis['predicted'] = y_predict.loc[high_error.index]
# NOTE: despite its name, this column holds the signed error
# (predicted - actual), not a mean squared error.
error_analysis['MSE'] = error_analysis['predicted'] - error_analysis['Life expectancy ']

In [None]:
# Show the per-row error column in ascending order
error_analysis['MSE'].sort_values()