In [None]:
import numpy as np 
import pandas as pd 
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
from xgboost import XGBRegressor, train

In [None]:
# Load the life-expectancy CSV from the Kaggle input mount.
data_path = '/kaggle/input/life-expectancy-who/Life Expectancy Data.csv'
df = pd.read_csv(data_path)

# EDA

In [None]:
# Peek at the first five rows (5 is the default for head()).
df.head()

In [None]:
# Number of distinct values per column, printed under a small header.
unique_counts = df.nunique()
print("\t     Unique Values\n\n\n")
print(unique_counts)

In [None]:
# Remove the Country column; nothing further down in this notebook reads it.
df = df.drop(columns=['Country'])

In [None]:
# Descriptive statistics of the frame, shown as the cell output.
summary_stats = df.describe()
summary_stats

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

In [None]:
# Visualise where values are missing: one cell per (row, column), highlighted when null.
plt.figure(figsize=(12, 6))
sns.heatmap(df.isnull())
# tight_layout() only adjusts axes that already exist, so it has to run after
# the heatmap has been drawn rather than on the still-empty figure.
plt.tight_layout()

In [None]:
# Show the exact column labels. Several of them carry leading/trailing spaces
# (e.g. 'Life expectancy ', ' BMI ') and must be referenced exactly as spelled.
df.columns

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# Rows lacking a value in any of these columns are discarded outright;
# gaps in the other columns are left in place for now.
required_columns = [
    'Life expectancy ',
    'Adult Mortality',
    'Alcohol',
    ' BMI ',
    'Diphtheria ',
    ' thinness  1-19 years',
    ' thinness 5-9 years',
    'Polio',
]
df = df.dropna(subset=required_columns)

In [None]:
# Re-check the per-column missing counts after the row drop.
df.isna().sum(axis=0)

Several columns still contain missing values, and dropping every affected row would discard too much data, so let's fill the remaining gaps with KNN imputation instead.

In [None]:
# Split the frame into fully populated rows and rows that still have at least one gap.
given_values = df.dropna()
all_labels = set(df.index)
complete_labels = set(given_values.index)
# Index labels present in df but absent from the fully populated subset.
missing_values_index = list(all_labels - complete_labels)
missing_values = df.loc[missing_values_index]

In [None]:
# Print the frame summary (dtypes and non-null counts) before imputation.
df.info()

In [None]:
# 'Status' is left out of the imputation and re-attached separately.
numeric_part = df.drop(['Status'], axis=1)
imputer = KNNImputer(n_neighbors=2)
filled_array = imputer.fit_transform(numeric_part)
# No index is passed here, so the new frame gets a fresh 0..n-1 RangeIndex.
imputed_values = pd.DataFrame(filled_array, columns=numeric_part.columns)

In [None]:
# imputed_values carries a fresh 0..n-1 index, whereas df still has its original
# labels with gaps left by the earlier dropna. Assigning the Series directly
# would align on those labels, pairing rows with the wrong Status and leaving
# NaN where a label is absent. Row order is unchanged by the imputer, so copy
# the values positionally instead.
imputed_values['Status'] = df['Status'].to_numpy()

In [None]:
# Correlation heatmap of the imputed data.
plt.figure(figsize=(20, 10))
# Restrict to numeric columns: the string-valued 'Status' column makes
# DataFrame.corr() raise on pandas >= 2 (older versions silently skipped it).
numeric_corr = imputed_values.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

In [None]:
# One-hot encode 'Status'. get_dummies already returns a new DataFrame, so the
# extra pd.DataFrame(...) wrapper around it was redundant.
df2 = pd.get_dummies(data=imputed_values, columns=['Status'])

In [None]:
# Life expectancy against adult mortality, coloured by Status.
plt.figure(figsize=(12, 6))
sns.set_style('darkgrid')
sns.scatterplot(
    x='Adult Mortality',
    y='Life expectancy ',
    hue='Status',
    data=df,
)

In [None]:
# Distribution of life expectancy per year, split by Status.
plt.figure(figsize=(15, 8))
# x and y must be passed by keyword: current seaborn releases accept only
# `data` positionally, so the old positional (x, y) form raises a TypeError.
sns.boxplot(data=df, x='Year', y='Life expectancy ', hue='Status', palette="coolwarm")

In [None]:
# Life expectancy against four health-related features, one stacked panel each.
# A loop replaces four copy-pasted calls that differed only in the x column.
health_features = ['Alcohol', ' HIV/AIDS', 'Polio', 'Diphtheria ']
fig, axes = plt.subplots(len(health_features), 1, figsize=(20, 40))
for ax, feature in zip(axes, health_features):
    sns.scatterplot(data=df, y='Life expectancy ', x=feature, hue='Status',
                    palette="mako_r", alpha=0.5, ax=ax)

In [None]:
# Life expectancy against the two socio-economic features, side by side.
# A loop replaces two copy-pasted calls that differed only in the x column.
socio_features = ['Schooling', 'Income composition of resources']
fig, axes = plt.subplots(1, len(socio_features), figsize=(30, 10))
for ax, feature in zip(axes, socio_features):
    sns.scatterplot(data=df, y='Life expectancy ', x=feature, hue='Status',
                    palette="OrRd_r", alpha=0.5, ax=ax)

A closer look at Life Expectancy's correlation with other features.

In [None]:
# Correlation of every column in df2 with the target column.
life_expectancy_corr = df2.corr()['Life expectancy ']
life_expectancy_corr

Let's remove the features that are only weakly correlated with life expectancy.

In [None]:
# Start with no columns marked for removal.
columns_to_drop = list()

In [None]:
# Mark every column whose Pearson correlation with life expectancy lies strictly
# inside (-0.2, 0.35), i.e. is too weak to keep.
#
# The correlation matrix is computed once instead of once per column, and
# 'Status' is dropped before calling corr() because a string column makes
# DataFrame.corr() raise on pandas >= 2. The list is rebuilt from scratch, so
# re-running this cell no longer appends duplicate entries.
life_expectancy_corr = df.drop(['Status'], axis=1).corr()['Life expectancy ']
columns_to_drop = [
    col
    for col, corr_value in life_expectancy_corr.items()
    if -0.2 < corr_value < 0.35
]

In [None]:
# Remove the weakly correlated columns from the encoded frame.
df2 = df2.drop(columns=columns_to_drop)

In [None]:
# Separate the target column from the predictors.
target_column = 'Life expectancy '
X = df2.drop(columns=[target_column])
y = df2[target_column]

In [None]:
# Standardise every predictor and keep the original column labels.
# NOTE(review): the scaler is fitted on all rows, before the data is split
# into train/CV/test further down — confirm this is acceptable.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
X = pd.DataFrame(scaled_features, columns=X.columns)

# Models

In [None]:
# 60 % train, then the remaining 40 % is halved into test and CV (20 % each).
# random_state is fixed so a Restart & Run All reproduces the same partitions
# and therefore the same scores. The intermediate 40 % hold-out gets its own
# name instead of temporarily reusing X_CV / y_cv.
X_Train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.4, random_state=42)
X_Test, X_CV, y_test, y_cv = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=42)

# Linear Regression

In [None]:
# Plain least-squares baseline fitted on the training split.
# fit() returns the estimator itself, so the trailing expression still shows it.
lr = LinearRegression().fit(X_Train, y_train)
lr

In [None]:
# Linear regression: fit quality on the training split.
lr_train_pred = lr.predict(X_Train)
lr_train_evs = explained_variance_score(y_true=y_train, y_pred=lr_train_pred)
lr_train_rmse = np.sqrt(metrics.mean_squared_error(y_true=y_train, y_pred=lr_train_pred))
print(f"Explained Variance Score: {lr_train_evs}")
print(f"RMSE: {lr_train_rmse}")

### Performance on CV Set

In [None]:
# Linear regression: same two metrics on the CV split.
lr_cv_pred = lr.predict(X_CV)
lr_cv_evs = explained_variance_score(y_true=y_cv, y_pred=lr_cv_pred)
lr_cv_rmse = np.sqrt(metrics.mean_squared_error(y_true=y_cv, y_pred=lr_cv_pred))
print(f"Explained Variance Score: {lr_cv_evs}")
print(f"RMSE: {lr_cv_rmse}")

# Ridge Regression

In [None]:
# Grid-search the Ridge regularisation strength on the training split and keep
# the best refitted estimator.
ridge_alphas = [0.1, 0.3, 1, 3, 6, 8, 10]
gs_rr = GridSearchCV(Ridge(), param_grid={'alpha': ridge_alphas}, verbose=1)
gs_rr.fit(X_Train, y_train)

rr = gs_rr.best_estimator_

In [None]:
# Ridge: fit quality on the training split.
rr_train_pred = rr.predict(X_Train)
rr_train_evs = explained_variance_score(y_true=y_train, y_pred=rr_train_pred)
rr_train_rmse = np.sqrt(metrics.mean_squared_error(y_true=y_train, y_pred=rr_train_pred))
print(f"Explained Variance Score: {rr_train_evs}")
print(f"RMSE: {rr_train_rmse}")

### Performance on CV Set

In [None]:
# Ridge: same two metrics on the CV split.
rr_cv_pred = rr.predict(X_CV)
rr_cv_evs = explained_variance_score(y_true=y_cv, y_pred=rr_cv_pred)
rr_cv_rmse = np.sqrt(metrics.mean_squared_error(y_true=y_cv, y_pred=rr_cv_pred))
print(f"Explained Variance Score: {rr_cv_evs}")
print(f"RMSE: {rr_cv_rmse}")

# Lasso Regression

In [None]:
# Grid-search the Lasso regularisation strength on the training split.
lasso_alphas = [0.1, 0.3, 1, 3, 6, 8, 10]
gs_lr = GridSearchCV(Lasso(), param_grid={'alpha': lasso_alphas}, verbose=1)

gs_lr.fit(X_Train, y_train)

In [None]:
# Keep the estimator refitted with the best alpha found by the Lasso grid search.
lasso = gs_lr.best_estimator_

In [None]:
# Lasso: fit quality on the training split.
lasso_train_pred = lasso.predict(X_Train)
lasso_train_evs = explained_variance_score(y_true=y_train, y_pred=lasso_train_pred)
lasso_train_rmse = np.sqrt(metrics.mean_squared_error(y_true=y_train, y_pred=lasso_train_pred))
print(f"Explained Variance Score: {lasso_train_evs}")
print(f"RMSE: {lasso_train_rmse}")

### Performance on CV Set

In [None]:
# Lasso: same two metrics on the CV split.
lasso_cv_pred = lasso.predict(X_CV)
lasso_cv_evs = explained_variance_score(y_true=y_cv, y_pred=lasso_cv_pred)
lasso_cv_rmse = np.sqrt(metrics.mean_squared_error(y_true=y_cv, y_pred=lasso_cv_pred))
print(f"Explained Variance Score: {lasso_cv_evs}")
print(f"RMSE: {lasso_cv_rmse}")

# ANN

In [None]:
# Small fully connected regressor: four ReLU hidden layers (15, 10, 10, 5 units)
# followed by a single linear output unit, trained with Adam on mean squared error.
hidden_units = (15, 10, 10, 5)
ann_layers = [Dense(units, activation='relu') for units in hidden_units]
ann_layers.append(Dense(1))
ann = Sequential(ann_layers)
ann.compile(optimizer='Adam', loss='mse')

In [None]:
# Stop once val_loss has not improved for 10 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the network keeps the weights from the final epoch,
# which is `patience` epochs past the best one.
early_stop = EarlyStopping(monitor='val_loss', mode='min', patience=10,
                           restore_best_weights=True)

In [None]:
# Train for up to 500 epochs; the CV split supplies the validation loss that
# the early-stopping callback monitors.
train_features = X_Train.to_numpy()
train_target = y_train.to_numpy()
cv_pair = (X_CV.to_numpy(), y_cv.to_numpy())
ann.fit(
    x=train_features,
    y=train_target,
    validation_data=cv_pair,
    epochs=500,
    callbacks=[early_stop],
    verbose=1,
)

In [None]:
# Turn the history record stored on the model into a DataFrame.
# Presumably one row per epoch and one column per tracked quantity — verify.
error_ann = pd.DataFrame(ann.history.history)

In [None]:
# Plot every column of the recorded training history against its row index.
error_ann.plot()

In [None]:
# ANN: fit quality on the training split.
# Inference runs once and is reused for both metrics; calling ann.predict twice
# repeated the forward pass and its progress output for no benefit. The output
# is flattened so it has the same 1-D shape as y_train.
ann_train_pred = ann.predict(X_Train).ravel()
print(f"Explained Variance Score: {explained_variance_score(y_pred=ann_train_pred, y_true=y_train)}")
print(f"RMSE: {np.sqrt(metrics.mean_squared_error(y_pred=ann_train_pred, y_true=y_train))}")

### Performance on CV Set

In [None]:
# ANN: same two metrics on the CV split.
# Inference runs once and is reused for both metrics; calling ann.predict twice
# repeated the forward pass and its progress output for no benefit. The output
# is flattened so it has the same 1-D shape as y_cv.
ann_cv_pred = ann.predict(X_CV).ravel()
print(f"Explained Variance Score: {explained_variance_score(y_pred=ann_cv_pred, y_true=y_cv)}")
print(f"RMSE: {np.sqrt(metrics.mean_squared_error(y_pred=ann_cv_pred, y_true=y_cv))}")

# XGBoost

In [None]:
# Grid-search the XGBoost regressor with 3-fold CV on the training split.
#
# 'eta' and 'learning_rate' are two names for the same XGBoost parameter.
# Listing both as separate grid dimensions made each candidate set the step
# size twice with possibly conflicting values, so half of the fits were
# ambiguous. All four intended step sizes now live under 'learning_rate';
# the number of candidates (432) is unchanged.
xgb_param_grid = {
    'min_child_weight': [4, 6, 8],
    'max_depth': [8, 10, 12],
    'learning_rate': [0.01, 0.03, 0.1, 0.3],
    'reg_alpha': [0.1, 1, 3],
    'reg_lambda': [0.1, 1, 2, 3],
}
gs_xgb = GridSearchCV(XGBRegressor(booster='gbtree', subsample=0.75),
                      param_grid=xgb_param_grid,
                      verbose=3,
                      cv=3)
gs_xgb.fit(X_Train, y_train)

In [None]:
# Keep the estimator refitted with the best parameter combination from the XGBoost grid search.
xgb = gs_xgb.best_estimator_

In [None]:
# XGBoost: fit quality on the training split.
xgb_train_pred = xgb.predict(X_Train)
xgb_train_evs = explained_variance_score(y_true=y_train, y_pred=xgb_train_pred)
xgb_train_rmse = np.sqrt(metrics.mean_squared_error(y_true=y_train, y_pred=xgb_train_pred))
print(f"Explained Variance Score: {xgb_train_evs}")
print(f"RMSE: {xgb_train_rmse}")

In [None]:
# XGBoost: same two metrics on the CV split.
xgb_cv_pred = xgb.predict(X_CV)
xgb_cv_evs = explained_variance_score(y_true=y_cv, y_pred=xgb_cv_pred)
xgb_cv_rmse = np.sqrt(metrics.mean_squared_error(y_true=y_cv, y_pred=xgb_cv_pred))
print(f"Explained Variance Score: {xgb_cv_evs}")
print(f"RMSE: {xgb_cv_rmse}")

# Performance on Test Set

Let's use the XGBoost model since it had the best scores. 

In [None]:
# Final check of the XGBoost model on the test split.
xgb_test_pred = xgb.predict(X_Test)
xgb_test_evs = explained_variance_score(y_true=y_test, y_pred=xgb_test_pred)
xgb_test_rmse = np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=xgb_test_pred))
print(f"Explained Variance Score: {xgb_test_evs}")
print(f"RMSE: {xgb_test_rmse}")