In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_root = '/kaggle/input'
for folder, _subdirs, files in os.walk(input_root):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import necessary Libraries and Modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every figure drawn afterwards.
# `set_theme` is the preferred spelling; `sns.set` is an alias for it.
sns.set_theme()

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Loading Raw Data

In [None]:
# Read the life-expectancy CSV from the Kaggle input directory and preview
# its last 20 rows.
raw_data_path = '../input/life-expectancy-who/Life Expectancy Data.csv'
raw_data = pd.read_csv(raw_data_path)
raw_data.tail(n=20)

# Check descriptive statistics

In [None]:
# Show the (rows, columns) dimensions of the raw dataset.
raw_data.shape

In [None]:
# Summary statistics for every column (include='all' covers non-numeric
# columns too), transposed so that each feature is shown as a row.
raw_data.describe(include='all').T

### Check the avg population by country

In [None]:
# Mean of the Population column for each country.
avg_pop_by_country = raw_data.groupby("Country")["Population"].mean()
# Limit rendered output to 50 rows; this is a global pandas display option.
pd.set_option('display.max_rows', 50)
avg_pop_by_country

# Check correlation of variables

In [None]:
# Pairwise correlation matrix of the numeric columns.
# Non-numeric columns (e.g. Country) are excluded explicitly: since pandas 2.0
# DataFrame.corr() no longer drops them silently and raises on text data.
raw_data.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
fig, ax = plt.subplots(figsize = (25, 25))

# Restrict to numeric columns before correlating: since pandas 2.0
# DataFrame.corr() raises on text columns instead of silently dropping them.
numeric_corr = raw_data.select_dtypes(include="number").corr()

sns.heatmap(ax=ax,
            data=numeric_corr,
            annot=True,
            cmap='coolwarm',
            annot_kws={'size': 16},
            robust=True)

plt.show()

# Check for Null Values

In [None]:
# Count the missing values in every column of the raw data and show them as a
# table with one row per feature plus a 'Total' row.
df_nv = pd.DataFrame(data=raw_data.isna().sum(), columns = ["Number of Null Values"]).reset_index()
df_nv.columns = ["Features", "Number of Null Values"]
# Name the index and value columns explicitly rather than leaving pivot_table
# to infer which columns to aggregate from a Series grouper.
df_nv.pivot_table(index="Features",
                  values="Number of Null Values",
                  aggfunc='sum',
                  margins=True,
                  margins_name='Total')

### Drop Year, which seems to be insignificant. Country (categorical) has many unique values and is dropped later, before modelling.

In [None]:
# Working frame: the raw data without the "Year" column (raw_data is left untouched).
data_1 = raw_data.drop(columns="Year")
data_1.head()

# Check for outliers by plotting a histogram of each variable's distribution

In [None]:
# Print the column dtypes and non-null counts of the working frame.
data_1.info()

In [None]:
# Show the columns of the working frame, leaving out "Country".
# Computed directly from data_1: the name `cols` is only assigned in a later
# cell, so referencing it here raises NameError on a fresh kernel.
[column for column in data_1.columns if column != "Country"]

In [None]:
# One histogram with a KDE overlay per column (all columns except "Country"),
# stacked vertically in a single tall figure.
cols = data_1.columns.tolist()
cols.remove("Country")
fig, ax = plt.subplots(nrows=len(cols), ncols=1, figsize=(10, 80))
for axis, column in zip(ax, cols):
    sns.histplot(data=data_1, x=column, kde=True, ax=axis)
    axis.set_xlabel(column, fontsize=12)

plt.show()

### Remove all spaces from the column header names

In [None]:
# Remove every space character from the column labels, so that for example
# "Adult Mortality" becomes "AdultMortality", then display the new labels.
data_1.columns = [label.replace(' ', '') for label in data_1.columns]
data_1.columns

In [None]:
# Impute missing values column by column.
# Each column listed here has its NaNs replaced by that column's own mean.
mean_imputed = [
    "Schooling",
    "BMI",
    "Alcohol",
    "AdultMortality",
    "Incomecompositionofresources",
    "HepatitisB",
    "Lifeexpectancy",
    "thinness1-19years",
    "GDP",
    "Polio",
    "Totalexpenditure",
    "thinness5-9years",
    "Population",
]
for column in mean_imputed:
    data_1[column] = data_1[column].fillna(data_1[column].mean())

# NOTE(review): Diphtheria is filled with 0 rather than its mean, unlike every
# other imputed column - confirm this is intentional.
data_1["Diphtheria"] = data_1["Diphtheria"].fillna(0)

In [None]:
# Preview the first rows of the working frame after imputation.
data_1.head()

In [None]:
# Re-count the missing values per column after imputation and show them as a
# table with one row per feature plus a 'Total' row.
df_nv_1 = pd.DataFrame(data=data_1.isnull().sum(), columns = ["Number of Null Values"]).reset_index()
df_nv_1.columns = ["Features", "Number of Null Values"]
# Name the index and value columns explicitly rather than leaving pivot_table
# to infer which columns to aggregate from a Series grouper.
df_nv_1.pivot_table(index="Features",
                    values="Number of Null Values",
                    aggfunc='sum',
                    margins=True,
                    margins_name='Total')

In [None]:
# Preview the first rows of the raw data; the imputation above was applied
# to data_1, not to raw_data.
raw_data.head()

In [None]:
# Scatter plot of each column (except "Country" and "Status") against
# Lifeexpectancy, one panel per column in a single tall figure.
cols = data_1.columns.tolist()
for excluded in ("Country", "Status"):
    cols.remove(excluded)
fig, ax = plt.subplots(nrows=len(cols), ncols=1, figsize=(7, 80))
for axis, column in zip(ax, cols):
    sns.scatterplot(data=data_1, x="Lifeexpectancy", y=column, ax=axis)
    axis.set_xlabel("Lifeexpectancy", fontsize=12)
    axis.set_ylabel(column, fontsize=12)

plt.show()

## Remove outliers using Quantile

99% Quantile
* infantdeaths
* percentageexpenditure
* Measles
* under-fivedeaths
* HIV/AIDS
* GDP
* thinness1-19years
* thinness5-9years

In [None]:
# Columns whose upper tail is trimmed at the 99th percentile.
multi_cols = [
    'infantdeaths',
    'percentageexpenditure',
    'Measles',
    'under-fivedeaths',
    'HIV/AIDS',
    'GDP',
    'thinness1-19years',
    'thinness5-9years',
]
# The filters are applied one after another, so each column's cutoff is
# computed on the rows that survived the previous columns' filters.
# Rows equal to the cutoff are dropped too (strict less-than).
for col in multi_cols:
    cutoff = data_1[col].quantile(0.99)
    data_1 = data_1.loc[data_1[col].lt(cutoff)]


In [None]:
# Preview the first rows that remain after the quantile filtering.
data_1.head()

In [None]:
# Renumber the remaining rows from 0; drop=True discards the old index labels
# instead of keeping them as a column.
data_cleaned = data_1.reset_index(drop=True)

In [None]:
# Summary statistics of the cleaned frame, including non-numeric columns.
data_cleaned.describe(include='all')

## Multicollinearity

In [None]:
# Variance inflation factor of every numeric predictor: the two text columns
# and the target (Lifeexpectancy) are excluded.
# NOTE(review): no intercept/constant column is added to `variables` before
# computing VIF - confirm this matches the intended statsmodels usage.
variables = data_cleaned.drop(columns=["Country", "Status", "Lifeexpectancy"])
feature_matrix = variables.values
vif = pd.DataFrame({
    "VIF": [
        variance_inflation_factor(feature_matrix, idx)
        for idx in range(feature_matrix.shape[1])
    ],
    "Features": variables.columns,
})
vif

In [None]:
# Add the natural logarithm of Population as a new column, "Log_pop".
data_cleaned["Log_pop"] = np.log(data_cleaned["Population"])

In [None]:
# Add the natural logarithm of GDP as a new column, "log_gdp".
data_cleaned["log_gdp"] = np.log(data_cleaned["GDP"])

In [None]:
# Preview the cleaned frame, now including the two log-transformed columns.
data_cleaned.head()

In [None]:
# Drop Population and GDP (their log-transformed versions are kept) and the
# Country text column.
columns_to_drop = ["Population", "GDP", "Country"]
data_new = data_cleaned.drop(columns=columns_to_drop)
data_new.head()

In [None]:
# One-hot encode the remaining text column(s); drop_first=True omits the first
# category of each encoded column.
# The dummy dtype is pinned to uint8, the default before pandas 2.0. Newer
# pandas emits bool dummies, and bool mixed with float columns turns `.values`
# into an object array, which the VIF computation below cannot process.
data_with_dummies = pd.get_dummies(data=data_new, drop_first=True, dtype="uint8")
data_with_dummies.head()

In [None]:
# List the column labels after one-hot encoding.
data_with_dummies.columns

In [None]:
# Dimensions of `variables` as built for the first VIF table above.
variables.shape

In [None]:
# Variance inflation factor of every predictor after the log transforms and
# one-hot encoding; only the target (Lifeexpectancy) is excluded.
variables = data_with_dummies.drop(columns=["Lifeexpectancy"], axis=1)
# Cast to a float matrix once, up front. With pandas >= 2.0 the dummy columns
# can be bool, and a bool/float mix would otherwise yield an object array that
# the VIF regression cannot process.
exog = variables.to_numpy(dtype=float)
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(exog, i) for i in range(exog.shape[1])]
vif["Features"] = variables.columns
vif

In [None]:
# Predictors: every column except the target and the two thinness columns.
non_features = ["Lifeexpectancy", "thinness1-19years", "thinness5-9years"]
x = data_with_dummies.drop(columns=non_features)
# Target: life expectancy.
y = data_with_dummies["Lifeexpectancy"]

In [None]:
# Standardise every predictor column.
# NOTE(review): the scaler is fit on all rows before the train/test split
# below, so test rows contribute to the scaling statistics.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=365)

In [None]:
# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(x_train, y_train)

In [None]:
# Score the fitted model on the training split (for scikit-learn regressors
# `score` reports the coefficient of determination, R^2).
model.score(x_train, y_train)

In [None]:
# Predictions on the training split.
y_hat = model.predict(x_train)

In [None]:
# Training split: actual target (x-axis) against predicted target (y-axis),
# with both axes starting at 0.
train_ax = sns.scatterplot(x=y_train, y=y_hat, alpha=0.5)
train_ax.set_xlim(0)
train_ax.set_ylim(0)
plt.show()

In [None]:
# Predictions on the held-out test split.
y_hat_test = model.predict(x_test)

In [None]:
# Test split: actual target (x-axis) against predicted target (y-axis),
# with both axes starting at 0.
test_ax = sns.scatterplot(x=y_test, y=y_hat_test)
test_ax.set_xlim(0)
test_ax.set_ylim(0)
plt.show()

In [None]:
# Distribution of the training residuals (actual minus predicted).
train_residuals = y_train - y_hat
sns.histplot(x=train_residuals, kde=True)

In [None]:
# Distribution of the test residuals (actual minus predicted).
test_residuals = y_test - y_hat_test
sns.histplot(x=test_residuals, kde=True)

In [None]:
# Test-set predictions (rounded to one decimal) next to the true values, with
# the residual and its absolute size as a percentage of the true value,
# ordered from the largest percentage error to the smallest.
residual_df = (
    pd.DataFrame({
        "Predicted Value": y_hat_test.round(1),
        "Original Value": y_test.to_numpy(),
    })
    .assign(**{
        "Residuals": lambda frame: frame["Original Value"] - frame["Predicted Value"],
        "Residuals %": lambda frame: (frame["Residuals"] / frame["Original Value"]).abs() * 100,
    })
    .sort_values(by="Residuals %", ascending=False)
)
# Raise the global display limit so that up to 1000 rows are rendered.
pd.set_option('display.max_rows', 1000)
residual_df