## **IMPORTING NECESSARY MODULES**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# Report which TensorFlow version this kernel is running.
print('Tensorflow version',tf.__version__)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#!pip install rpy2

## **IMPORTING DATA**

In [None]:
# Display every column of wide frames instead of truncating them.
pd.set_option('display.max_columns', None)

# Read the WHO life-expectancy CSV from the Kaggle input directory and preview it.
DATA_PATH = '/kaggle/input/life-expectancy-who/Life Expectancy Data.csv'
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
print('the size of the dataset', data.shape)
print('the columns of the dataset ', list(data.columns))

In [None]:
# Normalise the raw column headers: several carry stray leading/trailing spaces,
# and the rest of the notebook relies on the identifier-style names given here.
# Each raw header appears exactly once (the original mapping listed " BMI " twice).
COLUMN_RENAMES = {
    "Life expectancy ": "Life_Expectancy",
    "Adult Mortality": "Adult_Mortality",
    "infant deaths": "Infant_Deaths",
    "percentage expenditure": "Percentage_Exp",
    "Hepatitis B": "HepatitisB",
    "Measles ": "Measles",
    " BMI ": "BMI",
    "under-five deaths ": "Under_Five_Deaths",
    "Diphtheria ": "Diphtheria",
    " HIV/AIDS": "HIV/AIDS",
    " thinness  1-19 years": "thinness_1to19_years",
    " thinness 5-9 years": "thinness_5to9_years",
    "Income composition of resources": "Income_Comp_Of_Resources",
    "Total expenditure": "Tot_Exp",
}
# Rebind instead of mutating in place; the resulting frame is the same.
data = data.rename(columns=COLUMN_RENAMES)

In [None]:
print('Number of Countries', len(data['Country'].unique()))

In [None]:
data.info()

In [None]:
data.Country.unique()

In [None]:
#Count of developing and developed countries
data.groupby('Status')["Status"].count()

In [None]:
data.loc[:, ["Life_Expectancy", "Alcohol"]].groupby(data['Country']).describe().loc[['Morocco','Germany','France','Spain'],:]

## **DATA VISUALIZATION**

In [None]:
#Distribution of the response variable
data['Life_Expectancy'].hist()

In [None]:
# Mean life expectancy per development status, drawn as a bar chart.
mean_by_status = data.groupby('Status')['Life_Expectancy'].mean()

plt.figure(figsize=(6,6))
plt.bar(mean_by_status.index, mean_by_status)
plt.xlabel("Status",fontsize=12)
plt.ylabel("Avg Life_Expectancy",fontsize=12)
plt.title("Life_Expectancy w.r.t Status")
plt.show()

In [None]:
# Box plot of Life_Expectancy, one box per selected country.
selected_rows = data[data.Country.isin(['Morocco','Germany','Spain','France'])]
selected_rows.boxplot(by='Country', column=['Life_Expectancy'], grid=False)

In [None]:
# Horizontal bars of mean Life_Expectancy for the selected countries,
# sorted ascending, drawn on the upper slot of a two-row figure grid.
fig = plt.figure()
ax1 = fig.add_subplot(211)
in_selection = data.Country.isin(['Morocco','Germany','Spain','France'])
mean_by_country = data[in_selection].groupby('Country')['Life_Expectancy'].mean()
mean_by_country.sort_values().plot(kind='barh', ax=ax1)

In [None]:
# Life_Expectancy w.r.t Year using bar plot.
# The yearly mean is computed once; its index supplies the x positions.
mean_by_year = data.groupby('Year')['Life_Expectancy'].mean()

plt.figure(figsize=(7,5))
plt.bar(mean_by_year.index, mean_by_year, color='red', alpha=0.65)
plt.xlabel("Year",fontsize=12)
plt.ylabel("Avg Life_Expectancy",fontsize=12)
plt.title("Life_Expectancy w.r.t Year")
plt.show()

In [None]:
# Using heatmap to observe correlations.
# Correlate the numeric columns only: Country and Status hold text, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# versions silently skipped them, which is the behaviour kept here).
cormat = data.select_dtypes(include='number').corr()
plt.figure(figsize=(15,15))
sns.heatmap(cormat, square=True, annot=True, linewidths=.5)
plt.title("Correlation matrix among variables")
plt.show()

# **Observation**
* **Life_Expectancy has a negative correlation with Adult_Mortality** 
* **Life_Expectancy has strong correlations with Schooling and Income composition of resources** 
* **Strong correlation between thinness_1to19_years and thinness_5to9_years** 
* **There is a non-negligible correlation between Life_Expectancy and BMI and body diseases** 

In [None]:
plt.scatter(data["thinness_5to9_years"], data["thinness_1to19_years"])
plt.title("thinness_1to19_years vs thinness_5to9_years")
plt.show()

In [None]:
# thinness_5to9_years is strongly correlated with thinness_1to19_years,
# so only the latter is kept.
data = data.drop(columns=['thinness_5to9_years'])

## **DEALING WITH MISSING DATA**

In [None]:
data.isnull().sum()

In [None]:
# Percentage of missing values in each column (null count over total row count).
data.isnull().sum() * 100 / len(data)

In [None]:
country_list = data.Country.unique()
fill_list = ['Life_Expectancy','Adult_Mortality','Alcohol','HepatitisB',
             'BMI','Polio','Tot_Exp','Diphtheria','GDP','Population','thinness_1to19_years','Income_Comp_Of_Resources','Schooling']

In [None]:
# Fill gaps in the fill_list columns by interpolating within each country's own
# rows (DataFrame.interpolate() with its defaults), so that values are never
# interpolated across two different countries.
for country in country_list:
    rows_of_country = data['Country'] == country
    data.loc[rows_of_country, fill_list] = data.loc[rows_of_country, fill_list].interpolate()

In [None]:
# Dropping rows with a null target variable.
# Select them by condition rather than by hard-coded row positions, so the cell
# stays correct if the input file or any earlier step changes the row order.
data = data.dropna(subset=['Life_Expectancy'])

In [None]:
def impute_col(row, col): #MCAR
    """Fill a missing ``col`` value of ``row`` with its country's mean of ``col``.

    Meant to be used row-wise: ``data.apply(impute_col, args=(col,), axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One record; must contain ``'Country'`` and ``col``.
    col : str
        Name of the column to impute.

    Returns
    -------
    pandas.Series
        ``row`` unchanged when ``row[col]`` is present; otherwise a copy whose
        ``col`` entry is the mean of ``col`` over the rows of the notebook-level
        ``data`` frame sharing the same ``Country``, rounded to 2 decimals.
        If that country has no observed value at all, the entry stays NaN.
    """
    if pd.isna(row[col]):
        # The mean is only computed for rows that need it, and only over that
        # country's rows, instead of a full groupby on every call.
        same_country = data['Country'] == row['Country']
        country_mean = np.round(data.loc[same_country, col].mean(), 2)
        # Work on a copy: mutating the object handed in by DataFrame.apply is
        # discouraged by pandas.
        row = row.copy()
        row[col] = country_mean
    return row

In [None]:
# Alcohol null values
data[np.isnan(data.Alcohol)]

In [None]:
# Replace missing Alcohol values with the per-country mean computed by impute_col.
data = data.apply(impute_col, axis=1, args=('Alcohol',))
# Remove every South Sudan row from the dataset.
data = data.loc[data['Country'] != 'South Sudan']

In [None]:
# Hepatitis B null values
data[np.isnan(data['HepatitisB'])]

In [None]:
data = data.drop(['HepatitisB'], axis=1)

In [None]:
#BMI null values
data[np.isnan(data['BMI'])]

In [None]:
data = data[data.Country != 'Sudan']

In [None]:
#Total expenditure null values
data = data.apply(impute_col, args=('Tot_Exp',) , axis=1)

In [None]:
data = data[~data['Country'].isin(["Democratic People's Republic of Korea","Somalia"])]

In [None]:
#GDP null values
data = data.apply(impute_col, args=('GDP',) , axis=1)

In [None]:
data = data.drop(['GDP'], axis=1)

In [None]:
#Population null values
data = data.apply(impute_col, args=('Population',) , axis=1)

In [None]:
data[np.isnan(data.Population)]['Country'].unique()

In [None]:
data = data.drop(['Population'], axis=1)

In [None]:
#Income_Comp_Of_Resources
data = data.apply(impute_col, args=('Income_Comp_Of_Resources',) , axis=1)

In [None]:
data = data.drop(['Income_Comp_Of_Resources'], axis=1)

In [None]:
#Schooling
# Schooling is dropped outright. Imputing it first (a slow row-wise apply that
# only touches the Schooling column) would not change the resulting frame.
data = data.drop(['Schooling'], axis=1)

In [None]:
data.isnull().sum()

# **OUTLIERS**

In [None]:
# Columns to inspect, in plotting order.
boxplot_columns = ['Life_Expectancy', 'Adult_Mortality',
                   'Infant_Deaths', 'Alcohol',
                   'Percentage_Exp', 'Measles',
                   'BMI', 'Under_Five_Deaths',
                   'Polio', 'Tot_Exp',
                   'Diphtheria', 'HIV/AIDS',
                   'thinness_1to19_years']

# Map each column name to its 1-based slot in the subplot grid.
col_dict = {name: slot for slot, name in enumerate(boxplot_columns, start=1)}

# Detect outliers in each variable using box plots (5 x 4 grid, one panel per column).
plt.figure(figsize=(20,30))

for variable, slot in col_dict.items():
    plt.subplot(5, 4, slot)
    plt.boxplot(data[variable])
    plt.title(variable)

plt.show()

In [None]:
data.head()

# **FEATURE SELECTION**

In [None]:
import scipy.stats as stats

# Two-sample t-test on Life_Expectancy: Developed vs Developing countries.
life_exp_developed = data.loc[data['Status']=='Developed','Life_Expectancy']
life_exp_developing = data.loc[data['Status']=='Developing','Life_Expectancy']
stats.ttest_ind(life_exp_developed, life_exp_developing)

**The p-value is < 0.05.
Therefore, the difference in Life Expectancy between Developed and Developing countries is significant. We can consider 'Status' as a feature.**

**Also "Adult mortality" : -0.7, "HIV/AIDS" : -0.56,  "BMI" : 0.57, "Polio" : 0.47, "GDP" : 0.46, "Alcohol" : 0.4, "thinness_1to19_years" : 0.45**

# **FEATURE ENGINEERING**

* **DUMMIFICATION OF STATUS**
* **NORMALIZING NUMERICAL FEATURES**

In [None]:
feature_df = data[['Country','Status','Adult_Mortality','Alcohol','HIV/AIDS','Polio','BMI', 'thinness_1to19_years','Life_Expectancy']]

In [None]:
# One-hot encode Status. With drop_first=True only the 'Developing' indicator
# remains (it is the column the later cells reference).
# dtype=int pins the indicator to 0/1 integers: pandas >= 2.0 would otherwise
# return booleans, which changes how the column is written to CSV and how it is
# treated by the formula-based model further down.
status_dummies = pd.get_dummies(feature_df['Status'], drop_first=True, dtype=int)
feature_df = pd.concat([feature_df, status_dummies], axis=1)
final = feature_df.drop('Status',axis=1)

In [None]:
# Min-max scale Adult_Mortality onto [0, 20]. The column min and max are
# computed once and applied vectorised, rather than being recomputed inside a
# per-element lambda (which made the cell quadratic in the number of rows).
adult_mortality = final['Adult_Mortality']
adult_mortality_min = adult_mortality.min()
adult_mortality_max = adult_mortality.max()
final['Adult_Mortality_scaled'] = (
    (adult_mortality - adult_mortality_min) / (adult_mortality_max - adult_mortality_min) * 20
)

In [None]:
# Min-max scale Polio onto [0, 20]. The column min and max are computed once
# and applied vectorised, rather than being recomputed inside a per-element
# lambda (which made the cell quadratic in the number of rows).
polio = final['Polio']
polio_min = polio.min()
polio_max = polio.max()
final['Polio_scaled'] = (polio - polio_min) / (polio_max - polio_min) * 20

In [None]:
final.to_csv('./final.csv',index = False) 

In [None]:
final = pd.read_csv('./final.csv')

In [None]:
final.head()

# **EMBED THE COUNTRY FEATURE**

In [None]:
countries = final.Country.unique()
country_dict = {'countries': list(countries)}
country_df = pd.DataFrame(country_dict)

In [None]:
def demo(feature_column):
    """Apply a feature column to ``country_dict`` and return the result as an array.

    Wraps ``feature_column`` in a ``layers.DenseFeatures`` layer, calls that
    layer on the notebook-level ``country_dict`` and returns ``.numpy()`` of
    the output.

    Note: the parameter name shadows the ``feature_column`` module imported at
    the top of the notebook, but only inside this function.
    """
    dense_features = layers.DenseFeatures(feature_column)
    dense_output = dense_features(country_dict)
    return dense_output.numpy()

In [None]:
countries = feature_column.categorical_column_with_vocabulary_list(
 'countries', country_df['countries'])

In [None]:
countries_embedding = feature_column.embedding_column(countries, dimension=4)

In [None]:
countries_embedding =  demo(countries_embedding)

In [None]:
# Build one embedding vector per row of `final`, looked up by that row's country.
# countries_embedding holds one vector per entry of country_dict['countries'],
# in the same order, so zipping the two gives a country -> vector lookup.
# This replaces repeating every vector a fixed 16 times, which was only correct
# when each country had exactly 16 contiguous rows.
embedding_by_country = dict(zip(country_dict['countries'], countries_embedding))
b = [embedding_by_country[country] for country in final['Country']]

In [None]:
final['countries_embedding'] = pd.Series(b)

In [None]:
final['sum_countries_embedding'] = final['countries_embedding'].apply(lambda x: x.sum())

In [None]:
final =  final.rename(columns={"HIV/AIDS": "hivaids"})

In [None]:
feature_df = ['sum_countries_embedding','Adult_Mortality_scaled','Alcohol','hivaids','Polio_scaled','BMI', 'thinness_1to19_years','Developing']

# **TRAINING**

## **LINEAR REGRESSION**

In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
x = final.loc[:,feature_df]
y = final.Life_Expectancy
model.fit(x, y)

In [None]:
print("Model slopes:    ", model.coef_)
print("Model intercept:", model.intercept_)

In [None]:
# In-sample predictions: x and y are the same data the model was fitted on.
# Predict on the DataFrame itself (not x.values) so the input carries the same
# feature names as at fit time and scikit-learn's mismatch warning is avoided.
y_predict = model.predict(x)
RMSE = np.sqrt(((y-y_predict)**2).values.mean())

# Results table, one row per method; later cells append to it.
results = pd.DataFrame({"Method": ["Linear Regression"], "RMSE": [RMSE]})
results

In [None]:
np.min(final['Life_Expectancy']), np.max(final['Life_Expectancy'])

In [None]:
from yellowbrick.regressor import ResidualsPlot
# Residuals of the fitted linear model on the data it was trained on.
visualizer = ResidualsPlot(estimator = model)
visualizer.fit(x, y) 
# show() is the current name of the rendering call; poof() is its deprecated alias.
visualizer.show()

**Residuals should be fairly symmetrically distributed, clustering towards the middle of the plot, around the lower single digits of the y-axis.
In general, there should be no clear patterns.**

## **MIXED EFFECT MODEL**

In [None]:
#!pip install -q statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Linear mixed-effects model specification.
# - The first formula lists the fixed effects for Life_Expectancy.
# - Rows are grouped by sum_countries_embedding, which is derived from the
#   per-country embedding and therefore serves as the country identifier
#   (assumes the sums are distinct for distinct countries -- TODO confirm).
md = smf.mixedlm("Life_Expectancy ~ Adult_Mortality_scaled + Alcohol + Polio_scaled + hivaids + BMI + thinness_1to19_years + Developing", 
                 final, 
                 groups=final["sum_countries_embedding"], re_formula="~Adult_Mortality_scaled + Alcohol + Polio_scaled + hivaids + BMI + thinness_1to19_years + Developing") 
# re_formula repeats every predictor so that each group (country) gets its own
# random slope for each of them.

In [None]:
mdf = md.fit(method=["lbfgs"])
print(mdf.summary())

In [None]:
performance = pd.DataFrame()
performance["residuals"] = mdf.resid.values
performance["predicted"] = mdf.fittedvalues

sns.lmplot(x = "predicted", y = "residuals", data = performance)

In [None]:
y_predict = mdf.fittedvalues
RMSE = np.sqrt(((y-y_predict)**2).values.mean())
results.loc[3] = ["Mixed_Random_Slopes", RMSE]
results

# **USING A NEURAL NETWORK**

In [None]:
final = pd.read_csv('./final.csv')

In [None]:
final.head()

In [None]:
final['Status'] = final['Developing'].map(lambda x: 'Developing' if x==1 else 'Developed')

In [None]:
# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
# Fixed seeds make the partition (and every metric computed from it)
# reproducible across kernel restarts.
train, test = train_test_split(final, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Turn a DataFrame into a batched ``tf.data.Dataset`` of (features, label).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``'Life_Expectancy'`` column, which becomes the label.
        The frame passed in is not modified (a copy is used).
    shuffle : bool, default True
        When true, shuffle with a buffer as large as the number of rows.
    batch_size : int, default 32
        Batch size applied after the optional shuffle.

    Returns
    -------
    The dataset built from ``(dict of remaining columns, labels)``, batched.
    """
    features = dataframe.copy()
    target = features.pop('Life_Expectancy')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [None]:
train_ds = df_to_dataset(train)
val_ds = df_to_dataset(val, shuffle=False, batch_size=16)
test_ds = df_to_dataset(test, shuffle=False, batch_size=16)

In [None]:
final_batch = next(iter(train_ds))[0]

In [None]:
def demo(feature_column):
    """Apply a feature column to ``final_batch`` and return the result as an array.

    Wraps ``feature_column`` in a ``layers.DenseFeatures`` layer, calls it on
    the notebook-level ``final_batch`` and returns ``.numpy()`` of the output.

    Note: this re-defines ``demo``; once this cell has run it replaces the
    earlier helper of the same name, which read ``country_dict`` instead.
    """
    dense_features = layers.DenseFeatures(feature_column)
    batch_output = dense_features(final_batch)
    return batch_output.numpy()

In [None]:
# numeric cols
# Start the feature list with one numeric column per continuous input.
numeric_headers = ['Alcohol', 'HIV/AIDS', 'Polio_scaled','BMI','thinness_1to19_years']
feature_columns = [feature_column.numeric_column(header) for header in numeric_headers]

In [None]:
# categorical cols
# Status is a two-value vocabulary column, wrapped in an indicator column
# before being added to the feature list.
status_vocab = feature_column.categorical_column_with_vocabulary_list(
      'Status', ['Developing', 'Developed'])
status = feature_column.indicator_column(status_vocab)
feature_columns.append(status)

In [None]:
# embedding columns
country = feature_column.categorical_column_with_vocabulary_list(
      'Country', final.Country.unique())
country_embedding = feature_column.embedding_column(country, dimension=6)
feature_columns.append(country_embedding)

In [None]:
#Input layer
# DenseFeatures turns the feature_columns list (the numeric, indicator and
# embedding columns appended in the preceding cells) into the network input.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
#Model architecture
# Fully connected regressor: four ReLU hidden layers (128 -> 64 -> 32 -> 16)
# followed by a single output unit with no activation.
# Note: this rebinds `model`, which previously held the LinearRegression estimator.
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(64, activation='relu'),
  layers.Dense(32, activation='relu'), 
  layers.Dense(16, activation='relu'), 
  layers.Dense(1)
])
# Optimise mean squared error with Adam (0.001 passed as its first argument)
# and track root mean squared error as a metric.
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(0.001), metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [None]:
#Fitting
history = model.fit(train_ds, validation_data=val_ds, epochs=40)

In [None]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

In [None]:
# Model predictions on the test dataset, plus the matching true labels gathered
# batch by batch (test_ds is built with shuffle=False, so the order lines up).
predictions = model.predict(test_ds)
y = np.concatenate([labels for _, labels in test_ds], axis=0)

In [None]:
compare = pd.DataFrame({'predictions':predictions.reshape((-1,)), 'True': y})
compare.tail()

In [None]:
model_err = model.evaluate(test_ds)

## **IN THIS PARTICULAR CASE, THE MIXED EFFECT MODEL IS SUPERIOR TO BOTH NORMAL LINEAR REGRESSION AND THE NEURAL NETWORK ARCHITECTURE USED HERE, SINCE IT TAKES INTO ACCOUNT THE DEPENDENCE IN THE DATA.**