# <font color=darkblue> Importing Packages

In [None]:
!pip install dltk_ai

In [None]:
import os #file handling
import dltk_ai
from dltk_ai.dataset_types import Dataset     #importing datasets
from dltk_ai import visualization as vs #importing visualizations
from dltk_ai import preprocessor       #importing preprocessor
import json
from sklearn import datasets #Machine Learning 
import numpy as np #Numerical
import seaborn as sns #plot
import matplotlib.pyplot as plt #plot
from sklearn.model_selection import train_test_split #ML

# <font color=darkblue> Data

Loading the dataset and splitting it into 2 parts, train and test

In [None]:
import numpy as np
import pandas as pd
# Read the life-expectancy table and hold out 20% of the rows for the final evaluation.
df = pd.read_csv('../input/life-expectancy-who/Life Expectancy Data.csv')
data, test_data = train_test_split(df, random_state=42, test_size=0.2)
# Only complete rows are written to the hold-out file.
test_data = test_data.dropna()
test_data.to_csv('Life Expectancy Data test.csv', index=False)

# <font color=darkblue> Sneak Peek at Dataset

Let's see what our data looks like.

In [None]:
data.head()

In [None]:
data.describe()

We can see that the data does not follow a normal distribution, and that it also contains nulls.

In [None]:
data.info()

In [None]:
data.isnull().sum()

Since we have a lot of nulls, we cannot simply drop them

In [None]:
print(data.columns)
print(data.shape)

# <font color = 'darkblue'> Data preprocessing and Feature analysis

## <font color = green> Correlation Analysis

Heatmap lets us see how different values are correlated, in a visual manner.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Correlate the numeric columns only: 'Country' and 'Status' hold strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
plt.figure(figsize = (20,15))
sns.heatmap(data.select_dtypes(include='number').corr(),annot = True)

## <font color = green> Handling Outliers and missing values

To fill the null values, imputation, grouped mean and dropping were used

Filling the nulls of 'Adult Mortality' and 'Life expectancy ', which are very few in number, with the column mean

In [None]:
# Fill the missing 'Adult Mortality' and 'Life expectancy ' values with the mean
# of the respective column.
for column in ('Adult Mortality', 'Life expectancy '):
    data[column] = data[column].fillna(value=data[column].mean())

For imputation, the correlation between columns was examined first. For example, suppose column X is highly correlated with column Y: if column X has null values and column Y is fully populated, then column X can be filled using column Y.

If we look at the scatter plot of X and Y, we can see where the majority of points fall and what the relationship is.

In [None]:
sns.scatterplot(data=data,x='Life expectancy ',y='Schooling')

In [None]:
# Imputing missing values of 'Schooling' column 
def impute_schooling(c):
    """Return the 'Schooling' value for one row, imputing it when it is missing.

    Parameters
    ----------
    c : sequence of two values
        ``(Schooling, Life expectancy)`` of a single row, in that order
        (a row of ``data[['Schooling', 'Life expectancy ']]`` when used with
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    float
        The schooling value itself when it is present. Otherwise the value
        assigned to the life-expectancy band the row falls in. When the life
        expectancy is missing as well, the (missing) schooling value is
        returned unchanged.
    """
    # Unpack by position instead of c[0] / c[1]: treating integer keys as
    # positions on a string-labelled Series is deprecated in recent pandas.
    s, l = c
    if not pd.isnull(s) or pd.isnull(l):
        return s
    # (inclusive upper bound of the life-expectancy band, schooling value to use).
    # The (44, 50] band includes 50, so a life expectancy of exactly 50 no longer
    # falls between two bands.
    bands = ((40, 8.0), (44, 7.5), (50, 8.1), (60, 8.2), (70, 10.5), (80, 13.4))
    for upper, fill in bands:
        if l <= upper:
            return fill
    return 16.5  # life expectancy above 80
    
data['Schooling']=data[['Schooling','Life expectancy ']].apply(impute_schooling,axis=1)

In [None]:
sns.scatterplot(data=data,x='Alcohol',y='Schooling')

In [None]:
# Imputing missing values of 'Alcohol' column 
def impute_Alcohol(cols):
    """Return the 'Alcohol' value for one row, imputing it when it is missing.

    Parameters
    ----------
    cols : sequence of two values
        ``(Alcohol, Schooling)`` of a single row, in that order.

    Returns
    -------
    float
        The alcohol value itself when it is present. Otherwise the value
        assigned to the schooling band the row falls in. When schooling is
        missing as well, the (missing) alcohol value is returned unchanged
        instead of an implicit ``None``.
    """
    # Positional unpacking works for a labelled Series row as well as for a
    # plain list/tuple, without relying on deprecated integer-key lookups.
    al, sc = cols
    if not pd.isnull(al) or pd.isnull(sc):
        return al
    # (inclusive upper bound of the schooling band, alcohol value to use)
    bands = ((2.5, 4.0), (5.0, 1.5), (7.5, 2.5), (10.0, 3.0), (15, 4.0))
    for upper, fill in bands:
        if sc <= upper:
            return fill
    return 10.0  # schooling above 15
    
data['Alcohol']=data[['Alcohol','Schooling']].apply(impute_Alcohol,axis=1)

In [None]:
sns.scatterplot(data=data,x='Income composition of resources',y='Life expectancy ')

In [None]:
# Imputing missing values of ''Income composition of resources'' column 
def impute_Income(c):
    """Return 'Income composition of resources' for one row, imputing it when missing.

    Parameters
    ----------
    c : sequence of two values
        ``(Income composition of resources, Life expectancy)`` of a single
        row, in that order.

    Returns
    -------
    float
        The income value itself when it is present. Otherwise the value
        assigned to the life-expectancy band the row falls in. When the life
        expectancy is missing as well, the (missing) income value is returned
        unchanged instead of an implicit ``None``.
    """
    # Positional unpacking avoids deprecated integer-key lookups on a
    # string-labelled Series row.
    i, l = c
    if not pd.isnull(i) or pd.isnull(l):
        return i
    # (inclusive upper bound of the life-expectancy band, income value to use)
    bands = ((40, 0.4), (50, 0.42), (60, 0.402), (70, 0.54), (80, 0.71))
    for upper, fill in bands:
        if l <= upper:
            return fill
    return 0.88  # life expectancy above 80
        
data['Income composition of resources']=data[['Income composition of resources','Life expectancy ']].apply(impute_Income,axis=1)


In [None]:
sns.scatterplot(data=data,x=' BMI ',y='Life expectancy ')

Creating a function for handling outliers in certain columns. The function calculates the z-score of every value in the passed column; if the absolute z-score is greater than the threshold, we replace the value with the group (country) mean.

In [None]:
def outlier_replace(col, df=None, threshold=3):
    """Replace per-country outliers of ``col`` with that country's mean, in place.

    A value is an outlier when the absolute z-score within its own country
    (population standard deviation, missing values ignored) exceeds
    ``threshold``. Countries whose standard deviation is 0 are left untouched.

    The previous implementation assigned the replacement to its loop variable
    only, so the frame was never modified; this version writes the result back.

    Parameters
    ----------
    col : str
        Name of the numeric column to clean.
    df : pandas.DataFrame, optional
        Frame with a 'Country' column and ``col``. Defaults to the
        notebook-level ``data`` frame.
    threshold : float, default 3
        Absolute z-score above which a value is replaced.

    Returns
    -------
    None
        ``df[col]`` is updated in place.
    """
    if df is None:
        df = data  # notebook-level training frame
    per_country = df.groupby('Country')[col]
    # transform() broadcasts each country's statistic back onto its rows, so the
    # statistics are computed once per country rather than once per value.
    group_mean = per_country.transform('mean')
    group_std = per_country.transform(lambda values: values.std(ddof=0))
    z_score = (df[col] - group_mean) / group_std
    # Comparisons with NaN are False, so missing values are never flagged.
    is_outlier = (group_std != 0) & (z_score.abs() > threshold)
    df[col] = df[col].mask(is_outlier, group_mean)

The other columns were filled with the group mean.

If a country has more than 10 nulls in a column, those nulls are filled with the mean of the whole column; otherwise they are filled with the mean of that column for that country.

In [None]:
# Rows without a target value cannot be used for training.
data = data.dropna(subset=['Life expectancy '])

# The values depend on the country, so missing values and outliers of some
# columns are handled per country.
countries = data['Country'].unique()
groups = data.groupby('Country')

# Countries with more than 10 missing GDP values: too few observations to trust
# the country mean, so these are filled with the overall mean instead.
gdp_nulls_per_country = data['GDP'].isna().groupby(data['Country']).sum()
gdpnull_c = gdp_nulls_per_country[gdp_nulls_per_country > 10].index.tolist()
in_sparse_country = data['Country'].isin(gdpnull_c)

# All other countries: fill each gap with the country's own mean GDP.
# The assignment goes through data['GDP'] = ... (not the chained
# data['GDP'][mask] = ...), so it always writes to `data` itself.
country_mean = groups['GDP'].transform('mean')
data['GDP'] = data['GDP'].fillna(country_mean.where(~in_sparse_country))

# Sparse countries: fill with the mean GDP of the entire dataframe.
data['GDP'] = data['GDP'].mask(in_sparse_country & data['GDP'].isna(), data['GDP'].mean())

# Replace outliers with the mean of the respective country.
outlier_replace('GDP')

# Countries without any GDP observation have no country mean to fall back on;
# their rows are still null at this point and are dropped.
data = data.dropna(subset=['GDP'])

In [None]:
# 'Hepatitis B' outlier and null analysis (same approach as for 'GDP'):
# collect the countries that have more than 10 missing values.
countries = data['Country'].unique()
groups = data.groupby('Country')
gnull_c = [
    country for country in countries
    if groups.get_group(country)['Hepatitis B'].isna().sum() > 10
]

In [None]:
# Treat outliers of 'Hepatitis B' first, then fill the gaps.
outlier_replace('Hepatitis B')

in_sparse_country = data['Country'].isin(gnull_c)

# Countries not in gnull_c: fill nulls with the country's own mean.
# Assigning the whole column (instead of the chained data[col][mask] = ...)
# guarantees that `data` itself is updated.
country_mean = data.groupby('Country')['Hepatitis B'].transform('mean')
data['Hepatitis B'] = data['Hepatitis B'].fillna(country_mean.where(~in_sparse_country))

# Countries in gnull_c: fill nulls with the mean of the entire dataframe.
data['Hepatitis B'] = data['Hepatitis B'].mask(
    in_sparse_country & data['Hepatitis B'].isna(), data['Hepatitis B'].mean())

# Rows that could not be filled (no country mean available) are dropped.
data = data.dropna(subset=['Hepatitis B'])

In [None]:


column = 'Total expenditure'

# Rebuild the per-country helpers: rows have been dropped since they were last
# computed, so the earlier `countries` / `groups` no longer match `data`.
countries = data['Country'].unique()
groups = data.groupby('Country')

# Countries with more than 10 missing values are filled from the overall mean.
nulls_per_country = data[column].isna().groupby(data['Country']).sum()
gnull_c = nulls_per_country[nulls_per_country > 10].index.tolist()
in_sparse_country = data['Country'].isin(gnull_c)

outlier_replace(column)

# Other countries: fill nulls with the country's own mean. Whole-column
# assignment replaces the chained data[col][mask] = ... form, which is not
# guaranteed to write back to `data`.
country_mean = data.groupby('Country')[column].transform('mean')
data[column] = data[column].fillna(country_mean.where(~in_sparse_country))

# Sparse countries: fill nulls with the mean of the entire dataframe.
data[column] = data[column].mask(in_sparse_country & data[column].isna(), data[column].mean())

# Rows that could not be filled are dropped.
data = data.dropna(subset=[column])

## Further outlier removal and missing values

In [None]:
sns.scatterplot(x=' BMI ',y=' thinness  1-19 years',data=data)

In [None]:
# Another imputation technique

data = data.drop(' thinness 5-9 years',axis = 1)
def impute_BMI(c):
    """Return the ' BMI ' value for one row, imputing it when it is missing.

    Parameters
    ----------
    c : sequence of two values
        ``(BMI, Life expectancy)`` of a single row, in that order.

    Returns
    -------
    float
        The BMI value itself when it is present. Otherwise the value assigned
        to the life-expectancy band the row falls in. When the life expectancy
        is missing as well, the (missing) BMI value is returned unchanged
        instead of an implicit ``None``.
    """
    # Positional unpacking avoids deprecated integer-key lookups on a
    # string-labelled Series row.
    b, l = c
    if not pd.isnull(b) or pd.isnull(l):
        return b
    # (inclusive upper bound of the life-expectancy band, BMI value to use)
    bands = ((50, 25.0), (60, 25.0), (70, 32.0), (80, 46.8))
    for upper, fill in bands:
        if l <= upper:
            return fill
    # Open-ended top band, like the other imputation helpers; previously values
    # above 100 fell through every branch and produced None.
    return 60.0
    
data[' BMI ']=data[[' BMI ','Life expectancy ']].apply(impute_BMI,axis=1)

In [None]:
sns.scatterplot(x='Population',y='infant deaths',data=data)

In [None]:
def impute_population(c):
    """Return the 'Population' value for one row, imputing it when it is missing.

    Parameters
    ----------
    c : sequence of two values
        ``(Population, infant deaths)`` of a single row, in that order.

    Returns
    -------
    float
        The population itself when it is present. Otherwise the value assigned
        to the infant-deaths band the row falls in. When infant deaths is
        missing as well, the (missing) population value is returned unchanged
        instead of an implicit ``None``.
    """
    # Positional unpacking avoids deprecated integer-key lookups on a
    # string-labelled Series row.
    p, i = c
    if not pd.isnull(p) or pd.isnull(i):
        return p
    # (inclusive upper bound of the infant-deaths band, population to use)
    bands = (
        (100, 0.19 * 10**9),
        (250, 0.18 * 10**9),
        (350, 0.02 * 10**9),
        (900, 0.1 * 10**9),
        (1100, 0.18 * 10**9),
        (1250, 0.05 * 10**9),
        (1500, 0.19 * 10**9),
        (1750, 0.05 * 10**9),
    )
    for upper, fill in bands:
        if i <= upper:
            return fill
    return 0.1 * 10**9  # more than 1750 infant deaths
data['Population']=data[['Population','infant deaths']].apply(impute_population,axis=1)

In [None]:
sns.scatterplot(data=data,x=' thinness  1-19 years',y=' BMI ')

In [None]:
def impute_Thin_1(c):
    """Return ' thinness  1-19 years' for one row, imputing it when it is missing.

    Parameters
    ----------
    c : sequence of two values
        ``(thinness 1-19 years, BMI)`` of a single row, in that order.

    Returns
    -------
    float
        The thinness value itself when it is present. Otherwise the value
        assigned to the BMI band the row falls in. When BMI is missing as
        well, the (missing) thinness value is returned unchanged instead of an
        implicit ``None``.
    """
    # Positional unpacking avoids deprecated integer-key lookups on a
    # string-labelled Series row.
    t, b = c
    if not pd.isnull(t) or pd.isnull(b):
        return t
    # (inclusive upper bound of the BMI band, thinness value to use)
    bands = ((10, 5.0), (20, 10.0), (30, 8.0), (40, 6.0), (50, 3.0), (70, 4.0))
    for upper, fill in bands:
        if b <= upper:
            return fill
    return 1.0  # BMI above 70
    
data[' thinness  1-19 years']=data[[' thinness  1-19 years',' BMI ']].apply(impute_Thin_1,axis=1)

In [None]:

column = 'Polio'

countries = data['Country'].unique()
groups = data.groupby('Country')

# Countries with more than 10 missing values are filled from the overall mean.
nulls_per_country = data[column].isna().groupby(data['Country']).sum()
gnull_c = nulls_per_country[nulls_per_country > 10].index.tolist()
in_sparse_country = data['Country'].isin(gnull_c)

outlier_replace(column)

# Other countries: fill nulls with the country's own mean. Whole-column
# assignment replaces the chained data[col][mask] = ... form, which is not
# guaranteed to write back to `data`.
country_mean = data.groupby('Country')[column].transform('mean')
data[column] = data[column].fillna(country_mean.where(~in_sparse_country))

# Sparse countries: fill nulls with the mean of the entire dataframe.
data[column] = data[column].mask(in_sparse_country & data[column].isna(), data[column].mean())

# Rows that could not be filled are dropped.
data = data.dropna(subset=[column])

In [None]:

column = 'Diphtheria '

countries = data['Country'].unique()
groups = data.groupby('Country')

# Countries with more than 10 missing values are filled from the overall mean.
nulls_per_country = data[column].isna().groupby(data['Country']).sum()
gnull_c = nulls_per_country[nulls_per_country > 10].index.tolist()
in_sparse_country = data['Country'].isin(gnull_c)

outlier_replace(column)

# Other countries: fill nulls with the country's own mean. Whole-column
# assignment replaces the chained data[col][mask] = ... form, which is not
# guaranteed to write back to `data`.
country_mean = data.groupby('Country')[column].transform('mean')
data[column] = data[column].fillna(country_mean.where(~in_sparse_country))

# Sparse countries: fill nulls with the mean of the entire dataframe.
data[column] = data[column].mask(in_sparse_country & data[column].isna(), data[column].mean())

# Rows that could not be filled are dropped.
data = data.dropna(subset=[column])

In [None]:
life = data['Life expectancy ']

## More Exploration 

Creating a separate dataframe for plotting, so that the original dataframe doesn't get modified

In [None]:
# Plotting frame without the categorical columns. `drop` returns a new frame,
# so `data` itself is left untouched.
dataplt = data.drop(['Status', 'Country'], axis=1)

Dropping the categorical data

A pairplot, helps in understanding all the feature correlation in one single frame, which is useful for feature analysis.

In [None]:
#sns.pairplot(dataplt,palette='flare')

Boxplots are helpful in understanding outliers and central tendency

In [None]:
# One box plot per column on a 3-column grid. The number of rows is derived
# from the frame, so the grid always has room for every column.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(dataplt.columns) // n_grid_cols))  # ceiling division
fig, axs = plt.subplots(ncols=n_grid_cols, nrows=n_grid_rows, figsize=(30, 30))
axs = axs.flatten()
for index, k in enumerate(dataplt.columns):
    sns.boxplot(y=k, data=dataplt, ax=axs[index],color = '#7c4780')
# Hide the grid cells that did not receive a column.
for unused_ax in axs[len(dataplt.columns):]:
    unused_ax.set_visible(False)

We can see that there are many outliers, but since we have already done outlier detection, we can say these are true values, and dropping them can mess with data.

A distribution plot is helpful for analysing the distribution that a certain attribute holds.

Here we are looping through all the columns and making their distplot.

In [None]:
# Draw one distribution plot per column of the plotting frame.
for cols in list(dataplt.columns):
    column_values = dataplt[cols]
    sns.displot(column_values, color='#7c4780')
    plt.title('Distribution Plot of ' + cols)
    plt.show()

Plotting highly correlated data

Income Composition and schooling

In [None]:
sns.set_style("whitegrid")

# Schooling against income composition; hue encodes the year and the marker
# size encodes life expectancy.
cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)
relplot_options = dict(
    data=dataplt,
    color='#7c4780',
    x="Schooling",
    y="Income composition of resources",
    hue="Year",
    size="Life expectancy ",
    palette=cmap,
    sizes=(10, 200),
)
g = sns.relplot(**relplot_options)

# Thin minor grid lines on both axes, no left/bottom spines.
for axis in (g.ax.xaxis, g.ax.yaxis):
    axis.grid(True, "minor", linewidth=.25)
g.despine(left=True, bottom=True)

We can see a nice relation between both

Schooling and Life Expectancy:

This also follows a clear relationship. One possible explanation is that schooling increases general awareness, including awareness about health, which in turn leads to a higher life expectancy.

In [None]:
sns.set_style("darkgrid")

g = sns.jointplot(y="Schooling", x="Life expectancy ", data=dataplt,
                  kind="reg", truncate=False,
                  color="#7c4780", height=7)

Income Composition and Life expectancy

The linear relationship may arise because, as income increases, health facilities become better, and hence life expectancy improves.

In [None]:

g = sns.jointplot(y="Income composition of resources", x="Life expectancy ", data=dataplt,
                  kind="reg", truncate=False,
                  color="#7c4780", height=7)

# Pre Model Data Processing

Dropping the categorical data.

And since dltk_ai can take up to 20 parameters, it is better to drop this column

In [None]:
## Drop the categorical 'Country' column (the scaling itself happens in a later cell)
data = data.drop('Country',axis = 1)

Dropping categorical variable and target variable

In [None]:
I = data
from sklearn.preprocessing import MinMaxScaler
I = I.drop(['Status','Life expectancy '],axis = 1)

Using MinMax scaler to scale down the data to a normalized form

$x_i' = \dfrac{x_i - \min(x)}{\max(x) - \min(x)}$

In [None]:
# Fit the min-max scaler on the feature frame and rescale every feature column.
scaler = MinMaxScaler()
scaled_data = pd.DataFrame(scaler.fit(I).transform(I))

# Append the (unscaled) target as the last column.
life = data['Life expectancy '].to_numpy().reshape(-1, 1)
scaled_data['target'] = life

# The scaled frame has positional column labels; put the names back.
scaled_data.columns = [
    'Year',
    'Adult Mortality',
    'infant deaths',
    'Alcohol',
    'percentage expenditure',
    'Hepatitis B',
    'Measles ',
    ' BMI ',
    'under-five deaths ',
    'Polio',
    'Total expenditure',
    'Diphtheria ',
    ' HIV/AIDS',
    'GDP',
    'Population',
    ' thinness  1-19 years',
    'Income composition of resources',
    'Schooling',
    'Life expectancy ',
]

# From here on `data` is the scaled training table that gets uploaded.
data = scaled_data.copy()
data.to_csv('processed.csv', index=False)

<font color=blue> API key setup

We need to provide APIkey to connect to DLTK

In [None]:
# initialize dltk client with an API key read from the environment
# (the key must not be stored in the notebook itself)
dltk_api_key = os.environ.get('DLTK_API_KEY')
if not dltk_api_key:
    raise RuntimeError("Set the DLTK_API_KEY environment variable to your DLTK API key")
client = dltk_ai.DltkAiClient(dltk_api_key)

<font color=blue> Uploading training data

In [None]:
train_data_store_response = client.store('processed.csv', Dataset.TRAIN_DATA)
print(train_data_store_response)
train_data = train_data_store_response['fileUrl']

Next step after uploading the dataset is to train a model using Train Dataset.

# <font color=red> Model One

## Creating Model

In [None]:
# Create ML Model
# It is a regression problem: we predict "Life expectancy ", which is a continuous value
task = "regression"

# Library to use (scikit, weka, h2o)
library = 'weka'
algorithm = "LinearRegression"

# Columns left out of this model. The list is informational only: it is not
# passed to client.train(). (A missing comma previously fused 'Population' and
# 'Hepatitis B' into the single string 'PopulationHepatitis B'.)
removed_features =  ['Measles ', 'percentage expenditure',
                     'infant deaths','Diphtheria ', 'Total expenditure',
                     'Population',
                     'Hepatitis B']
np.random.seed(24)
# features to be used for training
feature = ['Adult Mortality',
       'Alcohol', ' BMI ',
       'under-five deaths ', 'Polio',
       ' HIV/AIDS', ' thinness  1-19 years', 'Schooling','Income composition of resources']  
# Label to predict
label = 'Life expectancy '
# Train-test split percentage
train_percentage = 90

# Save model 
save_model = 'true'
train_response = client.train(task,
                              algorithm,
                              train_data,
                              label,
                              feature,
                              "Life Expectancy Prediction Model",
                              library,
                              train_percentage,
                              save_model)
print(train_response)

## Training

In [None]:
train_job_status_response = client.job_status(train_response['data']['jobId'])
print(train_job_status_response)
print(json.dumps(train_job_status_response, indent=2))

In [None]:
# Model Evaluation Metrics
train_job_output_response = client.job_output(train_response['data']['jobId'])
train_job_output_response 

In [None]:
# Error rate if predictions are given based in mean of the target variable
print(data[label].mean())
print(data[label].std())

# here label is the target variable 

In [None]:
# load the held-out rows, keeping only the columns the model was trained on
life_exp_predictions = preprocessor.read_csv('Life Expectancy Data test.csv',usecols= feature)
j = preprocessor.read_csv('Life Expectancy Data test.csv')
actual = j['Life expectancy ']

# Scale the test features with the min/max of the *training* features (`I`),
# so that test and training values share one scale. A dedicated scaler is used
# because `scaler` was fitted on all training columns; re-fitting it on the
# test rows would both change the scale and overwrite the training fit.
feature_scaler = MinMaxScaler().fit(I[life_exp_predictions.columns])
scaled_data_t=feature_scaler.transform(life_exp_predictions)
life_exp_predictions= pd.DataFrame(scaled_data_t, columns = life_exp_predictions.columns)
life_exp_predictions.to_csv('life_exp_predictions.csv',index=False)

In [None]:
test_file_store_response = client.store('life_exp_predictions.csv', Dataset.TEST_DATA)
print(test_file_store_response)
test_data = test_file_store_response['fileUrl']

## Testing

In [None]:
# load the model built
model = train_job_output_response['output']['modelUrl']
model

In [None]:
# Predict using created ML Model
predict_response = client.predict(task, test_data, model, library,features=feature)
predict_response

In [None]:
predict_job_status_response = client.job_status(predict_response['data']['jobId'])
predict_job_status_response

In [None]:
predict_job_output_response = client.job_output(predict_response['data']['jobId'])
predict_job_output_response

In [None]:
pred_file = predict_job_output_response['output']['predFileUrl']
response = client.download(pred_file)

In [None]:
from io import StringIO
import pandas as pd
pred_data = StringIO(response.text)
df = pd.read_csv(pred_data, sep=",")
df

In [None]:
# creating a dataframe for comparing model predictions and actual value
# NOTE(review): the prediction is taken from df['Life expectancy '] here, while the
# R^2 and regression-plot cells read it from df['_score'] - confirm which column
# of the downloaded predictions file holds the model output.
actual_predicted = pd.DataFrame(df['Life expectancy '])
# `actual` is matched to the predictions by index label
actual_predicted['actual'] = actual
actual_predicted.columns = ['model_prediction', 'actual']
actual_predicted

In [None]:
# for regression problems we use R^2 metric 
# using sklearn packagge for calculating r^2 value
from sklearn.metrics import r2_score 
r2_score(actual,df['_score'])

In [None]:
sns.regplot(x=actual,y=df['_score'])

# <font color=red> Model Two 

## Creating Model

In [None]:
# Model Two: regression on the continuous target "Life expectancy ",
# trained with the RandomForest algorithm of the weka library
# (available libraries: scikit, weka, h2o).
task, library, algorithm = "regression", 'weka', "RandomForest"

np.random.seed(42)

# Columns that are not used by this model (kept for reference only).
removed_features = ['percentage expenditure', 'Population', 'Hepatitis B']

# Input columns of the model.
features = [
    'Adult Mortality',
    'Alcohol',
    ' BMI ',
    'under-five deaths ',
    'Polio',
    ' HIV/AIDS',
    ' thinness  1-19 years',
    'infant deaths',
    'Schooling',
    'Total expenditure',
    'Measles ',
    'Diphtheria ',
    'Income composition of resources',
]

# Target column, share of rows used for training (in percent), and the
# save-model flag.
label = 'Life expectancy '
train_percentage = 80
save_model = 'true'

train_response = client.train(
    task, algorithm, train_data, label, features,
    "Life Expectancy Prediction Model",
    library, train_percentage, save_model,
)
print(train_response)

## Training

<font color=blue>Checking training status</font>

As training a model might take lot of time depending on size of dataset, we can check current status of model training using below functions

In [None]:
train_job_status_response = client.job_status(train_response['data']['jobId'])
print(train_job_status_response)
print(json.dumps(train_job_status_response, indent=2))

In [None]:
# Model Evaluation Metrics
train_job_output_response = client.job_output(train_response['data']['jobId'])
train_job_output_response

In [None]:
# Error rate if predictions are given based in mean of the target variable
print(data[label].mean())
print(data[label].std())

# here label is the target variable 

In [None]:
# load the predictions data set and preprocess it as per training
life_exp_predictions = preprocessor.read_csv('Life Expectancy Data test.csv',usecols= features)
j = preprocessor.read_csv('Life Expectancy Data test.csv')
actual = j['Life expectancy ']

In [None]:
# further processing of predictions_data set  

In [None]:
# Scale the test features with the min/max of the *training* features (`I`),
# so that test and training values share one scale. A dedicated scaler is used
# because `scaler` was fitted on all training columns; re-fitting it on the
# test rows would both change the scale and overwrite the training fit.
feature_scaler = MinMaxScaler().fit(I[life_exp_predictions.columns])
scaled_data_t=feature_scaler.transform(life_exp_predictions)
life_exp_predictions= pd.DataFrame(scaled_data_t, columns = life_exp_predictions.columns)
life_exp_predictions.to_csv('life_exp_predictions.csv',index=False)

In [None]:
life_exp_predictions.head()

In [None]:
# Upload test dataset
test_file_store_response = client.store('life_exp_predictions.csv', Dataset.TEST_DATA)
print(test_file_store_response)
test_data = test_file_store_response['fileUrl']

## Testing

In [None]:
# load the model built
model = train_job_output_response['output']['modelUrl']
model

In [None]:
# Predict using created ML Model; the feature list is passed explicitly,
# as in the predict calls of Model One and Model Four
predict_response = client.predict(task, test_data, model, library,features=features)
predict_response

In [None]:
predict_job_status_response = client.job_status(predict_response['data']['jobId'])
predict_job_status_response

In [None]:
predict_job_output_response = client.job_output(predict_response['data']['jobId'])
predict_job_output_response

In [None]:
pred_file = predict_job_output_response['output']['predFileUrl']
response = client.download(pred_file)

In [None]:
from io import StringIO
import pandas as pd
pred_data = StringIO(response.text)
df = pd.read_csv(pred_data, sep=",")
df

In [None]:
# creating a dataframe for comparing model predictions and actual value
# NOTE(review): the prediction is taken from df['Life expectancy '] here, while the
# R^2 and regression-plot cells read it from df['_score'] - confirm which column
# of the downloaded predictions file holds the model output.
actual_predicted = pd.DataFrame(df['Life expectancy '])
# `actual` is matched to the predictions by index label
actual_predicted['actual'] = actual
actual_predicted.columns = ['model_prediction', 'actual']
actual_predicted

In [None]:
# for regression problems we use R^2 metric 
# using sklearn packagge for calculating r^2 value
from sklearn.metrics import r2_score 
r2_score(actual,df['_score']) 

In [None]:
sns.regplot(x=actual,y=df['_score'])



# <font color=red> Model Three

## Creating Model

In [None]:
# Create ML Model
# It is a regression problem: we predict "Life expectancy ", which is a continuous value
task = "regression"

# Library to use (scikit, weka, h2o)
library = 'weka'
algorithm = "RandomForest"

# Columns listed as removed for this model. The list is informational only: it
# is not passed to client.train(). (A missing comma previously fused
# 'Population' and 'Hepatitis B' into the single string 'PopulationHepatitis B'.)
removed_features =  ['Measles ', 'percentage expenditure',
                     'infant deaths','Diphtheria ', 'Total expenditure',
                     'Population',
                     'Hepatitis B']
np.random.seed(42)
# features to be used for training
features = ['Income composition of resources', 
            'Schooling',
            ' thinness  1-19 years',
            ' HIV/AIDS',
            'Adult Mortality']  
# Label to predict
label = 'Life expectancy '
# Train-test split percentage
train_percentage = 95

# Save model 
save_model = True
train_response = client.train(task,
                              algorithm,
                              train_data,
                              label,
                              features,
                              "Life Expectancy Prediction Model",
                              library,
                              train_percentage,
                              save_model)
print(train_response)

## Training

In [None]:
train_job_status_response = client.job_status(train_response['data']['jobId'])
print(train_job_status_response)
print(json.dumps(train_job_status_response, indent=2))

In [None]:
# Model Evaluation Metrics
train_job_output_response = client.job_output(train_response['data']['jobId'])
train_job_output_response

In [None]:
# Error rate if predictions are given based in mean of the target variable
print(data[label].mean())
print(data[label].std())

# here label is the target variable 

In [None]:
# load the held-out rows, keeping only the columns the model was trained on
life_exp_predictions = preprocessor.read_csv('Life Expectancy Data test.csv',usecols= features)
j = preprocessor.read_csv('Life Expectancy Data test.csv')
actual = j['Life expectancy ']

# Scale the test features with the min/max of the *training* features (`I`),
# so that test and training values share one scale. A dedicated scaler is used
# because `scaler` was fitted on all training columns; re-fitting it on the
# test rows would both change the scale and overwrite the training fit.
feature_scaler = MinMaxScaler().fit(I[life_exp_predictions.columns])
scaled_data_t=feature_scaler.transform(life_exp_predictions)
life_exp_predictions= pd.DataFrame(scaled_data_t, columns = life_exp_predictions.columns)
life_exp_predictions.to_csv('life_exp_predictions.csv',index=False)

In [None]:
# Upload test dataset
test_file_store_response = client.store('life_exp_predictions.csv', Dataset.TEST_DATA)
print(test_file_store_response)
test_data = test_file_store_response['fileUrl']

## Testing

In [None]:
# load the model built
model = train_job_output_response['output']['modelUrl']
model

In [None]:
# Predict using created ML Model; the feature list is passed explicitly,
# as in the predict calls of Model One and Model Four
predict_response = client.predict(task, test_data, model, library,features=features)
predict_response

In [None]:
predict_job_status_response = client.job_status(predict_response['data']['jobId'])
predict_job_status_response

In [None]:
predict_job_output_response = client.job_output(predict_response['data']['jobId'])
predict_job_output_response

In [None]:
pred_file = predict_job_output_response['output']['predFileUrl']
response = client.download(pred_file)

In [None]:
from io import StringIO
import pandas as pd
pred_data = StringIO(response.text)
df = pd.read_csv(pred_data, sep=",")
df

In [None]:
# creating a dataframe for comparing model predictions and actual value
# NOTE(review): the prediction is taken from df['Life expectancy '] here, while the
# R^2 and regression-plot cells read it from df['_score'] - confirm which column
# of the downloaded predictions file holds the model output.
actual_predicted = pd.DataFrame(df['Life expectancy '])
# `actual` is matched to the predictions by index label
actual_predicted['actual'] = actual
actual_predicted.columns = ['model_prediction', 'actual']
actual_predicted

In [None]:
# for regression problems we use R^2 metric 
# using sklearn packagge for calculating r^2 value
from sklearn.metrics import r2_score 
r2_score(actual,df['_score']) 

In [None]:
actual_predicted.columns

In [None]:
sns.regplot(x=actual,y=df['_score'])

# <font color=red> Model Four

## Creating Model

In [None]:
# Model Four: regression on the continuous target "Life expectancy ",
# trained with the RandomForest algorithm of the weka library
# (available libraries: scikit, weka, h2o).
task, library, algorithm = "regression", 'weka', "RandomForest"

np.random.seed(42)

# Columns that are not used by this model (kept for reference only).
removed_features = [
    'Measles ', 'percentage expenditure', 'infant deaths',
    'Diphtheria ', 'Total expenditure', 'Population',
    ' HIV/AIDS', 'Schooling', 'Hepatitis B',
]

# Input columns of the model.
features = [
    'Adult Mortality',
    'Alcohol',
    ' BMI ',
    'under-five deaths ',
    'Polio',
    ' thinness  1-19 years',
    'Income composition of resources',
]

# Target column, share of rows used for training (in percent), and the
# save-model flag.
label = 'Life expectancy '
train_percentage = 80
save_model = True

train_response = client.train(
    task, algorithm, train_data, label, features,
    "Life Expectancy Prediction Model1",
    library, train_percentage, save_model,
)
print(train_response)

## Training

<font color=blue>Checking training status</font>

In [None]:
train_job_status_response = client.job_status(train_response['data']['jobId'])
print(train_job_status_response)
print(json.dumps(train_job_status_response, indent=2))

In [None]:
# Model Evaluation Metrics
train_job_output_response = client.job_output(train_response['data']['jobId'])
train_job_output_response 

In [None]:
# Error rate if predictions are given based in mean of the target variable
print(data[label].mean())
print(data[label].std())

# here label is the target variable 

In [None]:
# load the held-out rows, keeping only the columns the model was trained on
life_exp_predictions = preprocessor.read_csv('Life Expectancy Data test.csv',usecols= features)
j = preprocessor.read_csv('Life Expectancy Data test.csv')
actual = j['Life expectancy ']

# Scale the test features with the min/max of the *training* features (`I`),
# so that test and training values share one scale. A dedicated scaler is used
# because `scaler` was fitted on all training columns; re-fitting it on the
# test rows would both change the scale and overwrite the training fit.
feature_scaler = MinMaxScaler().fit(I[life_exp_predictions.columns])
scaled_data_t=feature_scaler.transform(life_exp_predictions)
life_exp_predictions= pd.DataFrame(scaled_data_t, columns = life_exp_predictions.columns)
life_exp_predictions.to_csv('life_exp_predictions.csv',index=False)

In [None]:
test_file_store_response = client.store('life_exp_predictions.csv', Dataset.TEST_DATA)
print(test_file_store_response)
test_data = test_file_store_response['fileUrl']

## Testing

In [None]:
# load the model built
model = train_job_output_response['output']['modelUrl']
model

In [None]:
# Predict using created ML Model
predict_response = client.predict(task, test_data, model, library,features=features)
predict_response

In [None]:
predict_job_status_response = client.job_status(predict_response['data']['jobId'])
predict_job_status_response

In [None]:
predict_job_output_response = client.job_output(predict_response['data']['jobId'])
predict_job_output_response

In [None]:
pred_file = predict_job_output_response['output']['predFileUrl']
response = client.download(pred_file)

In [None]:
from io import StringIO
import pandas as pd
pred_data = StringIO(response.text)
df = pd.read_csv(pred_data, sep=",")
df

In [None]:
# creating a dataframe for comparing model predictions and actual value
# NOTE(review): the prediction is taken from df['Life expectancy '] here, while the
# R^2 and regression-plot cells read it from df['_score'] - confirm which column
# of the downloaded predictions file holds the model output.
actual_predicted = pd.DataFrame(df['Life expectancy '])
# `actual` is matched to the predictions by index label
actual_predicted['actual'] = actual
actual_predicted.columns = ['model_prediction', 'actual']
actual_predicted

In [None]:
# Regression quality is reported as the coefficient of determination (R^2),
# computed with scikit-learn between the held-out target values and the
# '_score' column of the prediction file.
from sklearn.metrics import r2_score

r2_score(y_true=actual, y_pred=df['_score'])

In [None]:
# Scatter of predicted vs. actual values with a fitted regression line.
# Axis labels and a title are set explicitly so the figure stands on its own;
# by default the axes would carry the raw series names ('Life expectancy ', '_score').
ax = sns.regplot(x=actual, y=df['_score'])
ax.set(
    xlabel='Actual life expectancy',
    ylabel='Predicted life expectancy',
    title='Predicted vs. actual life expectancy (test data)',
);  # trailing semicolon keeps the return value out of the cell output

# <font color=red> Model Five

## Creating Model

In [None]:
# ---- Model configuration ----
# Regression task: the target, "Life Expectancy", is a continuous value.
task = "regression"

# Backend library (one of: scikit, weka, h2o) and the algorithm to train.
library = 'weka'
algorithm = "RandomForest"

# Columns left out of this model. Kept for reference only; this list is not
# passed to the trainer.
removed_features = [
    'Total expenditure',
    'Schooling',
    'under-five deaths ',
    ' BMI ',
    'Polio',
    'Measles ',
    'Population',
    'percentage expenditure',
]

# Seed NumPy's global RNG.
# NOTE(review): training goes through `client.train`; it is unclear from this
# cell whether the seed influences the result.
np.random.seed(24)

# Columns used as model inputs.
features = [
    'Adult Mortality',
    'Hepatitis B',
    ' thinness  1-19 years',
    ' HIV/AIDS',
    'Diphtheria ',
    'Income composition of resources',
    'infant deaths',
    'Alcohol',
]

# Target column, train-test split percentage, and the save-model flag
# (passed as the string 'true').
label = 'Life expectancy '
train_percentage = 90
save_model = 'true'

# Submit the training request; arguments are positional, in this order.
train_args = (
    task,
    algorithm,
    train_data,
    label,
    features,
    "Life Expectancy Prediction Model",
    library,
    train_percentage,
    save_model,
)
train_response = client.train(*train_args)
print(train_response)

## Training

In [None]:
# Query the training job's status once and print it twice: first as returned,
# then as indented JSON for readability.
# NOTE(review): there is no wait/retry here; the following cells presumably
# need the job to have finished -- confirm before relying on Run All.
train_job_status_response = client.job_status(train_response['data']['jobId'])
print(train_job_status_response)
print(json.dumps(train_job_status_response, indent=2))

In [None]:
# Model Evaluation Metrics
# Fetch the training job output and display it. Its ['output']['modelUrl']
# entry is read later to select the model for prediction.
train_job_output_response = client.job_output(train_response['data']['jobId'])
train_job_output_response 

In [None]:
# Baseline for judging the model: a predictor that always returns the mean of
# the target has an RMSE close to the target's standard deviation.
# `label` holds the name of the target column.
print(data[label].mean(), data[label].std(), sep='\n')

In [None]:
# Build the prediction input from the held-out CSV written earlier in the notebook.
# The file is read twice: once in full to obtain the ground-truth target and
# once restricted to the feature columns of the current model.
held_out = preprocessor.read_csv('Life Expectancy Data test.csv')
actual = held_out['Life expectancy ']
life_exp_predictions = preprocessor.read_csv('Life Expectancy Data test.csv', usecols=features)

# Scale the feature columns, keep the column names, and write the file to upload.
# NOTE(review): the scaler is re-fitted on the held-out rows here, so they are not
# scaled with the statistics fitted on the training data -- confirm this is intended.
scaler.fit(life_exp_predictions)
scaled_values = scaler.transform(life_exp_predictions)
life_exp_predictions = pd.DataFrame(scaled_values, columns=life_exp_predictions.columns)
life_exp_predictions.to_csv('life_exp_predictions.csv', index=False)

In [None]:
# Upload the scaled feature file through `client.store`, tagged as test data,
# and print the raw response.
test_file_store_response = client.store('life_exp_predictions.csv', Dataset.TEST_DATA)
print(test_file_store_response)
# Keep the stored file's URL for the prediction request.
# NOTE(review): this rebinds `test_data`, which earlier held the held-out DataFrame.
test_data = test_file_store_response['fileUrl']

## Testing

In [None]:
# URL of the trained model, read from the training job output.
# Shown as the cell result and passed to `client.predict` afterwards.
model = train_job_output_response['output']['modelUrl']
model

In [None]:
# Submit a prediction request for the uploaded test file using the trained model.
# The response is displayed; its ['data']['jobId'] entry identifies the job
# in the status/output calls that follow.
predict_response = client.predict(task, test_data, model, library,features=features)
predict_response

In [None]:
# Query the status of the prediction job once and display it.
# NOTE(review): there is no wait/retry here; the following cells presumably
# need the job to have finished -- confirm before relying on Run All.
predict_job_status_response = client.job_status(predict_response['data']['jobId'])
predict_job_status_response

In [None]:
# Fetch the output of the prediction job and display it.
# Its ['output']['predFileUrl'] entry is read when downloading the predictions.
predict_job_output_response = client.job_output(predict_response['data']['jobId'])
predict_job_output_response

In [None]:
# Locate the prediction file in the job output and download it.
# `response.text` is parsed as CSV afterwards.
pred_file = predict_job_output_response['output']['predFileUrl']
response = client.download(pred_file)

In [None]:
from io import StringIO
import pandas as pd

# Parse the downloaded CSV text into a DataFrame and display it.
# NOTE(review): this rebinds `df`, which earlier held the raw dataset.
df = pd.read_csv(StringIO(response.text))
df

In [None]:
# Side-by-side table for comparing the model output with the ground truth:
#   'model_prediction' <- the 'Life expectancy ' column of the prediction file
#   'actual'           <- the held-out target, aligned on the row index
# NOTE(review): the R^2 computation uses df['_score'] rather than this column --
# confirm which of the two holds the model's prediction.
actual_predicted = (
    df[['Life expectancy ']]
    .rename(columns={'Life expectancy ': 'model_prediction'})
    .assign(actual=actual)
)
actual_predicted

In [None]:
# Regression quality is reported as the coefficient of determination (R^2),
# computed with scikit-learn between the held-out target values and the
# '_score' column of the prediction file.
from sklearn.metrics import r2_score

r2_score(y_true=actual, y_pred=df['_score'])

In [None]:
# Scatter of predicted vs. actual values with a fitted regression line.
# Axis labels and a title are set explicitly so the figure stands on its own
# and can be told apart from the other models' plots; by default the axes
# would carry the raw series names ('Life expectancy ', '_score').
ax = sns.regplot(x=actual, y=df['_score'])
ax.set(
    xlabel='Actual life expectancy',
    ylabel='Predicted life expectancy',
    title='Model Five: predicted vs. actual life expectancy (test data)',
);  # trailing semicolon keeps the return value out of the cell output

# <font color=red> Model Six

## Creating Model

In [None]:
# Create ML Model
# It is a regression problem: the target, "Life Expectancy", is a continuous value.
task = "regression"

# Library to use (scikit, weka, h2o) and the algorithm to train.
library = 'weka'
algorithm = "RandomForest"

# Columns left out of this model. Kept for reference only; this list is not
# passed to the trainer. 'under-five deaths ' used to be listed here although
# it is one of the training features below, so it has been removed from the list.
removed_features = [
    'Total expenditure', 'Schooling', ' BMI ', 'Measles ',
    'percentage expenditure', 'Population',
]

# Seed NumPy's global RNG.
# NOTE(review): training goes through `client.train`; it is unclear from this
# cell whether the seed influences the result.
np.random.seed(24)

# Features to be used for training.
features = ['Adult Mortality','Hepatitis B',' thinness  1-19 years','Year','under-five deaths ','Polio',
            ' HIV/AIDS','Diphtheria ','Income composition of resources','infant deaths', 'Alcohol']

# Guard against the two lists drifting apart again.
assert set(features).isdisjoint(removed_features), \
    "a column is listed both as a training feature and as removed"

# Label to predict
label = 'Life expectancy '
# Train-test split percentage
train_percentage = 90

# Save-model flag, passed as the string 'true'.
save_model = 'true'

# Submit the training request (positional arguments) and print the raw response;
# its ['data']['jobId'] entry identifies the job in the status/output calls.
train_response = client.train(task,
                              algorithm,
                              train_data,
                              label,
                              features,
                              "Life Expectancy Prediction Model",
                              library,
                              train_percentage,
                              save_model)
print(train_response)

## Training

In [None]:
# Query the training job's status once and print it twice: first as returned,
# then as indented JSON for readability.
# NOTE(review): there is no wait/retry here; the following cells presumably
# need the job to have finished -- confirm before relying on Run All.
train_job_status_response = client.job_status(train_response['data']['jobId'])
print(train_job_status_response)
print(json.dumps(train_job_status_response, indent=2))

In [None]:
# Model Evaluation Metrics
# Fetch the training job output and display it. Its ['output']['modelUrl']
# entry is read later to select the model for prediction.
train_job_output_response = client.job_output(train_response['data']['jobId'])
train_job_output_response 

In [None]:
# Baseline for judging the model: a predictor that always returns the mean of
# the target has an RMSE close to the target's standard deviation.
# `label` holds the name of the target column.
print(data[label].mean(), data[label].std(), sep='\n')

In [None]:
# Build the prediction input from the held-out CSV written earlier in the notebook.
# The file is read twice: once in full to obtain the ground-truth target and
# once restricted to the feature columns of the current model.
held_out = preprocessor.read_csv('Life Expectancy Data test.csv')
actual = held_out['Life expectancy ']
life_exp_predictions = preprocessor.read_csv('Life Expectancy Data test.csv', usecols=features)

# Scale the feature columns, keep the column names, and write the file to upload.
# NOTE(review): the scaler is re-fitted on the held-out rows here, so they are not
# scaled with the statistics fitted on the training data -- confirm this is intended.
scaler.fit(life_exp_predictions)
scaled_values = scaler.transform(life_exp_predictions)
life_exp_predictions = pd.DataFrame(scaled_values, columns=life_exp_predictions.columns)
life_exp_predictions.to_csv('life_exp_predictions.csv', index=False)

In [None]:
# Upload the scaled feature file through `client.store`, tagged as test data,
# and print the raw response.
test_file_store_response = client.store('life_exp_predictions.csv', Dataset.TEST_DATA)
print(test_file_store_response)
# Keep the stored file's URL for the prediction request.
# NOTE(review): this rebinds `test_data`, which earlier held the held-out DataFrame.
test_data = test_file_store_response['fileUrl']

## Testing

In [None]:
# URL of the trained model, read from the training job output.
# Shown as the cell result and passed to `client.predict` afterwards.
model = train_job_output_response['output']['modelUrl']
model

In [None]:
# Submit a prediction request for the uploaded test file using the trained model.
# The response is displayed; its ['data']['jobId'] entry identifies the job
# in the status/output calls that follow.
predict_response = client.predict(task, test_data, model, library,features=features)
predict_response

In [None]:
# Query the status of the prediction job once and display it.
# NOTE(review): there is no wait/retry here; the following cells presumably
# need the job to have finished -- confirm before relying on Run All.
predict_job_status_response = client.job_status(predict_response['data']['jobId'])
predict_job_status_response

In [None]:
# Fetch the output of the prediction job and display it.
# Its ['output']['predFileUrl'] entry is read when downloading the predictions.
predict_job_output_response = client.job_output(predict_response['data']['jobId'])
predict_job_output_response

In [None]:
# Locate the prediction file in the job output and download it.
# `response.text` is parsed as CSV afterwards.
pred_file = predict_job_output_response['output']['predFileUrl']
response = client.download(pred_file)

In [None]:
from io import StringIO
import pandas as pd

# Parse the downloaded CSV text into a DataFrame and display it.
# NOTE(review): this rebinds `df`, which earlier held the raw dataset.
df = pd.read_csv(StringIO(response.text))
df

In [None]:
# Side-by-side table for comparing the model output with the ground truth:
#   'model_prediction' <- the 'Life expectancy ' column of the prediction file
#   'actual'           <- the held-out target, aligned on the row index
# NOTE(review): the R^2 computation uses df['_score'] rather than this column --
# confirm which of the two holds the model's prediction.
actual_predicted = (
    df[['Life expectancy ']]
    .rename(columns={'Life expectancy ': 'model_prediction'})
    .assign(actual=actual)
)
actual_predicted

In [None]:
# Regression quality is reported as the coefficient of determination (R^2),
# computed with scikit-learn between the held-out target values and the
# '_score' column of the prediction file.
from sklearn.metrics import r2_score

r2_score(y_true=actual, y_pred=df['_score'])

In [None]:
# Scatter of predicted vs. actual values with a fitted regression line.
# Axis labels and a title are set explicitly so the figure stands on its own
# and can be told apart from the other models' plots; by default the axes
# would carry the raw series names ('Life expectancy ', '_score').
ax = sns.regplot(x=actual, y=df['_score'])
ax.set(
    xlabel='Actual life expectancy',
    ylabel='Predicted life expectancy',
    title='Model Six: predicted vs. actual life expectancy (test data)',
);  # trailing semicolon keeps the return value out of the cell output

# Summary

First, the dataset was cleaned and scaled; a min-max scaler was used for the scaling.

Outliers and missing values were handled using three different techniques: imputation, grouped mean, and dropping.
Specific attributes were found to be highly correlated with the target variable, 'Life Expectancy'.

Several models were built by tweaking the parameters in order to obtain the highest accuracy.
The highest accuracy (R² score) reached on the test data was 97.02%.
