# Predicting The Age of Death - Linear Regression
Dataset : https://www.kaggle.com/kumarajarshi/life-expectancy-who

# Importing Libraries

In [None]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import ShuffleSplit, cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Import Dataset

```
# Download the raw CSV once and store it in the working directory.
DATASET_URL = 'https://raw.githubusercontent.com/avinash-218/Life-Expectancy-WHO/master/Life_Expectancy_Data.csv'
# Reuse the constant instead of repeating the URL; the timeout avoids hanging forever.
req = requests.get(DATASET_URL, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the CSV.
req.raise_for_status()
url_content = req.content
# The context manager closes the file even if the write raises.
with open('Life_Expectancy_Data.csv','wb') as csv_file:
    csv_file.write(url_content)
```

In [None]:
# Load the life-expectancy CSV from the relative `../input` directory.
dataset = pd.read_csv('../input/life-expectancy-who/Life Expectancy Data.csv')
# Bare expression: the notebook renders the DataFrame as the cell output.
dataset

# Data Cleaning

#### Removing Trailing Spaces in Column Names
Some column names in the dataset contain trailing spaces, so let's strip them.

In [None]:
# Show the raw column names so that stray trailing spaces are visible.
list(dataset.columns)

In [None]:
def rename_col_names(x):
    """Map every name in ``x`` to the same name without trailing whitespace.

    The result is a dict ``{original: stripped}``, the shape accepted by
    ``DataFrame.rename(columns=...)``. Leading whitespace is left untouched.
    """
    return {name: name.rstrip() for name in x}

In [None]:
# Snapshot the current names, then rename in place using the stripped-name mapping.
col_names = list(dataset.columns)
dataset.rename(columns=rename_col_names(col_names), inplace=True)

In [None]:
# Re-read the column names to confirm the trailing spaces are gone.
col_names = list(dataset.columns)
col_names

Let's Analyse and continue the preprocessing

# Data Analysis

### Numeric Columns

In [None]:
# Display the dataset before analysing the numeric columns.
dataset

In [None]:
# One figure row per numeric feature: histogram, boxplot and a scatter plot
# of the feature against the target column.
numeric_cols = dataset.select_dtypes(include=np.number).columns.tolist()
numeric_cols.remove('Life expectancy') #target column
cnt_numeric_cols = len(numeric_cols)
fig, axes = plt.subplots(nrows=cnt_numeric_cols, ncols=3, figsize=(25,150))
fig.tight_layout(pad=3)

for i, col in enumerate(numeric_cols):
    # Left panel: distribution of the feature.
    axes[i,0].set(title='{} Distribution'.format(col), xlabel=col)
    sns.histplot(x=dataset[col], ax=axes[i,0])

    # Middle panel: boxplot of the feature.
    axes[i,1].set(title='{} Boxplot'.format(col), xlabel=col)
    sns.boxplot(x=dataset[col], ax=axes[i,1])

    # Right panel: feature against the target.
    axes[i,2].set(title='{} Scatterplot'.format(col), xlabel=col)
    sns.scatterplot(data=dataset, x=col, y='Life expectancy', ax=axes[i,2])
plt.show()
# Save the whole grid as a JPEG.
fig.savefig('Numerical Data Visualisation.jpeg', pil_kwargs={'quality': 95})

<b>Note:</b><br>
Axes and titles might not be visible in the saved image if Windows is in dark mode and the image is opened with the default Windows viewer. Try opening it with Paint if they are not visible.<br>
Opening it in the notebook also helps.

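The code below prints, for each numeric column: the number of unique values, the unique values themselves, the number of unique values as a percentage of the rows, and the percentage of NaN values.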

In [None]:
# Per-column summary: number of distinct values, the distinct values
# themselves, distinct values as a percentage of all rows, and the percentage
# of missing entries.
print('column \t nunique \t unique \t % of nunique in column \t % of nan')
print('-'*100)
# Use the actual row count as the denominator; a hard-coded constant gives
# wrong percentages whenever it differs from the number of rows in `dataset`.
n_rows = len(dataset)
for i in numeric_cols:
    print(i,':',dataset[i].nunique(),dataset[i].unique(),dataset[i].nunique()*100/n_rows,dataset[i].isna().sum()*100/n_rows,end='\n\n\n')

### Categorical Columns
Only columns with few categories can be visualized properly.<br>
So visualize only the columns with few categories (the threshold used here is 200 — explained in the note below).<br>
<b>Note:</b> The code below throws an error when only one categorical column satisfies the threshold, because the axes array is then 1D while the code indexes it as 2D.<br>
The plots for the categorical column Country are included below because otherwise only one categorical column would satisfy the threshold, which would trigger that error.

In [None]:
# Visualise every low-cardinality categorical column against the target:
# bar graph, box plot and scatter plot side by side, one row per column.
categorical_cols = list(dataset.select_dtypes('object'))
less_category_cols = dataset[categorical_cols].columns[dataset[categorical_cols].nunique() < 200]
cnt_less_category_cols = len(less_category_cols)
# squeeze=False keeps `axes` two-dimensional even when a single column passes
# the threshold, so the axes[i, j] indexing below works for any row count >= 1.
fig, axes = plt.subplots(nrows=cnt_less_category_cols, ncols=3, figsize=(35, 20), squeeze=False)
fig.tight_layout(pad=3)

for i, col in enumerate(less_category_cols):
    # Left panel: bar graph of the target per category.
    axes[i,0].set_title('{} Bargraph'.format(col))
    axes[i,0].set_xlabel(col)
    sns.barplot(ax=axes[i,0], data=dataset, x=col, y='Life expectancy')

    # Middle panel: spread of the target per category.
    axes[i,1].set_title('{} Box Plot'.format(col))
    axes[i,1].set_xlabel(col)
    sns.boxplot(ax=axes[i,1], x=dataset[col], y=dataset['Life expectancy'])

    # Right panel: individual observations per category.
    axes[i,2].set_title('{} Scatter Plot'.format(col))
    axes[i,2].set_xlabel(col)
    sns.scatterplot(ax=axes[i,2], data=dataset, x=col, y='Life expectancy')

plt.show()
# Save the whole grid as a JPEG.
fig.savefig('Categorical Data Visualisation.jpeg', pil_kwargs={'quality': 95})

#### Null Values Treatment

Percentage of null values in each column

Fill missing values grouped by countries.<br>
Eg: Fill missing values in GDP based on the same country.<br>
Remove the rows in which the target column is NaN

In [None]:
# Keep only the rows that have a target value; work on an independent copy.
dataset = dataset.dropna(subset=['Life expectancy']).copy()
dataset

In [None]:
# Number of missing values per column.
dataset.isna().sum()

In [None]:
# Missing values per column as a percentage of the row count.
dataset.isna().sum() / dataset.shape[0] * 100

In [None]:
# Columns that contain at least one missing value.
columns_with_null = list(dataset.columns[dataset.isna().any()])
# Within each country, replace missing entries with that country's mean for the
# column. If a country has no value at all for a column, its mean is NaN and
# the entries stay missing.
dataset[columns_with_null] = dataset.groupby('Country')[columns_with_null].transform(lambda x:x.fillna(x.mean()))
dataset

In [None]:
# Recount the missing values after the per-country fill.
dataset.isna().sum()

In [None]:
# Remaining missing values per column as a percentage of the row count.
dataset.isna().sum() / dataset.shape[0] * 100

Even now some data are NaN. This is because for some countries these columns were not measured. So just drop them

In [None]:
# Drop every row that still contains a missing value (modifies `dataset` in place).
dataset.dropna(inplace=True)

In [None]:
# Verify: percentage of missing values per column after dropping rows.
dataset.isna().sum() / dataset.shape[0] * 100

In [None]:
# Display the dataset after the missing-value handling.
dataset

#### Encoding Status Column

In [None]:
def status_encode(x):
    """Encode a status label as a binary flag.

    Returns 1 when ``x`` equals ``'Developed'`` and 0 for any other value.
    """
    return 1 if x == 'Developed' else 0

In [None]:
# Replace the Status labels with the binary flag produced by status_encode.
dataset['Status'] = dataset['Status'].apply(status_encode)
dataset

In [None]:
# One-hot encoder for the Country column, configured for dense output so the
# transformed array can be assigned straight into DataFrame columns.
# Newer scikit-learn releases renamed `sparse` to `sparse_output` and later
# removed the old keyword; try the new name first and fall back for older
# releases, where the unknown keyword raises TypeError.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(dataset[['Country']])

In [None]:
# Names of the generated one-hot columns.
# `get_feature_names` was removed from newer scikit-learn releases in favour of
# `get_feature_names_out`; use the new method when it exists. The two may
# produce different name prefixes, but the rest of this notebook only refers to
# these columns through `encoded_cols`.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_cols = list(encoder.get_feature_names_out())
else:
    encoded_cols = list(encoder.get_feature_names())
encoded_cols

In [None]:
# Add one indicator column per encoded country, then drop the original text column.
dataset[encoded_cols] = encoder.transform(dataset[['Country']])
dataset = dataset.drop('Country',axis=1)
dataset

# Identifying Input & Target Column(s)

In [None]:
def identify_cols(dataset):
    """Split ``dataset`` into model inputs and the target.

    Returns a tuple ``(X, Y)``: ``X`` holds every column except
    ``'Life expectancy'`` and ``Y`` is the ``'Life expectancy'`` column.
    Raises ``ValueError`` if the target column is not present.
    """
    target_col = 'Life expectancy'
    input_cols = list(dataset.columns)
    # list.remove raises ValueError when the target column is missing.
    input_cols.remove(target_col)
    return dataset[input_cols], dataset[target_col]

In [None]:
# Notebook-level feature matrix and target built from the full encoded dataset.
X, Y = identify_cols(dataset)

# Feature Engineering
scikit-learn's mutual_info_regression and mutual_info_classif treat discrete and continuous values differently. So it is required to inform which are discrete columns.

##### Base Model

In [None]:
# Collects one row of scores per experiment.
model_df=pd.DataFrame()
def train_validate(X, Y,stri):
    """Fit a linear regression on an 80/20 split and report both scores.

    X    : feature table
    Y    : target values
    stri : label used as the index of the returned row

    Returns a one-row DataFrame indexed by ``stri`` whose columns
    'Train Accuracy' and 'Test Accuracy' hold ``model.score`` on the
    respective split multiplied by 100.
    """
    # Fixed seed so every experiment is evaluated on the same split.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    regressor = LinearRegression()
    regressor.fit(X_train, Y_train)
    train_pct = regressor.score(X_train, Y_train)*100
    test_pct = regressor.score(X_test, Y_test)*100
    return pd.DataFrame({'Train Accuracy':[train_pct],'Test Accuracy':[test_pct]},index=[stri])

In [None]:
# Record the scores of the model trained on all columns as the 'Base Model' row.
model_df = pd.concat([model_df, train_validate(X,Y,'Base Model')])
model_df

In [None]:
# Absolute gap between the train and test score of the base model.
# `model_df` is indexed by experiment label, so the first row is selected by
# position with .iloc instead of relying on `[0]` falling back to positions.
base_acc = abs(model_df['Train Accuracy'].iloc[0]-model_df['Test Accuracy'].iloc[0])
base_acc

## MI Scores

In [None]:
# Boolean mask over the columns of X: integer-typed columns are flagged as discrete.
# NOTE(review): the one-hot country columns are presumably float-typed and are
# therefore treated as continuous here - confirm this is intended.
discrete_features = (X.dtypes == 'int64') #finding the discrete columns
def find_mi_scores(X, Y, discrete_features, random_state=42):
    """Return the mutual-information score of every column of X with Y.

    X                 : feature table
    Y                 : target values
    discrete_features : mask telling which columns of X are discrete
    random_state      : seed forwarded to ``mutual_info_regression`` so the
                        scores, and the ranking derived from them, are
                        reproducible between runs

    Returns a Series named 'MI Scores', indexed by column name and sorted
    from the highest to the lowest score.
    """
    mi_scores = mutual_info_regression(X, Y, discrete_features=discrete_features, random_state=random_state)
    mi_scores = pd.Series(mi_scores, name='MI Scores', index=X.columns)
    return mi_scores.sort_values(ascending=False)

In [None]:
# Mutual-information score of every feature, highest first.
mi_scores = find_mi_scores(X, Y, discrete_features)
mi_scores

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` on the current figure.

    The scores are sorted in ascending order before plotting and the index
    labels of ``scores`` are used as the tick labels of the bars.
    """
    ordered = scores.sort_values()
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title('Mutual Information Scores')
# Plot only the first 20 entries of the ranking.
plt.figure(dpi=100, figsize=(8,5))
plot_mi_scores(mi_scores[:20])

The bar graph is plotted in descending order of MI score.<br>
So the remaining encoded Country columns have even lower scores.<br>
It is therefore not necessary to visualize their MI scores as well.

### Model without Categorical Columns (manual)

##### Model without Country Column(original + encoded)

In [None]:
# Rebuild X/Y without the one-hot country columns and score a model on them.
# Note that this overwrites the notebook-level X and Y.
X, Y = identify_cols(dataset.drop(encoded_cols, axis=1))
train_validate(X, Y, 'Without Country')

Accuracy decreases, so this column should be kept.

### Model without Numerical Columns (Auto)

```
# Leave-one-column-out experiments, visiting the features from the lowest to
# the highest MI score and skipping the one-hot country columns.
col_names = list(mi_scores.index)
for i, col in reversed(list(enumerate(col_names))):
    if col in encoded_cols:
        continue
    X, Y = identify_cols(dataset.drop(col, axis=1))
    # Append the scores of the model trained without this column.
    model_df = pd.concat([model_df, train_validate(X,Y,'Without ' + col + ' :')])
```

In [None]:
# Rank the recorded experiments by train score, then test score, highest first.
model_df.sort_values(by=['Train Accuracy','Test Accuracy'], ascending=False)

### Feature Selection
Discard the features that do not contribute anything useful to the model.<br>
(negative impact or no impact)

In [None]:
# Backward elimination over the non-country features, visiting them from the
# lowest to the highest MI score. Each column is dropped tentatively; the drop
# is kept or reverted depending on the resulting train/test gap.
col_names = list(mi_scores.index)
for col in reversed(col_names):
    if col in encoded_cols:
        continue
    val_col = dataset[col].copy()  # kept so the column can be restored
    dataset = dataset.drop(col, axis=1)
    X, Y = identify_cols(dataset)
    # Train once and read both scores from the same fit.
    scores = train_validate(X, Y, 'Without ' + col + ' :')
    # The one-row frame is label-indexed, so pick the value by position.
    err = abs(scores['Train Accuracy'].iloc[0] - scores['Test Accuracy'].iloc[0])
    # NOTE(review): the drop is kept when the gap does NOT shrink and reverted
    # when it does - confirm that this direction is the intended criterion.
    if base_acc <= err:
        base_acc = err
    else:
        dataset[col] = val_col
# X/Y from the last iteration were built before a possible restore of the last
# column; rebuild them so the following cells train on the selected columns.
X, Y = identify_cols(dataset)

In [None]:
# Every remaining column of `dataset` that is not a one-hot country column
# (the list is taken from dataset.columns, so the target column is included).
col_to_consider =[i for i in dataset.columns.to_list() if i not in encoded_cols]
print('Columns Which are to be considered (after Feature Engineering)\n',col_to_consider)

In [None]:
# Number of columns kept after feature selection.
len(col_to_consider)

# Train Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.2)
# Show the shapes of the four pieces.
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

In [None]:
# Fit a plain linear regression on the training split.
model = LinearRegression()
model.fit(X_train, Y_train)
# model.score on each split, printed as a percentage.
print('Training Accuracy :',model.score(X_train, Y_train)*100)
print('Test Accuracy :',model.score(X_test, Y_test)*100)

# K-Fold Cross Validation

In [None]:
# 5-fold cross-validation with shuffling and a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf_sc = cross_val_score(LinearRegression(), X, Y, cv=kf)
# Mean of the fold scores, as a percentage.
print('Accuracy :',kf_sc.mean()*100)

# Cross Validation

In [None]:
# Five random splits, each holding out 20% of the rows, with a fixed seed.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
cv_sc = cross_val_score(LinearRegression(), X, Y, cv=cv)
# Mean of the split scores, as a percentage.
print('Accuracy :',cv_sc.mean()*100)

# Grid Search CV

In [None]:
def find_best_model(X, Y):
    """Grid-search several regressors and summarise the best setting of each.

    X : feature table
    Y : target values

    Every candidate is tuned with GridSearchCV on five random 80/20 splits
    (fixed seed). Returns a DataFrame with one row per algorithm and the
    columns 'model', 'best_score' and 'best_params'.
    """
    # NOTE(review): `normalize` (LinearRegression) and the 'mse' criterion
    # (DecisionTreeRegressor) are only accepted by older scikit-learn
    # releases - confirm against the installed version.
    algos={
        'linear_reg':{'model':LinearRegression(), 'params':{'normalize':[True, False]}},
        'lasso':{'model':Lasso(), 'params':{'alpha':[1,2], 'selection':['random', 'cyclic']}},
        'decision_tree':{'model':DecisionTreeRegressor(), 'params':{'criterion':['mse','friedman_mse'],'splitter':['best','random']}},
        'random_forest':{'model':RandomForestRegressor(), 'params':{'n_jobs':[-1], 'n_estimators':[10, 50, 100],'max_depth':[5,10,20], 'max_leaf_nodes':[50, 100]}},
        # The booster is fixed to 'gblinear', which does not build trees, so the
        # tree-only axes (max_depth, max_leaf_nodes) are left out of the grid:
        # they only multiplied the number of fits without changing the model.
        'xgb':{'model':XGBRegressor(), 'params':{'n_jobs':[-1], 'n_estimators':[10,50,100],'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.55],'booster':['gblinear']}}
            }
    scores = []
    cv = ShuffleSplit(n_splits=5, random_state=42, test_size=0.2)
    for algo, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X, Y)
        scores.append({
            'model':algo,
            'best_score': gs.best_score_,
            'best_params':gs.best_params_
        })
    return pd.DataFrame(scores, columns=['model','best_score', 'best_params'])
# Run the search on the selected features.
models_summary = find_best_model(X, Y)

In [None]:
# Display the best score and parameters found for each algorithm.
models_summary

From the summary above,<br>
<b>Linear Regression</b> with parameter(s) : normalize:'False' gives the best result

# Final Best Model

In [None]:
# Final model: linear regression with its default settings. The notebook's
# stated best setting is normalize=False, which is the default on releases
# that have the parameter; leaving the keyword out matches that choice and
# also works on releases where `normalize` no longer exists.
model = LinearRegression()
model.fit(X_train, Y_train)
# model.score on each split, printed as a percentage.
print('Training Accuracy :',model.score(X_train, Y_train)*100)
print('Test Accuracy :',model.score(X_test, Y_test)*100)

In [None]:
# Actual training targets next to the model's predictions for the same rows.
Y_train.values,model.predict(X_train)

In [None]:
# Actual test targets next to the model's predictions for the same rows.
Y_test.values,model.predict(X_test)