# Life Expectancy

### Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings('ignore')

### Importing the dataset

In [None]:
# Load the life-expectancy CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv('../input/life-expectancy-who/Life Expectancy Data.csv')
# Preview the first rows to sanity-check the parse.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the columns
df.describe()

In [None]:
# Column dtypes and non-null counts; a quick way to spot columns with missing values
df.info()

In [None]:
# Normalise the raw CSV headers: drop stray leading/trailing spaces and use
# underscores so every column has a predictable name.
column_renames = {
    " BMI ": "BMI",
    "Life expectancy ": "Life_expectancy",
    "Adult Mortality": "Adult_mortality",
    "infant deaths": "Infant_deaths",
    "percentage expenditure": "Percentage_expenditure",
    "Hepatitis B": "HepatitisB",
    "Measles ": "Measles",
    "under-five deaths ": "Under_five_deaths",
    "Total expenditure": "Total_expenditure",
    "Diphtheria ": "Diphtheria",
    " thinness  1-19 years": "Thinness_1-19_years",
    " thinness 5-9 years": "Thinness_5-9_years",
    " HIV/AIDS": "HIV/AIDS",
    "Income composition of resources": "Income_composition_of_resources",
}
df.rename(columns=column_renames, inplace=True)

## Data Preprocessing

In [None]:
# Collect the names of all object-dtype (categorical) columns
cat_col = [name for name, dtype in df.dtypes.items() if dtype == 'object']
cat_col

In [None]:
# Number of distinct values per column (NaN counts as one distinct value)
df.nunique(dropna=False)

In [None]:
# Separate the target column from the predictors
y = df["Life_expectancy"]
df1 = df.drop(columns=["Life_expectancy"])

# Split the predictors by dtype: object columns vs. everything else
categorical = df1.select_dtypes(include=["object"])
numerical = df1.select_dtypes(exclude=["object"])

## Exploratory Data Analysis:

### Graphical EDA

In [None]:
# Frequency bar chart for every object-dtype predictor.
for feature in categorical.columns:
    # Pass the series explicitly as ``x=``: seaborn >= 0.12 accepts only
    # ``data`` positionally, so a bare positional series is no longer treated
    # as the variable to count.
    sns.countplot(x=categorical[feature], dodge=True)
    plt.show()


In [None]:
# Distribution (histogram + KDE) of every numeric predictor.
# NOTE: ``sns.distplot`` is deprecated since seaborn 0.11; ``histplot`` /
# ``displot`` are the documented replacements.
for feature in numerical.columns:
    sns.distplot(numerical[feature])
    plt.xticks(fontsize= 12)
    plt.yticks(fontsize=12)
    # distplot normalises the histogram whenever the KDE is drawn, so the
    # y-axis shows a density, not raw counts.
    plt.ylabel("Density", fontsize= 13, fontweight="bold")
    plt.xlabel(feature, fontsize=13, fontweight="bold")
    plt.show()

In [None]:
# Distribution (histogram + KDE) of the target variable.
sns.distplot(y)
# distplot normalises the histogram whenever the KDE is drawn, so the y-axis
# shows a density, not raw counts.
plt.ylabel("Density")
plt.xlabel("Life Expectancy")
# Render explicitly, like the other plotting cells, instead of echoing the
# Text object returned by plt.xlabel.
plt.show()

In [None]:
# Box plot of the target grouped by the Status column
ax = sns.boxplot(x=df1["Status"], y=y)
ax.set_xlabel("Status")
ax.set_ylabel("Life Expectancy")
plt.show()

In [None]:
# Scatter each numeric predictor against the target, coloured by Status
for feature, values in numerical.items():
    sns.scatterplot(x=values, y=y, hue=categorical["Status"])
    plt.xticks(rotation=90)
    plt.xlabel(feature)
    plt.ylabel("Life Expectancy")
    plt.show()

In [None]:
# Working copy without the Year, Country and Status columns (the target column is kept)
life_numeric_data = df.drop(columns=["Year","Country","Status"])

#### Dealing with the Outliers

In [None]:
def outlier_count(col, data=None):
    """Print how many values of ``data[col]`` are outliers under the 1.5*IQR rule.

    Args:
        col: Name of the column to inspect.
        data: DataFrame that holds ``col``. When omitted, the notebook-level
            ``df`` is looked up at call time (not frozen at definition time),
            so a re-loaded ``df`` is always the one that gets analysed.

    Returns:
        None. The outlier count and its percentage are printed.

    Missing values are ignored when estimating the quartiles; otherwise a
    single NaN turns both fences into NaN and every column with gaps reports
    zero outliers. The percentage is still relative to all rows of the column.
    """
    if data is None:
        data = df

    print("\n"+15*'-' + col + 15*'-'+"\n")

    values = data[col].dropna()
    if values.empty:
        # Nothing to measure: an empty or all-NaN column has no outliers.
        n_outliers = 0
    else:
        q75, q25 = np.percentile(values, [75, 25])
        iqr = q75 - q25
        min_val = q25 - (iqr*1.5)
        max_val = q75 + (iqr*1.5)
        n_outliers = int(((values > max_val) | (values < min_val)).sum())

    total = len(data[col])
    outlier_percent = round(n_outliers/total*100, 2) if total else 0.0
    print('Number of outliers: {}'.format(n_outliers))
    print('Percent of data that is outlier: {}%'.format(outlier_percent))

In [None]:
# Report the IQR outlier statistics for every column of life_numeric_data
cont_vars = life_numeric_data.columns.tolist()
for col in cont_vars:
    outlier_count(col)

### Feature Engineering

In [None]:
# Summarise missing values per column
def checkna(df):
    """Return one row per column of ``df`` that contains missing values.

    The result has the columns ``Features`` (column name), ``Missing_Values``
    (number of NaNs) and ``Missing_Percent`` (share of rows, rounded to two
    decimals). Columns without missing values are left out.
    """
    summary = (
        df.isna()
        .sum()
        .rename_axis("Features")
        .reset_index(name="Missing_Values")
    )
    summary["Missing_Percent"] = (summary["Missing_Values"] / len(df) * 100).round(2)
    has_missing = summary["Missing_Values"] > 0
    return summary[has_missing]

In [None]:
# List the columns of df that currently contain missing values
checkna(df)

##### Imputer

* Univariate feature imputation.
* scikit-learn's `SimpleImputer` class provides basic strategies for imputing missing values. Missing values can be imputed with a provided constant value, or using the statistics (mean, median or most frequent) of each column in which the missing values are located. The helper below implements the same mean/median/mode strategies directly with pandas `fillna`.

In [None]:
def imputer(df, feature, method):
    """Fill the missing values of ``df[feature]`` in place.

    ``method`` selects the replacement statistic: ``"mode"`` uses the most
    frequent value, ``"median"`` the median, and any other value falls back
    to the mean. The column is reassigned on ``df``; nothing is returned.
    """
    column = df[feature]
    if method == "mode":
        fill_value = column.mode()[0]
    elif method == "median":
        fill_value = column.median()
    else:
        fill_value = column.mean()
    df[feature] = column.fillna(fill_value)

In [None]:
# Mean-impute every column of df that still has missing values
features_missing = df.columns[df.isna().any(axis=0)]
for feature in features_missing:
    imputer(df, feature, "mean")

In [None]:
# Fill the gaps in the target series with its median
y = y.fillna(y.median())

In [None]:
# Re-run the missing-value summary; an empty result means nothing is left to impute
checkna(df)

* No missing values remain after imputation.

### Correlation Matrix

In [None]:
# Correlation heatmap of the numeric columns.
# * ``df`` still contains ``Life_expectancy``, so concatenating ``y`` again
#   would only add a duplicate target row/column to the matrix.
# * Non-numeric columns are filtered out explicitly because ``DataFrame.corr``
#   raises on them in pandas >= 2.0.
plt.figure(figsize = (24,16))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, cmap="coolwarm")

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Columns to replace with integer class codes
columns = ["Status"]
for feature in columns:
    # A fresh encoder per column; ``le`` is overwritten on every iteration,
    # so only the encoder of the last column remains available afterwards.
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])

In [None]:
# Feature matrix: everything except Country, Year, Infant_deaths and the target.
# ``columns=`` is used (as elsewhere in this notebook) because the positional
# ``axis`` argument of ``DataFrame.drop`` was removed in pandas 2.0.
X = df.drop(columns=['Country', 'Year', 'Infant_deaths', 'Life_expectancy'])

In [None]:
# (number of samples, number of features) of the feature matrix
X.shape

### Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training data only ...
X_train = sc.fit_transform(X_train)
# ... and reuse the training statistics for the test data, so no test-set
# information is used when fitting the scaler.
X_test = sc.transform(X_test)

In [None]:
# Function to apply Regression algorithms and return the results of models

# libraries for ML Models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# libraries for model evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error

def predictive_models():
    """Fit several regressors and tabulate their train/test errors.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. Each model is fitted on the training split and evaluated on
    both splits with mean squared error and mean absolute error, rounded to
    four decimals.

    Returns:
        pandas.DataFrame with one row per model and the columns ``Models``,
        ``Mean Squared Error Training``, ``Mean Absolute Error Training``,
        ``Mean Squared Error Testing`` and ``Mean Absolute Error Testing``.
    """
    # Name -> estimator in one mapping, so labels and models cannot drift apart.
    models = {
        "SVR": SVR(),
        "KNeighbors Regressor": KNeighborsRegressor(),
        "Decision-Tree Regressor": DecisionTreeRegressor(random_state = 0),
        "Random-Forest Regressor": RandomForestRegressor(n_estimators = 100, random_state = 0),
    }

    result_columns = ["Models",
                      "Mean Squared Error Training",
                      "Mean Absolute Error Training",
                      "Mean Squared Error Testing",
                      "Mean Absolute Error Testing"]

    rows = []
    for name, model in models.items():
        model.fit(X_train, y_train)

        y_train_predict = model.predict(X_train)
        y_test_predict = model.predict(X_test)

        rows.append({
            "Models": name,
            # Errors on the training split
            "Mean Squared Error Training": round(mean_squared_error(y_train, y_train_predict), 4),
            "Mean Absolute Error Training": round(mean_absolute_error(y_train, y_train_predict), 4),
            # Errors on the held-out test split
            "Mean Squared Error Testing": round(mean_squared_error(y_test, y_test_predict), 4),
            "Mean Absolute Error Testing": round(mean_absolute_error(y_test, y_test_predict), 4),
        })

    # Explicit column list keeps the column order fixed.
    results = pd.DataFrame(rows, columns=result_columns)

    return results

In [None]:
# Fit all candidate regressors and display their train/test error table
results = predictive_models()
results

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest. ``max_features=1.0`` (consider
# all features) replaces the legacy 'auto' alias: it meant the same thing for
# regressors but is rejected by scikit-learn >= 1.3.
param_grid = {'bootstrap': [True], 'max_depth': [5, 10, None], 'max_features': [1.0, 'log2'],
              'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15]}
rfr = RandomForestRegressor(random_state = 1)

# Exhaustive 5-fold cross-validated search over the grid, using all cores.
grid_search = GridSearchCV(estimator = rfr, param_grid = param_grid, 
                           cv = 5, n_jobs = -1, verbose = 0, return_train_score=True)
grid_search.fit(X_train, y_train);
# No ``scoring`` is given, so GridSearchCV falls back to the estimator's own
# ``score`` method, which is R^2 for regressors -- not a classification
# accuracy. The variable name is kept for compatibility with later cells.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best R^2 score: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)