# STEP 1 - FRAME THE PROBLEM
The purpose is to establish a reason for collecting and exploring a set of data.

In [None]:
# You just became the leader of a country and your main goal is to increase the life expectancy of your citizens. 
# What factors are most important for achieving this goal?

# STEP 2 - COLLECT RAW DATA
The purpose of this step is to collect reliable and robust data that will help us understand the problem. Sometimes we need to collect it ourselves, which requires the proper resources (time, money, talent). Other times we can get it from a reliable third party (World Health Organization, US Census).

In [None]:
### List every file available under Kaggle's input directory

import os

kaggle_input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in kaggle_input_files:
    print(path)

### Import the tools used to explore the data.
#   We do not know up front exactly which tools the analysis will need,
#   so the usual toolkit is loaded here and called upon as required.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # advanced data visualization
from scipy import stats # statistics

# df holds the entire raw dataset exactly as read from the CSV
df = pd.read_csv("/kaggle/input/life-expectancy-who/Life Expectancy Data.csv")

df.head()

# STEP 3 - PROCESS THE DATA
The purpose of this step is to prepare the dataframe for data analysis. We address outliers, missing data, unnecessary/duplicate columns, value input errors, renaming, etc.

### Data Characteristics

In [None]:
print("Original Shape: ", df.shape)

### GIVE EVERY COLUMN A SHORT, CODE-FRIENDLY NAME
#   Names are assigned by position, so this list must have one entry per
#   column of the loaded frame, in the same order.

renamed_columns = [
    'Country', 'Year', 'Status', 'Life_Expectancy', 'Adult_Mortality', 'Infant_Deaths', 'Alcohol',
    'Percent_Expenditure', 'Hep_B', 'Measles', 'BMI', 'Under_5_Deaths', 'Polio', 'Total_Expenditure',
    'Diptheria', 'AIDS', 'GDP', 'Population', 'Thinness', 'Thinness_5-9yrs',
    'Income', 'Schooling',
]
df.columns = renamed_columns

### REMOVE THE COLUMNS THAT ARE NOT USED IN THIS ANALYSIS

unused_columns = [
    'Adult_Mortality', 'Infant_Deaths', 'Under_5_Deaths', 'Percent_Expenditure',
    'Measles', 'Thinness_5-9yrs', 'Country', 'Year', 'Status',
]
df = df.drop(unused_columns, axis=1)

### RESCALE INCOME BY A FACTOR OF 100 (A 0-1 SCALE BECOMES 0-100)
df['Income'] = df['Income'] * 100

print("Post Changes Shape: ", df.shape)

df.head()

### Missing Data

In [None]:
### MISSING DATA DETECTION AND REMOVAL
#   Every row that has at least one missing value is removed.
#   The rule is expressed as how="any" rather than thresh=13: a threshold of 13
#   only means "no missing values" while the frame has exactly 13 columns, and
#   would silently change meaning if a column were added or removed above.

print("Pre Missing Values Qty: ", df.isnull().sum())

df = df.dropna(how="any") # drop any row that contains a missing value

# The integer conversion performed later cannot represent NaN, so fail early if any remain.
assert not df.isnull().values.any(), "missing values remain after dropna"

print("Post Missing Values Qty: ", df.isnull().sum())

print("Post Missing Data Shape: ", df.shape)

### Other Cleanups

In [None]:
### CONVERT ALL CONTINUOUS NUMERICAL VALUES (FLOAT) TO DISCRETE NUMERICAL VALUES (INTEGER)
#   The float64 columns are selected and cast in a single step, producing a new
#   frame. Assigning column-by-column into the result of select_dtypes can
#   trigger pandas' SettingWithCopyWarning; casting the whole selection avoids it.
#
#   NOTE(review): astype('int64') truncates the fractional part, so any value
#   between 0 and 1 becomes 0. The cast also raises if a NaN is still present,
#   which is why missing rows are removed before this cell.

df2 = df.select_dtypes(include=['float64']).astype('int64')

df2.info()

df2.head()

### Outliers

In [None]:
#   Outliers are removed for Life_Expectancy only, because it reflects the combined effect of all the
#   other (independent) variables. An outlier there would point to extraordinary generational
#   circumstances (war, epidemic, natural disaster, nuclear disaster, etc.). Outliers in the independent
#   variables are left alone because no single variable would affect Life_Expectancy that strongly.

# Small figure holding a single horizontal box-and-whisker plot
fig, ax = plt.subplots(figsize=(3, 1.25))
sns.boxplot(x=df2['Life_Expectancy'], ax=ax)

In [None]:
### REMOVE LIFE_EXPECTANCY OUTLIERS USING THE 1.5 * IQR RULE
#   The fences are computed for every column (and printed for reference), but
#   only the Life_Expectancy fences are used to filter rows. The filter reads
#   the computed values directly instead of hand-copied constants, so it stays
#   correct if the upstream data or cleaning steps change.
#
#   NOTE: the fences are derived from df2 itself, so run this cell once after
#   the conversion cell; re-running it recomputes the fences on already
#   filtered data.

print(" Pre Outliers Shape: ", df2.shape)

Q1 = df2.quantile(0.25) # 25th percentile of every column
Q3 = df2.quantile(0.75) # 75th percentile of every column
IQR = Q3 - Q1 # interquartile range
Upper = Q3 + 1.5*IQR # upper fence for outliers
Lower = Q1 - 1.5*IQR # lower fence for outliers

print("UPPER LIMIT", Upper)
print("LOWER LIMIT", Lower)

life_upper = Upper['Life_Expectancy']
life_lower = Lower['Life_Expectancy']

# Rows strictly outside the fences are the outliers
is_outlier = (df2['Life_Expectancy'] > life_upper) | (df2['Life_Expectancy'] < life_lower)
outlier = df2[is_outlier].index
df2 = df2.drop(outlier) # returns a new frame instead of mutating in place

print("After Outliers Shape: ", df2.shape)

fig, ax = plt.subplots(figsize=(3, 1.25)) # visualization size
sns.boxplot(x=df2['Life_Expectancy'], ax=ax) # box and whisker plot after removal

# STEP 4 - EXPLORATORY DATA ANALYSIS
The purpose of this step is to understand the main characteristics of the data. This approach enables us to begin mapping out a rough plan for analyzing the data.

In [None]:
### ADDITIONAL BOX & WHISKER PLOTS
#   To explore more of the variables in the data set.
#   One small figure is drawn per column; add a name to the tuple to plot another.

for column in ('Income', 'BMI', 'GDP'):
    fig, ax = plt.subplots(figsize=(3, 1.25))
    sns.boxplot(x=df2[column], ax=ax) # draw on the figure created just above

In [None]:
### LOOKING AT THE DATA
#   Several quick views of the cleaned frame, followed by basic descriptive statistics.

# First 5 rows of every column, followed by a blank separator line
print(df2.head(n=5), end="\n\n")

# Column names, non-null counts and dtypes
df2.info()

# Blank separator line, then the frame's default (truncated) text view
print("", df2, sep="\n")

## Descriptive Statistics

print()
summary_statistics = df2.describe() # count, mean, std, min, quartiles, max
summary_statistics

In [None]:
## Correlation
#   Pairwise correlation between every pair of columns, shown as a table.

correlation_table = df2.corr()
correlation_table

In [None]:
## Correlation
#   The same correlation matrix, drawn with colour instead of numbers.

fig, ax = plt.subplots(figsize=(6, 4.5))

# Diverging palette centred on 0 and pinned to the full [-1, 1] correlation range
heatmap_options = dict(center=0, vmin=-1.0, vmax=1.0, cmap="PiYG")
sns.heatmap(df2.corr(), ax=ax, **heatmap_options)

# STEP 5 - IN DEPTH ANALYSIS
The purpose of this step is to dive deeply into a few interesting correlations among variables.

## a. Bivariate Scatterplots

In [None]:
### FROM THE HEAT MAP OF CORRELATIONS
#   These variables seemed either highly correlated or inspired interesting questions.
#   Each entry below is one scatterplot with a fitted regression line:
#   (x column, y column, fit the regression against log(x)?)

regression_specs = [
    ("Schooling", "Life_Expectancy", False),
    ("Income", "Life_Expectancy", False),
    ("BMI", "Life_Expectancy", False),
    ("Thinness", "Life_Expectancy", False),
    ("GDP", "Life_Expectancy", True),
    ("Life_Expectancy", "AIDS", True), # note: Life_Expectancy is on the x axis here
]

for x_col, y_col, use_logx in regression_specs:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.regplot(x=x_col, y=y_col, data=df2, logx=use_logx,
                scatter_kws={"color": "gray"}, line_kws={"color": "green"}, ax=ax)


In [None]:
### FROM SCATTERPLOTS
#   Greater Alcohol consumption and BMI correlate with having a longer life.
#   This was unexpected and deserves further investigation.
#   Perhaps income has something to do with it.
#
#   NOTE: these plots read from df (the frame before the integer conversion
#   and before the Life_Expectancy outlier removal), not from df2.

point_style = {"color": "black"}
fit_line_style = {"color": "pink"}

fig, ax = plt.subplots(figsize=(5, 3))
sns.regplot(data=df, x="Alcohol", y="Income", logx=True,
            scatter_kws=point_style, line_kws=fit_line_style, ax=ax)

fig, ax = plt.subplots(figsize=(5, 3))
sns.regplot(data=df, x="BMI", y="Income",
            scatter_kws=point_style, line_kws=fit_line_style, ax=ax)

fig, ax = plt.subplots(figsize=(5, 3))
sns.regplot(data=df, x="Schooling", y="Income",
            scatter_kws=point_style, line_kws=fit_line_style, ax=ax)

fig, ax = plt.subplots(figsize=(5, 3))
sns.regplot(data=df, x="GDP", y="Income", logx=True,
            scatter_kws=point_style, line_kws=fit_line_style, ax=ax)

## b. Machine Learning

In [None]:
import sklearn # used to perform supervised and unsupervised machine learning
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split # used to split data into training and testing

# SPLITTING THE DATA
y = df2.Life_Expectancy # define dependent variable
print(y)
x = df2.drop("Life_Expectancy", axis=1) # define the independent variables
print(x)

# Split the data into 20% test & 80% train.
# random_state is fixed so that the split, and every score computed from it,
# is the same each time the notebook is run from a fresh kernel.
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print() (which would add a stray "None" line).
x_train.info()
x_test.info()
print(y_train.describe())
print(y_test.describe())

### Random Forest Regression
This model is good for establishing which independent variables are most important in achieving the desired goal for the dependent variable.

In [None]:
### FEATURE SCALING
#   The scaler is fitted on the training rows only and then applied to the test
#   rows. The scaled arrays get their own names so that x_train / x_test keep
#   holding the original split and this cell can be re-run safely.

from sklearn.preprocessing import StandardScaler # tool for scaling values

sc = StandardScaler()
x_train_scaled = sc.fit_transform(x_train)
x_test_scaled = sc.transform(x_test)

### BUILD THE RFR MODEL

from sklearn.ensemble import RandomForestRegressor # tool for building random forest models

RFR = RandomForestRegressor(n_estimators=50, random_state=0, max_depth=3)
RFR.fit(x_train_scaled, y_train)
y_pred = RFR.predict(x_test_scaled) # predictions for the held-out test rows

### EVALUATE PERFORMANCE
#   cross_val_score re-fits the model on 5 folds of the TRAINING data only;
#   it never sees the test rows. The held-out test rows are scored separately
#   below using the predictions made above.

from sklearn.model_selection import cross_val_score # tool for k-fold cross-validation
from sklearn.metrics import r2_score # tool for scoring predictions against true values

RFRscore = cross_val_score(RFR, x_train_scaled, y_train, cv=5)

print('RFR Scores: ', RFRscore)
print('RFR Cross Val Score: ', RFRscore.mean())
print('RFR Test R^2: ', r2_score(y_test, y_pred))

In [None]:
### SHOW TABLE OF VARIABLE IMPORTANCE
#   Life_Expectancy is a numeric target, so the importances are taken from a
#   random forest REGRESSOR. A classifier would treat every distinct
#   Life_Expectancy value as an unrelated class label.
#   random_state is fixed so the ranking is reproducible between runs.

from sklearn.ensemble import RandomForestClassifier # left importable for any other cell that uses it
from sklearn.ensemble import RandomForestRegressor # tool for building random forest models

# The variable keeps the name RFC because the tree-visualization cell refers to it.
RFC = RandomForestRegressor(random_state=0)
RFC.fit(x, y)
pd.DataFrame({'Variable':x.columns,
              'Importance':RFC.feature_importances_}).sort_values('Importance', ascending=False)

In [None]:
# VISUALIZE A SINGLE TREE FROM THE RANDOM FOREST

from sklearn import tree # tool for compiling tree diagrams

# A bare expression in the middle of a cell is evaluated and discarded, so the
# number of trees is printed explicitly.
print("Number of trees in the forest: ", len(RFC.estimators_))

plt.figure(figsize=(25,25))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument strictly and reject a pandas Index.
_ = tree.plot_tree(RFC.estimators_[0], feature_names=list(x.columns), filled=True)

### Linear Regression
Once the important independent variables are established, we can fit this regression model and use it to guide decision-making.

In [None]:
### FEATURE SCALING
#   The scaler is fitted on the training rows only and then applied to the test
#   rows. The scaled arrays get their own names so that x_train / x_test are
#   not overwritten and this cell can be re-run safely.

from sklearn.preprocessing import StandardScaler # tool for scaling data

sc = StandardScaler()
x_train_lr = sc.fit_transform(x_train)
x_test_lr = sc.transform(x_test)

### BUILD THE LR MODEL

from sklearn.linear_model import LinearRegression # tool for building linear regression models

LR = LinearRegression().fit(x_train_lr, y_train)

# Coefficients are in units of the scaled features
print("intercept: ", LR.intercept_)
print("coefficient: ", LR.coef_)

### EVALUATE PERFORMANCE
#   cross_val_score re-fits the model on 5 folds of the TRAINING data only;
#   it never sees the test rows. The held-out test rows are scored separately
#   with LR.score, which reports R^2 for a regressor.

from sklearn.model_selection import cross_val_score # tool for k-fold cross-validation

LRscore = cross_val_score(LR, x_train_lr, y_train, cv=5)

print('LR Scores: ', LRscore)
print('LR Score Mean: ', LRscore.mean())
print('LR Test R^2: ', LR.score(x_test_lr, y_test))

# STEP 6 - COMMUNICATION AND VISUALIZATION
The purpose of this step is to build effective deliverables and visuals that clearly and effectively communicate our findings from the data analysis to interested stakeholders.

In [None]:
# Ideally, this is something that is done throughout the data analysis process
# as we have done in this walkthrough. Other visuals can also be compiled here if they haven't been already.