# Recommended Installations

#### pip install plotly
#### pip install seaborn==0.11.0

# Recommended Installations

#### pip install plotly
#### pip install seaborn==0.11.0

# Imports

In [None]:
# Numerical / tabular tooling.
import numpy as np
import pandas as pd
# Plotting.
import matplotlib.pyplot as plt
import seaborn as sns
import re
import plotly.express as px
# NOTE(review): `go` is bound to plotly.express here, the same module as `px`.
# The conventional target for this alias is plotly.graph_objects -- confirm
# which one the later cells expect before changing it.
import plotly.express as go
from scipy.stats.mstats import winsorize
# Widen the notebook display limits so wide/long frames are shown in full.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 300)
# Echo the installed seaborn version as the cell output.
sns.__version__

# Importing and Reading CSV Files

In [None]:
# Country-level indicators and the WHO life-expectancy records.
df_250_countries = pd.read_csv("../input/all-250-country-data/250 Country Data.csv")
df_life_expectancy = pd.read_csv("../input/life-expectancy-who/Life Expectancy Data.csv")

# World Happiness reports: one CSV per year, loaded in chronological order.
(
    df_worldHappiness_2015,
    df_worldHappiness_2016,
    df_worldHappiness_2017,
    df_worldHappiness_2018,
    df_worldHappiness_2019,
) = (
    pd.read_csv(report_path)
    for report_path in (
        "../input/world-happiness/2015.csv",
        "../input/world-happiness/2016.csv",
        "../input/world-happiness/2017.csv",
        "../input/world-happiness/2018.csv",
        "../input/world-happiness/2019.csv",
    )
)

# Exploring Data

# All Countries Data

In [None]:
# Discard every column whose header contains "unnamed" (case-insensitive).
is_unnamed = df_250_countries.columns.str.contains('unnamed', case=False)
df_250_countries.drop(columns=df_250_countries.columns[is_unnamed], inplace=True)

# Preview all rows except the last five.
df_250_countries.head(-5)

### Renaming Columns For Easier Access

In [None]:

# Capitalise the lower-case headers used by the raw CSV.
column_renames = {
    "name": "Name",
    "region": "Region",
    "area": "Area",
    "gini": "Gini",
    'subregion': "Subregion",
    "population": "Population",
}
df_250_countries.rename(columns=column_renames, inplace=True)
df_250_countries.head(1)

# Dataset Analysis

## Column Descriptions
- Name (Nominal) - The name of country in which the indicators are from (i.e. United States of America or Congo)
- Region (Nominal) - The name of region in which the country lies.
- Subregion (Nominal) - The name of subregion which is a division of the region.
- Population (Ratio) - The population count of the country.
- Area (Ratio) - The area of the country.
- Gini (Ratio) - The measure of the distribution of income across a population.
- Real Growth Rating(%) (Ratio) - The rate at which a nation's Gross Domestic product (GDP) changes/grows from one year to another.
- Literacy Rate(%) (Ratio) - The percentage of the population that can read and write.
- Inflation(%) (Ratio) - The percentage that represents the decline of purchasing power of a given currency over time.
- Unemployement(%) (Ratio) - The percentage of the country that does not have a job.

In [None]:
# Convert the free-text percentage columns {Real Growth Rating(%),
# Literacy Rate(%), Inflation(%), Unemployement(%)} to floats.

# A decimal number, optionally followed by whitespace, directly before '%'.
# The minus sign only counts as a sign when it is not glued to a preceding
# digit or dot, so a range such as "1.5-2.5%" yields 2.5 rather than -2.5.
_PERCENT_PATTERN = re.compile(r'((?:(?<![\d.])-)?(?:\d+\.?\d*|\.\d+))\s*%')


def findPer(x):
    """Return the first percentage found in *x* as a float, or NaN.

    Parameters
    ----------
    x : str, float or None
        Raw cell value, e.g. ``"2.5% (2014 est.)"``, ``"n.a."`` or NaN.

    Returns
    -------
    float
        The number written immediately before the first usable ``%`` sign.
        NaN when *x* is missing, is marked "n.a" (any letter case), or holds
        no number followed by ``%``. Values that are already numeric are
        passed through, so the cell can be re-run on a converted column.
    """
    if isinstance(x, str):
        if 'n.a' in x.lower():
            return np.nan
        found = _PERCENT_PATTERN.search(x)
        return float(found.group(1)) if found else np.nan
    try:
        # NaN stays NaN; ints/floats from an earlier run are kept as-is.
        return float(x)
    except (TypeError, ValueError):
        return np.nan


Growth_subdata = df_250_countries['Real Growth Rating(%)']
Growth_series = Growth_subdata.apply(findPer)
df_250_countries['Real Growth Rating(%)'] = Growth_series

Literacy_subdata = df_250_countries['Literacy Rate(%)']
Literacy_series = Literacy_subdata.apply(findPer)
df_250_countries['Literacy Rate(%)'] = Literacy_series

Inflation_subdata = df_250_countries['Inflation(%)']
Inflation_series = Inflation_subdata.apply(findPer)
df_250_countries['Inflation(%)'] = Inflation_series

Unemployement_subdata = df_250_countries['Unemployement(%)']
Unemployement_series = Unemployement_subdata.apply(findPer)
df_250_countries['Unemployement(%)'] = Unemployement_series

df_250_countries

In [None]:
# Information about the columns data-types
df_250_countries.info()

In [None]:
# Showing statistics for every column
df_250_countries.describe()

In [None]:
# Collecting the number of nulls in each column
df_250_countries.isnull().sum()

In [None]:
# Null values as a percentage per column ----------> Inorder to know which data needs to be dropped
df_250_countries.isnull().sum()*100/df_250_countries.isnull().count()

### From the analysis of the null value percentages and the statistics of each column:
- The Gini, Real Growth Rating(%), Literacy Rate(%), Inflation(%) and Unemployement(%) columns have a large number of null values: the null percentage ranges from 31.2% to 40.4%.
- There are probably some low outliers distorting the statistics.

In [None]:
# Real Growth Rating(%) before interpolation, plotted against the row index.

df_Growth = df_250_countries[['Real Growth Rating(%)']]
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(hspace=0.5)
ax = fig.gca()
ax.set_ylim(-15, 60)
ax.set_xticks(np.arange(0, 280, 10))
sns.lineplot(data=df_Growth, ax=ax)
plt.show()

In [None]:
# Literacy Rate(%) before interpolation, plotted against the row index.

df_Literacy = df_250_countries[['Literacy Rate(%)']]
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(hspace=0.5)
ax = fig.gca()
ax.set_ylim(15, 110)
ax.set_xticks(np.arange(0, 280, 10))
sns.lineplot(data=df_Literacy, ax=ax)
plt.show()

In [None]:
# Inflation(%) before interpolation, plotted against the row index.

df_Inflation = df_250_countries[['Inflation(%)']]
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(hspace=0.5)
ax = fig.gca()
ax.set_ylim(-5, 30)
ax.set_xticks(np.arange(0, 280, 10))
sns.lineplot(data=df_Inflation, ax=ax)
plt.show()

In [None]:
# Unemployement(%) before interpolation, plotted against the row index.

df_Unemployement = df_250_countries[['Unemployement(%)']]
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(hspace=0.5)
ax = fig.gca()
ax.set_ylim(-5, 100)
ax.set_xticks(np.arange(0, 280, 10))
sns.lineplot(data=df_Unemployement, ax=ax)
plt.show()

In [None]:
# Distinct country names, and the columns whose nulls are to be filled.
country_list = df_250_countries['Name'].unique()
fill_list = [
    'Real Growth Rating(%)',
    'Literacy Rate(%)',
    'Inflation(%)',
    'Unemployement(%)',
]

In [None]:
# Interpolate the fill columns separately within each country's rows.
# NOTE(review): if every Name occurs on a single row, interpolate() has no
# neighbouring values to work from and the NaNs are left as they are.
for country in country_list:
    country_rows = df_250_countries['Name'] == country
    interpolated = df_250_countries.loc[country_rows, fill_list].interpolate()
    df_250_countries.loc[country_rows, fill_list] = interpolated

# Nulls that survive interpolation are not dropped here.

In [None]:
# Replace the remaining nulls with each column's mean. numeric_only=True keeps
# the text columns (Name, Region, Subregion) out of the averaging; without it
# pandas >= 2.0 raises TypeError instead of silently skipping them.
df_250_countries = df_250_countries.fillna(df_250_countries.mean(numeric_only=True))

In [None]:
df_250_countries.isnull().sum()

In [None]:
df_250_countries.describe()

In [None]:
# Real Growth Rating(%) after interpolation and mean filling.

df_Growth = df_250_countries[['Real Growth Rating(%)']]
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(hspace=0.5)
ax = fig.gca()
ax.set_ylim(-15, 60)
ax.set_xticks(np.arange(0, 280, 10))
sns.lineplot(data=df_Growth, ax=ax)
plt.show()

In [None]:
# Literacy Rate(%) after interpolation and mean filling.

df_Literacy = df_250_countries[['Literacy Rate(%)']]
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(hspace=0.5)
ax = fig.gca()
ax.set_ylim(15, 110)
ax.set_xticks(np.arange(0, 280, 10))
sns.lineplot(data=df_Literacy, ax=ax)
plt.show()

In [None]:
# Inflation(%) after interpolation and mean filling.

df_Inflation = df_250_countries[['Inflation(%)']]
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(hspace=0.5)
ax = fig.gca()
ax.set_ylim(-5, 30)
ax.set_xticks(np.arange(0, 280, 10))
sns.lineplot(data=df_Inflation, ax=ax)
plt.show()

In [None]:
# Unemployement(%) after interpolation and mean filling.

df_Unemployement = df_250_countries[['Unemployement(%)']]
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(hspace=0.5)
ax = fig.gca()
ax.set_ylim(-5, 100)
ax.set_xticks(np.arange(0, 280, 10))
sns.lineplot(data=df_Unemployement, ax=ax)
plt.show()

In [None]:
# Map each numeric column to its 1-based position in the subplot grid.
col_dict = {
    'Population': 1,
    'Area': 2,
    'Gini': 3,
    'Real Growth Rating(%)': 4,
    'Literacy Rate(%)': 5,
    'Inflation(%)': 6,
    'Unemployement(%)': 7,
}

# One box plot per column (whiskers at 1.5 * IQR) to make outliers visible.
fig = plt.figure(figsize=(20, 30))
for variable, i in col_dict.items():
    ax = fig.add_subplot(5, 4, i)
    ax.boxplot(df_250_countries[variable], whis=1.5)
    ax.set_title(variable)

plt.show()

## From the analysis of Boxplots:

- As we assumed, the data has many outliers that distort the statistics of the dataset.
- Next, we will cap these outliers and look at the data after cleaning.

In [None]:
def outlier_count(col, data):
    """Print how many values of ``data[col]`` lie outside the 1.5 * IQR fences.

    Parameters
    ----------
    col : str
        Column to inspect; also used in the printed banner.
    data : pandas.DataFrame
        Frame holding the column.

    Returns
    -------
    None
        The count and the percentage (rounded to two decimals) are printed.
    """
    banner = 15 * '-'
    print(banner + col + banner)

    values = data[col]
    lower_q, upper_q = np.percentile(values, [25, 75])
    fence_width = (upper_q - lower_q) * 1.5
    low_fence = lower_q - fence_width
    high_fence = upper_q + fence_width

    beyond_fences = (values > high_fence) | (values < low_fence)
    n_outliers = int(np.count_nonzero(beyond_fences))
    share = round(n_outliers / len(values) * 100, 2)

    print('Number of outliers: {}'.format(n_outliers))
    print('Percent of data that is outlier: {}%'.format(share))

In [None]:
# Columns to report on; note that col_dict is rebound to a plain list here.
col_dict = [
    'Population',
    'Area',
    'Gini',
    'Real Growth Rating(%)',
    'Literacy Rate(%)',
    'Inflation(%)',
    'Unemployement(%)',
]
for col in col_dict:
    outlier_count(col, df_250_countries)

In [None]:
# Outliers are capped with winsorization rather than removed.
# Population: cap the top 12% of values and compare the box plots.
original_Population = df_250_countries['Population']
winsorized_Population = winsorize(original_Population, (0, 0.12))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Population)
ax_raw.set_title("original_Population")
ax_capped.boxplot(winsorized_Population)
ax_capped.set_title("winsorized_Population")

plt.show()

In [None]:
# Area: cap the top 9.9% of values and compare the box plots.
original_Area = df_250_countries['Area']
winsorized_Area = winsorize(original_Area, (0, 0.099))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Area)
ax_raw.set_title("original_Area")
ax_capped.boxplot(winsorized_Area)
ax_capped.set_title("winsorized_Area")

plt.show()

In [None]:
# Gini: cap the bottom 8.8% and the top 15.7% of values.
original_Gini = df_250_countries['Gini']
winsorized_Gini = winsorize(original_Gini, (0.088, 0.157))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Gini)
ax_raw.set_title("original_Gini")
ax_capped.boxplot(winsorized_Gini)
ax_capped.set_title("winsorized_Gini")

plt.show()

In [None]:
# Real Growth Rating(%): cap the bottom 8% and the top 11% of values.
original_Real_Growth_Rating = df_250_countries['Real Growth Rating(%)']
winsorized_Real_Growth_Rating = winsorize(original_Real_Growth_Rating, (0.08, 0.11))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Real_Growth_Rating)
ax_raw.set_title("original_Real_Growth_Rating")
ax_capped.boxplot(winsorized_Real_Growth_Rating)
ax_capped.set_title("winsorized_Real_Growth_Rating")

plt.show()

In [None]:
# Literacy Rate(%): cap the bottom 12.4% of values.
original_Literacy_Rate = df_250_countries['Literacy Rate(%)']
winsorized_Literacy_Rate = winsorize(original_Literacy_Rate, (0.124, 0))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Literacy_Rate)
ax_raw.set_title("original_Literacy_Rate")
ax_capped.boxplot(winsorized_Literacy_Rate)
ax_capped.set_title("winsorized_Literacy_Rate")

plt.show()

In [None]:
# Inflation(%): cap the bottom 2% and the top 11% of values.
original_Inflation = df_250_countries['Inflation(%)']
winsorized_Inflation = winsorize(original_Inflation, (0.02, 0.11))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Inflation)
ax_raw.set_title("original_Inflation")
ax_capped.boxplot(winsorized_Inflation)
ax_capped.set_title("winsorized_Inflation")

plt.show()

In [None]:
# Unemployement(%): cap the top 11% of values.
original_Unemployement = df_250_countries['Unemployement(%)']
winsorized_Unemployement = winsorize(original_Unemployement, (0, 0.11))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Unemployement)
ax_raw.set_title("original_Unemployement")
ax_capped.boxplot(winsorized_Unemployement)
ax_capped.set_title("winsorized_Unemployement")

plt.show()

In [None]:
# Re-count the 1.5 * IQR outliers in every winsorized array.
win_list = [
    winsorized_Population,
    winsorized_Area,
    winsorized_Gini,
    winsorized_Real_Growth_Rating,
    winsorized_Literacy_Rate,
    winsorized_Inflation,
    winsorized_Unemployement,
]
for variable in win_list:
    q25, q75 = np.percentile(variable, [25, 75])
    iqr = q75 - q25
    min_val = q25 - (iqr * 1.5)
    max_val = q75 + (iqr * 1.5)

    outlier_positions = np.where((variable > max_val) | (variable < min_val))[0]
    print("Number of outliers after winsorization : {}".format(len(outlier_positions)))

In [None]:
# Store each winsorized array next to its source column.
winsorized_columns = {
    'winsorized_Population': winsorized_Population,
    'winsorized_Area': winsorized_Area,
    'winsorized_Gini': winsorized_Gini,
    'winsorized_Real_Growth_Rating(%)': winsorized_Real_Growth_Rating,
    'winsorized_Literacy_Rate(%)': winsorized_Literacy_Rate,
    'winsorized_Inflation(%)': winsorized_Inflation,
    'winsorized_Unemployement(%)': winsorized_Unemployement,
}
for column_name, capped_values in winsorized_columns.items():
    df_250_countries[column_name] = capped_values

In [None]:
df_250_countries.describe()

In [None]:
# Histogram of every numeric column beside its winsorized counterpart.
all_col = [
    'Population', 'winsorized_Population',
    'Area', 'winsorized_Area',
    'Gini', 'winsorized_Gini',
    'Real Growth Rating(%)', 'winsorized_Real_Growth_Rating(%)',
    'Literacy Rate(%)', 'winsorized_Literacy_Rate(%)',
    'Inflation(%)', 'winsorized_Inflation(%)',
    'Unemployement(%)', 'winsorized_Unemployement(%)',
]

plt.figure(figsize=(15, 75))

# Two plots per grid row, so each original sits left of its winsorized version.
for position, column in enumerate(all_col, start=1):
    plt.subplot(18, 2, position)
    plt.hist(df_250_countries[column])
    plt.title(column)

plt.show()


# Life Expectancy Data

In [None]:
print(df_life_expectancy.shape)
df_life_expectancy.head(-5)

In [None]:
df_life_expectancy.tail()

### Renaming Columns For Easier Access

In [None]:
# Normalise the WHO headers: strip stray spaces and use underscores.
# The source header " thinness  1-19 years" is renamed to
# thinness_10to19_years because that is the age range the column represents.
life_column_renames = {
    "Life expectancy ": "Life_Expectancy",
    "Adult Mortality": "Adult_Mortality",
    "infant deaths": "Infant_Deaths",
    "percentage expenditure": "Percentage_Expenditure",
    "Hepatitis B": "HepatitisB",
    "Measles ": "Measles",
    " BMI ": "BMI",
    "under-five deaths ": "Under_Five_Deaths",
    "Diphtheria ": "Diphtheria",
    " HIV/AIDS": "HIV/AIDS",
    " thinness  1-19 years": "thinness_10to19_years",
    " thinness 5-9 years": "thinness_5to9_years",
    "Income composition of resources": "Income_Composition_Of_Resources",
    "Total expenditure": "Total_Expenditure",
}
df_life_expectancy.rename(columns=life_column_renames, inplace=True)

# Dataset Analysis

## Column Descriptions
- country (Nominal) - the country in which the indicators are from (i.e. United States of America or Congo)
- year (Ordinal) - the calendar year the indicators are from (ranging from 2000 to 2015)
- status (Nominal) - whether a country is considered to be 'Developing' or 'Developed' by WHO standards
- life_expectancy (Ratio) - the life expectancy of people in years for a particular country and year
- adult_mortality (Ratio) - the adult mortality rate per 1000 population (i.e. number of people dying between 15 and 60 years per 1000 population)
- infant_deaths (Ratio) - number of infant deaths per 1000 population
- alcohol (Ratio) - a country's alcohol consumption rate measured as liters of pure alcohol consumption per capita
- percentage_expenditure (Ratio) - expenditure on health as a percentage of Gross Domestic Product (gdp)
- hepatitis_b (Ratio) - number of 1 year olds with Hepatitis B immunization over all 1 year olds in population
- measles (Ratio) - number of reported Measles cases per 1000 population
- bmi (Interval/Ordinal) - average Body Mass Index (BMI) of a country's total population
- under-five_deaths (Ratio) - number of people under the age of five deaths per 1000 population
- polio (Ratio) - number of 1 year olds with Polio immunization over the number of all 1 year olds in population
- total_expenditure (Ratio) - government expenditure on health as a percentage of total government expenditure
- diphtheria (Ratio) - Diphtheria tetanus toxoid and pertussis (DTP3) immunization rate of 1 year olds
- hiv/aids (Ratio) - deaths per 1000 live births caused by HIV/AIDS for people under 5
- gdp (Ratio) - Gross Domestic Product per capita
- population (Ratio) - population of a country
- thinness_10-19_years (Ratio) - rate of thinness among people aged *10-19*
- thinness_5-9_years (Ratio) - rate of thinness among people aged 5-9
- income_composition_of_resources (Ratio) - Human Development Index in terms of income composition of resources
- schooling (Ratio) - average number of years of schooling of a population

In [None]:
# Information about the columns data-types
df_life_expectancy.info()

In [None]:
# Showing statistics for every column
df_life_expectancy.describe()

In [None]:
# Collecting the number of nulls in each column
df_life_expectancy.isnull().sum()

In [None]:
# Null values as a percentage per column ----------> Inorder to know which data needs to be dropped
df_life_expectancy.isnull().sum()*100/df_life_expectancy.isnull().count()

### From the analysis of the null value percentages and the statistics of each column:
- Several columns clearly contain unrealistic data, such as Under_Five_Deaths, Adult_Mortality and Infant_Deaths: their minimum is either 1 or 0, which is implausible across all the years and countries listed.
- There are probably some low outliers distorting the statistics.
- The Population column has many inaccurate values, which give a false minimum population (34), and many null values (22% of the column); we would drop the column if its null values could not be replaced properly.
- BMI having a minimum of 1 and a maximum of 87 is suspicious and suggests problems in the data (outliers, false values).


In [None]:
# Life expectancy before interpolation, plotted against the row index.

df_life = df_life_expectancy[['Life_Expectancy']]
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(hspace=0.5)
ax = fig.gca()
ax.set_ylim(30, 90)
ax.set_xticks(np.arange(0, 3000, 100))
sns.lineplot(data=df_life, ax=ax)

plt.show()

In [None]:
# Distortions before interpolation: one stacked line plot per column.

all_col = [
    'Life_Expectancy', 'Adult_Mortality', 'Infant_Deaths', 'Alcohol',
    'Percentage_Expenditure', 'HepatitisB', 'Under_Five_Deaths', 'Polio',
    'Total_Expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP',
    'Population', 'thinness_10to19_years', 'thinness_5to9_years',
    'Income_Composition_Of_Resources', 'Schooling',
]

plt.figure(figsize=(50, 20))

for slot, column in enumerate(all_col, start=1):
    plt.tight_layout(pad=.01)
    plt.subplot(20, 1, slot)
    sns.lineplot(data=df_life_expectancy[column])
    plt.xlabel(column, fontsize=8)

plt.show()


In [None]:
# Distinct countries, and the columns whose nulls are to be filled.
country_list = df_life_expectancy['Country'].unique()
fill_list = [
    'Life_Expectancy', 'Adult_Mortality', 'Alcohol', 'HepatitisB', 'BMI',
    'Polio', 'Total_Expenditure', 'Diphtheria', 'GDP', 'Population',
    'thinness_10to19_years', 'thinness_5to9_years',
    'Income_Composition_Of_Resources', 'Schooling',
]

In [None]:
# Interpolate the fill columns separately within each country's rows, so
# values are never blended across two different countries.
for country in country_list:
    country_rows = df_life_expectancy['Country'] == country
    interpolated = df_life_expectancy.loc[country_rows, fill_list].interpolate()
    df_life_expectancy.loc[country_rows, fill_list] = interpolated

# Nulls that survive interpolation are not dropped here.

In [None]:
# Replace the remaining nulls with each column's overall mean (an earlier
# per-Year mean imputation was tried and left disabled).
# numeric_only=True keeps the text columns out of the averaging; without it
# pandas >= 2.0 raises TypeError instead of silently skipping them.
df_life_expectancy = df_life_expectancy.fillna(df_life_expectancy.mean(numeric_only=True))

In [None]:
df_life_expectancy.isnull().sum()

In [None]:
df_life_expectancy.describe()

In [None]:
# After interpolation: the same stacked line plots, one per column.

all_col = [
    'Life_Expectancy', 'Adult_Mortality', 'Infant_Deaths', 'Alcohol',
    'Percentage_Expenditure', 'HepatitisB', 'Under_Five_Deaths', 'Polio',
    'Total_Expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP',
    'Population', 'thinness_10to19_years', 'thinness_5to9_years',
    'Income_Composition_Of_Resources', 'Schooling',
]

plt.figure(figsize=(50, 20))

for slot, column in enumerate(all_col, start=1):
    plt.tight_layout(pad=.01)
    plt.subplot(20, 1, slot)
    sns.lineplot(data=df_life_expectancy[column])
    plt.xlabel(column, fontsize=8)

plt.show()


### Detecting and Removing Outliers

In [None]:
# Map each numeric column to its 1-based position in the subplot grid.
col_dict = {
    'Life_Expectancy': 1,
    'Adult_Mortality': 2,
    'Infant_Deaths': 3,
    'Alcohol': 4,
    'Percentage_Expenditure': 5,
    'HepatitisB': 6,
    'Measles': 7,
    'BMI': 8,
    'Under_Five_Deaths': 9,
    'Polio': 10,
    'Total_Expenditure': 11,
    'Diphtheria': 12,
    'HIV/AIDS': 13,
    'GDP': 14,
    'Population': 15,
    'thinness_10to19_years': 16,
    'thinness_5to9_years': 17,
    'Income_Composition_Of_Resources': 18,
    'Schooling': 19,
}

# One box plot per column (whiskers at 1.5 * IQR) to make outliers visible.
fig = plt.figure(figsize=(20, 30))
for variable, i in col_dict.items():
    ax = fig.add_subplot(5, 4, i)
    ax.boxplot(df_life_expectancy[variable], whis=1.5)
    ax.set_title(variable)

plt.show()

## From the analysis of Boxplots:

- As we assumed, the data has many outliers that distort the statistics of the dataset.
- Next, we will cap these outliers and look at the data after cleaning.

In [None]:
def outlier_count(col, data):
    """Report the 1.5 * IQR outliers of one column on stdout.

    Prints a banner with the column name, then the number of values of
    ``data[col]`` below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` and
    that number as a percentage of the column length (two decimals).

    Parameters
    ----------
    col : str
        Name of the column to inspect.
    data : pandas.DataFrame
        Frame that holds the column.
    """
    print('{0}{1}{0}'.format(15 * '-', col))

    series = data[col]
    first_quartile, third_quartile = np.percentile(series, [25, 75])
    spread = third_quartile - first_quartile
    floor = first_quartile - (spread * 1.5)
    ceiling = third_quartile + (spread * 1.5)

    flagged = len(np.where((series > ceiling) | (series < floor))[0])
    flagged_percent = round(flagged / len(series) * 100, 2)

    print('Number of outliers: {}'.format(flagged))
    print('Percent of data that is outlier: {}%'.format(flagged_percent))

In [None]:
# Columns to report on; note that col_dict is rebound to a plain list here.
col_dict = [
    'Life_Expectancy', 'Adult_Mortality', 'Infant_Deaths', 'Alcohol',
    'Percentage_Expenditure', 'HepatitisB', 'Measles', 'BMI',
    'Under_Five_Deaths', 'Polio', 'Total_Expenditure', 'Diphtheria',
    'HIV/AIDS', 'GDP', 'Population', 'thinness_10to19_years',
    'thinness_5to9_years', 'Income_Composition_Of_Resources', 'Schooling',
]
for col in col_dict:
    outlier_count(col, df_life_expectancy)

In [None]:
# Outliers are capped with winsorization rather than removed.
# Life_Expectancy: cap the bottom 10% of values and compare the box plots.
original_Life_Expectancy = df_life_expectancy['Life_Expectancy']
winsorized_Life_Expectancy = winsorize(original_Life_Expectancy, (0.1, 0))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Life_Expectancy)
ax_raw.set_title("original_Life_Expectancy")
ax_capped.boxplot(winsorized_Life_Expectancy)
ax_capped.set_title("winsorized_Life_Expectancy")

plt.show()

In [None]:
# Adult_Mortality: cap the top 10% of values.
original_Adult_Mortality = df_life_expectancy['Adult_Mortality']
winsorized_Adult_Mortality = winsorize(original_Adult_Mortality, (0, 0.1))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Adult_Mortality)
ax_raw.set_title("original_Adult_Mortality")
ax_capped.boxplot(winsorized_Adult_Mortality)
ax_capped.set_title("winsorized_Adult_Mortality")

plt.show()

In [None]:
# Infant_Deaths: cap the top 11% of values.
original_Infant_Deaths = df_life_expectancy['Infant_Deaths']
winsorized_Infant_Deaths = winsorize(original_Infant_Deaths, (0, 0.11))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Infant_Deaths)
ax_raw.set_title("original_Infant_Deaths")
ax_capped.boxplot(winsorized_Infant_Deaths)
ax_capped.set_title("winsorized_Infant_Deaths")

plt.show()

In [None]:
# Alcohol: cap the top 10% of values.
original_Alcohol = df_life_expectancy['Alcohol']
winsorized_Alcohol = winsorize(original_Alcohol, (0, 0.1))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Alcohol)
ax_raw.set_title("original_Alcohol")
ax_capped.boxplot(winsorized_Alcohol)
ax_capped.set_title("winsorized_Alcohol")

plt.show()

In [None]:
# Percentage_Expenditure: cap the top 15% of values.
original_Percentage_Exp = df_life_expectancy['Percentage_Expenditure']
winsorized_Percentage_Exp = winsorize(original_Percentage_Exp, (0, 0.15))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Percentage_Exp)
ax_raw.set_title("original_Percentage_Exp")
ax_capped.boxplot(winsorized_Percentage_Exp)
ax_capped.set_title("winsorized_Percentage_Exp")

plt.show()

In [None]:
# HepatitisB: cap the bottom 11% of values.
original_HepatitisB = df_life_expectancy['HepatitisB']
winsorized_HepatitisB = winsorize(original_HepatitisB, (0.11, 0))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_HepatitisB)
ax_raw.set_title("original_HepatitisB")
ax_capped.boxplot(winsorized_HepatitisB)
ax_capped.set_title("winsorized_HepatitisB")

plt.show()

In [None]:
# Measles: cap the top 19% of values.
original_Measles = df_life_expectancy['Measles']
winsorized_Measles = winsorize(original_Measles, (0, 0.19))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Measles)
ax_raw.set_title("original_Measles")
ax_capped.boxplot(winsorized_Measles)
ax_capped.set_title("winsorized_Measles")

plt.show()
# Winsorizing did not improve this column, so it should be dropped.

In [None]:
# Under_Five_Deaths: cap the top 13.5% of values.
original_Under_Five_Deaths = df_life_expectancy['Under_Five_Deaths']
winsorized_Under_Five_Deaths = winsorize(original_Under_Five_Deaths, (0, 0.135))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Under_Five_Deaths)
ax_raw.set_title("original_Under_Five_Deaths")
ax_capped.boxplot(winsorized_Under_Five_Deaths)
ax_capped.set_title("winsorized_Under_Five_Deaths")

plt.show()

In [None]:
# Polio: cap the bottom 10% of values.
original_Polio = df_life_expectancy['Polio']
winsorized_Polio = winsorize(original_Polio, (0.1, 0))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Polio)
ax_raw.set_title("original_Polio")
ax_capped.boxplot(winsorized_Polio)
ax_capped.set_title("winsorized_Polio")

plt.show()

In [None]:
# Total_Expenditure: cap the top 2% of values.
original_Tot_Exp = df_life_expectancy['Total_Expenditure']
winsorized_Tot_Exp = winsorize(original_Tot_Exp, (0, 0.02))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Tot_Exp)
ax_raw.set_title("original_Tot_Exp")
ax_capped.boxplot(winsorized_Tot_Exp)
ax_capped.set_title("winsorized_Tot_Exp")

plt.show()

In [None]:
# Diphtheria: cap the bottom 10.5% of values.
original_Diphtheria = df_life_expectancy['Diphtheria']
winsorized_Diphtheria = winsorize(original_Diphtheria, (0.105, 0))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Diphtheria)
ax_raw.set_title("original_Diphtheria")
ax_capped.boxplot(winsorized_Diphtheria)
ax_capped.set_title("winsorized_Diphtheria")

plt.show()

In [None]:
# HIV/AIDS: cap the top 19% of values.
original_HIV = df_life_expectancy['HIV/AIDS']
winsorized_HIV = winsorize(original_HIV, (0, 0.19))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_HIV)
ax_raw.set_title("original_HIV")
ax_capped.boxplot(winsorized_HIV)
ax_capped.set_title("winsorized_HIV")

plt.show()

In [None]:
# thinness_10to19_years: cap the top 10% of values.
original_thinness_10to19_years = df_life_expectancy['thinness_10to19_years']
winsorized_thinness_10to19_years = winsorize(original_thinness_10to19_years, (0, 0.1))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_thinness_10to19_years)
ax_raw.set_title("original_thinness_10to19_years")
ax_capped.boxplot(winsorized_thinness_10to19_years)
ax_capped.set_title("winsorized_thinness_10to19_years")

plt.show()


In [None]:
# GDP: cap the top 13% of values.
original_GDP = df_life_expectancy['GDP']
winsorized_GDP = winsorize(original_GDP, (0, 0.13))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_GDP)
ax_raw.set_title("original_GDP")
ax_capped.boxplot(winsorized_GDP)
ax_capped.set_title("winsorized_GDP")

plt.show()

In [None]:
# Population: cap the top 10% of values.
# NOTE: this rebinds original_Population / winsorized_Population, which an
# earlier cell assigned from df_250_countries.
original_Population = df_life_expectancy['Population']
winsorized_Population = winsorize(original_Population, (0, 0.1))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Population)
ax_raw.set_title("original_Population")
ax_capped.boxplot(winsorized_Population)
ax_capped.set_title("winsorized_Population")

plt.show()

In [None]:
# thinness_5to9_years: cap the top 10% of values.
original_thinness_5to9_years = df_life_expectancy['thinness_5to9_years']
winsorized_thinness_5to9_years = winsorize(original_thinness_5to9_years, (0, 0.1))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_thinness_5to9_years)
ax_raw.set_title("original_thinness_5to9_years")
ax_capped.boxplot(winsorized_thinness_5to9_years)
ax_capped.set_title("winsorized_thinness_5to9_years")

plt.show()

In [None]:
# Income_Composition_Of_Resources: cap the bottom 5% of values.
original_Income_Comp_Of_Resources = df_life_expectancy['Income_Composition_Of_Resources']
winsorized_Income_Comp_Of_Resources = winsorize(original_Income_Comp_Of_Resources, (0.05, 0))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Income_Comp_Of_Resources)
ax_raw.set_title("original_Income_Comp_Of_Resources")
ax_capped.boxplot(winsorized_Income_Comp_Of_Resources)
ax_capped.set_title("winsorized_Income_Comp_Of_Resources")

plt.show()

In [None]:
# Schooling: cap the bottom 2.5% and the top 1% of values.
original_Schooling = df_life_expectancy['Schooling']
winsorized_Schooling = winsorize(original_Schooling, (0.025, 0.01))

fig, (ax_raw, ax_capped) = plt.subplots(1, 2, figsize=(18, 6))
ax_raw.boxplot(original_Schooling)
ax_raw.set_title("original_Schooling")
ax_capped.boxplot(winsorized_Schooling)
ax_capped.set_title("winsorized_Schooling")

plt.show()

In [None]:
# Re-count the 1.5 * IQR outliers in every winsorized array.
# (winsorized_Measles is not part of this list.)
win_list = [
    winsorized_Life_Expectancy,
    winsorized_Adult_Mortality,
    winsorized_Infant_Deaths,
    winsorized_Alcohol,
    winsorized_Percentage_Exp,
    winsorized_HepatitisB,
    winsorized_Under_Five_Deaths,
    winsorized_Polio,
    winsorized_Tot_Exp,
    winsorized_Diphtheria,
    winsorized_HIV,
    winsorized_GDP,
    winsorized_Population,
    winsorized_thinness_10to19_years,
    winsorized_thinness_5to9_years,
    winsorized_Income_Comp_Of_Resources,
    winsorized_Schooling,
]

for variable in win_list:
    q25, q75 = np.percentile(variable, [25, 75])
    iqr = q75 - q25
    min_val = q25 - (iqr * 1.5)
    max_val = q75 + (iqr * 1.5)

    outlier_positions = np.where((variable > max_val) | (variable < min_val))[0]
    print("Number of outliers after winsorization : {}".format(len(outlier_positions)))

In [None]:
# Store each winsorized array next to its source column.
winsorized_columns = {
    'winsorized_Life_Expectancy': winsorized_Life_Expectancy,
    'winsorized_Adult_Mortality': winsorized_Adult_Mortality,
    'winsorized_Infant_Deaths': winsorized_Infant_Deaths,
    'winsorized_Alcohol': winsorized_Alcohol,
    'winsorized_Percentage_Expenditure': winsorized_Percentage_Exp,
    'winsorized_HepatitisB': winsorized_HepatitisB,
    'winsorized_Under_Five_Deaths': winsorized_Under_Five_Deaths,
    'winsorized_Polio': winsorized_Polio,
    'winsorized_Total_Expenditure': winsorized_Tot_Exp,
    'winsorized_Diphtheria': winsorized_Diphtheria,
    'winsorized_HIV': winsorized_HIV,
    'winsorized_GDP': winsorized_GDP,
    'winsorized_Population': winsorized_Population,
    'winsorized_thinness_10to19_years': winsorized_thinness_10to19_years,
    'winsorized_thinness_5to9_years': winsorized_thinness_5to9_years,
    'winsorized_Income_Composition_Of_Resources': winsorized_Income_Comp_Of_Resources,
    'winsorized_Schooling': winsorized_Schooling,
}
for column_name, capped_values in winsorized_columns.items():
    df_life_expectancy[column_name] = capped_values

In [None]:
df_life_expectancy.describe()

In [None]:
# Histogram of every numeric column beside its winsorized counterpart.
all_col = [
    'Life_Expectancy', 'winsorized_Life_Expectancy',
    'Adult_Mortality', 'winsorized_Adult_Mortality',
    'Infant_Deaths', 'winsorized_Infant_Deaths',
    'Alcohol', 'winsorized_Alcohol',
    'Percentage_Expenditure', 'winsorized_Percentage_Expenditure',
    'HepatitisB', 'winsorized_HepatitisB',
    'Under_Five_Deaths', 'winsorized_Under_Five_Deaths',
    'Polio', 'winsorized_Polio',
    'Total_Expenditure', 'winsorized_Total_Expenditure',
    'Diphtheria', 'winsorized_Diphtheria',
    'HIV/AIDS', 'winsorized_HIV',
    'GDP', 'winsorized_GDP',
    'Population', 'winsorized_Population',
    'thinness_10to19_years', 'winsorized_thinness_10to19_years',
    'thinness_5to9_years', 'winsorized_thinness_5to9_years',
    'Income_Composition_Of_Resources', 'winsorized_Income_Composition_Of_Resources',
    'Schooling', 'winsorized_Schooling',
]

plt.figure(figsize=(15, 75))

# Two plots per grid row, so each original sits left of its winsorized version.
for position, column in enumerate(all_col, start=1):
    plt.subplot(18, 2, position)
    plt.hist(df_life_expectancy[column])
    plt.title(column)

plt.show()


# Searching For Countries With Invalid Characters
Checking the Country column for names that contain anything other than letters, dots and whitespace, so that all country names follow the same pattern.

In [None]:
# Unique country names in the life-expectancy data.
countries = df_life_expectancy['Country'].drop_duplicates()

# A valid name consists only of ASCII letters, dots and whitespace.
# Raw string: '\.' and '\s' are regex escapes, not Python string escapes
# (in a normal string literal they trigger an invalid-escape warning).
pattern = r'^[A-Za-z\.\s]*$'

# True where the name matches the pattern. na=False makes a missing name
# count as "not valid" instead of propagating NaN, which would break `~mask`.
mask = countries.str.contains(pattern, na=False)

# Names that do NOT match the pattern.
mask_inverse = ~mask
invalid_countries = countries.loc[mask_inverse]

print(invalid_countries)


- How much does the average life expectancy change from year to year?

In [None]:
# Bar plot of the mean winsorized life expectancy for each year.
mean_life_expectancy_by_year = df_life_expectancy.groupby('Year')['winsorized_Life_Expectancy'].mean()

plt.figure(figsize=(7,5))
plt.bar(mean_life_expectancy_by_year.index, mean_life_expectancy_by_year, color='black', alpha=0.65)
plt.xlabel("Year",fontsize=12)
plt.ylabel("Average Life_Expectancy",fontsize=12)
plt.title("Life_Expectancy and Year")
plt.show()

In [None]:
# Notes collected while exploring the life-expectancy data:
# - GDP on health can be used in the correlation with the all-countries data on the Gini column/country.
# - The Population column has outliers.
# - Total Expenditure is the sum of general government health expenditure and private health
#   expenditure in a given year, calculated in national currency units in current prices
#   (total money spent on health).
# - Thinness could be tidied into: value, age range.
# - Income Composition Of Resources ... time the population and area can refer to Gini.

# Bar plot of the mean winsorized life expectancy for each country.
df_life_expectancy_Countries = df_life_expectancy.groupby('Country')['winsorized_Life_Expectancy'].mean()
country_axes = df_life_expectancy_Countries.plot(kind='bar', figsize=(50,15), fontsize=15)
country_axes.set_title("Life_Expectancy and Countries",fontsize=40)
country_axes.set_xlabel("Country",fontsize=35)
country_axes.set_ylabel("Average Life_Expectancy",fontsize=35)
plt.show()

# World Of Happiness Data

In [None]:
# Print the (rows, columns) shape of each yearly World Happiness table on one line.
happiness_frames_by_label = {
    "2015:": df_worldHappiness_2015,
    "2016:": df_worldHappiness_2016,
    "2017:": df_worldHappiness_2017,
    "2018:": df_worldHappiness_2018,
    "2019:": df_worldHappiness_2019,
}
print(*(part for label, frame in happiness_frames_by_label.items() for part in (label, frame.shape)))

# World Happiness 2015:


## Column Descriptions
- Country - The Name of the country.
- Region - Region the country belongs to.
- Happiness Score - The average score from a survey in which each participant rates his/her happiness from 0 to 10 for the current country
- Happiness Rank - The country's position in the ranking, based on its Happiness Score
- Standard Error - The standard error of the happiness score (+ or -)
- Economy (GDP per Capita) - The country's GDP divided by its total population
- Family  - 
- Health(Life Expectancy) - Average age a person is expected to die
- Freedom  - this factor is determined based on the survey result of the Gallup World Poll. Applicants were asked this question:“Are you satisfied or dissatisfied with your freedom to choose what you do with your life?”


- Trust (Government Corruption) - “Perceptions of corruption” is also determined as a key factor in the Happiness Index. It is calculated by averaging the answers to the following 2 questions: “Is corruption widespread throughout the government or not?” & “Is corruption widespread within businesses or not?”

- Generosity - Generosity was also determined by the Gallup World Poll survey results, by having respondents answer this question:“Have you donated money to a charity in the past month?”Again, the average of all responses (yes being 1 and no being 0) determines the output of this key factor.

- Dystopia Residual - 

In [None]:
# Display the 2015 World Happiness table.
df_worldHappiness_2015

In [None]:
# Row count, column dtypes and per-column null counts of the 2015 table.
print(f"Total Number of entries : {len(df_worldHappiness_2015)}")
print(df_worldHappiness_2015.dtypes)
print("\nNull Values : ", df_worldHappiness_2015.isnull().sum(), sep="\n")

In [None]:
# Count the distinct values (NaN included) in each column of the 2015 table
# and show the result as a two-column data frame.
dict_worldHappiness_2015_unique_values = {
    'Column Name': list(df_worldHappiness_2015),
    'Unique values': [len(df_worldHappiness_2015[column_name].unique())
                      for column_name in df_worldHappiness_2015],
}

df_worldHappiness_2015_unique_values = pd.DataFrame(data=dict_worldHappiness_2015_unique_values)
df_worldHappiness_2015_unique_values

In [None]:
# Keep the describe() table in a variable: the outlier-detection cell reads the
# 'min', 'max', '25%' and '75%' rows from it.
df_worldHappiness_2015_statistics=df_worldHappiness_2015.describe()
df_worldHappiness_2015_statistics

In [None]:
# For every non-object column, compute the box-plot whiskers (quartile -/+ 1.5 * IQR).
# Columns whose min or max lies beyond a whisker are recorded together with the
# number of rows outside the whiskers and the whisker positions.
dict_worldHappiness_2015_outlier_columns = {}
for column in df_worldHappiness_2015:
    if df_worldHappiness_2015.dtypes[column] == 'O':
        continue

    column_stats = df_worldHappiness_2015_statistics[column]
    minimum = column_stats['min']
    maximum = column_stats['max']
    # NOTE: despite its name, IQR already holds 1.5 * (Q3 - Q1), i.e. the whisker length.
    IQR = 1.5 * (column_stats['75%'] - column_stats['25%'])
    left_whihsker_min = column_stats['25%'] - IQR
    right_whihsker_max = column_stats['75%'] + IQR

    if not (maximum > right_whihsker_max or minimum < left_whihsker_min):
        continue

    column_values = df_worldHappiness_2015[column]
    rows_above = df_worldHappiness_2015[column_values > right_whihsker_max]
    rows_below = df_worldHappiness_2015[column_values < left_whihsker_min]
    dict_worldHappiness_2015_outlier_columns[column] = {
        'count': len(rows_above) + len(rows_below),
        'left_whihsker_min': left_whihsker_min,
        'right_whihsker_max': right_whihsker_max,
    }

for column, outlier_info in dict_worldHappiness_2015_outlier_columns.items():
    print(column, ":", outlier_info)
   


In [None]:
# One row of three plots per outlier column: box plot, density histogram
# with a KDE curve, and a plain count histogram.
outlier_columns = list(dict_worldHappiness_2015_outlier_columns)
number_of_rows = len(outlier_columns)

fig = plt.figure(figsize=(14,22))
for row, column in enumerate(outlier_columns):
    column_values = df_worldHappiness_2015[column]
    first_panel = 3 * row + 1  # subplot indices are 1-based, three panels per row

    plt.subplot(number_of_rows, 3, first_panel)
    sns.boxplot(x=column_values, whis=1.5)

    plt.subplot(number_of_rows, 3, first_panel + 1)
    # sns.distplot is deprecated since seaborn 0.11; this is the documented
    # histplot equivalent of its default output (density histogram + KDE).
    sns.histplot(column_values, kde=True, stat="density", kde_kws=dict(cut=3))

    plt.subplot(number_of_rows, 3, first_panel + 2)
    sns.histplot(column_values)

plt.show()



In [None]:
# For each outlier column, show the rows whose value lies outside the whiskers.
for column, whiskers in dict_worldHappiness_2015_outlier_columns.items():
    lw = whiskers['left_whihsker_min']
    rw = whiskers['right_whihsker_max']
    column_values = df_worldHappiness_2015[column]
    outside_whiskers = (column_values < lw) | (column_values > rw)
    print(column)
    display(df_worldHappiness_2015[outside_whiskers])


In [None]:
# Winsorize Family: clip the lowest 5% of values; the upper tail is left untouched.
original_Family = df_worldHappiness_2015['Family']
winsorized_Family = winsorize(original_Family, (0.05, 0))

plt.figure(figsize=(18,6))
family_panels = [("original_Family", original_Family), ("winsorized_Family", winsorized_Family)]
for position, (panel_title, panel_values) in enumerate(family_panels, start=1):
    plt.subplot(1, 2, position)
    plt.boxplot(panel_values)
    plt.title(panel_title)

plt.show()

In [None]:
# Winsorize Trust: clip the highest 10% of values; the lower tail is left untouched.
original_Trust = df_worldHappiness_2015['Trust (Government Corruption)']
winsorized_Trust = winsorize(original_Trust, (0, 0.1))

plt.figure(figsize=(18,6))
trust_panels = [("original_Trust", original_Trust), ("winsorized_Trust", winsorized_Trust)]
for position, (panel_title, panel_values) in enumerate(trust_panels, start=1):
    plt.subplot(1, 2, position)
    plt.boxplot(panel_values)
    plt.title(panel_title)

plt.show()

In [None]:
# Winsorize Generosity: clip the highest 10% of values; the lower tail is left untouched.
original_Generosity = df_worldHappiness_2015['Generosity']
winsorized_Generosity = winsorize(original_Generosity, (0, 0.1))

plt.figure(figsize=(18,6))
generosity_panels = [("original_Generosity", original_Generosity), ("winsorized_Generosity", winsorized_Generosity)]
for position, (panel_title, panel_values) in enumerate(generosity_panels, start=1):
    plt.subplot(1, 2, position)
    plt.boxplot(panel_values)
    plt.title(panel_title)

plt.show()

In [None]:
# Winsorize Dystopia Residual: clip 5% of the values at each tail.
original_Dystopia = df_worldHappiness_2015['Dystopia Residual']
winsorized_Dystopia = winsorize(original_Dystopia, (0.05, 0.05))

plt.figure(figsize=(18,6))
dystopia_panels = [("original_Dystopia", original_Dystopia), ("winsorized_Dystopia", winsorized_Dystopia)]
for position, (panel_title, panel_values) in enumerate(dystopia_panels, start=1):
    plt.subplot(1, 2, position)
    plt.boxplot(panel_values)
    plt.title(panel_title)

plt.show()

In [None]:
# Overwrite the four outlier columns of the 2015 table with their winsorized values.
winsorized_happiness_columns = {
    'Dystopia Residual': winsorized_Dystopia,
    'Family': winsorized_Family,
    'Generosity': winsorized_Generosity,
    'Trust (Government Corruption)': winsorized_Trust,
}
for happiness_column, winsorized_values in winsorized_happiness_columns.items():
    df_worldHappiness_2015[happiness_column] = winsorized_values


In [None]:
# Same three-panel view (box plot, density histogram with KDE, count histogram)
# of the outlier columns, this time on the current (winsorized) 2015 values.
outlier_columns = list(dict_worldHappiness_2015_outlier_columns)
number_of_rows = len(outlier_columns)

fig = plt.figure(figsize=(14,22))
for row, column in enumerate(outlier_columns):
    column_values = df_worldHappiness_2015[column]
    first_panel = 3 * row + 1  # subplot indices are 1-based, three panels per row

    plt.subplot(number_of_rows, 3, first_panel)
    sns.boxplot(x=column_values, whis=1.5)

    plt.subplot(number_of_rows, 3, first_panel + 1)
    # sns.distplot is deprecated since seaborn 0.11; this is the documented
    # histplot equivalent of its default output (density histogram + KDE).
    sns.histplot(column_values, kde=True, stat="density", kde_kws=dict(cut=3))

    plt.subplot(number_of_rows, 3, first_panel + 2)
    sns.histplot(column_values)

plt.show()

# Data Integration

In [None]:
# Print the (rows, columns) shape of the all-countries table, then display it.
print(df_250_countries.shape)
df_250_countries

In [None]:
# Drop the original numeric columns; the next cell renames the winsorized_*
# columns to these names.
columns_replaced_by_winsorized = [
    'Population',
    'Area',
    'Gini',
    'Real Growth Rating(%)',
    'Literacy Rate(%)',
    'Inflation(%)',
    'Unemployement(%)',
]
df_250_countries_cleaned = df_250_countries.drop(columns=columns_replaced_by_winsorized)
df_250_countries_cleaned

In [None]:
# Give each winsorized column the name of the original column it replaces.
winsorized_to_original_country_columns = {
    'winsorized_Population': 'Population',
    'winsorized_Area': 'Area',
    'winsorized_Gini': 'Gini',
    'winsorized_Real_Growth_Rating(%)': 'Real Growth Rating(%)',
    'winsorized_Literacy_Rate(%)': 'Literacy Rate(%)',
    'winsorized_Inflation(%)': 'Inflation(%)',
    'winsorized_Unemployement(%)': 'Unemployement(%)',
}
df_250_countries_cleaned = df_250_countries_cleaned.rename(columns=winsorized_to_original_country_columns)
df_250_countries_cleaned

In [None]:
# Use 'Country' as the key column name, matching the other datasets.
df_250_countries_cleaned = df_250_countries_cleaned.rename(columns={'Name':'Country'})

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained inplace form acts on an intermediate object and
# is not guaranteed to update the data frame (it never does under Copy-on-Write).
df_250_countries_cleaned['Country'] = df_250_countries_cleaned['Country'].replace(
    {'United Kingdom of Great Britain and Northern Ireland':'United Kingdom'})
df_250_countries_cleaned

In [None]:
# Map the World Happiness country names onto the long-form names used by the
# datasets it is merged with.
happiness_country_renames = {
    'United States': 'United States of America',
    'Vietnam': 'Viet Nam',
    'Venezuela': 'Venezuela (Bolivarian Republic of)',
    'South Korea': 'Republic of Korea',
    'Bolivia': 'Bolivia (Plurinational State of)',
    'Moldova': 'Republic of Moldova',
    'Russia': 'Russian Federation',
    # NOTE(review): Somaliland is folded into Somalia here - confirm this is intended.
    'Somaliland region': 'Somalia',
    'Laos': "Lao People's Democratic Republic",
    'Iran': 'Iran (Islamic Republic of)',
    # Kinshasa is the capital of the Democratic Republic of the Congo and
    # Brazzaville the capital of the (Republic of the) Congo; the original
    # mapping had these two swapped.
    'Congo (Kinshasa)': 'Democratic Republic of the Congo',
    'Congo (Brazzaville)': 'Congo',
    'Tanzania': 'United Republic of Tanzania',
    'Syria': 'Syrian Arab Republic',
}

# Assign back rather than replace(..., inplace=True) on the selected column,
# which is not guaranteed to write through to the data frame.
df_worldHappiness_2015['Country'] = df_worldHappiness_2015['Country'].replace(happiness_country_renames)
df_worldHappiness_2015

In [None]:
# Inner join: keep only the countries present in both the 2015 happiness table
# and the all-countries table.
df_all_countries_world_happiness = df_worldHappiness_2015.merge(
    df_250_countries_cleaned,
    how='inner',
    on='Country',
)
df_all_countries_world_happiness

In [None]:
# Print the (rows, columns) shape of the life-expectancy table, then display it.
print(df_life_expectancy.shape)
df_life_expectancy

In [None]:
# Keep only the rows recorded for the year 2015.
is_year_2015 = df_life_expectancy['Year'] == 2015
df_life_expectancy_2015 = df_life_expectancy.loc[is_year_2015]
df_life_expectancy_2015

In [None]:
# Drop the original (non-winsorized) numeric columns from both the all-years
# frame and the 2015-only frame; the winsorized_* columns remain.
original_life_expectancy_columns = [
    'Life_Expectancy', 'Adult_Mortality', 'Infant_Deaths', 'Alcohol',
    'Percentage_Expenditure', 'HepatitisB', 'Under_Five_Deaths', 'Polio',
    'Total_Expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population',
    'thinness_10to19_years', 'thinness_5to9_years',
    'Income_Composition_Of_Resources', 'Schooling',
]
df_life_expectancy_Cleaned = df_life_expectancy.drop(columns=original_life_expectancy_columns)
df_life_expectancy_2015_Cleaned = df_life_expectancy_2015.drop(columns=original_life_expectancy_columns)
df_life_expectancy_2015_Cleaned

In [None]:
# Rename every winsorized_* column to the name of the original column it
# replaces, in both the all-years frame and the 2015-only frame.
winsorized_to_original_life_columns = {
    'winsorized_Life_Expectancy': 'Life_Expectancy',
    'winsorized_Adult_Mortality': 'Adult_Mortality',
    'winsorized_Infant_Deaths': 'Infant_Deaths',
    'winsorized_Alcohol': 'Alcohol',
    'winsorized_Percentage_Expenditure': 'Percentage_Expenditure',
    'winsorized_HepatitisB': 'HepatitisB',
    'winsorized_Under_Five_Deaths': 'Under_Five_Deaths',
    'winsorized_Polio': 'Polio',
    'winsorized_Total_Expenditure': 'Total_Expenditure',
    'winsorized_Diphtheria': 'Diphtheria',
    'winsorized_HIV': 'HIV/AIDS',
    'winsorized_GDP': 'GDP',
    'winsorized_Population': 'Population',
    'winsorized_thinness_10to19_years': 'thinness_10to19_years',
    'winsorized_thinness_5to9_years': 'thinness_5to9_years',
    'winsorized_Income_Composition_Of_Resources': 'Income_Composition_Of_Resources',
    'winsorized_Schooling': 'Schooling',
}
df_life_expectancy_Cleaned = df_life_expectancy_Cleaned.rename(columns=winsorized_to_original_life_columns)
df_life_expectancy_2015_Cleaned = df_life_expectancy_2015_Cleaned.rename(columns=winsorized_to_original_life_columns)
df_life_expectancy_2015_Cleaned

In [None]:
# Display the 2015 rows of the life-expectancy table (before the column clean-up).
df_life_expectancy_2015

# Integrating all data on countries

In [None]:
# Print the shape of the cleaned 2015 frame and keep its Country column for
# comparison against the happiness data.
print(df_life_expectancy_2015_Cleaned.shape)
df_life_expectancy_countries = df_life_expectancy_2015_Cleaned['Country']

In [None]:
# (rows, columns) of the 2015 happiness table.
df_worldHappiness_2015.shape

In [None]:
# Outer-join the two Country columns; the 'Exist' indicator column records
# whether each name occurs in the left table only, the right table only, or both.
countries_df = pd.merge(
    left=df_worldHappiness_2015['Country'],
    right=df_life_expectancy_countries,
    how='outer',
    indicator='Exist',
)
countries_df

In [None]:
# Countries that appear in only one of the two tables.
in_one_table_only = countries_df['Exist'] != 'both'
diff_df = countries_df[in_one_table_only]
diff_df

In [None]:
# Align the remaining country names with the spelling used in the happiness data.
# Assign back rather than replace(..., inplace=True) on the selected column:
# the chained inplace form acts on an intermediate object and is not guaranteed
# to update the data frame (it never does under Copy-on-Write).
df_life_expectancy_2015_Cleaned['Country'] = df_life_expectancy_2015_Cleaned['Country'].replace({
    'Czechia': 'Czech Republic',
    'The former Yugoslav republic of Macedonia': 'Macedonia',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
})

In [None]:
# Repeat the outer join of the Country columns after the names were aligned;
# 'Exist' marks each name as left_only, right_only or both.
countries_df = pd.merge(
    left=df_worldHappiness_2015['Country'],
    right=df_life_expectancy_2015_Cleaned['Country'],
    how='outer',
    indicator='Exist',
)
countries_df

In [None]:
# Countries that still appear in only one of the two tables.
in_one_table_only = countries_df['Exist'] != 'both'
diff_df = countries_df[in_one_table_only]
diff_df

In [None]:
# Join the cleaned 2015 life-expectancy data with the 2015 happiness data on
# Country (default inner join: only countries present in both are kept).
df_integrated = df_life_expectancy_2015_Cleaned.merge(df_worldHappiness_2015, on='Country')
print(df_integrated.shape)
df_integrated

In [None]:
# Merge the integrated frame with the all-countries table on Country
# (default inner join).
df_all_integrated = df_integrated.merge(df_250_countries_cleaned, on="Country")
df_all_integrated

In [None]:
# Column names, non-null counts and dtypes of the fully integrated frame.
df_all_integrated.info()

In [None]:
# Summary statistics of the fully integrated frame.
df_all_integrated.describe()

# Questions To Be Asked
- ### World Happiness
- Which Region has the highest levels of happiness?
- How strongly does the Economy of a country affect its Happiness Score?
- Is there a relation between the economy of a country and its Health?
- Does the health ratio have a major impact on the happiness ratios of a country?
- Does a higher economy (GDP ratio) imply higher family rates in a country?
- How likely is it that a country with high trust levels tends to have more freedom?
- ### All Countries and World Happiness
- Is there a relation between Literacy Rate & Economy?
- Is there a relation between Literacy Rate & Happiness Score?
- Which Subregion has the highest Happiness Score? And which one has the lowest?
- In each Subregion, how do the Freedom and Government Corruption correlate with Happiness Scores?
- ### Life Expectancy
- Which country has the highest life expectancy ratios?
- Why did those countries reach these high ratios?
- Is there a relation between the life expectancy and the income composition of resources?
- Is there a relation between the life expectancy and schooling (Average number of years of schooling) ?
- What is the likelihood that the countries with a higher polio and hepatitisB immunization ratios tend to have higher life expectancy?
- Does alcohol affect the life expectancy ?
- ### Life Expectancy and World Happiness
- Does the happiness of a country depend strongly on its life expectancy?
- How likely is it that high polio immunization ratios increase the happiness of a country?
- How likely is it that both polio and hepatitisB immunization increase life expectancy?
- Does the freedom percentage affect life expectancy in a country?

# All Countries Data Visualization

In [None]:
# World map coloured by Gini; hovering over a country shows its other indicators.
country_hover_columns = [
    'Region', 'Subregion', 'Population', 'Area',
    'Gini', 'Real Growth Rating(%)', 'Literacy Rate(%)', 'Inflation(%)',
    'Unemployement(%)',
]
fig = px.choropleth(
    data_frame=df_250_countries_cleaned,
    locations='Country',
    locationmode='country names',
    color='Gini',
    color_continuous_scale='Electric',
    hover_name='Country',
    hover_data=country_hover_columns,
    scope='world',
    projection='natural earth',
)

fig.show()

# World Of Happiness Data Visualization

In [None]:
# Mean Happiness Score per Region.
region_fig, region_ax = plt.subplots(figsize=(30,12))
sns.barplot(x="Region", y="Happiness Score", data=df_worldHappiness_2015, ax=region_ax)

In [None]:
# Happiness Score of every country (one horizontal bar each), coloured by Region.
country_fig, country_ax = plt.subplots(figsize=(10,120))
sns.barplot(y="Country", x="Happiness Score", hue='Region', data=df_worldHappiness_2015, ax=country_ax)

In [None]:

# World map coloured by Happiness Score; hovering shows the score's components.
happiness_hover_columns = [
    'Happiness Score', 'Standard Error', 'Happiness Rank', 'Economy (GDP per Capita)',
    'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)',
    'Generosity', 'Dystopia Residual',
]
fig = px.choropleth(
    data_frame=df_worldHappiness_2015,
    locations='Country',
    locationmode='country names',
    color='Happiness Score',
    color_continuous_scale='Electric',
    hover_name='Country',
    hover_data=happiness_hover_columns,
    scope='world',
    projection='natural earth',
)

fig.show()

In [None]:
# Correlation heatmap of the numeric columns of the 2015 happiness data.
# Restricting to numeric columns first keeps .corr() working on pandas >= 2.0,
# where string columns (Country, Region) are no longer dropped silently.
corr = df_worldHappiness_2015.select_dtypes(include='number').corr()

# A single figure: the extra plt.figure() call only produced an empty figure.
f, ax = plt.subplots(figsize=(20, 8))
# The diverging palette was created but never passed to the heatmap.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=cmap, linewidths=.5, annot=True, ax=ax)

In [None]:
# For each predictor: scatter plot (left) and regression plot (right) against
# the Happiness Score, one predictor per row.
happiness_predictors = [
    'Economy (GDP per Capita)',
    'Family',
    'Health (Life Expectancy)',
    'Freedom',
    'Dystopia Residual',
]

fig = plt.figure(figsize=(14,22))
for row, predictor in enumerate(happiness_predictors):
    plt.subplot(5, 2, 2 * row + 1)
    sns.scatterplot(y='Happiness Score', x=predictor, data=df_worldHappiness_2015)
    plt.subplot(5, 2, 2 * row + 2)
    sns.regplot(y='Happiness Score', x=predictor, data=df_worldHappiness_2015)

In [None]:
# Economy (GDP per Capita) against Family and Health: scatter plot on the left,
# regression plot on the right.
economy_predictors = ['Family', 'Health (Life Expectancy)']

fig = plt.figure(figsize=(14,22))
for row, predictor in enumerate(economy_predictors):
    plt.subplot(5, 2, 2 * row + 1)
    sns.scatterplot(y='Economy (GDP per Capita)', x=predictor, data=df_worldHappiness_2015)
    plt.subplot(5, 2, 2 * row + 2)
    sns.regplot(y='Economy (GDP per Capita)', x=predictor, data=df_worldHappiness_2015)



In [None]:
# Freedom vs Trust in both orientations: the first row has Trust on the x axis,
# the second row has Freedom on the x axis.
freedom_trust_axes = [
    ('Trust (Government Corruption)', 'Freedom'),
    ('Freedom', 'Trust (Government Corruption)'),
]

fig = plt.figure(figsize=(10,8))
for row, (x_column, y_column) in enumerate(freedom_trust_axes):
    plt.subplot(2, 2, 2 * row + 1)
    sns.scatterplot(x=x_column, y=y_column, data=df_worldHappiness_2015)
    plt.subplot(2, 2, 2 * row + 2)
    sns.regplot(x=x_column, y=y_column, data=df_worldHappiness_2015)

# Life Expectancy Data Visualization

# As Shown Below
- Developed Countries Tend To Have Better Life Expectancy Value

In [None]:
# 2015 life expectancy of every country (one horizontal bar each), coloured by Status.
status_fig, status_ax = plt.subplots(figsize=(10,120))
sns.barplot(y="Country", x="Life_Expectancy", hue='Status', data=df_life_expectancy_2015_Cleaned, ax=status_ax)

In [None]:
# Mean 2015 life expectancy for each Status group.
mean_life_expectancy_by_status = df_life_expectancy_2015_Cleaned.groupby('Status')['Life_Expectancy'].mean()

plt.figure(figsize=(6,6))
plt.bar(mean_life_expectancy_by_status.index, mean_life_expectancy_by_status)
plt.xlabel("Status",fontsize=12)
plt.ylabel("Average Life_Expectancy",fontsize=12)
plt.title("Life_Expectancy and Status")
plt.show()

In [None]:

# World map coloured by the 2015 life expectancy.
fig = px.choropleth(
    data_frame=df_life_expectancy_2015_Cleaned,
    locations='Country',
    locationmode='country names',
    color='Life_Expectancy',
    color_continuous_scale='Electric',
    hover_name='Country',
    hover_data=['Life_Expectancy'],
    scope='world',
    projection='natural earth',
)

fig.show()

In [None]:
# Correlation heatmap of the numeric life-expectancy columns (all years).
# Restricting to numeric columns first keeps .corr() working on pandas >= 2.0,
# where string columns (Country, Status) are no longer dropped silently.
corr = df_life_expectancy_Cleaned.select_dtypes(include='number').corr()

# A single figure: the extra plt.figure() call only produced an empty figure.
f, ax = plt.subplots(figsize=(20, 8))
# The diverging palette was created but never passed to the heatmap.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=cmap, linewidths=.5, annot=True, ax=ax)

# Correlation Between Life Expectancy Data
### As Shown Above:
- It is clear that there is a strong correlation between life expectancy and (schooling and income_composition).
- Also there is a positive correlation between life expectancy and (Polio and HepatitisB) Immunization.
- A strong correlation between the income_composition and schooling.

# More Correlations for the year 2015:
- Infant_Deaths and Under_Five_Deaths are correlated.
- HIV has negative relation with Life_Expectancy.
- Adult_Mortality has negative relation with Life_Expectancy.

In [None]:
# Correlation heatmap of the numeric life-expectancy columns for 2015 only.
# Restricting to numeric columns first keeps .corr() working on pandas >= 2.0,
# where string columns (Country, Status) are no longer dropped silently.
corr = df_life_expectancy_2015_Cleaned.select_dtypes(include='number').corr()

# A single figure: the extra plt.figure() call only produced an empty figure.
f, ax = plt.subplots(figsize=(20, 14))
# The diverging palette was created but never passed to the heatmap.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=cmap, linewidths=.5, annot=True, ax=ax)

In [None]:
# For each predictor: scatter plot (left) and regression plot (right) against
# the 2015 life expectancy, one predictor per row.
life_expectancy_predictors = [
    'Income_Composition_Of_Resources',
    'Schooling',
    'Alcohol',
    'Polio',
    'HepatitisB',
]

fig = plt.figure(figsize=(14,22))
for row, predictor in enumerate(life_expectancy_predictors):
    plt.subplot(5, 2, 2 * row + 1)
    sns.scatterplot(y='Life_Expectancy', x=predictor, data=df_life_expectancy_2015_Cleaned)
    plt.subplot(5, 2, 2 * row + 2)
    sns.regplot(y='Life_Expectancy', x=predictor, data=df_life_expectancy_2015_Cleaned)

# From the above plots :
- Unexpectedly alcohol does not affect the life expectancy measurements
- Polio also does not have a significant impact on the life expectancy rate.

In [None]:
# Three (x, y) pairs, one per row: scatter plot on the left, regression plot on the right.
income_and_bmi_axes = [
    ('Schooling', 'Income_Composition_Of_Resources'),
    ('Polio', 'Income_Composition_Of_Resources'),
    ('BMI', 'Life_Expectancy'),
]

fig = plt.figure(figsize=(20,12))
for row, (x_column, y_column) in enumerate(income_and_bmi_axes):
    plt.subplot(3, 2, 2 * row + 1)
    sns.scatterplot(y=y_column, x=x_column, data=df_life_expectancy_2015_Cleaned)
    plt.subplot(3, 2, 2 * row + 2)
    sns.regplot(y=y_column, x=x_column, data=df_life_expectancy_2015_Cleaned)

# Observing the attributes that have a correlation with the Life Expectancy
- We can see that schooling has a positive relation with the Income Composition
- And a slightly negative impact between Schooling and Adult Mortality
- And also a negative relation between Income Composition and Adult Mortality (meaning that as income increases, the deaths of people between 15 and 60 years tend to decrease)

In [None]:
# Four plain scatter plots on a 2x3 grid; each entry is (x column, y column, title).
scatter_panels = [
    ("Schooling", "Adult_Mortality", "Schooling vs AdultMortality"),
    ("Schooling", "Income_Composition_Of_Resources", "Schooling vs Income_Comp_Of_Resources"),
    ("Adult_Mortality", "Income_Composition_Of_Resources", "AdultMortality vs Income_Comp_Of_Resources"),
    ("BMI", "Income_Composition_Of_Resources", "BMI vs Income_Comp_Of_Resources"),
]

plt.figure(figsize=(20,8))
for position, (x_column, y_column, panel_title) in enumerate(scatter_panels, start=1):
    plt.subplot(2, 3, position)
    plt.scatter(df_life_expectancy_2015_Cleaned[x_column], df_life_expectancy_2015_Cleaned[y_column])
    plt.title(panel_title)

# Correlation Between All Countries & World Of Happiness 

### As shown below:
- There is a Strong correlation between Literacy Rate and Economy
- The hexbin plot also indicates that few countries have (relatively) average values in both; most have either both high or both low

In [None]:
# Literacy rate against economy, drawn twice: with a regression fit ('reg')
# and as a hexbin density ('hex').
for joint_kind in ('reg', 'hex'):
    sns.jointplot(
        y='Literacy Rate(%)',
        x='Economy (GDP per Capita)',
        data=df_all_countries_world_happiness,
        kind=joint_kind,
    )
    plt.show()

- There is a strong positive correlation between literacy rate and happiness score

In [None]:
# Literacy rate against Happiness Score: scatter plot (left), regression plot (right).
fig = plt.figure(figsize=(10,5))

for position, draw_panel in enumerate((sns.scatterplot, sns.regplot), start=1):
    plt.subplot(1, 2, position)
    draw_panel(y='Literacy Rate(%)', x='Happiness Score', data=df_all_countries_world_happiness)

- It is clear that the subregion with the highest Happiness Score is Australia and New Zealand, with Northern America a close second
- The lowest regions in Happiness Score are the ones in Africa (specially Middle Africa), along with Southern Asia

In [None]:
# Mean Happiness Score per Subregion, sorted from happiest to least happy.
# The sorted Subregion column fixes the bar order of this plot (and is reused
# by later cells). `.mean()` replaces aggregate(np.mean), which newer pandas
# flags with a FutureWarning; the discarded set_index() call was a no-op.
temp_sort = (
    df_all_countries_world_happiness
    .groupby("Subregion")['Happiness Score']
    .mean()
    .reset_index()
    .sort_values('Happiness Score', ascending=False)
)

fig = plt.figure(figsize=(7,5))
sns.barplot(y='Subregion', x='Happiness Score', data=df_all_countries_world_happiness, orient='h', order=temp_sort['Subregion'])

- In order to show the correlation between the Freedom and Government Corruption, and the Happiness Score, we first need to aggregate all these variables into a new DataFrame

In [None]:
# Per-Subregion means of Freedom and Trust, computed in a single groupby.
# `.mean()` replaces aggregate(np.mean), which newer pandas flags with a FutureWarning.
subregion_means = (
    df_all_countries_world_happiness
    .groupby("Subregion")[['Freedom', 'Trust (Government Corruption)']]
    .mean()
    .reset_index()
)

# Join onto the sorted happiness means. The key is spelled out instead of
# relying on "all shared column names"; the left (sorted) row order is kept.
temp_aggregate = pd.merge(temp_sort, subregion_means, on='Subregion')
temp_aggregate.head(10)

- Then we display the Freedom and Government Corruption of each Subregion in the same order as the original plot of Subregions vs Happiness Score to spot the correlation if it exists


**Results:**
- It appears that there is a positive correlation between the Freedom and Happiness Score for each Subregion, which is evident from having the Freedom values distribution very similar to the one of Happiness Score
- A positive correlation between Government Corruption and Happiness Score is noticeable for the Subregions with the highest and lowest Happiness Scores, but this correlation is not consistent for the middle values

In [None]:
# Freedom (top) and Trust (bottom) per Subregion, with bars in the same order
# as the Happiness Score plot.
fig = plt.figure(figsize=(7,10))

for position, measure in enumerate(('Freedom', 'Trust (Government Corruption)'), start=1):
    plt.subplot(2, 1, position)
    sns.barplot(y='Subregion', x=measure, data=temp_aggregate, orient='h', order=temp_sort['Subregion'])

# Correlation Between Life Expectancy & World Of Happiness 

- As shown below there is a positive correlation between life expectancy of a country and its happiness score.

In [None]:
# Life expectancy against Happiness Score: scatter plot (left), regression plot (right).
fig = plt.figure(figsize=(10,5))

for position, draw_panel in enumerate((sns.scatterplot, sns.regplot), start=1):
    plt.subplot(1, 2, position)
    draw_panel(y='Life_Expectancy', x='Happiness Score', data=df_all_integrated)


- We will try to see if polio and hepatitisB immunization system affect the life expectancy that would inturn affect (Decrease or Increase) the happiness

- Getting the average of both polio and hepatitisB to see if both of them affect the life expectancy or not

In [None]:
# Row-wise mean of the Polio and HepatitisB columns, stored as a new column.
immunization_columns = df_all_integrated[['Polio', 'HepatitisB']]
df_all_integrated['Polio_HepatitisB_Avg'] = immunization_columns.mean(axis='columns')
df_all_integrated

- As you can see below there is zero correlation between the freedom and the life expectancy which tells us that life does not highly depend on freedom.
- Polio and Hepatitis have a positive relation with life expectancy which in turn affects the happiness score.

In [None]:
# Three (x, y) pairs, one per row: scatter plot on the left, regression plot on the right.
integrated_axes = [
    ('Freedom', 'Life_Expectancy'),
    ('Polio_HepatitisB_Avg', 'Life_Expectancy'),
    ('Polio_HepatitisB_Avg', 'Happiness Score'),
]

fig = plt.figure(figsize=(10,15))
for row, (x_column, y_column) in enumerate(integrated_axes):
    plt.subplot(4, 2, 2 * row + 1)
    sns.scatterplot(y=y_column, x=x_column, data=df_all_integrated)
    plt.subplot(4, 2, 2 * row + 2)
    sns.regplot(y=y_column, x=x_column, data=df_all_integrated)

# Feature Engineering
#### We start by separating out the columns that show a correlation; this is the data used for feature engineering, since it is the data that answers the questions we are trying to answer.

### All Countries Features

In [None]:
# Keep only the country-level columns that the feature-engineering steps use.
feature_columns = ['Country', 'Population', 'Area', 'Gini', 'Literacy Rate(%)', 'Unemployement(%)']

# .copy() makes this an independent frame: new columns are assigned to it in
# later cells, and assigning to a bare slice of df_250_countries_cleaned
# raises pandas' SettingWithCopyWarning.
df_all_countries_features = df_250_countries_cleaned[feature_columns].copy()

# Show the feature frame that was just built (not the full cleaned source frame).
df_all_countries_features

#### 1st feature: Population/Area
We will add a column that represents the population of each country divided by its area.

In [None]:
# Population divided by area, computed element-wise for every country row.
population_per_area = df_all_countries_features['Population'].div(df_all_countries_features['Area'])
df_all_countries_features['Population/Area'] = population_per_area
df_all_countries_features

#### Using this feature we can answer two questions:


- Is there a relation between the population per square kilometer and Unemployement(%)?

In [None]:
fig = plt.figure(figsize=(10, 5))

# Left panel: raw points. Right panel: the same points with a fitted regression line.
scatter_axes = fig.add_subplot(1, 2, 1)
sns.scatterplot(data=df_all_countries_features, x='Population/Area', y='Unemployement(%)', ax=scatter_axes)
regression_axes = fig.add_subplot(1, 2, 2)
sns.regplot(data=df_all_countries_features, x='Population/Area', y='Unemployement(%)', ax=regression_axes)


There is a negative relation between the population per square kilometer and Unemployement(%).



- Is there a relation between the population per square kilometer and Literacy Rate(%)?

In [None]:
fig = plt.figure(figsize=(10, 5))

# Left panel: raw points. Right panel: the same points with a fitted regression line.
scatter_axes = fig.add_subplot(1, 2, 1)
sns.scatterplot(data=df_all_countries_features, x='Population/Area', y='Literacy Rate(%)', ax=scatter_axes)
regression_axes = fig.add_subplot(1, 2, 2)
sns.regplot(data=df_all_countries_features, x='Population/Area', y='Literacy Rate(%)', ax=regression_axes)


There is a positive relation between the population per square kilometer and Literacy Rate(%).

### Life Expectancy Features

In [None]:
# The same column subset is taken from the all-years frame and from the 2015-only frame.
life_expectancy_columns = [
    'Country',
    'Status',
    'Life_Expectancy',
    'Income_Composition_Of_Resources',
    'HIV/AIDS',
    'Adult_Mortality',
    'BMI',
]
df_life_expectancy_feature = df_life_expectancy_Cleaned[life_expectancy_columns]
df_life_expectancy_feature_2015 = df_life_expectancy_2015_Cleaned[life_expectancy_columns]
df_life_expectancy_feature_2015

# Starting by converting the categorical values to numerical values
- Status can be converted with one-hot encoding, where 1 means Developing and 0 means Developed. This gives us a better visualization and helps when we need to apply machine learning techniques.

In [None]:
# All-years frame: append the Status dummy column(s) (first category dropped),
# then build the engineering frame without the original text column.
status_dummies = pd.get_dummies(df_life_expectancy_feature['Status'], drop_first=True)
df_life_expectancy_feature = pd.concat([df_life_expectancy_feature, status_dummies], axis='columns')
df_life_expectancy_feature_engineering = df_life_expectancy_feature.drop(columns='Status')

# 2015-only frame: same two steps.
status_dummies_2015 = pd.get_dummies(df_life_expectancy_feature_2015['Status'], drop_first=True)
df_life_expectancy_feature_2015 = pd.concat([df_life_expectancy_feature_2015, status_dummies_2015], axis='columns')
df_life_expectancy_feature_engineering_2015 = df_life_expectancy_feature_2015.drop(columns='Status')
df_life_expectancy_feature_engineering_2015

- Create an indicator feature that tells us whether the BMI value of a row is at or above the overweight threshold (BMI >= 25).
- Where 1 is overweight and 0 is not overweight

In [None]:
# Flag rows whose BMI is at or above 25: 1 when the comparison holds, 0 otherwise.
df_life_expectancy_feature_engineering['Obesity_Indicator'] = (
    df_life_expectancy_feature_engineering['BMI'].ge(25).astype(int)
)
df_life_expectancy_feature_engineering_2015['Obesity_Indicator'] = (
    df_life_expectancy_feature_engineering_2015['BMI'].ge(25).astype(int)
)
df_life_expectancy_feature_engineering_2015

# Showing the positive relation between Income_Composition_Of_Resources and Obesity
- An increase in Income_Composition_Of_Resources tends to come with an increase in obesity.

In [None]:
# Split the 2015 rows by the obesity flag.
obesity_flag = df_life_expectancy_feature_engineering_2015['Obesity_Indicator']
countOverWeight = df_life_expectancy_feature_engineering_2015[obesity_flag == 1]
countUnderWeight = df_life_expectancy_feature_engineering_2015[obesity_flag == 0]

# Mean Income_Composition_Of_Resources per flag value; the group keys are the bar positions.
mean_income_by_flag = (
    df_life_expectancy_feature_engineering_2015
    .groupby('Obesity_Indicator')['Income_Composition_Of_Resources']
    .mean()
)

plt.figure(figsize=(6, 6))
plt.bar(mean_income_by_flag.index, mean_income_by_flag)
plt.xlabel("Obesity_Indicator", fontsize=12)
plt.ylabel("Average Income_Composition_Of_Resources", fontsize=12)
plt.title("Income_Composition_Of_Resources and Obesity")
plt.show()

In [None]:
# Join the 2015 life-expectancy features with the country features on the shared Country column.
df_feature_integration = df_life_expectancy_feature_engineering_2015.merge(
    df_all_countries_features,
    on="Country",
)
df_feature_integration

In [None]:
# Drop one copy of each suffixed column pair and strip the suffix from the copy that is kept.
df_all_integrated = (
    df_all_integrated
    .drop(columns=['Population_y', 'Region_x'])
    .rename(columns={'Population_x': 'Population', 'Region_y': 'Region'})
)
df_all_integrated

In [None]:
# Population divided by area, computed element-wise on the integrated frame.
integrated_population_per_area = df_all_integrated['Population'].div(df_all_integrated['Area'])
df_all_integrated['Population/Area'] = integrated_population_per_area
df_all_integrated


# Using One-Hot Encoding for Regions and Subregions, Since They Correlate with Each Other and with Other Attributes Such as Population

In [None]:
# Replace the two categorical location columns with one indicator column per
# category; every category is kept (drop_first=False).
location_columns = ['Subregion', 'Region']
df_all_integrated = pd.get_dummies(
    data=df_all_integrated,
    columns=location_columns,
    drop_first=False,
)
df_all_integrated

In [None]:
fig = plt.figure(figsize=(14, 22))

# One (x column, y column) pair per grid row: the left slot gets the plain
# scatter plot, the right slot the scatter plot with a regression line.
panel_pairs = [
    ('Population/Area', 'Unemployement(%)'),
    ('Population/Area', 'Freedom'),
    ('Population/Area', 'Income_Composition_Of_Resources'),
    ('Population/Area', 'Happiness Score'),
    ('Happiness Score', 'Life_Expectancy'),
]
for row_number, (x_col, y_col) in enumerate(panel_pairs):
    plt.subplot(5, 2, 2 * row_number + 1)
    sns.scatterplot(x=x_col, y=y_col, data=df_all_integrated)
    plt.subplot(5, 2, 2 * row_number + 2)
    sns.regplot(x=x_col, y=y_col, data=df_all_integrated)

# Leave the last-drawn axes as the cell's value, as the final regplot call did.
plt.gca()

# Observations:
- It is now clear how the population per area significantly affects a country's assets (Income, Freedom), which in turn affects the unemployment rate and eventually the happiness of that country.
- The last figure directly answers the question about the relation between happiness and life expectancy: there is a clear positive relation between the two, which in fact gives us some insight into what life expectancy depends on beyond the obvious aspects (infections, inflammations, etc.).

In [None]:
# Rows whose happiness score equals the column maximum; x is the Country of the first such row.
happiness_scores = df_all_integrated['Happiness Score']
has_top_score = happiness_scores.eq(happiness_scores.max())
df_max_happiness = df_all_integrated.loc[has_top_score]
x = df_max_happiness['Country'].iloc[0]
x

In [None]:
# Rows whose happiness score equals the column minimum.
happiness_scores = df_all_integrated['Happiness Score']
has_bottom_score = happiness_scores.eq(happiness_scores.min())
df_min_happiness = df_all_integrated.loc[has_bottom_score]
df_min_happiness

In [None]:
# Write the integrated frame as CSV text; the target file name has no extension.
integrated_output_path = 'All_Integrated'
df_all_integrated.to_csv(integrated_output_path)