In [1]:
# Select matplotlib's interactive "notebook" backend for the figures drawn below.
%matplotlib notebook

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
import numpy as np

In [3]:
# Relative path of the cleaned life-expectancy CSV.
life_expectancy_clean = "resources/life_expectancy_clean.csv"

# Read the file into a DataFrame using pandas' default parsing options.
life_expectancy = pd.read_csv(filepath_or_buffer=life_expectancy_clean)

In [4]:
# Preview the first rows of the loaded table.
life_expectancy.head()

Unnamed: 0,Country,Year,Status,Life Expectancy,Adult Mortality (%),Infant Deaths (%),Alcohol Consumpter per Capita (ltr),Percentage Expenditure,Hepatitis B Vaccines (%),Measles Cases (%),...,Polio Vaccines (%),Total Expenditure (%),Diphtheria Vaccines (%),HIV/AIDS Deaths Under 5yo (%),GDP Per Capita ($),Population,Thinness 1-19 yrs,Thinness 5-9 yrs,Income Composition of Resources,Schooling
0,Afghanistan,2015,Developing,65.0,26.3,6.2,0.01,71.28,65.0,115.4,...,6.0,8.16,65.0,0.01,584.26,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,27.1,6.4,0.01,73.52,62.0,49.2,...,58.0,8.18,62.0,0.01,612.7,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,26.8,6.6,0.01,73.22,64.0,43.0,...,62.0,8.13,64.0,0.01,631.74,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,27.2,6.9,0.01,78.18,67.0,278.7,...,67.0,8.52,67.0,0.01,669.96,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,27.5,7.1,0.01,7.1,68.0,301.3,...,68.0,7.87,68.0,0.01,63.54,2978599.0,18.2,18.2,0.454,9.5


In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
life_expectancy.describe()

Unnamed: 0,Year,Life Expectancy,Adult Mortality (%),Infant Deaths (%),Alcohol Consumpter per Capita (ltr),Percentage Expenditure,Hepatitis B Vaccines (%),Measles Cases (%),BMI,Under-5yo Deaths (%),Polio Vaccines (%),Total Expenditure (%),Diphtheria Vaccines (%),HIV/AIDS Deaths Under 5yo (%),GDP Per Capita ($),Population,Thinness 1-19 yrs,Thinness 5-9 yrs,Income Composition of Resources,Schooling
count,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2088.0,2569.0,2569.0,2569.0,2562.0,2563.0,2562.0,2569.0,2311.0,2114.0,2569.0,2569.0,2569.0,2569.0
mean,2007.016349,69.341845,16.218529,2.960802,4.538248,842.91093,80.903736,224.422849,38.234566,4.100662,82.845433,5.870773,82.742779,0.182935,7597.286058,12903990.0,4.903893,4.942156,0.627501,12.038926
std,4.332164,9.391106,12.40417,12.286989,4.011904,2105.196623,24.956422,1075.217547,19.767293,16.731129,22.945914,2.374245,23.195479,0.536235,14518.21886,62981740.0,4.50504,4.596621,0.210369,3.254761
min,2000.0,36.3,0.1,0.0,0.01,0.0,2.0,0.0,1.0,0.0,3.0,0.37,2.0,0.01,1.68,34.0,0.1,0.1,0.0,0.0
25%,2003.0,63.7,7.3,0.0,0.85,18.93,77.0,0.0,19.1,0.0,78.0,4.26,79.0,0.01,453.735,188047.2,1.6,1.6,0.494,10.1
50%,2007.0,72.2,14.2,0.3,3.76,102.14,92.0,1.4,43.8,0.3,93.0,5.71,93.0,0.01,1753.35,1345220.0,3.3,3.3,0.677,12.3
75%,2011.0,75.4,22.3,1.9,7.53,544.45,97.0,33.6,55.8,2.4,97.0,7.455,97.0,0.07,5925.55,7268290.0,7.3,7.3,0.779,14.2
max,2015.0,89.0,72.3,180.0,17.87,19479.91,99.0,21218.3,77.1,250.0,99.0,14.39,99.0,5.06,119172.74,1293859000.0,27.7,28.6,0.948,20.7


In [6]:
# Per-country descriptive statistics of the "Life Expectancy" column.
# The grouped selection is built once and reused for each statistic.
life_by_country = life_expectancy.groupby("Country")["Life Expectancy"]

mean = life_by_country.mean()
median = life_by_country.median()
var = life_by_country.var()
std = life_by_country.std()

# Combine the four country-indexed series side by side into one table.
summary_statistics = pd.concat(
    [mean, median, var, std],
    axis=1,
    keys=["Mean", "Median", "Variance", "Standard Dev."],
)
summary_statistics.head()

Unnamed: 0_level_0,Mean,Median,Variance,Standard Dev.
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,58.19375,57.8,5.667292,2.380607
Albania,75.15625,75.6,3.373292,1.836652
Algeria,73.486667,73.8,2.284095,1.511322
Angola,48.793333,48.2,7.763524,2.78631
Antigua and Barbuda,74.966667,75.0,0.720952,0.849089


In [7]:
# List the distinct values of the Status column (used below to split the data).
life_expectancy["Status"].unique()

array(['Developing', 'Developed'], dtype=object)

In [8]:
# Keep only the rows whose Status is "Developed".
is_developed = life_expectancy["Status"] == "Developed"
developed_countries = life_expectancy.loc[is_developed]
developed_countries.head()

Unnamed: 0,Country,Year,Status,Life Expectancy,Adult Mortality (%),Infant Deaths (%),Alcohol Consumpter per Capita (ltr),Percentage Expenditure,Hepatitis B Vaccines (%),Measles Cases (%),...,Polio Vaccines (%),Total Expenditure (%),Diphtheria Vaccines (%),HIV/AIDS Deaths Under 5yo (%),GDP Per Capita ($),Population,Thinness 1-19 yrs,Thinness 5-9 yrs,Income Composition of Resources,Schooling
107,Australia,2014,Developed,82.7,0.6,0.1,9.71,10769.36,91.0,34.0,...,92.0,9.42,92.0,0.01,62214.69,2346694.0,0.6,0.6,0.936,20.4
108,Australia,2013,Developed,82.5,6.1,0.1,9.87,11734.85,91.0,15.8,...,91.0,9.36,91.0,0.01,67792.34,23117353.0,0.6,0.6,0.933,20.3
109,Australia,2012,Developed,82.3,6.1,0.1,10.03,11715.0,91.0,19.9,...,92.0,9.36,92.0,0.01,67677.63,22728254.0,0.6,0.6,0.93,20.1
110,Australia,2011,Developed,82.0,6.3,0.1,10.3,10986.27,92.0,19.0,...,92.0,9.2,92.0,0.01,62245.13,223424.0,0.6,0.6,0.927,19.8
111,Australia,2010,Developed,81.9,6.4,0.1,10.52,8875.79,92.0,7.0,...,92.0,9.2,92.0,0.01,51874.85,223175.0,0.7,0.6,0.927,19.5


In [9]:
# Per-country averages of life expectancy and alcohol consumption for the
# developed-countries subset (one row per country, Country kept as a column).
# The "mean" string alias replaces np.mean: passing a NumPy callable to
# GroupBy.agg is deprecated in recent pandas and emits a FutureWarning, while
# the alias produces the same result.
mean_life_expectancy_alcohol_developed = (
    developed_countries
    .groupby(["Country"], as_index=False)
    .agg({
        "Life Expectancy": "mean",
        "Alcohol Consumpter per Capita (ltr)": "mean",
    })
)

mean_life_expectancy_alcohol_developed.head()


Unnamed: 0,Country,Life Expectancy,Alcohol Consumpter per Capita (ltr)
0,Australia,81.746667,10.155333
1,Austria,81.48,12.236
2,Belgium,80.653333,11.042667
3,Bulgaria,72.74,10.865333
4,Croatia,75.993333,12.448


In [28]:
# Scatter of per-country averages for developed countries:
# alcohol consumption (x) against life expectancy (y).
x = mean_life_expectancy_alcohol_developed["Alcohol Consumpter per Capita (ltr)"]
y = mean_life_expectancy_alcohol_developed["Life Expectancy"]

# Create a dedicated figure for this cell. With the interactive notebook
# backend, bare pyplot calls draw onto whichever figure is still active from
# an earlier cell, so each plot needs its own figure/axes.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumvioletred", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Avg Alcohol Consumption per Liter for Developed Countries")
ax.set_xlabel("Alcohol Consumpter per Capita (ltr)")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle='-', linewidth=1, alpha=0.5)
# Fixed axis windows, kept from the original cell.
ax.set_ylim(70, 90)
ax.set_xlim(6, 13)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [11]:
# Per-country averages of life expectancy and BMI for the developed-countries
# subset (one row per country, Country kept as a column).
# The "mean" string alias replaces np.mean: passing a NumPy callable to
# GroupBy.agg is deprecated in recent pandas and emits a FutureWarning, while
# the alias produces the same result.
mean_life_expectancy_bmi_developed = (
    developed_countries
    .groupby(["Country"], as_index=False)
    .agg({
        "Life Expectancy": "mean",
        "BMI": "mean",
    })
)

mean_life_expectancy_bmi_developed.head()

Unnamed: 0,Country,Life Expectancy,BMI
0,Australia,81.746667,55.146667
1,Austria,81.48,47.666667
2,Belgium,80.653333,50.04
3,Bulgaria,72.74,53.753333
4,Croatia,75.993333,51.6


In [29]:
# Scatter of per-country averages for developed countries:
# BMI (x) against life expectancy (y).
x = mean_life_expectancy_bmi_developed["BMI"]
y = mean_life_expectancy_bmi_developed["Life Expectancy"]

# Create a dedicated figure for this cell. With the interactive notebook
# backend, bare pyplot calls draw onto whichever figure is still active from
# an earlier cell, so each plot needs its own figure/axes.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumslateblue", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Avg BMI for Developed Countries")
ax.set_xlabel("BMI")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle='-', linewidth=1, alpha=0.5)
# Fixed axis windows, kept from the original cell.
ax.set_ylim(70, 90)
ax.set_xlim(40, 70)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [13]:
# Keep only the rows whose Status is "Developing".
is_developing = life_expectancy["Status"] == "Developing"
developing_countries = life_expectancy.loc[is_developing]
developing_countries.head()

Unnamed: 0,Country,Year,Status,Life Expectancy,Adult Mortality (%),Infant Deaths (%),Alcohol Consumpter per Capita (ltr),Percentage Expenditure,Hepatitis B Vaccines (%),Measles Cases (%),...,Polio Vaccines (%),Total Expenditure (%),Diphtheria Vaccines (%),HIV/AIDS Deaths Under 5yo (%),GDP Per Capita ($),Population,Thinness 1-19 yrs,Thinness 5-9 yrs,Income Composition of Resources,Schooling
0,Afghanistan,2015,Developing,65.0,26.3,6.2,0.01,71.28,65.0,115.4,...,6.0,8.16,65.0,0.01,584.26,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,27.1,6.4,0.01,73.52,62.0,49.2,...,58.0,8.18,62.0,0.01,612.7,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,26.8,6.6,0.01,73.22,64.0,43.0,...,62.0,8.13,64.0,0.01,631.74,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,27.2,6.9,0.01,78.18,67.0,278.7,...,67.0,8.52,67.0,0.01,669.96,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,27.5,7.1,0.01,7.1,68.0,301.3,...,68.0,7.87,68.0,0.01,63.54,2978599.0,18.2,18.2,0.454,9.5


In [14]:
# Show the column labels of the developing-countries subset.
developing_countries.columns

Index(['Country', 'Year', 'Status', 'Life Expectancy', 'Adult Mortality (%)',
       'Infant Deaths (%)', 'Alcohol Consumpter per Capita (ltr)',
       'Percentage Expenditure', 'Hepatitis B Vaccines (%)',
       'Measles Cases (%)', 'BMI', 'Under-5yo Deaths (%)',
       'Polio Vaccines (%)', 'Total Expenditure (%)',
       'Diphtheria Vaccines (%)', 'HIV/AIDS Deaths Under 5yo (%)',
       'GDP Per Capita ($)', 'Population', 'Thinness  1-19 yrs',
       'Thinness 5-9 yrs', 'Income Composition of Resources', 'Schooling'],
      dtype='object')

In [15]:
# Per-country averages of life expectancy and alcohol consumption for the
# developing-countries subset (one row per country, Country kept as a column).
# The "mean" string alias replaces np.mean: passing a NumPy callable to
# GroupBy.agg is deprecated in recent pandas and emits a FutureWarning, while
# the alias produces the same result.
mean_life_expectancy_alcohol_developing = (
    developing_countries
    .groupby(["Country"], as_index=False)
    .agg({
        "Life Expectancy": "mean",
        "Alcohol Consumpter per Capita (ltr)": "mean",
    })
)

mean_life_expectancy_alcohol_developing.head()


Unnamed: 0,Country,Life Expectancy,Alcohol Consumpter per Capita (ltr)
0,Afghanistan,58.19375,0.014375
1,Albania,75.15625,4.84875
2,Algeria,73.486667,0.406667
3,Angola,48.793333,5.740667
4,Antigua and Barbuda,74.966667,7.949333


In [30]:
# Scatter of per-country averages for developing countries:
# alcohol consumption (x) against life expectancy (y).
x = mean_life_expectancy_alcohol_developing["Alcohol Consumpter per Capita (ltr)"]
y = mean_life_expectancy_alcohol_developing["Life Expectancy"]

# Create a dedicated figure for this cell. With the interactive notebook
# backend, bare pyplot calls draw onto whichever figure is still active from
# an earlier cell, so each plot needs its own figure/axes.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumvioletred", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Avg Alcohol Consumption per Liter for Developing Countries")
ax.set_xlabel("Alcohol Consumpter per Capita (ltr)")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle='-', linewidth=1, alpha=0.5)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [17]:
# Per-country averages of life expectancy and BMI for the developing-countries
# subset (one row per country, Country kept as a column).
# The "mean" string alias replaces np.mean: passing a NumPy callable to
# GroupBy.agg is deprecated in recent pandas and emits a FutureWarning, while
# the alias produces the same result.
mean_life_expectancy_bmi_developing = (
    developing_countries
    .groupby(["Country"], as_index=False)
    .agg({
        "Life Expectancy": "mean",
        "BMI": "mean",
    })
)

mean_life_expectancy_bmi_developing.head()

Unnamed: 0,Country,Life Expectancy,BMI
0,Afghanistan,58.19375,15.51875
1,Albania,75.15625,49.06875
2,Algeria,73.486667,48.026667
3,Angola,48.793333,17.666667
4,Antigua and Barbuda,74.966667,37.806667


In [31]:
# Scatter of per-country averages for developing countries:
# BMI (x) against life expectancy (y). Axis limits are left to matplotlib.
x = mean_life_expectancy_bmi_developing["BMI"]
y = mean_life_expectancy_bmi_developing["Life Expectancy"]

# Create a dedicated figure for this cell. With the interactive notebook
# backend, bare pyplot calls draw onto whichever figure is still active from
# an earlier cell, so each plot needs its own figure/axes.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumslateblue", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Avg BMI for Developing Countries")
ax.set_xlabel("BMI")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle='-', linewidth=1, alpha=0.5)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [41]:
# Linear regression of average life expectancy (y) on average BMI (x)
# across developing countries, drawn over the scatter of the same data.
x_values = mean_life_expectancy_bmi_developing["BMI"]
y_values = mean_life_expectancy_bmi_developing["Life Expectancy"]

# Least-squares fit; rvalue is the correlation coefficient, squared below.
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Create a dedicated figure for this cell. With the interactive notebook
# backend, bare pyplot calls draw onto whichever figure is still active from
# an earlier cell, so each plot needs its own figure/axes.
fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
# The equation label is placed at a fixed position in data coordinates.
ax.annotate(line_eq, (10, 80), fontsize=16, color="red")
ax.set_ylabel('Life Expectancy (Yrs)')
ax.set_xlabel('BMI')
ax.set_title('Avg Life Expectancy vs BMI for Developing Countries')
print(f"R squared: {rvalue**2}")
plt.show()

<IPython.core.display.Javascript object>

R squared: 0.49175086857304273


In [19]:
# Per-country averages of life expectancy and diphtheria vaccination coverage
# for the developing-countries subset (one row per country, Country kept as a
# column).
# The "mean" string alias replaces np.mean: passing a NumPy callable to
# GroupBy.agg is deprecated in recent pandas and emits a FutureWarning, while
# the alias produces the same result.
mean_life_expectancy_dip_developing = (
    developing_countries
    .groupby(["Country"], as_index=False)
    .agg({
        "Life Expectancy": "mean",
        "Diphtheria Vaccines (%)": "mean",
    })
)

mean_life_expectancy_dip_developing.head()


Unnamed: 0,Country,Life Expectancy,Diphtheria Vaccines (%)
0,Afghanistan,58.19375,52.3125
1,Albania,75.15625,98.0625
2,Algeria,73.486667,91.666667
3,Angola,48.793333,46.6
4,Antigua and Barbuda,74.966667,98.266667


In [36]:
# Scatter of per-country averages for developing countries:
# diphtheria vaccination coverage (x) against life expectancy (y).
# Axis limits are left to matplotlib.
x = mean_life_expectancy_dip_developing["Diphtheria Vaccines (%)"]
y = mean_life_expectancy_dip_developing["Life Expectancy"]

# Create a dedicated figure for this cell. With the interactive notebook
# backend, bare pyplot calls draw onto whichever figure is still active from
# an earlier cell, so each plot needs its own figure/axes.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumslateblue", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Diphtheria Vaccinations for Developing Countries")
ax.set_xlabel("Diphtheria Vaccines (%)")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle='-', linewidth=1, alpha=0.5)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [39]:
# Linear regression of average life expectancy (y) on average diphtheria
# vaccination coverage (x) across developing countries, drawn over the
# scatter of the same data.
x_values = mean_life_expectancy_dip_developing["Diphtheria Vaccines (%)"]
y_values = mean_life_expectancy_dip_developing["Life Expectancy"]

# Least-squares fit; rvalue is the correlation coefficient, squared below.
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Create a dedicated figure for this cell. With the interactive notebook
# backend, bare pyplot calls draw onto whichever figure is still active from
# an earlier cell, so each plot needs its own figure/axes.
fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
# The equation label is placed at a fixed position in data coordinates.
ax.annotate(line_eq, (30, 80), fontsize=16, color="red")
ax.set_ylabel('Life Expectancy (Yrs)')
ax.set_xlabel('Diphtheria Vaccines (%)')
ax.set_title('Avg Life Expectancy vs Diphtheria Vaccinations for Developing Countries')
print(f"R squared: {rvalue**2}")
plt.show()

<IPython.core.display.Javascript object>

R squared: 0.3845846769715367


In [21]:
# Show the column labels of the developing-countries subset.
developing_countries.columns

Index(['Country', 'Year', 'Status', 'Life Expectancy', 'Adult Mortality (%)',
       'Infant Deaths (%)', 'Alcohol Consumpter per Capita (ltr)',
       'Percentage Expenditure', 'Hepatitis B Vaccines (%)',
       'Measles Cases (%)', 'BMI', 'Under-5yo Deaths (%)',
       'Polio Vaccines (%)', 'Total Expenditure (%)',
       'Diphtheria Vaccines (%)', 'HIV/AIDS Deaths Under 5yo (%)',
       'GDP Per Capita ($)', 'Population', 'Thinness  1-19 yrs',
       'Thinness 5-9 yrs', 'Income Composition of Resources', 'Schooling'],
      dtype='object')

In [22]:
# Per-country averages of life expectancy and hepatitis B vaccination coverage
# for the developing-countries subset (one row per country, Country kept as a
# column).
# The "mean" string alias replaces np.mean: passing a NumPy callable to
# GroupBy.agg is deprecated in recent pandas and emits a FutureWarning, while
# the alias produces the same result.
mean_life_expectancy_hepb_developing = (
    developing_countries
    .groupby(["Country"], as_index=False)
    .agg({
        "Life Expectancy": "mean",
        "Hepatitis B Vaccines (%)": "mean",
    })
)

mean_life_expectancy_hepb_developing.head()


Unnamed: 0,Country,Life Expectancy,Hepatitis B Vaccines (%)
0,Afghanistan,58.19375,64.5625
1,Albania,75.15625,98.0
2,Algeria,73.486667,76.454545
3,Angola,48.793333,71.0
4,Antigua and Barbuda,74.966667,98.214286


In [32]:
# Scatter of per-country averages for developing countries:
# hepatitis B vaccination coverage (x) against life expectancy (y).
x = mean_life_expectancy_hepb_developing["Hepatitis B Vaccines (%)"]
y = mean_life_expectancy_hepb_developing["Life Expectancy"]

# Create a dedicated figure for this cell. With the interactive notebook
# backend, bare pyplot calls draw onto whichever figure is still active from
# an earlier cell, so each plot needs its own figure/axes.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumslateblue", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Hepatitis B Vaccinations for Developing Countries")
ax.set_xlabel("Hepatitis B Vaccines (%)")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle='-', linewidth=1, alpha=0.5)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [25]:
# Per-country averages of life expectancy and polio vaccination coverage for
# the developing-countries subset (one row per country, Country kept as a
# column).
# The "mean" string alias replaces np.mean: passing a NumPy callable to
# GroupBy.agg is deprecated in recent pandas and emits a FutureWarning, while
# the alias produces the same result.
mean_life_expectancy_polio_developing = (
    developing_countries
    .groupby(["Country"], as_index=False)
    .agg({
        "Life Expectancy": "mean",
        "Polio Vaccines (%)": "mean",
    })
)

mean_life_expectancy_polio_developing.head()


Unnamed: 0,Country,Life Expectancy,Polio Vaccines (%)
0,Afghanistan,58.19375,48.375
1,Albania,75.15625,98.125
2,Algeria,73.486667,91.533333
3,Angola,48.793333,48.733333
4,Antigua and Barbuda,74.966667,97.666667


In [34]:
# Scatter of per-country averages for developing countries:
# polio vaccination coverage (x) against life expectancy (y).
x = mean_life_expectancy_polio_developing["Polio Vaccines (%)"]
y = mean_life_expectancy_polio_developing["Life Expectancy"]

# Create a dedicated figure for this cell. With the interactive notebook
# backend, bare pyplot calls draw onto whichever figure is still active from
# an earlier cell, so each plot needs its own figure/axes.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumslateblue", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Polio Vaccinations for Developing Countries")
ax.set_xlabel("Polio Vaccines (%)")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle='-', linewidth=1, alpha=0.5)
fig.tight_layout()

<IPython.core.display.Javascript object>

***TODO: Run regressions and compute correlation coefficients for polio, hepatitis B, diphtheria, and BMI against average life expectancy.***

In [35]:
# Polio regression: average life expectancy (y) on average polio vaccination
# coverage (x) across developing countries, drawn over the scatter of the
# same data.

x_values = mean_life_expectancy_polio_developing["Polio Vaccines (%)"]
y_values = mean_life_expectancy_polio_developing["Life Expectancy"]

# Least-squares fit; rvalue is the correlation coefficient, squared below.
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Create a dedicated figure for this cell. With the interactive notebook
# backend, bare pyplot calls draw onto whichever figure is still active from
# an earlier cell, so each plot needs its own figure/axes.
fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
# The equation label is placed at a fixed position in data coordinates.
ax.annotate(line_eq, (35, 80), fontsize=16, color="red")
ax.set_ylabel('Life Expectancy (Yrs)')
ax.set_xlabel('Polio Vaccines (%)')
ax.set_title('Avg Life Expectancy vs Polio Vaccinations for Developing Countries')
print(f"R squared: {rvalue**2}")
plt.show()

<IPython.core.display.Javascript object>

R squared: 0.38702727171440066
