In [39]:
# Hypothesis and three questions

In [40]:
%matplotlib notebook
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
import numpy as np

In [41]:
# Location of the cleaned life-expectancy dataset, relative to the notebook.
life_expectancy_clean = "resources/life_expectancy_clean.csv"

# Read the whole CSV into the DataFrame every later cell starts from.
life_expectancy = pd.read_csv(filepath_or_buffer=life_expectancy_clean)

In [42]:
# Preview the first five rows to sanity-check the columns that were loaded.
life_expectancy.head(n=5)

Unnamed: 0,Country,Year,Status,Life Expectancy,Adult Mortality (%),Infant Deaths (%),Alcohol Consumpter per Capita (ltr),Percentage Expenditure,Hepatitis B Vaccines (%),Measles Cases (%),...,Polio Vaccines (%),Total Expenditure (%),Diphtheria Vaccines (%),HIV/AIDS Deaths Under 5yo (%),GDP Per Capita ($),Population,Thinness 1-19 yrs,Thinness 5-9 yrs,Income Composition of Resources,Schooling
0,Afghanistan,2015,Developing,65.0,26.3,6.2,0.01,71.28,65.0,115.4,...,6.0,8.16,65.0,0.01,584.26,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,27.1,6.4,0.01,73.52,62.0,49.2,...,58.0,8.18,62.0,0.01,612.7,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,26.8,6.6,0.01,73.22,64.0,43.0,...,62.0,8.13,64.0,0.01,631.74,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,27.2,6.9,0.01,78.18,67.0,278.7,...,67.0,8.52,67.0,0.01,669.96,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,27.5,7.1,0.01,7.1,68.0,301.3,...,68.0,7.87,68.0,0.01,63.54,2978599.0,18.2,18.2,0.454,9.5


In [43]:
# Per-country summary statistics of life expectancy over all years in the data.
# The frame is grouped once and the grouped series is reused for each statistic.
life_by_country = life_expectancy.groupby('Country')["Life Expectancy"]

mean = life_by_country.mean()
median = life_by_country.median()
var = life_by_country.var()
std = life_by_country.std()

# One row per country, one column per statistic.
summary_statistics = pd.DataFrame(
    {
        "Mean": mean,
        "Median": median,
        "Variance": var,
        "Standard Dev.": std,
    }
)
summary_statistics.head()

Unnamed: 0_level_0,Mean,Median,Variance,Standard Dev.
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,58.19375,57.8,5.667292,2.380607
Albania,75.15625,75.6,3.373292,1.836652
Algeria,73.486667,73.8,2.284095,1.511322
Angola,48.793333,48.2,7.763524,2.78631
Antigua and Barbuda,74.966667,75.0,0.720952,0.849089


In [44]:
# Keep only the rows whose Status column is exactly "Developing".
is_developing = life_expectancy["Status"] == "Developing"
developing_countries = life_expectancy.loc[is_developing, :]
developing_countries.head()

Unnamed: 0,Country,Year,Status,Life Expectancy,Adult Mortality (%),Infant Deaths (%),Alcohol Consumpter per Capita (ltr),Percentage Expenditure,Hepatitis B Vaccines (%),Measles Cases (%),...,Polio Vaccines (%),Total Expenditure (%),Diphtheria Vaccines (%),HIV/AIDS Deaths Under 5yo (%),GDP Per Capita ($),Population,Thinness 1-19 yrs,Thinness 5-9 yrs,Income Composition of Resources,Schooling
0,Afghanistan,2015,Developing,65.0,26.3,6.2,0.01,71.28,65.0,115.4,...,6.0,8.16,65.0,0.01,584.26,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,27.1,6.4,0.01,73.52,62.0,49.2,...,58.0,8.18,62.0,0.01,612.7,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,26.8,6.6,0.01,73.22,64.0,43.0,...,62.0,8.13,64.0,0.01,631.74,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,27.2,6.9,0.01,78.18,67.0,278.7,...,67.0,8.52,67.0,0.01,669.96,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,27.5,7.1,0.01,7.1,68.0,301.3,...,68.0,7.87,68.0,0.01,63.54,2978599.0,18.2,18.2,0.454,9.5


In [45]:
#Do early life vaccinations affect average life expectancy in developing countries?
#In particular, a linear regression is run on polio and diphtheria vaccination rates to measure the strength of the correlation

In [46]:
# Average life expectancy and diphtheria vaccination coverage for each
# developing country (one row per country, Country kept as a regular column).
# The aggregation is named by the string "mean" rather than the np.mean
# callable: recent pandas versions emit a FutureWarning for the callable form.
mean_life_expectancy_dip_developing = (
    developing_countries
    .groupby(['Country'], as_index=False)
    .agg({"Life Expectancy": "mean", "Diphtheria Vaccines (%)": "mean"})
)
mean_life_expectancy_dip_developing.head()

Unnamed: 0,Country,Life Expectancy,Diphtheria Vaccines (%)
0,Afghanistan,58.19375,52.3125
1,Albania,75.15625,98.0625
2,Algeria,73.486667,91.666667
3,Angola,48.793333,46.6
4,Antigua and Barbuda,74.966667,98.266667


In [58]:
# Linear regression of average life expectancy on average diphtheria
# vaccination coverage, one point per developing country.
x_values = mean_life_expectancy_dip_developing["Diphtheria Vaccines (%)"]
y_values = mean_life_expectancy_dip_developing["Life Expectancy"]
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Draw on a dedicated figure. Bare pyplot calls target whatever figure is
# current, so without a new figure the plots from separate cells can pile up
# on the same axes under the interactive notebook backend.
fig, ax = plt.subplots()
ax.scatter(x_values, y_values, marker="o", edgecolors="purple", facecolors="lavender")
ax.plot(x_values, regress_values, "r-")
ax.annotate(line_eq, (30, 80), fontsize=16, color="red")
ax.annotate(f"R squared: {rvalue**2}", (25, 77), fontsize=12, color="red")
ax.set_ylabel('Life Expectancy in Years')
ax.set_xlabel('Diphtheria Vaccines (%)')
ax.set_title('Average Life Expectancy vs Diphtheria Vaccinations for Developing Countries (2000-2015)')
print(f"R squared: {rvalue**2}")

# Save through the Figure object and before show(): this does not depend on
# which figure is current, and some backends discard the figure once shown.
# The output_graphs/ directory must already exist.
fig.savefig("output_graphs/Average Life Expectancy vs Diphtheria Vaccinations for Developing Countries (2000-2015).png")
plt.show()

<IPython.core.display.Javascript object>

R squared: 0.3845846769715367


In [49]:
# Average life expectancy and polio vaccination coverage for each developing
# country (one row per country, Country kept as a regular column).
# The aggregation is named by the string "mean" rather than the np.mean
# callable: recent pandas versions emit a FutureWarning for the callable form.
mean_life_expectancy_polio_developing = (
    developing_countries
    .groupby(['Country'], as_index=False)
    .agg({"Life Expectancy": "mean", "Polio Vaccines (%)": "mean"})
)
mean_life_expectancy_polio_developing.head()

Unnamed: 0,Country,Life Expectancy,Polio Vaccines (%)
0,Afghanistan,58.19375,48.375
1,Albania,75.15625,98.125
2,Algeria,73.486667,91.533333
3,Angola,48.793333,48.733333
4,Antigua and Barbuda,74.966667,97.666667


In [50]:
# Linear regression of average life expectancy on average polio vaccination
# coverage, one point per developing country.
x_values = mean_life_expectancy_polio_developing["Polio Vaccines (%)"]
y_values = mean_life_expectancy_polio_developing["Life Expectancy"]
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Draw on a dedicated figure. Bare pyplot calls target whatever figure is
# current, so without a new figure this plot can land on top of the one
# drawn by an earlier cell under the interactive notebook backend.
fig, ax = plt.subplots()
ax.scatter(x_values, y_values, marker="o", edgecolors="coral", facecolors="yellow")
ax.plot(x_values, regress_values, "r-")
ax.annotate(line_eq, (35, 80), fontsize=16, color="red")
ax.annotate(f"R squared: {rvalue**2}", (30, 77), fontsize=12, color="red")
ax.set_ylabel('Life Expectancy in Years')
ax.set_xlabel('Polio Vaccines (%)')
ax.set_title('Average Life Expectancy vs Polio Vaccinations for Developing Countries (2000-2015)')

# Save through the Figure object and before show(): this does not depend on
# which figure is current, and some backends discard the figure once shown.
# The output_graphs/ directory must already exist.
fig.savefig("output_graphs/Average Life Expectancy vs Polio Vaccinations for Developing Countries (2000-2015).png")
plt.show()

<IPython.core.display.Javascript object>

In [51]:
#Is increased schooling positively correlated with increased life expectancy?

In [57]:
# QUESTION: is increased schooling positively correlated with increased life
# expectancy?
# Per-country averages are taken over every row of life_expectancy, i.e. all
# countries in the dataset, not only the developing ones.
mean_lifeexp = life_expectancy.groupby('Country')["Life Expectancy"].mean()
mean_school = life_expectancy.groupby('Country')["Schooling"].mean()

# Data frame for schooling and life expectancy (indexed by country)
schooling_avgs = pd.DataFrame({"Avg Life Expectancy (2000-2015)": mean_lifeexp,
                               "Avg Schooling per Country (2000-2015)": mean_school})

y_axis = schooling_avgs["Avg Life Expectancy (2000-2015)"]
x_axis = schooling_avgs["Avg Schooling per Country (2000-2015)"]

# Regression of life expectancy on schooling
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_axis, y_axis)
regress_values = x_axis * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Draw on a dedicated figure. Bare pyplot calls target whatever figure is
# current, so without a new figure this plot can land on top of the one
# drawn by an earlier cell under the interactive notebook backend.
fig, ax = plt.subplots()
ax.scatter(x_axis, y_axis, marker="o", edgecolors="teal", facecolors="lightblue")
ax.plot(x_axis, regress_values, "r-")

# Both annotations are added before the figure is shown or saved so that they
# are part of what gets rendered and written to disk.
ax.annotate(line_eq, (4, 85), fontsize=16, color="red")
ax.annotate(f"R squared: {rvalue**2}", (4, 82), fontsize=12, color="red")

# The title no longer says "in Developing Countries": the data plotted here is
# not filtered by Status.
ax.set_title("Average Life Expectancy vs Average Schooling (2000-2015)")
ax.set_ylabel("Life Expectancy in Years")
ax.set_xlabel("Schooling per Country in Years")

# NOTE(review): the file name still mentions "Developing Countries"; it is
# left unchanged so anything that references the saved image keeps working.
# Confirm whether the analysis was meant to use developing_countries instead.
# The output_graphs/ directory must already exist.
fig.savefig("output_graphs/Average Life Expectancy vs Average Schooling in Developing Countries (2000-2015).png")
plt.show()

<IPython.core.display.Javascript object>

In [60]:
#Are average life expectancy and infant deaths affected by GDP in developing countries?

In [65]:
# Yearly averages of GDP per capita and infant deaths across the developing
# countries (one row per year, Year kept as a regular column).
# The aggregation is named by the string "mean" rather than the np.mean
# callable: recent pandas versions emit a FutureWarning for the callable form.
infant_deaths_gdp = (
    developing_countries
    .groupby(['Year'], as_index=False)
    .agg({"GDP Per Capita ($)": "mean", "Infant Deaths (%)": "mean"})
)
infant_deaths_gdp.head()

Unnamed: 0,Year,GDP Per Capita ($),Infant Deaths (%)
0,2000,2389.65488,4.391549
1,2001,2405.09381,4.305634
2,2002,2604.811667,4.149296
3,2003,3103.506825,4.062676
4,2004,3721.897087,3.906338


In [70]:
# Linear regression of yearly average infant deaths on yearly average GDP per
# capita for developing countries, one point per year.
x_values = infant_deaths_gdp["GDP Per Capita ($)"]
y_values = infant_deaths_gdp["Infant Deaths (%)"]
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# The slope is a change in percentage points per dollar, which is far smaller
# than 0.01 for this data, so rounding it to two decimals would print it as
# 0.0. Three significant digits keep the displayed equation meaningful.
line_eq = f"y = {slope:.3g}x + {round(intercept, 2)}"

# Draw on a dedicated figure. Bare pyplot calls target whatever figure is
# current, so without a new figure this plot can land on top of the one
# drawn by an earlier cell under the interactive notebook backend.
fig, ax = plt.subplots()
ax.scatter(x_values, y_values, facecolor="pink", edgecolor="hotpink")
ax.plot(x_values, regress_values, "r-")
ax.annotate(line_eq, (5000, 4.35), fontsize=15, color="red")
ax.annotate(f"R squared: {rvalue**2}", (4200, 4.15), fontsize=12, color="red")
ax.grid(linestyle='-', linewidth=1, alpha=0.5)
ax.set_title("Infant Deaths vs GDP by Year (2000-2015) in Developing Countries")
ax.set_xlabel('GDP Per Capita ($)')
# The plotted column is "Infant Deaths (%)", so the unit on the axis is %, not $.
ax.set_ylabel('Infant Deaths (%)')

# Save through the Figure object and before show(): this does not depend on
# which figure is current, and some backends discard the figure once shown.
# The output_graphs/ directory must already exist.
fig.savefig("output_graphs/Infant Deaths vs GDP by Year (2000-2015) in Developing Countries.png")
plt.show()

<IPython.core.display.Javascript object>

In [71]:
# Yearly averages of GDP per capita and life expectancy across the developing
# countries (one row per year, Year kept as a regular column).
# The aggregation is named by the string "mean" rather than the np.mean
# callable: recent pandas versions emit a FutureWarning for the callable form.
life_expectancy_gdp = (
    developing_countries
    .groupby(['Year'], as_index=False)
    .agg({"GDP Per Capita ($)": "mean", "Life Expectancy": "mean"})
)

life_expectancy_gdp.head()

Unnamed: 0,Year,GDP Per Capita ($),Life Expectancy
0,2000,2389.65488,65.057746
1,2001,2405.09381,65.428873
2,2002,2604.811667,65.566197
3,2003,3103.506825,65.530986
4,2004,3721.897087,65.780986


In [78]:
# Linear regression of yearly average life expectancy on yearly average GDP
# per capita for developing countries, one point per year.
# GDP is the explanatory variable, so it goes on the x axis; this makes the
# plotted data agree with the axis labels and the title.
x_value = life_expectancy_gdp["GDP Per Capita ($)"]
y_value = life_expectancy_gdp["Life Expectancy"]
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_value, y_value)
regress_values = x_value * slope + intercept

# The slope is in years per dollar, which is far smaller than 0.01 for this
# data, so it is shown with three significant digits instead of being rounded
# to two decimals (which would print it as 0.0).
line_eq = f"y = {slope:.3g}x + {round(intercept, 2)}"

# Draw on a dedicated figure. Bare pyplot calls target whatever figure is
# current, so without a new figure this plot can land on top of the one
# drawn by an earlier cell under the interactive notebook backend.
fig, ax = plt.subplots()
ax.scatter(x_value, y_value, facecolor="lightgreen", edgecolor="green")
ax.plot(x_value, regress_values, "r-")

# Annotations are placed in axes-fraction coordinates (top-left corner) so
# they stay inside the plot regardless of the data range.
ax.annotate(line_eq, (0.05, 0.92), xycoords="axes fraction", fontsize=15, color="red")
ax.annotate(f"R squared: {rvalue**2}", (0.05, 0.84), xycoords="axes fraction", fontsize=12, color="red")
ax.set_ylabel('Life Expectancy in Years')
ax.set_xlabel('GDP Per Capita ($)')
ax.set_title('Average Life Expectancy vs GDP by Year (2000-2015) in Developing Countries')

# Save through the Figure object, then display.
# The output_graphs/ directory must already exist.
fig.savefig("output_graphs/Average Life Expectancy vs GDP by Year (2000-2015) in Developing Countries.png")
plt.show()

<IPython.core.display.Javascript object>