In [1]:
%matplotlib notebook

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

In [3]:
# Path to the life-expectancy CSV, given relative to the notebook's working directory.
life_expectancy_clean = "resources/life_expectancy_clean.csv"

# Read the file into the DataFrame that every later cell works from.
life_expectancy = pd.read_csv(filepath_or_buffer=life_expectancy_clean)

In [4]:
# Preview the first rows to check that the file loaded as expected.
life_expectancy.head()

Unnamed: 0,Country,Year,Status,Life Expectancy,Adult Mortality,Infant Deaths,Alcohol,Percentage Expenditure,Hepatitis B,Measles,...,Polio,Total Expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness 1-19 yrs,Thinness 5-9 yrs,Income Composition of Resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.28%,65.0,1154,...,6.0,8.16,65.0,0.1,584.259,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.52%,62.0,492,...,58.0,8.18,62.0,0.1,612.697,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.22%,64.0,430,...,62.0,8.13,64.0,0.1,631.745,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.18%,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.10%,68.0,3013,...,68.0,7.87,68.0,0.1,63.537,2978599.0,18.2,18.2,0.454,9.5


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
life_expectancy.describe()

Unnamed: 0,Year,Life Expectancy,Adult Mortality,Infant Deaths,Alcohol,Hepatitis B,Measles,BMI,Under-5yo Deaths,Polio,Total Expenditure,Diphtheria,HIV/AIDS,Population,Thinness 1-19 yrs,Thinness 5-9 yrs,Income Composition of Resources,Schooling
count,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0,1657.0
mean,2007.850935,69.299819,168.074231,32.444176,4.513271,79.161738,2214.61557,38.173325,44.065782,83.421847,5.943808,83.972842,1.974774,14605690.0,4.857333,4.913096,0.631559,12.116295
std,4.086784,8.781483,125.150716,120.566224,4.029705,25.577115,10062.430303,19.748093,162.52047,22.592244,2.300552,21.762633,6.019189,70295550.0,4.589155,4.643184,0.182654,2.79064
min,2000.0,44.0,1.0,0.0,0.01,2.0,0.0,2.0,0.0,3.0,0.74,2.0,0.1,34.0,0.1,0.1,0.0,4.2
25%,2005.0,64.4,77.0,1.0,0.76,74.0,0.0,19.5,1.0,81.0,4.38,81.0,0.1,191897.0,1.6,1.7,0.511,10.3
50%,2008.0,71.7,148.0,3.0,3.76,89.0,15.0,43.8,4.0,93.0,5.83,92.0,0.1,1425221.0,3.1,3.2,0.672,12.3
75%,2011.0,75.0,227.0,22.0,7.33,96.0,373.0,55.8,29.0,97.0,7.46,97.0,0.7,7592865.0,7.0,7.1,0.75,13.9
max,2015.0,89.0,723.0,1600.0,17.87,99.0,131441.0,77.1,2100.0,99.0,14.39,99.0,50.6,1293859000.0,27.2,28.2,0.936,20.7


In [6]:
# Group the life-expectancy values by country once and reuse that grouping
# for every statistic below.
life_expectancy_by_country = life_expectancy.groupby('Country')["Life Expectancy"]

mean = life_expectancy_by_country.mean()
median = life_expectancy_by_country.median()
var = life_expectancy_by_country.var()
std = life_expectancy_by_country.std()

# One row per country, one column per statistic.
summary_statistics = pd.DataFrame(
    {
        "Mean": mean,
        "Median": median,
        "Variance": var,
        "Standard Dev.": std,
    }
)
summary_statistics.head()

Unnamed: 0_level_0,Mean,Median,Variance,Standard Dev.
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,58.19375,57.8,5.667292,2.380607
Albania,75.15625,75.6,3.373292,1.836652
Algeria,74.209091,74.4,1.034909,1.017305
Angola,50.675,50.3,6.193571,2.488689
Argentina,75.238462,75.4,0.464231,0.681345


In [7]:
# List the distinct Status labels; the row filters on Status compare against these exact strings.
life_expectancy["Status"].unique()

array(['Developing', 'Developed'], dtype=object)

In [8]:
# Keep only the rows whose Status is exactly "Developed" (all columns retained).
is_developed = life_expectancy["Status"] == "Developed"
developed_countries = life_expectancy[is_developed]
developed_countries.head()

Unnamed: 0,Country,Year,Status,Life Expectancy,Adult Mortality,Infant Deaths,Alcohol,Percentage Expenditure,Hepatitis B,Measles,...,Polio,Total Expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness 1-19 yrs,Thinness 5-9 yrs,Income Composition of Resources,Schooling
79,Australia,2014,Developed,82.7,6.0,1,9.71,"10,769.36%",91.0,340,...,92.0,9.42,92.0,0.1,62214.691,2346694.0,0.6,0.6,0.936,20.4
80,Australia,2013,Developed,82.5,61.0,1,9.87,"11,734.85%",91.0,158,...,91.0,9.36,91.0,0.1,67792.339,23117353.0,0.6,0.6,0.933,20.3
81,Australia,2012,Developed,82.3,61.0,1,10.03,"11,715.00%",91.0,199,...,92.0,9.36,92.0,0.1,67677.635,22728254.0,0.6,0.6,0.93,20.1
82,Australia,2011,Developed,82.0,63.0,1,10.3,"10,986.27%",92.0,190,...,92.0,9.2,92.0,0.1,62245.129,223424.0,0.6,0.6,0.927,19.8
83,Australia,2010,Developed,81.9,64.0,1,10.52,"8,875.79%",92.0,70,...,92.0,9.2,92.0,0.1,51874.848,223175.0,0.7,0.6,0.927,19.5


In [9]:
# Average life expectancy and alcohol value per developed country.
# The string alias "mean" is used instead of np.mean: passing NumPy callables to
# groupby.agg is deprecated in recent pandas and emits a FutureWarning.
# as_index=False keeps Country as a regular column rather than the index.
mean_life_expectancy_alcohol_developed = (
    developed_countries
    .groupby("Country", as_index=False)
    .agg({"Life Expectancy": "mean", "Alcohol": "mean"})
)

mean_life_expectancy_alcohol_developed.head()


Unnamed: 0,Country,Life Expectancy,Alcohol
0,Australia,81.907143,10.154286
1,Austria,81.48,12.236
2,Belgium,80.653333,11.042667
3,Bulgaria,72.74,10.865333
4,Croatia,76.7875,12.1425


In [10]:
# Scatter of each developed country's average alcohol value against its
# average life expectancy.
x = mean_life_expectancy_alcohol_developed["Alcohol"]
y = mean_life_expectancy_alcohol_developed["Life Expectancy"]

# Create the figure explicitly: under `%matplotlib notebook` the previous
# interactive figure stays current, so bare pyplot calls could draw onto it.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumvioletred", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Avg Alcohol Consumption for Developed Countries")
ax.set_xlabel("Alcohol")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle="-", linewidth=1, alpha=0.5)
# Fixed viewing window; any point outside these ranges is not shown.
ax.set_ylim(70, 90)
ax.set_xlim(6, 13)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [11]:
# Average life expectancy and BMI value per developed country.
# The string alias "mean" is used instead of np.mean: passing NumPy callables to
# groupby.agg is deprecated in recent pandas and emits a FutureWarning.
# as_index=False keeps Country as a regular column rather than the index.
mean_life_expectancy_bmi_developed = (
    developed_countries
    .groupby("Country", as_index=False)
    .agg({"Life Expectancy": "mean", "BMI": "mean"})
)

mean_life_expectancy_bmi_developed.head()

Unnamed: 0,Country,Life Expectancy,BMI
0,Australia,81.907143,54.928571
1,Austria,81.48,47.666667
2,Belgium,80.653333,50.04
3,Bulgaria,72.74,53.753333
4,Croatia,76.7875,47.4375


In [14]:
# Scatter of each developed country's average BMI value against its
# average life expectancy.
x = mean_life_expectancy_bmi_developed["BMI"]
y = mean_life_expectancy_bmi_developed["Life Expectancy"]

# Create the figure explicitly: under `%matplotlib notebook` the previous
# interactive figure stays current, so bare pyplot calls could draw onto it.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumslateblue", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Avg BMI for Developed Countries")
ax.set_xlabel("BMI")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle="-", linewidth=1, alpha=0.5)
# Fixed viewing window; any point outside these ranges is not shown.
ax.set_ylim(70, 90)
ax.set_xlim(40, 70)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [13]:
# Keep only the rows whose Status is exactly "Developing" (all columns retained).
is_developing = life_expectancy["Status"] == "Developing"
developing_countries = life_expectancy[is_developing]
developing_countries.head()

Unnamed: 0,Country,Year,Status,Life Expectancy,Adult Mortality,Infant Deaths,Alcohol,Percentage Expenditure,Hepatitis B,Measles,...,Polio,Total Expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness 1-19 yrs,Thinness 5-9 yrs,Income Composition of Resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.28%,65.0,1154,...,6.0,8.16,65.0,0.1,584.259,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.52%,62.0,492,...,58.0,8.18,62.0,0.1,612.697,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.22%,64.0,430,...,62.0,8.13,64.0,0.1,631.745,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.18%,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.10%,68.0,3013,...,68.0,7.87,68.0,0.1,63.537,2978599.0,18.2,18.2,0.454,9.5


In [27]:
# Show the exact column labels; aggregation keys must match these strings
# character for character (some labels contain irregular spacing).
developing_countries.columns

Index(['Country', 'Year', 'Status', 'Life Expectancy', 'Adult Mortality',
       'Infant Deaths', 'Alcohol', 'Percentage Expenditure', 'Hepatitis B',
       'Measles', 'BMI', 'Under-5yo Deaths', 'Polio', 'Total Expenditure',
       'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'Thinness  1-19 yrs',
       'Thinness 5-9 yrs', 'Income Composition of Resources', 'Schooling'],
      dtype='object')

In [17]:
# Average life expectancy and alcohol value per developing country.
# The string alias "mean" is used instead of np.mean: passing NumPy callables to
# groupby.agg is deprecated in recent pandas and emits a FutureWarning.
# as_index=False keeps Country as a regular column rather than the index.
mean_life_expectancy_alcohol_developing = (
    developing_countries
    .groupby("Country", as_index=False)
    .agg({"Life Expectancy": "mean", "Alcohol": "mean"})
)

mean_life_expectancy_alcohol_developing.head()


Unnamed: 0,Country,Life Expectancy,Alcohol
0,Afghanistan,58.19375,0.014375
1,Albania,75.15625,4.84875
2,Algeria,74.209091,0.447273
3,Angola,50.675,7.62
4,Argentina,75.238462,8.004615


In [19]:
# Scatter of each developing country's average alcohol value against its
# average life expectancy. Axis limits are left to matplotlib's autoscaling.
x = mean_life_expectancy_alcohol_developing["Alcohol"]
y = mean_life_expectancy_alcohol_developing["Life Expectancy"]

# Create the figure explicitly: under `%matplotlib notebook` the previous
# interactive figure stays current, so bare pyplot calls could draw onto it.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumvioletred", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Avg Alcohol Consumption for Developing Countries")
ax.set_xlabel("Alcohol")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle="-", linewidth=1, alpha=0.5)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [20]:
# Average life expectancy and BMI value per developing country.
# The string alias "mean" is used instead of np.mean: passing NumPy callables to
# groupby.agg is deprecated in recent pandas and emits a FutureWarning.
# as_index=False keeps Country as a regular column rather than the index.
mean_life_expectancy_bmi_developing = (
    developing_countries
    .groupby("Country", as_index=False)
    .agg({"Life Expectancy": "mean", "BMI": "mean"})
)

mean_life_expectancy_bmi_developing.head()

Unnamed: 0,Country,Life Expectancy,BMI
0,Afghanistan,58.19375,15.51875
1,Albania,75.15625,49.06875
2,Algeria,74.209091,48.872727
3,Angola,50.675,18.45
4,Argentina,75.238462,54.484615


In [22]:
# Scatter of each developing country's average BMI value against its
# average life expectancy. Axis limits are left to matplotlib's autoscaling.
x = mean_life_expectancy_bmi_developing["BMI"]
y = mean_life_expectancy_bmi_developing["Life Expectancy"]

# Create the figure explicitly: under `%matplotlib notebook` the previous
# interactive figure stays current, so bare pyplot calls could draw onto it.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumslateblue", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Avg BMI for Developing Countries")
ax.set_xlabel("BMI")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle="-", linewidth=1, alpha=0.5)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [23]:
# Average life expectancy and Diphtheria value per developing country.
# The string alias "mean" is used instead of np.mean: passing NumPy callables to
# groupby.agg is deprecated in recent pandas and emits a FutureWarning.
# as_index=False keeps Country as a regular column rather than the index.
mean_life_expectancy_dip_developing = (
    developing_countries
    .groupby("Country", as_index=False)
    .agg({"Life Expectancy": "mean", "Diphtheria": "mean"})
)

mean_life_expectancy_dip_developing.head()


Unnamed: 0,Country,Life Expectancy,Diphtheria
0,Afghanistan,58.19375,52.3125
1,Albania,75.15625,98.0625
2,Algeria,74.209091,93.363636
3,Angola,50.675,64.0
4,Argentina,75.238462,93.692308


In [24]:
# Scatter of each developing country's average Diphtheria value against its
# average life expectancy. Axis limits are left to matplotlib's autoscaling.
x = mean_life_expectancy_dip_developing["Diphtheria"]
y = mean_life_expectancy_dip_developing["Life Expectancy"]

# Create the figure explicitly: under `%matplotlib notebook` the previous
# interactive figure stays current, so bare pyplot calls could draw onto it.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumslateblue", edgecolor="black")
ax.set_title("Avg Life Expectancy vs Diphtheria Vaccinations for Developing Countries")
ax.set_xlabel("Diphtheria")
ax.set_ylabel("Life Expectancy (Yrs)")
ax.grid(linestyle="-", linewidth=1, alpha=0.5)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [28]:
# Average Under-5yo Deaths and Diphtheria value per developing country.
# The string alias "mean" is used instead of np.mean: passing NumPy callables to
# groupby.agg is deprecated in recent pandas and emits a FutureWarning.
# as_index=False keeps Country as a regular column rather than the index.
under5_dip_developing = (
    developing_countries
    .groupby("Country", as_index=False)
    .agg({"Under-5yo Deaths": "mean", "Diphtheria": "mean"})
)

under5_dip_developing.head()


Unnamed: 0,Country,Under-5yo Deaths,Diphtheria
0,Afghanistan,107.5625,52.3125
1,Albania,0.9375,98.0625
2,Algeria,23.363636,93.363636
3,Angola,118.75,64.0
4,Argentina,11.230769,93.692308


In [None]:
# Scatter of each developing country's average Diphtheria value against its
# average Under-5yo Deaths, read from the under5_dip_developing aggregate.
# This cell previously re-plotted the life-expectancy frame unchanged, which
# duplicated the earlier diphtheria figure and left under5_dip_developing unused.
x = under5_dip_developing["Diphtheria"]
y = under5_dip_developing["Under-5yo Deaths"]

# Create the figure explicitly: under `%matplotlib notebook` the previous
# interactive figure stays current, so bare pyplot calls could draw onto it.
fig, ax = plt.subplots()
ax.scatter(x, y, facecolor="mediumslateblue", edgecolor="black")
ax.set_title("Avg Under-5yo Deaths vs Diphtheria Vaccinations for Developing Countries")
ax.set_xlabel("Diphtheria")
ax.set_ylabel("Under-5yo Deaths")
ax.grid(linestyle="-", linewidth=1, alpha=0.5)
fig.tight_layout()