In [1]:
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
from scipy.stats import linregress
from scipy import stats
import pingouin as pg # Install pingouin stats package (pip install pingouin)
import seaborn as sns # Install seaborn data visualization library (pip install seaborn)
from scipy.stats import pearsonr

# Calendar years covered by the analysis: 2005 through 2015 inclusive
yr_list = list(range(2005, 2016))

# Silence warning messages so they do not clutter the notebook output
import warnings
warnings.filterwarnings(action='ignore')

# Path of the CSV file holding the population health data
data_to_load = "data.csv"

# Load the population health data into a DataFrame
health_data_pd = pd.read_csv(data_to_load)

# Preview the loaded table
health_data_pd

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,Unnamed: 60
0,Arab World,ARB,% of females ages 15-49 having comprehensive c...,SH.HIV.KNOW.FE.ZS,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,% of males ages 15-49 having comprehensive cor...,SH.HIV.KNOW.MA.ZS,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,133.555013,134.159119,134.857912,134.504576,134.105211,133.569626,...,49.999851,49.887046,49.781207,49.672975,49.536047,49.383745,48.796558,48.196418,,
3,Arab World,ARB,Adults (ages 15+) and children (0-14 years) li...,SH.HIV.TOTL,,,,,,,...,,,,,,,,,,
4,Arab World,ARB,Adults (ages 15+) and children (ages 0-14) new...,SH.HIV.INCD.TL,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89005,Zimbabwe,ZWE,Use of insecticide-treated bed nets (% of unde...,SH.MLR.NETS.ZS,,,,,,,...,,,17.300000,,9.700000,,,26.800000,,
89006,Zimbabwe,ZWE,Use of Intermittent Preventive Treatment of ma...,SH.MLR.SPF2.ZS,,,,,,,...,,,13.900000,,7.300000,,,12.900000,,
89007,Zimbabwe,ZWE,Vitamin A supplementation coverage rate (% of ...,SN.ITK.VITA.ZS,,,,,,,...,83.000000,0.000000,77.000000,49.000000,47.000000,61.000000,34.000000,32.000000,,
89008,Zimbabwe,ZWE,Wanted fertility rate (births per woman),SP.DYN.WFRT,,,,,,,...,,,,,3.500000,,,,,


In [2]:
# Keep the identifying columns plus the yearly values for 2005 through 2015

id_columns = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']
year_columns = [str(year) for year in range(2005, 2016)]

health_data_decade_df = health_data_pd[id_columns + year_columns]
health_data_decade_df


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,Arab World,ARB,% of females ages 15-49 having comprehensive c...,SH.HIV.KNOW.FE.ZS,,,,,,,,,,,
1,Arab World,ARB,% of males ages 15-49 having comprehensive cor...,SH.HIV.KNOW.MA.ZS,,,,,,,,,,,
2,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,50.732590,50.329135,49.999851,49.887046,49.781207,49.672975,49.536047,49.383745,48.796558,48.196418,
3,Arab World,ARB,Adults (ages 15+) and children (0-14 years) li...,SH.HIV.TOTL,,,,,,,,,,,
4,Arab World,ARB,Adults (ages 15+) and children (ages 0-14) new...,SH.HIV.INCD.TL,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89005,Zimbabwe,ZWE,Use of insecticide-treated bed nets (% of unde...,SH.MLR.NETS.ZS,,3.100000,,,17.300000,,9.700000,,,26.800000,
89006,Zimbabwe,ZWE,Use of Intermittent Preventive Treatment of ma...,SH.MLR.SPF2.ZS,,6.300000,,,13.900000,,7.300000,,,12.900000,
89007,Zimbabwe,ZWE,Vitamin A supplementation coverage rate (% of ...,SN.ITK.VITA.ZS,81.000000,67.000000,83.000000,0.000000,77.000000,49.000000,47.000000,61.000000,34.000000,32.000000,
89008,Zimbabwe,ZWE,Wanted fertility rate (births per woman),SP.DYN.WFRT,,3.300000,,,,,3.500000,,,,


In [3]:
# Group the decade table by country, then pull out the United States rows.
# Note: countries_sorted_df is a GroupBy object (reused by later cells), not a sorted frame.

countries_sorted_df = health_data_decade_df.groupby(by='Country Name')
usa_df = countries_sorted_df.get_group('United States')
usa_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
85215,United States,USA,% of females ages 15-49 having comprehensive c...,SH.HIV.KNOW.FE.ZS,,,,,,,,,,,
85216,United States,USA,% of males ages 15-49 having comprehensive cor...,SH.HIV.KNOW.MA.ZS,,,,,,,,,,,
85217,United States,USA,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,41.0818,40.3754,39.669,37.7398,35.8106,33.8814,31.9522,30.023,27.0666,24.1102,
85218,United States,USA,Adults (ages 15+) and children (0-14 years) li...,SH.HIV.TOTL,,,,,,,,,,,
85219,United States,USA,Adults (ages 15+) and children (ages 0-14) new...,SH.HIV.INCD.TL,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85555,United States,USA,Use of insecticide-treated bed nets (% of unde...,SH.MLR.NETS.ZS,,,,,,,,,,,
85556,United States,USA,Use of Intermittent Preventive Treatment of ma...,SH.MLR.SPF2.ZS,,,,,,,,,,,
85557,United States,USA,Vitamin A supplementation coverage rate (% of ...,SN.ITK.VITA.ZS,,,,,,,,,,,
85558,United States,USA,Wanted fertility rate (births per woman),SP.DYN.WFRT,,,,,,,,,,,


In [4]:
# Keep only indicators that have a value in every column (drop rows containing any NaN)

dropped_usa_df = usa_df.dropna(axis=0, how='any')
dropped_usa_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
85224,United States,USA,Age dependency ratio (% of working-age populat...,SP.POP.DPND,4.869685e+01,4.871720e+01,4.864902e+01,4.857000e+01,4.858461e+01,4.873401e+01,4.891222e+01,4.921263e+01,4.967125e+01,5.025639e+01,5.091245e+01
85225,United States,USA,"Age dependency ratio, old",SP.POP.DPND.OL,1.832077e+01,1.842681e+01,1.855893e+01,1.874134e+01,1.900255e+01,1.935391e+01,1.979159e+01,2.032804e+01,2.094949e+01,2.162146e+01,2.231402e+01
85226,United States,USA,"Age dependency ratio, young",SP.POP.DPND.YG,3.037608e+01,3.029038e+01,3.009009e+01,2.982867e+01,2.958206e+01,2.938010e+01,2.912063e+01,2.888459e+01,2.872176e+01,2.863493e+01,2.859843e+01
85227,United States,USA,"Age population, age 0, female, interpolated",SP.POP.AG00.FE.IN,2.074479e+06,2.082882e+06,2.070141e+06,2.041092e+06,2.003862e+06,1.965540e+06,1.928888e+06,1.890396e+06,1.858214e+06,1.842730e+06,1.850506e+06
85228,United States,USA,"Age population, age 0, male, interpolated",SP.POP.AG00.MA.IN,2.158649e+06,2.167520e+06,2.155450e+06,2.127284e+06,2.091058e+06,2.053711e+06,2.016453e+06,1.977553e+06,1.945105e+06,1.929548e+06,1.937553e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85518,United States,USA,Rural population (% of total population),SP.RUR.TOTL.ZS,2.007200e+01,1.990100e+01,1.973100e+01,1.956200e+01,1.939400e+01,1.922800e+01,1.906000e+01,1.889200e+01,1.872300e+01,1.855300e+01,1.838300e+01
85519,United States,USA,Rural population growth (annual %),SP.RUR.TOTL.ZG,7.338981e-02,1.086703e-01,9.315709e-02,8.565726e-02,1.413321e-02,-2.362652e-02,-1.137156e-01,-1.235261e-01,-1.611790e-01,-1.314247e-01,-1.360943e-01
85550,United States,USA,Urban population,SP.URB.TOTL,2.362005e+08,2.389993e+08,2.417953e+08,2.446071e+08,2.472763e+08,2.498656e+08,2.523052e+08,2.547624e+08,2.571827e+08,2.597405e+08,2.623324e+08
85551,United States,USA,Urban population (% of total),SP.URB.TOTL.IN.ZS,7.992800e+01,8.009900e+01,8.026900e+01,8.043800e+01,8.060600e+01,8.077200e+01,8.094000e+01,8.110800e+01,8.127700e+01,8.144700e+01,8.161700e+01


In [5]:
# Reshape to one row per year / one column per indicator code, then pick the indicators of interest

# Drop 'Country Name' and 'Country Code'; after transposing, the rows are
# 'Indicator Name', 'Indicator Code', then one row per year.
usa_transposed = dropped_usa_df.iloc[:, 2:].T
# The second row holds the indicator codes: use them as column labels.
usa_transposed.columns = usa_transposed.iloc[1, :]
# Discard the two label rows so that only the year rows remain.
decade_health_USA_T = usa_transposed.iloc[2:, :]

indicators_of_interest = ['SH.IMM.HEPB', 'SH.IMM.MEAS', 'SH.STA.ACSN',
                          'SP.DYN.IMRT.IN', 'SH.H2O.SAFE.ZS', 'SP.POP.GROW', 'SP.POP.TOTL', 'NY.GNP.PCAP.CD']
decade_health_USA_T_codes = decade_health_USA_T[indicators_of_interest]
decade_health_USA_T_codes

Indicator Code,SH.IMM.HEPB,SH.IMM.MEAS,SH.STA.ACSN,SP.DYN.IMRT.IN,SH.H2O.SAFE.ZS,SP.POP.GROW,SP.POP.TOTL,NY.GNP.PCAP.CD
2005,93,92,99.8,6.8,99.0,0.921713,295517000.0,46340
2006,93,92,99.9,6.7,99.0,0.964254,298380000.0,48080
2007,93,92,99.9,6.6,99.0,0.951055,301231000.0,48640
2008,94,92,99.9,6.5,99.0,0.945865,304094000.0,49330
2009,92,90,99.9,6.4,99.1,0.876651,306772000.0,48050
2010,92,92,99.9,6.3,99.1,0.835992,309347000.0,48950
2011,91,92,100.0,6.1,99.1,0.76385,311719000.0,50450
2012,90,91,100.0,6.1,99.1,0.761808,314103000.0,52520
2013,91,92,100.0,5.9,99.2,0.737406,316427000.0,53670
2014,92,92,100.0,5.7,99.2,0.780697,318907000.0,54400


In [None]:
# Plotting Immunization vs Infant Mortality (USA)

fig, ax1 = plt.subplots(1, sharex=True)
fig.suptitle('Measles Immunization Rate Vs. \n Infant Mortality Rate from 2005-2015 (USA)', fontsize=14, fontweight="bold")

# Read the years from the table's own index rather than a hard-coded list, so the
# x values always match the number of rows actually present in the data.
years = [int(label) for label in decade_health_USA_T_codes.index]
infant_mortality = decade_health_USA_T_codes['SP.DYN.IMRT.IN'].tolist()
measles_rate = decade_health_USA_T_codes['SH.IMM.MEAS'].tolist()

ax1.set_xlim(min(years) - .5, max(years) + .5)

# The two series use different units, so the units go in the legend
# instead of a single (misleading) "per 100" axis label.
ax1.plot(years, infant_mortality, linewidth=1, marker="o",
         label="Infant mortality (per 1,000 live births)")
ax1.plot(years, measles_rate, linewidth=1, marker="o", color="r",
         label="Measles immunization (% of children 12-23 months)")
ax1.set_ylabel("Rate")
ax1.set_xlabel("Year")
ax1.legend(loc="best")

plt.savefig("measles_usa.png")

In [None]:
# Calculating r value for Immunization vs Infant Mortality (USA)
# (pingouin is already imported as `pg` in the setup cell, so it is not re-imported here)

x = decade_health_USA_T_codes['SP.DYN.IMRT.IN'].tolist()   # infant mortality rate
y = decade_health_USA_T_codes['SH.IMM.MEAS'].tolist()      # measles immunization rate

print(pg.corr(x, y))

In [None]:
# Calculating r value and plotting graphs for GNP per capita vs Measles Immunization (USA)

sns.set(style='white', font_scale=1.2)

# Convert to float arrays: the transposed table stores its values as generic objects.
gnp_per_capita = decade_health_USA_T_codes['NY.GNP.PCAP.CD'].astype(float).values
measles_rate = decade_health_USA_T_codes['SH.IMM.MEAS'].astype(float).values

print(pg.corr(gnp_per_capita, measles_rate))

# Annotate the figure with the statistic actually computed from the data
# (previously a fixed "r = 0.45, p < .001" string was drawn regardless of the data).
r_value, p_value = pearsonr(gnp_per_capita, measles_rate)
p_text = 'p < .001' if p_value < 0.001 else f'p = {p_value:.3f}'

# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
g = sns.JointGrid(x=gnp_per_capita, y=measles_rate)
g = g.plot_joint(sns.regplot, color="xkcd:muted blue")
g = g.plot_marginals(sns.distplot, kde=False, bins=12, color="xkcd:bluey grey")
# Place the label in axes-fraction coordinates so it stays inside the plot
# whatever the data range is (the old data coordinates (150, 95) were off-scale for GNP).
g.ax_joint.text(0.05, 0.95, f'r = {r_value:.2f}, {p_text}', fontstyle='italic',
                transform=g.ax_joint.transAxes, verticalalignment='top')
plt.tight_layout()

plt.savefig("gnp_usa.png")

In [None]:

# Calculating r value and plotting graphs for GNP per capita vs Measles Immunization (Germany)

sns.set(style='white', font_scale=1.2)

# This cell runs before the cells that define decade_health_germany_T_codes, so on a
# fresh kernel that name does not exist yet. Build the two needed indicators here from
# countries_sorted_df, using the same dropna/transpose steps as the USA table.
germany_T = countries_sorted_df.get_group('Germany').dropna().iloc[:, 2:].T
germany_T.columns = germany_T.iloc[1, :]            # second row holds the indicator codes
germany_gnp_measles = germany_T.iloc[2:, :][['NY.GNP.PCAP.CD', 'SH.IMM.MEAS']]

# Convert to float arrays: the transposed table stores its values as generic objects.
gnp_per_capita = germany_gnp_measles['NY.GNP.PCAP.CD'].astype(float).values
measles_rate = germany_gnp_measles['SH.IMM.MEAS'].astype(float).values
print(pg.corr(gnp_per_capita, measles_rate))

# Annotate with the statistic computed from the data instead of a fixed string.
r_value, p_value = pearsonr(gnp_per_capita, measles_rate)
p_text = 'p < .001' if p_value < 0.001 else f'p = {p_value:.3f}'

# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
g = sns.JointGrid(x=gnp_per_capita, y=measles_rate)
g = g.plot_joint(sns.regplot, color="xkcd:muted blue")
g = g.plot_marginals(sns.distplot, kde=False, bins=12, color="xkcd:bluey grey")
# Axes-fraction coordinates keep the label inside the plot for any data range.
g.ax_joint.text(0.05, 0.95, f'r = {r_value:.2f}, {p_text}', fontstyle='italic',
                transform=g.ax_joint.transAxes, verticalalignment='top')
plt.tight_layout()

plt.savefig("gnp_germany.png")

In [None]:

# Calculating r value and plotting graphs for GNP per capita vs Measles Immunization (China)

sns.set(style='white', font_scale=1.2)

# This cell runs before the cells that define decade_health_china_T_codes, so on a
# fresh kernel that name does not exist yet. Build the two needed indicators here from
# countries_sorted_df, using the same dropna/transpose steps as the USA table.
china_T = countries_sorted_df.get_group('China').dropna().iloc[:, 2:].T
china_T.columns = china_T.iloc[1, :]                # second row holds the indicator codes
china_gnp_measles = china_T.iloc[2:, :][['NY.GNP.PCAP.CD', 'SH.IMM.MEAS']]

# Convert to float arrays: the transposed table stores its values as generic objects.
gnp_per_capita = china_gnp_measles['NY.GNP.PCAP.CD'].astype(float).values
measles_rate = china_gnp_measles['SH.IMM.MEAS'].astype(float).values
print(pg.corr(gnp_per_capita, measles_rate))

# Annotate with the statistic computed from the data instead of a fixed string.
r_value, p_value = pearsonr(gnp_per_capita, measles_rate)
p_text = 'p < .001' if p_value < 0.001 else f'p = {p_value:.3f}'

# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
g = sns.JointGrid(x=gnp_per_capita, y=measles_rate)
g = g.plot_joint(sns.regplot, color="xkcd:muted blue")
g = g.plot_marginals(sns.distplot, kde=False, bins=12, color="xkcd:bluey grey")
# Axes-fraction coordinates keep the label inside the plot for any data range.
g.ax_joint.text(0.05, 0.95, f'r = {r_value:.2f}, {p_text}', fontstyle='italic',
                transform=g.ax_joint.transAxes, verticalalignment='top')
plt.tight_layout()

plt.savefig("gnp_china.png")

In [None]:

# Calculating r value and plotting graphs for GNP per capita vs Measles Immunization (Brazil)

sns.set(style='white', font_scale=1.2)

# This cell runs before the cells that define decade_health_brazil_T_codes, so on a
# fresh kernel that name does not exist yet. Build the two needed indicators here from
# countries_sorted_df, using the same dropna/transpose steps as the USA table.
brazil_T = countries_sorted_df.get_group('Brazil').dropna().iloc[:, 2:].T
brazil_T.columns = brazil_T.iloc[1, :]              # second row holds the indicator codes
brazil_gnp_measles = brazil_T.iloc[2:, :][['NY.GNP.PCAP.CD', 'SH.IMM.MEAS']]

# Convert to float arrays: the transposed table stores its values as generic objects.
gnp_per_capita = brazil_gnp_measles['NY.GNP.PCAP.CD'].astype(float).values
measles_rate = brazil_gnp_measles['SH.IMM.MEAS'].astype(float).values
print(pg.corr(gnp_per_capita, measles_rate))

# Annotate with the statistic computed from the data instead of a fixed string.
r_value, p_value = pearsonr(gnp_per_capita, measles_rate)
p_text = 'p < .001' if p_value < 0.001 else f'p = {p_value:.3f}'

# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
g = sns.JointGrid(x=gnp_per_capita, y=measles_rate)
g = g.plot_joint(sns.regplot, color="xkcd:muted blue")
g = g.plot_marginals(sns.distplot, kde=False, bins=12, color="xkcd:bluey grey")
# Axes-fraction coordinates keep the label inside the plot for any data range.
g.ax_joint.text(0.05, 0.95, f'r = {r_value:.2f}, {p_text}', fontstyle='italic',
                transform=g.ax_joint.transAxes, verticalalignment='top')
plt.tight_layout()

plt.savefig("gnp_brazil.png")

In [None]:


# Plotting Hepatitis B and Measles immunization rates over time (USA),
# printing a linear-regression summary for each series.

fig, ax1 = plt.subplots(1, sharex=True)
fig.suptitle("Immunization Rates from 2005-2015 (USA)", fontsize=16, fontweight="bold")

# Read the years from the table's own index rather than a hard-coded list, so the
# x values always match the number of rows actually present in the data.
years = [int(label) for label in decade_health_USA_T_codes.index]
ax1.set_xlim(min(years) - .5, max(years) + .5)

# (indicator code, legend label, extra line-style keywords)
series_to_plot = [
    ('SH.IMM.HEPB', "Hepatitis B", {}),
    ('SH.IMM.MEAS', "Measles", {"color": "r"}),
]

for code, label, line_style in series_to_plot:
    rates = decade_health_USA_T_codes[code].tolist()

    print(years)
    print(rates)

    # Least-squares trend of the rate against the year; `values` are the fitted points.
    slope, intercept, r_value, p_value, std_err = stats.linregress(years, rates)
    values = [intercept + slope * year for year in years]
    print('r-squared:', r_value**2)

    print(values)

    ax1.plot(years, rates, linewidth=1, marker="o", label=label, **line_style)

# Both vaccines share the axis, so the label must not name measles only.
ax1.set_ylabel("Immunization Rate (%)")
ax1.set_xlabel("Year")
ax1.legend(loc="best")

In [None]:
# Pull Germany's rows out of the per-country grouping
germany_df = countries_sorted_df.get_group(name='Germany')
germany_df

In [None]:
# Keep only indicators that have a value in every column (drop rows containing any NaN)
dropped_germany_df = germany_df.dropna(axis=0, how='any')
dropped_germany_df

In [None]:
# Reshape Germany's table to one row per year and one column per indicator code.
# After dropping the two country columns and transposing, the first two rows are
# 'Indicator Name' and 'Indicator Code'; the codes become the column labels.
germany_transposed = dropped_germany_df.iloc[:, 2:].T
germany_transposed.columns = germany_transposed.iloc[1, :]
decade_health_germany_T = germany_transposed.iloc[2:, :]
decade_health_germany_T

In [None]:
# Select the indicators of interest for Germany (same set as used for the USA)
indicators_of_interest = ['SH.IMM.HEPB', 'SH.IMM.MEAS', 'SH.STA.ACSN', 'SP.DYN.IMRT.IN',
                          'SH.H2O.SAFE.ZS', 'SP.POP.GROW', 'SP.POP.TOTL', 'NY.GNP.PCAP.CD']
decade_health_germany_T_codes = decade_health_germany_T[indicators_of_interest]
decade_health_germany_T_codes

In [None]:
# Plotting Immunization vs Infant Mortality (Germany),
# printing a linear-regression summary for each series.

fig, ax1 = plt.subplots(1, sharex=True)
fig.suptitle('Measles Immunization Rate Vs. \n Infant Mortality Rate from 2005-2015 (Germany)', fontsize=14, fontweight="bold")

# Read the years from the table's own index rather than a hard-coded list, so the
# x values always match the number of rows actually present in the data.
years = [int(label) for label in decade_health_germany_T_codes.index]
ax1.set_xlim(min(years) - .5, max(years) + .5)

# Infant mortality is drawn at 10x its value in this figure (unlike the other
# country plots). The factor is kept, but it is now named and stated in the legend so
# the chart no longer presents scaled numbers as plain rates.
# NOTE(review): presumably the scaling is for visibility next to the ~100% immunization line - confirm.
IMR_DISPLAY_SCALE = 10
scaled_infant_mortality = [value * IMR_DISPLAY_SCALE
                           for value in decade_health_germany_T_codes['SP.DYN.IMRT.IN'].tolist()]
measles_rate = decade_health_germany_T_codes['SH.IMM.MEAS'].tolist()

# (legend label, y values, extra line-style keywords)
series_to_plot = [
    ("Infant mortality x10 (per 1,000 live births)", scaled_infant_mortality, {}),
    ("Measles immunization (% of children 12-23 months)", measles_rate, {"color": "r"}),
]

for label, rates, line_style in series_to_plot:
    print(years)
    print(rates)

    # Least-squares trend of the series against the year; `values` are the fitted points.
    slope, intercept, r_value, p_value, std_err = stats.linregress(years, rates)
    values = [intercept + slope * year for year in years]
    print('r-squared:', r_value**2)

    print(values)

    ax1.plot(years, rates, linewidth=1, marker="o", label=label, **line_style)

# Units differ per series and are given in the legend.
ax1.set_ylabel("Rate")
ax1.set_xlabel("Year")
ax1.legend(loc="best")

plt.savefig("measles_germany.png")

In [None]:

# Correlation between infant mortality (x) and measles immunization (y) for Germany
x = list(decade_health_germany_T_codes[['SP.DYN.IMRT.IN']].values[:, 0])
y = list(decade_health_germany_T_codes[['SH.IMM.MEAS']].values[:, 0])
germany_corr = pg.corr(x, y)
print(germany_corr)

In [None]:
# Pull China's rows out of the per-country grouping
china_df = countries_sorted_df.get_group(name='China')
china_df

In [None]:
# Keep only indicators that have a value in every column (drop rows containing any NaN)
dropped_china_df = china_df.dropna(axis=0, how='any')
dropped_china_df

In [None]:
# Reshape China's table to one row per year and one column per indicator code.
# After dropping the two country columns and transposing, the first two rows are
# 'Indicator Name' and 'Indicator Code'; the codes become the column labels.
china_transposed = dropped_china_df.iloc[:, 2:].T
china_transposed.columns = china_transposed.iloc[1, :]
decade_health_china_T = china_transposed.iloc[2:, :]
decade_health_china_T

In [None]:
# Select the indicators of interest for China (same set as used for the USA)
indicators_of_interest = ['SH.IMM.HEPB', 'SH.IMM.MEAS', 'SH.STA.ACSN', 'SP.DYN.IMRT.IN',
                          'SH.H2O.SAFE.ZS', 'SP.POP.GROW', 'SP.POP.TOTL', 'NY.GNP.PCAP.CD']
decade_health_china_T_codes = decade_health_china_T[indicators_of_interest]
decade_health_china_T_codes

In [None]:
# Preview the column at position 1 of China's selected indicators
# (SH.IMM.MEAS, given the selection order used for decade_health_china_T_codes)
decade_health_china_T_codes.iloc[:, 1]

In [None]:

# Plotting Immunization vs Infant Mortality (China),
# printing a linear-regression summary for each series.

fig, ax1 = plt.subplots(1, sharex=True)
fig.suptitle('Measles Immunization Rate Vs. \n Infant Mortality Rate from 2005-2015 (China)', fontsize=14, fontweight="bold")

# Read the years from the table's own index rather than a hard-coded list, so the
# x values always match the number of rows actually present in the data.
years = [int(label) for label in decade_health_china_T_codes.index]
ax1.set_xlim(min(years) - .5, max(years) + .5)

# (indicator code, legend label, extra line-style keywords)
series_to_plot = [
    ('SP.DYN.IMRT.IN', "Infant mortality (per 1,000 live births)", {}),
    ('SH.IMM.MEAS', "Measles immunization (% of children 12-23 months)", {"color": "r"}),
]

for code, label, line_style in series_to_plot:
    rates = decade_health_china_T_codes[code].tolist()

    print(years)
    print(rates)

    # Least-squares trend of the series against the year; `values` are the fitted points.
    slope, intercept, r_value, p_value, std_err = stats.linregress(years, rates)
    values = [intercept + slope * year for year in years]
    print('r-squared:', r_value**2)

    print(values)

    ax1.plot(years, rates, linewidth=1, marker="o", label=label, **line_style)

# The two series use different units, which are given in the legend.
ax1.set_ylabel("Rate")
ax1.set_xlabel("Year")
ax1.legend(loc="best")

plt.savefig("measles_china.png")

In [None]:

# Correlation between infant mortality (x) and measles immunization (y) for China
x = list(decade_health_china_T_codes[['SP.DYN.IMRT.IN']].values[:, 0])
y = list(decade_health_china_T_codes[['SH.IMM.MEAS']].values[:, 0])
china_corr = pg.corr(x, y)
print(china_corr)

In [None]:
# Pull Brazil's rows out of the per-country grouping
brazil_df = countries_sorted_df.get_group(name='Brazil')
brazil_df

In [None]:
# Keep only indicators that have a value in every column (drop rows containing any NaN)
dropped_brazil_df = brazil_df.dropna(axis=0, how='any')
dropped_brazil_df

In [None]:
# Reshape Brazil's table to one row per year and one column per indicator code.
# After dropping the two country columns and transposing, the first two rows are
# 'Indicator Name' and 'Indicator Code'; the codes become the column labels.
brazil_transposed = dropped_brazil_df.iloc[:, 2:].T
brazil_transposed.columns = brazil_transposed.iloc[1, :]
decade_health_brazil_T = brazil_transposed.iloc[2:, :]
decade_health_brazil_T

In [None]:
# Select the indicators of interest for Brazil (same set as used for the USA)
indicators_of_interest = ['SH.IMM.HEPB', 'SH.IMM.MEAS', 'SH.STA.ACSN', 'SP.DYN.IMRT.IN',
                          'SH.H2O.SAFE.ZS', 'SP.POP.GROW', 'SP.POP.TOTL', 'NY.GNP.PCAP.CD']
decade_health_brazil_T_codes = decade_health_brazil_T[indicators_of_interest]
decade_health_brazil_T_codes

In [None]:

# Plotting Immunization vs Infant Mortality (Brazil),
# printing a linear-regression summary for each series.

fig, ax1 = plt.subplots(1, sharex=True)
fig.suptitle('Measles Immunization Rate Vs. \n Infant Mortality Rate from 2005-2015 (Brazil)', fontsize=14, fontweight="bold")

# Read the years from the table's own index rather than a hard-coded list, so the
# x values always match the number of rows actually present in the data.
years = [int(label) for label in decade_health_brazil_T_codes.index]
ax1.set_xlim(min(years) - .5, max(years) + .5)

# (indicator code, legend label, extra line-style keywords)
series_to_plot = [
    ('SP.DYN.IMRT.IN', "Infant mortality (per 1,000 live births)", {}),
    ('SH.IMM.MEAS', "Measles immunization (% of children 12-23 months)", {"color": "r"}),
]

for code, label, line_style in series_to_plot:
    rates = decade_health_brazil_T_codes[code].tolist()

    print(years)
    print(rates)

    # Least-squares trend of the series against the year; `values` are the fitted points.
    slope, intercept, r_value, p_value, std_err = stats.linregress(years, rates)
    values = [intercept + slope * year for year in years]
    print('r-squared:', r_value**2)

    print(values)

    ax1.plot(years, rates, linewidth=1, marker="o", label=label, **line_style)

# The two series use different units, which are given in the legend.
ax1.set_ylabel("Rate")
ax1.set_xlabel("Year")
ax1.legend(loc="best")

plt.savefig("measles_brazil.png")

In [None]:

# Correlation between infant mortality (x) and measles immunization (y) for Brazil
x = list(decade_health_brazil_T_codes[['SP.DYN.IMRT.IN']].values[:, 0])
y = list(decade_health_brazil_T_codes[['SH.IMM.MEAS']].values[:, 0])
brazil_corr = pg.corr(x, y)
print(brazil_corr)