In [156]:
%matplotlib notebook

import os
import csv
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sn
from scipy.stats import sem
from scipy.stats import linregress
import scipy.stats as st


# This notebook is intended for the case in which we decide to analyze correlations to life expectancy

# ESTABLISH PATHS

# Probability (%) of dying between age 30 and exact age 70 from cardiovascular disease, cancer, diabetes, or chronic respiratory disease
cancer_etc_path = "./data/good_data/30-70cancerChdEtc.csv"
# could also analyze healthy life expectancy at birth (?) instead of cancer etc.
hale_birth_path = "./data/good_data/HALElifeExpectancyAtBirth.csv"

# Primary reliance on clean fuels and technologies
cleantech_path = "./data/good_data/cleanFuelAndTech.csv"

# Alcohol per capita (15+)
alcohol_path = "./data/good_data/alcoholSubstanceAbuse.csv"

# Medical doctors (per 10,000 people)
doctors_path = "./data/good_data/medicalDoctors.csv"

# Age-standardized prevalence of current tobacco smoking among persons aged 15 years and older
tobacco_path = "./data/good_data/tobaccoAge15.csv"


In [157]:
# CONVERT TO DATA FRAMES

# Healthy life expectancy (HALE) at birth
hale_birth_df = pd.read_csv(hale_birth_path, encoding = "utf-8")

# Probability (%) of dying between 30 and 70 of cancer, cardiovascular disease, diabetes or chronic respiratory disease
cancer_etc_df = pd.read_csv(cancer_etc_path, encoding = "utf-8")

# Proportion of population with primary reliance on clean fuels and technologies (%)
cleantech_df = pd.read_csv(cleantech_path, encoding = "utf-8")

# Alcohol per capita (15+)
alcohol_df = pd.read_csv(alcohol_path, encoding = "utf-8")

# Medical doctors (per 10,000 people)
doctors_df = pd.read_csv(doctors_path, encoding = "utf-8")

# Age-standardized prevalence of current tobacco smoking among persons aged 15 years and older (%)
tobacco_df = pd.read_csv(tobacco_path, encoding = "utf-8")


In [158]:
# Cancer vs. Tobacco

cancer_etc_df = cancer_etc_df.set_index(["Period"])
tobacco_df = tobacco_df.set_index(["Period"])

cancer_tobacco_df = pd.merge(cancer_etc_df, tobacco_df, on = ["Location", "Period", "Dim1"], how = "inner")
cancer_tobacco_df

# cancer_etc
# # tobacco_df

cancer_tobacco_df = cancer_tobacco_df.rename(columns = {
    'First Tooltip_x': 'Probability of Dying (%)',
    'First Tooltip_y': 'Tobacco Use (%)',
    'Dim1': 'Gender'
})
cancer_tobacco_df

# cancer_tobacco_df = cancer_tobacco_df.drop(columns = ["Indicator_x", "Indicator_y"])
# both_sexes = cancer_tobacco_df["Gender"] == "Both sexes"
# both_sexes

# male_female_df = cancer_tobacco_df[cancer_tobacco_df.Gender != 'Both sexes']
# male_female_df


Unnamed: 0_level_0,Location,Indicator_x,Gender,Probability of Dying (%),Indicator_y,Tobacco Use (%)
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016,Albania,Probability (%) of dying between age 30 and ex...,Both sexes,17.0,Age-standardized prevalence of current tobacco...,29.7
2016,Albania,Probability (%) of dying between age 30 and ex...,Male,20.7,Age-standardized prevalence of current tobacco...,51.1
2016,Albania,Probability (%) of dying between age 30 and ex...,Female,13.1,Age-standardized prevalence of current tobacco...,8.2
2015,Albania,Probability (%) of dying between age 30 and ex...,Both sexes,17.4,Age-standardized prevalence of current tobacco...,29.9
2015,Albania,Probability (%) of dying between age 30 and ex...,Male,20.9,Age-standardized prevalence of current tobacco...,51.4
...,...,...,...,...,...,...
2005,Zimbabwe,Probability (%) of dying between age 30 and ex...,Male,22.1,Age-standardized prevalence of current tobacco...,31.6
2005,Zimbabwe,Probability (%) of dying between age 30 and ex...,Female,22.9,Age-standardized prevalence of current tobacco...,2.3
2000,Zimbabwe,Probability (%) of dying between age 30 and ex...,Both sexes,21.6,Age-standardized prevalence of current tobacco...,18.3
2000,Zimbabwe,Probability (%) of dying between age 30 and ex...,Male,21.6,Age-standardized prevalence of current tobacco...,33.7


In [159]:
# Cancer vs. Tobacco

cancer_tobacco_df = pd.merge(cancer_etc_df, tobacco_df, on = ["Location", "Period", "Dim1"], how = "inner")
cancer_tobacco_df

cancer_tobacco_df = cancer_tobacco_df.rename(columns = {
    'First Tooltip_x': 'Probability of Dying (%)',
    'First Tooltip_y': 'Tobacco Use (%)',
    'Dim1': 'Gender'
})
cancer_tobacco_df

cancer_tobacco_df = cancer_tobacco_df.drop(columns = ["Indicator_x", "Indicator_y"])
both_sexes = cancer_tobacco_df["Gender"] == "Both sexes"
both_sexes

male_female_df = cancer_tobacco_df[cancer_tobacco_df.Gender != 'Both sexes']
male_female_df

Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),Tobacco Use (%)
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,Albania,Male,20.7,51.1
2016,Albania,Female,13.1,8.2
2015,Albania,Male,20.9,51.4
2015,Albania,Female,13.7,8.3
2010,Albania,Male,22.3,53.2
...,...,...,...,...
2010,Zimbabwe,Female,21.5,1.9
2005,Zimbabwe,Male,22.1,31.6
2005,Zimbabwe,Female,22.9,2.3
2000,Zimbabwe,Male,21.6,33.7


In [160]:
male_female_df["Gender"].value_counts()


Female    720
Male      720
Name: Gender, dtype: int64

In [161]:
male_female_df.sort_values(by=['Tobacco Use (%)'], inplace=True, ascending=False)
male_female_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  male_female_df.sort_values(by=['Tobacco Use (%)'], inplace=True, ascending=False)


Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),Tobacco Use (%)
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000,Timor-Leste,Male,28.2,91.0
2005,Timor-Leste,Male,25.6,85.7
2000,Lao People's Democratic Republic,Male,29.8,84.0
2000,Myanmar,Male,26.1,83.0
2000,Kiribati,Male,35.1,81.7
...,...,...,...,...
2015,Democratic People's Republic of Korea,Female,17.9,0.0
2010,Democratic People's Republic of Korea,Female,19.4,0.0
2005,Democratic People's Republic of Korea,Female,18.9,0.0
2000,Democratic People's Republic of Korea,Female,17.1,0.0


In [162]:
cancer_tobacco2010_df = male_female_df[male_female_df.index == 2010]
cancer_tobacco2010_df

Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),Tobacco Use (%)
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,Timor-Leste,Male,24.9,78.0
2010,Myanmar,Male,27.7,76.1
2010,Kiribati,Male,34.6,73.7
2010,Lao People's Democratic Republic,Male,29.7,71.0
2010,Indonesia,Male,30.7,68.7
...,...,...,...,...
2010,Ghana,Female,23.0,0.6
2010,Egypt,Female,24.1,0.6
2010,Eritrea,Female,24.0,0.5
2010,Azerbaijan,Female,18.9,0.2


In [164]:
# graph prevalence of tobacco vs. 30-70 cancer, etc deaths

x_values = cancer_tobacco2010_df["Tobacco Use (%)"]
y_values = cancer_tobacco2010_df["Probability of Dying (%)"]
title = "Tobacco Use & Probability of Death by Disease (2010)"
x_label = "Tobacco Use (%)"
y_label = "Probability of Dying (%)"

# def plot_linear_regression(x_values, y_values, title, x_label, y_label):
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values*slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))
plt.scatter(x_values, y_values, marker = "o", alpha = 0.5, linewidth = 1, edgecolor = "black")
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (6,5), fontsize = 12, color = "red")
plt.grid(True)
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.title(title)
plt.show()

# def correlation(x_values, y_values):
correlation = st.pearsonr(x_values, y_values)
print(f"The correlation coefficient is {round(correlation[0],2)}.")
    
# plot_linear_regression(cancer_tobacco2010_df["Tobacco Use (%)"], cancer_tobacco2010_df["Probability of Dying (%)"], "Tobacco Use & Probability of Death by Disease (2010)","Tobacco Use (%)", "Probability of Dying (%)" )
# correlation(cancer_tobacco2010_df["Tobacco Use (%)"], cancer_tobacco2010_df["Probability of Dying (%)"])

<IPython.core.display.Javascript object>

The correlation coefficient is 0.4.


In [165]:
# Cancer vs. Alcohol
alcohol_df = alcohol_df.set_index(['Period'])

cancer_alcohol_df = pd.merge(cancer_etc_df, alcohol_df, on = ["Location", "Period", "Dim1"], how = "inner")
cancer_alcohol_df

cancer_alcohol_df = cancer_alcohol_df.rename(columns = {
    'First Tooltip_x': 'Probability of Dying (%)',
    'First Tooltip_y': 'Alcohol per Capita',
    'Dim1': 'Gender'
})
cancer_alcohol_df

cancer_alcohol_df = cancer_alcohol_df.drop(columns = ["Indicator_x", "Indicator_y"])
both_sexes_alcohol = cancer_alcohol_df["Gender"] == "Both sexes"
both_sexes_alcohol

male_female_alcohol_df = cancer_alcohol_df[cancer_alcohol_df.Gender != 'Both sexes']
male_female_alcohol_df

Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),Alcohol per Capita
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015,Afghanistan,Male,31.9,0.350
2015,Afghanistan,Female,27.8,0.054
2010,Afghanistan,Male,34.1,0.360
2010,Afghanistan,Female,29.4,0.054
2015,Albania,Male,20.9,10.920
...,...,...,...,...
2010,Zimbabwe,Female,21.5,1.390
2005,Zimbabwe,Male,22.1,4.720
2005,Zimbabwe,Female,22.9,0.990
2000,Zimbabwe,Male,21.6,4.160


In [166]:
# sort by alcohol per capita
male_female_alcohol_df.sort_values(by=['Alcohol per Capita'], inplace=True, ascending=False)
male_female_alcohol_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  male_female_alcohol_df.sort_values(by=['Alcohol per Capita'], inplace=True, ascending=False)


Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),Alcohol per Capita
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005,Republic of Moldova,Male,40.0,30.7400
2015,Seychelles,Male,28.8,28.5700
2005,Russian Federation,Male,51.4,28.5200
2010,Belarus,Male,45.2,27.6800
2000,Romania,Male,34.3,27.1300
...,...,...,...,...
2005,Kuwait,Female,18.6,0.0040
2015,Somalia,Female,22.1,0.0040
2015,Kuwait,Female,14.9,0.0006
2000,Somalia,Male,24.0,0.0000


In [167]:
cancer_alcohol2010_df = male_female_alcohol_df[male_female_alcohol_df.index == 2010]
cancer_alcohol2010_df


Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),Alcohol per Capita
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,Belarus,Male,45.2,27.680
2010,Russian Federation,Male,43.1,26.540
2010,Lithuania,Male,34.7,24.590
2010,Republic of Moldova,Male,38.1,24.360
2010,Montenegro,Male,28.1,23.920
...,...,...,...,...
2010,Bangladesh,Female,21.9,0.044
2010,Saudi Arabia,Female,15.9,0.038
2010,Libya,Female,16.6,0.025
2010,Mauritania,Female,18.0,0.023


In [168]:
x_values = cancer_alcohol2010_df["Alcohol per Capita"]
y_values = cancer_alcohol2010_df["Probability of Dying (%)"]
title = "Alcohol Use & Probability of Death by Disease (2010)"
x_label = "Alcohol Use (per capita)"
y_label = "Probability of Dying (%)"

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values*slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))
plt.scatter(x_values, y_values, marker = "o", alpha = 0.5, linewidth = 1, edgecolor = "black")
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (6,5), fontsize = 12, color = "red")
plt.grid(True)
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.title(title)
plt.show()

correlation = st.pearsonr(x_values, y_values)
print(f"The correlation coefficient is {round(correlation[0],2)}.")

<IPython.core.display.Javascript object>

The correlation coefficient is 0.22.


In [169]:
doctors_df


Unnamed: 0,Location,Period,Indicator,First Tooltip
0,Afghanistan,2016,"Medical doctors (per 10,000)",2.78
1,Afghanistan,2015,"Medical doctors (per 10,000)",2.85
2,Afghanistan,2014,"Medical doctors (per 10,000)",2.98
3,Afghanistan,2013,"Medical doctors (per 10,000)",2.85
4,Afghanistan,2012,"Medical doctors (per 10,000)",2.41
...,...,...,...,...
2501,Zimbabwe,2005,"Medical doctors (per 10,000)",1.26
2502,Zimbabwe,2004,"Medical doctors (per 10,000)",1.74
2503,Zimbabwe,2000,"Medical doctors (per 10,000)",1.26
2504,Zimbabwe,1995,"Medical doctors (per 10,000)",1.43


In [170]:
# Cancer vs. Doctors

#doctors_df = doctors_df.set_index(["Period"])

cancer_doctors_df = pd.merge(cancer_etc_df, doctors_df, on = ["Location", "Period"], how = "inner")
cancer_doctors_df

cancer_doctors_df = cancer_doctors_df.rename(columns = {
    'First Tooltip_x': 'Probability of Dying (%)',
    'First Tooltip_y': 'Doctors (per 10,000)',
    'Dim1': 'Gender'
})
cancer_doctors_df

cancer_doctors_df = cancer_doctors_df.drop(columns = ["Indicator_x", "Indicator_y"])
both_sexes_doctors = cancer_doctors_df["Gender"] == "Both sexes"
both_sexes_doctors

male_female_doctors_df = cancer_doctors_df[cancer_doctors_df.Gender != 'Both sexes']


male_female_doctors_df = male_female_doctors_df.set_index(["Period"])
male_female_doctors_df


Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),"Doctors (per 10,000)"
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,Afghanistan,Male,31.8,2.78
2016,Afghanistan,Female,27.7,2.78
2015,Afghanistan,Male,31.9,2.85
2015,Afghanistan,Female,27.8,2.85
2010,Afghanistan,Male,34.1,2.37
...,...,...,...,...
2010,Zimbabwe,Female,21.5,1.27
2005,Zimbabwe,Male,22.1,1.26
2005,Zimbabwe,Female,22.9,1.26
2000,Zimbabwe,Male,21.6,1.26


In [171]:
# sort by doctors per 10,000 people
male_female_doctors_df.sort_values(by=['Doctors (per 10,000)'], inplace=True, ascending=False)
male_female_doctors_df

Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),"Doctors (per 10,000)"
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,Cuba,Male,19.0,79.54
2016,Cuba,Female,13.8,79.54
2015,Cuba,Female,14.0,77.69
2015,Cuba,Male,19.1,77.69
2010,Cuba,Female,15.1,68.15
...,...,...,...,...
2016,Malawi,Female,14.5,0.17
2016,United Republic of Tanzania,Male,18.5,0.14
2016,United Republic of Tanzania,Female,17.4,0.14
2016,Togo,Female,23.1,0.13


In [172]:
cancer_doctors2010_df = male_female_doctors_df[male_female_doctors_df.index == 2010]
cancer_doctors2010_df


Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),"Doctors (per 10,000)"
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,Cuba,Female,15.1,68.15
2010,Cuba,Male,20.0,68.15
2010,Austria,Male,15.8,48.02
2010,Austria,Female,8.7,48.02
2010,Georgia,Female,16.7,44.47
...,...,...,...,...
2010,Burkina Faso,Female,21.8,0.46
2010,Liberia,Male,17.9,0.23
2010,Liberia,Female,18.1,0.23
2010,Sierra Leone,Male,29.7,0.21


In [173]:
x_values = cancer_doctors2010_df["Doctors (per 10,000)"]
y_values = cancer_doctors2010_df["Probability of Dying (%)"]
title = "Alcohol Use & Probability of Death by Disease (2010)"
x_label = "Alcohol Use (per capita)"
y_label = "Probability of Dying (%)"

correlation = st.pearsonr(x_values, y_values)
print(f"The correlation coefficient is {round(correlation[0],2)}.")

The correlation coefficient is -0.24.


In [174]:
cancer_doctors2010_df1 = cancer_doctors2010_df.groupby(["Location"])
cancer_doctors2010_df1

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022E570543D0>