In [57]:
%matplotlib notebook

import os
import csv
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sn
from scipy.stats import sem

# This notebook is intended for the case in which we decide to analyze correlations to life expectancy

# ESTABLISH PATHS

# Probability (%) of dying between age 30 and exact age 70 from cardiovascular disease, cancer, diabetes, or chronic respiratory disease
cancer_etc_path = "./data/good_data/30-70cancerChdEtc.csv"
# could also analyze healthy life expectancy at birth (?) instead of cancer etc.
hale_birth_path = "./data/good_data/HALElifeExpectancyAtBirth.csv"

# Primary reliance on clean fuels and technologies
cleantech_path = "./data/good_data/cleanFuelAndTech.csv"

# Dentists (per 10,000 people)
dentists_path = "./data/good_data/dentists.csv"

# Medical doctors (per 10,000 people)
doctors_path = "./data/good_data/medicalDoctors.csv"

# Mortality rate due to unsafe wash (per 100,000 people)
unsafewash_path = "./data/good_data/mortalityRateUnsafeWash.csv"

# Married or in-union women of reproductive age who have their need for family planning satisfied with modern methods (%)
reproductiveneeds_path = "./data/good_data/reproductiveAgeWomen.csv"

# Age-standardized prevalence of current tobacco smoking among persons aged 15 years and older
tobacco_path = "./data/good_data/tobaccoAge15.csv"

# CONVERT TO DATA FRAMES

# Healthy life expectancy (HALE) at birth
hale_birth_df = pd.read_csv(hale_birth_path, encoding = "utf-8")
hale_birth_df

# Probability (%) of dying between 30 and 70 of cancer, cardiovascular disease, diabetes or chronic respiratory disease
cancer_etc_df = pd.read_csv(cancer_etc_path, encoding = "utf-8")
cancer_etc_df.head()

# Proportion of population with primary reliance on clean fuels and technologies (%)
cleantech_df = pd.read_csv(cleantech_path, encoding = "utf-8")
cleantech_df

# Dentists (per 10,000 people)
dentists_df = pd.read_csv(dentists_path, encoding = "utf-8")
dentists_df

# Medical doctors (per 10,000 people)
doctors_df = pd.read_csv(doctors_path, encoding = "utf-8")
doctors_df

# Mortality rate attributed to exposure to unsafe WASH services (per 100,000 population)
unsafewash_df = pd.read_csv(unsafewash_path, encoding = "utf-8")
unsafewash_df

# Married or in-union women of reproductive age with their family planning needs satisfied by modern methods (%)
reproductiveneeds_df = pd.read_csv(reproductiveneeds_path, encoding = "utf-8")
reproductiveneeds_df

# Population using safely managed sanitation services (%)
safelysanitation_df = pd.read_csv(safelysanitation_path, encoding = "utf-8")
safelysanitation_df

# Age-standardized prevalence of current tobacco smoking among persons aged 15 years and older (%)
tobacco_df = pd.read_csv(tobacco_path, encoding = "utf-8")
tobacco_df

cancer_etc_df = cancer_etc_df.set_index(["Period"])
tobacco_df = tobacco_df.set_index(["Period"])

hale_birth_df

cancer_tobacco_df = pd.merge(cancer_etc_df, tobacco_df, on = ["Location", "Period", "Dim1"], how = "inner")
cancer_tobacco_df

# cancer_etc
# tobacco_df

cancer_tobacco_df = cancer_tobacco_df.rename(columns = {
    'First Tooltip_x': 'Probability of Dying (%)',
    'First Tooltip_y': 'Tobacco Use (%)',
    'Dim1': 'Gender'
})
cancer_tobacco_df

cancer_tobacco_df = cancer_tobacco_df.drop(columns = ["Indicator_x", "Indicator_y"])
both_sexes = cancer_tobacco_df["Gender"] == "Both sexes"
both_sexes

male_female_df = cancer_tobacco_df[cancer_tobacco_df.Gender != 'Both sexes']
male_female_df



Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),Tobacco Use (%)
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,Albania,Male,20.7,51.1
2016,Albania,Female,13.1,8.2
2015,Albania,Male,20.9,51.4
2015,Albania,Female,13.7,8.3
2010,Albania,Male,22.3,53.2
...,...,...,...,...
2010,Zimbabwe,Female,21.5,1.9
2005,Zimbabwe,Male,22.1,31.6
2005,Zimbabwe,Female,22.9,2.3
2000,Zimbabwe,Male,21.6,33.7


In [52]:
male_female_df["Gender"].value_counts()


Male      720
Female    720
Name: Gender, dtype: int64

In [53]:
male_female_df.sort_values(by=['Probability of Dying (%)'], inplace=True, ascending=False)
male_female_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  male_female_df.sort_values(by=['Probability of Dying (%)'], inplace=True, ascending=False)


Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),Tobacco Use (%)
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005,Kazakhstan,Male,51.7,52.0
2000,Kazakhstan,Male,51.6,55.9
2005,Russian Federation,Male,51.4,50.7
2000,Russian Federation,Male,50.5,55.5
2005,Belarus,Male,49.2,56.2
...,...,...,...,...
2010,Japan,Female,6.3,12.2
2015,Japan,Female,5.8,11.1
2016,Japan,Female,5.7,10.9
2015,Republic of Korea,Female,5.1,6.1


In [58]:
cancer_tobacco2010_df = male_female_df[male_female_df.index == 2010]
cancer_tobacco2010_df

Unnamed: 0_level_0,Location,Gender,Probability of Dying (%),Tobacco Use (%)
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,Albania,Male,22.3,53.2
2010,Albania,Female,14.6,9.1
2010,Algeria,Male,16.4,38.3
2010,Algeria,Female,14.4,1.8
2010,Argentina,Male,22.8,37.2
...,...,...,...,...
2010,Yemen,Female,29.4,12.2
2010,Zambia,Male,19.8,27.8
2010,Zambia,Female,20.0,5.6
2010,Zimbabwe,Male,20.5,29.4


In [65]:
# graph prevalence of tobacco vs. 30-70 cancer, etc deaths
y_values = cancer_tobacco2010_df["Probability of Dying (%)"]
x_values = cancer_tobacco2010_df["Tobacco Use (%)"]

plt.scatter(x_values, y_values)
plt.title("Tobacco Use & Probability of Death by Disease (2010)")
plt.xlabel("Tobacco Use (%)")
plt.ylabel("Probability of Dying (%)")
plt.plot()

<IPython.core.display.Javascript object>

[]