Importing datasets into pandas

In [220]:
import matplotlib as plt
import kaggle
import zipfile
import pandas as pd
import numpy as np
import os

In [221]:
# Fetch the US pollution dataset from Kaggle only when the CSV is not already
# present in the working directory, then load it into a DataFrame.
us_pollution_filename = 'pollution_us_2000_2016.csv'
if not os.path.isfile(us_pollution_filename):
    kaggle.api.dataset_download_file('sogun3/uspollution', us_pollution_filename)
    # The download is expected to land as '<csv name>.zip' next to the notebook.
    # The context manager closes the archive even if extraction raises.
    with zipfile.ZipFile(us_pollution_filename + '.zip', 'r') as zip_ref:
        zip_ref.extractall()
df_us_pollution = pd.read_csv(us_pollution_filename)

In [222]:
# The respiratory-disease workbook is not downloaded here; it must already
# exist locally under this file name.
resp_data_path = 'IHME_USA_COUNTY_RESP_DISEASE_MORTALITY_1980_2014_NATIONAL_Y2017M09D26.XLSX'

# Sheets to load. The strings are used verbatim as sheet names, so the
# trailing spaces on some entries are kept as they are.
list_sheet_names = [
    'Chronic respiratory diseases',
    'Chronic obstructive pulmonary ',
    'Pneumoconiosis',
    'Silicosis',
    'Asbestosis',
    'Coal workers pneumoconiosis',
    'Other pneumoconiosis',
    'Asthma',
    'Interstitial lung disease',
    'Other chronic respiratory ',
]

# With a list of sheet names, read_excel returns a dict keyed by sheet name;
# the first row of every sheet is skipped.
df_resp_disease = pd.read_excel(
    resp_data_path,
    sheet_name=list_sheet_names,
    skiprows=1,
)


In [223]:
# Mortality columns for years before 2000 are discarded.
PRE_2000_COLUMNS = ['Mortality Rate, 1980*', 'Mortality Rate, 1985*',
                    'Mortality Rate, 1990*', 'Mortality Rate, 1995*']
# Years for which a numeric 'mortality_<year>' column is derived.
MORTALITY_YEARS = (2000, 2005, 2010, 2014)

for sheet_name in list_sheet_names:
    # Build a cleaned copy instead of mutating the loaded frame in place, so
    # re-running this cell always starts from whatever is stored in the dict.
    sheet = (
        df_resp_disease[sheet_name]
        .drop(columns=PRE_2000_COLUMNS, errors='ignore')
        .dropna()
    )
    # Dropping county data: only rows with FIPS <= 57 are kept (the state
    # mortality rate is the average of all its counties). No NaN can remain in
    # FIPS after dropna(), so this equals dropping every row with FIPS > 57.
    sheet = sheet[sheet['FIPS'] <= 57].copy()

    # Each raw cell looks like "5.21 (3.71, 5.75)": keep the leading figure
    # and convert it to a number so the new columns can be used in arithmetic
    # rather than being left as strings. A non-numeric leading token raises.
    for year in MORTALITY_YEARS:
        raw_rate = sheet['Mortality Rate, {}*'.format(year)]
        leading_value = raw_rate.str.split(' ').str[0].str.strip()
        sheet['mortality_{}'.format(year)] = pd.to_numeric(leading_value)

    df_resp_disease[sheet_name] = sheet

In [224]:
# The pollution data does not cover every county, so it is matched with the
# mortality data at state level. The measurement year is extracted from
# 'Date Local' and bucketed so it can be lined up with the mortality years.
df_us_pollution['year'] = pd.DatetimeIndex(df_us_pollution['Date Local']).year
series_year_data = df_us_pollution['year']

# One boolean mask per bucket.
in_2000_bin = (series_year_data >= 2000) & (series_year_data < 2005)
in_2005_bin = (series_year_data >= 2005) & (series_year_data < 2010)
in_2010_bin = series_year_data >= 2010

# Years before 2000 fall through to None.
# NOTE(review): mortality data also has a 2014 column, but every year >= 2010
# is mapped to 2010 here - confirm whether a 2014 bucket is intended.
# NOTE(review): mixing integers with None presumably makes this an object
# column - verify before using it as a numeric key.
df_us_pollution['year_bin'] = np.where(
    in_2000_bin, 2000,
    np.where(in_2005_bin, 2005,
             np.where(in_2010_bin, 2010, None)))
# Disabled: removal of location/date columns that are not needed after binning.
# df_us_pollution.drop(['State Code', 'County Code', 'Site Num', 'Address', 'County', 'City', 'Date Local'], axis=1,inplace=True)

In [200]:
# Display the 'Interstitial lung disease' sheet (entry 8 of list_sheet_names).
df_resp_disease[list_sheet_names[8]]

Unnamed: 0,level_0,index,Location,FIPS,"Mortality Rate, 2000*","Mortality Rate, 2005*","Mortality Rate, 2010*","Mortality Rate, 2014*","% Change in Mortality Rate, 1980-2014",mortality_2000,mortality_2005,mortality_2010,mortality_2014
0,0,1,Alabama,1.0,"5.21 (3.71, 5.75)","5.78 (3.88, 6.44)","6.03 (3.95, 6.74)","6.25 (4.11, 7.08)","116.26 (16.30, 177.83)",5.21,5.78,6.03,6.25
1,1,69,Alaska,2.0,"5.32 (3.91, 6.27)","5.79 (3.92, 6.84)","5.94 (3.84, 7.15)","6.16 (3.99, 7.52)","77.34 (-11.89, 180.94)",5.32,5.79,5.94,6.16
2,2,99,Arizona,4.0,"4.38 (3.17, 4.92)","4.81 (3.18, 5.40)","4.78 (3.01, 5.39)","4.91 (3.15, 5.61)","65.79 (-14.65, 121.16)",4.38,4.81,4.78,4.91
3,3,115,Arkansas,5.0,"4.57 (3.26, 5.06)","5.01 (3.32, 5.59)","5.10 (3.28, 5.73)","5.23 (3.43, 5.96)","89.77 (-0.88, 148.80)",4.57,5.01,5.1,5.23
4,4,191,California,6.0,"4.61 (3.32, 5.09)","4.94 (3.27, 5.48)","4.92 (3.18, 5.48)","4.93 (3.20, 5.55)","65.83 (-12.55, 113.31)",4.61,4.94,4.92,4.93
5,5,250,Colorado,8.0,"4.69 (3.32, 5.23)","5.17 (3.41, 5.83)","5.32 (3.39, 6.04)","5.41 (3.47, 6.22)","95.87 (-0.76, 161.93)",4.69,5.17,5.32,5.41
6,6,315,Connecticut,9.0,"4.16 (3.00, 4.62)","4.73 (3.18, 5.26)","5.00 (3.25, 5.61)","5.36 (3.50, 6.11)","145.76 (30.66, 223.43)",4.16,4.73,5.0,5.36
7,7,324,Delaware,10.0,"4.63 (3.30, 5.20)","5.32 (3.53, 6.03)","5.60 (3.60, 6.42)","5.94 (3.87, 6.91)","148.69 (34.00, 231.00)",4.63,5.32,5.6,5.94
8,8,328,District of Columbia,11.0,"5.43 (3.90, 6.16)","5.45 (3.77, 6.19)","5.23 (3.52, 6.03)","5.14 (3.55, 5.98)","30.60 (-23.10, 71.26)",5.43,5.45,5.23,5.14
9,9,330,Florida,12.0,"4.17 (3.01, 4.61)","4.53 (3.03, 5.01)","4.63 (3.00, 5.16)","4.71 (3.08, 5.29)","74.44 (-8.50, 123.83)",4.17,4.53,4.63,4.71


In [215]:
# Join each pollution reading with the 'Interstitial lung disease' row whose
# 'Location' equals the reading's 'State' (default inner join).
resp_sheet = df_resp_disease[list_sheet_names[8]]
pd.merge(df_us_pollution, resp_sheet, left_on='State', right_on='Location')

Unnamed: 0.1,Unnamed: 0,State,NO2 Units,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Units,O3 Mean,O3 1st Max Value,...,FIPS,"Mortality Rate, 2000*","Mortality Rate, 2005*","Mortality Rate, 2010*","Mortality Rate, 2014*","% Change in Mortality Rate, 1980-2014",mortality_2000,mortality_2005,mortality_2010,mortality_2014
0,0,Arizona,Parts per billion,19.041667,49.0,19,46,Parts per million,0.022500,0.040,...,4.0,"4.38 (3.17, 4.92)","4.81 (3.18, 5.40)","4.78 (3.01, 5.39)","4.91 (3.15, 5.61)","65.79 (-14.65, 121.16)",4.38,4.81,4.78,4.91
1,1,Arizona,Parts per billion,19.041667,49.0,19,46,Parts per million,0.022500,0.040,...,4.0,"4.38 (3.17, 4.92)","4.81 (3.18, 5.40)","4.78 (3.01, 5.39)","4.91 (3.15, 5.61)","65.79 (-14.65, 121.16)",4.38,4.81,4.78,4.91
2,2,Arizona,Parts per billion,19.041667,49.0,19,46,Parts per million,0.022500,0.040,...,4.0,"4.38 (3.17, 4.92)","4.81 (3.18, 5.40)","4.78 (3.01, 5.39)","4.91 (3.15, 5.61)","65.79 (-14.65, 121.16)",4.38,4.81,4.78,4.91
3,3,Arizona,Parts per billion,19.041667,49.0,19,46,Parts per million,0.022500,0.040,...,4.0,"4.38 (3.17, 4.92)","4.81 (3.18, 5.40)","4.78 (3.01, 5.39)","4.91 (3.15, 5.61)","65.79 (-14.65, 121.16)",4.38,4.81,4.78,4.91
4,4,Arizona,Parts per billion,22.958333,36.0,19,34,Parts per million,0.013375,0.032,...,4.0,"4.38 (3.17, 4.92)","4.81 (3.18, 5.40)","4.78 (3.01, 5.39)","4.91 (3.15, 5.61)","65.79 (-14.65, 121.16)",4.38,4.81,4.78,4.91
5,5,Arizona,Parts per billion,22.958333,36.0,19,34,Parts per million,0.013375,0.032,...,4.0,"4.38 (3.17, 4.92)","4.81 (3.18, 5.40)","4.78 (3.01, 5.39)","4.91 (3.15, 5.61)","65.79 (-14.65, 121.16)",4.38,4.81,4.78,4.91
6,6,Arizona,Parts per billion,22.958333,36.0,19,34,Parts per million,0.013375,0.032,...,4.0,"4.38 (3.17, 4.92)","4.81 (3.18, 5.40)","4.78 (3.01, 5.39)","4.91 (3.15, 5.61)","65.79 (-14.65, 121.16)",4.38,4.81,4.78,4.91
7,7,Arizona,Parts per billion,22.958333,36.0,19,34,Parts per million,0.013375,0.032,...,4.0,"4.38 (3.17, 4.92)","4.81 (3.18, 5.40)","4.78 (3.01, 5.39)","4.91 (3.15, 5.61)","65.79 (-14.65, 121.16)",4.38,4.81,4.78,4.91
8,8,Arizona,Parts per billion,38.125000,51.0,8,48,Parts per million,0.007958,0.016,...,4.0,"4.38 (3.17, 4.92)","4.81 (3.18, 5.40)","4.78 (3.01, 5.39)","4.91 (3.15, 5.61)","65.79 (-14.65, 121.16)",4.38,4.81,4.78,4.91
9,9,Arizona,Parts per billion,38.125000,51.0,8,48,Parts per million,0.007958,0.016,...,4.0,"4.38 (3.17, 4.92)","4.81 (3.18, 5.40)","4.78 (3.01, 5.39)","4.91 (3.15, 5.61)","65.79 (-14.65, 121.16)",4.38,4.81,4.78,4.91


In [218]:
# Display the pollution frame, including the derived 'year' and 'year_bin' columns.
df_us_pollution

Unnamed: 0.1,Unnamed: 0,State,NO2 Units,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Units,O3 Mean,O3 1st Max Value,...,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI,year,year_bin
0,0,Arizona,Parts per billion,19.041667,49.0,19,46,Parts per million,0.022500,0.040,...,9.0,21,13.0,Parts per million,1.145833,4.200,21,,2000,2000
1,1,Arizona,Parts per billion,19.041667,49.0,19,46,Parts per million,0.022500,0.040,...,9.0,21,13.0,Parts per million,0.878947,2.200,23,25.0,2000,2000
2,2,Arizona,Parts per billion,19.041667,49.0,19,46,Parts per million,0.022500,0.040,...,6.6,23,,Parts per million,1.145833,4.200,21,,2000,2000
3,3,Arizona,Parts per billion,19.041667,49.0,19,46,Parts per million,0.022500,0.040,...,6.6,23,,Parts per million,0.878947,2.200,23,25.0,2000,2000
4,4,Arizona,Parts per billion,22.958333,36.0,19,34,Parts per million,0.013375,0.032,...,3.0,22,4.0,Parts per million,0.850000,1.600,23,,2000,2000
5,5,Arizona,Parts per billion,22.958333,36.0,19,34,Parts per million,0.013375,0.032,...,3.0,22,4.0,Parts per million,1.066667,2.300,0,26.0,2000,2000
6,6,Arizona,Parts per billion,22.958333,36.0,19,34,Parts per million,0.013375,0.032,...,2.6,23,,Parts per million,0.850000,1.600,23,,2000,2000
7,7,Arizona,Parts per billion,22.958333,36.0,19,34,Parts per million,0.013375,0.032,...,2.6,23,,Parts per million,1.066667,2.300,0,26.0,2000,2000
8,8,Arizona,Parts per billion,38.125000,51.0,8,48,Parts per million,0.007958,0.016,...,11.0,19,16.0,Parts per million,1.929167,4.400,8,,2000,2000
9,9,Arizona,Parts per billion,38.125000,51.0,8,48,Parts per million,0.007958,0.016,...,11.0,19,16.0,Parts per million,1.762500,2.500,8,28.0,2000,2000
