In [60]:
import pandas as pd
from sqlalchemy import create_engine, inspect

In [2]:
# Read the "500 Cities: Local Data for Better Health" (2018 release) CSV
# and preview its first two rows.
disease_file = "Resources/500_Cities__Local_Data_for_Better_Health__2018_release.csv"
disease_df = pd.read_csv(disease_file)  # index_col defaults to None
disease_df.head(n=2)

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,UniqueID,Measure,Data_Value_Unit,...,High_Confidence_Limit,Data_Value_Footnote_Symbol,Data_Value_Footnote,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text
0,2016,US,United States,,US,BRFSS,Prevention,59,Current lack of health insurance among adults ...,%,...,14.9,,,308745538,,PREVENT,ACCESS2,,,Health Insurance
1,2016,US,United States,,US,BRFSS,Prevention,59,Current lack of health insurance among adults ...,%,...,11.8,,,308745538,,PREVENT,ACCESS2,,,Health Insurance


In [3]:
# List the column labels of the raw table.
disease_df.columns

Index(['Year', 'StateAbbr', 'StateDesc', 'CityName', 'GeographicLevel',
       'DataSource', 'Category', 'UniqueID', 'Measure', 'Data_Value_Unit',
       'DataValueTypeID', 'Data_Value_Type', 'Data_Value',
       'Low_Confidence_Limit', 'High_Confidence_Limit',
       'Data_Value_Footnote_Symbol', 'Data_Value_Footnote', 'PopulationCount',
       'GeoLocation', 'CategoryID', 'MeasureId', 'CityFIPS', 'TractFIPS',
       'Short_Question_Text'],
      dtype='object')

In [4]:
# Non-null cell count per column; a quick way to spot columns with gaps.
disease_df.count(axis="index")

Year                          810103
StateAbbr                     810103
StateDesc                     810103
CityName                      810047
GeographicLevel               810103
DataSource                    810103
Category                      810103
UniqueID                      810103
Measure                       810103
Data_Value_Unit               810103
DataValueTypeID               810103
Data_Value_Type               810103
Data_Value                    787311
Low_Confidence_Limit          787311
High_Confidence_Limit         787311
Data_Value_Footnote_Symbol     22794
Data_Value_Footnote            22794
PopulationCount               810103
GeoLocation                   810047
CategoryID                    810103
MeasureId                     810103
CityFIPS                      810047
TractFIPS                     782047
Short_Question_Text           810103
dtype: int64

## 1.1 Clean DataFrame: filter rows, select columns

In [5]:
# Keep only city-level rows that report the age-adjusted prevalence value type.
is_city_level = disease_df['GeographicLevel'].eq("City")
is_age_adjusted = disease_df['DataValueTypeID'].eq("AgeAdjPrv")
disease_df = disease_df.loc[is_city_level & is_age_adjusted, :]
disease_df.count()

Year                          14000
StateAbbr                     14000
StateDesc                     14000
CityName                      14000
GeographicLevel               14000
DataSource                    14000
Category                      14000
UniqueID                      14000
Measure                       14000
Data_Value_Unit               14000
DataValueTypeID               14000
Data_Value_Type               14000
Data_Value                    13953
Low_Confidence_Limit          13953
High_Confidence_Limit         13953
Data_Value_Footnote_Symbol       47
Data_Value_Footnote              47
PopulationCount               14000
GeoLocation                   14000
CategoryID                    14000
MeasureId                     14000
CityFIPS                      14000
TractFIPS                         0
Short_Question_Text           14000
dtype: int64

### 1.1.1 Asthma data

In [6]:
# Asthma subset: rows for the adult current-asthma measure in year 2016.
asthma_measure = "Current asthma among adults aged >=18 Years"
asthma_2016_df = disease_df.loc[
    disease_df['Measure'].eq(asthma_measure) & disease_df['Year'].eq(2016)
]

asthma_2016_df.head(2)

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,UniqueID,Measure,Data_Value_Unit,...,High_Confidence_Limit,Data_Value_Footnote_Symbol,Data_Value_Footnote,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text
659,2016,AL,Alabama,Birmingham,City,BRFSS,Health Outcomes,107000,Current asthma among adults aged >=18 Years,%,...,10.6,,,212237,"(33.5275663773, -86.7988174678)",HLTHOUT,CASTHMA,107000.0,,Current Asthma
3037,2016,AL,Alabama,Hoover,City,BRFSS,Health Outcomes,135896,Current asthma among adults aged >=18 Years,%,...,8.0,,,81619,"(33.3767602729, -86.8051937568)",HLTHOUT,CASTHMA,135896.0,,Current Asthma


In [7]:
# Keep the state/city identifiers, the data value with its confidence limits,
# and the population and location fields; .copy() detaches the result from
# the filtered parent frame.
asthma_columns = [
    'StateAbbr', 'StateDesc', 'CityName',
    'Data_Value', 'Low_Confidence_Limit', 'High_Confidence_Limit',
    'PopulationCount', 'GeoLocation',
]
asthma_2016_df = asthma_2016_df.loc[:, asthma_columns].copy()
asthma_2016_df.head()

Unnamed: 0,StateAbbr,StateDesc,CityName,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,PopulationCount,GeoLocation
659,AL,Alabama,Birmingham,10.5,10.4,10.6,212237,"(33.5275663773, -86.7988174678)"
3037,AL,Alabama,Hoover,7.9,7.7,8.0,81619,"(33.3767602729, -86.8051937568)"
4112,AL,Alabama,Huntsville,9.5,9.4,9.6,180105,"(34.6989692671, -86.6387042882)"
6031,AL,Alabama,Mobile,9.9,9.8,10.0,195111,"(30.6776248648, -88.1184482714)"
8160,AL,Alabama,Montgomery,10.3,10.2,10.4,205764,"(32.3472645333, -86.2677059552)"


In [8]:
# Discard the row labels inherited from the raw table and renumber from 0.
asthma_2016_df = asthma_2016_df.reset_index(drop=True)
asthma_2016_df.head()

Unnamed: 0,StateAbbr,StateDesc,CityName,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,PopulationCount,GeoLocation
0,AL,Alabama,Birmingham,10.5,10.4,10.6,212237,"(33.5275663773, -86.7988174678)"
1,AL,Alabama,Hoover,7.9,7.7,8.0,81619,"(33.3767602729, -86.8051937568)"
2,AL,Alabama,Huntsville,9.5,9.4,9.6,180105,"(34.6989692671, -86.6387042882)"
3,AL,Alabama,Mobile,9.9,9.8,10.0,195111,"(30.6776248648, -88.1184482714)"
4,AL,Alabama,Montgomery,10.3,10.2,10.4,205764,"(32.3472645333, -86.2677059552)"


### 1.1.2 COPD data

In [9]:
# COPD subset: rows for the adult chronic obstructive pulmonary disease
# measure in year 2016.
copd_measure = "Chronic obstructive pulmonary disease among adults aged >=18 Years"
COPD_2016_df = disease_df.loc[
    disease_df['Measure'].eq(copd_measure) & disease_df['Year'].eq(2016)
]

COPD_2016_df.head(2)

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,UniqueID,Measure,Data_Value_Unit,...,High_Confidence_Limit,Data_Value_Footnote_Symbol,Data_Value_Footnote,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text
1162,2016,AL,Alabama,Birmingham,City,BRFSS,Health Outcomes,107000,Chronic obstructive pulmonary disease among ad...,%,...,8.9,,,212237,"(33.5275663773, -86.7988174678)",HLTHOUT,COPD,107000.0,,COPD
3192,2016,AL,Alabama,Hoover,City,BRFSS,Health Outcomes,135896,Chronic obstructive pulmonary disease among ad...,%,...,5.3,,,81619,"(33.3767602729, -86.8051937568)",HLTHOUT,COPD,135896.0,,COPD


In [10]:
# Keep only the merge keys (state, city) and the data value with its
# confidence limits; .copy() detaches the result from the parent frame.
COPD_2016_df = COPD_2016_df.loc[:, [
    'StateAbbr', 'CityName',
    'Data_Value', 'Low_Confidence_Limit', 'High_Confidence_Limit',
]].copy()
COPD_2016_df.head()

Unnamed: 0,StateAbbr,CityName,Data_Value,Low_Confidence_Limit,High_Confidence_Limit
1162,AL,Birmingham,8.7,8.6,8.9
3192,AL,Hoover,5.1,4.9,5.3
4443,AL,Huntsville,7.4,7.3,7.6
6430,AL,Mobile,8.2,8.1,8.4
8477,AL,Montgomery,8.2,8.1,8.4


In [11]:
# Discard the row labels inherited from the raw table and renumber from 0.
COPD_2016_df = COPD_2016_df.reset_index(drop=True)
COPD_2016_df.head()

Unnamed: 0,StateAbbr,CityName,Data_Value,Low_Confidence_Limit,High_Confidence_Limit
0,AL,Birmingham,8.7,8.6,8.9
1,AL,Hoover,5.1,4.9,5.3
2,AL,Huntsville,7.4,7.3,7.6
3,AL,Mobile,8.2,8.1,8.4
4,AL,Montgomery,8.2,8.1,8.4


### 1.1.3 Stroke data

In [12]:
# Stroke subset: rows for the adult stroke measure in year 2016.
stroke_measure = "Stroke among adults aged >=18 Years"
stroke_2016_df = disease_df.loc[
    disease_df['Measure'].eq(stroke_measure) & disease_df['Year'].eq(2016)
]

stroke_2016_df.head(2)

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,UniqueID,Measure,Data_Value_Unit,...,High_Confidence_Limit,Data_Value_Footnote_Symbol,Data_Value_Footnote,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text
2447,2016,AL,Alabama,Birmingham,City,BRFSS,Health Outcomes,107000,Stroke among adults aged >=18 Years,%,...,5.1,,,212237,"(33.5275663773, -86.7988174678)",HLTHOUT,STROKE,107000.0,,Stroke
3653,2016,AL,Alabama,Hoover,City,BRFSS,Health Outcomes,135896,Stroke among adults aged >=18 Years,%,...,2.2,,,81619,"(33.3767602729, -86.8051937568)",HLTHOUT,STROKE,135896.0,,Stroke


In [13]:
# Keep only the merge keys (state, city) and the data value with its
# confidence limits; .copy() detaches the result from the parent frame.
stroke_2016_df = stroke_2016_df.loc[:, [
    'StateAbbr', 'CityName',
    'Data_Value', 'Low_Confidence_Limit', 'High_Confidence_Limit',
]].copy()
stroke_2016_df.head()

Unnamed: 0,StateAbbr,CityName,Data_Value,Low_Confidence_Limit,High_Confidence_Limit
2447,AL,Birmingham,5.0,4.9,5.1
3653,AL,Hoover,2.2,2.1,2.2
5419,AL,Huntsville,3.3,3.3,3.4
7621,AL,Mobile,4.2,4.2,4.3
9417,AL,Montgomery,4.2,4.2,4.3


In [14]:
# Discard the row labels inherited from the raw table and renumber from 0.
stroke_2016_df = stroke_2016_df.reset_index(drop=True)
stroke_2016_df.head()

Unnamed: 0,StateAbbr,CityName,Data_Value,Low_Confidence_Limit,High_Confidence_Limit
0,AL,Birmingham,5.0,4.9,5.1
1,AL,Hoover,2.2,2.1,2.2
2,AL,Huntsville,3.3,3.3,3.4
3,AL,Mobile,4.2,4.2,4.3
4,AL,Montgomery,4.2,4.2,4.3


### 1.1.4 Heart disease data

In [15]:
# Heart disease subset: rows for the adult coronary heart disease measure
# in year 2016.
heart_measure = "Coronary heart disease among adults aged >=18 Years"
heart_2016_df = disease_df.loc[
    disease_df['Measure'].eq(heart_measure) & disease_df['Year'].eq(2016)
]

heart_2016_df.head(2)

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,UniqueID,Measure,Data_Value_Unit,...,High_Confidence_Limit,Data_Value_Footnote_Symbol,Data_Value_Footnote,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text
760,2016,AL,Alabama,Birmingham,City,BRFSS,Health Outcomes,107000,Coronary heart disease among adults aged >=18 ...,%,...,7.5,,,212237,"(33.5275663773, -86.7988174678)",HLTHOUT,CHD,107000.0,,Coronary Heart Disease
3068,2016,AL,Alabama,Hoover,City,BRFSS,Health Outcomes,135896,Coronary heart disease among adults aged >=18 ...,%,...,5.2,,,81619,"(33.3767602729, -86.8051937568)",HLTHOUT,CHD,135896.0,,Coronary Heart Disease


In [16]:
# Keep only the merge keys (state, city) and the data value with its
# confidence limits; .copy() detaches the result from the parent frame.
heart_2016_df = heart_2016_df.loc[:, [
    'StateAbbr', 'CityName',
    'Data_Value', 'Low_Confidence_Limit', 'High_Confidence_Limit',
]].copy()
heart_2016_df.head()

Unnamed: 0,StateAbbr,CityName,Data_Value,Low_Confidence_Limit,High_Confidence_Limit
760,AL,Birmingham,7.5,7.4,7.5
3068,AL,Hoover,5.1,5.0,5.2
4179,AL,Huntsville,6.4,6.3,6.5
6111,AL,Mobile,7.2,7.1,7.3
8222,AL,Montgomery,6.9,6.8,7.0


## 1.2 Merge dataframes

In [17]:
# Join asthma and COPD rows on (state, city). Both frames carry identically
# named value/limit columns, so each side gets a disease suffix.
disease_2016_df1 = asthma_2016_df.merge(
    COPD_2016_df,
    how='inner',
    on=['StateAbbr', 'CityName'],
    suffixes=('_asthma', '_COPD'),
)
disease_2016_df1.head()

Unnamed: 0,StateAbbr,StateDesc,CityName,Data_Value_asthma,Low_Confidence_Limit_asthma,High_Confidence_Limit_asthma,PopulationCount,GeoLocation,Data_Value_COPD,Low_Confidence_Limit_COPD,High_Confidence_Limit_COPD
0,AL,Alabama,Birmingham,10.5,10.4,10.6,212237,"(33.5275663773, -86.7988174678)",8.7,8.6,8.9
1,AL,Alabama,Hoover,7.9,7.7,8.0,81619,"(33.3767602729, -86.8051937568)",5.1,4.9,5.3
2,AL,Alabama,Huntsville,9.5,9.4,9.6,180105,"(34.6989692671, -86.6387042882)",7.4,7.3,7.6
3,AL,Alabama,Mobile,9.9,9.8,10.0,195111,"(30.6776248648, -88.1184482714)",8.2,8.1,8.4
4,AL,Alabama,Montgomery,10.3,10.2,10.4,205764,"(32.3472645333, -86.2677059552)",8.2,8.1,8.4


In [18]:
# Join stroke and heart disease rows on (state, city). Both frames carry
# identically named value/limit columns, so each side gets a disease suffix.
disease_2016_df2 = stroke_2016_df.merge(
    heart_2016_df,
    how='inner',
    on=['StateAbbr', 'CityName'],
    suffixes=('_stroke', '_heart'),
)
disease_2016_df2.head()

Unnamed: 0,StateAbbr,CityName,Data_Value_stroke,Low_Confidence_Limit_stroke,High_Confidence_Limit_stroke,Data_Value_heart,Low_Confidence_Limit_heart,High_Confidence_Limit_heart
0,AL,Birmingham,5.0,4.9,5.1,7.5,7.4,7.5
1,AL,Hoover,2.2,2.1,2.2,5.1,5.0,5.2
2,AL,Huntsville,3.3,3.3,3.4,6.4,6.3,6.5
3,AL,Mobile,4.2,4.2,4.3,7.2,7.1,7.3
4,AL,Montgomery,4.2,4.2,4.3,6.9,6.8,7.0


In [19]:
# Combine the two pairwise merges into one frame keyed on (state, city).
# The non-key columns were already suffixed per disease, so no extra suffix
# is supplied here.
disease_2016_df = disease_2016_df1.merge(
    disease_2016_df2,
    how='inner',
    on=['StateAbbr', 'CityName'],
    suffixes=('', ''),
)
disease_2016_df.head()

Unnamed: 0,StateAbbr,StateDesc,CityName,Data_Value_asthma,Low_Confidence_Limit_asthma,High_Confidence_Limit_asthma,PopulationCount,GeoLocation,Data_Value_COPD,Low_Confidence_Limit_COPD,High_Confidence_Limit_COPD,Data_Value_stroke,Low_Confidence_Limit_stroke,High_Confidence_Limit_stroke,Data_Value_heart,Low_Confidence_Limit_heart,High_Confidence_Limit_heart
0,AL,Alabama,Birmingham,10.5,10.4,10.6,212237,"(33.5275663773, -86.7988174678)",8.7,8.6,8.9,5.0,4.9,5.1,7.5,7.4,7.5
1,AL,Alabama,Hoover,7.9,7.7,8.0,81619,"(33.3767602729, -86.8051937568)",5.1,4.9,5.3,2.2,2.1,2.2,5.1,5.0,5.2
2,AL,Alabama,Huntsville,9.5,9.4,9.6,180105,"(34.6989692671, -86.6387042882)",7.4,7.3,7.6,3.3,3.3,3.4,6.4,6.3,6.5
3,AL,Alabama,Mobile,9.9,9.8,10.0,195111,"(30.6776248648, -88.1184482714)",8.2,8.1,8.4,4.2,4.2,4.3,7.2,7.1,7.3
4,AL,Alabama,Montgomery,10.3,10.2,10.4,205764,"(32.3472645333, -86.2677059552)",8.2,8.1,8.4,4.2,4.2,4.3,6.9,6.8,7.0


In [20]:
# Order cities from most to least populous.
disease_2016_df = disease_2016_df.sort_values(by="PopulationCount", ascending=False)

In [21]:
# Replace the reordered post-sort row labels with a fresh 0-based index,
# then preview the top rows.
disease_2016_df = disease_2016_df.reset_index(drop=True)
disease_2016_df.head(21)

Unnamed: 0,StateAbbr,StateDesc,CityName,Data_Value_asthma,Low_Confidence_Limit_asthma,High_Confidence_Limit_asthma,PopulationCount,GeoLocation,Data_Value_COPD,Low_Confidence_Limit_COPD,High_Confidence_Limit_COPD,Data_Value_stroke,Low_Confidence_Limit_stroke,High_Confidence_Limit_stroke,Data_Value_heart,Low_Confidence_Limit_heart,High_Confidence_Limit_heart
0,NY,New York,New York,10.5,10.5,10.5,8175133,"(40.694960689, -73.9313850409)",6.0,6.0,6.0,3.2,3.2,3.2,5.8,5.8,5.8
1,CA,California,Los Angeles,8.5,8.5,8.5,3792621,"(34.1182277898, -118.408500088)",5.6,5.5,5.6,3.1,3.1,3.1,5.4,5.4,5.5
2,IL,Illinois,Chicago,9.9,9.9,9.9,2695598,"(41.8372950615, -87.6862308732)",6.2,6.2,6.3,3.7,3.7,3.8,6.0,6.0,6.0
3,TX,Texas,Houston,8.8,8.7,8.8,2099451,"(29.7806691396, -95.3860033966)",6.1,6.1,6.1,3.6,3.5,3.6,6.5,6.5,6.6
4,PA,Pennsylvania,Philadelphia,11.6,11.6,11.7,1526006,"(40.0093147808, -75.1333888571)",7.2,7.1,7.2,4.1,4.1,4.1,6.6,6.5,6.6
5,AZ,Arizona,Phoenix,10.2,10.1,10.2,1445632,"(33.5724138695, -112.088995222)",6.5,6.5,6.6,3.3,3.3,3.3,6.3,6.3,6.3
6,TX,Texas,San Antonio,8.4,8.3,8.4,1327407,"(29.4721475333, -98.5246763525)",5.6,5.5,5.6,3.2,3.2,3.2,6.3,6.3,6.4
7,CA,California,San Diego,8.0,8.0,8.0,1307402,"(32.8355639418, -117.119792061)",4.6,4.6,4.7,2.5,2.5,2.5,4.9,4.8,4.9
8,TX,Texas,Dallas,9.3,9.3,9.4,1197816,"(32.7939804066, -96.7656929463)",6.4,6.3,6.4,3.8,3.8,3.8,6.6,6.6,6.7
9,HI,Hawaii,Honolulu,9.6,9.6,9.6,953207,"(21.4588039305, -157.973296737)",3.9,3.9,3.9,2.4,2.4,2.4,4.6,4.6,4.6


In [22]:
# Show the merged column labels (used as the reference for the rename below).
disease_2016_df.columns

Index(['StateAbbr', 'StateDesc', 'CityName', 'Data_Value_asthma',
       'Low_Confidence_Limit_asthma', 'High_Confidence_Limit_asthma',
       'PopulationCount', 'GeoLocation', 'Data_Value_COPD',
       'Low_Confidence_Limit_COPD', 'High_Confidence_Limit_COPD',
       'Data_Value_stroke', 'Low_Confidence_Limit_stroke',
       'High_Confidence_Limit_stroke', 'Data_Value_heart',
       'Low_Confidence_Limit_heart', 'High_Confidence_Limit_heart'],
      dtype='object')

In [23]:
# Rename the merged columns to short, disease-prefixed names.
# An explicit old -> new mapping is used instead of assigning a positional
# list to `.columns`, so a change in column order upstream cannot silently
# attach the wrong label to a column. Columns not listed keep their names.
column_renames = {
    'CityName': 'City',
    'PopulationCount': 'Population',
    'GeoLocation': 'Location',
    'Data_Value_asthma': 'Asthma_Prevalence',
    'Low_Confidence_Limit_asthma': 'Asthma_LCL',
    'High_Confidence_Limit_asthma': 'Asthma_HCL',
    'Data_Value_COPD': 'COPD_Prevalence',
    'Low_Confidence_Limit_COPD': 'COPD_LCL',
    'High_Confidence_Limit_COPD': 'COPD_HCL',
    'Data_Value_stroke': 'Stroke_Prevalence',
    'Low_Confidence_Limit_stroke': 'Stroke_LCL',
    'High_Confidence_Limit_stroke': 'Stroke_HCL',
    'Data_Value_heart': 'Heart_Prevalence',
    'Low_Confidence_Limit_heart': 'Heart_LCL',
    'High_Confidence_Limit_heart': 'Heart_HCL',
}
disease_2016_df = disease_2016_df.rename(columns=column_renames)
disease_2016_df.head()

Unnamed: 0,StateAbbr,StateDesc,City,Asthma_Prevalence,Asthma_LCL,Asthma_HCL,Population,Location,COPD_Prevalence,COPD_LCL,COPD_HCL,Stroke_Prevalence,Stroke_LCL,Stroke_HCL,Heart_Prevalence,Heart_LCL,Heart_HCL
0,NY,New York,New York,10.5,10.5,10.5,8175133,"(40.694960689, -73.9313850409)",6.0,6.0,6.0,3.2,3.2,3.2,5.8,5.8,5.8
1,CA,California,Los Angeles,8.5,8.5,8.5,3792621,"(34.1182277898, -118.408500088)",5.6,5.5,5.6,3.1,3.1,3.1,5.4,5.4,5.5
2,IL,Illinois,Chicago,9.9,9.9,9.9,2695598,"(41.8372950615, -87.6862308732)",6.2,6.2,6.3,3.7,3.7,3.8,6.0,6.0,6.0
3,TX,Texas,Houston,8.8,8.7,8.8,2099451,"(29.7806691396, -95.3860033966)",6.1,6.1,6.1,3.6,3.5,3.6,6.5,6.5,6.6
4,PA,Pennsylvania,Philadelphia,11.6,11.6,11.7,1526006,"(40.0093147808, -75.1333888571)",7.2,7.1,7.2,4.1,4.1,4.1,6.6,6.5,6.6


In [25]:
# Reorder so the four prevalence values sit next to the state/city keys,
# followed by location, population and the confidence limits.
# StateDesc is not in the list and is therefore dropped.
ordered_columns = [
    'StateAbbr', 'City',
    'Asthma_Prevalence', 'COPD_Prevalence', 'Stroke_Prevalence', 'Heart_Prevalence',
    'Location', 'Population',
    'Asthma_LCL', 'Asthma_HCL',
    'COPD_LCL', 'COPD_HCL',
    'Stroke_LCL', 'Stroke_HCL',
    'Heart_LCL', 'Heart_HCL',
]
disease_2016_df = disease_2016_df.loc[:, ordered_columns]
disease_2016_df.head()

Unnamed: 0,StateAbbr,City,Asthma_Prevalence,COPD_Prevalence,Stroke_Prevalence,Heart_Prevalence,Location,Population,Asthma_LCL,Asthma_HCL,COPD_LCL,COPD_HCL,Stroke_LCL,Stroke_HCL,Heart_LCL,Heart_HCL
0,NY,New York,10.5,6.0,3.2,5.8,"(40.694960689, -73.9313850409)",8175133,10.5,10.5,6.0,6.0,3.2,3.2,5.8,5.8
1,CA,Los Angeles,8.5,5.6,3.1,5.4,"(34.1182277898, -118.408500088)",3792621,8.5,8.5,5.5,5.6,3.1,3.1,5.4,5.5
2,IL,Chicago,9.9,6.2,3.7,6.0,"(41.8372950615, -87.6862308732)",2695598,9.9,9.9,6.2,6.3,3.7,3.8,6.0,6.0
3,TX,Houston,8.8,6.1,3.6,6.5,"(29.7806691396, -95.3860033966)",2099451,8.7,8.8,6.1,6.1,3.5,3.6,6.5,6.6
4,PA,Philadelphia,11.6,7.2,4.1,6.6,"(40.0093147808, -75.1333888571)",1526006,11.6,11.7,7.1,7.2,4.1,4.1,6.5,6.6


In [27]:
# Rename StateAbbr to State.
# Only the one affected column is named, instead of re-assigning the full
# positional list to `.columns`, where a single misplaced entry would swap
# labels between columns without raising an error.
disease_2016_df = disease_2016_df.rename(columns={'StateAbbr': 'State'})
disease_2016_df.head()

Unnamed: 0,State,City,Asthma_Prevalence,COPD_Prevalence,Stroke_Prevalence,Heart_Prevalence,Location,Population,Asthma_LCL,Asthma_HCL,COPD_LCL,COPD_HCL,Stroke_LCL,Stroke_HCL,Heart_LCL,Heart_HCL
0,NY,New York,10.5,6.0,3.2,5.8,"(40.694960689, -73.9313850409)",8175133,10.5,10.5,6.0,6.0,3.2,3.2,5.8,5.8
1,CA,Los Angeles,8.5,5.6,3.1,5.4,"(34.1182277898, -118.408500088)",3792621,8.5,8.5,5.5,5.6,3.1,3.1,5.4,5.5
2,IL,Chicago,9.9,6.2,3.7,6.0,"(41.8372950615, -87.6862308732)",2695598,9.9,9.9,6.2,6.3,3.7,3.8,6.0,6.0
3,TX,Houston,8.8,6.1,3.6,6.5,"(29.7806691396, -95.3860033966)",2099451,8.7,8.8,6.1,6.1,3.5,3.6,6.5,6.6
4,PA,Philadelphia,11.6,7.2,4.1,6.6,"(40.0093147808, -75.1333888571)",1526006,11.6,11.7,7.1,7.2,4.1,4.1,6.5,6.6


In [32]:
# Persist the cleaned disease table as CSV, with a header row and without
# the row index.
disease_2016_df.to_csv(
    "Output/disease_2016.csv",
    encoding="utf-8",
    header=True,
    index=False,
)

## 2.1 Past air quality data

In [51]:
# Load the 2016 annual AQI-by-CBSA table.
air_file = "Resources/annual_aqi_by_cbsa_2016.csv"
air_2016_df = pd.read_csv(air_file, index_col=None)
# Preview a bounded number of rows rather than rendering the whole frame.
air_2016_df.head(10)

Unnamed: 0,State,City,CBSA,CBSA Code,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,...,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days SO2,Days PM2.5,Days PM10
0,,,"Aberdeen, SD",10100,2016,119,113,6,0,0,...,0,76,42,20,0,0,0,0,103,16
1,,,"Aberdeen, WA",10140,2016,364,355,9,0,0,...,0,71,37,21,0,0,0,0,364,0
2,,,"Adjuntas, PR",10260,2016,97,90,7,0,0,...,0,72,47,21,0,0,0,0,97,0
3,,,"Adrian, MI",10300,2016,361,286,73,2,0,...,0,119,58,40,0,0,163,0,198,0
4,,,"Akron, OH",10420,2016,366,254,111,1,0,...,0,112,61,44,0,0,129,0,237,0
5,,,"Albany, GA",10500,2016,363,263,100,0,0,...,0,93,59,36,0,0,0,0,363,0
6,,,"Albany, OR",10540,2016,366,333,33,0,0,...,0,78,50,19,0,0,0,0,366,0
7,,,"Albany-Schenectady-Troy, NY",10580,2016,366,310,53,3,0,...,0,119,58,37,0,0,283,0,83,0
8,,,"Albuquerque, NM",10740,2016,366,185,178,3,0,...,0,143,74,50,0,3,239,0,69,55
9,,,"Alexandria, LA",10780,2016,102,96,6,0,0,...,0,62,48,34,0,0,0,0,102,0


In [52]:
# Drop every row that has a missing value in any column. State and City are
# NaN for the CBSA rows previewed above, so those rows are removed here.
air_2016_df = air_2016_df.dropna(axis=0, how="any")

In [53]:
# Non-null cell count per column after dropping incomplete rows.
air_2016_df.count(axis="index")

State                                  20
City                                   20
CBSA                                   20
CBSA Code                              20
Year                                   20
Days with AQI                          20
Good Days                              20
Moderate Days                          20
Unhealthy for Sensitive Groups Days    20
Unhealthy Days                         20
Very Unhealthy Days                    20
Hazardous Days                         20
Max AQI                                20
90th Percentile AQI                    20
Median AQI                             20
Days CO                                20
Days NO2                               20
Days Ozone                             20
Days SO2                               20
Days PM2.5                             20
Days PM10                              20
dtype: int64

In [54]:
# Preview the first five remaining rows.
air_2016_df.head(n=5)

Unnamed: 0,State,City,CBSA,CBSA Code,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,...,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days SO2,Days PM2.5,Days PM10
29,TX,Austin,"Austin-Round Rock, TX",12420,2016,366,280,85,1,0,...,0,105,60,42,0,18,225,0,122,1
83,NC,Charlotte,"Charlotte-Concord-Gastonia, NC-SC",16740,2016,366,227,130,7,2,...,0,156,80,46,0,6,263,0,97,0
87,IL,Chicago,"Chicago-Naperville-Elgin, IL-IN-WI",16980,2016,366,151,187,23,4,...,0,217,87,54,0,42,119,8,175,22
105,OH,Columbus,"Columbus, OH",18140,2016,366,264,92,10,0,...,0,126,71,42,0,41,188,0,136,1
116,TX,Dallas,"Dallas-Fort Worth-Arlington, TX",19100,2016,366,217,131,17,1,...,0,174,80,48,0,12,242,0,112,0


In [55]:
# List the air-quality column labels before selecting a subset.
air_2016_df.columns

Index(['State', 'City', 'CBSA', 'CBSA Code', 'Year', 'Days with AQI',
       'Good Days', 'Moderate Days', 'Unhealthy for Sensitive Groups Days',
       'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days', 'Max AQI',
       '90th Percentile AQI', 'Median AQI', 'Days CO', 'Days NO2',
       'Days Ozone', 'Days SO2', 'Days PM2.5', 'Days PM10'],
      dtype='object')

In [56]:
# Keep the merge keys (State, City), the CBSA label, the per-category day
# counts, the median AQI and the ozone / PM2.5 day counts.
air_columns = [
    'State', 'City', 'CBSA',
    'Days with AQI', 'Good Days', 'Moderate Days',
    'Unhealthy for Sensitive Groups Days', 'Unhealthy Days',
    'Very Unhealthy Days', 'Hazardous Days',
    'Median AQI', 'Days Ozone', 'Days PM2.5',
]
air_2016_df = air_2016_df.loc[:, air_columns].copy()
air_2016_df.head()

Unnamed: 0,State,City,CBSA,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Median AQI,Days Ozone,Days PM2.5
29,TX,Austin,"Austin-Round Rock, TX",366,280,85,1,0,0,0,42,225,122
83,NC,Charlotte,"Charlotte-Concord-Gastonia, NC-SC",366,227,130,7,2,0,0,46,263,97
87,IL,Chicago,"Chicago-Naperville-Elgin, IL-IN-WI",366,151,187,23,4,1,0,54,119,175
105,OH,Columbus,"Columbus, OH",366,264,92,10,0,0,0,42,188,136
116,TX,Dallas,"Dallas-Fort Worth-Arlington, TX",366,217,131,17,1,0,0,48,242,112


## 3.1 Merge disease and air quality data

In [61]:
# Inner-join disease and air-quality rows on (City, State); cities present
# in only one of the two tables do not appear in the summary.
summary_2016_df = disease_2016_df.merge(
    air_2016_df,
    how='inner',
    on=['City', 'State'],
    suffixes=('', ''),
)
summary_2016_df.head()

Unnamed: 0,State,City,Asthma_Prevalence,COPD_Prevalence,Stroke_Prevalence,Heart_Prevalence,Location,Population,Asthma_LCL,Asthma_HCL,...,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Median AQI,Days Ozone,Days PM2.5
0,NY,New York,10.5,6.0,3.2,5.8,"(40.694960689, -73.9313850409)",8175133,10.5,10.5,...,366,155,181,28,2,0,0,53,162,160
1,CA,Los Angeles,8.5,5.6,3.1,5.4,"(34.1182277898, -118.408500088)",3792621,8.5,8.5,...,366,32,226,83,21,4,0,75,181,153
2,IL,Chicago,9.9,6.2,3.7,6.0,"(41.8372950615, -87.6862308732)",2695598,9.9,9.9,...,366,151,187,23,4,1,0,54,119,175
3,TX,Houston,8.8,6.1,3.6,6.5,"(29.7806691396, -95.3860033966)",2099451,8.7,8.8,...,366,164,179,22,1,0,0,52,189,145
4,PA,Philadelphia,11.6,7.2,4.1,6.6,"(40.0093147808, -75.1333888571)",1526006,11.6,11.7,...,366,122,224,18,2,0,0,57,134,214


In [58]:
# Persist the merged disease + air-quality summary as CSV, with a header row
# and without the row index.
summary_2016_df.to_csv(
    "Output/summary_2016.csv",
    encoding="utf-8",
    header=True,
    index=False,
)

## 4.1 Load to DB

In [70]:
# SQLite database file (relative path) and the SQLAlchemy engine bound to it.
database_path = "Output/summary_2016"
engine = create_engine("sqlite:///" + database_path)

In [71]:
# List the tables currently present in the database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported replacement and returns the same list
# of table names.
inspect(engine).get_table_names()

['summary_2016']

In [72]:
# Write the merged summary into the `summary_2016` table.
# if_exists='replace' (rather than 'append') keeps this cell safe to re-run:
# the table already exists when this cell executes, so appending would insert
# another full copy of every row on each run of the notebook.
summary_2016_df.to_sql(name='summary_2016', con=engine, if_exists='replace', index=False)

In [73]:
# Read the table back from the database to confirm the load, showing the
# first five rows.
loaded_summary = pd.read_sql_query(sql='select * from summary_2016', con=engine)
loaded_summary.head()

Unnamed: 0,State,City,Asthma_Prevalence,COPD_Prevalence,Stroke_Prevalence,Heart_Prevalence,Location,Population,Asthma_LCL,Asthma_HCL,...,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Median AQI,Days Ozone,Days PM2.5
0,NY,New York,10.5,6.0,3.2,5.8,"(40.694960689, -73.9313850409)",8175133,10.5,10.5,...,366,155,181,28,2,0,0,53,162,160
1,CA,Los Angeles,8.5,5.6,3.1,5.4,"(34.1182277898, -118.408500088)",3792621,8.5,8.5,...,366,32,226,83,21,4,0,75,181,153
2,IL,Chicago,9.9,6.2,3.7,6.0,"(41.8372950615, -87.6862308732)",2695598,9.9,9.9,...,366,151,187,23,4,1,0,54,119,175
3,TX,Houston,8.8,6.1,3.6,6.5,"(29.7806691396, -95.3860033966)",2099451,8.7,8.8,...,366,164,179,22,1,0,0,52,189,145
4,PA,Philadelphia,11.6,7.2,4.1,6.6,"(40.0093147808, -75.1333888571)",1526006,11.6,11.7,...,366,122,224,18,2,0,0,57,134,214


In [None]:
# NOTE(review): disabled scratch code, kept for reference only. It sketches
# splitting the "CBSA" text on its first comma into two new columns.
# As written it could not run top-to-bottom: `new` is used before the split
# that defines it. Delete this cell, or move the split first, before
# re-enabling any of it.
# # store the part before the first comma as "Area"
# air_2016_df["Area"]= new[0] 
# # store the remainder after the first comma as "State"
# air_2016_df["State"]= new[1] 
# air_2016_df.head()

# # new data frame with split value columns 
# new = air_2016_df["CBSA"].str.split(",", n = 1, expand = True) 
  
# new.head()


