In [1]:
import pandas as pd
import regex as re
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
### NOTE: You must unzip the 500 Cities Dataset first before reading it in with this line
read_in = pd.read_csv("./datasets/500_Cities__Local_Data_for_Better_Health__2017_release.csv")

In [3]:
# filtering for census tract level data
df = read_in[read_in['GeographicLevel'] == "Census Tract"] 

In [33]:
df[df.StateAbbr == "DC"]

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,UniqueID,Measure,Data_Value_Unit,...,Data_Value_Footnote,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text,Latitude,Longitude
7904,2015,DC,District of Columbia,Washington,Census Tract,BRFSS,Unhealthy Behaviors,1150000-11001001001,Current smoking among adults aged >=18 Years,%,...,,7436.0,"(38.9489550754, -77.0884651956)",UNHBEH,CSMOKING,1150000.0,1.100100e+10,Current Smoking,38.948955,-77.088465
7905,2015,DC,District of Columbia,Washington,Census Tract,BRFSS,Unhealthy Behaviors,1150000-11001001002,Current smoking among adults aged >=18 Years,%,...,,3442.0,"(38.9384216789, -77.0788872939)",UNHBEH,CSMOKING,1150000.0,1.100100e+10,Current Smoking,38.938422,-77.078887
7906,2015,DC,District of Columbia,Washington,Census Tract,BRFSS,Unhealthy Behaviors,1150000-11001001100,Current smoking among adults aged >=18 Years,%,...,,4779.0,"(38.95724392, -77.0776735816)",UNHBEH,CSMOKING,1150000.0,1.100100e+10,Current Smoking,38.957244,-77.077674
7907,2015,DC,District of Columbia,Washington,Census Tract,BRFSS,Unhealthy Behaviors,1150000-11001001200,Current smoking among adults aged >=18 Years,%,...,,5008.0,"(38.9462265217, -77.0705246905)",UNHBEH,CSMOKING,1150000.0,1.100100e+10,Current Smoking,38.946227,-77.070525
7908,2015,DC,District of Columbia,Washington,Census Tract,BRFSS,Unhealthy Behaviors,1150000-11001001301,Current smoking among adults aged >=18 Years,%,...,,3955.0,"(38.9538356982, -77.0545997657)",UNHBEH,CSMOKING,1150000.0,1.100100e+10,Current Smoking,38.953836,-77.054600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8078,2015,DC,District of Columbia,Washington,Census Tract,BRFSS,Unhealthy Behaviors,1150000-11001010700,Current smoking among adults aged >=18 Years,%,...,,1838.0,"(38.9039989156, -77.0419808973)",UNHBEH,CSMOKING,1150000.0,1.100101e+10,Current Smoking,38.903999,-77.041981
8079,2015,DC,District of Columbia,Washington,Census Tract,BRFSS,Unhealthy Behaviors,1150000-11001010800,Current smoking among adults aged >=18 Years,%,...,,6408.0,"(38.8973521338, -77.0446120467)",UNHBEH,CSMOKING,1150000.0,1.100101e+10,Current Smoking,38.897352,-77.044612
8080,2015,DC,District of Columbia,Washington,Census Tract,BRFSS,Unhealthy Behaviors,1150000-11001010900,Current smoking among adults aged >=18 Years,%,...,,3211.0,"(38.8116201244, -77.0248889119)",UNHBEH,CSMOKING,1150000.0,1.100101e+10,Current Smoking,38.811620,-77.024889
8081,2015,DC,District of Columbia,Washington,Census Tract,BRFSS,Unhealthy Behaviors,1150000-11001011000,Current smoking among adults aged >=18 Years,%,...,,3715.0,"(38.8688417474, -77.0180491156)",UNHBEH,CSMOKING,1150000.0,1.100101e+10,Current Smoking,38.868842,-77.018049


In [5]:
# these are the potential feature columns along with their counts
# I need to pull the Data_Value along with the GeoLocation (as my unique identifier)
df.Measure.value_counts() 

Mental health not good for >=14 days among adults aged >=18 Years                                                                                                                                 28004
High cholesterol among adults aged >=18 Years who have been screened in the past 5 Years                                                                                                          28004
No leisure-time physical activity among adults aged >=18 Years                                                                                                                                    28004
Physical health not good for >=14 days among adults aged >=18 Years                                                                                                                               28004
Cancer (excluding skin cancer) among adults aged >=18 Years                                                                                                                                       28004


In [6]:
df.shape

(782047, 24)

In [7]:
# Example of a Geolocation from the Data
df.GeoLocation[58]

'(33.5794328326, -86.7228323926)'

In [8]:
# Going to use RegEx to creat latitude/longitude cols, example here
re.findall("-\d+.\d+", df.GeoLocation[58])[0]

'-86.7228323926'

In [9]:
# Create latitude column for all rows
df['Latitude'] = [float(re.findall("(\d+.\d+),", df.GeoLocation[i])[0]) for i in df.index] # regex for lat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
# Create longitude column for all rows
df['Longitude'] = [float(re.findall("-\d+.\d+", df.GeoLocation[i])[0]) for i in df.index] # regex for long

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
# I am going to pull the data for which all locations have data
measures_of_interest = df.Measure.value_counts().index[0:21] 

In [12]:
# These are the variables/measures that we are interested in
print(list(measures_of_interest))

['Mental health not good for >=14 days among adults aged >=18 Years', 'High cholesterol among adults aged >=18 Years who have been screened in the past 5 Years', 'No leisure-time physical activity among adults aged >=18 Years', 'Physical health not good for >=14 days among adults aged >=18 Years', 'Cancer (excluding skin cancer) among adults aged >=18 Years', 'Visits to dentist or dental clinic among adults aged >=18 Years', 'Chronic obstructive pulmonary disease among adults aged >=18 Years', 'Obesity among adults aged >=18 Years', 'Stroke among adults aged >=18 Years', 'Visits to doctor for routine checkup within the past Year among adults aged >=18 Years', 'Cholesterol screening among adults aged >=18 Years', 'Current asthma among adults aged >=18 Years', 'Binge drinking among adults aged >=18 Years', 'Coronary heart disease among adults aged >=18 Years', 'Taking medicine for high blood pressure control among adults aged >=18 Years with high blood pressure', 'Arthritis among adults 

In [13]:
# making a separate df for each measure
df_list = [df[df["Measure"] == x] for x in measures_of_interest] 

In [14]:
# A little bit of reformatting of our dataframes, which will be merged together into a master df
metric_dfs = []
for df in df_list:
    df.reset_index(inplace=True, drop=True)
    temp = pd.DataFrame({df.MeasureId[0]:df.Data_Value, # this is the actual value for the measure
                         "UniqueID":df.UniqueID,"geometry":df.GeoLocation, # UniqueID and geolocation cols
                         "state":df.StateAbbr, "population":df.PopulationCount,
                         "longitude": df.Longitude, "latitude":df.Latitude,})
    metric_dfs.append(temp)

In [15]:
metric_dfs[0].UniqueID.value_counts().sum()

28004

In [16]:
# Create master DF of all metrics dfs
master = pd.merge(metric_dfs[0],metric_dfs[1], how='left',on="UniqueID") 
for i in metric_dfs[2:]: # merging the rest with a for loop
    master = master.merge(i, how = "left",on="UniqueID")

In [17]:
master.columns

Index(['MHLTH', 'UniqueID', 'geometry_x', 'state_x', 'population_x',
       'longitude_x', 'latitude_x', 'HIGHCHOL', 'geometry_y', 'state_y',
       ...
       'state_y', 'population_y', 'longitude_y', 'latitude_y', 'CSMOKING',
       'geometry', 'state', 'population', 'longitude', 'latitude'],
      dtype='object', length=127)

In [18]:
# drop duplicate columns from merge
master = master.T.drop_duplicates().T 

In [19]:
master.columns

Index(['MHLTH', 'UniqueID', 'geometry_x', 'state_x', 'population_x',
       'longitude_x', 'latitude_x', 'HIGHCHOL', 'LPA', 'PHLTH', 'CANCER',
       'DENTAL', 'COPD', 'OBESITY', 'STROKE', 'CHECKUP', 'CHOLSCREEN',
       'CASTHMA', 'BINGE', 'CHD', 'BPMED', 'ARTHRITIS', 'KIDNEY', 'BPHIGH',
       'DIABETES', 'SLEEP', 'CSMOKING'],
      dtype='object')

In [20]:
# Rename the columns that have an _x suffix
master.rename({"geometry_x":"geometry","state_x":"state","population_x":"population","longitude_x":"longitude",
              "latitude_x":"latitude"}, axis = 1, inplace = True)

In [21]:
#The 5 digit fips codes can be pulled from the first 5 indices of the unique ID
##... using this to aggregate data to the county level, since thats the income data I have
master.UniqueID[0][0:5] 

'01070'

In [22]:
# list comprehension to add county fips code column to all rows
master["fips"] = [x[0:5] for x in master.UniqueID] 

In [23]:
# Check how much data we would lose by dropping columns containing NA values
master.dropna().shape, master.shape

((27204, 28), (28004, 28))

In [24]:
# Since it's not too many, we can drop the rows without significantly affecting our analysis
# Assumption made for simplicity that columns are missing entirely at random
master.dropna(inplace=True)

In [25]:
#dropping these columns since I am aggregating on fips code (County)
clean_master = master.drop(["UniqueID", "geometry","population","latitude","longitude","state"], axis = 1)

In [26]:
clean_master.columns

Index(['MHLTH', 'HIGHCHOL', 'LPA', 'PHLTH', 'CANCER', 'DENTAL', 'COPD',
       'OBESITY', 'STROKE', 'CHECKUP', 'CHOLSCREEN', 'CASTHMA', 'BINGE', 'CHD',
       'BPMED', 'ARTHRITIS', 'KIDNEY', 'BPHIGH', 'DIABETES', 'SLEEP',
       'CSMOKING', 'fips'],
      dtype='object')

In [27]:
# need to change these columns from 'object' to 'float'
for col in ['DIABETES', 'CHOLSCREEN', 'MHLTH', 'CSMOKING', 'BPHIGH', 'CANCER',
       'CASTHMA', 'CHD', 'PHLTH', 'BPMED', 'KIDNEY', 'BINGE', 'DENTAL',
       'STROKE', 'SLEEP', 'OBESITY', 'COPD', 'LPA', 'CHECKUP', 'HIGHCHOL',
       'ARTHRITIS']:
    clean_master[col] = clean_master[col].astype(float) 

In [28]:
# Aggregate data by taking the means of each column and grouping by the "fips" code column
agg_data = clean_master.groupby("fips").mean()

In [29]:
agg_data.reset_index(drop=False, inplace=True)

In [30]:
# format for export so we dont lose leading 0s
agg_data["fips"] = agg_data['fips'].astype(str) 

In [31]:
#write the data to csv
agg_data.to_csv("./datasets/agg_county_data_final.csv")

In [32]:
#484 resulting fips code values once aggregated
agg_data.shape

(484, 22)