In [None]:
import pandas as pd
import regex as re
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [None]:
# NOTE: the 500 Cities dataset ships zipped -- unzip it into ./datasets before
# running this cell, otherwise the read below fails because the CSV is missing.
read_in = pd.read_csv(
    filepath_or_buffer="./datasets/500_Cities__Local_Data_for_Better_Health__2017_release.csv",
)

In [None]:
# Keep only the census-tract-level rows.
# The explicit .copy() makes `df` an independent frame: columns added to it later
# (Latitude / Longitude) are then written to `df` itself instead of to a slice of
# `read_in`, which is what triggers pandas' SettingWithCopyWarning.
df = read_in.loc[read_in["GeographicLevel"] == "Census Tract"].copy()

In [None]:
# Quick look at the census-tract rows whose state abbreviation is "DC".
df.loc[df["StateAbbr"] == "DC"]

In [None]:
# Candidate feature columns: every distinct Measure with its row count.
# For each one we need Data_Value plus the location information, which serves as
# the unique identifier of a tract.
df["Measure"].value_counts()

In [None]:
# (rows, columns) of the census-tract subset.
df.shape

In [None]:
# Sample GeoLocation value (the row labelled 58) to show the raw string format.
df.loc[58, "GeoLocation"]

In [None]:
# Regular expressions are used to create the latitude/longitude columns; example here.
# The pattern is a raw string (so `\d` is not treated as a string escape) and the
# decimal point is escaped (a bare `.` would match any character).
re.findall(r"-\d+\.\d+", df.GeoLocation[58])[0]

In [None]:
# Create the latitude column for all rows.
# The latitude is the decimal number sitting directly before the comma in
# GeoLocation. `str.extract` does this in one vectorized pass instead of a Python
# loop over the index, keeps an optional leading minus sign, and yields NaN (rather
# than raising) for rows whose GeoLocation is missing or does not match.
df["Latitude"] = (
    df["GeoLocation"]
    .str.extract(r"(-?\d+\.\d+),", expand=False)
    .astype(float)
)

In [None]:
# Create the longitude column for all rows.
# The longitude is taken as the decimal number that follows the comma in
# GeoLocation. Anchoring on the comma (instead of "first negative number in the
# string") means a negative latitude can never be picked up by mistake and a
# positive longitude is still captured. Rows that are missing or do not match
# become NaN instead of raising.
df["Longitude"] = (
    df["GeoLocation"]
    .str.extract(r",\s*(-?\d+\.\d+)", expand=False)
    .astype(float)
)

In [None]:
# Keep the 21 most frequent measures (value_counts sorts by descending count).
# NOTE(review): the intent is "measures for which all locations have data"; the
# cut-off of 21 is hard-coded -- confirm it still matches the counts shown above.
measure_counts = df["Measure"].value_counts()
measures_of_interest = measure_counts.index[:21]

In [None]:
# The variables/measures selected for the analysis.
print(measures_of_interest.tolist())

In [None]:
# One frame per selected measure, in the same order as measures_of_interest.
df_list = [
    df.loc[df["Measure"] == measure_name]
    for measure_name in measures_of_interest
]

In [None]:
# Reshape each per-measure frame into a compact frame keyed by UniqueID; these are
# merged together into a single master frame afterwards.
# The loop variable is deliberately NOT called `df`: reusing that name would rebind
# the notebook-level census-tract frame to the last per-measure slice.
metric_dfs = []
for measure_df in df_list:
    # Work on a re-indexed copy so the slices stored in df_list are not mutated in place.
    measure_df = measure_df.reset_index(drop=True)
    measure_id = measure_df["MeasureId"].iloc[0]  # identifier used as the value column's name
    metric_dfs.append(
        pd.DataFrame({
            measure_id: measure_df["Data_Value"],  # the actual value for the measure
            "UniqueID": measure_df["UniqueID"],
            "geometry": measure_df["GeoLocation"],
            "state": measure_df["StateAbbr"],
            "population": measure_df["PopulationCount"],
            "longitude": measure_df["Longitude"],
            "latitude": measure_df["Latitude"],
        })
    )

In [None]:
# Number of non-null UniqueID entries in the first per-measure frame.
metric_dfs[0]["UniqueID"].count()

In [None]:
# Create the master frame holding every measure, left-joined on UniqueID.
# Every per-measure frame carries the same descriptive column names, so only the
# first frame keeps them; the others contribute just their measure column plus the
# key. Merging the full frames repeatedly makes pandas add `_x`/`_y` suffixes that
# collide on later merges (a MergeError on pandas >= 2.0).
shared_cols = ["geometry", "state", "population", "longitude", "latitude"]
master = metric_dfs[0].copy()
for metric_df in metric_dfs[1:]:
    master = master.merge(metric_df.drop(columns=shared_cols), how="left", on="UniqueID")

In [None]:
# Inspect the column labels produced by the merges.
master.columns

In [None]:
# Drop every column whose contents duplicate an earlier column (first one is kept).
# Selecting with a boolean mask keeps each column's dtype; the former
# `master.T.drop_duplicates().T` round trip coerced every column to `object` and
# copied the whole frame twice.
# NOTE: the comparison is by content, so two different columns with identical
# values would also be collapsed into one.
master = master.loc[:, ~master.T.duplicated()]

In [None]:
# Re-check the column labels after removing duplicated columns.
master.columns

In [None]:
# Strip the `_x` merge suffix from the shared descriptive columns.
master = master.rename(
    columns={
        "geometry_x": "geometry",
        "state_x": "state",
        "population_x": "population",
        "longitude_x": "longitude",
        "latitude_x": "latitude",
    }
)

In [None]:
# The 5-digit FIPS code is taken from the first five characters of the UniqueID.
# It is used to aggregate the data to county level, since that is the level of
# the income data available.
master.loc[0, "UniqueID"][:5]

In [None]:
# Add the county FIPS code column: first five characters of every UniqueID.
# NOTE(review): this assumes UniqueID starts with the state+county FIPS digits.
# If it starts with a place/city FIPS code instead, the county code would have to
# come from the tract portion -- confirm against the dataset's data dictionary.
master["fips"] = master["UniqueID"].map(lambda unique_id: unique_id[0:5])

In [None]:
# Check how much data we would lose by dropping the rows that contain NA values:
# compares the shape after dropna() (rows with any NA removed) with the current shape.
master.dropna().shape, master.shape

In [None]:
# Few rows are affected, so dropping them should not significantly affect the analysis.
# Simplifying assumption: the values are missing completely at random.
master = master.dropna()

In [None]:
# These identifier/location columns are not needed once the data is aggregated
# on the FIPS code (county), so they are left out of the cleaned frame.
clean_master = master.drop(
    columns=["UniqueID", "geometry", "population", "latitude", "longitude", "state"]
)

In [None]:
# Columns remaining after the drop above.
clean_master.columns

In [None]:
# The measure columns must be float (not 'object') before they can be averaged.
clean_master = clean_master.astype({
    measure_col: float
    for measure_col in (
        'DIABETES', 'CHOLSCREEN', 'MHLTH', 'CSMOKING', 'BPHIGH', 'CANCER',
        'CASTHMA', 'CHD', 'PHLTH', 'BPMED', 'KIDNEY', 'BINGE', 'DENTAL',
        'STROKE', 'SLEEP', 'OBESITY', 'COPD', 'LPA', 'CHECKUP', 'HIGHCHOL',
        'ARTHRITIS',
    )
})

In [None]:
# Aggregate to one row per "fips" code: each measure becomes the mean over the
# rows that share that code.
agg_data = clean_master.groupby(by="fips").agg("mean")

In [None]:
# Turn the "fips" group key from the index back into a regular column.
agg_data = agg_data.reset_index()

In [None]:
# Make sure "fips" is a string column before export so leading zeros are kept.
agg_data = agg_data.assign(fips=agg_data["fips"].astype(str))

In [None]:
# Write the aggregated data to CSV (pandas also writes the row index as an
# unnamed first column by default).
agg_data.to_csv(
    path_or_buf="./datasets/agg_county_data_final.csv",
)

In [None]:
# (rows, columns) of the aggregated frame; the original run produced 484 distinct
# fips code values.
agg_data.shape