In [1]:
import pandas as pd
import regex as re
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [7]:
read_in = pd.read_csv("./datasets/500_Cities__Local_Data_for_Better_Health__2017_release.csv")

In [8]:
df = read_in[read_in['GeographicLevel'] == "Census Tract"] # filtering for census tract level data

In [9]:
df.head()

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,UniqueID,Measure,Data_Value_Unit,...,High_Confidence_Limit,Data_Value_Footnote_Symbol,Data_Value_Footnote,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text
58,2015,AL,Alabama,Birmingham,Census Tract,BRFSS,Prevention,0107000-01073000100,Current lack of health insurance among adults ...,%,...,27.2,,,3042.0,"(33.5794328326, -86.7228323926)",PREVENT,ACCESS2,107000.0,1073000000.0,Health Insurance
59,2015,AL,Alabama,Birmingham,Census Tract,BRFSS,Prevention,0107000-01073000300,Current lack of health insurance among adults ...,%,...,32.4,,,2735.0,"(33.5428208686, -86.752433978)",PREVENT,ACCESS2,107000.0,1073000000.0,Health Insurance
60,2015,AL,Alabama,Birmingham,Census Tract,BRFSS,Prevention,0107000-01073000400,Current lack of health insurance among adults ...,%,...,29.9,,,3338.0,"(33.5632449633, -86.7640474064)",PREVENT,ACCESS2,107000.0,1073000000.0,Health Insurance
61,2015,AL,Alabama,Birmingham,Census Tract,BRFSS,Prevention,0107000-01073000500,Current lack of health insurance among adults ...,%,...,32.0,,,2864.0,"(33.5442404594, -86.7749130719)",PREVENT,ACCESS2,107000.0,1073000000.0,Health Insurance
62,2015,AL,Alabama,Birmingham,Census Tract,BRFSS,Prevention,0107000-01073000700,Current lack of health insurance among adults ...,%,...,36.7,,,2577.0,"(33.5525406139, -86.8016893706)",PREVENT,ACCESS2,107000.0,1073001000.0,Health Insurance


In [10]:
df.Measure.value_counts() # these are the potential feature columns
### ... I need to pull the Data_Value along with the GeoLocation (as my unique identifier)

Diagnosed diabetes among adults aged >=18 Years                                                                                                                                                   28004
Cholesterol screening among adults aged >=18 Years                                                                                                                                                28004
Mental health not good for >=14 days among adults aged >=18 Years                                                                                                                                 28004
Current smoking among adults aged >=18 Years                                                                                                                                                      28004
High blood pressure among adults aged >=18 Years                                                                                                                                                  28004


In [11]:
df.shape

(782047, 24)

In [12]:
df.GeoLocation[58]

'(33.5794328326, -86.7228323926)'

In [13]:
re.findall("-\d+.\d+", df.GeoLocation[58])[0] #Going to use RegEx to creat latitude/longitude cols

'-86.7228323926'

In [14]:
df['Latitude'] = [float(re.findall("(\d+.\d+),", df.GeoLocation[i])[0]) for i in df.index] # regex for lat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
df['Longitude'] = [float(re.findall("-\d+.\d+", df.GeoLocation[i])[0]) for i in df.index] # regex for long

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
df.Measure.value_counts() # these are the feature columns I need to create
### ... I need to pull the Data_Value along with the UniqueID so I can combine the data

Diagnosed diabetes among adults aged >=18 Years                                                                                                                                                   28004
Cholesterol screening among adults aged >=18 Years                                                                                                                                                28004
Mental health not good for >=14 days among adults aged >=18 Years                                                                                                                                 28004
Current smoking among adults aged >=18 Years                                                                                                                                                      28004
High blood pressure among adults aged >=18 Years                                                                                                                                                  28004


In [17]:
df.Measure.value_counts().index[0]

'Diagnosed diabetes among adults aged >=18 Years'

In [18]:
measures_of_interest = df.Measure.value_counts().index[0:21] # I am going to pull the data for which all locations have data

In [19]:
df_list = [df[df["Measure"] == x] for x in measures_of_interest] # making a df for each measure

In [20]:
metric_dfs = []
for df in df_list:
    df.reset_index(inplace=True, drop=True)
    temp = pd.DataFrame({df.MeasureId[0]:df.Data_Value, # this is the actual value for the measure
                         "UniqueID":df.UniqueID,"geometry":df.GeoLocation, # UniqueID and geolocation cols
                         "state":df.StateAbbr, "population":df.PopulationCount,
                         "longitude": df.Longitude, "latitude":df.Latitude,})
    metric_dfs.append(temp)

In [21]:
metric_dfs[0].UniqueID.value_counts().sum()

28004

In [22]:
master = pd.merge(metric_dfs[0],metric_dfs[1], how='left',on="UniqueID") # Starting master DF of all metrics dfs

In [23]:
for i in metric_dfs[2:]: # merging the rest with a for loop
    master = master.merge(i, how = "left",on="UniqueID")

In [24]:
master.columns

Index(['DIABETES', 'UniqueID', 'geometry_x', 'state_x', 'population_x',
       'longitude_x', 'latitude_x', 'CHOLSCREEN', 'geometry_y', 'state_y',
       ...
       'state_y', 'population_y', 'longitude_y', 'latitude_y', 'ARTHRITIS',
       'geometry', 'state', 'population', 'longitude', 'latitude'],
      dtype='object', length=127)

In [25]:
master = master.T.drop_duplicates().T # dropping dup columns

In [26]:
master.columns

Index(['DIABETES', 'UniqueID', 'geometry_x', 'state_x', 'population_x',
       'longitude_x', 'latitude_x', 'CHOLSCREEN', 'MHLTH', 'CSMOKING',
       'BPHIGH', 'CANCER', 'CASTHMA', 'CHD', 'PHLTH', 'BPMED', 'KIDNEY',
       'BINGE', 'DENTAL', 'STROKE', 'SLEEP', 'OBESITY', 'COPD', 'LPA',
       'CHECKUP', 'HIGHCHOL', 'ARTHRITIS'],
      dtype='object')

In [27]:
master.rename({"geometry_x":"geometry","state_x":"state","population_x":"population","longitude_x":"longitude",
              "latitude_x":"latitude"}, axis = 1, inplace = True)

In [28]:
master.UniqueID[0][0:5] #The 5 digit fips codes can be pulled from the first 5 indices of the unique ID
##... using this to aggregate data to the county level, since thats the income data I have

'01070'

In [29]:
master["fips"] = [x[0:5] for x in master.UniqueID] # list comprehension to add county fips to all rows

In [30]:
master.dropna().shape, master.shape

((27204, 28), (28004, 28))

In [31]:
master.dropna(inplace=True)

In [40]:
#master.to_csv("./tract_health_data_final.csv")

In [65]:
master.columns

Index(['DIABETES', 'UniqueID', 'geometry', 'state', 'population', 'longitude',
       'latitude', 'CHOLSCREEN', 'MHLTH', 'CSMOKING', 'BPHIGH', 'CANCER',
       'CASTHMA', 'CHD', 'PHLTH', 'BPMED', 'KIDNEY', 'BINGE', 'DENTAL',
       'STROKE', 'SLEEP', 'OBESITY', 'COPD', 'LPA', 'CHECKUP', 'HIGHCHOL',
       'ARTHRITIS', 'fips'],
      dtype='object')

In [66]:
#dropping these columns since I am aggregating on fips code (County)
clean_master = master.drop(["UniqueID", "geometry","population","latitude","longitude","state"], axis = 1)

In [67]:
clean_master.columns

Index(['DIABETES', 'CHOLSCREEN', 'MHLTH', 'CSMOKING', 'BPHIGH', 'CANCER',
       'CASTHMA', 'CHD', 'PHLTH', 'BPMED', 'KIDNEY', 'BINGE', 'DENTAL',
       'STROKE', 'SLEEP', 'OBESITY', 'COPD', 'LPA', 'CHECKUP', 'HIGHCHOL',
       'ARTHRITIS', 'fips'],
      dtype='object')

In [69]:
for col in ['DIABETES', 'CHOLSCREEN', 'MHLTH', 'CSMOKING', 'BPHIGH', 'CANCER',
       'CASTHMA', 'CHD', 'PHLTH', 'BPMED', 'KIDNEY', 'BINGE', 'DENTAL',
       'STROKE', 'SLEEP', 'OBESITY', 'COPD', 'LPA', 'CHECKUP', 'HIGHCHOL',
       'ARTHRITIS']:
    clean_master[col] = clean_master[col].astype(float) # need to change from 'object' to 'float'

In [70]:
agg_data = clean_master.groupby("fips").mean()

In [71]:
agg_data.reset_index(drop=False, inplace=True)

In [75]:
agg_data["fips"] = agg_data['fips'].astype(str) # format for export

In [76]:
#agg_data.to_csv("./agg_county_data_final.csv")