In [119]:
import numpy as np
import pandas as pd

In [120]:
# Load the county reference table. county_fips is read as text rather than parsed
# as a number, so it can be used as a string join key against the other tables.
# Forward slashes keep the relative path valid on Windows, macOS and Linux alike;
# the previous backslash-separated path only resolved on Windows.
df = pd.read_csv('data_sources/uscounties.csv', converters={'county_fips': str})

In [121]:
# Inspect the column names available in the county table.
df.columns

Index(['county', 'county_ascii', 'county_fips', 'state_id', 'state_name',
       'city_largest', 'city_largest_id', 'lat', 'lng', 'population',
       'density', 'timezone', 'timezone_all', 'age_median', 'age_under_10',
       'age_10_to_19', 'age_20s', 'age_30s', 'age_40s', 'age_50s', 'age_60s',
       'age_70s', 'age_over_80', 'male', 'female', 'married', 'divorced',
       'never_married', 'widowed', 'family_size', 'family_dual_income',
       'income_household_median', 'income_household_under_5',
       'income_household_5_to_10', 'income_household_10_to_15',
       'income_household_15_to_20', 'income_household_20_to_25',
       'income_household_25_to_35', 'income_household_35_to_50',
       'income_household_50_to_75', 'income_household_75_to_100',
       'income_household_100_to_150', 'income_household_150_over',
       'income_household_six_figure', 'income_individual_median',
       'home_ownership', 'home_value', 'rent_median', 'rent_burden',
       'education_less_highschoo

## Bring in and Clean Weather Data

In [122]:
# Weather import: county-level weather aggregates (2020, per the file name).
# The FIPS columns are read as integers here; they are zero-padded to fixed-width
# strings in a later cell before being used as join keys.
# Forward slashes keep the relative path valid on Windows, macOS and Linux alike;
# the previous backslash-separated path only resolved on Windows.
df_weather = pd.read_csv(
    'data_sources/additional_sources/2020_weather_station_to_county_aggregated_by_county.csv',
    dtype={
        # identifiers
        'county': str,
        'county_fips': int,
        'state_id': str,
        'state_name': str,
        'state_fips': int,
        # precipitation
        'total_precip_mm': float,
        'num_precip_days': int,
        'num_precip_days_greater_1mm': int,
        # distribution of the "high" readings
        '0.05_percentile_high': float,
        '0.25_percentile_high': float,
        '0.50_percentile_high': float,
        '0.75_percentile_high': float,
        '0.95_percentile_high': float,
        'min_high': float,
        'max_high': float,
    },
)

In [123]:
# Check that the dtypes requested in read_csv were applied to each weather column.
df_weather.dtypes

county                          object
county_fips                      int32
state_id                        object
state_name                      object
state_fips                       int32
total_precip_mm                float64
num_precip_days                  int32
num_precip_days_greater_1mm      int32
0.05_percentile_high           float64
0.25_percentile_high           float64
0.50_percentile_high           float64
0.75_percentile_high           float64
0.95_percentile_high           float64
min_high                       float64
max_high                       float64
dtype: object

In [124]:
# Ensure all FIPS codes are fixed width: 5 characters for counties, 2 for states.
# The vectorized .str.zfill replaces a per-row lambda; casting to str first means
# it works whether the column still holds integers or already-padded strings.
df_weather['county_fips'] = df_weather['county_fips'].astype(str).str.zfill(5)
df_weather['state_fips'] = df_weather['state_fips'].astype(str).str.zfill(2)

In [125]:
# Preview the first five weather rows after zero-padding the FIPS columns.
df_weather.head(5)

Unnamed: 0,county,county_fips,state_id,state_name,state_fips,total_precip_mm,num_precip_days,num_precip_days_greater_1mm,0.05_percentile_high,0.25_percentile_high,0.50_percentile_high,0.75_percentile_high,0.95_percentile_high,min_high,max_high
0,Abbeville,45001,SC,South Carolina,45,8430.0,366,366,10.0,17.8,23.3,29.4,32.8,1.7,36.7
1,Accomack,51001,VA,Virginia,51,7493.0,365,365,7.5,14.75,20.55,26.95,31.4,1.7,37.8
2,Ada,16001,ID,Idaho,16,6690.0,360,360,3.3,8.6,17.5,28.05,33.6,-2.1,41.7
3,Adair,19001,IA,Iowa,19,5762.0,326,322,-3.3,6.1,16.7,26.7,30.6,-14.4,34.4
4,Adair,29001,MO,Missouri,29,3218.0,184,180,-5.55,3.6,9.45,16.65,21.15,-10.6,34.4


In [136]:
# Drop redundant columns before merging (county, state_id and state_name also
# exist in the county table, so only county_fips is needed as the key).
# Rebinding the name instead of using inplace=True, together with errors='ignore',
# lets this cell be re-run without raising KeyError once the columns are gone.
df_weather = df_weather.drop(
    columns=['county', 'state_id', 'state_name', 'state_fips'],
    errors='ignore',
)

## Bring in and Clean Crime Data

In [127]:
# Source: https://www.kaggle.com/mikejohnsonjr/united-states-crime-rates-by-county
# Forward slashes keep the relative path valid on Windows, macOS and Linux alike;
# the previous backslash-separated path only resolved on Windows.
df_crime = pd.read_csv('data_sources/additional_sources/crime_data_w_population_and_crime_rate.csv')

In [128]:
# Inspect the inferred dtypes; FIPS_ST and FIPS_CTY are the columns used to build the join key.
df_crime.dtypes

county_name               object
crime_rate_per_100000    float64
index                      int64
EDITION                    int64
PART                       int64
IDNO                       int64
CPOPARST                   int64
CPOPCRIM                   int64
AG_ARRST                   int64
AG_OFF                     int64
COVIND                   float64
INDEX                      int64
MODINDX                    int64
MURDER                     int64
RAPE                       int64
ROBBERY                    int64
AGASSLT                    int64
BURGLRY                    int64
LARCENY                    int64
MVTHEFT                    int64
ARSON                      int64
population                 int64
FIPS_ST                    int64
FIPS_CTY                   int64
dtype: object

In [129]:
# Zero-pad the state part (2 characters) and county part (3 characters) of the
# FIPS code, then concatenate them into the 5-character county_fips join key.
# The vectorized .str.zfill replaces a per-row lambda; casting to str first means
# it works whether the columns still hold integers or already-padded strings.
df_crime['FIPS_ST'] = df_crime['FIPS_ST'].astype(str).str.zfill(2)
df_crime['FIPS_CTY'] = df_crime['FIPS_CTY'].astype(str).str.zfill(3)

df_crime['county_fips'] = df_crime['FIPS_ST'] + df_crime['FIPS_CTY']

In [130]:
# Preview the crime table, including the newly built county_fips column.
df_crime.head()

Unnamed: 0,county_name,crime_rate_per_100000,index,EDITION,PART,IDNO,CPOPARST,CPOPCRIM,AG_ARRST,AG_OFF,...,ROBBERY,AGASSLT,BURGLRY,LARCENY,MVTHEFT,ARSON,population,FIPS_ST,FIPS_CTY,county_fips
0,"St. Louis city, MO",1791.995377,1,1,4,1612,318667,318667,15,15,...,1778,3609,4995,13791,3543,464,318416,29,510,29510
1,"Crittenden County, AR",1754.914968,2,1,4,130,50717,50717,4,4,...,165,662,1482,1753,189,28,49746,5,35,5035
2,"Alexander County, IL",1664.700485,3,1,4,604,8040,8040,2,2,...,5,119,82,184,12,2,7629,17,3,17003
3,"Kenedy County, TX",1456.31068,4,1,4,2681,444,444,1,1,...,1,2,5,4,4,0,412,48,261,48261
4,"De Soto Parish, LA",1447.40243,5,1,4,1137,26971,26971,3,3,...,17,368,149,494,60,0,27083,22,31,22031


In [131]:
# Keep only the join key and the crime-rate measure; every other column is discarded.
df_crime = df_crime.loc[:, ['county_fips', 'crime_rate_per_100000']]

In [132]:
# Confirm that only county_fips and crime_rate_per_100000 remain.
df_crime.head()

Unnamed: 0,county_fips,crime_rate_per_100000
0,29510,1791.995377
1,5035,1754.914968
2,17003,1664.700485
3,48261,1456.31068
4,22031,1447.40243


## Combine datasets


Index(['county', 'county_fips', 'state_id', 'state_name', 'state_fips',
       'total_precip_mm', 'num_precip_days', 'num_precip_days_greater_1mm',
       '0.05_percentile_high', '0.25_percentile_high', '0.50_percentile_high',
       '0.75_percentile_high', '0.95_percentile_high', 'min_high', 'max_high'],
      dtype='object')

In [141]:
# Join the county table onto the weather table by county_fips. The right join
# keeps every weather row, including any that have no matching county row.
combined_df = pd.merge(df, df_weather, on='county_fips', how='right')
combined_df.head(15)

Unnamed: 0,county,county_ascii,county_fips,state_id,state_name,city_largest,city_largest_id,lat,lng,population,...,total_precip_mm,num_precip_days,num_precip_days_greater_1mm,0.05_percentile_high,0.25_percentile_high,0.50_percentile_high,0.75_percentile_high,0.95_percentile_high,min_high,max_high
0,Abbeville,Abbeville,45001,SC,South Carolina,Abbeville,1840014000.0,34.2226,-82.4592,24627.0,...,8430.0,366,366,10.0,17.8,23.3,29.4,32.8,1.7,36.7
1,Accomack,Accomack,51001,VA,Virginia,Chincoteague,1840006000.0,37.7643,-75.6333,32673.0,...,7493.0,365,365,7.5,14.75,20.55,26.95,31.4,1.7,37.8
2,Ada,Ada,16001,ID,Idaho,Boise,1840027000.0,43.4511,-116.2412,456849.0,...,6690.0,360,360,3.3,8.6,17.5,28.05,33.6,-2.1,41.7
3,Adair,Adair,19001,IA,Iowa,Greenfield,1840008000.0,41.3307,-94.471,7085.0,...,5762.0,326,322,-3.3,6.1,16.7,26.7,30.6,-14.4,34.4
4,Adair,Adair,29001,MO,Missouri,Kirksville,1840008000.0,40.1906,-92.6007,25369.0,...,3218.0,184,180,-5.55,3.6,9.45,16.65,21.15,-10.6,34.4
5,Adams,Adams,8001,CO,Colorado,Aurora,1840019000.0,39.8736,-104.3378,504108.0,...,7293.0,341,340,0.0,10.8,21.15,30.85,35.0,-9.4,38.9
6,Adams,Adams,17001,IL,Illinois,Quincy,1840009000.0,39.9879,-91.1885,66085.0,...,5963.0,323,320,-0.25,8.05,17.25,27.5,31.15,-13.3,35.0
7,Adams,Adams,18001,IN,Indiana,Decatur,1840007000.0,40.7457,-84.9366,35376.0,...,5997.0,341,333,-0.6,7.2,16.7,26.4,30.85,-9.4,36.1
8,Adams,Adams,19003,IA,Iowa,Corning,1840007000.0,41.029,-94.6992,3670.0,...,5959.0,330,327,-2.2,7.2,16.1,26.7,30.6,-14.4,34.4
9,Adams,Adams,31001,NE,Nebraska,Hastings,1840001000.0,40.5245,-98.5012,31587.0,...,6539.0,342,334,-1.05,8.05,19.15,28.6,32.5,-8.8,38.3


In [145]:
# Attach the crime rate by county_fips. The right join keeps every crime row; FIPS
# codes with no county/weather match come through with NaN in those columns.
# Any crime column left over from a previous execution is dropped first, because
# merging it in a second time would yield crime_rate_per_100000_x / _y duplicates.
combined_df = (
    combined_df
    .drop(columns=['crime_rate_per_100000'], errors='ignore')
    .merge(df_crime, how='right', on='county_fips')
)
# Row count after the join (equals the number of crime rows when keys are unique).
len(combined_df)

3152

In [153]:
# The crime table contains FIPS codes with no county/weather match; after the right
# join those rows hold values only in the two crime columns. Keeping a row only when
# it has at least 10 non-null values removes them.
combined_df = combined_df.dropna(axis='index', thresh=10)

In [154]:
# Persist the merged table as CSV, leaving out the positional row index.
combined_df.to_csv(path_or_buf="data_sources//combined_data.csv", index=False)