In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# Show every column when a DataFrame is rendered (no truncation of wide frames).
pd.options.display.max_columns = None

In [3]:
# List the files present in the cleaned-data directory (relative to the
# notebook's working directory). os.listdir returns entries in arbitrary order.
os.listdir('data/clean')

['income_clean.csv',
 'education_clean.csv',
 'commute_clean.csv',
 'vehicles_clean.csv',
 'masterdata.csv',
 'population_clean.csv',
 'housing_clean.csv',
 'masterdata_allcounties.csv']

In [4]:
def load_data(category):
    """Read one cleaned dataset into a DataFrame.

    Parameters
    ----------
    category : str
        Dataset name; the file read is ``data/clean/<category>_clean.csv``,
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, using ``pd.read_csv`` defaults.
    """
    csv_path = ''.join(('data/clean/', category, '_clean.csv'))
    frame = pd.read_csv(csv_path)
    return frame

In [5]:
# Load the income table and derive the `fips` merge key.
# The key is the last five characters of the `id` string. The vectorised
# `.str` slice replaces a per-row Python lambda and yields NaN for a missing
# id instead of raising. `geographic area name` is kept in this frame.
incomedata = (
    load_data('income')
    .assign(fips=lambda frame: frame['id'].str[-5:])
    .drop(columns=['id'])
)
incomedata.head()

Unnamed: 0,geographic area name,median income (family),median income (nonfamily),year,fips
0,"Jersey County, Illinois",64773,28125,2010,17083
1,"Jo Daviess County, Illinois",60381,28266,2010,17085
2,"Johnson County, Illinois",47423,21378,2010,17087
3,"Kane County, Illinois",77998,40333,2010,17089
4,"Kankakee County, Illinois",59998,28864,2010,17091


In [6]:
# Load the education table and derive the `fips` merge key from the last five
# characters of `id` (vectorised `.str` slice instead of a per-row lambda;
# a missing id becomes NaN rather than raising).
# `id` and `geographic area name` are dropped once the key has been extracted.
educationdata = (
    load_data('education')
    .assign(fips=lambda frame: frame['id'].str[-5:])
    .drop(columns=['id', 'geographic area name'])
)
educationdata.head()

Unnamed: 0,"9th to 12th grade, no diploma",high school graduate (includes equivalency),"some college, no degree",associate's degree,bachelor's degree,graduate or professional degree,year,fips
0,8.4,36.7,25.1,9.3,8.9,7.2,2010,17083
1,7.1,39.3,19.6,7.2,14.5,8.5,2010,17085
2,16.1,32.3,24.6,8.3,9.4,3.4,2010,17087
3,8.2,24.7,19.6,7.0,20.9,10.9,2010,17089
4,9.2,35.0,23.9,8.9,10.8,6.6,2010,17091


In [7]:
def tofips_vehicle(s):
    """Convert a numeric county geoid into a zero-padded 5-character FIPS string.

    Parameters
    ----------
    s : int, float or str
        County geoid as a number (e.g. ``1015.0``) or a numeric string
        (e.g. ``'1015.0'``). Any fractional part is truncated.

    Returns
    -------
    str
        The integer code left-padded with zeros to at least five characters,
        e.g. ``1015.0 -> '01015'``. Codes that already have five or more
        digits are returned unchanged.

    Raises
    ------
    ValueError
        If ``s`` is NaN or cannot be parsed as a number.
    """
    # float() first so that string inputs such as '1015.0' are accepted;
    # zfill pads any short code, not only the 4-digit case.
    return str(int(float(s))).zfill(5)

In [8]:
# Preview the vehicle table as loaded, before its keys are normalised.
vehicledata = load_data(category='vehicles')
vehicledata.head(5)

Unnamed: 0,count,county,county geoid,registration year,state
0,95,Calhoun County,1015.0,2018.0,fl
1,4584,Clay County,1027.0,2018.0,fl
2,4500,Escambia County,1053.0,2018.0,fl
3,216,Franklin County,1059.0,2018.0,fl
4,330,Jackson County,1071.0,2018.0,fl


In [9]:
# Reload the vehicle table and normalise its merge keys:
#   fips - `county geoid` converted by tofips_vehicle
#   year - `registration year` cast to integer
# The two source columns are dropped once the keys exist.
vehicledata = (
    load_data('vehicles')
    .assign(
        fips=lambda frame: frame['county geoid'].map(tofips_vehicle),
        year=lambda frame: frame['registration year'].astype('int64'),
    )
    .drop(columns=['registration year', 'county geoid'])
)
vehicledata.head()

Unnamed: 0,count,county,state,fips,year
0,95,Calhoun County,fl,1015,2018
1,4584,Clay County,fl,1027,2018
2,4500,Escambia County,fl,1053,2018
3,216,Franklin County,fl,1059,2018
4,330,Jackson County,fl,1071,2018


In [10]:
# Load the population table and derive the `fips` merge key from the last five
# characters of `id` (vectorised `.str` slice instead of a per-row lambda;
# a missing id becomes NaN rather than raising).
# `id` and `geographic area name` are dropped once the key has been extracted.
populationdata = (
    load_data('population')
    .assign(fips=lambda frame: frame['id'].str[-5:])
    .drop(columns=['id', 'geographic area name'])
)
populationdata.head()

Unnamed: 0,total population,year,fips
0,5897,2010,8025
1,30533,2010,8029
2,578087,2010,8031
3,2027,2010,8033
4,273440,2010,8035


In [11]:
# Load the housing table and derive the `fips` merge key from the last five
# characters of `id` (vectorised `.str` slice instead of a per-row lambda;
# a missing id becomes NaN rather than raising).
# `id` and `geographic area name` are dropped once the key has been extracted.
# NOTE(review): the frame carries both '1-unit, attached' / 'boat, rv, van, etc.'
# and '1-unit attached' / 'boat rv van etc.' - presumably the same measures under
# two spellings; confirm and coalesce in the cleaning step.
housingdata = (
    load_data('housing')
    .assign(fips=lambda frame: frame['id'].str[-5:])
    .drop(columns=['id', 'geographic area name'])
)
housingdata.head()

Unnamed: 0,occupied housing units,"1-unit, attached",2 units,3 or 4 units,5 to 9 units,10 to 19 units,20 or more units,mobile home,"boat, rv, van, etc.",year,1-unit attached,boat rv van etc.,fips
0,3339,21,57,65,0,19,0,1148,0,2010,,,13155
1,20917,234,639,231,215,217,55,4264,45,2010,,,13157
2,4998,81,68,48,26,9,28,1053,0,2010,,,13159
3,5567,44,152,263,99,8,0,2056,0,2010,,,13161
4,6281,66,323,84,293,0,72,1709,0,2010,,,13163


In [12]:
# Load the commute table and derive the `fips` merge key from the last five
# characters of `id` (vectorised `.str` slice instead of a per-row lambda;
# a missing id becomes NaN rather than raising).
# `id` and `geographic area name` are dropped once the key has been extracted.
commutedata = (
    load_data('commute')
    .assign(fips=lambda frame: frame['id'].str[-5:])
    .drop(columns=['id', 'geographic area name'])
)
commutedata.head()

Unnamed: 0,commute time,year,fips
0,8685.0,2010,30019
1,96535.0,2010,72073
2,304545.0,2010,24023
3,1161730.0,2010,39173
4,4905.0,2010,30079


In [13]:
# Step 1: education joined with income on county (fips) and year.
# Inner join: only county-years present in both tables are kept.
m1 = pd.merge(
    educationdata,
    incomedata,
    on=['fips', 'year'],
    how='inner',
)
m1.head()

Unnamed: 0,"9th to 12th grade, no diploma",high school graduate (includes equivalency),"some college, no degree",associate's degree,bachelor's degree,graduate or professional degree,year,fips,geographic area name,median income (family),median income (nonfamily)
0,8.4,36.7,25.1,9.3,8.9,7.2,2010,17083,"Jersey County, Illinois",64773,28125
1,7.1,39.3,19.6,7.2,14.5,8.5,2010,17085,"Jo Daviess County, Illinois",60381,28266
2,16.1,32.3,24.6,8.3,9.4,3.4,2010,17087,"Johnson County, Illinois",47423,21378
3,8.2,24.7,19.6,7.0,20.9,10.9,2010,17089,"Kane County, Illinois",77998,40333
4,9.2,35.0,23.9,8.9,10.8,6.6,2010,17091,"Kankakee County, Illinois",59998,28864


In [14]:
# Step 2: add total population to the education + income frame.
# Inner join on (fips, year): county-years without a population row are dropped.
m2 = pd.merge(
    m1,
    populationdata,
    on=['fips', 'year'],
    how='inner',
)
m2.head()

Unnamed: 0,"9th to 12th grade, no diploma",high school graduate (includes equivalency),"some college, no degree",associate's degree,bachelor's degree,graduate or professional degree,year,fips,geographic area name,median income (family),median income (nonfamily),total population
0,8.4,36.7,25.1,9.3,8.9,7.2,2010,17083,"Jersey County, Illinois",64773,28125,22932
1,7.1,39.3,19.6,7.2,14.5,8.5,2010,17085,"Jo Daviess County, Illinois",60381,28266,22728
2,16.1,32.3,24.6,8.3,9.4,3.4,2010,17087,"Johnson County, Illinois",47423,21378,12710
3,8.2,24.7,19.6,7.0,20.9,10.9,2010,17089,"Kane County, Illinois",77998,40333,502628
4,9.2,35.0,23.9,8.9,10.8,6.6,2010,17091,"Kankakee County, Illinois",59998,28864,112100


In [15]:
# Step 3: add the housing-unit columns to m2 (inner join on fips and year).
# NOTE(review): m3 is not referenced by any later active cell visible here.
m3 = pd.merge(
    m2,
    housingdata,
    on=['fips', 'year'],
    how='inner',
)
m3.head()

Unnamed: 0,"9th to 12th grade, no diploma",high school graduate (includes equivalency),"some college, no degree",associate's degree,bachelor's degree,graduate or professional degree,year,fips,geographic area name,median income (family),median income (nonfamily),total population,occupied housing units,"1-unit, attached",2 units,3 or 4 units,5 to 9 units,10 to 19 units,20 or more units,mobile home,"boat, rv, van, etc.",1-unit attached,boat rv van etc.
0,8.4,36.7,25.1,9.3,8.9,7.2,2010,17083,"Jersey County, Illinois",64773,28125,22932,8626,149,248,158,250,99,145,737,14,,
1,7.1,39.3,19.6,7.2,14.5,8.5,2010,17085,"Jo Daviess County, Illinois",60381,28266,22728,10001,633,296,399,301,134,166,455,0,,
2,16.1,32.3,24.6,8.3,9.4,3.4,2010,17087,"Johnson County, Illinois",47423,21378,12710,4396,15,95,148,86,27,16,1512,2,,
3,8.2,24.7,19.6,7.0,20.9,10.9,2010,17089,"Kane County, Illinois",77998,40333,502628,168980,17314,8362,6600,8183,5008,8447,1243,0,,
4,9.2,35.0,23.9,8.9,10.8,6.6,2010,17091,"Kankakee County, Illinois",59998,28864,112100,40943,1807,1763,2070,2103,870,1327,3114,3,,


In [17]:
# Step 4: add commute data to m2 and rescale it by population.
# 'commute time' is overwritten with (commute time / total population).
# NOTE(review): this builds on m2, not m3, so the housing columns are not
# carried forward - confirm that is intended.
m4 = (
    m2
    .merge(commutedata, how='inner', on=['fips', 'year'])
    .assign(**{
        'commute time': lambda frame: frame['commute time'] / frame['total population'],
    })
)
m4.head()

Unnamed: 0,"9th to 12th grade, no diploma",high school graduate (includes equivalency),"some college, no degree",associate's degree,bachelor's degree,graduate or professional degree,year,fips,geographic area name,median income (family),median income (nonfamily),total population,commute time
0,8.4,36.7,25.1,9.3,8.9,7.2,2010,17083,"Jersey County, Illinois",64773,28125,22932,13.279042
1,7.1,39.3,19.6,7.2,14.5,8.5,2010,17085,"Jo Daviess County, Illinois",60381,28266,22728,9.353881
2,16.1,32.3,24.6,8.3,9.4,3.4,2010,17087,"Johnson County, Illinois",47423,21378,12710,7.024784
3,8.2,24.7,19.6,7.0,20.9,10.9,2010,17089,"Kane County, Illinois",77998,40333,502628,13.112819
4,9.2,35.0,23.9,8.9,10.8,6.6,2010,17091,"Kankakee County, Illinois",59998,28864,112100,9.937511


In [19]:
# Summary statistics (count, mean, std, quartiles, min/max) for the
# 'commute time' column of m4, as a sanity check on its range.
m4['commute time'].describe()

count    29822.000000
mean         9.386683
std          2.498992
min          1.411674
25%          7.748248
50%          9.151568
75%         10.779852
max         23.290487
Name: commute time, dtype: float64

In [20]:
def first(x):
    """Return the first element of ``x``.

    Parameters
    ----------
    x : sequence or pandas.Series
        Any non-empty indexable collection.

    Returns
    -------
    object
        The element at position 0.

    Raises
    ------
    IndexError
        If ``x`` is empty.

    Notes
    -----
    The previous body returned ``s[0]``, but no ``s`` exists in this scope,
    so every call raised ``NameError``.
    """
    # pandas objects index by label with [], so use positional access for them;
    # a group's Series usually keeps its original (non-zero-based) index.
    if hasattr(x, 'iloc'):
        return x.iloc[0]
    return x[0]

In [21]:
# Collapse the vehicle table to one row per (fips, year).
# The built-in 'first' aggregation takes, for each column, the first non-null
# value in the group, so values in one output row may come from different rows.
# NOTE(review): if a county-year has several rows with partial counts, 'count'
# should presumably be summed rather than taken from the first row - confirm.
vehicledata_new = (
    vehicledata
    .groupby(['fips', 'year'])
    .first()
    .reset_index()
)

In [22]:
vehicledata_new.shape

(6920, 5)

In [23]:
# Final join: attach the per-county-year vehicle rows to the demographic frame.
# Inner join, so only county-years present in both m4 and vehicledata_new remain.
masterdf = pd.merge(
    m4,
    vehicledata_new,
    on=['fips', 'year'],
    how='inner',
)

print(masterdf.shape)
masterdf.head(25)

(5148, 16)


Unnamed: 0,"9th to 12th grade, no diploma",high school graduate (includes equivalency),"some college, no degree",associate's degree,bachelor's degree,graduate or professional degree,year,fips,geographic area name,median income (family),median income (nonfamily),total population,commute time,count,county,state
0,6.6,20.3,18.4,6.9,24.0,16.3,2010,6001,"Alameda County, California",85014,44439,1477980,12.456474,20,Alameda County,CA
1,9.8,30.5,28.5,9.5,13.2,5.8,2010,6005,"Amador County, California",65103,33411,38327,9.893939,1,Amador County,CA
2,5.9,19.7,22.3,8.2,24.5,13.7,2010,6013,"Contra Costa County, California",91791,47627,1024809,13.799532,10,Contra Costa County,CA
3,10.8,23.2,22.6,7.6,13.4,6.3,2010,6019,"Fresno County, California",52306,28843,908830,8.00066,2,Fresno County,CA
4,6.4,26.0,29.3,8.8,17.7,8.6,2010,6023,"Humboldt County, California",53221,24568,133058,7.378587,2,Humboldt County,CA
5,13.9,26.8,22.7,6.9,9.8,4.8,2010,6029,"Kern County, California",51311,29291,815693,8.4111,1,Kern County,CA
6,8.8,32.9,28.9,8.1,11.7,4.7,2010,6033,"Lake County, California",50140,21906,64371,9.292694,1,Lake County,CA
7,10.2,21.3,18.8,6.8,19.0,9.9,2010,6037,"Los Angeles County, California",61622,40208,9758256,12.477287,360,Los Angeles County,CA
8,4.1,12.8,18.5,6.4,31.5,22.7,2010,6041,"Marin County, California",112911,55060,248601,12.429375,15,Marin County,CA
9,10.2,20.6,19.6,7.2,14.2,9.2,2010,6053,"Monterey County, California",63372,41786,407435,9.054819,1,Monterey County,CA


In [24]:
# Write the vehicle-matched master frame to disk without the positional index.
# `fips` holds zero-padded strings here; when reading this file back, pass
# dtype={'fips': str} to pd.read_csv or integer inference drops leading zeros.
masterdf.to_csv('data/clean/masterdata.csv', index=False)

In [25]:
# Write the frame from before the vehicle join (m4) to disk, without the
# positional index.
# `fips` holds 5-character strings here; when reading this file back, pass
# dtype={'fips': str} to pd.read_csv or integer inference drops leading zeros.
m4.to_csv('data/clean/masterdata_allcounties.csv', index=False)