In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import os

In [2]:
from matplotlib import rcParams

# Notebook-wide matplotlib defaults, applied in one pass so the whole style
# can be read and tuned in a single place.
rcParams.update({
    # figure properties
    'figure.figsize': (6, 4),
    'figure.dpi': 150,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
    # tick styling: inward ticks drawn on all four sides
    'xtick.direction': 'in',
    'ytick.direction': 'in',
    'xtick.top': True,
    'xtick.bottom': True,
    'ytick.left': True,
    'ytick.right': True,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    # legend styling: square box with a black edge
    'legend.fancybox': False,
    'legend.edgecolor': 'k',
    'legend.fontsize': 10,
    # line styling
    'lines.linewidth': 1.5,
    'errorbar.capsize': 3,
    # text and fonts
    'mathtext.default': 'regular',
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'font.family': 'Arial',
    'font.size': 10,
})

In [3]:
# Show every column when a DataFrame is displayed (no "..." column truncation).
pd.options.display.max_columns = None

# Load data 

In [4]:
# Load the master table; the relative path is resolved against the
# notebook's working directory.
rawdata = pd.read_csv(filepath_or_buffer='data/clean/masterdata.csv')

In [5]:
# Preview the loaded frame to check that the columns came through as expected.
rawdata

Unnamed: 0,"9th to 12th grade, no diploma",high school graduate (includes equivalency),"some college, no degree",associate's degree,bachelor's degree,graduate or professional degree,year,fips,geographic area name,median income (family),median income (nonfamily),total population,occupied housing units,"1-unit, attached",2 units,3 or 4 units,5 to 9 units,10 to 19 units,20 or more units,mobile home,"boat, rv, van, etc.",commute time,count,county,state
0,6.600000,20.300000,18.400000,6.900000,24.000000,16.300000,2010,6001,"Alameda County, California",85014.0,44439.0,1477980,532026,8.242266,4.516133,7.746238,6.190675,5.983918,16.833576,1.344671,0.115032,12.456474,20,Alameda County,CA
1,9.800000,30.500000,28.500000,9.500000,13.200000,5.800000,2010,6005,"Amador County, California",65103.0,33411.0,38327,14715,3.533809,2.324159,2.154264,0.767924,1.230037,2.657153,9.113150,0.013592,9.893939,1,Amador County,CA
2,5.900000,19.700000,22.300000,8.200000,24.500000,13.700000,2010,6013,"Contra Costa County, California",91791.0,47627.0,1024809,368087,8.502881,2.053047,5.690774,5.219690,3.747755,8.936203,1.937314,0.062757,13.799532,10,Contra Costa County,CA
3,10.800000,23.200000,22.600000,7.600000,13.400000,6.300000,2010,6019,"Fresno County, California",52306.0,28843.0,908830,283836,2.584943,3.393157,8.102919,8.195578,2.956637,5.566242,4.951451,0.144450,8.000660,2,Fresno County,CA
4,6.400000,26.000000,29.300000,8.800000,17.700000,8.600000,2010,6023,"Humboldt County, California",53221.0,24568.0,133058,54276,3.758567,4.875083,6.387722,4.355516,2.546245,3.080551,10.761663,0.287420,7.378587,2,Humboldt County,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5143,0.002326,0.015866,0.011163,0.006452,0.012130,0.004171,2019,55131,"Washington County, Wisconsin",95408.0,44643.0,136034,55256,6.772115,4.075576,4.520776,7.052628,2.627769,5.617490,1.301216,0.000000,13.398305,200,Washington County,fl
5144,0.000399,0.003754,0.003427,0.001684,0.005299,0.002786,2019,55133,"Waukesha County, Wisconsin",113591.0,48449.0,404198,160635,6.531578,2.257291,2.776481,5.617705,3.804277,10.051981,0.783142,0.000000,12.112071,6,Waukesha County,mi
5145,0.002219,0.012883,0.007517,0.004034,0.008855,0.003567,2019,55139,"Winnebago County, Wisconsin",76425.0,40352.0,171907,71238,4.487773,5.818524,4.635167,8.995199,5.120862,7.269996,2.880485,0.000000,9.185955,63,Winnebago County,tx
5146,0.006054,0.038556,0.019355,0.012654,0.012519,0.006938,2019,55141,"Wood County, Wisconsin",68511.0,31511.0,72999,32684,1.759271,6.360911,2.105006,2.631257,2.444621,7.951903,3.677640,0.021417,8.886286,25,Wood County,wi


# Preprocessing

In [7]:
# Remove the FIPS code and the area/county/state label columns; everything
# that remains is treated as numeric below.
data = rawdata.drop(
    ['fips', 'geographic area name', 'county', 'state'],
    axis='columns',
)

In [8]:
# Quick look at the first three rows after dropping the label columns.
data.head(n=3)

Unnamed: 0,"9th to 12th grade, no diploma",high school graduate (includes equivalency),"some college, no degree",associate's degree,bachelor's degree,graduate or professional degree,year,median income (family),median income (nonfamily),total population,occupied housing units,"1-unit, attached",2 units,3 or 4 units,5 to 9 units,10 to 19 units,20 or more units,mobile home,"boat, rv, van, etc.",commute time,count
0,6.6,20.3,18.4,6.9,24.0,16.3,2010,85014.0,44439.0,1477980,532026,8.242266,4.516133,7.746238,6.190675,5.983918,16.833576,1.344671,0.115032,12.456474,20
1,9.8,30.5,28.5,9.5,13.2,5.8,2010,65103.0,33411.0,38327,14715,3.533809,2.324159,2.154264,0.767924,1.230037,2.657153,9.11315,0.013592,9.893939,1
2,5.9,19.7,22.3,8.2,24.5,13.7,2010,91791.0,47627.0,1024809,368087,8.502881,2.053047,5.690774,5.21969,3.747755,8.936203,1.937314,0.062757,13.799532,10


In [9]:
# Cast every remaining column to float64 in one vectorised call.
# This replaces a per-column, per-element `apply(float)` loop: same resulting
# dtypes, but without Python-level iteration over every cell, and the cell
# stays idempotent on re-run.
data = data.astype(float)

In [10]:
# Confirm that every column is now float64.
data.dtypes

9th to 12th grade, no diploma                  float64
high school graduate (includes equivalency)    float64
some college, no degree                        float64
associate's degree                             float64
bachelor's degree                              float64
graduate or professional degree                float64
year                                           float64
median income (family)                         float64
median income (nonfamily)                      float64
total population                               float64
occupied housing units                         float64
1-unit, attached                               float64
2 units                                        float64
3 or 4 units                                   float64
5 to 9 units                                   float64
10 to 19 units                                 float64
20 or more units                               float64
mobile home                                    float64
boat, rv, van, etc.                            float64
commute time                                   float64
count                                          float64
dtype: object

In [None]:
# Express `count` per resident.
# The ratio is rebuilt from the untouched `rawdata` columns rather than from
# `data['count']` itself, so re-running this cell does not divide by the
# population a second time. `data` was derived from `rawdata` by dropping
# columns only, so the two frames share the same row index.
data['count'] = (
    rawdata['count'].astype(float) / rawdata['total population'].astype(float)
)

In [None]:
# Check the rescaled `count` column on the first five rows.
data.head(n=5)

In [None]:
# Positional slice of the housing columns.
# The loop that used to follow assigned each column to itself
# (`data[col] = data[col]`), which changes nothing, so it has been removed.
# NOTE(review): with the column order shown in the `data.head(3)` output,
# positions 10-17 run from 'occupied housing units' through 'mobile home' and
# leave out 'boat, rv, van, etc.' — confirm this is the intended set (a
# by-name selection would be more robust) before using `house_cols`.
house_cols = data.columns[10:18]

In [None]:
# Re-inspect the first five rows of the working frame.
data.head(n=5)

In [None]:
# Predictors: every column of `data` except the target `count`.
features = data.drop('count', axis='columns')
features.head(n=5)

In [None]:
# Target, selected with a list so it stays a one-column DataFrame
# rather than becoming a Series.
response = data.loc[:, ['count']]
response.head(n=5)

In [None]:
# Standardise predictors and target with scikit-learn's StandardScaler.
# The fitted scalers are kept (instead of being created and discarded inline)
# so that values can be mapped back with `response_scaler.inverse_transform`
# and new data can be transformed consistently.
# NOTE(review): both scalers are fitted on the full data set, i.e. before the
# train/test split — fit them on the training rows only if these arrays are
# ever used for modelling. The split cell further down currently passes the
# unscaled `features` / `response` frames, so these arrays are not used there.
feature_scaler = StandardScaler()
response_scaler = StandardScaler()
features_scale = feature_scaler.fit_transform(features)
response_scale = response_scaler.fit_transform(response)

In [None]:
# Compare the shapes of the scaled predictor and target arrays.
features_scale.shape, response_scale.shape

In [None]:
# Flatten the scaled target to one dimension.
response_scale = response_scale.reshape(-1)

In [None]:
# Check the shape of the target array after flattening.
response_scale.shape

# Train/Test Split and Model Fitting

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable. Note that the unscaled `features` / `response` frames are split
# here, so the y-splits remain DataFrames.
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    response,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Check the row/column counts of the four split pieces.
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

In [None]:
# Ordinary least squares with an intercept term
# (fit_intercept=True is scikit-learn's default).
model = LinearRegression()

In [None]:
# Fit the linear model on the training split.
model.fit(X=xtrain, y=ytrain)

In [None]:
# R^2 on the training rows and on the held-out rows, shown side by side as
# (train, test). Scoring only the training split hides over-fitting; the
# later models in this notebook are also evaluated on the test split.
model.score(xtrain, ytrain), model.score(xtest, ytest)

In [None]:
# Decision tree with a fixed seed: scikit-learn randomly permutes features
# when searching for splits, so without `random_state` the fitted tree (and
# the errors reported below) can differ between runs. 42 matches the seed
# used for the train/test split.
model = DecisionTreeRegressor(random_state=42)

In [None]:
# Fit the decision tree on the training split.
model.fit(X=xtrain, y=ytrain)

In [None]:
# In-sample predictions, used for the training error in the next cell.
ypred = model.predict(X=xtrain)

In [None]:
# Training-set mean squared error.
mean_squared_error(y_true=ytrain, y_pred=ypred)

In [None]:
# Held-out predictions and test-set mean squared error.
# Arguments follow scikit-learn's (y_true, y_pred) order, as in the other
# metric cells. MSE is symmetric, so the value is unchanged, but the swapped
# order would give wrong results if the metric were replaced by an
# asymmetric one such as r2_score.
ypred = model.predict(xtest)
mean_squared_error(ytest, ypred)

In [None]:
# Random forest with a fixed seed: bootstrap sampling and per-split feature
# selection are random, so without `random_state` the reported test error
# changes on every run. 42 matches the seed used for the train/test split.
model = RandomForestRegressor(random_state=42)

In [None]:
# Fit the forest; the one-column target frame is flattened to a 1-D array
# before being passed as y.
model.fit(xtrain, ytrain.to_numpy().ravel())

In [None]:
# Predictions on the held-out rows.
ypred = model.predict(X=xtest)

In [None]:
# Test-set mean squared error for the fitted model.
mean_squared_error(y_true=ytest, y_pred=ypred)