In [2]:
# import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# read in csv as dataframe
df = pd.read_csv("county_statistics_for_model.csv", index_col=0)
df.head()

Unnamed: 0,county,state,percentage16_Donald_Trump,percentage16_Hillary_Clinton,total_votes16,votes16_Donald_Trump,votes16_Hillary_Clinton,percentage20_Donald_Trump,percentage20_Joe_Biden,total_votes20,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,Abbeville,SC,0.629,0.346,10724,6742,3712,0.661,0.33,12433.0,...,1.8,1.8,6.5,25.8,9505.0,78.8,13.3,7.8,0.1,9.4
1,Acadia,LA,0.773,0.206,27386,21159,5638,0.795,0.191,28425.0,...,1.6,2.2,2.5,27.6,24982.0,80.0,12.1,7.6,0.3,8.9
2,Accomack,VA,0.545,0.428,15755,8582,6737,0.542,0.447,16938.0,...,2.6,1.8,4.5,22.0,13837.0,74.6,18.1,7.1,0.2,5.4
3,Ada,ID,0.479,0.387,195587,93748,75676,0.504,0.465,259389.0,...,1.5,2.8,6.9,20.4,214984.0,78.3,15.0,6.6,0.1,4.3
4,Adair,IA,0.653,0.3,3759,2456,1127,0.697,0.286,4183.0,...,2.8,0.4,6.2,22.3,3680.0,73.8,15.3,10.4,0.5,3.0


In [4]:
# drop unnecessary rows
df = df.drop(['percentage16_Donald_Trump', 'percentage16_Hillary_Clinton', 'total_votes16', 'votes16_Donald_Trump', 'votes16_Hillary_Clinton', 'lat', 'long', 'deaths'], axis = 1)
df.columns

Index(['county', 'state', 'percentage20_Donald_Trump',
       'percentage20_Joe_Biden', 'total_votes20', 'votes20_Donald_Trump',
       'votes20_Joe_Biden', 'cases', 'TotalPop', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'VotingAgeCitizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment'],
      dtype='object')

In [5]:
# drop rows with null values and check if there are any nulls in columns after
df = df.dropna(axis = 0, how = 'any')
df.isnull().sum()

county                       0
state                        0
percentage20_Donald_Trump    0
percentage20_Joe_Biden       0
total_votes20                0
votes20_Donald_Trump         0
votes20_Joe_Biden            0
cases                        0
TotalPop                     0
Men                          0
Women                        0
Hispanic                     0
White                        0
Black                        0
Native                       0
Asian                        0
Pacific                      0
VotingAgeCitizen             0
Income                       0
IncomeErr                    0
IncomePerCap                 0
IncomePerCapErr              0
Poverty                      0
ChildPoverty                 0
Professional                 0
Service                      0
Office                       0
Construction                 0
Production                   0
Drive                        0
Carpool                      0
Transit                      0
Walk    

In [6]:
# separate string columns in separate df to be merged after using model
counties_df = df[['county', 'state']]
counties_df.head()

Unnamed: 0,county,state
0,Abbeville,SC
1,Acadia,LA
2,Accomack,VA
3,Ada,ID
4,Adair,IA


In [7]:
# drop string columns to only input integers into model
df = df.drop(['county', 'state'], axis = 1)
df.head()

Unnamed: 0,percentage20_Donald_Trump,percentage20_Joe_Biden,total_votes20,votes20_Donald_Trump,votes20_Joe_Biden,cases,TotalPop,Men,Women,Hispanic,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,0.661,0.33,12433.0,8215.0,4101.0,805.0,24788.0,12044.0,12744.0,1.3,...,1.8,1.8,6.5,25.8,9505.0,78.8,13.3,7.8,0.1,9.4
1,0.795,0.191,28425.0,22596.0,5443.0,3182.0,62607.0,30433.0,32174.0,2.4,...,1.6,2.2,2.5,27.6,24982.0,80.0,12.1,7.6,0.3,8.9
2,0.542,0.447,16938.0,9172.0,7578.0,1227.0,32840.0,16079.0,16761.0,8.8,...,2.6,1.8,4.5,22.0,13837.0,74.6,18.1,7.1,0.2,5.4
3,0.504,0.465,259389.0,130699.0,120539.0,17451.0,435117.0,217999.0,217118.0,7.9,...,1.5,2.8,6.9,20.4,214984.0,78.3,15.0,6.6,0.1,4.3
4,0.697,0.286,4183.0,2917.0,1197.0,222.0,7192.0,3552.0,3640.0,1.7,...,2.8,0.4,6.2,22.3,3680.0,73.8,15.3,10.4,0.5,3.0


In [8]:
# select cases as target
y = df.cases

In [9]:
# create features and drop target
X = df.drop(['cases'], axis = 1)

In [12]:
# instantiate model
regressor = RandomForestRegressor(n_estimators=100,random_state=0)

In [13]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=48)

In [14]:
# fit model
regressor.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

In [16]:
# make predictions
predictions = regressor.predict(X_train)
predictions[:5]

array([ 458.33, 1521.69,  370.77,   60.07,  927.12])

In [17]:
# check score
regressor.score(X_train, y_train)

0.9782767930155859