In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
# Apply seaborn's "whitegrid" style to plots drawn in this notebook.
sns.set_style("whitegrid")

In [7]:
# Load the county-level table and index it by GEO_ID.
aggregate = pd.read_csv('./aggregate.csv', engine='python')
aggregate = aggregate.set_index('GEO_ID')

# Construct extra features for use in the models.
# Interaction term: unemployment rate x bachelor's-degree column.
aggregate['unemp_bac'] = aggregate['unemp_rate'] * aggregate['bac_deg']
# Populations are floored at 0.1 before the log so that zero counts do not
# produce -inf.
aggregate['log_pop'] = np.log10(aggregate['total_pop'].clip(lower=0.1))
aggregate['log_urban_pop'] = np.log10(
    (aggregate['urban'] * aggregate['total_pop']).clip(lower=0.1)
)

# For classifier problems: determine the winner in each county.
# +1 when 'democrat' is strictly greater than 'republican', otherwise -1
# (ties and missing values fall to -1). Computed column-wise rather than
# row by row; the resulting labels are integers.
aggregate['winner'] = np.where(aggregate['democrat'] > aggregate['republican'], 1, -1)

# List of features we want to use.
features = [
    'white_pc', 'urban', 'unemp_bac', 'log_pop', 'log_urban_pop', 'pov_pc',
    'unemp_rate', 'bac_deg', 'hs_deg', 'pop_18_30_pc', 'pop_60_up_pc',
]
results = ['democrat', 'republican']
target = 'winner'  # target variable, 'republican'=100-'democrat' approximately

# Split dataset into 80%-20% training and testing, stratified on the winner
# label; random_state=121 for consistency.
agg_train, agg_test = train_test_split(
    aggregate,
    test_size=0.2,
    shuffle=True,
    random_state=121,
    stratify=aggregate['winner'],
)


In [8]:
# Grid search for the random-forest hyperparameters (forest size and tree depth).

# Features are standardised, then passed to a bootstrapped forest whose
# per-tree sample size is set by max_samples=300.
pipe_rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(bootstrap=True, random_state=121, max_samples=300)),
])

# Candidates: n_estimators = 100, 120, ..., 480 and max_depth in {4, 5, 6}.
params_rf = {
    'rf__n_estimators': range(100, 500, 20),
    'rf__max_depth': [4, 5, 6],
}

grid_rf = GridSearchCV(estimator=pipe_rf, param_grid=params_rf)
grid_rf.fit(agg_train[features], agg_train[target])

In [9]:
# Show the hyperparameter combination selected by the random-forest grid search.
grid_rf.best_params_

{'rf__max_depth': 5, 'rf__n_estimators': 160}

In [10]:
# Grid search for the SVC hyperparameters (regularisation strength C and kernel).

# Features are standardised before the SVC; degree=2 is fixed for every candidate.
pipe_svc = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVC(degree=2)),
])

# Candidates: C = 10, 30, ..., 490 crossed with three kernels.
params_svc = {
    'svc__C': range(10, 500, 20),
    'svc__kernel': ['rbf', 'poly', 'sigmoid'],
}

grid_svc = GridSearchCV(estimator=pipe_svc, param_grid=params_svc)
grid_svc.fit(agg_train[features], agg_train[target])

In [11]:
# Show the hyperparameter combination selected by the SVC grid search.
grid_svc.best_params_

{'svc__C': 10, 'svc__kernel': 'rbf'}

In [12]:
# Stratified k-fold comparison of four classifiers on the training split:
# a random baseline, logistic regression at several probability cutoffs,
# a random forest, and an SVC (the last two use the grid-searched settings).
#
# NOTE: despite the `mse_` prefix, every accumulator below holds mean
# *accuracy* (they are filled from accuracy_score). The names are kept
# because later cells read them.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=121)  # Need to do a stratified kfold for classification
n_folds = kfold.get_n_splits()
cutoffs = [0.4, 0.5, 0.6]  # probability thresholds above which a county is labelled +1
mse_lr = np.zeros(len(cutoffs))  # one accuracy accumulator per cutoff
mse_rf = 0
mse_svc = 0
baseline = 0
rng = np.random.default_rng(121)  # seeded so the random baseline is reproducible across runs

for train_i, test_i in kfold.split(agg_train, agg_train['winner']):
    train_var = agg_train.iloc[train_i]
    test_var = agg_train.iloc[test_i]

    # Baseline prediction: guess +1 at random with the training-fold
    # frequency of +1, ignoring the features entirely.
    base_p = (train_var[target] == 1).mean()
    base_pred = 2 * rng.binomial(1, base_p, size=len(test_var)) - 1  # Convert 1,0 draws to 1,-1
    baseline += accuracy_score(test_var[target], base_pred)

    # Logistic Regression (unpenalised, on standardised features)
    log_reg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000, penalty=None))
    log_reg.fit(train_var[features], train_var[target])
    # Column 1 is the probability of the larger label (+1); it does not
    # depend on the cutoff, so compute it once per fold.
    pos_proba = log_reg.predict_proba(test_var[features])[:, 1]
    for i, cutoff in enumerate(cutoffs):
        mse_pred = 2 * (pos_proba >= cutoff) - 1  # Convert True,False to 1,-1
        mse_lr[i] += accuracy_score(test_var[target], mse_pred)

    # Random Forest Classifier with the grid-searched size and depth.
    # It is fitted on the raw (unscaled) features here, whereas the grid
    # search wrapped the forest in a StandardScaler pipeline.
    rf = RandomForestClassifier(
        n_estimators=grid_rf.best_params_['rf__n_estimators'],
        max_depth=grid_rf.best_params_['rf__max_depth'],
        bootstrap=True,
        max_samples=300,
        random_state=121,
    )
    rf.fit(train_var[features], train_var[target])
    rf_pred = rf.predict(test_var[features])
    mse_rf += accuracy_score(test_var[target], rf_pred)

    # SVC with the grid-searched C and kernel, on standardised features.
    svc = make_pipeline(
        StandardScaler(),
        SVC(C=grid_svc.best_params_['svc__C'], kernel=grid_svc.best_params_['svc__kernel'], degree=2),
    )
    svc.fit(train_var[features], train_var[target])
    svc_pred = svc.predict(test_var[features])
    mse_svc += accuracy_score(test_var[target], svc_pred)

# Turn the per-fold sums into means over the folds.
mse_lr /= n_folds
baseline /= n_folds
mse_rf /= n_folds
mse_svc /= n_folds

In [13]:
# Report the cross-validated accuracy of each model, one per line.
# (mse_lr holds one value per probability cutoff.)
cv_report = [
    'Baseline:', baseline,
    '\nLogisticRegression:', mse_lr,
    '\nRandomForest:', mse_rf,
    '\nSVC:', mse_svc,
]
print(*cv_report)

Baseline: 0.7050607238573533 
LogisticRegression: [0.90489413 0.90609573 0.90047806] 
RandomForest: 0.9225543456390692 
SVC: 0.9265599472036442


In [14]:
#Winning Classifier
# Refit the SVC on the whole training split and score it on the held-out
# test split. Hyperparameters are read from the grid search instead of being
# typed in by hand, so this cell stays in sync if the search result changes.
best_svc_params = grid_svc.best_params_
svc = make_pipeline(
    StandardScaler(),
    SVC(C=best_svc_params['svc__C'], kernel=best_svc_params['svc__kernel'], degree=2),
)
svc.fit(agg_train[features], agg_train[target])
svc_pred = svc.predict(agg_test[features])
# NOTE: `mse_svc` is an accuracy (not a mean squared error); from here on it
# holds the test-set score, replacing the cross-validated value.
mse_svc = accuracy_score(agg_test[target], svc_pred)

In [15]:
# Show the SVC's accuracy on the held-out test split (the name says "mse", but the value comes from accuracy_score).
mse_svc

0.9197431781701445