In [80]:
import numpy as np
import pandas as pd

from sklearn import naive_bayes
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier

In [81]:
sf1_numbers = pd.read_csv('sf1_numbers_short.csv')
sf1_percentages = pd.read_csv('sf1_percentages_short.csv')

acs1_numbers = pd.read_csv('acs1_numbers_short.csv')
acs1_percentages = pd.read_csv('acs1_percentages_short.csv')

social_numbers = pd.read_csv('social_numbers_short.csv')
social_percentages = pd.read_csv('social_percentages_short.csv')

economic_numbers = pd.read_csv('economic_numbers_short.csv')
economic_percentages = pd.read_csv('economic_percentages_short.csv')

housing_numbers = pd.read_csv('housing_numbers_short.csv')
housing_percentages = pd.read_csv('housing_percentages_short.csv')

In [82]:
predictions = pd.read_csv('predictions.csv')

In [83]:
full_dfs = [sf1_numbers, sf1_percentages, acs1_numbers, acs1_percentages,
           social_numbers, social_percentages, economic_numbers,
           economic_percentages, housing_numbers, housing_percentages]

dfs = [sf1_numbers, sf1_percentages, acs1_numbers, acs1_percentages]

dfs_numbers = ['sf1_numbers', 'acs1_numbers', 'social_numbers', 'economic_numbers',
              'housing_numbers']

names = ['sf1_numbers', 'sf1_percentages', 'acs1_numbers',
         'acs1_percentages','social_numbers','social_percentages',
         'economic_numbers','economic_percentages','housing_numbers',
         'housing_percentages']

In [84]:
for df in full_dfs:
    df.drop(['Unnamed: 0'], axis=1, inplace=True)
    
predictions.drop(['Unnamed: 0'], axis=1, inplace=True)

In [85]:
for index, df in enumerate(full_dfs):
    df.name = names[index]

In [86]:
y = predictions.loc[:,'Democrat']

scale = StandardScaler()

kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [87]:
rf = RandomForestClassifier()

params = {'max_depth': range(1,21)}

grid = GridSearchCV(rf, params, scoring='roc_auc', cv=kf)

In [88]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(grid, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, grid.predict(X_test)))
    print(df.name, "Best Depth", grid.best_estimator_, '\n')

sf1_numbers Cross Val Score: 0.9557031740123982
sf1_numbers AUC Score: 0.9217320261437909
sf1_numbers Best Depth RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=16, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 

sf1_percentages Cross Val Score: 0.9606239849843721
sf1_percentages AUC Score: 0.9215686274509804
sf1_percentages Best Depth RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           

In [89]:
dt = DecisionTreeClassifier()

params = {'max_depth': range(1,21)}

grid = GridSearchCV(dt, params, scoring='roc_auc', cv=kf)

In [90]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(grid, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, grid.predict(X_test)))
    print(df.name, "Best Depth", grid.best_estimator_, '\n')

sf1_numbers Cross Val Score: 0.9275086463923673
sf1_numbers AUC Score: 0.930392156862745
sf1_numbers Best Depth DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=17,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') 

sf1_percentages Cross Val Score: 0.9353956657982255
sf1_percentages AUC Score: 0.930392156862745
sf1_percentages Best Depth DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=14,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') 

acs1_numbers Cross Val Score: 0.8620929656265381
acs1_numbers AU

In [91]:
nbg = naive_bayes.GaussianNB()
nbb = naive_bayes.BernoulliNB()
nbm = naive_bayes.MultinomialNB()

In [92]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    print(df.name,'\n')
    
    nbg.fit(X_train, y_train)
    
    print('Gaussian Naive Bayes')
    print("Cross Val Score:", np.mean(cross_val_score(nbg,X_train,y_train,scoring='roc_auc',cv=kf)))
    print("AUC Score:", roc_auc_score(y_test, nbg.predict(X_test)),'\n')
    
    nbb.fit(X_train, y_train)
    
    print('Bernoulli Naive Bayes')
    print("Cross Val Score:", np.mean(cross_val_score(nbb,X_train,y_train,scoring='roc_auc',cv=kf)))
    print("AUC Score:", roc_auc_score(y_test, nbb.predict(X_test)),'\n')
    
    nbm.fit(X_train, y_train)
    
    print('Multinomial Naive Bayes')
    print("Cross Val Score:", np.mean(cross_val_score(nbm,X_train,y_train,scoring='roc_auc',cv=kf)))
    print("AUC Score:", roc_auc_score(y_test, nbm.predict(X_test)),'\n')

sf1_numbers 

Gaussian Naive Bayes
Cross Val Score: 0.7539096989270548
AUC Score: 0.7192810457516341 

Bernoulli Naive Bayes
Cross Val Score: 0.5
AUC Score: 0.5 

Multinomial Naive Bayes
Cross Val Score: 0.6845638992608245
AUC Score: 0.6643790849673202 

sf1_percentages 

Gaussian Naive Bayes
Cross Val Score: 0.893437566283672
AUC Score: 0.8034313725490196 

Bernoulli Naive Bayes
Cross Val Score: 0.5708985228496075
AUC Score: 0.5766339869281045 

Multinomial Naive Bayes
Cross Val Score: 0.8752573051109964
AUC Score: 0.7722222222222223 

acs1_numbers 

Gaussian Naive Bayes
Cross Val Score: 0.8538940221632589
AUC Score: 0.7807189542483661 

Bernoulli Naive Bayes
Cross Val Score: 0.5374782037274934
AUC Score: 0.5704248366013072 

Multinomial Naive Bayes
Cross Val Score: 0.7936862390094168
AUC Score: 0.7928104575163398 

acs1_percentages 

Gaussian Naive Bayes
Cross Val Score: 0.8842828124687342
AUC Score: 0.8058823529411766 

Bernoulli Naive Bayes
Cross Val Score: 0.6219215993724813
AUC S

In [93]:
lvm = LinearSVC()

params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}

grid = GridSearchCV(lvm, params, scoring='roc_auc', cv=kf)

In [94]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(grid, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, grid.predict(X_test)))
    print(df.name, "Best Model", grid.best_estimator_, '\n')

sf1_numbers Cross Val Score: 0.7864886458320827
sf1_numbers AUC Score: 0.6934640522875817
sf1_numbers Best Model LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0) 

sf1_percentages Cross Val Score: 0.8854370796364555
sf1_percentages AUC Score: 0.6647058823529411
sf1_percentages Best Model LinearSVC(C=10, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0) 

acs1_numbers Cross Val Score: 0.9144063704361816
acs1_numbers AUC Score: 0.8179738562091503
acs1_numbers Best Model LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=

In [95]:
svm = SVC(kernel='rbf')

params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}

grid = GridSearchCV(svm, params, scoring='roc_auc', cv=kf)

In [96]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(grid, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, grid.predict(X_test)))
    print(df.name, "Best Model", grid.best_estimator_, '\n')

sf1_numbers Cross Val Score: 0.9198838537977292
sf1_numbers AUC Score: 0.8753267973856209
sf1_numbers Best Model SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

sf1_percentages Cross Val Score: 0.9340465236340061
sf1_percentages AUC Score: 0.8866013071895424
sf1_percentages Best Model SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

acs1_numbers Cross Val Score: 0.940284759858008
acs1_numbers AUC Score: 0.8787581699346406
acs1_numbers Best Model SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking

In [97]:
lr = LogisticRegression()

params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'penalty': ['l1','l2']}

grid = GridSearchCV(lr, params, scoring='roc_auc', cv=kf)

In [98]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(grid, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, grid.predict(X_test)))
    print(df.name, "Best Model", grid.best_estimator_, '\n')

sf1_numbers Cross Val Score: 0.7878251551788308
sf1_numbers AUC Score: 0.7055555555555556
sf1_numbers Best Model LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) 

sf1_percentages Cross Val Score: 0.9010442768926616
sf1_percentages AUC Score: 0.7929738562091503
sf1_percentages Best Model LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) 

acs1_numbers Cross Val Score: 0.9169657778151301
acs1_numbers AUC Score: 0.8122549019607843
acs1_numbers Best Model LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_i

In [99]:
knn = KNeighborsClassifier()

params = {'n_neighbors': range(1,11)}

grid = GridSearchCV(knn, params, scoring='roc_auc', cv=kf)

In [100]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(grid, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, grid.predict(X_test)))
    print(df.name, "Best Model", grid.best_estimator_, '\n')

sf1_numbers Cross Val Score: 0.9251990002921484
sf1_numbers AUC Score: 0.9238562091503268
sf1_numbers Best Model KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform') 

sf1_percentages Cross Val Score: 0.9400242723303439
sf1_percentages AUC Score: 0.9238562091503268
sf1_percentages Best Model KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform') 

acs1_numbers Cross Val Score: 0.9170514189208119
acs1_numbers AUC Score: 0.8383986928104575
acs1_numbers Best Model KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 

acs1_percentages Cross Val Score: 0.9193383382758441
acs1_percentages AUC Score: 0.8277777777777778
acs1_percentages Best Model KNeighborsClassifier

In [106]:
classifiers

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,