In [1]:
import numpy as np
import pandas as pd

from sklearn import naive_bayes
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
sf1_numbers = pd.read_csv('sf1_numbers_07.csv')
sf1_percentages = pd.read_csv('sf1_percentages_07.csv')

acs1_numbers = pd.read_csv('acs1_numbers_07.csv')
acs1_percentages = pd.read_csv('acs1_percentages_07.csv')

social_numbers = pd.read_csv('social_numbers_07.csv')
social_percentages = pd.read_csv('social_percentages_07.csv')

economic_numbers = pd.read_csv('economic_numbers_07.csv')
economic_percentages = pd.read_csv('economic_percentages_07.csv')

housing_numbers = pd.read_csv('housing_numbers_07.csv')
housing_percentages = pd.read_csv('housing_percentages_07.csv')

In [3]:
predictions = pd.read_csv('predictions.csv')

In [4]:
full_dfs = [sf1_numbers, sf1_percentages, acs1_numbers, acs1_percentages,
           social_numbers, social_percentages, economic_numbers,
           economic_percentages, housing_numbers, housing_percentages]

dfs = [sf1_numbers, acs1_numbers,social_numbers,economic_numbers,housing_numbers]

dfs_numbers = ['sf1_numbers', 'acs1_numbers', 'social_numbers', 'economic_numbers',
              'housing_numbers']

names = ['sf1_numbers', 'sf1_percentages', 'acs1_numbers',
         'acs1_percentages','social_numbers','social_percentages',
         'economic_numbers','economic_percentages','housing_numbers',
         'housing_percentages']

In [5]:
for df in full_dfs:
    df.drop(['Unnamed: 0'], axis=1, inplace=True)
    
predictions.drop(['Unnamed: 0'], axis=1, inplace=True)

In [6]:
for index, df in enumerate(full_dfs):
    df.name = names[index]

In [7]:
y = predictions.loc[:,'Democrat']

scale = StandardScaler()

kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

models = {}

In [8]:
nbg = naive_bayes.GaussianNB()
nbb = naive_bayes.BernoulliNB()
nbm = naive_bayes.MultinomialNB()

In [9]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    print(df.name,'\n')
    
    model1 = nbg.fit(X_train, y_train)
    
    print('Gaussian Naive Bayes')
    print("Cross Val Score:", np.mean(cross_val_score(model1,X_train,y_train,scoring='roc_auc',cv=kf)))
    print("AUC Score:", roc_auc_score(y_test, model1.predict(X_test)),'\n')
    
    if roc_auc_score(y_test, model1.predict(X_test)) >= 0.75:
        models[("{}".format(df.name), "Gaussian Naive Bayes")] = model1
    
    model2 = nbb.fit(X_train, y_train)
    
    print('Bernoulli Naive Bayes')
    print("Cross Val Score:", np.mean(cross_val_score(model2,X_train,y_train,scoring='roc_auc',cv=kf)))
    print("AUC Score:", roc_auc_score(y_test, model2.predict(X_test)),'\n')
    
    if roc_auc_score(y_test, model2.predict(X_test)) >= 0.75:
        models[("{}".format(df.name), "Bernoulli Naive Bayes")] = model2
    
    model3 = nbm.fit(X_train, y_train)
    
    print('Multinomial Naive Bayes')
    print("Cross Val Score:", np.mean(cross_val_score(model3,X_train,y_train,scoring='roc_auc',cv=kf)))
    print("AUC Score:", roc_auc_score(y_test, model3.predict(X_test)),'\n')
    
    if roc_auc_score(y_test, model3.predict(X_test)) >= 0.75:
        models[("{}".format(df.name), "Multinomial Naive Bayes")] = model3

sf1_numbers 

Gaussian Naive Bayes
Cross Val Score: 0.8883482321019077
AUC Score: 0.788562091503268 

Bernoulli Naive Bayes
Cross Val Score: 0.5
AUC Score: 0.5 

Multinomial Naive Bayes
Cross Val Score: 0.737861970681106
AUC Score: 0.7594771241830065 

acs1_numbers 

Gaussian Naive Bayes
Cross Val Score: 0.9146520232277998
AUC Score: 0.7895424836601308 

Bernoulli Naive Bayes
Cross Val Score: 0.5384004570321724
AUC Score: 0.5704248366013072 

Multinomial Naive Bayes
Cross Val Score: 0.8142354868273083
AUC Score: 0.8001633986928105 

social_numbers 

Gaussian Naive Bayes
Cross Val Score: 0.897576562493747
AUC Score: 0.7720588235294117 

Bernoulli Naive Bayes
Cross Val Score: 0.5
AUC Score: 0.5 

Multinomial Naive Bayes
Cross Val Score: 0.8019401127772909
AUC Score: 0.7839869281045752 

economic_numbers 

Gaussian Naive Bayes
Cross Val Score: 0.8944264574403797
AUC Score: 0.7859477124183006 

Bernoulli Naive Bayes
Cross Val Score: 0.5184906916713691
AUC Score: 0.5089869281045751 

Multin

In [10]:
rf = RandomForestClassifier(n_estimators=10)

params = {'max_depth': range(1,21)}

grid = GridSearchCV(rf, params, scoring='roc_auc', cv=kf)

In [11]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best Depth:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models[("{}".format(df.name), "Random Forest")] = model.best_estimator_

sf1_numbers Cross Val Score: 0.9570159144845581
sf1_numbers AUC Score: 0.9249999999999999
sf1_numbers Best Depth: {'max_depth': 19} 

acs1_numbers Cross Val Score: 0.9455194846982267
acs1_numbers AUC Score: 0.8437908496732026
acs1_numbers Best Depth: {'max_depth': 19} 

social_numbers Cross Val Score: 0.9477111268524412
social_numbers AUC Score: 0.8501633986928105
social_numbers Best Depth: {'max_depth': 17} 

economic_numbers Cross Val Score: 0.9249937432215566
economic_numbers AUC Score: 0.8617647058823529
economic_numbers Best Depth: {'max_depth': 12} 

housing_numbers Cross Val Score: 0.9329098766173216
housing_numbers AUC Score: 0.853921568627451
housing_numbers Best Depth: {'max_depth': 11} 



In [12]:
dt = DecisionTreeClassifier()

params = {'max_depth': range(1,21)}

grid = GridSearchCV(dt, params, scoring='roc_auc', cv=kf)

In [13]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best Depth:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models[("{}".format(df.name), "Decision Tree")] = model.best_estimator_

sf1_numbers Cross Val Score: 0.9389530353419537
sf1_numbers AUC Score: 0.930392156862745
sf1_numbers Best Depth: {'max_depth': 20} 

acs1_numbers Cross Val Score: 0.8762686108543141
acs1_numbers AUC Score: 0.8065359477124183
acs1_numbers Best Depth: {'max_depth': 4} 

social_numbers Cross Val Score: 0.8940864455143214
social_numbers AUC Score: 0.8334967320261438
social_numbers Best Depth: {'max_depth': 5} 

economic_numbers Cross Val Score: 0.878266804336603
economic_numbers AUC Score: 0.8331699346405229
economic_numbers Best Depth: {'max_depth': 4} 

housing_numbers Cross Val Score: 0.878140809931445
housing_numbers AUC Score: 0.8112745098039216
housing_numbers Best Depth: {'max_depth': 4} 



In [14]:
lvm = LinearSVC()

params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}

grid = GridSearchCV(lvm, params, scoring='roc_auc', cv=kf)

In [15]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best C:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models[("{}".format(df.name), "Linear SVM")] = model.best_estimator_



sf1_numbers Cross Val Score: 0.922060569169138
sf1_numbers AUC Score: 0.8297385620915031
sf1_numbers Best C: {'C': 0.1} 





acs1_numbers Cross Val Score: 0.9495284892725504
acs1_numbers AUC Score: 0.867156862745098
acs1_numbers Best C: {'C': 0.01} 





social_numbers Cross Val Score: 0.9315897043698198
social_numbers AUC Score: 0.8271241830065359
social_numbers Best C: {'C': 0.1} 





economic_numbers Cross Val Score: 0.9347344891204733
economic_numbers AUC Score: 0.8467320261437907
economic_numbers Best C: {'C': 0.01} 





housing_numbers Cross Val Score: 0.9150969276392406
housing_numbers AUC Score: 0.8163398692810457
housing_numbers Best C: {'C': 0.1} 





In [16]:
svm = SVC(kernel = 'rbf', gamma = 'scale')

params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}

grid = GridSearchCV(svm, params, scoring='roc_auc', cv=kf)

In [17]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best C:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models[("{}".format(df.name), "RBF SVM")] = model.best_estimator_

sf1_numbers Cross Val Score: 0.9631630924509651
sf1_numbers AUC Score: 0.9330065359477124
sf1_numbers Best C: {'C': 10} 

acs1_numbers Cross Val Score: 0.9629939425228017
acs1_numbers AUC Score: 0.9080065359477124
acs1_numbers Best C: {'C': 10} 

social_numbers Cross Val Score: 0.9571168033360948
social_numbers AUC Score: 0.9019607843137255
social_numbers Best C: {'C': 10} 

economic_numbers Cross Val Score: 0.9587816794931827
economic_numbers AUC Score: 0.9021241830065361
economic_numbers Best C: {'C': 10} 

housing_numbers Cross Val Score: 0.9399794815766409
housing_numbers AUC Score: 0.8679738562091504
housing_numbers Best C: {'C': 10} 



In [18]:
lr = LogisticRegression(solver='liblinear')

params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'penalty': ['l1','l2']}

grid = GridSearchCV(lr, params, scoring='roc_auc', cv=kf)

In [19]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best C And Penalty:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models[("{}".format(df.name), "Logistic Regression")] = model.best_estimator_

sf1_numbers Cross Val Score: 0.9234270681506205
sf1_numbers AUC Score: 0.832516339869281
sf1_numbers Best C And Penalty: {'C': 10, 'penalty': 'l1'} 

acs1_numbers Cross Val Score: 0.9488972005778937
acs1_numbers AUC Score: 0.8586601307189542
acs1_numbers Best C And Penalty: {'C': 0.1, 'penalty': 'l2'} 

social_numbers Cross Val Score: 0.93115916405534
social_numbers AUC Score: 0.8217320261437909
social_numbers Best C And Penalty: {'C': 1, 'penalty': 'l2'} 

economic_numbers Cross Val Score: 0.9351531369935927
economic_numbers AUC Score: 0.8468954248366013
economic_numbers Best C And Penalty: {'C': 0.1, 'penalty': 'l2'} 

housing_numbers Cross Val Score: 0.9139254189128078
housing_numbers AUC Score: 0.805065359477124
housing_numbers Best C And Penalty: {'C': 10000, 'penalty': 'l1'} 



In [20]:
knn = KNeighborsClassifier()

params = {'n_neighbors': range(1,11)}

grid = GridSearchCV(knn, params, scoring='roc_auc', cv=kf)

In [21]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best K:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models[("{}".format(df.name), "K Nearest Neighbor")] = model.best_estimator_

sf1_numbers Cross Val Score: 0.9456935275119761
sf1_numbers AUC Score: 0.9163398692810457
sf1_numbers Best K: {'n_neighbors': 3} 

acs1_numbers Cross Val Score: 0.9506956886098138
acs1_numbers AUC Score: 0.8993464052287582
acs1_numbers Best K: {'n_neighbors': 5} 

social_numbers Cross Val Score: 0.9432484414082353
social_numbers AUC Score: 0.9035947712418302
social_numbers Best K: {'n_neighbors': 5} 

economic_numbers Cross Val Score: 0.9422956181740323
economic_numbers AUC Score: 0.8759803921568627
economic_numbers Best K: {'n_neighbors': 5} 

housing_numbers Cross Val Score: 0.9222702484862311
housing_numbers AUC Score: 0.8349673202614378
housing_numbers Best K: {'n_neighbors': 6} 



In [22]:
models

{('sf1_numbers',
  'Gaussian Naive Bayes'): GaussianNB(priors=None, var_smoothing=1e-09),
 ('sf1_numbers',
  'Multinomial Naive Bayes'): MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 ('acs1_numbers',
  'Gaussian Naive Bayes'): GaussianNB(priors=None, var_smoothing=1e-09),
 ('acs1_numbers',
  'Multinomial Naive Bayes'): MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 ('social_numbers',
  'Gaussian Naive Bayes'): GaussianNB(priors=None, var_smoothing=1e-09),
 ('social_numbers',
  'Multinomial Naive Bayes'): MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 ('economic_numbers',
  'Gaussian Naive Bayes'): GaussianNB(priors=None, var_smoothing=1e-09),
 ('economic_numbers',
  'Multinomial Naive Bayes'): MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 ('housing_numbers',
  'Gaussian Naive Bayes'): GaussianNB(priors=None, var_smoothing=1e-09),
 ('housing_numbers',
  'Multinomial Naive Bayes'): MultinomialNB(alpha=1.0, class_prior=None, fit_pr

In [82]:
predictions_sf1 = pd.DataFrame()
predictions_acs1 = pd.DataFrame()
predictions_social = pd.DataFrame()
predictions_economic = pd.DataFrame()
predictions_housing = pd.DataFrame()

prediction_dfs = [predictions_sf1, predictions_acs1, predictions_social, predictions_economic, predictions_housing]

for df in prediction_dfs:
    df['target'] = predictions['Democrat']

In [92]:
for key, model in models.items():
    if key[0] == 'sf1_numbers':
        predictions_sf1[key[1]] = model.predict(sf1_numbers)
    elif key[0] == 'acs1_numbers':
        predictions_acs1[key[1]] = model.predict(acs1_numbers)
    elif key[0] == 'social_numbers':
        predictions_social[key[1]] = model.predict(social_numbers)
    elif key[0] == 'economic_numbers':
        predictions_economic[key[1]] = model.predict(economic_numbers)
    elif key[0] == 'housing_numbers':
        predictions_housing[key[1]] = model.predict(housing_numbers)

In [83]:
del models[('sf1_numbers',
  'Gaussian Naive Bayes')]
del models[('sf1_numbers',
  'Multinomial Naive Bayes')]
del models[('acs1_numbers',
  'Gaussian Naive Bayes')]
del models[('acs1_numbers',
  'Multinomial Naive Bayes')]
del models[('social_numbers',
  'Gaussian Naive Bayes')]
del models[('social_numbers',
  'Multinomial Naive Bayes')]
del models[('economic_numbers',
  'Gaussian Naive Bayes')]
del models[('economic_numbers',
  'Multinomial Naive Bayes')]
del models[('housing_numbers',
  'Gaussian Naive Bayes')]
del models[('housing_numbers',
  'Multinomial Naive Bayes')]

KeyError: ('sf1_numbers', 'Gaussian Naive Bayes')

In [91]:
predictions_sf1.groupby(['target']).mean()

Unnamed: 0_level_0,Random Forest,Decision Tree,Logistic Regression,K Nearest Neighbor
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.030172,0.018319,0.056034,0.0
1,0.931624,0.921856,0.578755,0.0
