In [1]:
import numpy as np
import pandas as pd

from sklearn import naive_bayes
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
sf1_numbers = pd.read_csv('sf1_numbers_07.csv')
sf1_percentages = pd.read_csv('sf1_percentages_07.csv')

acs1_numbers = pd.read_csv('acs1_numbers_07.csv')
acs1_percentages = pd.read_csv('acs1_percentages_07.csv')

social_numbers = pd.read_csv('social_numbers_07.csv')
social_percentages = pd.read_csv('social_percentages_07.csv')

economic_numbers = pd.read_csv('economic_numbers_07.csv')
economic_percentages = pd.read_csv('economic_percentages_07.csv')

housing_numbers = pd.read_csv('housing_numbers_07.csv')
housing_percentages = pd.read_csv('housing_percentages_07.csv')

In [3]:
predictions = pd.read_csv('predictions.csv')

In [4]:
full_dfs = [sf1_numbers, sf1_percentages, acs1_numbers, acs1_percentages,
           social_numbers, social_percentages, economic_numbers,
           economic_percentages, housing_numbers, housing_percentages]

dfs = [sf1_numbers, acs1_numbers,social_numbers,economic_numbers,housing_numbers]

dfs_numbers = ['sf1_numbers', 'acs1_numbers', 'social_numbers', 'economic_numbers',
              'housing_numbers']

names = ['sf1_numbers', 'sf1_percentages', 'acs1_numbers',
         'acs1_percentages','social_numbers','social_percentages',
         'economic_numbers','economic_percentages','housing_numbers',
         'housing_percentages']

In [5]:
for df in full_dfs:
    df.drop(['Unnamed: 0'], axis=1, inplace=True)
    
predictions.drop(['Unnamed: 0'], axis=1, inplace=True)

In [6]:
for index, df in enumerate(full_dfs):
    df.name = names[index]

In [7]:
y = predictions.loc[:,'Democrat']

scale = StandardScaler()

kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

models = {}

In [12]:
nbg = naive_bayes.GaussianNB()
nbb = naive_bayes.BernoulliNB()
nbm = naive_bayes.MultinomialNB()

In [13]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    print(df.name,'\n')
    
    model1 = nbg.fit(X_train, y_train)
    
    print('Gaussian Naive Bayes')
    print("Cross Val Score:", np.mean(cross_val_score(model1,X_train,y_train,scoring='roc_auc',cv=kf)))
    print("AUC Score:", roc_auc_score(y_test, model1.predict(X_test)),'\n')
    
    if roc_auc_score(y_test, model1.predict(X_test)) >= 0.75:
        models["Gaussian Naive Bayes:".format(df.name)] = model1
    
    model2 = nbb.fit(X_train, y_train)
    
    print('Bernoulli Naive Bayes')
    print("Cross Val Score:", np.mean(cross_val_score(model2,X_train,y_train,scoring='roc_auc',cv=kf)))
    print("AUC Score:", roc_auc_score(y_test, model2.predict(X_test)),'\n')
    
    if roc_auc_score(y_test, model2.predict(X_test)) >= 0.75:
        models["Bernoulli Naive Bayes:".format(df.name)] = model2
    
    model3 = nbm.fit(X_train, y_train)
    
    print('Multinomial Naive Bayes')
    print("Cross Val Score:", np.mean(cross_val_score(model3,X_train,y_train,scoring='roc_auc',cv=kf)))
    print("AUC Score:", roc_auc_score(y_test, model3.predict(X_test)),'\n')
    
    if roc_auc_score(y_test, model3.predict(X_test)) >= 0.75:
        models["Multinomial Naive Bayes:".format(df.name)] = model3

sf1_numbers 

Gaussian Naive Bayes
Cross Val Score: 0.8883482321019077
AUC Score: 0.788562091503268 

Bernoulli Naive Bayes
Cross Val Score: 0.5007692307692307
AUC Score: 0.5 

Multinomial Naive Bayes
Cross Val Score: 0.737861970681106
AUC Score: 0.7594771241830065 

acs1_numbers 

Gaussian Naive Bayes
Cross Val Score: 0.9146520232277998
AUC Score: 0.7895424836601308 

Bernoulli Naive Bayes
Cross Val Score: 0.5383230176929881
AUC Score: 0.5704248366013072 

Multinomial Naive Bayes
Cross Val Score: 0.8142354868273083
AUC Score: 0.8001633986928105 

social_numbers 

Gaussian Naive Bayes
Cross Val Score: 0.897576562493747
AUC Score: 0.7720588235294117 

Bernoulli Naive Bayes
Cross Val Score: 0.49923076923076926
AUC Score: 0.5 

Multinomial Naive Bayes
Cross Val Score: 0.8019401127772909
AUC Score: 0.7839869281045752 

economic_numbers 

Gaussian Naive Bayes
Cross Val Score: 0.8944264574403797
AUC Score: 0.7859477124183006 

Bernoulli Naive Bayes
Cross Val Score: 0.5192444345727629
AUC Sco

In [8]:
rf = RandomForestClassifier(n_estimators=10)

params = {'max_depth': range(1,21)}

grid = GridSearchCV(rf, params, scoring='roc_auc', cv=kf)

In [9]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best Depth:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models["Random Forest {}".format(df.name)] = model.best_estimator_

sf1_numbers Cross Val Score: 0.9566226827228231
sf1_numbers AUC Score: 0.9245098039215686
sf1_numbers Best Depth {'max_depth': 8} 

acs1_numbers Cross Val Score: 0.9458523890136188
acs1_numbers AUC Score: 0.8751633986928105
acs1_numbers Best Depth {'max_depth': 12} 

social_numbers Cross Val Score: 0.9427918062375686
social_numbers AUC Score: 0.8357843137254901
social_numbers Best Depth {'max_depth': 12} 

economic_numbers Cross Val Score: 0.9349593657578051
economic_numbers AUC Score: 0.853921568627451
economic_numbers Best Depth {'max_depth': 8} 

housing_numbers Cross Val Score: 0.9359695357241478
housing_numbers AUC Score: 0.86781045751634
housing_numbers Best Depth {'max_depth': 16} 



In [10]:
dt = DecisionTreeClassifier()

params = {'max_depth': range(1,21)}

grid = GridSearchCV(dt, params, scoring='roc_auc', cv=kf)

In [11]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best Depth:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models["Decision Tree {}".format(df.name)] = model.best_estimator_

sf1_numbers Cross Val Score: 0.9385017532906718
sf1_numbers AUC Score: 0.930392156862745
sf1_numbers Best Depth {'max_depth': 15} 

acs1_numbers Cross Val Score: 0.881475769691003
acs1_numbers AUC Score: 0.8093137254901961
acs1_numbers Best Depth {'max_depth': 4} 

social_numbers Cross Val Score: 0.8842701412317456
social_numbers AUC Score: 0.77140522875817
social_numbers Best Depth {'max_depth': 4} 

economic_numbers Cross Val Score: 0.878266804336603
economic_numbers AUC Score: 0.8331699346405229
economic_numbers Best Depth {'max_depth': 4} 

housing_numbers Cross Val Score: 0.8802471599572582
housing_numbers AUC Score: 0.8112745098039216
housing_numbers Best Depth {'max_depth': 4} 



In [None]:
lvm = LinearSVC()

params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}

grid = GridSearchCV(lvm, params, scoring='roc_auc', cv=kf)

In [None]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best C:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models["Linear SVM {}".format(df.name)] = model.best_estimator_

In [14]:
svm = SVC(kernel = 'rbf', gamma = 'scale')

params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}

grid = GridSearchCV(svm, params, scoring='roc_auc', cv=kf)

In [15]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best C:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models["RBF SVM {}".format(df.name)] = model.best_estimator_

KeyboardInterrupt: 

In [None]:
lr = LogisticRegression()

params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'penalty': ['l1','l2']}

grid = GridSearchCV(lr, params, scoring='roc_auc', cv=kf)

In [None]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best C And Penalty:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models["Logistic Regression {}".format(df.name)] = model.best_estimator_

In [None]:
knn = KNeighborsClassifier()

params = {'n_neighbors': range(1,11)}

grid = GridSearchCV(knn, params, scoring='roc_auc', cv=kf)

In [None]:
for df in dfs:
    X = df.loc[:]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
    
    if df.name in dfs_numbers:
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
    
    model = grid.fit(X_train, y_train)
    
    print(df.name, "Cross Val Score:", np.mean(cross_val_score(model, X_train, y_train, scoring='roc_auc',cv=kf)))
    print(df.name, "AUC Score:", roc_auc_score(y_test, model.predict(X_test)))
    print(df.name, "Best K:", model.best_params_, '\n')
    
    if roc_auc_score(y_test, model.predict(X_test)) >= 0.75:
        models["K Nearest Neighbor {}".format(df.name)] = model.best_estimator_