In [45]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis


In [3]:
train_df = pd.read_csv('CAH-201803-train.csv')

train_df.head()

Unnamed: 0,id_num,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes


In [4]:
test_df = pd.read_csv('CAH-201803-test.csv')
test_df.head()

Unnamed: 0,id_num,Q1,Q2,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,2,Female,78,Conservative,College degree,White,Yes,Yes,No,"Yes, very religious",Pro-Choice,Yes,Yes,Behave no differently,4,5,1,Yes
1,3,Male,59,Moderate,High school or less,Black,Yes,Yes,Yes,"Yes, very religious",Pro-Choice,No,No,More Willing,5,4,5,No
2,4,Male,59,Moderate,High school or less,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,No,Behave no differently,4,5,1,Yes
3,6,Male,52,Moderate,Graduate degree,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-Choice,No,Yes,Less Willing,5,4,4,No
4,11,Female,33,Moderate,High school or less,White,No,No,Yes,"Yes, somewhat religious",Pro-Choice,No,No,More Willing,5,5,4,Yes


In [11]:
ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
  ],
  remainder = "passthrough"
)

X_train = train_df.drop(['id_num', 'political_affiliation'], axis = 1)
y_train = train_df['political_affiliation']

In [38]:
logit_pipeline = Pipeline([
    ('preprocessor', ct),
    ('logit', LogisticRegression())
])

logit_params = {'logit__C': [0.01, 0.1, 1, 10, 100]}
logit_gscv = GridSearchCV(logit_pipeline, logit_params, cv=5, scoring='accuracy')
logit_gscv.fit(X_train, y_train)

print(f"Best Logitistic parameters: {logit_gscv.best_params_}")
print(f"Best Model Accuracy: {logit_gscv.best_score_.round(5)}")

Best Logitistic parameters: {'logit__C': 1}
Best Model Accuracy: 0.61569
Best Logitistic parameters: {'logit__C': 1}
Best Model Accuracy: 0.61569


In [16]:
tree_pipeline = Pipeline([
    ('preprocessor', ct),
    ('tree', DecisionTreeClassifier(random_state=10))
])

tree_params = {'tree__max_depth': range(1, 20), 'tree__min_samples_split': range(2, 10)}
tree_gscv = GridSearchCV(tree_pipeline, tree_params, cv=5, scoring='accuracy')
tree_gscv.fit(X_train, y_train)

print(f"Best Tree parameters: {tree_gscv.best_params_}")
print(f"Best Model Accuracy: {tree_gscv.best_score_.round(5)}")

Best Tree parameters: {'tree__max_depth': 1, 'tree__min_samples_split': 2}
Best Model Accuracy: 0.55633
Best Tree parameters: {'tree__max_depth': 1, 'tree__min_samples_split': 2}
Best Model Accuracy: 0.55633


In [18]:
KNN_pipeline = Pipeline(steps=[
    ('preprocessor', ct),
    ('knn', KNeighborsClassifier())
])

knn_params = {'knn__n_neighbors': range(1, 21)}
knn_gscv = GridSearchCV(KNN_pipeline, knn_params, cv=5, scoring='accuracy')
knn_gscv.fit(X_train, y_train)

print(f"Best KNN parameters: {knn_gscv.best_params_}")
print(f"Best Model Accuracy: {knn_gscv.best_score_.round(5)}")

Best KNN parameters: {'knn__n_neighbors': 17}
Best Model Accuracy: 0.49073
Best KNN parameters: {'knn__n_neighbors': 17}
Best Model Accuracy: 0.49073


In [34]:
import warnings
warnings.filterwarnings("ignore")

lda_pipeline = Pipeline([
    ('preprocessing', ct),
    ('lda', LinearDiscriminantAnalysis())
])

lda_model = lda_pipeline.fit(X_train, y_train)

cv_scores = cross_val_score(lda_model, X_train, y_train, cv=5, scoring="accuracy")

param_grid = {
    'lda__solver': ['svd', 'lsqr', 'eigen'],
    'lda__shrinkage': [None, 'auto'],
    'lda__tol': [.00001,.0001,.001,.01]
}

gscv = GridSearchCV(lda_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

gscv.fit(X_train, y_train)

print("Best Parameters:", gscv.best_params_)
print("Best Cross-Validated Accuracy:", gscv.best_score_)


Best Parameters: {'lda__shrinkage': 'auto', 'lda__solver': 'lsqr', 'lda__tol': 1e-05}
Best Cross-Validated Accuracy: 0.633511586452763
Best Parameters: {'lda__shrinkage': 'auto', 'lda__solver': 'lsqr', 'lda__tol': 1e-05}
Best Cross-Validated Accuracy: 0.633511586452763


In [35]:
qda_pipeline = Pipeline([
    ('preprocessing', ct),
    ('qda', QuadraticDiscriminantAnalysis())
])

param_grid = {
    'qda__reg_param': [0.0, 0.1, 0.2, 0.5, 1.0],
    'qda__tol': [.00001,.0001,.001,.01]
}

qda_gscv = GridSearchCV(qda_pipeline, param_grid, cv=5, scoring='accuracy', verbose=0, n_jobs=-1)
qda_gscv.fit(X_train, y_train)

print("Best Parameters:", qda_gscv.best_params_)
print("Best Cross-Validated Accuracy:", qda_gscv.best_score_)

Best Parameters: {'qda__reg_param': 0.5, 'qda__tol': 1e-05}
Best Cross-Validated Accuracy: 0.5800356506238858
Best Parameters: {'qda__reg_param': 0.5, 'qda__tol': 1e-05}
Best Cross-Validated Accuracy: 0.5800356506238858


In [37]:
svm_pipeline = Pipeline(
[
    ('preprocessing', ct),
    ('svc', SVC(kernel='poly'))
])

param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__degree': [2, 3, 4],
    'svc__gamma': ['scale', 'auto']
}

svm_gscv = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

svm_gscv.fit(X_train, y_train)

print("Best Parameters:", svm_gscv.best_params_)
print("Best Cross-Validated Accuracy:", svm_gscv.best_score_)

Best Parameters: {'svc__C': 0.1, 'svc__degree': 2, 'svc__gamma': 'auto'}
Best Cross-Validated Accuracy: 0.6153297682709448
Best Parameters: {'svc__C': 0.1, 'svc__degree': 2, 'svc__gamma': 'auto'}
Best Cross-Validated Accuracy: 0.6153297682709448


In [46]:
svc_pipeline = Pipeline([
    ('preprocessing', ct),
    ('svc', LinearSVC())
])

param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__max_iter': [1000, 5000],
}

svc_gscv = GridSearchCV(svc_pipeline, param_grid, cv=5, scoring='accuracy', verbose=0, n_jobs=-1)

svc_gscv.fit(X_train, y_train)

print("Best Parameters:", svc_gscv.best_params_)
print("Best Cross-Validated Accuracy:", svc_gscv.best_score_)

Best Parameters: {'svc__C': 0.1, 'svc__max_iter': 5000}
Best Cross-Validated Accuracy: 0.6452762923351159
Best Parameters: {'svc__C': 0.1, 'svc__max_iter': 5000}
Best Cross-Validated Accuracy: 0.6452762923351159


Best model appears to be the SVC model based on the cross validation accuracy measure for the above models.

In [39]:
X_test = test_df.drop(['id_num'],axis=1)

In [43]:
lda_pipeline = Pipeline([
    ('preprocessing', ct),
    ('lda', LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto', tol=0.00001))
])

lda_model = lda_pipeline.fit(X_train, y_train)

y_pred = lda_pipeline.predict(X_test)

final_predictions = pd.DataFrame(
    {"id_num": test_df['id_num'],
    "political_affiliation_predicted": lda_model.predict(test_df)}
)

final_predictions

Unnamed: 0,id_num,political_affiliation_predicted
0,2,Republican
1,3,Democrat
2,4,Independent
3,6,Republican
4,11,Independent
...,...,...
161,327,Democrat
162,330,Independent
163,331,Democrat
164,333,Democrat


In [44]:
final_predictions.to_csv('political_predictions.csv',index=False)

In [47]:
svc_pipeline = Pipeline([
    ('preprocessing', ct),
    ('svc', LinearSVC(C=0.1, max_iter=5000))
])

svc_model = svc_pipeline.fit(X_train, y_train)

final_predictions = pd.DataFrame(
    {"id_num": test_df['id_num'],
    "political_affiliation_predicted": svc_model.predict(test_df)}
)

final_predictions

Unnamed: 0,id_num,political_affiliation_predicted
0,2,Republican
1,3,Democrat
2,4,Democrat
3,6,Republican
4,11,Independent
...,...,...
161,327,Democrat
162,330,Independent
163,331,Democrat
164,333,Democrat


In [48]:
final_predictions.to_csv('political_predictions_svc.csv',index=False)