# Machine Learning: Mini-Project 4
### Richard Campo in collaboration with Julia Klauss

In [91]:
# Importing necessary libraries from scikit-learn for Support Vector Machine (SVM) classification
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Importing essential libraries for data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm

## 2 Data Analysis
### Q1.

In [92]:
vote_df = pd.read_csv("vote.csv")
work_df = pd.read_csv("work.csv")

display(vote_df.dtypes)
display(work_df.dtypes)

prtage       int64
pesex       object
ptdtrace    object
pehspnon    object
prcitshp    object
peeduca     object
vote        object
dtype: object

prtage       int64
pesex       object
ptdtrace    object
pehspnon    object
prcitshp    object
peeduca     object
work        object
dtype: object

### Q2.
#### (a.)

In [93]:
work_mapper = {'flexible': 1, "not flexible": 0}
vote_mapper = {'vote': 1, "did not vote": 0}

work_df['work'] = work_df['work'].replace(work_mapper)
vote_df['vote'] = vote_df['vote'].replace(vote_mapper)

#### (b.)

In [94]:
print(work_df.prcitshp.unique())
print(vote_df.prcitshp.unique())

# https://stackoverflow.com/a/25221271
citshp_diff = np.setxor1d(work_df.prcitshp.unique(), vote_df.prcitshp.unique())

print("prcitshp differences: ", citshp_diff)

['NATIVE, BORN IN THE UNITED' 'FOREIGN BORN, U.S. CITIZEN BY'
 'FOREIGN BORN, NOT A CITIZEN OF' 'NATIVE, BORN IN PUERTO RICO OR'
 'NATIVE, BORN ABROAD OF']
['NATIVE, BORN IN THE UNITED' 'FOREIGN BORN, U.S. CITIZEN BY'
 'NATIVE, BORN IN PUERTO RICO OR' 'NATIVE, BORN ABROAD OF']
prcitshp differences:  ['FOREIGN BORN, NOT A CITIZEN OF']


The work data has a category "FOREIGN BORN, NOT A CITIZEN OF" that the vote data does not have.

In [95]:
print(work_df.ptdtrace.unique())
print(vote_df.ptdtrace.unique())

race_diff = np.setxor1d(work_df.ptdtrace.unique(), vote_df.ptdtrace.unique())

print("ptdtrace differences: ", race_diff)

['White Only' 'Black Only' 'White-AI' 'Black-AI' 'Asian Only'
 'White-Asian' 'Hawaiian/Pacific Islander Only' 'Asian-HP'
 'White-Hawaiian' 'American Indian, Alaskan' 'White-Black' '2 or 3 Races'
 '4 or 5 Races']
['White Only' 'Black Only' 'Hawaiian/Pacific Islander Only' 'Asian Only'
 'American Indian, Alaskan' 'White-AI' 'Black-AI' 'W-B-AI' '2 or 3 Races'
 'White-Asian' 'Asian-HP' 'White-Black' 'White-Hawaiian' 'W-A-HP'
 'Black-Asian']
ptdtrace differences:  ['4 or 5 Races' 'Black-Asian' 'W-A-HP' 'W-B-AI']


The work data has a "4 or 5 races" category that the vote data does not have, and the vote data has categories for "Black-Asian," "W-A-HP," and "W-B-AI" that the work data does not have.

#### (c.) and (d.)

In [96]:
work_df = pd.get_dummies(work_df, columns=['prcitshp'], drop_first=True)
vote_df = pd.get_dummies(vote_df, columns=['prcitshp'], drop_first=True)

vote_df["prcitshp_FOREIGN BORN, NOT A CITIZEN OF"] = 0

work_df = pd.get_dummies(work_df, columns=['ptdtrace'])
vote_df = pd.get_dummies(vote_df, columns=['ptdtrace'])

vote_df['4 or 5 Races'] = 0
work_df['W-B-AI'] = 0
work_df['W-A-HP'] = 0
work_df['Black-Asian'] = 0

work_df = pd.get_dummies(work_df, columns=['pesex'], drop_first=True)
vote_df = pd.get_dummies(vote_df, columns=['pesex'], drop_first=True)
work_df = pd.get_dummies(work_df, columns=['pehspnon'], drop_first=True)
vote_df = pd.get_dummies(vote_df, columns=['pehspnon'], drop_first=True)
work_df = pd.get_dummies(work_df, columns=['peeduca'], drop_first=True)
vote_df = pd.get_dummies(vote_df, columns=['peeduca'], drop_first=True)

print(len(work_df.columns))
print(work_df.columns)

print(len(vote_df.columns))
print(vote_df.columns)

39
Index(['prtage', 'work', 'prcitshp_FOREIGN BORN, U.S. CITIZEN BY',
       'prcitshp_NATIVE, BORN ABROAD OF',
       'prcitshp_NATIVE, BORN IN PUERTO RICO OR',
       'prcitshp_NATIVE, BORN IN THE UNITED', 'ptdtrace_2 or 3 Races',
       'ptdtrace_4 or 5 Races', 'ptdtrace_American Indian, Alaskan',
       'ptdtrace_Asian Only', 'ptdtrace_Asian-HP', 'ptdtrace_Black Only',
       'ptdtrace_Black-AI', 'ptdtrace_Hawaiian/Pacific Islander Only',
       'ptdtrace_White Only', 'ptdtrace_White-AI', 'ptdtrace_White-Asian',
       'ptdtrace_White-Black', 'ptdtrace_White-Hawaiian', 'W-B-AI', 'W-A-HP',
       'Black-Asian', 'pesex_MALE', 'pehspnon_NON-HISPANIC',
       'peeduca_11TH GRADE', 'peeduca_12TH GRADE NO DIPLOMA',
       'peeduca_1ST, 2ND, 3RD OR 4TH GRADE', 'peeduca_5TH OR 6TH GRADE',
       'peeduca_7TH OR 8TH GRADE', 'peeduca_9TH GRADE',
       'peeduca_ASSOCIATE DEGREE-ACADEMIC',
       'peeduca_ASSOCIATE DEGREE-OCCUPATIONAL/', 'peeduca_BACHELOR'S DEGREE',
       'peeduca_DOCTORATE 

### Q3.

In [97]:
X_work = work_df.drop('work', axis = 1)
y_work = work_df['work']

scaler = StandardScaler()
X_work_scaled = scaler.fit_transform(X_work)

parameters = {'kernel': ('linear', 'poly', 'sigmoid'), 'C': [0.1, 1, 5, 10]}

svc = SVC() 
cv = KFold(n_splits=5, random_state=26, shuffle=True)

classifier = GridSearchCV(svc, parameters, cv=cv)

classifier.fit(X_work_scaled, y_work);

In [98]:
hyperparameter_sets = zip(
    classifier.cv_results_["params"], 
    classifier.cv_results_["mean_test_score"], 
    classifier.cv_results_["rank_test_score"]
    )

for params, mean_score, rank in hyperparameter_sets:
     error_rate = 1 - mean_score
     print("Hyperparameters:", params)
     print("Error Rate:", error_rate)
     print("Rank:", rank)
     print()

Hyperparameters: {'C': 0.1, 'kernel': 'linear'}
Error Rate: 0.13740000000000008
Rank: 1

Hyperparameters: {'C': 0.1, 'kernel': 'poly'}
Error Rate: 0.36440000000000006
Rank: 12

Hyperparameters: {'C': 0.1, 'kernel': 'sigmoid'}
Error Rate: 0.13859999999999995
Rank: 5

Hyperparameters: {'C': 1, 'kernel': 'linear'}
Error Rate: 0.1379999999999999
Rank: 2

Hyperparameters: {'C': 1, 'kernel': 'poly'}
Error Rate: 0.1774000000000001
Rank: 11

Hyperparameters: {'C': 1, 'kernel': 'sigmoid'}
Error Rate: 0.16059999999999997
Rank: 8

Hyperparameters: {'C': 5, 'kernel': 'linear'}
Error Rate: 0.1382
Rank: 3

Hyperparameters: {'C': 5, 'kernel': 'poly'}
Error Rate: 0.1469999999999999
Rank: 7

Hyperparameters: {'C': 5, 'kernel': 'sigmoid'}
Error Rate: 0.1754000000000001
Rank: 9

Hyperparameters: {'C': 10, 'kernel': 'linear'}
Error Rate: 0.1382
Rank: 3

Hyperparameters: {'C': 10, 'kernel': 'poly'}
Error Rate: 0.14540000000000008
Rank: 6

Hyperparameters: {'C': 10, 'kernel': 'sigmoid'}
Error Rate: 0.175999

### Q4.

In [99]:
best_params_index = np.argmax(classifier.cv_results_['mean_test_score'])

best_params = classifier.cv_results_['params'][best_params_index]

print("Best hyperparameter set:", best_params)

Best hyperparameter set: {'C': 0.1, 'kernel': 'linear'}


The hyperparameters that give us the lowest error rate (highest mean test score) are $C = 0.1$ and a linear kernel.

### Q5.

In [100]:
best_params_index = np.argmax(classifier.cv_results_['mean_test_score'])

best_accuracy = classifier.cv_results_['mean_test_score'][best_params_index]

print("Accuracy score of best hyperparameters: ", 
      round(best_accuracy, 4))

Accuracy score of best hyperparameters:  0.8626


The accuracy score we get from the model with the optimal hyperparameters is about 86.26%.

### Q6.

In [101]:
best_svc = SVC(C=5, kernel="poly")

best_svc.fit(X_work_scaled, y_work)

X_vote = vote_df.drop(columns="vote")
y_vote = vote_df["vote"]

scaler = StandardScaler()
X_vote_scaled = scaler.fit_transform(X_vote)

vote_df.loc[:,'imputed_work'] = best_svc.predict(X_vote_scaled)

vote_df['imputed_work'].describe()

count    5000.000000
mean        0.680800
std         0.466213
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: imputed_work, dtype: float64

### Q7.

In [102]:
vote_df.loc[:,"prtage_sq"] = np.power(vote_df["prtage"], 2)

X = vote_df[["imputed_work", "prtage", "prtage_sq", "pesex_MALE"]]
X = sm.add_constant(X)
y = vote_df["vote"]

result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,vote,R-squared:,0.548
Model:,OLS,Adj. R-squared:,0.548
Method:,Least Squares,F-statistic:,1515.0
Date:,"Sat, 02 Mar 2024",Prob (F-statistic):,0.0
Time:,10:01:06,Log-Likelihood:,-1636.5
No. Observations:,5000,AIC:,3283.0
Df Residuals:,4995,BIC:,3315.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.4684,0.039,37.714,0.000,1.392,1.545
imputed_work,0.1988,0.017,11.410,0.000,0.165,0.233
prtage,-0.0325,0.001,-22.733,0.000,-0.035,-0.030
prtage_sq,0.0002,1.49e-05,11.041,0.000,0.000,0.000
pesex_MALE,0.0198,0.010,2.071,0.038,0.001,0.039

0,1,2,3
Omnibus:,69.321,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,72.032
Skew:,-0.285,Prob(JB):,2.28e-16
Kurtosis:,3.144,Cond. No.,24900.0


Looking at the results, we can see that our imputed work variable is predictive of whether an individual will vote. Since this is a linear probability model, we can interpret the estimated coefficient of `imputed_work` to mean that having a job with flexible work hours increases the predicted probability that an individual will vote by 19.88 percentage points holding age, age squared, and sex constant. This result is also statistically significant at the 5% level since the p-value is approximately 0. This means that we can reject the null hypothesis that a flexible work schedule has no predicted effect on voting in favor of the alternative hypothesis that the predicted effect of having a flexible work schedule on whether an individual votes is different from zero. 

### Q8.

In [103]:
work_vote_relationship = result.params[1]

In [104]:
# attenuation bias correction formula
def compute_M(a,b):
    return 1 / (1 - 2 * b) * (1 - (1 - b) * b / a - (1 - b) * b / (1 - a))

In [105]:
# For the value of a, find the proportion of imputed work schedules that are flexible
a = sum(vote_df["imputed_work"])/(vote_df["imputed_work"].size)
print("The value of a is: ", a)

0.6808


In [106]:
# For the value of b, use the cross-validation error rate 
b = round(1 - max(classifier.cv_results_['mean_test_score']), 4)
print("The value of b is: ", b)

0.1374


In [110]:
M = compute_M(a,b)
print("The value of M is: ", round(M, 4))

The value of M is:  0.6269


We find the value of $a$ is about 0.6808, the value of $b$ is about 0.1374, and the value of $M$ is about 0.6269.

### Q9.

In [112]:
work_vote_bias_correction = work_vote_relationship / M
print(round(work_vote_bias_correction, 4))

0.3171


The bias-corrected version of the estimated coefficient of `imputed_work` is 0.3171, which is larger than the uncorrected estimate of 0.1988 we found earlier. This indicates that the estimated effect of having a job with flexible hours on the predicted probability of voting is even larger than we estimated previously, so the estimate is still statistically significant at the 5% level. Now, the interpretation is that we predict that having a job with flexible hours increases the expected probability of voting by 31.71 percentage points according to our linear probability model, holding age, age squared, and sex constant.