In [67]:
# Widget to manipulate plots in Jupyter notebooks
%matplotlib widget

import matplotlib.pyplot as plt # For general plotting
from matplotlib import cm
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

from math import ceil, floor

import numpy as np
import seaborn as sns
from scipy.stats import norm, multivariate_normal

np.set_printoptions(suppress=True)

plt.rc('font', size=22)          # controls default text sizes
plt.rc('axes', titlesize=18)     # fontsize of the axes title
plt.rc('axes', labelsize=18)     # fontsize of the x and y labels
plt.rc('xtick', labelsize=14)    # fontsize of the tick labels
plt.rc('ytick', labelsize=14)    # fontsize of the tick labels
plt.rc('legend', fontsize=16)    # legend fontsize
plt.rc('figure', titlesize=22)   # fontsize of the figure title

# Import dataset

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Dataset

demographics_df = pd.read_csv("demographics.csv")
features = [
    "pop",
    "hispanic_or_latino",
    "white",
    "black",
    "native_american",
    "asian",
    "nhpi",
    "other",
    "two_or_more",
    "sex_ratio",
    "bachelors",
    "income",
    "foreign_born",
    "age",
    "county",
    "state"
]

X = demographics_df[features]
y = demographics_df["margin"]

state_ids = {}
states = X['state'].unique()
for i in range(len(states)):
    state_ids[states[i]] = i

county_state_ids = {}
to_drop = []
for i, r in X.iterrows():
    county_state_str = r['county'] + r['state']
    if county_state_str not in county_state_ids.keys():
        county_state_ids[county_state_str] = i

    X.at[i, 'county'] = county_state_ids[county_state_str]
    X.at[i, 'state'] = state_ids[r['state']]

    if r['income'] == '-':
        to_drop.append(i)

X.drop(to_drop, axis=0, inplace=True)
y.drop(to_drop, axis=0, inplace=True)

X['pop'] = X['pop'].str.replace(',','')
X['income'] = X['income'].str.replace(',','')

X_standardized = StandardScaler().fit_transform(X)

y_binary = [0 if l < 0 else 1 for l in y]
print(y_binary)

X_train, X_test, y_train, y_test = train_test_split(X_standardized, y_binary, test_size=0.15, random_state=0)

[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(to_drop, axis=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['pop'] = X['pop'].str.replace(',','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['income'] = X['income'].str.replace(',','')


In [69]:
# Some stats about the dataset (out of curiosity)
print("N samples: {}".format(len(X)))
print("N Training Samples: {}".format(len(X_train)))
print("N Test Samples: {}".format(len(X_test)))


N samples: 3151
N Training Samples: 2678
N Test Samples: 473


In [70]:
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV()

lr.fit(X_train, y_train)
y_preds_test = lr.predict(X_test)

score = lr.score(X_test, y_test)
print(score)

0.8921775898520085


In [71]:
# Prep for continuous regression to predict margin
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.15, random_state=0)

# SKLearn Linear Regression

In [72]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train, y_train)

linreg.score(X_test, y_test)
linpreds = linreg.predict(X_test)
# print(linpreds)

binary_lin_preds = []

for pred in linpreds:
    if pred < 0:
        binary_lin_preds.append(0)
    else:
        binary_lin_preds.append(1)

In [73]:
# Compute binary classification error using margin predictions
#  ie even if margin is incorrect, is the binary choice (trump v biden) correct

err_sum = 0
for i in range(len(binary_lin_preds)):
    if binary_lin_preds[i] != y_preds_test[i]:
        err_sum += 1

err = err_sum / len(binary_lin_preds)
print("Pr(mislassification): {}".format(err))

Pr(mislassification): 0.042283298097251586


In [74]:
# MSE of linear regression samples
print("MSE: {}".format(mean_squared_error(y_test, linpreds)))

MSE: 0.04090909900828439


In [75]:
from scipy import stats
misclassified_samples = []
misclassified_labels = []
labels_test = y_test.to_numpy()
for i in range(len(binary_lin_preds)):
    if binary_lin_preds[i] != labels_test[i]:
        misclassified_samples.append(X_test[i])
        misclassified_labels.append(labels_test[i])

# print(misclassified_samples)
# print(misclassified_labels)

print("Mean True Margin: {}".format(np.mean(misclassified_labels)))
print("Minimum True Margin: {}".format(np.min(misclassified_labels)))
print("Maximum True Margin: {}".format(np.max(misclassified_labels)))


# Doesn't quite work yet - going to find most misclassified state
# Maybe some additional stats as well

# states = []
# for i in range(len(misclassified_samples)):
#     states.append(misclassified_samples[i][-2])
# print("Most Misclassified State: {}".format(list(state_ids.keys()).index(stats.mode(states))))

Mean True Margin: -0.31359408033826636
Minimum True Margin: -0.88
Maximum True Margin: 0.73
