In [2]:
# Widget to manipulate plots in Jupyter notebooks
%matplotlib widget

import matplotlib.pyplot as plt  # For general plotting
from matplotlib import cm
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

from math import ceil, floor

import numpy as np
import seaborn as sns
from scipy.stats import norm, multivariate_normal
from scipy.special import logit, expit

np.set_printoptions(suppress=True)

plt.rc("font", size=22)  # controls default text sizes
plt.rc("axes", titlesize=18)  # fontsize of the axes title
plt.rc("axes", labelsize=18)  # fontsize of the x and y labels
plt.rc("xtick", labelsize=14)  # fontsize of the tick labels
plt.rc("ytick", labelsize=14)  # fontsize of the tick labels
plt.rc("legend", fontsize=16)  # legend fontsize
plt.rc("figure", titlesize=22)  # fontsize of the figure title

# Import dataset

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Dataset

demographics_df = pd.read_csv("demographics.csv")
features = [
    "hispanic_or_latino",
    "white",
    "black",
    "native_american",
    "asian",
    "nhpi",
    "other",
    "two_or_more",
    "sex_ratio",
    "bachelors",
    "income",
    "foreign_born",
    "age",
    "county",
    "state",
]

X = demographics_df[features]
y = demographics_df["margin"]

state_ids = {}
states = X["state"].unique()
for i in range(len(states)):
    state_ids[states[i]] = i

county_state_ids = {}
to_drop = []
for i, r in X.iterrows():
    county_state_str = r["county"] + r["state"]
    if county_state_str not in county_state_ids.keys():
        county_state_ids[county_state_str] = i

    X.at[i, "county"] = county_state_ids[county_state_str]
    X.at[i, "state"] = state_ids[r["state"]]

X.drop(to_drop, axis=0, inplace=True)
y.drop(to_drop, axis=0, inplace=True)

X["income"] = X["income"].str.replace(",", "")

X_standardized = StandardScaler().fit_transform(X)

y_binary = [0 if l < 0 else 1 for l in y]
print(y_binary)

X_train, X_test, y_train, y_test = train_test_split(
    X_standardized, y_binary, test_size=0.15, random_state=0
)

[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(to_drop, axis=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["income"] = X["income"].str.replace(",", "")


In [4]:
# Some stats about the dataset (out of curiosity)
print("N samples: {}".format(len(X)))
print("N Training Samples: {}".format(len(X_train)))
print("N Test Samples: {}".format(len(X_test)))

N samples: 3151
N Training Samples: 2678
N Test Samples: 473


In [5]:
from sklearn.linear_model import LogisticRegressionCV

lr = LogisticRegressionCV()

lr.fit(X_train, y_train)
y_preds_test = lr.predict(X_test)

score = lr.score(X_test, y_test)
print(score)

0.8964059196617337


In [6]:
# Prep for continuous regression to predict margin
X_train, X_test, y_train, y_test = train_test_split(
    X_standardized, y, test_size=0.15, random_state=0
)

In [31]:
# What were our biggest misses?
# We can also get a sort of "implied margin" for each county doing an inverse normal on our probabilities with STD of 0.2
log_reg_pred = demographics_df.assign(
    pred=lr.predict(X_standardized),
    real_class=y_binary,
    prob=lr.predict_proba(X_standardized)[:, 1],
).assign(diff=lambda df: (df.real_class - df.prob).abs(), implied_margin=lambda df: norm.ppf(df.prob, loc=0, scale=0.2)).sort_values(
    by="diff", ascending=False
)

log_reg_pred.head(20)

Unnamed: 0,pop,hispanic_or_latino,white,black,native_american,asian,nhpi,other,two_or_more,sex_ratio,...,foreign_born,age,county,state,margin,pred,real_class,prob,diff,implied_margin
2575,1006038.0,0.153,0.555,0.097,0.003,0.157,0.001,0.003,0.032,0.972,...,0.213,37.0,Collin,Texas,-0.04,1,0,0.994648,0.994648,0.510439
1619,9153.0,0.035,0.892,0.015,0.014,0.006,0.001,0.001,0.036,1.119,...,0.017,49.0,Deer Lodge,Montana,0.08,0,1,0.006819,0.993181,-0.493335
2427,2372.0,0.043,0.932,0.0,0.011,0.004,0.0,0.0,0.01,1.02,...,0.01,40.6,Sanborn,South Dakota,0.79,0,1,0.009957,0.990043,-0.465592
1847,80320.0,0.029,0.898,0.039,0.002,0.012,0.0,0.002,0.016,1.058,...,0.044,40.3,Clinton,New York,0.05,0,1,0.011901,0.988099,-0.452061
1332,35709.0,0.018,0.881,0.025,0.041,0.006,0.0,0.0,0.028,1.101,...,0.014,41.2,Carlton,Minnesota,0.02,0,1,0.013243,0.986757,-0.443804
98,17760.0,0.085,0.478,0.006,0.148,0.193,0.003,0.006,0.081,1.188,...,0.186,35.5,District 32,Alaska,-0.11,1,0,0.986115,0.986115,0.440101
1361,10571.0,0.015,0.954,0.006,0.005,0.001,0.0,0.001,0.017,1.075,...,0.011,50.5,Lake,Minnesota,0.04,0,1,0.014872,0.985128,-0.434698
2824,26843.0,0.015,0.948,0.007,0.005,0.002,0.0,0.001,0.022,1.004,...,0.039,46.1,Orleans,Vermont,0.04,0,1,0.015142,0.984858,-0.43327
2221,39656.0,0.087,0.833,0.008,0.005,0.011,0.003,0.0,0.054,0.98,...,0.05,44.4,Clatsop,Oregon,0.11,0,1,0.01922,0.98078,-0.414028
1194,34415.0,0.013,0.947,0.005,0.004,0.009,0.0,0.0,0.023,0.961,...,0.024,51.2,Lincoln,Maine,0.1,0,1,0.019547,0.980453,-0.41264


In [33]:
# How'd Massachusetts do?
log_reg_pred.assign(diff=lambda df: (df.margin - df.implied_margin).abs()).sort_values(
    "diff", ascending=False
).query("state == ' Massachusetts'")

Unnamed: 0,pop,hispanic_or_latino,white,black,native_american,asian,nhpi,other,two_or_more,sex_ratio,...,foreign_born,age,county,state,margin,pred,real_class,prob,diff,implied_margin
1230,17430.0,0.035,0.839,0.041,0.014,0.006,0.0,0.02,0.045,0.949,...,0.123,49.0,Dukes,Massachusetts,0.57,0,1,0.286126,0.682947,-0.112947
1232,70529.0,0.042,0.9,0.01,0.002,0.015,0.001,0.002,0.029,0.949,...,0.047,47.0,Franklin,Massachusetts,0.44,0,1,0.164345,0.635351,-0.195351
1228,125927.0,0.05,0.877,0.025,0.001,0.017,0.0,0.002,0.027,0.942,...,0.056,47.2,Berkshire,Massachusetts,0.47,0,1,0.205737,0.634261,-0.164261
1234,161361.0,0.058,0.832,0.024,0.001,0.051,0.001,0.002,0.03,0.876,...,0.088,36.6,Hampshire,Massachusetts,0.47,1,1,0.605881,0.41628,0.05372
1227,213505.0,0.032,0.883,0.028,0.005,0.014,0.001,0.01,0.027,0.919,...,0.084,53.7,Barnstable,Massachusetts,0.24,0,1,0.317309,0.335047,-0.095047
1233,466647.0,0.257,0.62,0.077,0.001,0.024,0.0,0.002,0.019,0.934,...,0.088,39.4,Hampden,Massachusetts,0.17,0,1,0.269052,0.293136,-0.123136
1231,787038.0,0.214,0.695,0.032,0.001,0.034,0.0,0.003,0.021,0.931,...,0.175,40.9,Essex,Massachusetts,0.29,1,1,0.595047,0.241891,0.048109
1236,11212.0,0.141,0.722,0.072,0.01,0.014,0.001,0.004,0.037,1.183,...,0.156,41.7,Nantucket,Massachusetts,0.45,1,1,0.898103,0.195836,0.254164
1229,563301.0,0.084,0.801,0.039,0.001,0.023,0.0,0.018,0.035,0.938,...,0.128,41.0,Bristol,Massachusetts,0.12,0,1,0.391718,0.174969,-0.054969
1237,703740.0,0.047,0.737,0.068,0.001,0.113,0.0,0.006,0.028,0.927,...,0.185,40.9,Norfolk,Massachusetts,0.36,1,1,0.99366,0.138527,0.498527


In [34]:
# What's the MSE using our "implied margins?"
mean_squared_error(log_reg_pred.margin, log_reg_pred.implied_margin)

0.04473478167430454

In [35]:
# Which states were the most off? Looks like Northeastern states -- probably due to larger white populations that still vote quite liberally
log_reg_pred.groupby("state")["diff"].mean().sort_values(ascending=False).to_frame().reset_index()

Unnamed: 0,state,diff
0,Vermont,0.795893
1,New Hampshire,0.654215
2,Rhode Island,0.491343
3,Maine,0.452518
4,Massachusetts,0.430689
5,Delaware,0.30714
6,Colorado,0.290103
7,California,0.272857
8,Washington,0.27131
9,Alaska,0.258785


# SKLearn Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train, y_train)

linreg.score(X_test, y_test)
linpreds = linreg.predict(X_test)
# print(linpreds)

binary_lin_preds = []

for pred in linpreds:
    if pred < 0:
        binary_lin_preds.append(0)
    else:
        binary_lin_preds.append(1)

In [11]:
# Compute binary classification error using margin predictions
#  ie even if margin is incorrect, is the binary choice (trump v biden) correct

err_sum = 0
for i in range(len(binary_lin_preds)):
    if binary_lin_preds[i] != y_preds_test[i]:
        err_sum += 1

err = err_sum / len(binary_lin_preds)
print("Pr(mislassification): {}".format(err))

Pr(mislassification): 0.042283298097251586


In [12]:
# MSE of linear regression samples
print("MSE: {}".format(mean_squared_error(y_test, linpreds)))

MSE: 0.04111052645714843


In [13]:
from scipy import stats

misclassified_samples = []
misclassified_labels = []
labels_test = y_test.to_numpy()
for i in range(len(binary_lin_preds)):
    if binary_lin_preds[i] != labels_test[i]:
        misclassified_samples.append(X_test[i])
        misclassified_labels.append(labels_test[i])

# print(misclassified_samples)
# print(misclassified_labels)

print("Mean True Margin: {}".format(np.mean(misclassified_labels)))
print("Minimum True Margin: {}".format(np.min(misclassified_labels)))
print("Maximum True Margin: {}".format(np.max(misclassified_labels)))


# Doesn't quite work yet - going to find most misclassified state
# Maybe some additional stats as well

# states = []
# for i in range(len(misclassified_samples)):
#     states.append(misclassified_samples[i][-2])
# print("Most Misclassified State: {}".format(list(state_ids.keys()).index(stats.mode(states))))

Mean True Margin: -0.31359408033826636
Minimum True Margin: -0.88
Maximum True Margin: 0.73
