In [3]:
%matplotlib widget
import numpy as np
import pandas as pd
import scipy
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

np.random.seed(0)
plt.style.use("fivethirtyeight")

In [4]:
demographics_df = pd.read_csv("demographics.csv").assign(
    income=lambda x: x.income.str.replace(",", "").astype(int),
    pop=lambda x: x["pop"].str.replace(",", "").astype(float),
)
features = [
    "hispanic_or_latino",
    "white",
    "black",
    "native_american",
    "asian",
    "nhpi",
    "other",
    "two_or_more",
    "income",
    "foreign_born",
    "sex_ratio",
    "bachelors",
    "age",
]
X = demographics_df[features]
y = demographics_df["margin"]
features

['hispanic_or_latino',
 'white',
 'black',
 'native_american',
 'asian',
 'nhpi',
 'other',
 'two_or_more',
 'income',
 'foreign_born',
 'sex_ratio',
 'bachelors',
 'age']

In [None]:
X_standardized = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, random_state=0)

test_r_squares = []
train_r_squares = []

for i in range(1, 6):
    model = Pipeline([("poly", PolynomialFeatures(i)), ("ridge", RidgeCV())])
    model.fit(X_train, y_train)

    train_r_squares.append(mean_squared_error(y_train, model.predict(X_train)))
    test_r_squares.append(mean_squared_error(y_test, model.predict(X_test)))

In [14]:
pd.DataFrame(
    np.array([range(1, 6), train_r_squares, test_r_squares]).T,
    columns=["Degree", "Train MSE", "Test MSE"],
).astype({"Degree": int})

Unnamed: 0,Degree,Train MSE,Test MSE
0,1,0.042506,0.043745
1,2,0.029155,0.037511
2,3,0.0208,0.169745
3,4,0.012962,22.141404
4,5,0.008489,15896.997163


Optiaml model has degree 2:

In [7]:
opt_model = Pipeline([("poly", PolynomialFeatures(2)), ("ridge", RidgeCV())])
opt_model.fit(X_train, y_train)

r_sqr = opt_model.score(X_test, y_test)
opt_alpha = opt_model.named_steps["ridge"].alpha_

print(f"Optimal model R^2: {r_sqr:.4f}")
print(f"CV alpha: {opt_alpha}")

Optimal model R^2: 0.6518
CV alpha: 10.0


In [8]:
prediction_df = (
    demographics_df.assign(pred=lambda x: opt_model.predict(X_standardized))
    .assign(
        diff=lambda x: (x.pred - x.margin).abs(),
        classification=lambda x: np.where(x.pred > 0, "DEM", "REP"),
        true_class=lambda x: np.where(x.margin > 0, "DEM", "REP"),
    )
    .sort_values(by="diff", ascending=False)
)

In [9]:
two_classification_error_rate = (
    prediction_df.classification != prediction_df.true_class
).mean()
print(f"Classification error rate: {two_classification_error_rate:.3%}")

Classification error rate: 7.712%


In [15]:
def seven_class(margin):
    if margin >= 0.15:
        return "Solid D"
    if margin >= 0.1:
        return "Likely D"
    if margin >= 0.05:
        return "Lean D"
    if margin >= -0.05:
        return "Toss Up"
    if margin >= -0.1:
        return "Lean R"
    if margin >= -0.15:
        return "Likely R"
    else:
        return "Solid R"


prediction_df_seven_class = prediction_df.assign(
    classification=lambda x: [seven_class(y) for y in x.pred],
    true_class=lambda x: [seven_class(y) for y in x.margin],
)
seven_class_error_rate = (
    prediction_df_seven_class.classification != prediction_df_seven_class.true_class
).mean()
print(f"7-class error rate: {seven_class_error_rate:.2%}")
prediction_df_seven_class

7-class classification error rate: 21.39%


Unnamed: 0,pop,hispanic_or_latino,white,black,native_american,asian,nhpi,other,two_or_more,sex_ratio,...,income,foreign_born,age,county,state,margin,pred,diff,classification,true_class
1878,2270976.0,0.278,0.246,0.170,0.002,0.257,0.000,0.018,0.027,0.942,...,72028,0.469,39.3,Queens,New York,0.45,-0.915170,1.365170,Solid R,Solid D
2427,2372.0,0.043,0.932,0.000,0.011,0.004,0.000,0.000,0.010,1.020,...,55398,0.010,40.6,Sanborn,South Dakota,0.79,-0.516942,1.306942,Solid R,Solid D
558,979682.0,0.100,0.179,0.023,0.001,0.417,0.093,0.001,0.185,1.016,...,87722,0.195,38.2,Honolulu,Hawaii,0.27,1.251283,0.981283,Solid D,Solid D
264,1999.0,0.104,0.833,0.002,0.000,0.011,0.003,0.002,0.046,0.916,...,67763,0.064,41.1,Cheyenne,Colorado,-0.76,0.102113,0.862113,Likely D,Solid R
2583,1624.0,0.193,0.656,0.078,0.002,0.000,0.000,0.032,0.039,0.838,...,40250,0.002,37.3,Cottle,Texas,-0.65,-1.442915,0.792915,Solid R,Solid R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572,2603.0,0.039,0.956,0.001,0.000,0.000,0.000,0.000,0.003,1.169,...,37404,0.002,46.8,Butte,Idaho,-0.72,-0.720057,0.000057,Solid R,Solid R
2916,11202.0,0.032,0.385,0.555,0.000,0.006,0.000,0.001,0.022,1.545,...,51701,0.021,41.0,Sussex,Virginia,0.12,0.120041,0.000041,Likely D,Likely D
2235,67606.0,0.134,0.768,0.006,0.036,0.011,0.001,0.000,0.044,0.996,...,48560,0.055,42.0,Klamath,Oregon,-0.41,-0.410032,0.000032,Solid R,Solid R
2390,4309.0,0.035,0.960,0.000,0.002,0.000,0.000,0.000,0.003,1.083,...,67396,0.017,42.9,Deuel,South Dakota,-0.46,-0.460014,0.000014,Solid R,Solid R
