In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [41]:
df = pd.read_csv('data/cleaned_data.csv')
df['county_fips'] = df['county_fips'].astype(str).str.zfill(5)

In [42]:
print(df['county_fips'].astype(str).apply(len).unique())

[5]


In [43]:
df.head()

Unnamed: 0,year,state,county_name,county_fips,party,candidatevotes,totalvotes,Population,prev_party,flipped
0,2000,ALABAMA,AUTAUGA,1001,REPUBLICAN,11993,17208,43671,,1
1,2004,ALABAMA,AUTAUGA,1001,REPUBLICAN,15196,20081,48366,REPUBLICAN,0
2,2008,ALABAMA,AUTAUGA,1001,REPUBLICAN,17403,23641,53277,REPUBLICAN,0
3,2012,ALABAMA,AUTAUGA,1001,REPUBLICAN,17379,23932,54970,REPUBLICAN,0
4,2016,ALABAMA,AUTAUGA,1001,REPUBLICAN,18172,24973,55302,REPUBLICAN,0


In [44]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Encode 'party' (target 1)
le_party = LabelEncoder()
df['party_encoded'] = le_party.fit_transform(df['party'])

# Prepare target dataframe for multi-output: 'party_encoded' and 'flipped'
y_multi = df[['party_encoded', 'flipped']]

# Drop columns that are targets or would leak info (like prev_party)
X = df.drop(columns=['party', 'party_encoded', 'flipped', 'prev_party'])

# Encode categorical features
X_encoded = pd.get_dummies(X, columns=['state', 'county_name'], drop_first=True)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_multi, test_size=0.2, random_state=42, stratify=y_multi['party_encoded']
)

# Initialize and train classifier
base_clf = RandomForestClassifier(n_estimators=100, random_state=42)
multi_clf = MultiOutputClassifier(base_clf)
multi_clf.fit(X_train, y_train)

# Predict class labels
y_pred = multi_clf.predict(X_test)

# Predict probabilities for 'flipped' (second target)
flipped_probs = multi_clf.predict_proba(X_test)[1][:, 1]  # Probability of flipping (class 1)

# Evaluate accuracy
party_acc = accuracy_score(y_test['party_encoded'], y_pred[:, 0])
flip_acc = accuracy_score(y_test['flipped'], y_pred[:, 1])

print(f"Party prediction accuracy: {party_acc:.4f}")
print(f"Flip prediction accuracy: {flip_acc:.4f}")

# Classification reports
print(classification_report(y_test['party_encoded'], y_pred[:, 0], target_names=le_party.classes_))
print(classification_report(y_test['flipped'], y_pred[:, 1]))

# Create dataframe with probabilities
prob_output = X_test.copy()
prob_output['actual_flipped'] = y_test['flipped'].values
prob_output['predicted_flipped'] = y_pred[:, 1]
prob_output['flip_probability'] = flipped_probs

# View or save output
print(prob_output[['actual_flipped', 'predicted_flipped', 'flip_probability']].head())

Party prediction accuracy: 0.9256
Flip prediction accuracy: 0.9226
              precision    recall  f1-score   support

    DEMOCRAT       0.88      0.74      0.81       772
  REPUBLICAN       0.94      0.97      0.95      2964

    accuracy                           0.93      3736
   macro avg       0.91      0.86      0.88      3736
weighted avg       0.92      0.93      0.92      3736

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2891
           1       0.89      0.76      0.82       845

    accuracy                           0.92      3736
   macro avg       0.91      0.86      0.88      3736
weighted avg       0.92      0.92      0.92      3736

       actual_flipped  predicted_flipped  flip_probability
16234               1                  1              0.96
5209                0                  0              0.12
4242                0                  0              0.07
1721                1                  1   

In [45]:
df.head()

Unnamed: 0,year,state,county_name,county_fips,party,candidatevotes,totalvotes,Population,prev_party,flipped,party_encoded
0,2000,ALABAMA,AUTAUGA,1001,REPUBLICAN,11993,17208,43671,,1,1
1,2004,ALABAMA,AUTAUGA,1001,REPUBLICAN,15196,20081,48366,REPUBLICAN,0,1
2,2008,ALABAMA,AUTAUGA,1001,REPUBLICAN,17403,23641,53277,REPUBLICAN,0,1
3,2012,ALABAMA,AUTAUGA,1001,REPUBLICAN,17379,23932,54970,REPUBLICAN,0,1
4,2016,ALABAMA,AUTAUGA,1001,REPUBLICAN,18172,24973,55302,REPUBLICAN,0,1


In [46]:
df_2024 = pd.read_csv('data/df_2024_original.csv')
df_2024['county_fips'] = df_2024['county_fips'].astype(str).str.zfill(5)

In [47]:
df_2024.head()

Unnamed: 0,year,state,county_name,county_fips,prev_party,Population
0,2024,ALABAMA,AUTAUGA,1001,REPUBLICAN,61464.0
1,2024,ALABAMA,BALDWIN,1003,REPUBLICAN,261608.0
2,2024,ALABAMA,BARBOUR,1005,REPUBLICAN,24358.0
3,2024,ALABAMA,BIBB,1007,REPUBLICAN,22258.0
4,2024,ALABAMA,BLOUNT,1009,REPUBLICAN,60163.0


In [48]:
# First, encode prev_party
df_2024['prev_party_encoded'] = le_party.transform(df_2024['prev_party'])

# One-hot encode state and county_name, drop first to match training
df_2024_encoded = pd.get_dummies(df_2024, columns=['state', 'county_name'], drop_first=True)

# Ensure column alignment with training data
feature_cols = multi_clf.estimators_[0].feature_names_in_  # from trained model
X_2024 = df_2024_encoded.reindex(columns=feature_cols, fill_value=0)

# Predict class labels (party and flipped)
y_2024_pred = multi_clf.predict(X_2024)

# Predict probabilities for 'flipped' (the second target)
flip_probs_2024 = multi_clf.predict_proba(X_2024)[1][:, 1]  # Probability of flipped = 1

# Add predictions back to original dataframe
df_2024['predicted_party'] = le_party.inverse_transform(y_2024_pred[:, 0])
df_2024['predicted_flipped'] = y_2024_pred[:, 1]
df_2024['flip_probability'] = flip_probs_2024

In [49]:
df_2024.head()

Unnamed: 0,year,state,county_name,county_fips,prev_party,Population,prev_party_encoded,predicted_party,predicted_flipped,flip_probability
0,2024,ALABAMA,AUTAUGA,1001,REPUBLICAN,61464.0,1,REPUBLICAN,0,0.37
1,2024,ALABAMA,BALDWIN,1003,REPUBLICAN,261608.0,1,REPUBLICAN,0,0.42
2,2024,ALABAMA,BARBOUR,1005,REPUBLICAN,24358.0,1,REPUBLICAN,0,0.47
3,2024,ALABAMA,BIBB,1007,REPUBLICAN,22258.0,1,REPUBLICAN,0,0.38
4,2024,ALABAMA,BLOUNT,1009,REPUBLICAN,60163.0,1,REPUBLICAN,0,0.34


In [50]:
df_2024.to_csv('data/df_2024.csv')

In [51]:
import joblib
joblib.dump(multi_clf, 'model/rf_multi_model.pkl')
joblib.dump(le_party, 'model/le_party.pkl')

['model/le_party.pkl']

In [52]:
max(df_2024["flip_probability"])

0.66