In [10]:
pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load the data
data = pd.read_csv("presidential_primary_averages_2024 (1).csv")  # Replace with your file path

# Preprocess the data
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month

# Encode categorical features
le_state = LabelEncoder()
le_party = LabelEncoder()
le_candidate = LabelEncoder()

data['state_id'] = le_state.fit_transform(data['state'])
data['party_id'] = le_party.fit_transform(data['party'])
data['candidate_id'] = le_candidate.fit_transform(data['candidate'])

# Define features and target
X = data[['pct_trend_adjusted', 'state_id', 'party_id', 'candidate_id', 'year', 'month', 'cycle']]
y = (data['pct_estimate'] > 50).astype(int)  # Binary target: 1 if > 50%, else 0

# Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("Class distribution:\n", pd.Series(y_resampled).value_counts())
print("\nFeature Importances:")
for feature, importance in zip(['pct_trend_adjusted', 'state_id', 'party_id', 'candidate_id', 'year', 'month', 'cycle'], model.feature_importances_):
    print(f"{feature}: {importance:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nROC-AUC Score:", roc_auc_score(y_test, y_proba))

# Handle unseen labels
def handle_unseen_labels(label_encoder, label):
    if label not in label_encoder.classes_:
        label_encoder.classes_ = np.append(label_encoder.classes_, label)
    return label_encoder.transform([label])[0]

# Example prediction
example = pd.DataFrame({
    'pct_trend_adjusted': [45.0],
    'state_id': [handle_unseen_labels(le_state, 'Texas')],
    'party_id': [handle_unseen_labels(le_party, 'Democrat')],
    'candidate_id': [handle_unseen_labels(le_candidate, 'Joe Biden')],
    'year': [2024],
    'month': [11],
    'cycle': [2024]
})

# Predict with probabilities
example_proba = model.predict_proba(example)[:, 1]
print(f"\nExample Probability of Winning: {example_proba[0]:.4f}")
print("Predicted Winner:", "Yes" if example_proba[0] > 0.3 else "No")


Class distribution:
 pct_estimate
0    159094
1    159094
Name: count, dtype: int64

Feature Importances:
pct_trend_adjusted: 0.0723
state_id: 0.0278
party_id: 0.0846
candidate_id: 0.3440
year: 0.2093
month: 0.0209
cycle: 0.2411

Confusion Matrix:
[[31815    51]
 [    5 31767]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     31866
           1       1.00      1.00      1.00     31772

    accuracy                           1.00     63638
   macro avg       1.00      1.00      1.00     63638
weighted avg       1.00      1.00      1.00     63638


ROC-AUC Score: 0.9999458776368079

Example Probability of Winning: 0.3200
Predicted Winner: Yes


