# Part 1: (Select models and train on our data)

The concept here is to build 3 different models, both to demonstrate understanding of the class material and to have multiple options to compare for accuracy. If we have time, I would also like to add a 4th model, a neural network, to compare modern techniques against classic machine learning accuracy.

### Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Logistic regression baseline.
# Assumes `data` is the cleaned DataFrame holding every feature plus the
# 'Election Result' target column.
X = data.drop(columns=['Election Result'])  # Keep 'State' in features
y = data['Election Result']

# Encode the target into integer labels. LabelEncoder numbers the classes in
# sorted order, so if the raw labels are 'D' and 'R' this yields
# 'D' -> 0 and 'R' -> 1 (inspect `label_encoder.classes_` to confirm).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Define categorical and numerical columns
categorical_features = ['State']
numerical_features = [col for col in X.columns if col != 'State']

# Split BEFORE fitting any preprocessing: fitting the scaler/encoder on the
# full dataset would leak test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numerical features and one-hot encode 'State'.
# handle_unknown='ignore' keeps transform() from failing if a state shows up
# only in the test rows (it is then encoded as all zeros).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Learn the transformation from the training rows only, then apply it to both
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Create and train the logistic regression model
model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))



### k-Nearest Neighbors:

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# k-Nearest Neighbors baseline.
# Assumes `data` is the cleaned DataFrame holding every feature plus the
# 'Election Result' target column.
X = data.drop(columns=['Election Result'])  # Keep 'State' in features
y = data['Election Result']

# Encode the target into integer labels. LabelEncoder numbers the classes in
# sorted order, so if the raw labels are 'D' and 'R' this yields
# 'D' -> 0 and 'R' -> 1 (inspect `label_encoder.classes_` to confirm).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Define categorical and numerical columns
categorical_features = ['State']
numerical_features = [col for col in X.columns if col != 'State']

# Split BEFORE fitting any preprocessing: k-NN is distance based, so scaling
# statistics computed with the test rows would leak directly into the
# neighbour search.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numerical features and one-hot encode 'State'.
# handle_unknown='ignore' keeps transform() from failing if a state shows up
# only in the test rows (it is then encoded as all zeros).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Learn the transformation from the training rows only, then apply it to both
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Create and train the k-NN model
model = KNeighborsClassifier(n_neighbors=3)  # Adjust `k` based on tuning
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))



### Decision tree:

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Decision tree baseline.
# Assumes `data` is the cleaned DataFrame holding every feature plus the
# 'Election Result' target column.
X = data.drop(columns=['Election Result'])  # Keep 'State' in features
y = data['Election Result']

# Encode the target into integer labels. LabelEncoder numbers the classes in
# sorted order, so if the raw labels are 'D' and 'R' this yields
# 'D' -> 0 and 'R' -> 1 (inspect `label_encoder.classes_` to confirm).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Define categorical and numerical columns
categorical_features = ['State']
numerical_features = [col for col in X.columns if col != 'State']

# Split BEFORE fitting any preprocessing so no test-set statistics reach the
# fitted transformers.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numerical features and one-hot encode 'State'.
# handle_unknown='ignore' keeps transform() from failing if a state shows up
# only in the test rows (it is then encoded as all zeros).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Learn the transformation from the training rows only, then apply it to both.
# X_train / X_test stay in transformed form because the Random Forest cell
# reuses them as-is.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Create and train the Decision Tree model.
# random_state pins the tree's internal feature shuffling so repeated runs
# give the same tree, matching the seeded models in the other cells.
model = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)  # Adjust `max_depth` for tree complexity
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Visualize the tree structure.
# Column order of the transformed matrix follows the transformer order above:
# numerical columns first, then the one-hot 'State' columns.
feature_names = (
    numerical_features +
    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
)
print(export_text(model, feature_names=feature_names))



### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_curve, auc, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
import pandas as pd

# Random forest trained on the X_train / X_test split and `feature_names`
# produced by the decision tree cell (this cell does not rebuild them).

# `plt` is used below, but the notebook only imports matplotlib in a later
# cell; import it here so the cells can be run top to bottom.
import matplotlib.pyplot as plt

# Initialize the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
# Column 1 is the probability of encoded class 1
# (presumably 'R', given LabelEncoder's sorted ordering -- confirm).
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]  # For ROC and AUC

# Evaluate the model.
# NOTE: the hard-coded names assume encoded class 0 is Democrat and class 1
# is Republican; verify against `label_encoder.classes_`.
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=['Democrat', 'Republican']))

# Calculate AUC
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
auc_rf = auc(fpr_rf, tpr_rf)
print(f"Random Forest AUC: {auc_rf:.4f}")

# Confusion Matrix
ConfusionMatrixDisplay.from_estimator(rf_model, X_test, y_test, display_labels=['Democrat', 'Republican'])
plt.title("Random Forest Confusion Matrix")
plt.show()

# Feature Importance, highest first
feature_importances = pd.DataFrame(
    rf_model.feature_importances_,
    index=feature_names,
    columns=['Importance']
).sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(feature_importances)

# Cross-Validation Scores (5-fold, on the training split only)
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")



# Part 2: (Compare the models)

The concept here is to use the metrics covered in class to evaluate model performance and compare/contrast the models we chose to test.

### Confusion matrix:

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score

# Confusion matrix for whatever predictions `y_pred` currently holds
# (it is set by the most recently executed model cell that assigns `y_pred`).

# `plt` is used below, but the notebook only imports matplotlib in a later
# cell; import it here so the cells can be run top to bottom.
import matplotlib.pyplot as plt

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print confusion matrix with labels
print("Confusion Matrix:")
print(cm)

# Display the confusion matrix as a heatmap, labelled with the original
# class names recovered from the label encoder.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
disp.plot(cmap="Blues", values_format='d')  # Format as integers
plt.title("Confusion Matrix for Election Result Prediction")
plt.show()

# Additional performance metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


### Accuracy, Precision, Recall, F1-Score:

In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

# Per-class accuracy / precision / recall / F1 for the predictions in `y_pred`.
#
# y_test and y_pred hold the INTEGER codes produced by `label_encoder`, not
# the raw 'D' / 'R' strings, so the string labels must be translated to their
# codes before being used as `pos_label`. transform() raises if either label
# was never seen by the encoder.
democrat_label, republican_label = label_encoder.transform(['D', 'R'])

# Classification report (labels passed explicitly so each display name is
# guaranteed to line up with the right encoded class)
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=[democrat_label, republican_label],
    target_names=['Democrat', 'Republican'],
))

# Individual metrics with Democrat treated as the positive class
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=democrat_label)
recall = recall_score(y_test, y_pred, pos_label=democrat_label)
f1 = f1_score(y_test, y_pred, pos_label=democrat_label)

# Print individual metrics
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision (Democrat): {precision:.4f}")
print(f"Recall (Democrat): {recall:.4f}")
print(f"F1-Score (Democrat): {f1:.4f}")

# Same metrics with Republican treated as the positive class
precision_r = precision_score(y_test, y_pred, pos_label=republican_label)
recall_r = recall_score(y_test, y_pred, pos_label=republican_label)
f1_r = f1_score(y_test, y_pred, pos_label=republican_label)

print(f"\nPrecision (Republican): {precision_r:.4f}")
print(f"Recall (Republican): {recall_r:.4f}")
print(f"F1-Score (Republican): {f1_r:.4f}")



### ROC Curve and AUC:

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC curve and AUC for `model`, with Democrat ('D') as the positive class.
#
# y_test holds the integer codes produced by `label_encoder`, so the positive
# class has to be given as its encoded value; passing the string 'D' would
# never match any entry of y_test.
positive_label = label_encoder.transform(['D'])[0]

# predict_proba columns follow `model.classes_`, so look up which column
# belongs to the positive class instead of assuming it is column 1.
positive_column = list(model.classes_).index(positive_label)
y_prob = model.predict_proba(X_test)[:, positive_column]

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=positive_label)
auc_score = auc(fpr, tpr)

# Print AUC
print(f"AUC: {auc_score:.4f}")

# Plot ROC Curve against the diagonal of a random classifier
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.4f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()



# Part 3: (compare a modern neural network)

The concept here is to run the data through an additional model (a binary classification neural network) to see whether modern neural networks can outperform classic machine learning models on relatively simple datasets.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

# Data preparation for the neural network.
# NOTE: this rebinds `data` from disk; the path below is a placeholder.
data = pd.read_csv('your_dataset.csv')  # Replace with the path to your dataset

# Feature engineering: drop the target and 'State'; target is 1 for 'D', else 0
X = data.drop(columns=["Election Result", "State"])  # Features
y = (data["Election Result"] == "D").astype(int)  # Binary target

# Train/test split on the RAW rows, before any preprocessing is fitted, so the
# scaler never sees test-set statistics. Stratified like the other model cells
# to keep the class balance equal in both splits.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numerical columns and one-hot encode the categorical one.
# sparse_threshold=0 forces a dense ndarray result; with the default setting
# the output type depends on the data's density, so an unconditional
# `.toarray()` call fails whenever the result is already dense.
categorical_features = ["In Recession"]
numerical_features = [col for col in X.columns if col not in categorical_features]
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(drop="first"), categorical_features),
    ],
    sparse_threshold=0,
)

# Fit on the training rows only, then apply the same transformation to test
X_train_array = preprocessor.fit_transform(X_train_raw)
X_test_array = preprocessor.transform(X_test_raw)

# Convert to float32 tensors (BCELoss expects float targets)
X_train = torch.tensor(X_train_array, dtype=torch.float32)
X_test = torch.tensor(X_test_array, dtype=torch.float32)
y_train = torch.tensor(y_train_raw.values, dtype=torch.float32)
y_test = torch.tensor(y_test_raw.values, dtype=torch.float32)

# Define the neural network
class ElectionPredictor(nn.Module):
    """Small feed-forward binary classifier.

    Architecture: input -> 32 units (ReLU) -> 16 units (ReLU) -> 1 unit,
    followed by a sigmoid so every output lies strictly between 0 and 1.

    Args:
        input_size: Number of features in each input row.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=32)  # hidden layer 1
        self.fc2 = nn.Linear(in_features=32, out_features=16)          # hidden layer 2
        self.fc3 = nn.Linear(in_features=16, out_features=1)           # output layer
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run `x` through both hidden layers and return the sigmoid output.

        The last dimension of the result has size 1.
        """
        first_hidden = self.fc1(x).relu()
        second_hidden = self.fc2(first_hidden).relu()
        logit = self.fc3(second_hidden)
        return self.sigmoid(logit)

# Train the network on the full training tensor and report test-set metrics.

# Seed PyTorch so weight initialisation (and therefore the reported metrics)
# is reproducible, in line with the random_state=42 used elsewhere.
torch.manual_seed(42)

# Initialize the model; the input width is the number of feature columns
input_size = X_train.shape[1]
model = ElectionPredictor(input_size)

# Loss function and optimizer (the model already applies a sigmoid, so plain
# binary cross-entropy on probabilities is used)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop: full-batch gradient descent
epochs = 100
model.train()
for epoch in range(epochs):
    # Forward pass. squeeze(1) drops only the trailing size-1 dimension, so
    # the result stays 1-D (matching y_train) even if there is a single row.
    outputs = model(X_train).squeeze(1)
    loss = criterion(outputs, y_train)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Evaluation
model.eval()
with torch.no_grad():
    y_pred = model(X_test).squeeze(1)
    y_pred_classes = (y_pred > 0.5).int()
    print("\nClassification Report:")
    # Hand scikit-learn plain integer arrays rather than float/int tensors.
    # Class 0 is Republican and class 1 is Democrat in this cell's encoding.
    print(classification_report(
        y_test.int().numpy(),
        y_pred_classes.numpy(),
        target_names=['Republican', 'Democrat'],
    ))


### Evaluate for accuracy

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, classification_report

# Evaluate the trained neural network (`model`) on its held-out tensors
# (`X_test`, `y_test`) from the training cell.

# Compute the network's predicted probabilities here. Without this, `y_prob`
# would still hold the probabilities left over from the earlier ROC cell,
# i.e. a different model.
with torch.no_grad():
    y_prob = model(X_test).squeeze(1).cpu().numpy()

# True labels as a plain integer array for scikit-learn
y_true = y_test.cpu().numpy().astype(int)

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred = (y_prob > 0.5).astype(int)

# Calculate accuracy, precision, recall, and F1 score.
# The positive class (1) is Democrat in the network's target encoding.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Print classification report. Names are listed in class order 0, 1, which
# for the network's encoding is Republican then Democrat.
print("Neural Network Classification Report:")
print(classification_report(y_true, y_pred, target_names=['Republican', 'Democrat']))

# Calculate ROC curve and AUC
fpr, tpr, _ = roc_curve(y_true, y_prob)
auc_score = auc(fpr, tpr)

print(f"Neural Network Accuracy: {accuracy:.4f}")
print(f"Neural Network Precision: {precision:.4f}")
print(f"Neural Network Recall: {recall:.4f}")
print(f"Neural Network F1 Score: {f1:.4f}")
print(f"Neural Network AUC: {auc_score:.4f}")


### Compare 