# Import essential libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Load the cleaned dataset.
# DEPARTEMENT_CODE is pinned to str so that codes such as "01" keep their
# leading zero no matter which rows the file contains; without the pin the
# dtype is whatever pandas infers from the data.
df = pd.read_csv('../data/elections_prepared.csv', dtype={'DEPARTEMENT_CODE': str})

# Check the first rows
print(df.head())

   ANNEE DEPARTEMENT_CODE              DEPARTEMENT    WINNER  NB_INSCRITS  \
0   2024               01                      Ain  E.DROITE       446979   
1   2024               02                    Aisne    DROITE        73181   
2   2024               03                   Allier  E.DROITE       248529   
3   2024               04  Alpes-de-Haute-Provence    GAUCHE       128146   
4   2024               05             Hautes-Alpes    GAUCHE       114587   

   NB_VOTANTS PARTI_1  VOIX_1   PARTI_2  VOIX_2  ... PARTI_5   VOIX_5 PARTI_6  \
0      311188  GAUCHE   19964    CENTRE  103368  ...  DROITE  27040.0     NaN   
1       46620  DROITE   22933  E.DROITE   22409  ...     NaN      NaN     NaN   
2      171908  GAUCHE   43029    DROITE   46601  ...     NaN      NaN     NaN   
3       90407  GAUCHE   39040  E.DROITE   21536  ...     NaN      NaN     NaN   
4       82882  GAUCHE   40743  E.DROITE   34857  ...     NaN      NaN     NaN   

   VOIX_6 PARTI_7  VOIX_7 PARTI_8  VOIX_8 PARTI_9 

# Preparing Data for Modeling

In [5]:
# Define features and target.
# NOTE(review): X keeps every PARTI_k / VOIX_k column. If WINNER is simply the
# party with the most votes among them, the target is recoverable from the
# features and every score computed downstream is optimistic -- confirm that
# this is the intended feature set.
X = df.drop('WINNER', axis=1)
y = df['WINNER']

# Identify categorical and numerical features.
# 'number' matches every numeric dtype (e.g. int32 or float32 as well as
# int64/float64). Columns listed in neither group are dropped by the
# ColumnTransformer below, because its default is remainder='drop'.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include='number').columns.tolist()

# Numeric branch: impute, then scale.
# StandardScaler leaves NaN in place, and several downstream estimators
# (LogisticRegression, SVC, ...) refuse NaN input, so missing values are
# filled first. In the rows printed by df.head(), VOIX_k is NaN exactly where
# PARTI_k is NaN, so 0 votes looks like the right fill value.
# NOTE(review): confirm that this holds for the whole file.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# Create preprocessing pipeline.
# Missing PARTI_k values are left to OneHotEncoder; handle_unknown='ignore'
# makes categories unseen during fit encode as all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])

# Split the data.
# A stratified split keeps the class proportions in both parts, but
# train_test_split rejects stratify= when a class has fewer than 2 samples.
# In that case fall back to the plain shuffled split and say so explicitly.
class_counts = y.value_counts()
rare_classes = class_counts[class_counts < 2].index.tolist()
if rare_classes:
    print(f"Split is NOT stratified - classes with a single sample: {rare_classes}")
stratify_labels = None if rare_classes else y

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Target distribution: \n{class_counts}")

Training set shape: (248, 23)
Test set shape: (62, 23)
Target distribution: 
WINNER
CENTRE      136
E.DROITE     75
GAUCHE       59
DROITE       39
E.GAUCHE      1
Name: count, dtype: int64


# Model Benchmarking

In [6]:
# XGBClassifier comes from the import cell at the top of the notebook. A
# NameError on that name here most likely means the import cell was not
# re-run after the xgboost import was added: Restart Kernel and Run All.


class LabelEncodedClassifier(ClassifierMixin, BaseEstimator):
    """Train a classifier that needs integer labels on arbitrary labels.

    The labels passed to ``fit`` are encoded as 0..k-1 with a LabelEncoder
    fitted on those labels only, so the wrapped estimator always sees a
    contiguous range, including inside a CV fold where a rare class is
    absent. ``predict`` maps the integer predictions back to the original
    labels.

    Parameters
    ----------
    estimator : classifier
        Unfitted estimator; a clone of it is fitted on every call to ``fit``.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y):
        """Encode ``y``, fit a fresh clone of ``estimator`` and return self."""
        self.label_encoder_ = LabelEncoder().fit(y)
        self.classes_ = self.label_encoder_.classes_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X, self.label_encoder_.transform(y))
        return self

    def predict(self, X):
        """Return predictions expressed in the original label values."""
        encoded_pred = self.estimator_.predict(X)
        return self.label_encoder_.inverse_transform(encoded_pred)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities unchanged."""
        return self.estimator_.predict_proba(X)


# Define models to compare.
# XGBoost is wrapped because the target holds party names (strings), while
# XGBClassifier with use_label_encoder=False expects integer class ids.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': LabelEncodedClassifier(
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    ),
    'SVC': SVC(probability=True, random_state=42)
}

# Compare models using cross-validation
results = {}

for name, model in models.items():
    print(f"Training {name}...")

    # Each pipeline gets its own copy of the preprocessor. Pipeline does not
    # clone its steps, so sharing one instance would make every later fit()
    # refit the transformer held by the pipelines already stored in results.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', model)
    ])

    # Cross-validation on the training split only (cross_val_score fits
    # clones, so `pipeline` itself is still unfitted afterwards).
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')

    # Fit on the full training data
    pipeline.fit(X_train, y_train)

    # Predictions on the held-out test data
    y_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    results[name] = {
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'test_accuracy': test_accuracy,
        'pipeline': pipeline
    }

    print(f"{name} - CV Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f}), Test Accuracy: {test_accuracy:.4f}")

# Find best model: ranked by mean CV accuracy, not by test accuracy, so the
# test split is not used for model selection.
best_model_name = max(results.items(), key=lambda x: x[1]['cv_mean'])[0]
print(f"\nBest model: {best_model_name}")

NameError: name 'XGBClassifier' is not defined