# Import essential libraries

In [42]:
# Import essential libraries
import pandas as pd
import numpy as np

# ML preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# ML evaluation
from sklearn.metrics import accuracy_score

# Load the cleaned dataset
df = pd.read_csv('../data/elections_prepared.csv')

# Check the first rows
print(df.head())

# Check NaN values
print("NaN values per column:" )
print(df.isna().sum())

   ANNEE DEPARTEMENT_CODE              DEPARTEMENT    WINNER  NB_INSCRITS  \
0   2024               01                      Ain  E.DROITE       446979   
1   2024               02                    Aisne    DROITE        73181   
2   2024               03                   Allier  E.DROITE       248529   
3   2024               04  Alpes-de-Haute-Provence    GAUCHE       128146   
4   2024               05             Hautes-Alpes    GAUCHE       114587   

   NB_VOTANTS PARTI_1  VOIX_1   PARTI_2  VOIX_2  ... PARTI_5   VOIX_5 PARTI_6  \
0      311188  GAUCHE   19964    CENTRE  103368  ...  DROITE  27040.0     NaN   
1       46620  DROITE   22933  E.DROITE   22409  ...     NaN      NaN     NaN   
2      171908  GAUCHE   43029    DROITE   46601  ...     NaN      NaN     NaN   
3       90407  GAUCHE   39040  E.DROITE   21536  ...     NaN      NaN     NaN   
4       82882  GAUCHE   40743  E.DROITE   34857  ...     NaN      NaN     NaN   

   VOIX_6 PARTI_7  VOIX_7 PARTI_8  VOIX_8 PARTI_9 

# Preparing Data for Modeling

In [39]:
# Check unique values in the target variable
print("Unique values in WINNER column:", df['WINNER'].unique())

# Encode the target variable 
label_encoder = LabelEncoder()
df['WINNER_ENCODED'] = label_encoder.fit_transform(df['WINNER'])

# Map to see the correspondence
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label mapping:", label_mapping)

# Define features and target
X = df.drop(['WINNER', 'WINNER_ENCODED'], axis=1)
y = df['WINNER_ENCODED']

# Identify categorical and numerical features
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Create preprocessing pipeline with proper NaN handling
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NONE')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Target distribution: \n{y.value_counts()}")

Unique values in WINNER column: ['E.DROITE' 'DROITE' 'GAUCHE' 'CENTRE' 'E.GAUCHE']
Label mapping: {'CENTRE': np.int64(0), 'DROITE': np.int64(1), 'E.DROITE': np.int64(2), 'E.GAUCHE': np.int64(3), 'GAUCHE': np.int64(4)}
Training set shape: (248, 23)
Test set shape: (62, 23)
Target distribution: 
WINNER_ENCODED
0    136
2     75
4     59
1     39
3      1
Name: count, dtype: int64


# Model Benchmarking

In [40]:
# Define models to compare
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    'SVC': SVC(probability=True, random_state=42)
}

# Compare models using cross-validation
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    results[name] = {
        'accuracy': accuracy,
        'pipeline': pipeline
    }
    
    print(f"{name} - Accuracy: {accuracy:.4f}")

# Find best model
best_model_name = max(results.items(), key=lambda x: x[1]['accuracy'])[0]
print(f"\nBest model: {best_model_name}")

# Tune the best model
best_pipeline = results[best_model_name]['pipeline']

# For future predictions, we'll need to decode the numeric predictions back to party names
# So when we predict future elections:
latest_year = df['ANNEE'].max()
latest_data = df[df['ANNEE'] == latest_year].copy()

predict_2026 = latest_data.copy()
predict_2026['ANNEE'] = 2026

X_predict = predict_2026.drop(['WINNER', 'WINNER_ENCODED', 'DEPARTEMENT'], axis=1)

# Make predictions (these will be encoded values)
predictions_encoded = best_pipeline.predict(X_predict)

# Decode back to party names
predictions_2026 = label_encoder.inverse_transform(predictions_encoded)

# Create results dataframe
results_df = pd.DataFrame({
    'DEPARTEMENT_CODE': predict_2026['DEPARTEMENT_CODE'],
    'DEPARTEMENT': predict_2026['DEPARTEMENT'],
    'WINNER_2024': latest_data['WINNER'],
    'PREDICTED_WINNER_2026': predictions_2026
})

print("\n2026 Election Predictions (Sample):")
print(results_df.head(10))

Training Logistic Regression...
Logistic Regression - Accuracy: 0.7419
Training Random Forest...
Random Forest - Accuracy: 0.6935
Training Gradient Boosting...
Gradient Boosting - Accuracy: 0.7258
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost - Accuracy: 0.7419
Training SVC...
SVC - Accuracy: 0.7581

Best model: SVC


ValueError: columns are missing: {'DEPARTEMENT'}