# Import essential libraries

In [76]:
# Import joblib to save and load Python objects
import joblib
import os

# Import essential libraries
import pandas as pd
import numpy as np
import sqlite3
import os

# ML preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# ML evaluation
from sklearn.metrics import accuracy_score

# SQL treatment

In [63]:
# Path to the SQLite database, relative to the notebook's working directory
db_path = "../database/ELECTIONS.db"

# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, which would only surface later as a confusing "no such table"
# error. Fail early with an explicit message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found: {os.path.abspath(db_path)}")

# Open the connection to the database
conn = sqlite3.connect(db_path)

Let's aggregate our data per mandate (2017-2022)

In [64]:
# Build one aggregated row per department and per group of years.
# The statement is two SELECTs over ELECTIONS_ALL glued together with UNION ALL:
#   - the first one covers YEAR 2018..2023,
#   - the second one covers YEAR 2017 only.
# Each SELECT groups by DEPARTMENT_CODE, averages the four indicators
# (rounded to 2 decimals, 0 decimals for NUMBER_OF_VICTIMS) and keeps
# MAX(WINNER) as the label, so a department can appear up to twice in the result.
# The trailing ORDER BY sorts the combined result by DEPARTMENT_CODE.
# NOTE(review): MAX(WINNER) keeps a single value per group; if a group holds
# several distinct WINNER values the one kept is just the largest by text
# ordering -- confirm WINNER is unique (or set only once) within each year group.
# NOTE(review): the accompanying text mentions 2017-2022 while the filters cover
# 2017 and 2018-2023 -- confirm the intended year ranges.
query = """
SELECT 
    DEPARTMENT_CODE,
    MAX(WINNER) as WINNER,
    ROUND(AVG(POVERTY_RATE), 2) as avg_POVERTY_RATE,
    ROUND(AVG(UNEMPLOYMENT_RATE), 2) as avg_UNEMPLOYMENT_RATE,
    ROUND(AVG(IMMIGRATION_RATE), 2) as avg_IMMIGRATION_RATE,
    ROUND(AVG(NUMBER_OF_VICTIMS), 0) as avg_NUMBER_OF_VICTIMS
FROM ELECTIONS_ALL
WHERE YEAR IN (2018, 2019, 2020, 2021, 2022, 2023)
GROUP BY DEPARTMENT_CODE

UNION ALL

SELECT 
    DEPARTMENT_CODE,
    MAX(WINNER) as WINNER,
    ROUND(AVG(POVERTY_RATE), 2) as avg_POVERTY_RATE,
    ROUND(AVG(UNEMPLOYMENT_RATE), 2) as avg_UNEMPLOYMENT_RATE,
    ROUND(AVG(IMMIGRATION_RATE), 2) as avg_IMMIGRATION_RATE,
    ROUND(AVG(NUMBER_OF_VICTIMS), 0) as avg_NUMBER_OF_VICTIMS
FROM ELECTIONS_ALL
WHERE YEAR IN (2017)
GROUP BY DEPARTMENT_CODE
ORDER BY DEPARTMENT_CODE;
"""

# Run the query through the open connection and load the result into a DataFrame
df = pd.read_sql_query(query, conn)
# Display the first 20 rows as the cell output
df.head(20)

Unnamed: 0,DEPARTMENT_CODE,WINNER,avg_POVERTY_RATE,avg_UNEMPLOYMENT_RATE,avg_IMMIGRATION_RATE,avg_NUMBER_OF_VICTIMS
0,1,GAUCHE,10.8,5.88,12.03,22002.0
1,1,DROITE,10.5,6.75,11.57,20552.0
2,2,E.DROITE,18.93,11.15,4.83,21547.0
3,2,E.DROITE,18.5,12.97,4.37,20206.0
4,3,GAUCHE,16.25,8.38,5.34,11864.0
5,3,CENTRE,15.4,9.68,4.93,11763.0
6,4,GAUCHE,17.13,9.08,8.3,7229.0
7,4,CENTRE,16.6,10.7,7.6,7030.0
8,5,CENTRE,14.7,7.55,5.77,5688.0
9,5,CENTRE,13.9,8.68,5.6,6002.0


Now that we have our training data (first and second mandates),  
we have to define our features (X) and target (y)

## Features & Target

In [65]:
# Structural overview: dtypes and non-null counts for every column
print("DataFrame Info:")
df.info()

# Number of missing entries in each column
missing_per_column = df.isna().sum()
print("\nMissing values check:")
print(missing_per_column)
print("\n---")

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   DEPARTMENT_CODE        188 non-null    object 
 1   WINNER                 187 non-null    object 
 2   avg_POVERTY_RATE       188 non-null    float64
 3   avg_UNEMPLOYMENT_RATE  188 non-null    float64
 4   avg_IMMIGRATION_RATE   188 non-null    float64
 5   avg_NUMBER_OF_VICTIMS  188 non-null    float64
dtypes: float64(4), object(2)
memory usage: 8.9+ KB

Missing values check:
DEPARTMENT_CODE          0
WINNER                   1
avg_POVERTY_RATE         0
avg_UNEMPLOYMENT_RATE    0
avg_IMMIGRATION_RATE     0
avg_NUMBER_OF_VICTIMS    0
dtype: int64

---


In [66]:
# A missing label cannot be used for stratified splitting, so rows without a WINNER are discarded.
df = df.dropna(subset=['WINNER'])

# 'E.GAUCHE' is reported to occur only once in the WINNER column; a class with a
# single sample makes the stratified train/test split fail, so that row is removed.
df = df.loc[df['WINNER'] != 'E.GAUCHE']

# Target (y) is the winning political family; features (X) are all remaining columns.
y = df['WINNER']
X = df.drop(columns='WINNER')

We now need to identify the numerical and categorical features so that they can be processed differently

In [67]:
# Split the feature columns by type: the department code is the only
# categorical one, every aggregated indicator (prefixed 'avg_') is numerical.
categorical_features = ['DEPARTMENT_CODE']
numerical_features = X.columns[X.columns.str.startswith('avg_')].tolist()

print(f"Identified Categorical features: {categorical_features}")
print(f"Identified Numerical features: {numerical_features}")
print("---\n")


Identified Categorical features: ['DEPARTMENT_CODE']
Identified Numerical features: ['avg_POVERTY_RATE', 'avg_UNEMPLOYMENT_RATE', 'avg_IMMIGRATION_RATE', 'avg_NUMBER_OF_VICTIMS']
---



Now we have to encode our categorical feature DEPARTMENT_CODE, because ML models work on numerical inputs  
Likewise, we standardize the numerical features with StandardScaler (rescaling each one to zero mean and unit variance, so that large values do not dominate)

In [68]:
# Feature preprocessing (X)
# One transformer per feature type, assembled into a single ColumnTransformer.

# Categorical: one dense indicator column per department seen during fit;
# categories unseen at fit time are ignored (encoded as all zeros) at transform time.
department_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Numerical: standardize each indicator
indicator_scaler = StandardScaler()

preprocessor_X = ColumnTransformer(
    transformers=[
        ('cat', department_encoder, categorical_features),
        ('num', indicator_scaler, numerical_features),
    ],
    # Columns not listed above are forwarded untouched (none are expected here)
    remainder='passthrough',
)

The LabelEncoder will encode our label (target) y, mapping each distinct class to an integer

In [69]:
# Turn the string labels in y into integer codes.
# The encoder is fitted first, then applied, so its learned classes can be inspected below.
label_encoder_y = LabelEncoder()
label_encoder_y.fit(y)
y_encoded = label_encoder_y.transform(y)

print(f"Target (y) categories found: {label_encoder_y.classes_}")
print("---\n")

Target (y) categories found: ['CENTRE' 'DROITE' 'E.DROITE' 'GAUCHE']
---



## Splitting data

Now that the target is encoded and the feature preprocessor is defined, we can split our data into training and testing sets

In [70]:
# Split features and encoded target into training and testing sets.
split_options = dict(
    test_size=0.2,       # hold out 20% of the rows for testing
    random_state=42,     # fixed seed so the split is reproducible
    stratify=y_encoded,  # keep the class proportions identical in both sets
)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

Fitting the preprocessor on the training data only is very important to avoid data leakage  
=> The test set remains completely unseen during fitting, so no information from it can leak into the training by accident  

When fitted, our preprocessor_X will:  
- StandardScaler: compute the mean and standard deviation of each numerical feature  
- OneHotEncoder: identify all the unique categories

In [72]:
# Learn the encoding categories and scaling statistics from the training split only,
# so nothing about the test split influences the preprocessing.
preprocessor_X.fit(X_train)

# Apply the same fitted transformation to both splits
X_train_processed = preprocessor_X.transform(X_train)
X_test_processed = preprocessor_X.transform(X_test)

In [73]:
# Report the shapes before and after preprocessing, plus the target shapes
shape_lines = [
    f"Original X_train shape: {X_train.shape}",
    f"Processed X_train shape: {X_train_processed.shape}",
    f"Original X_test shape: {X_test.shape}",
    f"Processed X_test shape: {X_test_processed.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
]
print("\n".join(shape_lines))

Original X_train shape: (148, 5)
Processed X_train shape: (148, 94)
Original X_test shape: (38, 5)
Processed X_test shape: (38, 94)
y_train shape: (148,)
y_test shape: (38,)


## Export data

We will export the data to a new notebook to start the ML training

In [79]:
# Column headers for the processed matrices, taken from the fitted preprocessor.
# With the default ColumnTransformer settings each name is prefixed by its
# transformer ('cat__...', 'num__...').
# No fallback for older scikit-learn is kept: the preprocessor is built with
# OneHotEncoder(sparse_output=...), which already requires a release that
# provides get_feature_names_out(), so such a fallback could never run.
feature_names = preprocessor_X.get_feature_names_out()

# Wrap the processed numpy arrays into DataFrames with readable column names
X_train_processed_df = pd.DataFrame(X_train_processed, columns=feature_names)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=feature_names)

# Wrap the encoded target; np.asarray() discards any incoming index so the
# rows always line up positionally with the feature frames (fresh 0..n-1 index)
y_train_df = pd.DataFrame({'WINNER_encoded': np.asarray(y_train)})
y_test_df = pd.DataFrame({'WINNER_encoded': np.asarray(y_test)})

# Guard against silently misaligned concatenation (which would introduce NaNs)
assert len(X_train_processed_df) == len(y_train_df), "train features/target row mismatch"
assert len(X_test_processed_df) == len(y_test_df), "test features/target row mismatch"

# Combine features and target side by side into the final DataFrames
train_df = pd.concat([X_train_processed_df, y_train_df], axis=1)
test_df = pd.concat([X_test_processed_df, y_test_df], axis=1)

print("Created final training and testing DataFrames.")
print(f"Training DataFrame shape: {train_df.shape}")
print(f"Testing DataFrame shape: {test_df.shape}")
print("\n---\n")

Created final training and testing DataFrames.
Training DataFrame shape: (148, 95)
Testing DataFrame shape: (38, 95)

---

