# 02 - Feature engineering

Transformation du format long en variables explicatives par bloc et par bureau.

In [1]:
from pathlib import Path
import sys

PROJECT_ROOT = Path.cwd().resolve()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
sys.path.append(str(PROJECT_ROOT))

import pandas as pd
import numpy as np

RAW_DIR = PROJECT_ROOT / "data" / "raw"
INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

mapping_path = PROJECT_ROOT / "data" / "mapping_candidats_blocs.csv"

pd.set_option("display.max_columns", 50)


## Chargement du format long

In [2]:
if (INTERIM_DIR / "elections_long.parquet").exists():
    elections_long = pd.read_parquet(INTERIM_DIR / "elections_long.parquet")
else:
    elections_long = pd.read_csv(INTERIM_DIR / "elections_long.csv", sep=";")

elections_long.head()

Unnamed: 0,code_bv,nom_bv,annee,date_scrutin,type_scrutin,tour,inscrits,votants,abstentions,blancs,nuls,exprimes,code_candidature,nom_candidature,voix,N° tour,Code du département,Code de la commune,Nom de la commune,Numéro bureau de vote,NÂ° de dépôt du candidat 1,Nom 1,Prénom 1,Code Nuance 1,Voix 1,...,Sièges 30,Nuance 31,Libellé Abrégé Liste 31,Sièges 31,Nuance 32,Libellé Abrégé Liste 32,Sièges 32,Nuance 33,Libellé Abrégé Liste 33,Sièges 33,Nuance 34,Libellé Abrégé Liste 34,Sièges 34,Nuance 35,Libellé Abrégé Liste 35,Sièges 35,Nuance 36,Libellé Abrégé Liste 36,Sièges 36,Nuance 37,Libellé Abrégé Liste 37,Sièges 37,Nuance 38,Libellé Abrégé Liste 38,Sièges 38
0,10001,,2014,2014-05-25,europeennes,1,1193,566,,,,549,,,0.0,1.0,34,1,Abeilhan,1,2.0,TORREMOCHA,Sandra,LEXG,1,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,10001,,2014,2014-05-25,europeennes,1,1193,566,,,,549,,,0.0,1.0,34,1,Abeilhan,1,3.0,MARTINEZ,Jean-Claude,LDVD,3,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,10001,,2014,2014-05-25,europeennes,1,1193,566,,,,549,,,0.0,1.0,34,1,Abeilhan,1,4.0,JUY,Monique,LDIV,2,...,,,,,,,,,,,,,,,,,,,,,,,,,
3,10001,,2014,2014-05-25,europeennes,1,1193,566,,,,549,,,0.0,1.0,34,1,Abeilhan,1,5.0,BOV�,Jos�,LVEC,46,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,10001,,2014,2014-05-25,europeennes,1,1193,566,,,,549,,,0.0,1.0,34,1,Abeilhan,1,6.0,ALIOT,Louis,LFN,245,...,,,,,,,,,,,,,,,,,,,,,,,,,


## Mapping candidats -> blocs politiques

In [3]:
mapping = pd.read_csv(mapping_path, sep=";")
elections_long = elections_long.merge(mapping, on="code_candidature", how="left")
elections_long.head()

Unnamed: 0,code_bv,nom_bv,annee,date_scrutin,type_scrutin,tour,inscrits,votants,abstentions,blancs,nuls,exprimes,code_candidature,nom_candidature_x,voix,N° tour,Code du département,Code de la commune,Nom de la commune,Numéro bureau de vote,NÂ° de dépôt du candidat 1,Nom 1,Prénom 1,Code Nuance 1,Voix 1,...,Nuance 32,Libellé Abrégé Liste 32,Sièges 32,Nuance 33,Libellé Abrégé Liste 33,Sièges 33,Nuance 34,Libellé Abrégé Liste 34,Sièges 34,Nuance 35,Libellé Abrégé Liste 35,Sièges 35,Nuance 36,Libellé Abrégé Liste 36,Sièges 36,Nuance 37,Libellé Abrégé Liste 37,Sièges 37,Nuance 38,Libellé Abrégé Liste 38,Sièges 38,nom_candidature_y,bloc_1,bloc_2,bloc_3
0,10001,,2014,2014-05-25,europeennes,1,1193,566,,,,549,,,0.0,1.0,34,1,Abeilhan,1,2.0,TORREMOCHA,Sandra,LEXG,1,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,10001,,2014,2014-05-25,europeennes,1,1193,566,,,,549,,,0.0,1.0,34,1,Abeilhan,1,3.0,MARTINEZ,Jean-Claude,LDVD,3,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,10001,,2014,2014-05-25,europeennes,1,1193,566,,,,549,,,0.0,1.0,34,1,Abeilhan,1,4.0,JUY,Monique,LDIV,2,...,,,,,,,,,,,,,,,,,,,,,,,,,
3,10001,,2014,2014-05-25,europeennes,1,1193,566,,,,549,,,0.0,1.0,34,1,Abeilhan,1,5.0,BOV�,Jos�,LVEC,46,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,10001,,2014,2014-05-25,europeennes,1,1193,566,,,,549,,,0.0,1.0,34,1,Abeilhan,1,6.0,ALIOT,Louis,LFN,245,...,,,,,,,,,,,,,,,,,,,,,,,,,


## Repartition des voix par bloc

In [4]:
from pathlib import Path
import sys
PROJECT_ROOT = Path.cwd().resolve()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
sys.path.append(str(PROJECT_ROOT))
import pandas as pd
import numpy as np
def voix_par_bloc(row) -> dict:
    blocs = [row.get("bloc_1"), row.get("bloc_2"), row.get("bloc_3")]
    blocs = [b for b in blocs if isinstance(b, str) and b]
    if not blocs:
        # fallback pour nuances non mappees : regrouper dans "divers"
        return {"divers": row.get("voix", 0)}
    part = row.get("voix", 0) / len(blocs)
    return {b: part for b in blocs}
records: list[dict] = []
for _, row in elections_long.iterrows():
    repartition = voix_par_bloc(row)
    for bloc, v in repartition.items():
        records.append(
            {
                "code_bv": row["code_bv"],
                "nom_bv": row.get("nom_bv"),
                "date_scrutin": row["date_scrutin"],
                "annee": row["annee"],
                "type_scrutin": row["type_scrutin"],
                "tour": row["tour"],
                "bloc": bloc,
                "voix_bloc": v,
                "exprimes": row["exprimes"],
                "inscrits": row["inscrits"],
                "votants": row["votants"],
            }
        )
elections_blocs = pd.DataFrame.from_records(records)
elections_blocs["part_bloc"] = elections_blocs["voix_bloc"] / elections_blocs["exprimes"]
elections_blocs.head()


Unnamed: 0,code_bv,nom_bv,date_scrutin,annee,type_scrutin,tour,bloc,voix_bloc,exprimes,inscrits,votants,part_bloc
0,10001,,2014-05-25,2014,europeennes,1,divers,0.0,549,1193,566,0.0
1,10001,,2014-05-25,2014,europeennes,1,divers,0.0,549,1193,566,0.0
2,10001,,2014-05-25,2014,europeennes,1,divers,0.0,549,1193,566,0.0
3,10001,,2014-05-25,2014,europeennes,1,divers,0.0,549,1193,566,0.0
4,10001,,2014-05-25,2014,europeennes,1,divers,0.0,549,1193,566,0.0


## Références nationales calculées automatiquement
Agrégation du score/participation national par scrutin & bloc si aucun fichier externe n'est fourni.


In [5]:
from src.pipeline import compute_national_reference, filter_target_communes, load_target_communes

# Calcul des références nationales directement à partir des données si aucune table externe
nat_ref = compute_national_reference(elections_blocs)

# Fusion des références nationales
elections_blocs = elections_blocs.merge(nat_ref, on=["date_scrutin", "bloc"], how="left")

# Écarts locaux vs national
elections_blocs["taux_participation_bv"] = elections_blocs["votants"] / elections_blocs["inscrits"]
elections_blocs["ecart_bloc_vs_national"] = elections_blocs["part_bloc"] - elections_blocs["part_bloc_national"]
elections_blocs["ecart_participation_vs_nat"] = (
    elections_blocs["taux_participation_bv"] - elections_blocs["taux_participation_national"]
)

# Restreindre aux communes cibles pour alléger la suite
target_communes = load_target_communes()
elections_blocs = filter_target_communes(elections_blocs, target_communes)


## Evolution de la population inscrite

In [6]:
bv_pop = elections_blocs.groupby(["code_bv", "annee"], as_index=False)["inscrits"].mean()
base_year = 2014
bv_base = (
    bv_pop[bv_pop["annee"] == base_year][["code_bv", "inscrits"]]
    .rename(columns={"inscrits": "inscrits_base"})
)
bv_pop = bv_pop.merge(bv_base, on="code_bv", how="left")
bv_pop["croissance_inscrits_depuis_base"] = (
    bv_pop["inscrits"] - bv_pop["inscrits_base"]
) / bv_pop["inscrits_base"]

elections_blocs = elections_blocs.merge(
    bv_pop[["code_bv", "annee", "croissance_inscrits_depuis_base"]],
    on=["code_bv", "annee"],
    how="left",
)

## Lags et variables de contexte

In [7]:
elections_blocs = elections_blocs.sort_values(["code_bv", "bloc", "date_scrutin"])
elections_blocs["part_bloc_lag1"] = elections_blocs.groupby(["code_bv", "bloc"])["part_bloc"].shift(1)
elections_blocs["ecart_bloc_vs_national_lag1"] = (
    elections_blocs.groupby(["code_bv", "bloc"])["ecart_bloc_vs_national"].shift(1)
)
elections_blocs["taux_participation_bv_lag1"] = (
    elections_blocs.groupby(["code_bv", "bloc"])["taux_participation_bv"].shift(1)
)
elections_blocs["annee_centre"] = elections_blocs["annee"] - elections_blocs["annee"].median()

elections_blocs.head()

Unnamed: 0,code_bv,nom_bv,date_scrutin,annee,type_scrutin,tour,bloc,voix_bloc,exprimes,inscrits,votants,part_bloc,part_bloc_national,taux_participation_national,taux_participation_bv,ecart_bloc_vs_national,ecart_participation_vs_nat,code_commune,nom_commune,croissance_inscrits_depuis_base,part_bloc_lag1,ecart_bloc_vs_national_lag1,taux_participation_bv_lag1,annee_centre
750,343010001,,2014-03-23,2014,municipales,1,divers,0.0,641,962,647,0.0,0.0,0.668182,0.672557,0.0,0.004375,34301,Sete,0.0,,,,0.0
751,343010001,,2014-03-23,2014,municipales,1,divers,0.0,641,962,647,0.0,0.0,0.668182,0.672557,0.0,0.004375,34301,Sete,0.0,0.0,0.0,0.672557,0.0
752,343010001,,2014-03-23,2014,municipales,1,divers,0.0,641,962,647,0.0,0.0,0.668182,0.672557,0.0,0.004375,34301,Sete,0.0,0.0,0.0,0.672557,0.0
753,343010001,,2014-03-23,2014,municipales,1,divers,0.0,641,962,647,0.0,0.0,0.668182,0.672557,0.0,0.004375,34301,Sete,0.0,0.0,0.0,0.672557,0.0
754,343010001,,2014-03-23,2014,municipales,1,divers,0.0,641,962,647,0.0,0.0,0.668182,0.672557,0.0,0.004375,34301,Sete,0.0,0.0,0.0,0.672557,0.0


## Sauvegarde

In [8]:
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
elections_blocs.to_parquet(PROCESSED_DIR / "elections_blocs.parquet", index=False)
elections_blocs.to_csv(PROCESSED_DIR / "elections_blocs.csv", sep=";", index=False)
elections_blocs.shape

(1650, 24)