In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# 2. Nettoyage des données de consommation

## 2.1 Inspection initiale

In [4]:
# Load the 2019 consumption table (long format) for a first inspection.
df_cons = pd.read_csv("/home/onyxia/france-grid-stress-prediction/data/processed/consommation_2019_long.csv")

# A notebook cell only renders its last expression automatically, so the
# preview has to go through display() or it is silently discarded.
display(df_cons.head())
df_cons.info()
df_cons.describe()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17520 entries, 0 to 17519
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   datetime    17520 non-null  object
 1   date        17520 non-null  object
 2   year        17520 non-null  int64 
 3   statut      17520 non-null  object
 4   slot_index  17520 non-null  int64 
 5   load_mw     17520 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 821.4+ KB


Unnamed: 0,year,slot_index,load_mw
count,17520.0,17520.0,17520.0
mean,2019.0,23.5,53716.64903
std,0.0,13.853794,11555.650027
min,2019.0,0.0,30777.0
25%,2019.0,11.75,45030.75
50%,2019.0,23.5,52144.0
75%,2019.0,35.25,61459.0
max,2019.0,47.0,88450.0


## 2.2 Nettoyage structurel

In [5]:
# Parse the timestamp strings into real datetimes, then confirm the new dtype.
df_cons = df_cons.assign(datetime=pd.to_datetime(df_cons["datetime"]))
df_cons.dtypes


datetime      datetime64[ns]
date                  object
year                   int64
statut                object
slot_index             int64
load_mw                int64
dtype: object

In [6]:
# Every timestamp should appear exactly once (True when there is no duplicate).
not df_cons["datetime"].duplicated().any()


True

In [7]:
# Order rows chronologically and measure the gap between consecutive timestamps.
df_cons = df_cons.sort_values(by="datetime")
delta = df_cons.loc[:, "datetime"].diff()

# Frequency table of the observed gaps.
delta.value_counts()



datetime
0 days 00:30:00    17519
Name: count, dtype: int64

In [8]:
# List the steps that are not exactly 30 minutes. The first row always shows up
# because diff() has no previous timestamp there (NaT compares as "not equal").
delta[delta.ne(pd.Timedelta(minutes=30))].head()


0   NaT
Name: datetime, dtype: timedelta64[ns]

In [9]:
# Distribution of the status flag, to see whether it carries any information.
df_cons.loc[:, "statut"].value_counts()


statut
Provisoires    17520
Name: count, dtype: int64

In [10]:
# Remove the status column from the working frame.
df_cons = df_cons.drop("statut", axis="columns")


In [11]:
# Rebuild the calendar columns directly from the parsed timestamps.
df_cons = df_cons.assign(
    year=df_cons["datetime"].dt.year,
    date=df_cons["datetime"].dt.date,
    hour=df_cons["datetime"].dt.hour,
    minute=df_cons["datetime"].dt.minute,
)


In [12]:
# Remove the slot index column from the working frame.
df_cons = df_cons.drop("slot_index", axis="columns")


In [13]:
# Print the resulting schema, then preview the first five rows.
df_cons.info()
df_cons.iloc[:5]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17520 entries, 0 to 17519
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  17520 non-null  datetime64[ns]
 1   date      17520 non-null  object        
 2   year      17520 non-null  int32         
 3   load_mw   17520 non-null  int64         
 4   hour      17520 non-null  int32         
 5   minute    17520 non-null  int32         
dtypes: datetime64[ns](1), int32(3), int64(1), object(1)
memory usage: 616.1+ KB


Unnamed: 0,datetime,date,year,load_mw,hour,minute
0,2019-01-01 00:00:00,2019-01-01,2019,53574,0,0
1,2019-01-01 00:30:00,2019-01-01,2019,52882,0,30
2,2019-01-01 01:00:00,2019-01-01,2019,53140,1,0
3,2019-01-01 01:30:00,2019-01-01,2019,52870,1,30
4,2019-01-01 02:00:00,2019-01-01,2019,53476,2,0


## 2.3 Pipeline de nettoyage

In [14]:
from pathlib import Path
import pandas as pd
import numpy as np

# Directory that holds the processed per-year consumption CSV files.
DATA_DIR = Path("/home/onyxia/france-grid-stress-prediction/data/processed")
# Destination of the merged, cleaned consumption table.
OUT_PATH = DATA_DIR.joinpath("consommation_clean.parquet")


# Expected sampling step, based on the 2019 file (48 slots per day).
EXPECTED_FREQ = pd.Timedelta(minutes=30)

def clean_consumption_file(path: Path) -> pd.DataFrame:
    """Clean one ``consommation_YYYY_long.csv`` file and return a standardized DataFrame.

    Column names are stripped and lower-cased, timestamps are parsed, rows are
    sorted chronologically and de-duplicated on ``datetime`` (the first row in
    file order wins), and the calendar columns are rebuilt from the parsed
    timestamps. Any other source column (e.g. ``statut``) is left out of the
    returned frame, which always has the same fixed set of columns.

    Parameters
    ----------
    path : Path or str
        Location of the CSV file. After normalization of the header it must
        contain at least the ``datetime`` and ``load_mw`` columns.

    Returns
    -------
    pd.DataFrame
        Columns ``datetime, year, date, hour, minute, slot_index, load_mw``.
        Non-numeric or negative ``load_mw`` values are returned as NaN.

    Raises
    ------
    ValueError
        If a required column is missing or a ``datetime`` value cannot be parsed.
    """
    # Accept plain strings as well: the error messages below need ``Path.name``.
    path = Path(path)
    df = pd.read_csv(path)

    # 1) Normalize column names
    df.columns = [c.strip().lower() for c in df.columns]

    # 2) Check that the minimal columns are present
    required = {"datetime", "load_mw"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"{path.name}: colonnes manquantes: {missing}")

    # 3) Parse timestamps; unparseable values become NaT and are reported
    df["datetime"] = pd.to_datetime(df["datetime"], errors="coerce")
    if df["datetime"].isna().any():
        bad = df[df["datetime"].isna()].head(3)
        raise ValueError(f"{path.name}: datetimes invalides. Exemples:\n{bad}")

    # 4) Sort + drop duplicated timestamps. The sort has to be stable so that
    #    keep="first" deterministically retains the earliest row in file order.
    df = df.sort_values("datetime", kind="mergesort")
    df = df.drop_duplicates(subset=["datetime"], keep="first")

    # 5) Sanity checks on the load: non-numeric -> NaN, negative -> NaN.
    #    mask() builds a new Series (upcast to float when needed) instead of
    #    writing NaN into a possibly integer column through .loc, which is an
    #    incompatible-dtype assignment in recent pandas versions.
    load = pd.to_numeric(df["load_mw"], errors="coerce")
    df["load_mw"] = load.mask(load < 0)

    # 6) Rebuild clean calendar variables from the parsed timestamps
    df["year"] = df["datetime"].dt.year
    df["date"] = df["datetime"].dt.date
    df["hour"] = df["datetime"].dt.hour
    df["minute"] = df["datetime"].dt.minute

    # 7) Standardized slot index (0..47 for a 30-minute step)
    df["slot_index"] = (df["hour"] * 60 + df["minute"]) // 30

    # 8) Keep only the final columns (stable schema). Every other source
    #    column, including ``statut``, is dropped here.
    df = df[["datetime", "year", "date", "hour", "minute", "slot_index", "load_mw"]]

    return df


def check_time_continuity(df: pd.DataFrame, freq: pd.Timedelta = EXPECTED_FREQ) -> dict:
    """Summarize the time continuity of ``df["datetime"]`` (gaps / unusual steps).

    The timestamps are sorted and the differences between consecutive values
    are compared with ``freq``. The returned dict holds the expected step as a
    string (``expected_freq``), the five most frequent observed steps with
    their counts (``top_deltas``) and the number of steps that differ from
    ``freq`` (``n_non_expected_steps``).
    """
    steps = df["datetime"].sort_values().diff()
    # The first difference is NaT (no predecessor); it is not counted as a bad step.
    unexpected = steps.notna() & steps.ne(freq)
    summary = {
        "expected_freq": str(freq),
        "top_deltas": steps.value_counts().head(5).to_dict(),
        "n_non_expected_steps": int(unexpected.sum()),
    }
    return summary


# 1) List the per-year consumption files
files = sorted(DATA_DIR.glob("consommation_*_long.csv"))
if not files:
    raise FileNotFoundError(f"Aucun fichier consommation trouvé dans {DATA_DIR}")

# 2) Clean each file, record its continuity report, then concatenate
cleaned = []
reports = []

for f in files:
    df_f = clean_consumption_file(f)
    rep = check_time_continuity(df_f)
    rep["file"] = f.name
    rep["rows"] = len(df_f)
    reports.append(rep)
    cleaned.append(df_f)

# The sort must be stable: rows are concatenated in sorted file order, and the
# de-duplication below keeps the first row of each timestamp, so a stable sort
# makes the surviving row deterministic when two files overlap.
df_cons_all = pd.concat(cleaned, ignore_index=True).sort_values("datetime", kind="mergesort")

# 3) Global checks: a timestamp must appear only once across all files
df_cons_all = df_cons_all.drop_duplicates(subset=["datetime"], keep="first")

# 4) Save the merged table
df_cons_all.to_parquet(OUT_PATH, index=False)

# Per-file continuity summary, displayed as the cell output
reports_df = pd.DataFrame(reports).sort_values("file")
reports_df


Unnamed: 0,expected_freq,top_deltas,n_non_expected_steps,file,rows
0,0 days 00:30:00,{0 days 00:30:00: 17519},0,consommation_2010_long.csv,17520
1,0 days 00:30:00,{0 days 00:30:00: 17519},0,consommation_2011_long.csv,17520
2,0 days 00:30:00,{0 days 00:30:00: 17567},0,consommation_2012_long.csv,17568
3,0 days 00:30:00,{0 days 00:30:00: 17519},0,consommation_2013_long.csv,17520
4,0 days 00:30:00,{0 days 00:30:00: 17519},0,consommation_2014_long.csv,17520
5,0 days 00:30:00,{0 days 00:30:00: 17519},0,consommation_2015_long.csv,17520
6,0 days 00:30:00,{0 days 00:30:00: 17567},0,consommation_2016_long.csv,17568
7,0 days 00:30:00,{0 days 00:30:00: 17519},0,consommation_2017_long.csv,17520
8,0 days 00:30:00,{0 days 00:30:00: 17519},0,consommation_2018_long.csv,17520
9,0 days 00:30:00,{0 days 00:30:00: 17519},0,consommation_2019_long.csv,17520


# 3. Nettoyage des données météo

In [16]:
# Load the weather CSV (file name indicates 32 cities, year 2019).
df_weather = pd.read_csv("/home/onyxia/france-grid-stress-prediction/weather_32_cities_2019.csv")
