In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

### Data Collection

In [None]:
import requests

# URLs of the training and test CSV files for this exercise;
# both are passed to download_file() further down in this cell.
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the downloaded bytes are written to
            (opened in binary mode, overwriting any existing file).
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default, so without this a
            stalled connection would block the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly instead of saving an error page
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both datasets into the working directory (train first, then test)
for source_url, local_name in (
    (train_data_url, 'module5_exercise_train.csv'),
    (test_data_url, 'module5_exercise_test.csv'),
):
    download_file(source_url, local_name)

In [None]:
# Load the two downloaded CSV files into DataFrames (train first, then test)
df_train, df_test = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("module5_exercise_train.csv", "module5_exercise_test.csv")
)

### Data Analysis

In [None]:
#### Perform a complete data preprocessing analysis
# Inconsistencies
# Duplicates (data.duplicated().sum())
# Missing values (data.isnull().sum())
# Categorical
# Outliers
# Feature Engineering
# Feature Selection and/or Dimensionality Reduction

In [None]:
# Stack train and test rows into one frame for exploration.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# row labels of df_train and df_test are both kept, so labels repeat and
# label-based selection (.loc) can return more than one row per label.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [None]:
# (rows, columns) of the training frame
df_train.shape

In [None]:
# (rows, columns) of the test frame
df_test.shape

In [None]:
def plot_feature_over_time(df, feature, date_id_start, date_id_end):
    """Draw one column of ``df`` against its ``date`` column over a date window.

    Args:
        df: DataFrame with a ``date`` column and the column to plot.
        feature: Name of the column to draw on the y-axis.
        date_id_start: Lower bound of the window (inclusive).
        date_id_end: Upper bound of the window (inclusive).

    Returns:
        None. If ``feature`` is not a column, a message is printed and
        nothing is drawn.
    """
    # Keep only rows whose date lies inside [date_id_start, date_id_end]
    window = df[df['date'].between(date_id_start, date_id_end)]

    if feature not in window.columns:
        print(f"Feature '{feature}' not found in the DataFrame.")
        return

    dates, values = window['date'], window[feature]

    plt.figure(figsize=(10, 6))
    plt.plot(dates, values, label=feature, linestyle='-')
    plt.title(f'{feature} from {date_id_start} to {date_id_end}')
    plt.xlabel('Date')
    plt.ylabel(feature)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()



In [None]:
# Parse the date column into pandas datetimes
data = data.assign(date=pd.to_datetime(data['date']))

In [None]:
# Display the combined train + test frame
data

In [None]:
# Inspect the wind_speed column of the combined frame
data['wind_speed']

In [None]:
# Electricity demand between 2017-01-01 and 2019-09-07
plot_feature_over_time(
    df=data,
    feature='electricity_demand',
    date_id_start='2017-01-01',
    date_id_end='2019-09-07',
)

In [None]:
# Humidity between 2016-06-01 and 2016-12-01
plot_feature_over_time(
    df=data,
    feature='humidity',
    date_id_start='2016-06-01',
    date_id_end='2016-12-01',
)

### Data Preprocessing Evaluation Strategy

In [None]:
# Provide the complete set of data preprocessing transformations

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

def find_bad_targets(df, target="electricity_demand", n_splits=5, thr_abs=1000):
    """Report, fold by fold, how many target values look corrupted.

    A target value is flagged when it is ``<= 0`` or when its absolute
    value exceeds ``thr_abs``. Values that cannot be parsed as numbers
    become NaN and are not flagged.

    Args:
        df: DataFrame containing the target column (and optionally ``date``).
        target: Name of the target column.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        thr_abs: Absolute-value threshold above which a target is flagged.

    Returns:
        DataFrame with one row per fold that contains at least one flagged
        value: fold number, flagged counts for train/validation and up to
        five example row labels for each. Empty when nothing is flagged.
    """
    X = df.drop(columns=[target]).copy()
    y = pd.to_numeric(df[target], errors="coerce")

    # Put rows in chronological order when a date column is available
    if "date" in X.columns:
        chrono = pd.to_datetime(X["date"], errors="coerce").sort_values().index
        X = X.loc[chrono].reset_index(drop=True)
        y = y.loc[chrono].reset_index(drop=True)

    def _flagged_labels(values):
        # Simple criteria: non-positive, or too large in absolute value
        suspicious = (values <= 0) | (values.abs() > thr_abs)
        return values.index[suspicious].tolist()

    report = []
    splitter = TimeSeriesSplit(n_splits=n_splits)

    for fold_no, (train_pos, val_pos) in enumerate(splitter.split(X), start=1):
        bad_train = _flagged_labels(y.iloc[train_pos])
        bad_val = _flagged_labels(y.iloc[val_pos])

        if not (bad_train or bad_val):
            continue

        report.append({
            "fold": fold_no,
            "bad_train_count": len(bad_train),
            "bad_val_count": len(bad_val),
            "bad_train_examples": bad_train[:5],
            "bad_val_examples": bad_val[:5],
        })

    return pd.DataFrame(report)

# Exemple:
# bad = find_bad_targets(df_train, target="electricity_demand", n_splits=5, thr_abs=1000)
# print(bad)


In [None]:
def sanitize_target(df, target="electricity_demand"):
    """Return a copy of ``df`` whose target column is numeric, positive and winsorized.

    Steps, in order: coerce the target to numeric (unparseable values
    become NaN), turn values ``<= 0`` into NaN, clip the remaining values
    to their own 0.5% / 99.5% quantiles, then drop rows whose target is
    NaN and renumber the index from 0.

    Args:
        df: Input DataFrame; it is not modified.
        target: Name of the target column.

    Returns:
        The cleaned DataFrame.
    """
    out = df.copy()

    # Clean numeric conversion
    out[target] = pd.to_numeric(out[target], errors="coerce")

    # Impossible / obviously corrupted values become missing
    non_positive = out[target].le(0)
    out.loc[non_positive, target] = np.nan

    # Winsorize with robust quantiles so huge outliers are capped
    lower, upper = out[target].quantile([0.005, 0.995])
    out[target] = out[target].clip(lower=lower, upper=upper)

    # Rows still lacking a target cannot be used
    return out.dropna(subset=[target]).reset_index(drop=True)


In [None]:
# 1) clean the target
df_clean = sanitize_target(df_train, target="electricity_demand")

# 2) build X / y
X = df_clean.drop(columns=["electricity_demand"])
y = df_clean["electricity_demand"].copy()

# 3) (re)run your evaluation (your "classmate" version or my strict-CV one)
# NOTE(review): `evaluate_pipeline` is not defined in any cell visible in this
# notebook, so on a fresh kernel this line presumably raises NameError — confirm
# where it comes from. The call expects a 5-tuple back, whereas the
# `evaluate_pipeline_strict` function defined in a later cell returns a single float.
mean_val_mse, X_trs, X_vas, y_trs, y_vas = evaluate_pipeline(X, y, n_splits=5, alpha=10.0, scale=True)
print("\nScore final (mean Val MSE):", mean_val_mse)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

def diagnose_timeseries(X: pd.DataFrame, y: pd.Series, n_splits=5):
    """Compare two naive baselines on each ``TimeSeriesSplit`` fold.

    For every fold the validation target is scored (MSE) against
    a constant prediction equal to the training-fold mean, and against a
    lag-1 prediction (the target value one position earlier).

    Args:
        X: Feature frame; if it has a ``date`` column, rows of ``X`` and
            ``y`` are first put in chronological order.
        y: Target series sharing ``X``'s index.
        n_splits: Number of ``TimeSeriesSplit`` folds.

    Returns:
        DataFrame with one row per fold: train/validation mean and std of
        the target, both baseline MSEs and the validation length.
    """
    # Sort by date when it exists
    if "date" in X.columns:
        order = pd.to_datetime(X["date"], errors="coerce").sort_values().index
        X = X.loc[order].reset_index(drop=True)
        y = y.loc[order].reset_index(drop=True)

    tscv = TimeSeriesSplit(n_splits=n_splits)
    rows = []

    for k, (tr, va) in enumerate(tscv.split(X), 1):
        y_tr, y_va = y.iloc[tr], y.iloc[va]
        train_mean = y_tr.mean()

        # Baseline 1: constant prediction equal to the training mean
        mean_pred = np.full(len(y_va), train_mean, dtype=np.float64)

        # Baseline 2: previous observation (naive lag-1)
        if va[0] >= 1:
            lag1 = y.iloc[va - 1].reset_index(drop=True)
        else:
            # No earlier observation exists; fall back to the training mean
            # rather than to the validation target itself (which would leak).
            lag1 = pd.Series(train_mean, index=range(len(y_va)), dtype=np.float64)
        # .bfill() replaces the deprecated fillna(method='bfill')
        lag1_pred = lag1.bfill().to_numpy()

        rows.append({
            "fold": k,
            "y_train_mean": float(train_mean), "y_val_mean": float(y_va.mean()),
            "y_train_std": float(y_tr.std()),   "y_val_std": float(y_va.std()),
            "MSE_mean_baseline": float(mean_squared_error(y_va, mean_pred)),
            "MSE_lag1_baseline": float(mean_squared_error(y_va, lag1_pred)),
            "val_len": int(len(y_va))
        })

    return pd.DataFrame(rows)


In [None]:
# Baseline diagnostic on the raw training data (features / target split first)
features_raw = df_train.drop(columns=["electricity_demand"])
target_raw = df_train["electricity_demand"]

diag = diagnose_timeseries(features_raw, target_raw, n_splits=5)
print(diag)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Sélection de features *stables*
# Temperature readings, one column per weather station
_TEMPERATURE_COLS = [
    "temperature_station1",
    "temperature_station2",
    "temperature_station3",
    "temperature_station4",
    "temperature_station5",
    "temperature_station6",
    "temperature_station7",
    "temperature_station8",
    "temperature_station9",
    "temperature_station10",
]

# Columns kept as model inputs: humidity followed by the station temperatures
SELECTED_COLS = ["humidity", *_TEMPERATURE_COLS]

def _to_ms(val):
    try:
        if isinstance(val, str):
            s = val.strip()
            if "km/h" in s: return float(s.replace("km/h","").strip())/3.6
            if "m/s"  in s: return float(s.replace("m/s","").strip())
        return float(val)
    except:
        return np.nan

def _deterministic_preclean(X: pd.DataFrame) -> pd.DataFrame:
    X = X.copy()
    # wind_speed -> m/s
    if "wind_speed" in X.columns:
        X["wind_speed"] = X["wind_speed"].apply(_to_ms)
    # date -> mois / jour semaine
    if "date" in X.columns:
        d = pd.to_datetime(X["date"], errors="coerce")
        X["month"] = d.dt.month
        X["dayofweek"] = d.dt.dayofweek
        X = X.drop(columns=["date"])
    # humidity bornée
    if "humidity" in X.columns:
        X["humidity"] = pd.to_numeric(X["humidity"], errors="coerce").clip(0, 100)
    return X

def _select_features(X: pd.DataFrame) -> pd.DataFrame:
    """Keep the ``SELECTED_COLS`` columns plus ``month`` / ``dayofweek``.

    Columns absent from ``X`` are skipped. If none of the wanted columns
    is present, every column of ``X`` is kept.

    Args:
        X: Feature frame to filter.

    Returns:
        ``X`` restricted to the retained columns, in the wanted order.
    """
    wanted = [*SELECTED_COLS, "month", "dayofweek"]
    present = [name for name in wanted if name in X.columns]
    if not present:
        present = list(X.columns)
    return X[present]

def evaluate_pipeline_strict(X: pd.DataFrame, y: pd.Series,
                             n_splits=5, alpha=150.0, clip_quantiles=(0.01, 0.99)):
    """Cross-validate an impute -> scale -> Ridge pipeline on time-ordered folds.

    Every fitted step (imputer, scaler, Ridge, clipping bounds) is learned
    on the training part of each fold only.

    Args:
        X: Raw feature frame; sorted by ``date`` first when that column exists.
        y: Target series sharing ``X``'s index.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        alpha: Ridge regularisation strength.
        clip_quantiles: ``(low, high)`` quantiles of the training target used
            to clip the predictions, or ``None`` to disable clipping.

    Returns:
        Mean validation MSE over the folds, as a float. Per-fold scores and
        a summary are printed.
    """
    # Sort by date when available (safety)
    if "date" in X.columns:
        chrono = pd.to_datetime(X["date"], errors="coerce").sort_values().index
        X = X.loc[chrono].reset_index(drop=True)
        y = y.loc[chrono].reset_index(drop=True)

    # No fitted step happens outside the CV loop
    splitter = TimeSeriesSplit(n_splits=n_splits)
    train_scores, val_scores = [], []

    for fold, (train_pos, val_pos) in enumerate(splitter.split(X), start=1):
        print(f"Processing fold {fold}/{n_splits}...")

        y_tr = y.iloc[train_pos].copy()
        y_va = y.iloc[val_pos].copy()

        # Deterministic cleaning + fixed column selection, done per fold
        X_tr = _select_features(_deterministic_preclean(X.iloc[train_pos].copy()))
        X_va = _select_features(_deterministic_preclean(X.iloc[val_pos].copy()))

        # Pipeline: impute -> scale -> ridge
        model = Pipeline(steps=[
            ("imp", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("ridge", Ridge(alpha=alpha, random_state=0)),
        ])
        model.fit(X_tr, y_tr)

        pred_tr = model.predict(X_tr)
        pred_va = model.predict(X_va)

        # Clip predictions to quantiles of the training target
        if clip_quantiles is not None:
            q_low, q_high = clip_quantiles[0], clip_quantiles[1]
            lo = float(y_tr.quantile(q_low))
            hi = float(y_tr.quantile(q_high))
            pred_tr = np.clip(pred_tr, lo, hi)
            pred_va = np.clip(pred_va, lo, hi)

        tr_mse = mean_squared_error(y_tr, pred_tr)
        va_mse = mean_squared_error(y_va, pred_va)
        train_scores.append(tr_mse)
        val_scores.append(va_mse)

        print(f"Fold {fold} — Train MSE: {tr_mse:.4f} | Val MSE: {va_mse:.4f}")

    for heading, scores in (("\nTrain MSE:", train_scores),
                            ("\nValidation MSE:", val_scores)):
        print(heading)
        print(f"Mean: {np.mean(scores):.4f}, Max: {np.max(scores):.4f}, Min: {np.min(scores):.4f}")

    return float(np.mean(val_scores))



In [None]:
# Split the raw training frame into target and features
y = df_train["electricity_demand"].copy()
X = df_train.drop(columns=["electricity_demand"])

# Baseline diagnostic first
diag_report = diagnose_timeseries(X, y, n_splits=5)
print(diag_report)

# Then the strict cross-validated evaluation
mean_val_mse = evaluate_pipeline_strict(
    X, y, n_splits=5, alpha=150.0, clip_quantiles=(0.01, 0.99)
)
print("\nScore final (mean Val MSE):", mean_val_mse)



In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

# mêmes colonnes stables que dans l’éval
# Temperature readings, one column per weather station
_STATION_TEMPERATURE_COLS = [
    "temperature_station1",
    "temperature_station2",
    "temperature_station3",
    "temperature_station4",
    "temperature_station5",
    "temperature_station6",
    "temperature_station7",
    "temperature_station8",
    "temperature_station9",
    "temperature_station10",
]

# Model input columns: humidity followed by the station temperatures
SELECTED_COLS = ["humidity", *_STATION_TEMPERATURE_COLS]

def _to_ms(val):
    try:
        if isinstance(val, str):
            s = val.strip()
            if "km/h" in s: return float(s.replace("km/h","").strip())/3.6
            if "m/s"  in s: return float(s.replace("m/s","").strip())
        return float(val)
    except:
        return np.nan

def deterministic_preclean(X: pd.DataFrame) -> pd.DataFrame:
    """Apply the cleaning steps that need no fitting, on a copy of ``X``.

    Each step runs only when its column is present:
      * ``wind_speed``: every value is passed through ``_to_ms``.
      * ``date``: parsed to datetime (unparseable -> NaT), replaced by
        ``month`` and ``dayofweek`` columns.
      * ``humidity``: coerced to numeric and clipped to [0, 100].

    Args:
        X: Raw feature frame; it is not modified.

    Returns:
        The cleaned copy.
    """
    frame = X.copy()

    if "wind_speed" in frame.columns:
        frame["wind_speed"] = frame["wind_speed"].apply(_to_ms)

    if "date" in frame.columns:
        # Remove the raw date column and derive calendar features from it
        parsed = pd.to_datetime(frame.pop("date"), errors="coerce")
        frame["month"] = parsed.dt.month
        frame["dayofweek"] = parsed.dt.dayofweek

    if "humidity" in frame.columns:
        as_number = pd.to_numeric(frame["humidity"], errors="coerce")
        frame["humidity"] = as_number.clip(lower=0, upper=100)

    return frame

def select_features(X: pd.DataFrame) -> pd.DataFrame:
    """Keep the ``SELECTED_COLS`` columns plus ``month`` / ``dayofweek``.

    Columns absent from ``X`` are skipped. If none of the wanted columns
    is present, ``X`` itself is returned unchanged.

    Args:
        X: Feature frame to filter.

    Returns:
        ``X`` restricted to the retained columns, or ``X`` when none match.
    """
    wanted = [*SELECTED_COLS, "month", "dayofweek"]
    present = [name for name in wanted if name in X.columns]
    if not present:
        return X
    return X[present]

def build_final_pipeline(alpha: float = 150.0) -> Pipeline:
    """Build the unfitted model pipeline: median imputation -> scaling -> Ridge.

    Args:
        alpha: Ridge regularisation strength.

    Returns:
        A ``Pipeline`` with steps named ``imp``, ``scaler`` and ``ridge``.
    """
    steps = [
        ("imp", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=alpha, random_state=0)),
    ]
    return Pipeline(steps=steps)


In [None]:
# 1) (optional) clean the target — as was done earlier to fix the score blow-up
def sanitize_target(df, target="electricity_demand"):
    """Return a copy of ``df`` whose target column is numeric, positive and winsorized.

    Steps, in order: coerce the target to numeric (unparseable values
    become NaN), turn values ``<= 0`` into NaN, clip the remaining values
    to their own 0.5% / 99.5% quantiles, then drop rows whose target is
    NaN and renumber the index from 0.

    Args:
        df: Input DataFrame; it is not modified.
        target: Name of the target column.

    Returns:
        The cleaned DataFrame.
    """
    cleaned = df.copy()
    cleaned[target] = pd.to_numeric(cleaned[target], errors="coerce")

    # Non-positive values are treated as missing
    is_non_positive = cleaned[target].le(0)
    cleaned.loc[is_non_positive, target] = np.nan

    # Cap extremes at the 0.5% / 99.5% quantiles
    q_low, q_high = cleaned[target].quantile([0.005, 0.995])
    cleaned[target] = cleaned[target].clip(lower=q_low, upper=q_high)

    return cleaned.dropna(subset=[target]).reset_index(drop=True)

# --- prepare the training data
df_clean = sanitize_target(df_train, target="electricity_demand")
y_train_full = df_clean["electricity_demand"].copy()

# same deterministic preprocessing as during cross-validation
X_train_full = select_features(
    deterministic_preclean(df_clean.drop(columns=["electricity_demand"]))
)

# fit the final pipeline
pipe_final = build_final_pipeline(alpha=150.0)
pipe_final.fit(X_train_full, y_train_full)

# --- prepare the test data
X_test = df_test.copy()

# IMPORTANT: align test columns on the training columns (just in case)
X_test_proc = select_features(deterministic_preclean(X_test)).reindex(
    columns=X_train_full.columns, fill_value=np.nan
)

# raw prediction
y_pred = pipe_final.predict(X_test_proc)

# soft clipping to the 1% / 99% quantiles of the training target
lo, hi = (float(y_train_full.quantile(q)) for q in (0.01, 0.99))
y_pred = np.clip(y_pred, lo, hi)


### Generating Submission File

In [None]:
# Generating Submission File
# The identifier column depends on what df_test offers: 'date' takes
# precedence over 'id'; with neither, only the predictions are written.
id_column = next((col for col in ("date", "id") if col in df_test.columns), None)

submission_columns = {}
if id_column is not None:
    submission_columns[id_column] = df_test[id_column].values
submission_columns["electricity_demand"] = y_pred

submission = pd.DataFrame(submission_columns)

# Save to disk
submission.to_csv("submission.csv", index=False)
print("Submission saved -> submission.csv")


In [None]:
# Google Colab only: offer the submission file as a browser download.
try:
    from google.colab import files
except ImportError:
    # The module does not exist outside Colab; skip the download instead of
    # failing, so the notebook still runs top to bottom elsewhere.
    files = None

# Local save
submission.to_csv("submission.csv", index=False)

# Download (Colab only)
if files is not None:
    files.download("submission.csv")
