In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_test.csv'


def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f"Téléchargé : {file_name}")


download_file(train_data_url, 'module5_exercise_train.csv')
download_file(test_data_url, 'module5_exercise_test.csv')


df_train = pd.read_csv("module5_exercise_train.csv")
df_test = pd.read_csv("module5_exercise_test.csv")


df_train["date"] = pd.to_datetime(df_train["date"])
df_test["date"] = pd.to_datetime(df_test["date"])

print("Taille train :", df_train.shape)
print("Taille test :", df_test.shape)

df_train.head()


Téléchargé : module5_exercise_train.csv
Téléchargé : module5_exercise_test.csv
Taille train : (1909, 16)
Taille test : (365, 15)


Unnamed: 0,date,weather_condition,humidity,wind_speed,oil_brent_price_indicator,temperature_station1,temperature_station2,temperature_station3,temperature_station4,temperature_station5,temperature_station6,temperature_station7,temperature_station8,temperature_station9,temperature_station10,electricity_demand
0,2015-01-08,Cloudy,69.304377,27.74 km/h,Moderate,0.369411,,0.159977,0.118224,-0.444455,0.313675,0.106192,0.36943,1.27947,-0.384661,273.511353
1,2015-01-09,Sunny,55.955975,21.78 km/h,Moderate,2.009903,,1.617242,,2.160063,2.515627,1.867474,1.953165,1.878233,1.582365,258.827085
2,2015-01-10,Rainy,62.701614,6.83 m/s,Moderate,-2.603544,-2.422001,-3.685819,-2.392354,-1.936704,-2.950332,-3.074828,-2.69865,-2.35486,-2.770883,302.535033
3,2015-01-11,Snowy,60.375001,5.79 m/s,Moderate,-3.789836,-3.974054,-3.217545,-4.397143,-3.375188,-3.650454,-5.004991,-3.233724,-4.227899,,296.270989
4,2015-01-12,Snowy,,25.14 km/h,High,-2.405522,-2.161129,-2.880773,-2.587199,,,,,-2.790422,-2.033098,323.49809



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [2]:

def clean_wind_speed(series):
    return (
        series.astype(str)
        .str.replace("km/h", "", regex=False)
        .str.replace("m/s", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.strip()
        .astype(float)
    )

df_train["wind_speed"] = clean_wind_speed(df_train["wind_speed"])
df_test["wind_speed"] = clean_wind_speed(df_test["wind_speed"])


num_cols = df_train.select_dtypes(include=["float64", "int64"]).columns
for col in num_cols:
    if col != "electricity_demand":
        median_value = df_train[col].median()
        df_train[col] = df_train[col].fillna(median_value)
        df_test[col] = df_test[col].fillna(median_value)

cat_cols = ["weather_condition", "oil_brent_price_indicator"]
for col in cat_cols:
    df_train[col] = df_train[col].fillna("Unknown")
    df_test[col] = df_test[col].fillna("Unknown")


before = df_train.shape[0]
df_train = df_train.drop_duplicates()
after = df_train.shape[0]




In [3]:

categorical_cols = ["weather_condition", "oil_brent_price_indicator"]


df_train = pd.get_dummies(df_train, columns=categorical_cols, drop_first=True)
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=True)


df_test = df_test.reindex(columns=df_train.columns.drop("electricity_demand"), fill_value=0)

print("Taille train après encodage :", df_train.shape)
print("Taille test après encodage :", df_test.shape)


Taille train après encodage : (1819, 22)
Taille test après encodage : (365, 21)


In [4]:

df_train["is_weekend"] = (df_train["date"].dt.dayofweek >= 5).astype(int)
df_test["is_weekend"] = (df_test["date"].dt.dayofweek >= 5).astype(int)


for df in [df_train, df_test]:
    df["month"] = df["date"].dt.month
    df["dayofyear"] = df["date"].dt.dayofyear


    df["dayofyear_sin"] = np.sin(2 * np.pi * df["dayofyear"] / 365)
    df["dayofyear_cos"] = np.cos(2 * np.pi * df["dayofyear"] / 365)




In [5]:

before = df_train.shape[0]
df_train = df_train[df_train["electricity_demand"] > 0]
after = df_train.shape[0]




In [6]:
from sklearn.ensemble import RandomForestRegressor


X = df_train.drop(columns=["electricity_demand", "date"])
y = df_train["electricity_demand"]


pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        max_depth=10
    ))
])


tscv = TimeSeriesSplit(n_splits=3)
mse_scores = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_val_pred = pipeline.predict(X_val)

    mse = mean_squared_error(y_val, y_val_pred)
    mse_scores.append(mse)
    print(f"Fold {fold+1} MSE: {mse:.2f}")

print("\nMSE moyen :", np.mean(mse_scores))


Fold 1 MSE: 525.58
Fold 2 MSE: 439.37
Fold 3 MSE: 458.29

MSE moyen : 474.4138681412331


In [7]:

final_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    max_depth=10
)

X = df_train.drop(columns=["electricity_demand", "date"])
y = df_train["electricity_demand"]

final_model.fit(X, y)


X_test = df_test.drop(columns=["date"])


y_test_pred = final_model.predict(X_test)


In [9]:

submission = pd.DataFrame({
    "date": df_test["date"],
    "electricity_demand": y_test_pred
})


submission.to_csv("submissionformodule5.csv", index=False)


