## Loading dataset

In [None]:
import pandas as pd
import numpy as np

# Read 'merged_hourly_regional.csv' into `df`. The path is relative, so the file
# has to sit in the notebook's working directory.
df = pd.read_csv('merged_hourly_regional.csv')



## Defining region names and weather codes for better understanding

In [None]:
# Lookup table: numeric INSEE region code -> readable region name.
region_map = {
    11: "Île-de-France",
    24: "Centre-Val de Loire",
    27: "Bourgogne-Franche-Comté",
    28: "Normandie",
    32: "Hauts-de-France",
    44: "Grand Est",
    52: "Pays de la Loire",
    53: "Bretagne",
    75: "Nouvelle-Aquitaine",
    76: "Occitanie",
    84: "Auvergne-Rhône-Alpes",
    93: "Provence-Alpes-Côte d’Azur",
}

# Lookup table: numeric weather code -> short text description.
weather_map = {
    0: "Clear sky",
    1: "Mainly clear",
    2: "Partly cloudy",
    3: "Overcast",
    51: "Drizzle: Light intensity",
    53: "Drizzle: Moderate intensity",
    55: "Drizzle: Dense intensity",
    61: "Rain: Slight intensity",
    63: "Rain: Moderate intensity",
    65: "Rain: Heavy intensity",
    71: "Snow fall: Slight intensity",
    73: "Snow fall: Moderate intensity",
    75: "Snow fall: Heavy intensity",
}

# Each entry pairs a code column with the label column derived from it and the
# lookup table used for the translation.
code_label_pairs = (
    ("insee_region", "region_name", region_map),
    ("weather_code", "weather_desc", weather_map),
)

# Add the label column, then store both the label and the code as categoricals.
# Codes that are absent from a lookup table end up as NaN in the label column.
for code_col, label_col, lookup in code_label_pairs:
    df[label_col] = df[code_col].map(lookup).astype("category")
    df[code_col] = df[code_col].astype("category")

# Move every label column so it sits directly after its code column.
cols = list(df.columns)
for code_col, label_col, _ in code_label_pairs:
    anchor = cols.index(code_col)  # position is looked up before the label is removed
    cols.remove(label_col)
    cols.insert(anchor + 1, label_col)

df = df[cols]


## Dropping redundant date columns

In [None]:
# Remove the redundant "date" and "datetime" columns from the frame.
redundant_cols = ["date", "datetime"]
df = df.drop(redundant_cols, axis="columns")


## Checking that electricity load and weather values are in valid ranges

In [None]:
# Sanity checks on the weather fields and on the electricity load.
# Every check only reports what it finds; nothing is modified in this cell.

# Relative humidity is checked against the 0-100 interval.
humidity = df["relative_humidity_2m"]
print("Relative humidity out of range:")
print(df.loc[humidity.lt(0) | humidity.gt(100)].shape)

print("Cloud cover out of range:")
cloud_cols = ["cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high"]
wind_cols = ["wind_direction_10m", "wind_direction_100m"]

# Cloud cover is checked against 0-100, wind direction against 0-360 degrees.
for columns, upper_bound in ((cloud_cols, 100), (wind_cols, 360)):
    for name in columns:
        values = df[name]
        n_bad = int((values.lt(0) | values.gt(upper_bound)).sum())
        print(f"{name}: {n_bad} rows out of range")

# Precipitation-related quantities only need to be non-negative.
precip_cols = ["precipitation", "rain", "snowfall", "snow_depth"]
for name in precip_cols:
    n_negative = int(df[name].lt(0).sum())
    print(f"{name}: {n_negative} rows with negative values")


# Electricity consumption of zero or less is treated as invalid.
invalid_elec = df.loc[df["conso_elec_mw"].le(0)]
print("Number of rows with non-positive electricity consumption:", len(invalid_elec))
print(invalid_elec)

## Dropping invalid electricity rows and saving to a new file

In [None]:
# Keep only the rows with strictly positive electricity consumption.
# The comparison is False for NaN, so rows with a missing load are excluded too.
positive_load = df["conso_elec_mw"].gt(0)
df_clean = df.loc[positive_load].copy()

print("Original shape:", df.shape)
print("New shape after dropping non-positive electricity rows:", df_clean.shape)


# Parse datetime_hour into a real datetime column for later use.
df_clean = df_clean.assign(datetime_hour=pd.to_datetime(df_clean["datetime_hour"]))


# Write the cleaned frame to a new CSV, leaving the index out of the file.
df_clean.to_csv("Hourly_Elec_Cleaned.csv", index=False)


## Creating a pickle file so that categorical features keep their categorical dtype

In [None]:
# Save the cleaned data as a pickle so that column dtypes (categoricals and the
# parsed datetime) are stored with the data.
#
# `df1` was previously used here without being defined in any cell, which raises
# a NameError on a fresh kernel. It is now derived from `df_clean`, the cleaned
# frame built in the previous cell.
categorical_cols = ["region_name", "weather_desc", "insee_region", "weather_code"]

# astype returns a new frame, so df_clean itself is left untouched. The casts are
# repeated here so the pickle has categorical columns whatever dtypes came in.
df1 = df_clean.astype({col: "category" for col in categorical_cols})
df1["datetime_hour"] = pd.to_datetime(df1["datetime_hour"])

df1.to_pickle("Hourly_Elec_Cleaned.pkl")

# Read the file back to confirm the round-trip.
# NOTE: unpickling can execute arbitrary code; only load pickle files that this
# notebook (or another trusted source) produced.
df2 = pd.read_pickle("Hourly_Elec_Cleaned.pkl")

# Fail loudly if any column lost its categorical dtype on the way through.
lost_category = [col for col in categorical_cols if df2[col].dtype.name != "category"]
assert not lost_category, f"Categorical dtype lost after pickle round-trip: {lost_category}"