## Loading the dataset and setting the date format

In [None]:
import pandas as pd
import glob
import holidays


# Read the cleaned hourly data; the timestamp column is loaded as text so it
# can be parsed with an explicit format below.
df = pd.read_csv(
    'Hourly_Elec_Cleaned.csv',
    low_memory=False,
    dtype={'datetime_hour': str},
)

# Values that do not match the expected format become NaT (errors='coerce').
df['datetime_hour'] = pd.to_datetime(
    df['datetime_hour'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)

# Calendar year and hour of day, kept as columns for later use.
df = df.assign(
    year=df['datetime_hour'].dt.year,
    hour=df['datetime_hour'].dt.hour,
)

## Manually Adding School zones

In [None]:
# School-holiday zone (A, B or C) for each INSEE region code, entered by hand.
# Region codes missing from this table get NaN in `school_zone`.
# NOTE(review): the entries for 44, 53, 76, 84 and 93 do not look like the
# zone assignment in force since the 2015 reform — confirm against the
# official school calendar before relying on the vacation flags.
region_to_zone = {
    11: 'C',
    24: 'B',
    27: 'A',
    28: 'B',
    32: 'B',
    44: 'A',
    52: 'B',
    53: 'A',
    75: 'A',
    76: 'B',
    84: 'C',
    93: 'C',
}

insee_codes = df['insee_region']
df['school_zone'] = insee_codes.map(region_to_zone)

## Adding France school vacations

In [None]:
vac_df = pd.read_csv("France_Vacations_2013-2024.csv", low_memory=False)

# Reshape the vacation table from wide (one Start/End column pair per zone)
# to long format: one row per (vacation, zone) with generic Start/End columns.
zone_cols = {"A": ("Zone A Start", "Zone A End"),
             "B": ("Zone B Start", "Zone B End"),
             "C": ("Zone C Start", "Zone C End")}

vac_long_list = []
for zone, (col_start, col_end) in zone_cols.items():
    temp = vac_df[["Vacation", col_start, col_end]].copy()
    temp = temp.rename(columns={col_start: "Start", col_end: "End"})
    temp["Zone"] = zone
    vac_long_list.append(temp)
vac_long = pd.concat(vac_long_list, ignore_index=True)

# Convert to datetime (day-first dates); unparseable values become NaT.
vac_long["Start"] = pd.to_datetime(vac_long["Start"], errors="coerce", dayfirst=True)
vac_long["End"] = pd.to_datetime(vac_long["End"], errors="coerce", dayfirst=True)
vac_long["Vacation"] = vac_long["Vacation"].fillna("No vacation").astype(str)

# Defaults for every hour that falls outside all vacation periods.
df["vacation_name"] = "No vacation"
df["if_vacation"] = False

# Assign vacation names zone by zone.
for _, vac in vac_long.iterrows():
    # `End` parses to midnight at the start of the last day, so comparing the
    # hourly timestamps with `<= End` would flag only the 00:00 hour of that
    # day. Use an exclusive bound one day later so the whole End day counts.
    # Periods with a missing Start or End (NaT) compare False and match nothing.
    end_exclusive = vac['End'].normalize() + pd.Timedelta(days=1)
    mask = (
        (df['school_zone'] == vac['Zone'])
        & (df['datetime_hour'] >= vac['Start'])
        & (df['datetime_hour'] < end_exclusive)
    )
    df.loc[mask, 'vacation_name'] = vac['Vacation']
    df.loc[mask, 'if_vacation'] = True

## Adding French Bank Holidays

In [None]:
## Adding the French bank holidays
# `datetime_hour` may contain NaT; in that case `.dt.year` is a float column
# with NaN, which the holidays calendar cannot be built from. Keep only the
# valid years and pass them as plain ints.
years = sorted(int(y) for y in df['datetime_hour'].dt.year.dropna().unique())
fr_holidays = holidays.France(years=years)

# Look each distinct calendar date up once instead of once per hourly row;
# rows without a valid timestamp are never passed to the holiday calendar.
dates = df['datetime_hour'].dt.date
holiday_by_date = {
    day: fr_holidays.get(day, 'No holiday')
    for day in dates.dropna().unique()
}
df['holiday_name'] = dates.map(lambda day: holiday_by_date.get(day, 'No holiday'))
df['Bank_holiday'] = df['holiday_name'] != 'No holiday'


## Adding the Electricity Spot prices

In [None]:
# Merge the hourly electricity spot prices for France onto the main dataset.
elec_prices = pd.read_csv('France_hourly_spot_prices.csv')
elec_prices = elec_prices[elec_prices['Country'] == 'France'].copy()
elec_prices['datetime_hour'] = pd.to_datetime(elec_prices['Datetime (Local)'], errors='coerce')
elec_prices = elec_prices.rename(columns={'Price (EUR/MWhe)': 'spot_price_elec_mw'})
elec_prices = elec_prices[['datetime_hour', 'spot_price_elec_mw']]

# Keep exactly one price per timestamp. Unparseable timestamps (NaT) and
# repeated timestamps would otherwise match several rows in the left merge
# and silently multiply the consumption rows.
elec_prices = (
    elec_prices
    .dropna(subset=['datetime_hour'])
    .drop_duplicates(subset='datetime_hour', keep='first')
)

df = pd.merge(df, elec_prices, on='datetime_hour', how='left')

# Move spot_price_elec_mw directly after the consumption column. The price
# column is removed first so the insert position is right wherever it sat.
cols = df.columns.tolist()
cols.remove('spot_price_elec_mw')
cols.insert(cols.index('conso_elec_mw') + 1, 'spot_price_elec_mw')
df = df[cols]

## Adding regional features (GDP, income, population)

In [None]:
# Attach the regional indicators by matching on region code and calendar year.
df_features = pd.read_excel("France_Info_Features.xlsx")

# Recompute the year from the timestamp so the merge key is present.
df["year"] = df["datetime_hour"].dt.year

df = pd.merge(
    df,
    df_features,
    how="left",
    left_on=["insee_region", "year"],
    right_on=["Region_code", "Year"],
)

# The right-hand key columns and the region label are not kept.
df = df.drop(columns=["Region", "Region_code", "Year"])

## Adding total active employment data

In [None]:
# Attach the employment figures by region, year and quarter.
df_employed = pd.read_excel("Total_Employed_Employees_Regional.xlsx")

# Quarter key on both sides: derived from the timestamp on the left, and from
# the quarter label with its "Q" removed on the right.
df["quarter"] = df["datetime_hour"].dt.quarter
quarter_labels = df_employed["Quarter_of_a_year"].str.replace("Q", "")
df_employed["quarter"] = quarter_labels.astype(int)

df = pd.merge(
    df,
    df_employed,
    how="left",
    left_on=["insee_region", "year", "quarter"],
    right_on=["region_code", "Date", "quarter"],
)

# Drop the right-hand key and label columns once the merge is done.
df = df.drop(columns=["Region", "region_code", "Date", "Quarter_of_a_year"])

## Adding unemployment rate data

In [None]:
# Attach the quarterly unemployment rates, stored in one workbook per region.
files = glob.glob("region_*_quarterly_values.xlsx")

# Load every workbook and give the rate column a short, uniform name.
unemp_dfs = [
    pd.read_excel(path).rename(
        columns={"Unemployment rates localized by region": "unemployment_rate"}
    )
    for path in files
]
df_unemp = pd.concat(unemp_dfs, ignore_index=True)

# Only the merge keys and the rate itself are carried over.
df = pd.merge(
    df,
    df_unemp[["region_code", "year", "quarter", "unemployment_rate"]],
    how="left",
    left_on=["insee_region", "year", "quarter"],
    right_on=["region_code", "year", "quarter"],
)
df = df.drop(columns=["region_code"])

## Adding electricity generation per region

In [None]:
# Merge the hourly electricity production per region onto the main dataset.
df_region = pd.read_excel("all_regions_hourly.xls", parse_dates=["Timestamp"])

# Rename the production columns so they line up with the merge keys in df.
df_region = df_region.rename(columns={
    "Timestamp": "datetime_hour",
    "RegionCode": "insee_region",
    "Périmètre": "region_name"
})

# Ensure the merge columns have the same type and no stray whitespace,
# otherwise matching keys on the two sides would not compare equal.
df['insee_region'] = df['insee_region'].astype(int)
df_region['insee_region'] = df_region['insee_region'].astype(int)
df['region_name'] = df['region_name'].str.strip()
df_region['region_name'] = df_region['region_name'].str.strip()

# Merge the frame loaded above; the original referenced the undefined name
# `df4`, which raised a NameError.
df = df.merge(df_region, how="left", on=["datetime_hour", "insee_region", "region_name"])

## Fixing the unemployment rate for region 52

In [None]:
# Overwrite the unemployment rate of region 52 with values from its own file.
df_r52 = pd.read_excel("region_52_quarterly_values_final.xlsx")
df_r52 = df_r52.rename(columns={
    'region_code': 'insee_region',
    'unemployment rates localized by region': 'unemployment_rate',
})

# Coerce the key and value columns to numbers; unparseable cells become NaN.
for numeric_col in ('year', 'quarter', 'unemployment_rate'):
    df_r52[numeric_col] = pd.to_numeric(df_r52[numeric_col], errors='coerce')

# Lookup table: (year, quarter) -> unemployment rate.
r52_rates = df_r52.set_index(['year', 'quarter'])['unemployment_rate']
r52_dict = r52_rates.to_dict()


def _r52_rate(row):
    # Use the region-52 value when one exists for this row's year and
    # quarter; otherwise keep the rate the row already has.
    return r52_dict.get((row['year'], row['quarter']), row['unemployment_rate'])


mask_r52 = df['insee_region'] == 52
df.loc[mask_r52, 'unemployment_rate'] = df.loc[mask_r52].apply(_r52_rate, axis=1)

## Saving Final Dataset

In [None]:
# Write the fully merged dataset to CSV, without the index column and with
# timestamps formatted as "YYYY-MM-DD HH:MM:SS".
df.to_csv(
    "Hourly_Elec_Full_Finished_compleet.csv",
    index=False,
    date_format='%Y-%m-%d %H:%M:%S',
)