The aim of this notebook is to obtain the carbon emissions of each region and to merge them with the already preprocessed dataset ``df_process_2022.csv``. We only have the emissions for the whole country from 2019 to 2023, and the emissions per region for 2019 and 2020, so we compute a weight for each region by averaging over these two years and apply these weights to the 2019-2023 national data. The emission data has a daily time step, so we aggregate the already processed data to the same daily step in order to synchronize both datasets. Finally, we also add to this dataset the regular and thermosensitive parts of the energy curve, for both electricity and gas. We save the resulting dataset in ``carbon_data_2022.csv``.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
pd.options.plotting.backend = "plotly"

from energydisaggregation.models.stat_model import (
    Stats,
    preprocess,
    split_year,
)

## Data processing

### Collect source variables (Date, Région, Temperature)

In [2]:
# Load the already preprocessed dataset (comma-separated CSV)
df_src = pd.read_csv("../Data/df_process_2022.csv",sep=",")
# Display the available column names; the ones kept are listed in `vars_selected`
df_src.columns

Index(['Date - Heure', 'Région', 'consommation brute totale (mw) ',
       'température (°c)', 'nebulosité totale', 'vitesse du vent moyen 10 mn',
       'humidité', 'vitesse du vent en km/h', 'température ressentie',
       'saison', 'week_day', 'month', 'hour', 'is_holiday', 'is_bank_holiday',
       'day_of_year', 'years', 'regions', 'temperature_seuil', 'saturation',
       'diff_seuil', 'température ressentie.1', 'température (°c)_mean_48',
       'température (°c)_std_48', 'température (°c)_max_48',
       'température (°c)_min_48', 'nebulosité totale_mean_3',
       'nebulosité totale_std_3', 'month_sin', 'month_cos', 'week_day_sin',
       'week_day_cos', 'hour_sin', 'hour_cos',
       'consommation brute totale (mw) - lag_1',
       'consommation brute totale (mw) - lag_2',
       'consommation brute totale (mw) - lag_3',
       'consommation brute totale (mw) - lag_4',
       'consommation brute totale (mw) - lag_5',
       'consommation brute totale (mw) - lag_6',
       'co

In [3]:
# Single source of truth: source column name -> short name, in the order the
# columns are kept. The first two entries are the keys (date and region); the
# consumption and temperature columns each come with their lags 1 to 7.
renamed_vars = {
    "Date - Heure": "Date",
    "Région": "Region",
    "saison": "saison",
    "week_day": "week_day",
    "is_holiday": "is_holiday",
    "is_bank_holiday": "is_bank_holiday",
    "consommation brute totale (mw) ": "c",
    **{
        f"consommation brute totale (mw) - lag_{lag}": f"c{lag}"
        for lag in range(1, 8)
    },
    "température (°c)": "Temp",
    **{f"température (°c)_lag_{lag}": f"Temp{lag}" for lag in range(1, 8)},
    "ratio gaz électricité": "ratio_cge",
}

# Source columns to extract, and their short names in the same order
vars_selected = list(renamed_vars)
tot_vars = list(renamed_vars.values())

# Split the short names into join/group keys and value columns
key_vars = tot_vars[:2]
values_var = tot_vars[2:]

renamed_vars

{'Date - Heure': 'Date',
 'Région': 'Region',
 'saison': 'saison',
 'week_day': 'week_day',
 'is_holiday': 'is_holiday',
 'is_bank_holiday': 'is_bank_holiday',
 'consommation brute totale (mw) ': 'c',
 'consommation brute totale (mw) - lag_1': 'c1',
 'consommation brute totale (mw) - lag_2': 'c2',
 'consommation brute totale (mw) - lag_3': 'c3',
 'consommation brute totale (mw) - lag_4': 'c4',
 'consommation brute totale (mw) - lag_5': 'c5',
 'consommation brute totale (mw) - lag_6': 'c6',
 'consommation brute totale (mw) - lag_7': 'c7',
 'température (°c)': 'Temp',
 'température (°c)_lag_1': 'Temp1',
 'température (°c)_lag_2': 'Temp2',
 'température (°c)_lag_3': 'Temp3',
 'température (°c)_lag_4': 'Temp4',
 'température (°c)_lag_5': 'Temp5',
 'température (°c)_lag_6': 'Temp6',
 'température (°c)_lag_7': 'Temp7',
 'ratio gaz électricité': 'ratio_cge'}

In [4]:
def truncate(s, start=0, end=10):
    """Return the slice ``s[start:end]``.

    With the default bounds this keeps the first 10 characters of ``s``,
    i.e. the ``YYYY-MM-DD`` prefix of a "Date - Heure" timestamp string.
    """
    return s[start:end]


var = "Date - Heure"

# Select the columns first and copy afterwards: the frame modified below is
# then an independent object, not a slice of a copy (which would trigger a
# SettingWithCopyWarning on the assignment).
df_X = df_src[vars_selected].copy()

# Keep only the date part of each timestamp. `Series.map` preserves the index
# of `df_src`, so the assignment stays aligned row by row.
date_truncated = pd.to_datetime(df_src[var].map(truncate), format="%Y-%m-%d")
df_X[var] = date_truncated

In [5]:
# Rename the selected columns to their short names (see `renamed_vars`)
df_X = df_X.rename(columns=renamed_vars)
df_X

Unnamed: 0,Date,Region,saison,week_day,is_holiday,is_bank_holiday,c,c1,c2,c3,...,c7,Temp,Temp1,Temp2,Temp3,Temp4,Temp5,Temp6,Temp7,ratio_cge
0,2013-01-01,Auvergne-Rhône-Alpes,3,1,True,True,8105.0,,,,...,,9.375000,,,,,,,,1.655636
1,2013-01-01,Bourgogne-Franche-Comté,3,1,True,True,7196.0,,,,...,,2.675000,,,,,,,,1.308630
2,2013-01-01,Bretagne,3,1,True,True,10441.0,,,,...,,9.475000,,,,,,,,1.247310
3,2013-01-01,Centre-Val de Loire,3,1,True,True,3252.0,,,,...,,8.200000,,,,,,,,0.000000
4,2013-01-01,Grand Est,3,1,True,True,4911.0,,,,...,,5.075000,,,,,,,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147963,2023-11-30,Nouvelle-Aquitaine,0,3,False,False,3515.0,-330.0,-203.0,-752.0,...,-887.0,8.566667,-1.466667,-1.608333,-1.308333,-1.008333,-1.533333,-2.333333,-3.133333,0.000000
1147964,2023-11-30,Occitanie,0,3,False,False,8373.0,-269.0,32.0,-673.0,...,-263.0,9.166667,-0.666667,-1.108333,-1.475000,-1.841667,-2.183333,-2.516667,-2.850000,1.379369
1147965,2023-11-30,Pays de la Loire,0,3,False,False,4644.0,17.0,577.0,260.0,...,182.0,4.516667,-0.066667,-0.433333,-0.900000,-1.366667,-2.858333,-4.691667,-6.525000,0.000000
1147966,2023-11-30,Provence-Alpes-Côte d'Azur,0,3,False,False,3989.0,190.0,179.0,-487.0,...,-363.0,2.908333,0.166667,-1.841667,-4.575000,-7.308333,-8.266667,-8.633333,-9.000000,0.000000


In [6]:
# Separate the consumption variables from the other variables.
# Consumption columns are exactly "c" and its lags "c1", "c2", ...; they are
# matched by name pattern rather than by a lexicographic range on the labels,
# which would also capture any unrelated label sorting between "c" and "c9".
conso_vars = [
    name
    for name in tot_vars
    if name.startswith("c") and (name[1:] == "" or name[1:].isdigit())
]

# Build the two datasets: consumption vs. all other variables
df_conso = df_X[key_vars + conso_vars]
df_not_conso = df_X.drop(columns=conso_vars)

# Consumption must be summed over each (Date, Region) group rather than
# averaged, whereas the other variables (e.g. temperature) are averaged
df_conso = df_conso.groupby(key_vars).sum()
df_not_conso = df_not_conso.groupby(key_vars).mean()

# Join the two aggregated datasets on the (Date, Region) keys
df_X = df_not_conso.join(df_conso, on=key_vars)
df_X

Unnamed: 0_level_0,Unnamed: 1_level_0,saison,week_day,is_holiday,is_bank_holiday,Temp,Temp1,Temp2,Temp3,Temp4,Temp5,...,Temp7,ratio_cge,c,c1,c2,c3,c4,c5,c6,c7
Date,Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2013-01-01,Auvergne-Rhône-Alpes,3.0,1.0,1.0,1.0,6.137500,-0.307609,-0.613258,-0.846429,-0.992917,-1.068421,...,-1.305882,0.690045,209632.0,8080.0,7785.0,7945.0,8964.0,10785.0,12438.0,13480.0
2013-01-01,Bourgogne-Franche-Comté,3.0,1.0,1.0,1.0,5.512500,0.015217,0.034848,0.047619,0.058333,0.074123,...,0.179902,0.730528,257686.0,2561.0,3124.0,3617.0,4259.0,5229.0,6097.0,6609.0
2013-01-01,Bretagne,3.0,1.0,1.0,1.0,8.370833,-0.092391,-0.107197,-0.144048,-0.237917,-0.390789,...,-0.545588,0.624062,217198.0,2957.0,2892.0,2781.0,3495.0,4717.0,5800.0,6148.0
2013-01-01,Centre-Val de Loire,3.0,1.0,1.0,1.0,7.590625,-0.211594,-0.428030,-0.636905,-0.828333,-0.997807,...,-1.227941,0.524010,195120.0,2252.0,2334.0,2556.0,3052.0,3875.0,4626.0,4787.0
2013-01-01,Grand Est,3.0,1.0,1.0,1.0,6.761458,0.010870,0.029167,0.036905,0.032083,0.016667,...,-0.023529,0.410463,169152.0,4994.0,5317.0,5601.0,6656.0,8336.0,9974.0,10975.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-30,Nouvelle-Aquitaine,0.0,3.0,0.0,0.0,9.439583,0.188194,0.445833,0.703125,0.936806,1.146875,...,1.607292,0.278390,110842.0,-679.0,-1284.0,-1824.0,-2777.0,-3498.0,-4517.0,-5565.0
2023-11-30,Occitanie,0.0,3.0,0.0,0.0,9.281250,0.075694,0.204167,0.307292,0.359028,0.359375,...,0.525347,0.511076,157781.0,-417.0,-689.0,-1271.0,-1861.0,-2310.0,-3032.0,-3173.0
2023-11-30,Pays de la Loire,0.0,3.0,0.0,0.0,6.558333,-0.073264,-0.147917,-0.211458,-0.259722,-0.292708,...,-0.215625,0.295363,108013.0,-38.0,-63.0,-95.0,-348.0,-453.0,-581.0,-414.0
2023-11-30,Provence-Alpes-Côte d'Azur,0.0,3.0,0.0,0.0,6.721875,-0.037500,-0.175000,-0.282292,-0.315972,-0.276042,...,0.073264,0.317127,117280.0,-547.0,-1214.0,-1808.0,-2412.0,-3077.0,-3679.0,-4176.0


In [7]:
# Turn the (Date, Region) index produced by the groupby back into regular columns
df_X = df_X.reset_index()
df_X

Unnamed: 0,Date,Region,saison,week_day,is_holiday,is_bank_holiday,Temp,Temp1,Temp2,Temp3,...,Temp7,ratio_cge,c,c1,c2,c3,c4,c5,c6,c7
0,2013-01-01,Auvergne-Rhône-Alpes,3.0,1.0,1.0,1.0,6.137500,-0.307609,-0.613258,-0.846429,...,-1.305882,0.690045,209632.0,8080.0,7785.0,7945.0,8964.0,10785.0,12438.0,13480.0
1,2013-01-01,Bourgogne-Franche-Comté,3.0,1.0,1.0,1.0,5.512500,0.015217,0.034848,0.047619,...,0.179902,0.730528,257686.0,2561.0,3124.0,3617.0,4259.0,5229.0,6097.0,6609.0
2,2013-01-01,Bretagne,3.0,1.0,1.0,1.0,8.370833,-0.092391,-0.107197,-0.144048,...,-0.545588,0.624062,217198.0,2957.0,2892.0,2781.0,3495.0,4717.0,5800.0,6148.0
3,2013-01-01,Centre-Val de Loire,3.0,1.0,1.0,1.0,7.590625,-0.211594,-0.428030,-0.636905,...,-1.227941,0.524010,195120.0,2252.0,2334.0,2556.0,3052.0,3875.0,4626.0,4787.0
4,2013-01-01,Grand Est,3.0,1.0,1.0,1.0,6.761458,0.010870,0.029167,0.036905,...,-0.023529,0.410463,169152.0,4994.0,5317.0,5601.0,6656.0,8336.0,9974.0,10975.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47827,2023-11-30,Nouvelle-Aquitaine,0.0,3.0,0.0,0.0,9.439583,0.188194,0.445833,0.703125,...,1.607292,0.278390,110842.0,-679.0,-1284.0,-1824.0,-2777.0,-3498.0,-4517.0,-5565.0
47828,2023-11-30,Occitanie,0.0,3.0,0.0,0.0,9.281250,0.075694,0.204167,0.307292,...,0.525347,0.511076,157781.0,-417.0,-689.0,-1271.0,-1861.0,-2310.0,-3032.0,-3173.0
47829,2023-11-30,Pays de la Loire,0.0,3.0,0.0,0.0,6.558333,-0.073264,-0.147917,-0.211458,...,-0.215625,0.295363,108013.0,-38.0,-63.0,-95.0,-348.0,-453.0,-581.0,-414.0
47830,2023-11-30,Provence-Alpes-Côte d'Azur,0.0,3.0,0.0,0.0,6.721875,-0.037500,-0.175000,-0.282292,...,0.073264,0.317127,117280.0,-547.0,-1214.0,-1808.0,-2412.0,-3077.0,-3679.0,-4176.0


### Emissions weights per region

In [8]:
# Load the per-region emissions table (semicolon-separated file)
df_emissions = pd.read_csv("../Data/emissions_2020_2021_reg.csv", sep=";")
# Inspect column names and dtypes before converting anything
df_emissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   years                 24 non-null     int64 
 1   Région                24 non-null     object
 2   CO2e (kg) par hab     24 non-null     int64 
 3   Nb hab (en milliers)  24 non-null     object
 4   CO2e (kg) total       24 non-null     object
 5   Poids                 24 non-null     object
dtypes: int64(2), object(4)
memory usage: 1.3+ KB


In [9]:
# Columns written with a decimal comma in the file, hence loaded as text
comma_decimal_vars = ["Nb hab (en milliers)", "CO2e (kg) total", "Poids"]

for col in comma_decimal_vars:
    # Already numeric (e.g. the cell was executed before): nothing to convert,
    # and the `.str` accessor would raise on a numeric column.
    if pd.api.types.is_numeric_dtype(df_emissions[col]):
        continue
    # Replace the decimal comma by a dot, then parse as a number
    df_emissions[col] = pd.to_numeric(
        df_emissions[col].str.replace(",", ".", regex=False)
    )

# Weight of each region: mean of its "Poids" values over the rows of the table
carbon_weights = df_emissions.groupby("Région")["Poids"].mean()
carbon_weights

Région
Auvergne-Rhône-Alpes          0.120715
Bourgogne-Franche-Comté       0.041468
Bretagne                      0.055237
Centre-Val de Loire           0.037048
Grand Est                     0.080020
Hauts-de-France               0.099611
Normandie                     0.050932
Nouvelle-Aquitaine            0.093203
Occitanie                     0.089981
Pays de la Loire              0.057626
Provence-Alpes-Côte d'Azur    0.084029
Île-de-France                 0.190130
Name: Poids, dtype: float64

### Total emissions per day (MtCO2 / day)

In [10]:
# Load the daily emissions file (one row per country, date and sector)
df_carbon_eu = pd.read_csv(
    "../Data/carbonmonitor-eu_datas_2024-02-23.csv", sep=","
)
# Inspect column names and dtypes
df_carbon_eu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 345402 entries, 0 to 345401
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   country    345402 non-null  object 
 1   date       345402 non-null  object 
 2   sector     345402 non-null  object 
 3   value      345402 non-null  float64
 4   timestamp  345402 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 13.2+ MB


In [11]:
# List the distinct emission sectors present in the file
df_carbon_eu["sector"].unique()

array(['Power', 'Industry', 'Ground Transport', 'Residential',
       'International Aviation', 'Domestic Aviation'], dtype=object)

In [12]:
# Keep only the electricity ("Power") emissions of France. Both conditions are
# combined in one mask on the original frame, so no column is ever assigned on
# a filtered slice (which would raise a SettingWithCopyWarning).
is_french_power = (df_carbon_eu["sector"] == "Power") & (
    df_carbon_eu["country"] == "France"
)

df_carbon_fr = pd.DataFrame(
    {
        # Dates are stored as day/month/year strings in the file
        "Date": pd.to_datetime(
            df_carbon_eu.loc[is_french_power, "date"], format="%d/%m/%Y"
        ),
        "Emissions": df_carbon_eu.loc[is_french_power, "value"],
    },
    columns=["Date", "Emissions"],
).reset_index(drop=True)
df_carbon_fr

Unnamed: 0,Date,Emissions
0,2019-01-01,0.052318
1,2019-01-02,0.080056
2,2019-01-03,0.094601
3,2019-01-04,0.106710
4,2019-01-05,0.115383
...,...,...
1852,2024-01-27,0.053079
1853,2024-01-28,0.052237
1854,2024-01-29,0.057638
1855,2024-01-30,0.063976


### Synchronize `Date` of `df_X` and `df_carbon_fr`

In [13]:
# Bounds of the period kept in both `df_X` and `df_carbon_fr` (both included)
start_date = "2019-01-01"
end_date = "2022-12-31"

In [14]:
# Keep the rows whose date lies in [start_date, end_date], bounds included,
# and renumber the rows from 0
df_X = df_X[df_X["Date"].between(start_date, end_date)].reset_index(drop=True)
df_X

Unnamed: 0,Date,Region,saison,week_day,is_holiday,is_bank_holiday,Temp,Temp1,Temp2,Temp3,...,Temp7,ratio_cge,c,c1,c2,c3,c4,c5,c6,c7
0,2019-01-01,Auvergne-Rhône-Alpes,3.0,1.0,1.0,1.0,3.832292,-0.165278,-0.308333,-0.425000,...,-0.593056,0.432909,154747.0,-546.0,-846.0,-970.0,-1250.0,-2089.0,-3383.0,-4756.0
1,2019-01-01,Bourgogne-Franche-Comté,3.0,1.0,1.0,1.0,6.077083,-0.054861,-0.112500,-0.169792,...,-0.388542,0.446671,142703.0,-97.0,-92.0,-11.0,139.0,76.0,-182.0,-443.0
2,2019-01-01,Bretagne,3.0,1.0,1.0,1.0,8.797917,0.004861,0.012500,0.019792,...,0.042014,0.502980,155547.0,-228.0,-268.0,-206.0,-204.0,-451.0,-1010.0,-1694.0
3,2019-01-01,Centre-Val de Loire,3.0,1.0,1.0,1.0,7.047917,-0.002778,0.016667,0.029167,...,0.013194,0.350377,120887.0,-86.0,-88.0,34.0,55.0,-144.0,-411.0,-740.0
4,2019-01-01,Grand Est,3.0,1.0,1.0,1.0,5.632292,0.021528,0.062500,0.086458,...,-0.026042,0.432330,136332.0,-37.0,73.0,258.0,366.0,147.0,-354.0,-1039.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17527,2022-12-31,Nouvelle-Aquitaine,3.0,5.0,1.0,0.0,12.836458,-0.090972,-0.179167,-0.272917,...,-0.515972,0.285808,111921.0,697.0,1201.0,595.0,95.0,-972.0,-1953.0,-2934.0
17528,2022-12-31,Occitanie,3.0,5.0,1.0,0.0,12.822917,0.007292,0.102083,0.156250,...,0.098611,0.422212,158178.0,771.0,529.0,-846.0,-1179.0,-2174.0,-2571.0,-3100.0
17529,2022-12-31,Pays de la Loire,3.0,5.0,1.0,0.0,16.056250,-0.054514,-0.052083,-0.029167,...,0.156250,0.397934,132647.0,53.0,-153.0,-538.0,-1079.0,-1779.0,-2455.0,-3108.0
17530,2022-12-31,Provence-Alpes-Côte d'Azur,3.0,5.0,1.0,0.0,11.986458,0.077431,0.256250,0.442708,...,0.855208,0.447660,132575.0,-255.0,-792.0,-1528.0,-2315.0,-3030.0,-3380.0,-3565.0


In [15]:
# Keep the rows whose date lies in [start_date, end_date], bounds included,
# and renumber the rows from 0
df_carbon_fr = df_carbon_fr[
    df_carbon_fr["Date"].between(start_date, end_date)
].reset_index(drop=True)
df_carbon_fr

Unnamed: 0,Date,Emissions
0,2019-01-01,0.052318
1,2019-01-02,0.080056
2,2019-01-03,0.094601
3,2019-01-04,0.106710
4,2019-01-05,0.115383
...,...,...
1456,2022-12-27,0.064782
1457,2022-12-28,0.054745
1458,2022-12-29,0.048935
1459,2022-12-30,0.044451


Now `df_X` and `df_carbon_fr` are the same length, modulo the number of regions :

In [16]:
# Ratio of the two row counts; it should equal the number of regions
len(df_X) / len(df_carbon_fr)

12.0

In [17]:
# Sorted array of the distinct region names found in the emissions table
REGIONS = np.unique(df_emissions["Région"])
print(len(REGIONS))
print(REGIONS)

12
['Auvergne-Rhône-Alpes' 'Bourgogne-Franche-Comté' 'Bretagne'
 'Centre-Val de Loire' 'Grand Est' 'Hauts-de-France' 'Normandie'
 'Nouvelle-Aquitaine' 'Occitanie' 'Pays de la Loire'
 "Provence-Alpes-Côte d'Azur" 'Île-de-France']


### Apply the weights to `df_carbon_fr`

In [18]:
# Look each weight up by region name instead of pairing the two arrays by
# position: every region then receives its own weight whatever the ordering
# of `carbon_weights`.
region_weights = pd.DataFrame(
    {"Region": REGIONS, "weights": carbon_weights.reindex(REGIONS).to_numpy()}
)

# Cross join: one row per (day, region) pair
df_y = pd.merge(df_carbon_fr, region_weights, how="cross")

# Regional emissions = national emissions x regional weight
df_y["Emissions"] = df_y["Emissions"] * df_y["weights"]
df_y = df_y.drop(columns="weights")
df_y

Unnamed: 0,Date,Emissions,Region
0,2019-01-01,0.006316,Auvergne-Rhône-Alpes
1,2019-01-01,0.002170,Bourgogne-Franche-Comté
2,2019-01-01,0.002890,Bretagne
3,2019-01-01,0.001938,Centre-Val de Loire
4,2019-01-01,0.004186,Grand Est
...,...,...,...
17527,2022-12-31,0.004107,Nouvelle-Aquitaine
17528,2022-12-31,0.003965,Occitanie
17529,2022-12-31,0.002540,Pays de la Loire
17530,2022-12-31,0.003703,Provence-Alpes-Côte d'Azur


### Add regular and sensitive consumption

In [19]:
# Load the per-region consumption file with columns `c`, `t` and `r`
# NOTE(review): presumably c = total, t = thermosensitive part, r = regular part — confirm
df_conso = pd.read_csv("../Data/ctr_regions_2022.csv",sep=",")
df_conso

Unnamed: 0,Date - Heure,Région,c,t,r
0,2019-01-01 00:00:00+01:00,Auvergne-Rhône-Alpes,6636.638243,834.022786,5802.615457
1,2019-01-01 01:00:00+01:00,Auvergne-Rhône-Alpes,6636.638243,834.022786,5802.615457
2,2019-01-01 02:00:00+01:00,Auvergne-Rhône-Alpes,6768.029766,965.414309,5802.615457
3,2019-01-01 03:00:00+01:00,Auvergne-Rhône-Alpes,6768.029766,880.805962,5887.223804
4,2019-01-01 04:00:00+01:00,Auvergne-Rhône-Alpes,6768.029766,965.414309,5802.615457
...,...,...,...,...,...
420763,2022-12-31 19:00:00+01:00,Île-de-France,6192.229955,233.662122,5958.567832
420764,2022-12-31 20:00:00+01:00,Île-de-France,6192.229955,233.662122,5958.567832
420765,2022-12-31 21:00:00+01:00,Île-de-France,6192.229955,198.926862,5993.303093
420766,2022-12-31 22:00:00+01:00,Île-de-France,6184.505908,246.983946,5937.521962


In [20]:
# Reduce each "Date - Heure" timestamp string to its date part (first 10
# characters) and parse it as a datetime
var = "Date - Heure"
date_truncated = pd.to_datetime(df_conso[var].map(truncate), format="%Y-%m-%d")
df_conso[var] = date_truncated
df_conso

Unnamed: 0,Date - Heure,Région,c,t,r
0,2019-01-01,Auvergne-Rhône-Alpes,6636.638243,834.022786,5802.615457
1,2019-01-01,Auvergne-Rhône-Alpes,6636.638243,834.022786,5802.615457
2,2019-01-01,Auvergne-Rhône-Alpes,6768.029766,965.414309,5802.615457
3,2019-01-01,Auvergne-Rhône-Alpes,6768.029766,880.805962,5887.223804
4,2019-01-01,Auvergne-Rhône-Alpes,6768.029766,965.414309,5802.615457
...,...,...,...,...,...
420763,2022-12-31,Île-de-France,6192.229955,233.662122,5958.567832
420764,2022-12-31,Île-de-France,6192.229955,233.662122,5958.567832
420765,2022-12-31,Île-de-France,6192.229955,198.926862,5993.303093
420766,2022-12-31,Île-de-France,6184.505908,246.983946,5937.521962


In [21]:
# Explicit old-name -> new-name mapping: unlike a positional zip over
# `df_conso.columns`, it cannot mislabel a column if the column order of the
# CSV changes.
renamed_vars = {
    "Date - Heure": "Date",
    "Région": "Region",
    "c": "c_pred",
    "t": "t_pred",
    "r": "r_pred",
}

# Rename the columns
df_conso = df_conso.rename(columns=renamed_vars)

# Consumption must be summed over each (Date, Region) group, not averaged
df_conso = df_conso.groupby(key_vars).sum()

In [22]:
# Join the two datasets on the (Date, Region) keys
df_X = df_X.set_index(key_vars).join(df_conso, on=key_vars)
df_X

Unnamed: 0_level_0,Unnamed: 1_level_0,saison,week_day,is_holiday,is_bank_holiday,Temp,Temp1,Temp2,Temp3,Temp4,Temp5,...,c1,c2,c3,c4,c5,c6,c7,c_pred,t_pred,r_pred
Date,Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2019-01-01,Auvergne-Rhône-Alpes,3.0,1.0,1.0,1.0,3.832292,-0.165278,-0.308333,-0.425000,-0.513889,-0.575000,...,-546.0,-846.0,-970.0,-1250.0,-2089.0,-3383.0,-4756.0,162107.166762,22704.366296,139402.800466
2019-01-01,Bourgogne-Franche-Comté,3.0,1.0,1.0,1.0,6.077083,-0.054861,-0.112500,-0.169792,-0.225694,-0.280208,...,-97.0,-92.0,-11.0,139.0,76.0,-182.0,-443.0,172379.800496,28859.060207,143520.740289
2019-01-01,Bretagne,3.0,1.0,1.0,1.0,8.797917,0.004861,0.012500,0.019792,0.025694,0.030208,...,-228.0,-268.0,-206.0,-204.0,-451.0,-1010.0,-1694.0,190223.324470,49560.070355,140663.254115
2019-01-01,Centre-Val de Loire,3.0,1.0,1.0,1.0,7.047917,-0.002778,0.016667,0.029167,0.025000,0.004167,...,-86.0,-88.0,34.0,55.0,-144.0,-411.0,-740.0,157710.405968,7547.136531,150163.269437
2019-01-01,Grand Est,3.0,1.0,1.0,1.0,5.632292,0.021528,0.062500,0.086458,0.081250,0.046875,...,-37.0,73.0,258.0,366.0,147.0,-354.0,-1039.0,192130.334456,50910.424973,141219.909483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31,Nouvelle-Aquitaine,3.0,5.0,1.0,0.0,12.836458,-0.090972,-0.179167,-0.272917,-0.375000,-0.485417,...,697.0,1201.0,595.0,95.0,-972.0,-1953.0,-2934.0,147726.789870,13024.476700,134702.313170
2022-12-31,Occitanie,3.0,5.0,1.0,0.0,12.822917,0.007292,0.102083,0.156250,0.127083,0.014583,...,771.0,529.0,-846.0,-1179.0,-2174.0,-2571.0,-3100.0,170558.338970,12657.987785,157900.351184
2022-12-31,Pays de la Loire,3.0,5.0,1.0,0.0,16.056250,-0.054514,-0.052083,-0.029167,0.002083,0.041667,...,53.0,-153.0,-538.0,-1079.0,-1779.0,-2455.0,-3108.0,145642.991192,3116.536489,142526.454702
2022-12-31,Provence-Alpes-Côte d'Azur,3.0,5.0,1.0,0.0,11.986458,0.077431,0.256250,0.442708,0.605556,0.744792,...,-255.0,-792.0,-1528.0,-2315.0,-3030.0,-3380.0,-3565.0,162368.384547,17058.684725,145309.699822


`c_pred` is a prediction, not the ground truth. So we compute the shares of `r_pred` and `t_pred` within `c_pred`, and we apply those shares to the observed consumption `c`, giving new variables `r` and `t`. We add those variables to the list of predictive variables called `values_var`.

In [23]:
# Register the two new predictors, skipping any that is already listed so that
# executing the cell again does not add duplicate entries
for new_var in ("t", "r"):
    if new_var not in values_var:
        values_var.append(new_var)

# Share of the predicted total `c_pred` attributed to each predicted part
t_share = df_X["t_pred"] / df_X["c_pred"]
r_share = df_X["r_pred"] / df_X["c_pred"]

# Apply the shares to the observed consumption `c`, then drop the predictions
df_X = df_X.assign(t=t_share * df_X["c"], r=r_share * df_X["c"]).drop(
    columns=["c_pred", "t_pred", "r_pred"]
)
df_X

Unnamed: 0_level_0,Unnamed: 1_level_0,saison,week_day,is_holiday,is_bank_holiday,Temp,Temp1,Temp2,Temp3,Temp4,Temp5,...,c,c1,c2,c3,c4,c5,c6,c7,t,r
Date,Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2019-01-01,Auvergne-Rhône-Alpes,3.0,1.0,1.0,1.0,3.832292,-0.165278,-0.308333,-0.425000,-0.513889,-0.575000,...,154747.0,-546.0,-846.0,-970.0,-1250.0,-2089.0,-3383.0,-4756.0,21673.517843,133073.482157
2019-01-01,Bourgogne-Franche-Comté,3.0,1.0,1.0,1.0,6.077083,-0.054861,-0.112500,-0.169792,-0.225694,-0.280208,...,142703.0,-97.0,-92.0,-11.0,139.0,76.0,-182.0,-443.0,23890.702141,118812.297859
2019-01-01,Bretagne,3.0,1.0,1.0,1.0,8.797917,0.004861,0.012500,0.019792,0.025694,0.030208,...,155547.0,-228.0,-268.0,-206.0,-204.0,-451.0,-1010.0,-1694.0,40525.631044,115021.368956
2019-01-01,Centre-Val de Loire,3.0,1.0,1.0,1.0,7.047917,-0.002778,0.016667,0.029167,0.025000,0.004167,...,120887.0,-86.0,-88.0,34.0,55.0,-144.0,-411.0,-740.0,5784.974607,115102.025393
2019-01-01,Grand Est,3.0,1.0,1.0,1.0,5.632292,0.021528,0.062500,0.086458,0.081250,0.046875,...,136332.0,-37.0,73.0,258.0,366.0,147.0,-354.0,-1039.0,36125.061027,100206.938973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31,Nouvelle-Aquitaine,3.0,5.0,1.0,0.0,12.836458,-0.090972,-0.179167,-0.272917,-0.375000,-0.485417,...,111921.0,697.0,1201.0,595.0,95.0,-972.0,-1953.0,-2934.0,9867.624268,102053.375732
2022-12-31,Occitanie,3.0,5.0,1.0,0.0,12.822917,0.007292,0.102083,0.156250,0.127083,0.014583,...,158178.0,771.0,529.0,-846.0,-1179.0,-2174.0,-2571.0,-3100.0,11739.180881,146438.819119
2022-12-31,Pays de la Loire,3.0,5.0,1.0,0.0,16.056250,-0.054514,-0.052083,-0.029167,0.002083,0.041667,...,132647.0,53.0,-153.0,-538.0,-1079.0,-1779.0,-2455.0,-3108.0,2838.442223,129808.557777
2022-12-31,Provence-Alpes-Côte d'Azur,3.0,5.0,1.0,0.0,11.986458,0.077431,0.256250,0.442708,0.605556,0.744792,...,132575.0,-255.0,-792.0,-1528.0,-2315.0,-3030.0,-3380.0,-3565.0,13928.543625,118646.456375


### Final merge

In [24]:
# Index the regional emissions by (Date, Region), attach them to the
# variables of `df_X`, then turn the keys back into regular columns
emissions_by_key = df_y.set_index(key_vars)
df_processed = df_X.join(emissions_by_key, on=key_vars).reset_index()
df_processed

Unnamed: 0,Date,Region,saison,week_day,is_holiday,is_bank_holiday,Temp,Temp1,Temp2,Temp3,...,c1,c2,c3,c4,c5,c6,c7,t,r,Emissions
0,2019-01-01,Auvergne-Rhône-Alpes,3.0,1.0,1.0,1.0,3.832292,-0.165278,-0.308333,-0.425000,...,-546.0,-846.0,-970.0,-1250.0,-2089.0,-3383.0,-4756.0,21673.517843,133073.482157,0.006316
1,2019-01-01,Bourgogne-Franche-Comté,3.0,1.0,1.0,1.0,6.077083,-0.054861,-0.112500,-0.169792,...,-97.0,-92.0,-11.0,139.0,76.0,-182.0,-443.0,23890.702141,118812.297859,0.002170
2,2019-01-01,Bretagne,3.0,1.0,1.0,1.0,8.797917,0.004861,0.012500,0.019792,...,-228.0,-268.0,-206.0,-204.0,-451.0,-1010.0,-1694.0,40525.631044,115021.368956,0.002890
3,2019-01-01,Centre-Val de Loire,3.0,1.0,1.0,1.0,7.047917,-0.002778,0.016667,0.029167,...,-86.0,-88.0,34.0,55.0,-144.0,-411.0,-740.0,5784.974607,115102.025393,0.001938
4,2019-01-01,Grand Est,3.0,1.0,1.0,1.0,5.632292,0.021528,0.062500,0.086458,...,-37.0,73.0,258.0,366.0,147.0,-354.0,-1039.0,36125.061027,100206.938973,0.004186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17527,2022-12-31,Nouvelle-Aquitaine,3.0,5.0,1.0,0.0,12.836458,-0.090972,-0.179167,-0.272917,...,697.0,1201.0,595.0,95.0,-972.0,-1953.0,-2934.0,9867.624268,102053.375732,0.004107
17528,2022-12-31,Occitanie,3.0,5.0,1.0,0.0,12.822917,0.007292,0.102083,0.156250,...,771.0,529.0,-846.0,-1179.0,-2174.0,-2571.0,-3100.0,11739.180881,146438.819119,0.003965
17529,2022-12-31,Pays de la Loire,3.0,5.0,1.0,0.0,16.056250,-0.054514,-0.052083,-0.029167,...,53.0,-153.0,-538.0,-1079.0,-1779.0,-2455.0,-3108.0,2838.442223,129808.557777,0.002540
17530,2022-12-31,Provence-Alpes-Côte d'Azur,3.0,5.0,1.0,0.0,11.986458,0.077431,0.256250,0.442708,...,-255.0,-792.0,-1528.0,-2315.0,-3030.0,-3380.0,-3565.0,13928.543625,118646.456375,0.003703


## Save dataset

In [25]:
# Columns stored as floats after the daily averaging, cast back to integers
integer_vars = ['saison', 'week_day', 'is_holiday', 'is_bank_holiday']
df_processed = df_processed.astype({name: int for name in integer_vars})

In [26]:
# Check the final column types before saving
df_processed.dtypes

Date               datetime64[ns]
Region                     object
saison                      int32
week_day                    int32
is_holiday                  int32
is_bank_holiday             int32
Temp                      float64
Temp1                     float64
Temp2                     float64
Temp3                     float64
Temp4                     float64
Temp5                     float64
Temp6                     float64
Temp7                     float64
ratio_cge                 float64
c                         float64
c1                        float64
c2                        float64
c3                        float64
c4                        float64
c5                        float64
c6                        float64
c7                        float64
t                         float64
r                         float64
Emissions                 float64
dtype: object

In [54]:
# Write the final dataset to disk, without the row index
df_processed.to_csv('../Data/carbon_data_2022.csv', index=False)