# Import des librairies et chargement des données

In [51]:
import os

# grammaire abstraite de l'arbre syntaxique de Python
import ast

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.api.types import is_string_dtype, is_numeric_dtype

pd.options.mode.chained_assignment = None  # default='warn'

In [52]:
dossier_data = "data/"
nom_fichier = "batiments_data.csv"
batiments_data = pd.read_csv(dossier_data+nom_fichier)

In [53]:
pd.set_option("display.max_columns", None)
batiments_data.head()

Unnamed: 0,OSEBuildingID,BuildingType,PrimaryPropertyType,Neighborhood,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFABuilding,ENERGYSTARScore,SiteEnergyUse,SteamUse,Electricity,NaturalGas,OtherFuelUse,GHGEmissions,BuildingAge,IsSteamUser,IsElectricityUser,IsNaturalGasUser,IsOtherFuelUser,NumberOfUsedEnergies,MostUsedEnergy,GatheredPrimaryPropertyType,LogSiteEnergyUse,LogGHGEmissions
0,3,NonResidential,Hotel,DOWNTOWN,1969,1.0,41.0,961990.0,18.0,73130656.0,19660404.0,49762435.0,3709900.0,0.0,2061.48,46,1.0,1.0,1.0,0.0,3.0,Electricity,Hotel,18.107758,7.631664
1,5,NonResidential,Hotel,DOWNTOWN,1926,1.0,10.0,61320.0,1.0,28229320.0,23458518.0,2769023.0,2001894.0,0.0,1936.34,89,1.0,1.0,1.0,0.0,3.0,Steam,Hotel,17.155872,7.569071
2,8,NonResidential,Hotel,DOWNTOWN,1980,1.0,18.0,107430.0,67.0,14829099.0,0.0,6066245.0,8763105.0,0.0,507.7,35,0.0,1.0,1.0,0.0,2.0,NaturalGas,Hotel,16.512102,6.231858
3,19,NonResidential,Hotel,DOWNTOWN,1922,1.0,11.0,67390.0,14.0,10711451.0,4403788.0,4089407.0,2218425.0,0.0,486.25,93,1.0,1.0,1.0,0.0,3.0,Steam,Hotel,16.186824,6.188777
4,25,NonResidential,Hotel,DOWNTOWN,1916,1.0,10.0,104352.0,83.0,7845112.0,3205497.0,1790665.0,2849024.0,0.0,411.22,99,1.0,1.0,1.0,0.0,3.0,Steam,Hotel,15.875401,6.021557


# Sélection des variables finales

In [54]:
batiments_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2320 entries, 0 to 2319
Data columns (total 25 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   OSEBuildingID                2320 non-null   int64  
 1   BuildingType                 2320 non-null   object 
 2   PrimaryPropertyType          2320 non-null   object 
 3   Neighborhood                 2320 non-null   object 
 4   YearBuilt                    2320 non-null   int64  
 5   NumberofBuildings            2320 non-null   float64
 6   NumberofFloors               2311 non-null   float64
 7   PropertyGFABuilding          2318 non-null   float64
 8   ENERGYSTARScore              1532 non-null   float64
 9   SiteEnergyUse                2320 non-null   float64
 10  SteamUse                     2320 non-null   float64
 11  Electricity                  2320 non-null   float64
 12  NaturalGas                   2320 non-null   float64
 13  OtherFuelUse      

In [55]:
variables_ecartees = ["OSEBuildingID", "PrimaryPropertyType", "ENERGYSTARScore", "YearBuilt", "SteamUse", "Electricity",
                      "NaturalGas", "OtherFuelUse", "SiteEnergyUse", "GHGEmissions"]
batiments_data_modeles = batiments_data.drop(columns=variables_ecartees)

# Préparation des données

In [56]:
batiments_data_modeles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2320 entries, 0 to 2319
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   BuildingType                 2320 non-null   object 
 1   Neighborhood                 2320 non-null   object 
 2   NumberofBuildings            2320 non-null   float64
 3   NumberofFloors               2311 non-null   float64
 4   PropertyGFABuilding          2318 non-null   float64
 5   BuildingAge                  2320 non-null   int64  
 6   IsSteamUser                  2320 non-null   float64
 7   IsElectricityUser            2320 non-null   float64
 8   IsNaturalGasUser             2320 non-null   float64
 9   IsOtherFuelUser              2320 non-null   float64
 10  NumberOfUsedEnergies         2320 non-null   float64
 11  MostUsedEnergy               2320 non-null   object 
 12  GatheredPrimaryPropertyType  2320 non-null   object 
 13  LogSiteEnergyUse  

## Séparation du dataset en training et test sets

Cette séparation sera faite deux fois : 
- la première pour la consommation d'énergie totale
- la seconde pour les émissions de GES

In [57]:
X = batiments_data_modeles.drop(columns=["LogSiteEnergyUse", "LogGHGEmissions"])
y_conso_energie = batiments_data_modeles["LogSiteEnergyUse"]
y_emissions = batiments_data_modeles["LogGHGEmissions"]

In [58]:
from sklearn.model_selection import train_test_split

# Séparation pour la consommation d'énergie
X_conso_train, X_conso_test, y_conso_train, y_conso_test = train_test_split(X, y_conso_energie)

# Séparation pour les émissions de GES
X_emissions_train, X_emissions_test, y_emissions_train, y_emissions_test = train_test_split(X, y_emissions)

## Encodage des variables catégorielles

In [59]:
# Identification des variables catégorielles (celles dont le type est object)
s = (X.dtypes == 'object')
variables_categorielles = list(s[s].index)

In [60]:
from sklearn.preprocessing import OneHotEncoder

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

############################################################################################
#                                   Consommation d'énergie
############################################################################################

# On encode chaque modalité des variables catégorielles par 1 ou 0 (présence ou absence de la modalité)
OH_X_conso_train = pd.DataFrame(OH_encoder.fit_transform(X_conso_train[variables_categorielles]))
OH_X_conso_test = pd.DataFrame(OH_encoder.transform(X_conso_test[variables_categorielles]))

# Remise en forme des datasets encodés, remise des noms des colonnes et des index
OH_X_conso_train.columns = OH_encoder.get_feature_names(variables_categorielles)
OH_X_conso_test.columns = OH_encoder.get_feature_names(variables_categorielles)

OH_X_conso_train.index = X_conso_train.index
OH_X_conso_test.index = X_conso_test.index

# Suppression des variables catégorielles et remplacement par les variables encodées
num_X_conso_train = X_conso_train.drop(columns=variables_categorielles)
num_X_conso_test = X_conso_test.drop(columns=variables_categorielles)

OH_X_conso_train = pd.concat([num_X_conso_train, OH_X_conso_train], axis=1)
OH_X_conso_test = pd.concat([num_X_conso_test, OH_X_conso_test], axis=1)

############################################################################################
#                                   Emissions de GES
############################################################################################

# On encode chaque modalité des variables catégorielles par 1 ou 0 (présence ou absence de la modalité)
OH_X_emissions_train = pd.DataFrame(OH_encoder.fit_transform(X_emissions_train[variables_categorielles]))
OH_X_emissions_test = pd.DataFrame(OH_encoder.transform(X_emissions_test[variables_categorielles]))

# Remise en forme des datasets encodés, remise des noms des colonnes et des index
OH_X_emissions_train.columns = OH_encoder.get_feature_names(variables_categorielles)
OH_X_emissions_test.columns = OH_encoder.get_feature_names(variables_categorielles)

OH_X_emissions_train.index = X_emissions_train.index
OH_X_emissions_test.index = X_emissions_test.index

# Suppression des variables catégorielles et remplacement par les variables encodées
num_X_emissions_train = X_emissions_train.drop(columns=variables_categorielles)
num_X_emissions_test = X_emissions_test.drop(columns=variables_categorielles)

OH_X_emissions_train = pd.concat([num_X_emissions_train, OH_X_emissions_train], axis=1)
OH_X_emissions_test = pd.concat([num_X_emissions_test, OH_X_emissions_test], axis=1)

## Imputation des données manquantes

In [61]:
OH_X_conso_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1740 entries, 1052 to 1835
Data columns (total 43 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   NumberofBuildings                                  1740 non-null   float64
 1   NumberofFloors                                     1734 non-null   float64
 2   PropertyGFABuilding                                1740 non-null   float64
 3   BuildingAge                                        1740 non-null   int64  
 4   IsSteamUser                                        1740 non-null   float64
 5   IsElectricityUser                                  1740 non-null   float64
 6   IsNaturalGasUser                                   1740 non-null   float64
 7   IsOtherFuelUser                                    1740 non-null   float64
 8   NumberOfUsedEnergies                               1740 non-null   float64
 9   Build

In [62]:
OH_X_conso_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 580 entries, 2137 to 2097
Data columns (total 43 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   NumberofBuildings                                  580 non-null    float64
 1   NumberofFloors                                     577 non-null    float64
 2   PropertyGFABuilding                                578 non-null    float64
 3   BuildingAge                                        580 non-null    int64  
 4   IsSteamUser                                        580 non-null    float64
 5   IsElectricityUser                                  580 non-null    float64
 6   IsNaturalGasUser                                   580 non-null    float64
 7   IsOtherFuelUser                                    580 non-null    float64
 8   NumberOfUsedEnergies                               580 non-null    float64
 9   Buildi

Pour rappel, il reste des données manquantes sur le nombre d'étages et la surface des bâtiments. La première sera imputée par son mode et la seconde par la médiane.

In [63]:
from sklearn.impute import SimpleImputer

imputer_nb_etage = SimpleImputer(strategy="most_frequent")
imputer_surface_batiments = SimpleImputer(strategy="median")

############################################################################################
#                                   Consommation d'énergie
############################################################################################

# Imputation : on donne à l'imputer plusieurs colonnes car ne fonctionne pas avec une seule
imputed_X_conso_train = pd.DataFrame(imputer_nb_etage.fit_transform(OH_X_conso_train[["NumberofBuildings", "NumberofFloors"]]))
imputed_X_conso_train2 = pd.DataFrame(imputer_surface_batiments.fit_transform(OH_X_conso_train[["PropertyGFABuilding", 
                                                                                                "BuildingAge"]]))
imputed_X_conso_test = pd.DataFrame(imputer_nb_etage.transform(OH_X_conso_test[["NumberofBuildings", "NumberofFloors"]]))
imputed_X_conso_test2 = pd.DataFrame(imputer_surface_batiments.transform(OH_X_conso_test[["PropertyGFABuilding", 
                                                                                          "BuildingAge"]]))


# Remise en forme des datasets imputés, remise des noms des colonnes et des index
imputed_X_conso_train = pd.concat([imputed_X_conso_train, imputed_X_conso_train2], axis=1)
imputed_X_conso_test = pd.concat([imputed_X_conso_test, imputed_X_conso_test2], axis=1)

imputed_X_conso_train.columns = ["NumberofBuildings", "NumberofFloors", "PropertyGFABuilding", "BuildingAge"]
imputed_X_conso_test.columns = ["NumberofBuildings", "NumberofFloors", "PropertyGFABuilding", "BuildingAge"]

imputed_X_conso_train.index = OH_X_conso_train.index
imputed_X_conso_test.index = OH_X_conso_test.index

donnees_restantes_train = OH_X_conso_train.drop(columns=["NumberofBuildings", "NumberofFloors", 
                                                         "PropertyGFABuilding", "BuildingAge"])
donnees_restantes_test = OH_X_conso_test.drop(columns=["NumberofBuildings", "NumberofFloors", 
                                                       "PropertyGFABuilding", "BuildingAge"])

imputed_X_conso_train = pd.concat([imputed_X_conso_train, donnees_restantes_train], axis=1)
imputed_X_conso_test = pd.concat([imputed_X_conso_test, donnees_restantes_test], axis=1)

############################################################################################
#                                      Emissions de GES
############################################################################################

# Imputation : on donne à l'imputer plusieurs colonnes car ne fonctionne pas avec une seule
imputed_X_emissions_train = pd.DataFrame(imputer_nb_etage.fit_transform(OH_X_emissions_train[["NumberofBuildings", 
                                                                                              "NumberofFloors"]]))
imputed_X_emissions_train2 = pd.DataFrame(imputer_surface_batiments.fit_transform(OH_X_emissions_train[["PropertyGFABuilding",
                                                                                                        "BuildingAge"]]))
imputed_X_emissions_test = pd.DataFrame(imputer_nb_etage.transform(OH_X_emissions_test[["NumberofBuildings", 
                                                                                        "NumberofFloors"]]))
imputed_X_emissions_test2 = pd.DataFrame(imputer_surface_batiments.transform(OH_X_emissions_test[["PropertyGFABuilding", 
                                                                                                  "BuildingAge"]]))


# Remise en forme des datasets imputés, remise des noms des colonnes et des index
imputed_X_emissions_train = pd.concat([imputed_X_emissions_train, imputed_X_emissions_train2], axis=1)
imputed_X_emissions_test = pd.concat([imputed_X_emissions_test, imputed_X_emissions_test2], axis=1)

imputed_X_emissions_train.columns = ["NumberofBuildings", "NumberofFloors", "PropertyGFABuilding", "BuildingAge"]
imputed_X_emissions_test.columns = ["NumberofBuildings", "NumberofFloors", "PropertyGFABuilding", "BuildingAge"]

imputed_X_emissions_train.index = OH_X_emissions_train.index
imputed_X_emissions_test.index = OH_X_emissions_test.index

donnees_restantes_train = OH_X_emissions_train.drop(columns=["NumberofBuildings", "NumberofFloors", 
                                                             "PropertyGFABuilding", "BuildingAge"])
donnees_restantes_test = OH_X_emissions_test.drop(columns=["NumberofBuildings", "NumberofFloors", 
                                                           "PropertyGFABuilding", "BuildingAge"])

imputed_X_emissions_train = pd.concat([imputed_X_emissions_train, donnees_restantes_train], axis=1)
imputed_X_emissions_test = pd.concat([imputed_X_emissions_test, donnees_restantes_test], axis=1)

In [68]:
imputed_X_emissions_test.isna().sum()

NumberofBuildings                                    0
NumberofFloors                                       0
PropertyGFABuilding                                  0
BuildingAge                                          0
IsSteamUser                                          0
IsElectricityUser                                    0
IsNaturalGasUser                                     0
IsOtherFuelUser                                      0
NumberOfUsedEnergies                                 0
BuildingType_Campus                                  0
BuildingType_NonResidential                          0
BuildingType_Nonresidential COS                      0
BuildingType_Nonresidential WA                       0
BuildingType_SPS-District K-12                       0
Neighborhood_BALLARD                                 0
Neighborhood_CENTRAL                                 0
Neighborhood_DELRIDGE                                0
Neighborhood_DOWNTOWN                                0
Neighborho

## Standardisation des variables

In [69]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)

############################################################################################
#                                   Consommation d'énergie
############################################################################################

# Standardisation des données
std_X_conso_train = pd.DataFrame(scaler.fit_transform(imputed_X_conso_train))
std_X_conso_test = pd.DataFrame(scaler.transform(imputed_X_conso_test))

# Remise des colonnes et des index du dataset initial
std_X_conso_train.columns = imputed_X_conso_train.columns
std_X_conso_test.columns = imputed_X_conso_test.columns

std_X_conso_train.index = imputed_X_conso_train.index
std_X_conso_test.index = imputed_X_conso_test.index

############################################################################################
#                                   Emissions de GES
############################################################################################

# Standardisation des données
std_X_emissions_train = pd.DataFrame(scaler.fit_transform(imputed_X_emissions_train))
std_X_emissions_test = pd.DataFrame(scaler.transform(imputed_X_emissions_test))

# Remise des colonnes et des index du dataset initial
std_X_emissions_train.columns = imputed_X_emissions_train.columns
std_X_emissions_test.columns = imputed_X_emissions_test.columns

std_X_emissions_train.index = imputed_X_emissions_train.index
std_X_emissions_test.index = imputed_X_emissions_test.index

Toutes nos variables ont un écart-type quasiment égal à 1 dans les jeux d'entraînement.

# Modélisation

## Sur la consommation d'énergie

In [95]:
def inverser(log_y_plus_1):
    return np.expm1(log_y_plus_1)

In [105]:
inverser(pred)

array([2923975.16198454, 2923975.16198454, 2923975.16198454, ...,
       2926902.3548816 , 2926902.3548816 , 2926902.3548816 ])

In [114]:
from sklearn import dummy
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import time

metriques = ['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']

start_time = time.time()
dum = dummy.DummyRegressor(strategy='mean')
#reg = cross_validate(dum, std_X_conso_train, y_conso_train, cv=5, scoring=metriques)
y_pred = cross_val_predict(dum, std_X_conso_train, y_conso_train, cv=5)
end_time = time.time()

duree_traitement = end_time-start_time

MAE = round(mean_absolute_error(inverser(y_conso_train), inverser(y_pred)), 3)
R2 = round(r2_score(inverser(y_conso_train), inverser(y_pred)), 3)
RMSE = round(mean_squared_error(inverser(y_conso_train), inverser(y_pred), squared=False), 3)

print("MAE : {}, R² : {:.3}, RMSE : {}, Temps de traitement : {:.3}s".format(MAE, R2, RMSE, duree_traitement))

MAE : 7387720.273, R² : -0.035, RMSE : 30881682.25, Temps de traitement : 0.016s


In [80]:
from sklearn import dummy
from sklearn.model_selection import GridSearchCV

dum = dummy.DummyRegressor(strategy='mean')
param_grid = {}
metriques = ['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']

reg = GridSearchCV(dum, param_grid, cv=5, scoring=metriques, refit=False)
reg.fit(std_X_conso_train, y_conso_train)

# Afficher le(s) hyperparamètre(s) optimaux
#print("Meilleur(s) hyperparamètre(s) sur le jeu d'entraînement:")
#print(reg.best_params_)

# Afficher les performances correspondantes
print("Résultats de la validation croisée :")
for mean, std, params in zip(
        reg.cv_results_['mean_test_score'], # score moyen
        reg.cv_results_['std_test_score'],  # écart-type du score
        reg.cv_results_['params']           # valeur de l'hyperparamètre
    ):

    print("{} = {:.3f} (+/-{:.03f}) for {}".format(
        score,
        mean,
        std*2,
        params
    ) )

Résultats de la validation croisée :


KeyError: 'mean_test_score'

In [81]:
reg.cv_results_

{'mean_fit_time': array([0.00239873]),
 'std_fit_time': array([0.00048949]),
 'mean_score_time': array([0.00239649]),
 'std_score_time': array([0.00048967]),
 'params': [{}],
 'split0_test_r2': array([-0.00203046]),
 'split1_test_r2': array([-0.00318065]),
 'split2_test_r2': array([-0.01037114]),
 'split3_test_r2': array([-0.00219042]),
 'split4_test_r2': array([-0.00257473]),
 'mean_test_r2': array([-0.00406948]),
 'std_test_r2': array([0.00317565]),
 'rank_test_r2': array([1]),
 'split0_test_neg_root_mean_squared_error': array([-1.36888854]),
 'split1_test_neg_root_mean_squared_error': array([-1.28396707]),
 'split2_test_neg_root_mean_squared_error': array([-1.33216756]),
 'split3_test_neg_root_mean_squared_error': array([-1.40260578]),
 'split4_test_neg_root_mean_squared_error': array([-1.31467844]),
 'mean_test_neg_root_mean_squared_error': array([-1.34046148]),
 'std_test_neg_root_mean_squared_error': array([0.04146127]),
 'rank_test_neg_root_mean_squared_error': array([1]),
 'spl

In [79]:
dir(reg)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_dispatch',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'refi

In [None]:
from sklearn import dummy
dum = dummy.DummyRegressor(strategy='mean')

# Entraînement
dum.fit(X_train_std, y_train)

# Prédiction sur le jeu de test
y_pred_dum = dum.predict(X_test_std)

# Evaluate
print("RMSE : {:.2f}".format(np.sqrt(metrics.mean_squared_error(y_test, y_pred_dum)) ))
