# Import des librairies et chargement des données

In [1]:
import os

# grammaire abstraite de l'arbre syntaxique de Python
import ast

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.api.types import is_string_dtype, is_numeric_dtype

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
dossier_data = "data/"
nom_fichier = "batiments_data.csv"
batiments_data = pd.read_csv(dossier_data+nom_fichier)

In [3]:
pd.set_option("display.max_columns", None)
batiments_data.head()

Unnamed: 0,OSEBuildingID,BuildingType,PrimaryPropertyType,Neighborhood,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFABuilding,ENERGYSTARScore,SiteEnergyUse,SteamUse,Electricity,NaturalGas,OtherFuelUse,GHGEmissions,BuildingAge,IsSteamUser,IsElectricityUser,IsNaturalGasUser,IsOtherFuelUser,NumberOfUsedEnergies,MostUsedEnergy,GatheredPrimaryPropertyType,LogSiteEnergyUse,LogGHGEmissions
0,3,NonResidential,Hotel,DOWNTOWN,1969,1.0,41.0,961990.0,18.0,73130656.0,19660404.0,49762435.0,3709900.0,0.0,2061.48,46,1.0,1.0,1.0,0.0,3.0,Electricity,Hotel,18.107758,7.631664
1,5,NonResidential,Hotel,DOWNTOWN,1926,1.0,10.0,61320.0,1.0,28229320.0,23458518.0,2769023.0,2001894.0,0.0,1936.34,89,1.0,1.0,1.0,0.0,3.0,Steam,Hotel,17.155872,7.569071
2,8,NonResidential,Hotel,DOWNTOWN,1980,1.0,18.0,107430.0,67.0,14829099.0,0.0,6066245.0,8763105.0,0.0,507.7,35,0.0,1.0,1.0,0.0,2.0,NaturalGas,Hotel,16.512102,6.231858
3,19,NonResidential,Hotel,DOWNTOWN,1922,1.0,11.0,67390.0,14.0,10711451.0,4403788.0,4089407.0,2218425.0,0.0,486.25,93,1.0,1.0,1.0,0.0,3.0,Steam,Hotel,16.186824,6.188777
4,25,NonResidential,Hotel,DOWNTOWN,1916,1.0,10.0,104352.0,83.0,7845112.0,3205497.0,1790665.0,2849024.0,0.0,411.22,99,1.0,1.0,1.0,0.0,3.0,Steam,Hotel,15.875401,6.021557


# Sélection des variables finales

In [4]:
batiments_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2320 entries, 0 to 2319
Data columns (total 25 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   OSEBuildingID                2320 non-null   int64  
 1   BuildingType                 2320 non-null   object 
 2   PrimaryPropertyType          2320 non-null   object 
 3   Neighborhood                 2320 non-null   object 
 4   YearBuilt                    2320 non-null   int64  
 5   NumberofBuildings            2320 non-null   float64
 6   NumberofFloors               2311 non-null   float64
 7   PropertyGFABuilding          2318 non-null   float64
 8   ENERGYSTARScore              1532 non-null   float64
 9   SiteEnergyUse                2320 non-null   float64
 10  SteamUse                     2320 non-null   float64
 11  Electricity                  2320 non-null   float64
 12  NaturalGas                   2320 non-null   float64
 13  OtherFuelUse      

In [5]:
variables_ecartees = ["OSEBuildingID", "PrimaryPropertyType", "ENERGYSTARScore", "YearBuilt", "SteamUse", "Electricity",
                      "NaturalGas", "OtherFuelUse", "SiteEnergyUse", "GHGEmissions"]
batiments_data_modeles = batiments_data.drop(columns=variables_ecartees)

# Préparation des données

In [6]:
batiments_data_modeles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2320 entries, 0 to 2319
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   BuildingType                 2320 non-null   object 
 1   Neighborhood                 2320 non-null   object 
 2   NumberofBuildings            2320 non-null   float64
 3   NumberofFloors               2311 non-null   float64
 4   PropertyGFABuilding          2318 non-null   float64
 5   BuildingAge                  2320 non-null   int64  
 6   IsSteamUser                  2320 non-null   float64
 7   IsElectricityUser            2320 non-null   float64
 8   IsNaturalGasUser             2320 non-null   float64
 9   IsOtherFuelUser              2320 non-null   float64
 10  NumberOfUsedEnergies         2320 non-null   float64
 11  MostUsedEnergy               2320 non-null   object 
 12  GatheredPrimaryPropertyType  2320 non-null   object 
 13  LogSiteEnergyUse  

## Séparation du dataset en training et test sets

Cette séparation sera faite deux fois : 
- la première pour la consommation d'énergie totale
- la seconde pour les émissions de GES

In [7]:
X = batiments_data_modeles.drop(columns=["LogSiteEnergyUse", "LogGHGEmissions"])
y_conso_energie = batiments_data_modeles["LogSiteEnergyUse"]
y_emissions = batiments_data_modeles["LogGHGEmissions"]

In [8]:
from sklearn.model_selection import train_test_split

# Séparation pour la consommation d'énergie
X_conso_train, X_conso_test, y_conso_train, y_conso_test = train_test_split(X, y_conso_energie)

# Séparation pour les émissions de GES
X_emissions_train, X_emissions_test, y_emissions_train, y_emissions_test = train_test_split(X, y_emissions)

## Encodage des variables catégorielles

In [9]:
# Identification des variables catégorielles (celles dont le type est object)
s = (X.dtypes == 'object')
variables_categorielles = list(s[s].index)

In [10]:
from sklearn.preprocessing import OneHotEncoder

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

############################################################################################
#                                   Consommation d'énergie
############################################################################################

# On encode chaque modalité des variables catégorielles par 1 ou 0 (présence ou absence de la modalité)
OH_X_conso_train = pd.DataFrame(OH_encoder.fit_transform(X_conso_train[variables_categorielles]))
OH_X_conso_test = pd.DataFrame(OH_encoder.transform(X_conso_test[variables_categorielles]))

# Remise en forme des datasets encodés, remise des noms des colonnes et des index
OH_X_conso_train.columns = OH_encoder.get_feature_names(variables_categorielles)
OH_X_conso_test.columns = OH_encoder.get_feature_names(variables_categorielles)

OH_X_conso_train.index = X_conso_train.index
OH_X_conso_test.index = X_conso_test.index

# Suppression des variables catégorielles et remplacement par les variables encodées
num_X_conso_train = X_conso_train.drop(columns=variables_categorielles)
num_X_conso_test = X_conso_test.drop(columns=variables_categorielles)

OH_X_conso_train = pd.concat([num_X_conso_train, OH_X_conso_train], axis=1)
OH_X_conso_test = pd.concat([num_X_conso_test, OH_X_conso_test], axis=1)

############################################################################################
#                                   Emissions de GES
############################################################################################

# On encode chaque modalité des variables catégorielles par 1 ou 0 (présence ou absence de la modalité)
OH_X_emissions_train = pd.DataFrame(OH_encoder.fit_transform(X_emissions_train[variables_categorielles]))
OH_X_emissions_test = pd.DataFrame(OH_encoder.transform(X_emissions_test[variables_categorielles]))

# Remise en forme des datasets encodés, remise des noms des colonnes et des index
OH_X_emissions_train.columns = OH_encoder.get_feature_names(variables_categorielles)
OH_X_emissions_test.columns = OH_encoder.get_feature_names(variables_categorielles)

OH_X_emissions_train.index = X_emissions_train.index
OH_X_emissions_test.index = X_emissions_test.index

# Suppression des variables catégorielles et remplacement par les variables encodées
num_X_emissions_train = X_emissions_train.drop(columns=variables_categorielles)
num_X_emissions_test = X_emissions_test.drop(columns=variables_categorielles)

OH_X_emissions_train = pd.concat([num_X_emissions_train, OH_X_emissions_train], axis=1)
OH_X_emissions_test = pd.concat([num_X_emissions_test, OH_X_emissions_test], axis=1)

## Imputation des données manquantes

In [11]:
OH_X_conso_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1740 entries, 448 to 1248
Data columns (total 43 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   NumberofBuildings                                  1740 non-null   float64
 1   NumberofFloors                                     1733 non-null   float64
 2   PropertyGFABuilding                                1738 non-null   float64
 3   BuildingAge                                        1740 non-null   int64  
 4   IsSteamUser                                        1740 non-null   float64
 5   IsElectricityUser                                  1740 non-null   float64
 6   IsNaturalGasUser                                   1740 non-null   float64
 7   IsOtherFuelUser                                    1740 non-null   float64
 8   NumberOfUsedEnergies                               1740 non-null   float64
 9   Buildi

In [12]:
OH_X_conso_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 580 entries, 85 to 2289
Data columns (total 43 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   NumberofBuildings                                  580 non-null    float64
 1   NumberofFloors                                     578 non-null    float64
 2   PropertyGFABuilding                                580 non-null    float64
 3   BuildingAge                                        580 non-null    int64  
 4   IsSteamUser                                        580 non-null    float64
 5   IsElectricityUser                                  580 non-null    float64
 6   IsNaturalGasUser                                   580 non-null    float64
 7   IsOtherFuelUser                                    580 non-null    float64
 8   NumberOfUsedEnergies                               580 non-null    float64
 9   Building

Pour rappel, il reste des données manquantes sur le nombre d'étages et la surface des bâtiments. La première sera imputée par son mode et la seconde par la médiane.

In [13]:
from sklearn.impute import SimpleImputer

imputer_nb_etage = SimpleImputer(strategy="most_frequent")
imputer_surface_batiments = SimpleImputer(strategy="median")

############################################################################################
#                                   Consommation d'énergie
############################################################################################

# Imputation : on donne à l'imputer plusieurs colonnes car ne fonctionne pas avec une seule
imputed_X_conso_train = pd.DataFrame(imputer_nb_etage.fit_transform(OH_X_conso_train[["NumberofBuildings", "NumberofFloors"]]))
imputed_X_conso_train2 = pd.DataFrame(imputer_surface_batiments.fit_transform(OH_X_conso_train[["PropertyGFABuilding", 
                                                                                                "BuildingAge"]]))
imputed_X_conso_test = pd.DataFrame(imputer_nb_etage.transform(OH_X_conso_test[["NumberofBuildings", "NumberofFloors"]]))
imputed_X_conso_test2 = pd.DataFrame(imputer_surface_batiments.transform(OH_X_conso_test[["PropertyGFABuilding", 
                                                                                          "BuildingAge"]]))


# Remise en forme des datasets imputés, remise des noms des colonnes et des index
imputed_X_conso_train = pd.concat([imputed_X_conso_train, imputed_X_conso_train2], axis=1)
imputed_X_conso_test = pd.concat([imputed_X_conso_test, imputed_X_conso_test2], axis=1)

imputed_X_conso_train.columns = ["NumberofBuildings", "NumberofFloors", "PropertyGFABuilding", "BuildingAge"]
imputed_X_conso_test.columns = ["NumberofBuildings", "NumberofFloors", "PropertyGFABuilding", "BuildingAge"]

imputed_X_conso_train.index = OH_X_conso_train.index
imputed_X_conso_test.index = OH_X_conso_test.index

donnees_restantes_train = OH_X_conso_train.drop(columns=["NumberofBuildings", "NumberofFloors", 
                                                         "PropertyGFABuilding", "BuildingAge"])
donnees_restantes_test = OH_X_conso_test.drop(columns=["NumberofBuildings", "NumberofFloors", 
                                                       "PropertyGFABuilding", "BuildingAge"])

imputed_X_conso_train = pd.concat([imputed_X_conso_train, donnees_restantes_train], axis=1)
imputed_X_conso_test = pd.concat([imputed_X_conso_test, donnees_restantes_test], axis=1)

############################################################################################
#                                      Emissions de GES
############################################################################################

# Imputation : on donne à l'imputer plusieurs colonnes car ne fonctionne pas avec une seule
imputed_X_emissions_train = pd.DataFrame(imputer_nb_etage.fit_transform(OH_X_emissions_train[["NumberofBuildings", 
                                                                                              "NumberofFloors"]]))
imputed_X_emissions_train2 = pd.DataFrame(imputer_surface_batiments.fit_transform(OH_X_emissions_train[["PropertyGFABuilding",
                                                                                                        "BuildingAge"]]))
imputed_X_emissions_test = pd.DataFrame(imputer_nb_etage.transform(OH_X_emissions_test[["NumberofBuildings", 
                                                                                        "NumberofFloors"]]))
imputed_X_emissions_test2 = pd.DataFrame(imputer_surface_batiments.transform(OH_X_emissions_test[["PropertyGFABuilding", 
                                                                                                  "BuildingAge"]]))


# Remise en forme des datasets imputés, remise des noms des colonnes et des index
imputed_X_emissions_train = pd.concat([imputed_X_emissions_train, imputed_X_emissions_train2], axis=1)
imputed_X_emissions_test = pd.concat([imputed_X_emissions_test, imputed_X_emissions_test2], axis=1)

imputed_X_emissions_train.columns = ["NumberofBuildings", "NumberofFloors", "PropertyGFABuilding", "BuildingAge"]
imputed_X_emissions_test.columns = ["NumberofBuildings", "NumberofFloors", "PropertyGFABuilding", "BuildingAge"]

imputed_X_emissions_train.index = OH_X_emissions_train.index
imputed_X_emissions_test.index = OH_X_emissions_test.index

donnees_restantes_train = OH_X_emissions_train.drop(columns=["NumberofBuildings", "NumberofFloors", 
                                                             "PropertyGFABuilding", "BuildingAge"])
donnees_restantes_test = OH_X_emissions_test.drop(columns=["NumberofBuildings", "NumberofFloors", 
                                                           "PropertyGFABuilding", "BuildingAge"])

imputed_X_emissions_train = pd.concat([imputed_X_emissions_train, donnees_restantes_train], axis=1)
imputed_X_emissions_test = pd.concat([imputed_X_emissions_test, donnees_restantes_test], axis=1)

In [14]:
imputed_X_emissions_test.isna().sum()

NumberofBuildings                                    0
NumberofFloors                                       0
PropertyGFABuilding                                  0
BuildingAge                                          0
IsSteamUser                                          0
IsElectricityUser                                    0
IsNaturalGasUser                                     0
IsOtherFuelUser                                      0
NumberOfUsedEnergies                                 0
BuildingType_Campus                                  0
BuildingType_NonResidential                          0
BuildingType_Nonresidential COS                      0
BuildingType_SPS-District K-12                       0
Neighborhood_BALLARD                                 0
Neighborhood_CENTRAL                                 0
Neighborhood_DELRIDGE                                0
Neighborhood_DOWNTOWN                                0
Neighborhood_EAST                                    0
Neighborho

## Standardisation des variables

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)

############################################################################################
#                                   Consommation d'énergie
############################################################################################

# Standardisation des données
std_X_conso_train = pd.DataFrame(scaler.fit_transform(imputed_X_conso_train))
std_X_conso_test = pd.DataFrame(scaler.transform(imputed_X_conso_test))

# Remise des colonnes et des index du dataset initial
std_X_conso_train.columns = imputed_X_conso_train.columns
std_X_conso_test.columns = imputed_X_conso_test.columns

std_X_conso_train.index = imputed_X_conso_train.index
std_X_conso_test.index = imputed_X_conso_test.index

############################################################################################
#                                   Emissions de GES
############################################################################################

# Standardisation des données
std_X_emissions_train = pd.DataFrame(scaler.fit_transform(imputed_X_emissions_train))
std_X_emissions_test = pd.DataFrame(scaler.transform(imputed_X_emissions_test))

# Remise des colonnes et des index du dataset initial
std_X_emissions_train.columns = imputed_X_emissions_train.columns
std_X_emissions_test.columns = imputed_X_emissions_test.columns

std_X_emissions_train.index = imputed_X_emissions_train.index
std_X_emissions_test.index = imputed_X_emissions_test.index

Toutes nos variables ont un écart-type quasiment égal à 1 dans les jeux d'entraînement.

# Modélisation

## Sur la consommation d'énergie

In [16]:
def inverser(log_y_plus_1):
    return np.expm1(log_y_plus_1)

In [17]:
from sklearn import dummy
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import time

metriques = ['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']

start_time = time.time()
dum = dummy.DummyRegressor(strategy='mean')
y_pred = cross_val_predict(dum, std_X_conso_train, y_conso_train, cv=5)
end_time = time.time()

duree_traitement = end_time-start_time

MAE = round(mean_absolute_error(inverser(y_conso_train), inverser(y_pred)), 3)
R2 = round(r2_score(inverser(y_conso_train), inverser(y_pred)), 3)
RMSE = round(mean_squared_error(inverser(y_conso_train), inverser(y_pred), squared=False), 3)

print("MAE : {}, R² : {:.3}, RMSE : {}, Temps de traitement : {:.3}s".format(MAE, R2, RMSE, duree_traitement))

MAE : 7325197.299, R² : -0.052, RMSE : 25427106.229, Temps de traitement : 0.0156s


In [69]:
from sklearn import dummy
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn import linear_model

resultats = pd.DataFrame(columns=['R2', 'MAE', 'RMSE', 'Temps'], 
                         index=['baseline', 'regression_lineaire', 'ridge', 'lasso', 'elastic_net', 
                                'xg_boost', 'random_forest'])

metriques = {'r2': calculer_R2, 'MAE': calculer_MAE, 'RMSE': calculer_RMSE}

def calculer_R2(reg, X_test, y_test):
    y_pred = reg.predict(X_test)
    return round(r2_score(inverser(y_test), inverser(y_pred)), 3)

def calculer_MAE(reg, X_test, y_test):
    y_pred = reg.predict(X_test)
    return round(mean_absolute_error(inverser(y_test), inverser(y_pred)), 3)

def calculer_RMSE(reg, X_test, y_test):
    y_pred = reg.predict(X_test)
    return round(mean_squared_error(inverser(y_test), inverser(y_pred), squared=False), 3)

# retourne les résultats de la validation croisée
def realiser_validation_croisee(modele):
    cv_resultats = cross_validate(modele, std_X_conso_train, y_conso_train, cv=5, scoring=metriques)
    cv_resultats['full_time'] = cv_resultats['fit_time'] + cv_resultats['score_time']
    return cv_resultats

# retourne une liste contenant la moyenne de chaque métrique calculée lors de la validation croisée
def calculer_moyennes_metriques(cv_resultats):
    return [cv_resultats['test_r2'].mean(), cv_resultats['test_MAE'].mean(),
            cv_resultats['test_RMSE'].mean(), cv_resultats['full_time'].mean()]

dum = dummy.DummyRegressor(strategy='mean')
lr = linear_model.LinearRegression()
modeles_cv = {'baseline': dum, 'regression_lineaire': lr}

for nom, modele in modeles_cv.items():
    resultats_validation_croisee = realiser_validation_croisee(modele)
    resultats.loc[nom] = calculer_moyennes_metriques(resultats_validation_croisee)

In [76]:
# choix du modèle ridge
n_alphas = 200
alphas = np.logspace(-5, 5, n_alphas)
param_grid = {'alphas': alphas}

ridge = linear_model.Ridge()

reg = GridSearchCV(
    ridge,
    param_grid,
    cv=5,
    scoring=metriques,
    refit=False
)

reg.fit(std_X_conso_train, y_conso_train)

ValueError: Invalid parameter alphas for estimator Ridge(). Check the list of available parameters with `estimator.get_params().keys()`.

In [74]:
reg.cv_results_

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [67]:
resultats

Unnamed: 0,R2,MAE,RMSE,Temps
baseline,-0.0582,7325200.0,25101200.0,0.0049952
regression_lineaire,-155.261,17773400.0,182322000.0,0.0185818
ridge,,,,
lasso,,,,
elastic_net,,,,
xg_boost,,,,
random_forest,,,,


In [61]:
resultats

Unnamed: 0,R2,MAE,RMSE,Temps
baseline,-0.0582,7325200.0,25101200.0,0.00599499
regression_lineaire,-155.261,17773400.0,182322000.0,0.0213784
lasso,,,,
ridge,,,,
elastic_net,,,,
xg_boost,,,,
random_forest,,,,


In [19]:
start_time = time.time()
dum.fit(std_X_conso_train, y_conso_train)
y_pred = dum.predict(std_X_conso_test)
end_time = time.time()

duree_traitement = end_time-start_time

MAE = round(mean_absolute_error(inverser(y_conso_test), inverser(y_pred)), 3)
R2 = round(r2_score(inverser(y_conso_test), inverser(y_pred)), 3)
RMSE = round(mean_squared_error(inverser(y_conso_test), inverser(y_pred), squared=False), 3)

print("MAE : {}, R² : {:.3}, RMSE : {}, Temps de traitement : {:.3}s".format(MAE, R2, RMSE, duree_traitement))

MAE : 7045001.007, R² : -0.02, RMSE : 38279392.63, Temps de traitement : 0.001s


In [20]:
# Division du train set en 5 folds
from sklearn.model_selection import KFold

dum = dummy.DummyRegressor(strategy='mean')
MAE = []
R2 = []
RMSE = []
TEMPS = []

# Copie du train set et reset des index
X_train_set, y_train_set = std_X_conso_train, y_conso_train
X_train_set.reset_index(drop=True, inplace=True)
y_train_set.reset_index(drop=True, inplace=True)

kf = KFold(n_splits=5)
for cv_train_index, cv_test_index in kf.split(X_train_set):
    #print("TRAIN:", cv_train_index, "TEST:", cv_test_index)
    X_train_cv, X_test_cv = X_train_set.iloc[cv_train_index, :], X_train_set.iloc[cv_test_index, :]
    y_train_cv, y_test_cv = y_train_set[cv_train_index], y_train_set[cv_test_index]
    dum.fit(X_train_cv, y_train_cv)
    y_pred = dum.predict(X_test_cv)
    MAE.append(round(mean_absolute_error(inverser(y_test_cv), inverser(y_pred)), 3))
    R2.append(round(r2_score(inverser(y_test_cv), inverser(y_pred)), 3))
    RMSE.append(round(mean_squared_error(inverser(y_test_cv), inverser(y_pred), squared=False), 3))

In [21]:
MAE

[7723044.378, 6908162.331, 6781968.717, 7791746.724, 7421064.345]

In [22]:
R2

[-0.083, -0.061, -0.045, -0.073, -0.029]

In [23]:
RMSE

[22652049.589, 22012422.177, 24314505.208, 23458828.72, 33068334.803]

In [24]:
std_X_conso_train.index

RangeIndex(start=0, stop=1740, step=1)

In [25]:
from sklearn import linear_model

start_time = time.time()
lr = linear_model.LinearRegression()
y_pred = cross_val_predict(lr, std_X_conso_train, y_conso_train, cv=5)
end_time = time.time()

duree_traitement = end_time-start_time

MAE = round(mean_absolute_error(inverser(y_conso_train), inverser(y_pred)), 3)
R2 = round(r2_score(inverser(y_conso_train), inverser(y_pred)), 3)
RMSE = round(mean_squared_error(inverser(y_conso_train), inverser(y_pred), squared=False), 3)

print("MAE : {}, R² : {:.3}, RMSE : {}, Temps de traitement : {:.3}s".format(MAE, R2, RMSE, duree_traitement))

MAE : 17773431.266, R² : -1.2e+02, RMSE : 272275214.27, Temps de traitement : 0.0578s


In [26]:
n_alphas = 200
alphas = np.logspace(-5, 5, n_alphas)

start_time = time.time()
rd = linear_model.RidgeCV(alphas=alphas, cv=5).fit(std_X_conso_train, y_conso_train)
#y_pred = cross_val_predict(rd, std_X_conso_train, y_conso_train, cv=5)
rd.best_score_
#end_time = time.time()

0.5577860730295148

In [27]:
rd.alpha_

108.43659686896109

In [28]:
start_time = time.time()
rd = linear_model.Ridge(alpha=615)
y_pred = cross_val_predict(rd, std_X_conso_train, y_conso_train, cv=5)
end_time = time.time()

MAE = round(mean_absolute_error(inverser(y_conso_train), inverser(y_pred)), 3)
R2 = round(r2_score(inverser(y_conso_train), inverser(y_pred)), 3)
RMSE = round(mean_squared_error(inverser(y_conso_train), inverser(y_pred), squared=False), 3)

print("MAE : {}, R² : {:.3}, RMSE : {}, Temps de traitement : {:.3}s".format(MAE, R2, RMSE, duree_traitement))

MAE : 7591715.23, R² : -4.48, RMSE : 58062423.196, Temps de traitement : 0.0578s


In [29]:
import sklearn
print(sklearn.__version__)

0.23.2
