# CROSS-VALIDATION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import time
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Display settings: no limit on the number of columns or rows shown, and up
# to 100 characters per cell before pandas truncates the text.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

In [3]:
# Load the cleaned building data. The CSV stores the former index as an extra
# 'Unnamed: 0' column, which is discarded right after reading.
df = pd.read_csv("clean_building_data.csv").drop(columns=['Unnamed: 0'])

In [4]:
# Replace every missing value in the table with 0.
df = df.fillna(0)

In [5]:
# Infinite-value handling
def control_inf(df):
    """Drop, in place, every row of ``df`` that contains an infinite value.

    Only numeric columns are inspected, so a frame that also holds text or
    other non-numeric columns does not make ``np.isinf`` raise. NaN values are
    not infinite and are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
        The shape of the cleaned frame is printed instead of being returned.
    """
    numeric_values = df.select_dtypes(include=[np.number]).to_numpy(dtype=float)
    # One boolean per row: True when any numeric cell is +inf or -inf.
    has_inf = np.isinf(numeric_values).any(axis=1)
    # Rows are removed by index label.
    df.drop(index=df.index[has_inf], inplace=True)
    print(df.shape)

In [6]:
# Remove the rows holding infinite values; control_inf edits df in place and
# prints the resulting shape.
control_inf(df)

(3394, 62)


In [7]:
# Columns removed from both feature tables: the two columns used as
# prediction targets further down, plus the other energy/emission
# measurement columns listed here.
excluded_columns = [
    'TotalGHGEmissions', 'SiteEnergyUse(kBtu)',
    'SiteEUI(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'GHGEmissionsIntensity',
    'SteamUse(kBtu)', 'Electricity(kBtu)', 'NaturalGas(kBtu)',
]

# Feature table for the CO2 prediction: 'ENERGYSTARScore' is left out as well,
# in order to assess the usefulness of that score.
df_emssion = df.drop(columns=excluded_columns + ['ENERGYSTARScore'])

# Feature table for the energy prediction: 'ENERGYSTARScore' is kept.
df_energy = df.drop(columns=excluded_columns)

In [8]:
# Size check, in order: emission features, energy features, full table.
for table in (df_emssion, df_energy, df):
    print(table.shape)

(3394, 53)
(3394, 54)
(3394, 62)


In [9]:
# Empty results table with one column per reported quantity.
score_columns = ['Modèle', 'MSE', 'R²', 'Time']
df_score = pd.DataFrame(columns=score_columns)
df_score

Unnamed: 0,Modèle,MSE,R²,Time


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

**Sélection du modèle via validation croisée - Prédiction des émissions de CO2**

In [17]:
# Shared 5-fold splitter: rows are shuffled before splitting, with a fixed
# seed so the fold assignment is repeatable.
random_state = 0
cross_validation = KFold(n_splits=5, shuffle=True, random_state=random_state)

Train (2262, 53) Test (1132, 53)
Train (2263, 53) Test (1131, 53)
Train (2263, 53) Test (1131, 53)


SVR EMISSION CO2

In [19]:
# récupérer les values dans la table de composants principals
X = df_emssion.values
# définir la variable à prédire
y = df['TotalGHGEmissions']

In [20]:
# Scaler reused by the pipelines below, and an RBF-kernel SVR with fixed
# hyper-parameters.
# NOTE(review): C and epsilon look like values kept from an earlier search —
# confirm where they come from.
scaler = StandardScaler()
svr = SVR(kernel='rbf', C=10, epsilon=0.021544346900318846)

In [21]:
# Standardise the features, then fit the SVR, inside each fold.
pipeline = Pipeline(
    steps=[
        ('standardiser', scaler),
        ('svr', svr),
    ]
)
# No `scoring` is passed, so the estimator's default score is used
# (R² for scikit-learn regressors); the mean over the folds is reported.
score = cross_val_score(pipeline, X, y, cv=cross_validation)
score_svr = score.mean()
print(score_svr)

0.7389467021033114


RANDOM FOREST EMISSION CO2

In [26]:
# Random forest with fixed hyper-parameters. It is seeded with the
# notebook-wide `random_state` so that its random draws are repeatable and
# re-running the notebook yields the same cross-validation score.
rfr = RandomForestRegressor(
    n_estimators=900,
    max_depth=10,
    max_features='sqrt',
    max_samples=0.7,
    min_samples_split=2,
    random_state=random_state,
)

In [27]:
# Standardise the features, then fit the random forest, inside each fold.
pipeline = Pipeline(
    steps=[
        ('standardiser', scaler),
        ('random forest', rfr),
    ]
)
# No `scoring` is passed, so the estimator's default score is used
# (R² for scikit-learn regressors); the mean over the folds is reported.
score = cross_val_score(pipeline, X, y, cv=cross_validation)
score_rfr = score.mean()
print(score_rfr)

0.7847762111903304


GRADIENT BOOSTING CO2

In [28]:
# Gradient boosting with fixed hyper-parameters. It is seeded with the
# notebook-wide `random_state` so that re-running the notebook yields the
# same cross-validation score.
gbx = GradientBoostingRegressor(
    n_estimators=900,
    learning_rate=0.2,
    max_depth=3,
    random_state=random_state,
)

In [29]:
# Standardise the features, then fit the gradient boosting model, inside
# each fold.
pipeline = Pipeline(
    steps=[
        ('standardiser', scaler),
        ('gradient boosting', gbx),
    ]
)
# No `scoring` is passed, so the estimator's default score is used
# (R² for scikit-learn regressors); the mean over the folds is reported.
score = cross_val_score(pipeline, X, y, cv=cross_validation)
score_gbx = score.mean()
print(score_gbx)

0.8565063291864693


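**Résumé 1 :** Gradient Boosting est le modèle le plus performant pour prédire les émissions de CO2 (score moyen de validation croisée ≈ 0,86, contre ≈ 0,78 pour Random Forest et ≈ 0,74 pour SVR).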

**Sélection du modèle via validation croisée - Prédiction de la consommation d'énergie**

In [30]:
# récupérer les values dans la table de composants principals
X = df_energy.values
# définir la variable à prédire
y = df['SiteEnergyUse(kBtu)']

SVR CONSOMMATION ENERGIE

In [31]:
# RBF-kernel SVR for the energy target, with the same fixed hyper-parameters
# as the SVR used for the CO2 target.
svr = SVR(kernel='rbf', C=10, epsilon=0.021544346900318846)
# Standardise the features, then fit the SVR, inside each fold.
pipeline = Pipeline(
    steps=[
        ('standardiser', scaler),
        ('svr energy', svr),
    ]
)
# No `scoring` is passed, so the estimator's default score is used
# (R² for scikit-learn regressors); the mean over the folds is reported.
score = cross_val_score(pipeline, X, y, cv=cross_validation)
score_svr_energy = score.mean()
print(score_svr_energy)

0.584452940906403


RANDOM FOREST CONSOMMATION ENERGIE

In [32]:
# Random forest for the energy target. It is seeded with the notebook-wide
# `random_state` so that its random draws are repeatable and re-running the
# notebook yields the same cross-validation score.
rfr = RandomForestRegressor(
    n_estimators=700,
    max_depth=10,
    max_features='sqrt',
    max_samples=0.7,
    random_state=random_state,
)

# Standardise the features, then fit the forest, inside each fold. No
# `scoring` is passed, so the estimator's default score is used (R² for
# scikit-learn regressors); the mean over the folds is reported.
pipeline = Pipeline([('standardiser', scaler), ('random forest energy', rfr)])
score = cross_val_score(pipeline, X, y, cv=cross_validation)
score_rfr_energy = score.mean()
print(score_rfr_energy)

0.723383734418371


GRADIENT BOOSTING CONSOMMATION ENERGIE

In [33]:
# Gradient boosting for the energy target. It is seeded with the
# notebook-wide `random_state` so that re-running the notebook yields the
# same cross-validation score.
gbx = GradientBoostingRegressor(
    n_estimators=900,
    learning_rate=0.2,
    max_depth=3,
    random_state=random_state,
)

# Standardise the features, then fit the boosted model, inside each fold. No
# `scoring` is passed, so the estimator's default score is used (R² for
# scikit-learn regressors); the mean over the folds is reported.
pipeline = Pipeline([('standardiser', scaler), ('gradient boosting energy', gbx)])
score = cross_val_score(pipeline, X, y, cv=cross_validation)
score_gbx_energy = score.mean()
print(score_gbx_energy)

0.8416790467984834


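**Résumé 2 :** Gradient Boosting est le modèle le plus performant pour prédire la consommation totale d'énergie (score moyen de validation croisée ≈ 0,84, contre ≈ 0,72 pour Random Forest et ≈ 0,58 pour SVR).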