# Audits énergétiques

## Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

## Data Input

In [None]:
# Read the semicolon-separated CSV file into a pandas DataFrame.

df = pd.read_csv(
    "../input/auditsnergtiques/Isolation (1).csv",
    sep=";",
)

In [None]:
# Show an overview of the columns: first rows, summary statistics, column names.

print(df.head(), df.describe(), list(df), sep="\n")

## Data pre-processing

In [None]:
# Use the `resistance` column as the regression target.
# (Disabled alternative, kept for reference:
#  df["target"] = df["lambda"] / (df["epaisseur"]/1000))
df = df.assign(target=df["resistance"])

## Data Analysis

In [None]:
sns.set_style('whitegrid')

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), sharey=True)
fig.suptitle('Regression Plots for numerical features')

# One regression panel per numerical feature, all sharing the target axis.
numerical_features = ('annee_construction', 'cout_total_ht', 'nb_pieces')
for panel, feature in zip(axes, numerical_features):
    sns.regplot(x=feature, y='target', data=df, ax=panel)

In [None]:
# Keep only the model inputs and the target, then discard the rows that have
# no target value: they cannot be used for supervised training.
df = df.loc[:, ["region","isolant","poste_isolation","epaisseur","target"]].dropna(subset=["target"])

print(df.columns.tolist())

## One-hot encoding

In [None]:
# One indicator column per category value, prefixed by the source column.
region_one_hot = pd.get_dummies(df["region"], prefix='Region')
isolant_one_hot = pd.get_dummies(df["isolant"], prefix='Isolant')
poste_isolation_one_hot = pd.get_dummies(df["poste_isolation"], prefix='Poste_Isolant')

# Indicator columns come first, followed by the original columns.
frames = [region_one_hot, isolant_one_hot, poste_isolation_one_hot, df]
concat_df = pd.concat(frames, axis="columns")
print(concat_df)

# Replace the remaining NaN values with 0.
concat_df = concat_df.fillna(value=0)

In [None]:
# Feature matrix: the thickness column followed by the selected indicator columns.
X = concat_df[
    [
        'epaisseur',
        'Region_ARA',
        'Region_BFC',
        'Region_BRE',
        'Region_COR',
        'Region_CVL',
        'Region_GDE',
        'Region_IDF',
        'Region_NAQ',
        'Region_NOR',
        'Region_PAC',
        'Region_PDL',
        'Isolant_AUTRES',
        'Isolant_LAINE MINERALE',
        'Isolant_LAINE VEGETALE',
        'Isolant_PLASTIQUES',
        'Poste_Isolant_COMBLES PERDUES',
        'Poste_Isolant_ITE',
        'Poste_Isolant_ITI',
        'Poste_Isolant_PLANCHER BAS',
        'Poste_Isolant_RAMPANTS',
        'Poste_Isolant_SARKING',
    ]
].to_numpy()
y = concat_df["target"].to_numpy()

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

## Model

In [None]:
# Linear baselines.
reg1 = linear_model.LinearRegression()
reg2 = linear_model.Ridge(alpha=.5)
reg3 = linear_model.BayesianRidge()

# Support Vector Machine regressor.
# The variable keeps the name `svm` because the training and validation cells
# refer to it, but it is built from the directly imported `SVR` class rather
# than from `svm.SVR()`: the old form rebound the `svm` module name to the
# estimator, so running this cell a second time raised AttributeError.
svm = SVR()
clf = tree.DecisionTreeRegressor()
gbr = GradientBoostingRegressor(random_state=42)
rfr = RandomForestRegressor(random_state=42)

# Ensemble that combines the linear, gradient boosting and random forest regressors.
ereg = VotingRegressor(estimators=[('lr', reg1), ('gbr', gbr), ('rfr', rfr)])

nn = MLPRegressor(random_state=42, max_iter=500)

## Training

In [None]:
# Fit each candidate on the training split, in the order the models were defined.
for estimator in (reg1, reg2, reg3, svm, clf, gbr, rfr, ereg):
    estimator.fit(X_train, y_train)

# Fitted last and kept as the cell's final expression so its repr is still displayed.
nn.fit(X_train, y_train)

## Validation

In [None]:
def plot_results(prediction,model_name):
    """Plot the test targets and a model's predictions on the same axes.

    Parameters
    ----------
    prediction : array-like
        Predicted values, plotted against their position in the sequence.
    model_name : str
        Text used as the figure title.

    Notes
    -----
    Reads the notebook-level ``y_test`` array for the reference curve.
    """
    # Explicit figure/axes so the curves never land on a leftover figure.
    fig, ax = plt.subplots()
    # Label both curves: without a legend they cannot be told apart.
    ax.plot(y_test, label="actual")
    ax.plot(prediction, label="predicted")
    ax.set(title=model_name, xlabel="test sample index", ylabel="target")
    ax.legend()
    plt.show()

In [None]:
def validation(model,model_name):
    """Score a fitted model on the test split and plot its predictions.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    model_name : str
        Label forwarded to ``plot_results`` as the figure title.

    Returns
    -------
    float
        Root mean squared error between ``y_test`` and the predictions
        made on ``X_test`` (both read from the notebook namespace).
    """
    predicted = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predicted))
    plot_results(predicted, model_name)
    print("rmse : ", rmse)
    return rmse

In [None]:
# Validate every fitted model in the same order as before and keep each RMSE,
# keyed by model name, instead of discarding all but one of them.
rmse_by_model = {
    model_name: validation(model, model_name)
    for model, model_name in [
        (reg1, "Linear Regression"),
        (reg2, "Ridge Regression"),
        (reg3, "Bayesian Ridge Regression"),
        (svm, "Support Vector Machines"),
        (clf, "Decision Trees"),
        (gbr, "Gradient Boosting Regressor"),
        (rfr, "Random Forest Regressor"),
        (ereg, "Voting Regressor"),
        (nn, "Neural Network"),
    ]
}

# Later cells use `rmse` as the Random Forest score, so keep that binding.
rmse = rmse_by_model["Random Forest Regressor"]

# Show all scores side by side as the cell output.
rmse_by_model

## Reliability error

In [None]:
def fiability_error(value, rmse):
    """Return a labelled signed difference between a value and an RMSE.

    Parameters
    ----------
    value : number
        Reference value to compare against.
    rmse : number
        Root mean squared error to subtract from ``value``.

    Returns
    -------
    tuple
        ``("Distance with RMSE : ", value - rmse)``: the label string followed
        by the difference.
    """
    distance = value - rmse
    return "Distance with RMSE : ", distance

In [None]:
# Difference between a reference value of 1 and `rmse`, which the validation
# cell sets from the Random Forest model.
fiability_error(1, rmse)

## Prediction

In [None]:
def predict(model,region,isolant,poste_isolant,epaisseur):
    """Predict the target for a single, manually described sample.

    The sample is appended to the notebook-level ``df`` so that one-hot
    encoding produces the same indicator columns as for the training data,
    then only the encoded last row is passed to the model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    region : str
        Value for the ``region`` column.
    isolant : str
        Value for the ``isolant`` column.
    poste_isolant : str
        Value for the ``poste_isolation`` column.
    epaisseur : number
        Value for the ``epaisseur`` column.

    Returns
    -------
    The output of ``model.predict`` for the single encoded row.

    Raises
    ------
    KeyError
        If one of the hard-coded columns to exclude is absent after encoding.

    Notes
    -----
    The excluded indicator columns are hard-coded and must stay in sync with
    the feature list used to build the training matrix.
    """
    # Build the row by column name so that `epaisseur` stays numeric; packing
    # strings and numbers into one NumPy array would turn every field into text.
    prediction_line = pd.DataFrame(
        {
            'region': [region],
            'isolant': [isolant],
            'poste_isolation': [poste_isolant],
            'epaisseur': [epaisseur],
            'target': [0],  # placeholder, dropped before predicting
        },
        index=['prediction'],
    )

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    prediction_concat = pd.concat([df, prediction_line])

    region_one_hot = pd.get_dummies(prediction_concat['region'], prefix='Region')
    isolant_one_hot = pd.get_dummies(prediction_concat['isolant'], prefix='Isolant')
    poste_isolation_one_hot = pd.get_dummies(prediction_concat['poste_isolation'], prefix='Poste_Isolant')

    frames = [prediction_concat, region_one_hot, isolant_one_hot, poste_isolation_one_hot]
    prediction_concat = pd.concat(frames, axis=1).fillna(0)

    # Only the appended sample (last row) is of interest.
    one_hot_prediction_line = prediction_concat.iloc[[-1]]

    # Remove the raw categorical columns, the placeholder target and the
    # indicator columns that are excluded from the feature set.
    one_hot_prediction_line = one_hot_prediction_line.drop(
        columns=[
            'region',
            'isolant',
            'poste_isolation',
            'target',
            'Region_OCC',
            'Region_Z_Non Connu',
            'Poste_Isolant_TOITURE TERRASSE',
        ]
    )

    # Cast explicitly: mixing float and indicator columns would otherwise
    # yield an object-dtype array.
    return model.predict(one_hot_prediction_line.to_numpy(dtype=float))

In [None]:
# Example prediction with the Random Forest model for one hand-written sample.
predicted_value = predict(
    rfr,
    region='IDF',
    isolant='LAINE MINERALE',
    poste_isolant='COMBLES PERDUES',
    epaisseur=300,
)
print("Predicted coefficient with rmse =", rmse, "y =", predicted_value[0])