## Importamos las librerías

In [1]:
import pandas as pd
import numpy as np
from modelo_energetico.csv_gen import reduce_columns_sum, reduce_columns_avg
from modelo_energetico.scaler import MultiScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor, Lasso, Ridge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.inspection import permutation_importance

In [None]:
# Colab-only cell: mount Google Drive at /content/gdrive.
# The import stays in this cell because `google.colab` is only importable inside Colab.
from google.colab import drive

drive.mount('/content/gdrive')

Si quieren acceder a una carpeta que les compartieron, primero tendrán que agregar un acceso directo en su Drive:
hagan clic derecho sobre la carpeta en Drive y elijan "Añadir acceso directo".

## Cargamos los datasets

In [3]:
# Read the raw feature and target tables; the first CSV column becomes the index.
X, y = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../raw_data/X_train.csv', '../raw_data/y_train.csv')
)

# Report both shapes, one per line.
for frame in (X, y):
    print(frame.shape)

(7500, 12115)
(7500, 5376)


### Reducimos los features

In [4]:
# Collect the base names of the time-series features: every column of X whose
# name contains '671', with its last four characters dropped.
# NOTE(review): presumably '671' marks the final time step ('_671') — confirm.
columns_s = []
for col_name in X.columns:
    if '671' in col_name:
        columns_s.append(col_name[:-4])
columns_s

['ac_t_conf',
 'ac_t_red',
 'ac_mask',
 'heat_t_conf',
 'heat_t_red',
 'heat_mask',
 'ventilation_t',
 'ventilation_vol',
 'ventilation_mask',
 'occupancy',
 'pc_on_mask',
 'DNI',
 'IBEAM_H',
 'IBEAM_N',
 'IDIFF_H',
 'IGLOB_H',
 'RHUM',
 'TAMB']

In [5]:
# Collapse the time-series columns of X with the project helper reduce_columns_avg,
# passing the base names in columns_s and the value 3. The displayed result has
# three columns per base name (suffixes _0, _1, _2).
# NOTE(review): presumably each output column averages one third of the time
# steps — confirm against modelo_energetico.csv_gen.
X_red = reduce_columns_avg(X, columns_s, 3)
# Preview the first rows of the reduced frame.
X_red.head()

Unnamed: 0,ac_t_conf_0,ac_t_conf_1,ac_t_conf_2,ac_t_red_0,ac_t_red_1,ac_t_red_2,ac_mask_0,ac_mask_1,ac_mask_2,heat_t_conf_0,...,IDIFF_H_2,IGLOB_H_0,IGLOB_H_1,IGLOB_H_2,RHUM_0,RHUM_1,RHUM_2,TAMB_0,TAMB_1,TAMB_2
0,23.428571,23.428571,23.428571,25.642857,25.642857,25.642857,0.053571,0.946429,0.303571,20.571429,...,147.71875,2.700893,505.28125,266.441964,72.629464,60.138393,53.383929,17.085268,20.870089,22.535714
1,23.357143,23.357143,23.357143,25.642857,25.642857,25.642857,0.017857,0.910714,0.357143,20.642857,...,58.053571,0.191964,365.825893,90.40625,26.383929,22.035714,20.857143,7.985268,11.346875,11.779018
2,23.357143,23.357143,23.357143,25.5,25.5,25.5,0.0,0.857143,0.392857,20.642857,...,119.321429,0.133929,370.602679,190.96875,70.133929,59.566964,52.245536,10.124107,13.175893,14.609821
3,23.428571,23.428571,23.428571,25.5,25.5,25.5,0.0,0.875,0.321429,20.5,...,59.727679,0.0,168.392857,87.758929,84.276786,76.736607,70.763393,13.338393,15.025446,15.795089
4,23.428571,23.428571,23.428571,25.571429,25.571429,25.571429,0.017857,0.839286,0.339286,20.714286,...,144.674107,2.629464,554.526786,294.299107,69.325893,55.732143,47.866071,17.783482,22.098661,24.245536


In [6]:
# Inspect the first 19 columns of X (the non-time-series columns, including the
# init_day / init_month / init_year fields shown in the output).
# NOTE(review): columns_c is reassigned to the first 16 columns in the next cell,
# so this selection is exploratory only.
columns_c = X.columns[:19]
columns_c

Index(['airchange_infiltration_vol_per_h', 'capacitance_kJ_perdegreK_perm3',
       'power_VCV_kW_heat', 'power_VCV_kW_clim', 'nb_occupants', 'nb_PCs',
       'facade_1_thickness_2', 'facade_1_window_area_percent',
       'facade_2_thickness_2', 'facade_2_window_area_percent',
       'facade_3_thickness_2', 'facade_3_window_area_percent',
       'facade_4_thickness_2', 'facade_4_window_area_percent',
       'roof_thickness_2', 'ground_thickness_2', 'init_day', 'init_month',
       'init_year'],
      dtype='object')

In [7]:
# Keep only the first 16 columns of X as the constant features; compared with
# the 19-column selection this leaves out init_day, init_month and init_year.
columns_c = X.columns[:16]
columns_c

Index(['airchange_infiltration_vol_per_h', 'capacitance_kJ_perdegreK_perm3',
       'power_VCV_kW_heat', 'power_VCV_kW_clim', 'nb_occupants', 'nb_PCs',
       'facade_1_thickness_2', 'facade_1_window_area_percent',
       'facade_2_thickness_2', 'facade_2_window_area_percent',
       'facade_3_thickness_2', 'facade_3_window_area_percent',
       'facade_4_thickness_2', 'facade_4_window_area_percent',
       'roof_thickness_2', 'ground_thickness_2'],
      dtype='object')

In [8]:
# Rebuild X as the selected constant columns followed by the reduced
# time-series columns, placed side by side.
X = pd.concat([X.loc[:, columns_c], X_red], axis='columns')

### Reducimos los targets

In [9]:
# Base names of the target series: the columns of y whose name contains '671',
# with the last four characters removed.
columns_s = [target_name[:-4] for target_name in y.columns if target_name.find('671') != -1]
columns_s

['Q_AC_OFFICE',
 'Q_HEAT_OFFICE',
 'Q_PEOPLE',
 'Q_EQP',
 'Q_LIGHT',
 'Q_AHU_C',
 'Q_AHU_H',
 'T_INT_OFFICE']

In [10]:
# Energy-related target series handed to the project helper reduce_columns_sum.
energy_series = [
    'Q_AC_OFFICE',
    'Q_HEAT_OFFICE',
    'Q_PEOPLE',
    'Q_EQP',
    'Q_LIGHT',
    'Q_AHU_C',
    'Q_AHU_H',
]
y_red = reduce_columns_sum(y, energy_series, 1)

# Q_TOTAL is the row-wise sum of all the reduced energy columns.
y_red = y_red.assign(Q_TOTAL=y_red.sum(axis='columns'))
y_red.head()

Unnamed: 0,Q_AC_OFFICE_0,Q_HEAT_OFFICE_0,Q_PEOPLE_0,Q_EQP_0,Q_LIGHT_0,Q_AHU_C_0,Q_AHU_H_0,Q_TOTAL
0,10748.1049,0.0,3240.0,3874.3458,2112.27,3890.5751,1096.6969,24961.9927
1,1819.367,469.0627,4212.0,5667.4181,2151.491,40.6438,7093.4597,21453.4423
2,6245.3497,0.0,4860.0,6409.7036,2027.7509,662.0432,4612.1041,24816.9515
3,2372.0553,0.0,4212.0,6605.6016,2739.806,285.1977,2852.4764,19067.137
4,15234.8404,0.0,3240.0,7581.4202,2093.6939,3937.5324,520.0846,32607.5715


In [11]:
# Reduce the T_INT_OFFICE series with reduce_columns_avg and the value 1; the
# displayed result has a single column, T_INT_OFFICE_0.
# NOTE(review): presumably the mean temperature over the whole period — confirm
# against the helper's implementation.
y_red_t = reduce_columns_avg(y, ['T_INT_OFFICE'], 1)
y_red_t.head()

Unnamed: 0,T_INT_OFFICE_0
0,24.175757
1,21.766815
2,23.245216
3,22.806492
4,24.106955


In [12]:
# Final targets: the total consumption column next to the reduced temperature column.
target_parts = [y_red['Q_TOTAL'], y_red_t]
y = pd.concat(target_parts, axis='columns')
y.head()

Unnamed: 0,Q_TOTAL,T_INT_OFFICE_0
0,24961.9927,24.175757
1,21453.4423,21.766815
2,24816.9515,23.245216
3,19067.137,22.806492
4,32607.5715,24.106955


### Corroboramos los nuevos shapes

In [13]:
# Print the shapes of the reduced features and targets, one per line.
for reduced in (X, y):
    print(reduced.shape)

(7500, 70)
(7500, 2)


## Spliteamos los datasets

In [14]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [15]:
# Print the shape of every split, in the order X_train, y_train, X_test, y_test.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(5250, 70)
(5250, 2)
(2250, 70)
(2250, 2)


## Buscamos modelo óptimo para predecir temperatura y consumo

### Modelo para temperatura

In [16]:
# Quick check of the project's MultiScaler on a single-column frame.
prueba = X_train[["airchange_infiltration_vol_per_h"]]
# Wrap a MinMaxScaler instance in MultiScaler.
scaler = MultiScaler(MinMaxScaler())
scaler.fit(prueba)
# Last expression of the cell: the transformed values are shown as its output.
scaler.transform(prueba)

array([[1.        ],
       [0.66666667],
       [0.        ],
       ...,
       [0.33333333],
       [1.        ],
       [0.33333333]])

In [17]:
# Pipeline shared by both grid searches: a scaling step followed by a linear
# model trained with stochastic gradient descent.
# NOTE(review): MultiScaler receives the string "RobustScaler" here, while other
# cells pass scaler instances (e.g. MinMaxScaler()); confirm that MultiScaler
# accepts a name as well as an instance.
pipe = Pipeline([
    ("scaling", MultiScaler(scaler="RobustScaler")),
    # random_state fixes the data shuffling and the early-stopping validation
    # split, so repeated runs produce the same scores.
    ("model", SGDRegressor(max_iter=50000, early_stopping=True, n_iter_no_change=3,
                           tol=1e-3, random_state=42)),
])

In [18]:
# List every tunable parameter name of the pipeline, one per line.
print("\n".join(pipe.get_params().keys()))

memory
steps
verbose
scaling
model
scaling__scaler
model__alpha
model__average
model__early_stopping
model__epsilon
model__eta0
model__fit_intercept
model__l1_ratio
model__learning_rate
model__loss
model__max_iter
model__n_iter_no_change
model__penalty
model__power_t
model__random_state
model__shuffle
model__tol
model__validation_fraction
model__verbose
model__warm_start


In [None]:
# Temperature model: same training split, target column T_INT_OFFICE_0.
X_train_t = X_train
y_train_t = y_train

params = {
    'scaling__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
    'model__loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    # SGDRegressor only uses l1_ratio when penalty='elasticnet'. With the default
    # 'l2' penalty the five l1_ratio values would train identical models, so the
    # penalty is set explicitly to make that axis of the search meaningful.
    'model__penalty': ['elasticnet'],
    'model__alpha': [1, 0.1, 0.01, 0.001, 0.0001],
    'model__l1_ratio': [1, 0.1, 0.01, 0.001, 0.0001],
}

# Exhaustive search over the grid, scored by R^2 with 10-fold cross-validation
# on all available cores.
grid = GridSearchCV(
    pipe,
    param_grid=params,
    cv=10,
    n_jobs=-1,
    verbose=1,
    scoring='r2',
)

grid.fit(X_train_t, y_train_t['T_INT_OFFICE_0'])

In [None]:
# Take the estimator selected by the grid search and display it.
best_model = grid.best_estimator_
best_model

### Buscamos los features con importancia negativa para el score de temperatura

In [None]:
# Fit the selected pipeline on the temperature target before measuring importances.
best_model.fit(X_train_t, y_train_t['T_INT_OFFICE_0'])

# Permutation importance: mean change in the model score when each feature is
# shuffled, over 10 repeats. random_state makes the shuffles repeatable.
permutation_score = permutation_importance(
    best_model,
    X_train_t,
    y_train_t['T_INT_OFFICE_0'],
    n_repeats=10,
    random_state=42,
)

# Build the table column by column so the scores keep a numeric dtype (stacking
# names and floats in one array turns everything into objects), and take the
# feature names from the same frame that was passed to permutation_importance.
importance_df = pd.DataFrame({
    'feature': X_train_t.columns,
    'score decrease': permutation_score.importances_mean,
})

In [None]:
# Keep only the features whose mean importance is below zero, then show them
# ordered from the most negative upwards.
is_negative = importance_df['score decrease'] < 0
importance_df = importance_df.loc[is_negative]
importance_df.sort_values("score decrease")

In [None]:
# Names of the features with negative importance for the temperature model.
neg_score_t = importance_df.loc[:, 'feature']
neg_score_t

### Modelo para consumo

In [19]:
# Consumption model: same training split, target column Q_TOTAL.
X_train_q = X_train
y_train_q = y_train

params = {
    'scaling__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
    'model__loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    # SGDRegressor only uses l1_ratio when penalty='elasticnet'. With the default
    # 'l2' penalty the five l1_ratio values would train identical models, so the
    # penalty is set explicitly to make that axis of the search meaningful.
    'model__penalty': ['elasticnet'],
    'model__alpha': [1, 0.1, 0.01, 0.001, 0.0001],
    'model__l1_ratio': [1, 0.1, 0.01, 0.001, 0.0001],
}

# Exhaustive search over the grid, scored by R^2 with 10-fold cross-validation
# on all available cores.
grid = GridSearchCV(
    pipe,
    param_grid=params,
    cv=10,
    n_jobs=-1,
    verbose=1,
    scoring='r2',
)

grid.fit(X_train_q, y_train_q['Q_TOTAL'])

Fitting 10 folds for each of 225 candidates, totalling 2250 fits


KeyboardInterrupt: 

In [None]:
# Take the estimator selected by the grid search and display it.
best_model = grid.best_estimator_
best_model

### Buscamos los features con importancia negativa para el score de consumo

In [None]:
# Fit the selected pipeline on the consumption target. This section uses the
# *_q variables defined by the consumption grid-search cell, so it does not
# depend on the temperature cells having been run.
best_model.fit(X_train_q, y_train_q['Q_TOTAL'])

# Permutation importance: mean change in the model score when each feature is
# shuffled, over 10 repeats. random_state makes the shuffles repeatable.
permutation_score = permutation_importance(
    best_model,
    X_train_q,
    y_train_q['Q_TOTAL'],
    n_repeats=10,
    random_state=42,
)

# Build the table column by column so the scores keep a numeric dtype (stacking
# names and floats in one array turns everything into objects), and take the
# feature names from the same frame that was passed to permutation_importance.
importance_df = pd.DataFrame({
    'feature': X_train_q.columns,
    'score decrease': permutation_score.importances_mean,
})

In [None]:
# Restrict the table to features with a mean importance below zero and display
# them sorted from the lowest score upwards.
below_zero = importance_df['score decrease'] < 0
importance_df = importance_df.loc[below_zero]
importance_df.sort_values("score decrease")

In [None]:
# Names of the features with negative importance for the consumption model.
# Stored under its own name (suffix _q) so it does not overwrite neg_score_t,
# the equivalent result for the temperature model.
neg_score_q = importance_df['feature']
neg_score_q