## Importamos las librerías

In [1]:
import pandas as pd
import numpy as np
from modelo_energetico.csv_gen import reduce_columns_sum, reduce_columns_avg, reduce_columns_period_avg, reduce_columns_period_sum
from modelo_energetico.scaler import MultiScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor, Lasso, Ridge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import OrdinalEncoder

## Cargamos los datasets

In [2]:
# Load the raw feature table and target table; column 0 of each CSV becomes the index.
X = pd.read_csv('../raw_data/X_train.csv', index_col=0)
y = pd.read_csv('../raw_data/y_train.csv', index_col=0)

# Show (n_rows, n_columns) of both tables, one per line.
print(X.shape, y.shape, sep='\n')

(7500, 12115)
(7500, 5376)


### Reducimos los features

In [3]:
# Base names of the time-series features: pick every column whose label
# contains '671' and strip its last four characters.
has_671 = X.columns.str.contains('671', regex=False)
columns_s = X.columns[has_671].str[:-4].tolist()
columns_s

['ac_t_conf',
 'ac_t_red',
 'ac_mask',
 'heat_t_conf',
 'heat_t_red',
 'heat_mask',
 'ventilation_t',
 'ventilation_vol',
 'ventilation_mask',
 'occupancy',
 'pc_on_mask',
 'DNI',
 'IBEAM_H',
 'IBEAM_N',
 'IDIFF_H',
 'IGLOB_H',
 'RHUM',
 'TAMB']

In [5]:
# Reduce every time-series feature listed in `columns_s` with the project helper
# `reduce_columns_period_sum` and preview the first rows of the result.
# NOTE(review): the meaning of the third argument (4) is defined by the helper,
# which is not shown here — confirm.
# NOTE(review): X_red_1 is not referenced again in the visible cells (X_red is
# the frame concatenated later) — confirm whether this cell is still needed.
X_red_1 = reduce_columns_period_sum(X, columns_s, 4)
X_red_1.head()

  new_dataframe[f'{column}_{i}'] = np.zeros(dataframe.shape[0])


Unnamed: 0,ac_t_conf_0,ac_t_conf_1,ac_t_conf_2,ac_t_conf_3,ac_t_conf_4,ac_t_conf_5,ac_t_conf_6,ac_t_red_0,ac_t_red_1,ac_t_red_2,...,RHUM_4,RHUM_5,RHUM_6,TAMB_0,TAMB_1,TAMB_2,TAMB_3,TAMB_4,TAMB_5,TAMB_6
0,2280.0,2232.0,2256.0,2256.0,2232.0,2256.0,2232.0,2436.0,2484.0,2448.0,...,5631.0,5711.0,4601.0,1554.7,1472.9,1571.7,2029.3,1900.6,2595.2,2425.6
1,2244.0,2244.0,2232.0,2244.0,2256.0,2244.0,2232.0,2472.0,2436.0,2484.0,...,4418.0,5608.0,5492.0,781.5,955.0,1044.2,921.0,1124.8,1177.7,964.7
2,2232.0,2256.0,2220.0,2268.0,2232.0,2244.0,2244.0,2448.0,2460.0,2448.0,...,6402.0,6899.0,6099.0,738.9,1477.1,1861.1,1201.0,1132.3,962.9,1118.5
3,2232.0,2268.0,2244.0,2244.0,2244.0,2256.0,2256.0,2436.0,2448.0,2460.0,...,7254.0,7645.0,8826.0,1629.2,1392.3,1399.1,1338.9,1640.4,1334.5,1157.2
4,2256.0,2256.0,2244.0,2268.0,2220.0,2268.0,2232.0,2448.0,2448.0,2472.0,...,5325.0,4793.0,4206.0,1430.4,1682.3,2021.3,2100.0,2603.8,2284.2,2242.6


In [None]:
# Reduce every time-series feature listed in `columns_s` with the project helper
# `reduce_columns_avg`; X_red is the reduced frame concatenated into X further down.
# NOTE(review): the meaning of the third argument (3) is defined by the helper,
# which is not shown here — confirm.
X_red = reduce_columns_avg(X, columns_s, 3)
X_red.head()

In [None]:
# Inspect the first 19 column labels of X.
# NOTE(review): `columns_c` is rebound to the first 16 labels in the next cell,
# so this assignment only serves as a preview.
columns_c = X.columns[:19]
columns_c

In [None]:
# Keep the first 16 column labels of X as the candidate non-series columns.
columns_c = X.columns[:16]

In [None]:
# Count distinct values per candidate column (NaN counts as a value), then
# display the counts from fewest to most.
unique_values = X.loc[:, columns_c].nunique(dropna=False)
unique_values.sort_values()

In [None]:
# Order the candidate columns by number of distinct values and drop the first
# one, i.e. the column with the fewest distinct values.
columns_c = unique_values.sort_values().index[1:]

In [None]:
# Fit an ordinal encoder on the selected columns; `fit` returns the encoder
# itself, so it can be bound and displayed in one go.
# NOTE(review): the fitted encoder is never used to transform X in the visible
# cells — confirm whether the encoding step is missing or this cell is obsolete.
ordinal_encoder = OrdinalEncoder().fit(X[columns_c])
ordinal_encoder

In [None]:
# Rebuild X as the selected columns followed by the reduced time-series
# features, placed side by side. This rebinds X: the raw table is gone after
# this cell, so the earlier cells that read raw columns cannot be re-run
# without reloading the CSV.
X = pd.concat((X.loc[:, columns_c], X_red), axis='columns')

In [None]:
# Preview the first rows of the rebuilt feature table.
X.head()

### Reducimos los targets

In [None]:
# Base names of the target time series: pick every column of y whose label
# contains '671' and strip its last four characters. This rebinds `columns_s`,
# which previously held the feature series names.
has_671 = y.columns.str.contains('671', regex=False)
columns_s = y.columns[has_671].str[:-4].tolist()
columns_s

In [None]:
# Target series that are reduced with the project helper `reduce_columns_sum`.
# NOTE(review): the meaning of the third argument (1) is defined by the helper,
# which is not shown here — confirm.
q_series = [
    'Q_AC_OFFICE',
    'Q_HEAT_OFFICE',
    'Q_PEOPLE',
    'Q_EQP',
    'Q_LIGHT',
    'Q_AHU_C',
    'Q_AHU_H',
]
y_red = reduce_columns_sum(y, q_series, 1)

# Q_TOTAL is the row-wise sum over every column the helper returned.
y_red['Q_TOTAL'] = y_red.sum(axis='columns')
y_red.head()

In [None]:
# Reduce the 'T_INT_OFFICE' target series with the project helper
# `reduce_columns_avg` and preview the result.
# NOTE(review): the meaning of the third argument (1) is defined by the helper,
# which is not shown here — confirm.
y_red_t = reduce_columns_avg(y, ['T_INT_OFFICE'], 1)
y_red_t.head()

In [None]:
# Rebuild y as the Q_TOTAL column followed by the reduced temperature columns.
# This rebinds y: the raw target table is gone after this cell.
total_consumption = y_red['Q_TOTAL']
y = pd.concat((total_consumption, y_red_t), axis='columns')
y.head()

### Corroboramos los nuevos shapes

In [None]:
# Shapes of the reduced feature and target tables, one per line.
print(X.shape, y.shape, sep='\n')

## Dividimos los datasets en entrenamiento y prueba

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Shapes of the four split tables, in the order train X, train y, test X, test y.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

## Buscamos modelo óptimo para predecir temperatura y consumo

### Modelo para temperatura

In [None]:
# Two-step pipeline: feature scaling followed by a linear model trained with SGD.
# NOTE(review): MultiScaler is a project class not shown here; it is built with
# the string "RobustScaler" while the grid search below passes scaler instances
# for the same parameter — confirm it accepts both forms.
pipe = Pipeline([
    ("scaling", MultiScaler(scaler="RobustScaler")),
    (
        "model",
        SGDRegressor(
            max_iter=50000,
            early_stopping=True,   # stop when the held-out validation score stalls
            n_iter_no_change=3,
            tol=1e-3,
            # SGD shuffles the data and, with early stopping, holds out a random
            # validation subset; seed it so fits are reproducible. 42 matches
            # the seed used for the train/test split.
            random_state=42,
        ),
    ),
])

In [None]:
# List every tunable parameter name of the pipeline (the keys accepted by the
# grid search), one per line.
for param_name in pipe.get_params():
    print(param_name)

In [None]:
# Training data for the temperature model (aliases of the split, not copies).
X_train_t = X_train
y_train_t = y_train

# Hyper-parameter grid; keys are '<pipeline step>__<parameter>'.
params = {
    # NOTE(review): the pipeline constructs MultiScaler with a string name while
    # scaler instances are passed here — confirm MultiScaler accepts both.
    'scaling__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
    'model__loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    # SGDRegressor only reads l1_ratio when penalty='elasticnet'. With the
    # default penalty='l2' every l1_ratio value trains the same model, so the
    # l1_ratio axis would just multiply the number of fits by its length.
    'model__penalty': ['elasticnet'],
    'model__alpha': [1, 0.1, 0.01, 0.001, 0.0001],
    # l1_ratio=0 is the pure L2 penalty, so the models the original grid
    # effectively searched are still candidates.
    'model__l1_ratio': [1, 0.1, 0.01, 0.001, 0.0001, 0],
}

# Exhaustive search with 10-fold cross-validation, scored by R^2, on all cores.
grid = GridSearchCV(
    pipe,
    param_grid=params,
    cv=10,
    n_jobs=-1,
    verbose=1,
    scoring='r2',
)

# Target: the reduced indoor-temperature column.
grid.fit(X_train_t, y_train_t['T_INT_OFFICE_0'])

In [None]:
# Best pipeline found by the temperature grid search; display its configuration.
best_model = grid.best_estimator_
best_model

### Buscamos las features con importancia por permutación negativa para el score de temperatura

In [None]:
# `grid` uses the default refit=True, so `best_model` (grid.best_estimator_) is
# already fitted on the whole training set. Fitting it again here would only
# repeat an expensive fit and could yield a model that differs from the one
# the search selected, so no refit is done.

# Permutation importance: mean change in the pipeline's default score when each
# feature column is shuffled, averaged over 10 shuffles per feature.
permutation_score = permutation_importance(
    best_model,
    X_train_t,
    y_train_t['T_INT_OFFICE_0'],
    n_repeats=10,
    random_state=42,  # reproducible shuffles
)

# One row per feature. Building the frame from named columns keeps the score
# column numeric (stacking labels and floats in one array makes it object dtype)
# and takes the labels from the frame that was actually permuted.
importance_df = pd.DataFrame({
    'feature': X_train_t.columns,
    'score decrease': permutation_score.importances_mean,
})

In [None]:
# Keep only the features whose mean importance is negative, then display them
# from most negative upwards. The sorted view is only displayed;
# `importance_df` keeps its filtered, unsorted order.
is_negative = importance_df['score decrease'] < 0
importance_df = importance_df[is_negative]
importance_df.sort_values(by='score decrease')

In [None]:
# Names of the features with negative importance for the temperature model.
neg_score_t = importance_df.loc[:, 'feature']
neg_score_t

### Modelo para consumo

In [None]:
# Training data for the consumption model (aliases of the split, not copies).
X_train_q = X_train
y_train_q = y_train

# Hyper-parameter grid; keys are '<pipeline step>__<parameter>'.
params = {
    # NOTE(review): the pipeline constructs MultiScaler with a string name while
    # scaler instances are passed here — confirm MultiScaler accepts both.
    'scaling__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
    'model__loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    # SGDRegressor only reads l1_ratio when penalty='elasticnet'. With the
    # default penalty='l2' every l1_ratio value trains the same model, so the
    # l1_ratio axis would just multiply the number of fits by its length.
    'model__penalty': ['elasticnet'],
    'model__alpha': [1, 0.1, 0.01, 0.001, 0.0001],
    # l1_ratio=0 is the pure L2 penalty, so the models the original grid
    # effectively searched are still candidates.
    'model__l1_ratio': [1, 0.1, 0.01, 0.001, 0.0001, 0],
}

# Exhaustive search with 10-fold cross-validation, scored by R^2, on all cores.
# This rebinds `grid`, replacing the search object from the temperature section.
grid = GridSearchCV(
    pipe,
    param_grid=params,
    cv=10,
    n_jobs=-1,
    verbose=1,
    scoring='r2',
)

# Target: the total consumption column.
grid.fit(X_train_q, y_train_q['Q_TOTAL'])

In [None]:
# Best pipeline found by the consumption grid search; display its configuration.
# This rebinds `best_model`, replacing the temperature model bound earlier.
best_model = grid.best_estimator_
best_model

### Buscamos las features con importancia por permutación negativa para el score de consumo

In [None]:
# `grid` uses the default refit=True, so `best_model` (grid.best_estimator_) is
# already fitted on the whole training set. Fitting it again here would only
# repeat an expensive fit and could yield a model that differs from the one
# the search selected, so no refit is done.

# Permutation importance: mean change in the pipeline's default score when each
# feature column is shuffled, averaged over 10 shuffles per feature.
permutation_score = permutation_importance(
    best_model,
    X_train_q,
    y_train_q['Q_TOTAL'],
    n_repeats=10,
    random_state=42,  # reproducible shuffles
)

# One row per feature. Building the frame from named columns keeps the score
# column numeric (stacking labels and floats in one array makes it object dtype)
# and takes the labels from the frame that was actually permuted.
importance_df = pd.DataFrame({
    'feature': X_train_q.columns,
    'score decrease': permutation_score.importances_mean,
})

In [None]:
# Keep only the features whose mean importance is negative, then display them
# from most negative upwards. The sorted view is only displayed;
# `importance_df` keeps its filtered, unsorted order.
is_negative = importance_df['score decrease'] < 0
importance_df = importance_df[is_negative]
importance_df.sort_values(by='score decrease')

In [None]:
# Names of the features with negative importance for the consumption model.
# Stored under its own name (`_q`, matching X_train_q / y_train_q): the cell
# previously assigned to `neg_score_t`, which overwrote the temperature result.
neg_score_q = importance_df['feature']
neg_score_q