# Feauture Selection

## Imports & Load Data

In [216]:
import warnings
warnings.filterwarnings('ignore')

In [217]:
import numpy as np
import pandas as pd

import pickle

In [218]:
# Cargar endpoint
infile = open('./data/21_ETL_tabular.pkl','rb')
tabular_ft = pickle.load(infile)
infile.close()

In [219]:
filename = './data/221_graph_metrics.pkl'
infile = open(filename, 'rb')
gmetrics_ft = pickle.load(infile)
infile.close()

In [220]:
filename = './data/222_ETRD_ROUTES_ELEC.pkl'
infile = open(filename, 'rb')
path_ft = pickle.load(infile)
infile.close()

In [221]:
filename = './data/1_OUTFT_UNTRD_FIC.pkl'
infile = open(filename, 'rb')
DEPVAR = pickle.load(infile)
infile.close()

## Preparar Datos

In [222]:
tabular_cols = tabular_ft.columns

In [223]:
gmetrics_cols = gmetrics_ft.columns

In [224]:
path_ft.drop(['ROUTE_SUB','SID_ROUTE_SUB'], axis=1, inplace=True)
path_cols = path_ft.columns

In [225]:
data = pd.merge(tabular_ft, gmetrics_ft, on = 'COD_ID')
data = pd.merge(data, path_ft, on = 'COD_ID')

In [226]:
data = pd.merge(data, DEPVAR.reset_index(), left_on='COD_ID', right_on='UNI_TR_D')

In [227]:
data.drop(['COD_ID','PAC_1','UNI_TR_D_x','UNI_TR_D_y','UNTRD_PCON_x','UNTRD_PCON_y','UNI_TR_S',
           'CTMT_x','CTMT_y','SUB_x','SUB_y','CONJ','DESCR','CTMT_PCON'], axis=1, inplace=True)

In [228]:
data.columns

Index(['FAS_CON_P', 'FAS_CON_S', 'FAS_CON_T', 'TEN_LIN_SE', 'CAP_ELO',
       'CAP_CHA', 'CONF', 'POSTO', 'POT_NOM', 'PER_FER', 'PER_TOT',
       'TIP_TRAFO', 'ARE_LOC', 'CLAS_SUB', 'CNAE', 'TIP_CC', 'TEN_FORN',
       'GRU_TAR', 'CAR_INST', 'UCBT_ENE_MED', 'UCBT_ENE_STD', 'UCBT_ENE_MAX',
       'NCON', 'NCON_URB', 'CTMT_ENE_MED', 'CTMT_ENE_STD', 'CTMT_ENE_MAX',
       'PERD_A3a', 'PERD_A4', 'PERD_B', 'PERD_MED', 'PERD_A3a_B', 'PERD_A4_B',
       'PERD_B_A3a', 'PERD_B_A4', 'CTMT_PNTMT_MED', 'CTMT_PNTMT_STD',
       'CTMT_PNTMT_MAX', 'CTMT_PNTBT_MED', 'CTMT_PNTBT_STD', 'CTMT_PNTBT_MAX',
       'POT_NOM_TRS', 'POT_F01', 'POT_F02', 'PER_FER_TRS', 'PER_TOT_TRS',
       'TIP_TRAFO_TRS', 'UNTRS_ENES_MED', 'UNTRS_ENES_STD', 'UNTRS_ENES_MAX',
       'UNTRS_ENET_MED', 'UNTRS_ENET_STD', 'UNTRS_ENET_MAX', 'DEGREE',
       'NEIG_DEGREE', 'BET_CEN', 'CLO_CEN', 'PAGE_RANK', 'DISTANCE_SUB',
       'FCONV_R', 'I_MAX_MED_R', 'I_MAX_MIN_R', 'I_NOM_MED_R', 'I_NOM_MIN_R',
       'REAC_SUB', 'RESI_SUB', 'Z

In [229]:
categorical = data.describe(include ='O').columns

In [230]:
categorical

Index(['FAS_CON_P', 'FAS_CON_S', 'FAS_CON_T', 'CAP_ELO', 'CAP_CHA', 'CONF',
       'POSTO', 'TIP_TRAFO', 'ARE_LOC', 'CLAS_SUB', 'CNAE', 'TIP_CC',
       'TEN_FORN', 'GRU_TAR', 'TIP_TRAFO_TRS'],
      dtype='object')

## Split and get baselines

In [231]:
from sklearn.model_selection import train_test_split

In [232]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [233]:
data_labeled = pd.get_dummies(data.dropna(), columns=categorical, dummy_na= True, drop_first= True)

In [234]:
X_train, X_test, y_train, y_test = train_test_split(data_labeled.drop('FIC',axis=1), data_labeled.FIC, test_size=0.3, random_state=0)

In [235]:
xb_reg = XGBRegressor(learning_rate=0.03,
                     n_estimators=1000,
                     max_depth=3,
                     subsample=0.8,
                     colsample_bytree=1,
                     gamma=1,
                     eval_metric='mae')

# Entreno sobre el logarimo de la variable de salidad
xb_reg.fit(X_train, np.log1p(y_train))

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eval_metric='mae', gamma=1,
       importance_type='gain', learning_rate=0.03, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [236]:
desc_importance = np.sort(xb_reg.feature_importances_)

In [237]:
ft_qty = (desc_importance[::-1] > 0).sum()

In [238]:
indices = np.argsort(xb_reg.feature_importances_)[::-1]
top_features = data_labeled.columns[indices][:ft_qty]
top_features

Index(['Z_MOD_SUB', 'UNTRS_ENES_MED', 'REAC_SUB', 'CLO_CEN', 'RESI_SUB',
       'CTMT_PNTMT_MAX', 'CTMT_ENE_STD', 'CTMT_ENE_MED', 'I_MAX_MED_R',
       'TIP_CC_RES-Tipo2', 'PAGE_RANK', 'CTMT_PNTBT_MAX', 'PERD_MED',
       'PERD_A4_B', 'PER_TOT', 'FAS_CON_S_ABC', 'UCBT_ENE_MED',
       'TIP_TRAFO_nan', 'POT_NOM', 'NCON_URB', 'CTMT_PNTMT_STD',
       'GRU_TAR_B2RU', 'CTMT_PNTBT_MED', 'TIP_CC_RES-Tipo1', 'CTMT_ENE_MAX',
       'CTMT_PNTMT_MED', 'I_MAX_MIN_R', 'Z_ANG_SUB', 'FAS_CON_S_AB', 'PERD_B',
       'BET_CEN', 'FAS_CON_P_BC', 'FCONV_R', 'TIP_CC_RES-Tipo4', 'NCON',
       'DEGREE', 'CTMT_PNTBT_STD', 'PERD_A4', 'DISTANCE_SUB', 'UCBT_ENE_MAX',
       'TIP_CC_IND-Tipo9', 'CAR_INST', 'CLAS_SUB_PP3', 'TIP_CC_RES-Tipo10',
       'FAS_CON_T_nan', 'CAP_ELO_15K', 'FAS_CON_S_B', 'FAS_CON_S_CAN',
       'PER_FER', 'UCBT_ENE_STD', 'ARE_LOC_nan', 'FAS_CON_P_AB',
       'TIP_CC_RES-Tipo3', 'TEN_LIN_SE', 'TIP_CC_nan', 'TIP_CC_RES-Tipo5',
       'CLAS_SUB_RE1', 'CAP_ELO_nan', 'CAP_ELO_3H', 'TIP_CC_RU

## Eliminar baja varianza