# Rob's sandbox

---

# Imports

#### Python modules

In [1]:
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import (
    train_test_split,
    cross_val_score
)
from sklearn.inspection import permutation_importance

# import matplotlib.pyplot as plt
# %matplotlib inline

import pandas as pd

# Ancillary modules

#### Support adjustments

In [2]:
# Make the directory two levels above the working directory importable;
# presumably this is the project root that holds `src/` and `proyecto_1`.
# The membership check keeps the cell idempotent: re-running it no longer
# piles up duplicate entries in sys.path.
if "../.." not in sys.path:
    sys.path.append("../..")

#### Modules

In [3]:
%load_ext autoreload
%autoreload 2

from src.utils.params import (
    param_grid,
    max_features,
    n_estimators,
    cv_rounds,
    evaluation_metric,
    transformation_pickle_loc,
    fe_pickle_loc
)

from src.pipelines.ingestion import (
    ingest
)

from src.pipelines.transformation import (
    transform
)

from src.pipelines.feature_engineering import (
    load_transformation,
    feature_generation,
    feature_selection,
    save_fe,
    feature_engineering
)

from proyecto_1 import (
    main
)

---

# Tests

## Feature engineering

#### Loading transformation pickle

In [4]:
# Build the pickle path relative to this notebook (two directories up) and
# hand it to load_transformation; the result is kept as `df`.
transformation_pickle_path = "../../" + transformation_pickle_loc
df = load_transformation(transformation_pickle_path)

In [5]:
df

Unnamed: 0,fecha_creacion,hora_creacion,dia_semana,codigo_cierre,incidente_c4,tipo_entrada,label,dia_inicio,mes_inicio,anio_inicio,hora_inicio,min_inicio
0,23/01/2016,22:35:04,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,BOTÓN DE AUXILIO,0.0,23,01,2016,22.0,35
1,23/01/2016,22:50:49,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,BOTÓN DE AUXILIO,0.0,23,01,2016,22.0,50
2,24/01/2016,09:40:11,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,LLAMADA_911_066,1.0,24,01,2016,9.0,40
3,24/01/2016,22:40:57,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,LLAMADA_911_066,1.0,24,01,2016,22.0,40
4,24/01/2016,04:25:15,Domingo,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,LLAMADA_911_066,0.0,24,01,2016,4.0,25
...,...,...,...,...,...,...,...,...,...,...,...,...
1383133,22/10/2020,16:17:05,Jueves,(A) La unidad de atención a emergencias fue de...,lesionado-atropellado,BOTÓN DE AUXILIO,0.0,22,10,2020,16.0,17
1383134,22/10/2020,16:29:13,Jueves,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,BOTÓN DE AUXILIO,0.0,22,10,2020,16.0,29
1383135,22/10/2020,12:00:26,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque sin lesionados,LLAMADA_911_066,0.0,22,10,2020,12.0,00
1383136,22/10/2020,12:01:25,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque con lesionados,LLAMADA_911_066,0.0,22,10,2020,12.0,01


#### Processing data for model

In [37]:
# feature_generation yields three values; judging by the names they are
# presumably the processed feature matrix, the labels and the processed
# column names (verify against src.pipelines.feature_engineering).
feature_generation_output = feature_generation(df)
df_features_prc, df_labels, df_features_prc_cols = feature_generation_output


++ Complete list of features (9) that will be fed to the model:
    1. hora_creacion
    2. dia_semana
    3. incidente_c4
    4. tipo_entrada
    5. dia_inicio
    6. mes_inicio
    7. anio_inicio
    8. hora_inicio
    9. min_inicio

++ List of categorical features (2) that will be processed through the pipeline are:
    1. incidente_c4
    2. tipo_entrada
Index(['hora_creacion', 'dia_semana', 'incidente_c4', 'tipo_entrada',
       'dia_inicio', 'mes_inicio', 'anio_inicio', 'hora_inicio', 'min_inicio'],
      dtype='object')


In [38]:
df_features_prc_cols

['hora_creacion',
 'dia_semana',
 'Detención ciudadana-accidente automovilístico',
 'accidente-choque con lesionados',
 'accidente-choque con prensados',
 'accidente-choque sin lesionados',
 'accidente-ciclista',
 'accidente-ferroviario',
 'accidente-monopatín',
 'accidente-motociclista',
 'accidente-otros',
 'accidente-persona atrapada / desbarrancada',
 'accidente-vehiculo atrapado',
 'accidente-vehiculo desbarrancado',
 'accidente-vehículo atrapado-varado',
 'accidente-volcadura',
 'cadáver-accidente automovilístico',
 'cadáver-atropellado',
 'detención ciudadana-accidente automovilístico',
 'detención ciudadana-atropellado',
 'lesionado-accidente automovilístico',
 'lesionado-atropellado',
 'mi ciudad-calle-incidente de tránsito',
 'mi ciudad-taxi-incidente de tránsito',
 'sismo-choque con lesionados',
 'sismo-choque con prensados',
 'sismo-choque sin lesionados',
 'sismo-persona atropellada',
 'APLICATIVOS',
 'BOTÓN DE AUXILIO',
 'CÁMARA',
 'LLAMADA APP911',
 'LLAMADA_911_066',
 '

In [41]:
df_features_prc.shape

(1383138, 34)

In [15]:
# NOTE(review): `enc_cat_features` is not assigned in any cell visible in this
# notebook, and this cell's execution counter is out of sequence, so it most
# likely relies on leftover kernel state and would raise NameError after
# "Restart Kernel -> Run All". Define it explicitly or drop this cell.
print(len(enc_cat_features))
enc_cat_features

34


array(['x0_Detención ciudadana-accidente automovilístico',
       'x0_accidente-choque con lesionados',
       'x0_accidente-choque con prensados',
       'x0_accidente-choque sin lesionados', 'x0_accidente-ciclista',
       'x0_accidente-ferroviario', 'x0_accidente-monopatín',
       'x0_accidente-motociclista', 'x0_accidente-otros',
       'x0_accidente-persona atrapada / desbarrancada',
       'x0_accidente-vehiculo atrapado',
       'x0_accidente-vehiculo desbarrancado',
       'x0_accidente-vehículo atrapado-varado', 'x0_accidente-volcadura',
       'x0_cadáver-accidente automovilístico', 'x0_cadáver-atropellado',
       'x0_detención ciudadana-accidente automovilístico',
       'x0_detención ciudadana-atropellado',
       'x0_lesionado-accidente automovilístico',
       'x0_lesionado-atropellado',
       'x0_mi ciudad-calle-incidente de tránsito',
       'x0_mi ciudad-taxi-incidente de tránsito',
       'x0_sismo-choque con lesionados', 'x0_sismo-choque con prensados',
       'x0

In [13]:
df_features_prc.shape

(1383138, 34)

In [None]:
len(df_features_prc_cols)

#### Training, executing and evaluating model

In [None]:
# Base random forest handed to the hyper-parameter search.
# oob_score=True makes the fitted forest expose `oob_score_`;
# n_jobs=-1 uses every available core.
# random_state pins tree construction so repeated runs give the same forest
# (42 is an arbitrary seed; change it in one place if needed).
model = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)

In [None]:
## Splitting data in train and test
# 30% of the rows are held out for testing. random_state fixes the shuffle so
# the split (and everything computed from it) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_features_prc,
    df_labels,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Exhaustive search over `param_grid`, scored with `evaluation_metric`.
# NOTE(review): `cv` is hard-coded to 2 although `cv_rounds` is imported from
# the params module - confirm which value is intended.
grid_search = GridSearchCV(model,
                           param_grid,
                           cv=2,
                           scoring=evaluation_metric,
                           return_train_score=True,
                           n_jobs=-1
                           )

grid_search.fit(X_train, y_train)

best_estimator = grid_search.best_estimator_

# `n_features_` was deprecated and later removed from scikit-learn estimators
# in favour of `n_features_in_`; read whichever one this installation provides.
best_n_features = getattr(best_estimator, "n_features_in_", None)
if best_n_features is None:
    best_n_features = best_estimator.n_features_

print(grid_search.best_params_)
print(best_estimator)

print("\n++ Grid search results:\n")
print("    ++++ Best estimator: {}".format(best_estimator))
print("    ++++ Number of features in best estimator: {} \n".format(best_n_features))
print("    ++++ Best estimator oob score: {}\n".format(best_estimator.oob_score_))

#### Identifying most important variables

In [None]:
print(len(grid_search.best_estimator_.feature_importances_))
grid_search.best_estimator_.feature_importances_

In [None]:
# Pair every feature with its importance in the best estimator and rank them.
# sort_values returns a new frame, so its result has to be kept (the previous
# version discarded it and displayed the unsorted table).
# NOTE(review): this assumes `df_features_prc` is a DataFrame exposing
# `.columns`; if it is an array, use `df_features_prc_cols` instead - confirm.
feature_importance = pd.DataFrame(
    {
        "Importance": grid_search.best_estimator_.feature_importances_,
        "Feature": df_features_prc.columns
    }
).sort_values(by="Importance", ascending=False)

# display() renders the table and returns None, so it must not be wrapped in
# print() (that only added a stray "None" line to the output).
display(feature_importance)

# *Notes*

In [None]:
# Self-contained toy example: a ColumnTransformer that mixes numeric,
# categorical, text and scaled columns, used further down to recover the
# output feature names.
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Four-row frame with missing values in both a numeric and a categorical column.
train = pd.DataFrame({'age': [23,12, 12, np.nan],
                      'Gender': ['M','F', np.nan, 'F'],
                      'income': ['high','low','low','medium'],
                      'sales': [10000, 100020, 110000, 100],
                      'foo' : [1,0,0,1],
                      'text': ['I will test this',
                               'need to write more sentence',
                               'want to keep it simple',
                               'hope you got that these sentences are junk'],
                      'y': [0,1,1,1]})
numeric_columns = ['age']
cat_columns     = ['Gender','income']

numeric_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
cat_pipeline     = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())
text_pipeline = make_pipeline(CountVectorizer(), SelectKBest(k=5))

# 'text' is passed as a plain string (not a list) so the vectorizer receives a
# 1-D column of documents.
transformers = [
    ('num', numeric_pipeline, numeric_columns),
    ('cat', cat_pipeline, cat_columns),
    ('text', text_pipeline, 'text'),
    ('simple_transformer', MinMaxScaler(), ['sales']),
]

# remainder='passthrough' keeps the columns not listed above ('foo') untouched.
combined_pipe = ColumnTransformer(transformers, remainder='passthrough')

# Drop the target by keyword: passing the axis positionally (`drop('y', 1)`)
# is deprecated in pandas 1.x and raises TypeError in pandas 2.x.
transformed_data = combined_pipe.fit_transform(train.drop(columns='y'), train['y'])

In [None]:
train

In [None]:
transformed_data.shape

In [None]:
def get_feature_out(estimator, feature_in):
    """Return the feature names produced by one fitted transformer step.

    Parameters
    ----------
    estimator : fitted transformer (or any other object)
        The step whose output names are wanted.
    feature_in : sequence of str
        Names of the features fed into ``estimator``.

    Returns
    -------
    sequence of str
        * vectorizers: their vocabulary terms, each prefixed with ``vec_``;
        * feature selectors: ``feature_in`` filtered by ``get_support()``;
        * estimators exposing ``get_feature_names_out`` (current scikit-learn)
          or ``get_feature_names`` (legacy releases): whatever that method
          returns for ``feature_in``;
        * anything else: ``feature_in`` unchanged.
    """
    # The legacy `get_feature_names` was removed from scikit-learn in favour of
    # `get_feature_names_out`; use the new one when present so that encoders
    # still report their expanded column names on current releases.
    name_getter = (getattr(estimator, 'get_feature_names_out', None)
                   or getattr(estimator, 'get_feature_names', None))

    if name_getter is not None and isinstance(estimator, _VectorizerMixin):
        # handling all vectorizers: their names come from the fitted
        # vocabulary, not from the input column name
        return [f'vec_{f}' for f in name_getter()]

    if isinstance(estimator, SelectorMixin):
        # Selectors keep a subset of their inputs; apply the support mask.
        return np.array(feature_in)[estimator.get_support()]

    if name_getter is not None:
        return name_getter(feature_in)

    return feature_in


def get_ct_feature_names(ct):
    """Collect the output feature names of a fitted ColumnTransformer.

    Handles plain estimators and Pipelines inside the ColumnTransformer by
    threading the column names through ``get_feature_out`` step by step.
    Columns handled by a ``'drop'`` transformer are skipped, and a
    ``remainder='passthrough'`` contributes the untouched input column names
    (which requires that the transformer was fitted on data with column names).

    Parameters
    ----------
    ct : fitted ColumnTransformer
        Must expose ``transformers_``.

    Returns
    -------
    list
        Output feature names in the order given by ``ct.transformers_``.

    Raises
    ------
    AttributeError
        If the remainder is passed through but ``ct`` recorded no input
        column names.
    """
    # Public attribute on current scikit-learn; older releases only stored the
    # private `_feature_names_in`.
    input_names = getattr(ct, 'feature_names_in_', None)
    if input_names is None:
        input_names = getattr(ct, '_feature_names_in', None)

    output_features = []

    for name, estimator, features in ct.transformers_:
        if isinstance(estimator, str) and estimator == 'drop':
            # Dropped columns never reach the output matrix.
            continue

        if name != 'remainder':
            if isinstance(estimator, Pipeline):
                # Feed the names through each step in turn.
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif estimator == 'passthrough':
            if input_names is None:
                raise AttributeError(
                    "ColumnTransformer has no recorded input feature names; "
                    "fit it on a DataFrame to resolve the passthrough remainder."
                )
            # `features` indexes the remaining input columns.
            output_features.extend(input_names[features])

    return output_features




# Show the transformed matrix as a DataFrame whose columns are the feature
# names recovered from the fitted ColumnTransformer.
# NOTE(review): ColumnTransformer may return a scipy sparse matrix depending on
# its sparse_threshold; if it does, densify it first (e.g. `.toarray()`) - confirm.
pd.DataFrame(transformed_data, 
             columns=get_ct_feature_names(combined_pipe))

In [None]:
train

---
---