# Rob's sandbox

---

# Imports

#### Python modules

In [1]:
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import (
    train_test_split,
    cross_val_score
)
from sklearn.inspection import permutation_importance

# import matplotlib.pyplot as plt
# %matplotlib inline

import pandas as pd

# Ancillary modules

#### Support adjustments

In [2]:
sys.path.append("../..")

#### Modules

In [5]:
%load_ext autoreload
%autoreload 2

from src.utils.params import (
    param_grid,
    max_features,
    n_estimators,
    cv_rounds,
    evaluation_metric,
    ingestion_pickle_loc,
    transformation_pickle_loc,
    fe_pickle_loc
)

from src.pipelines.ingestion import (
    ingest
)

from src.pipelines.transformation import (
    load_ingestion,
#     date_transformation,
#     hour_transformation,
#     categoric_trasformation,
    transform
)

from src.pipelines.feature_engineering import (
    load_transformation,
    feature_generation,
    feature_selection,
    save_fe,
    feature_engineering
)

from proyecto_1 import (
    main
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


---

# Tests

## Transformation

#### Loading ingestion pickle

In [8]:
df_ing = load_ingestion("../../" + ingestion_pickle_loc)

In [9]:
df_ing

Unnamed: 0,fecha_creacion,hora_creacion,dia_semana,codigo_cierre,incidente_c4,tipo_entrada,label
0,23/01/2016,22:35:04,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,BOTÓN DE AUXILIO,0.0
1,23/01/2016,22:50:49,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,BOTÓN DE AUXILIO,0.0
2,24/01/2016,09:40:11,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,LLAMADA DEL 066,1.0
3,24/01/2016,22:40:57,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,LLAMADA DEL 066,1.0
4,24/01/2016,04:25:15,Domingo,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,LLAMADA DEL 066,0.0
...,...,...,...,...,...,...,...
1383133,22/10/2020,16:17:05,Jueves,(A) La unidad de atención a emergencias fue de...,lesionado-atropellado,BOTÓN DE AUXILIO,0.0
1383134,22/10/2020,16:29:13,Jueves,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,BOTÓN DE AUXILIO,0.0
1383135,22/10/2020,12:00:26,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque sin lesionados,LLAMADA DEL 911,0.0
1383136,22/10/2020,12:01:25,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque con lesionados,LLAMADA DEL 911,0.0


In [12]:
transform(("../../" + ingestion_pickle_loc), ("../../" + transformation_pickle_loc))


** Tranformation module successfully executed **



## Feature engineering

#### Loading transformation pickle

In [14]:
df_tra = load_transformation("../../" + transformation_pickle_loc)

In [15]:
df_tra

Unnamed: 0,dia_semana,codigo_cierre,incidente_c4,tipo_entrada,label,anio_inicio,dia_inicio_sin,dia_inicio_cos,mes_inicio_sin,mes_inicio_cos,hora_inicio_sin,hora_inicio_cos
0,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,BOTÓN DE AUXILIO,0.0,2016,-0.999146,0.041325,0.500000,0.866025,-5.000000e-01,0.866025
1,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,BOTÓN DE AUXILIO,0.0,2016,-0.999146,0.041325,0.500000,0.866025,-5.000000e-01,0.866025
2,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,LLAMADA_911_066,1.0,2016,-0.969400,0.245485,0.500000,0.866025,7.071068e-01,-0.707107
3,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,LLAMADA_911_066,1.0,2016,-0.969400,0.245485,0.500000,0.866025,-5.000000e-01,0.866025
4,Domingo,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,LLAMADA_911_066,0.0,2016,-0.969400,0.245485,0.500000,0.866025,8.660254e-01,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...
1383133,Jueves,(A) La unidad de atención a emergencias fue de...,lesionado-atropellado,BOTÓN DE AUXILIO,0.0,2020,-0.986361,-0.164595,-0.866025,0.500000,-8.660254e-01,-0.500000
1383134,Jueves,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,BOTÓN DE AUXILIO,0.0,2020,-0.986361,-0.164595,-0.866025,0.500000,-8.660254e-01,-0.500000
1383135,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque sin lesionados,LLAMADA_911_066,0.0,2020,-0.986361,-0.164595,-0.866025,0.500000,1.224647e-16,-1.000000
1383136,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque con lesionados,LLAMADA_911_066,0.0,2020,-0.986361,-0.164595,-0.866025,0.500000,1.224647e-16,-1.000000


##### Exploring transformation result

#### Processing data for model

In [None]:
# Build the model matrix from the transformation output. The original call
# passed `df`, a name that is never defined in this notebook and therefore
# only worked with leftover kernel state; `df_tra` is the frame returned by
# `load_transformation`.
df_features_prc, df_labels, df_features_prc_cols = feature_generation(df_tra)

In [None]:
print(len(df_features_prc_cols))
df_features_prc_cols

In [None]:
df_features_prc.shape

In [None]:
print(len(enc_cat_features))
enc_cat_features

In [None]:
df_features_prc.shape

In [None]:
len(df_features_prc_cols)

#### Training, executing and evaluating model

In [None]:
model = RandomForestClassifier(oob_score=True, n_jobs=-1)

In [None]:
## Splitting data in train and test
X_train, X_test, y_train, y_test = train_test_split(df_features_prc, df_labels, test_size=0.3)

In [None]:
# Hyper-parameter search over `param_grid`, scored with `evaluation_metric`.
# NOTE(review): `cv_rounds` is imported from params but the number of folds is
# hard-coded to 2 here — confirm which one is intended.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=2,
    scoring=evaluation_metric,
    return_train_score=True,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

best_estimator = grid_search.best_estimator_

# `n_features_` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# replacement is `n_features_in_`. Fall back to the old attribute so the cell
# keeps working on older installations.
n_features = getattr(best_estimator, "n_features_in_", None)
if n_features is None:
    n_features = best_estimator.n_features_

print(grid_search.best_params_)
print(best_estimator)

print("\n++ Grid search results:\n")
print("    ++++ Best estimator: {}".format(best_estimator))
print("    ++++ Number of features in best estimator: {} \n".format(n_features))
print("    ++++ Best estimator oob score: {}\n".format(best_estimator.oob_score_))

#### Identifying most important variables

In [None]:
print(len(grid_search.best_estimator_.feature_importances_))
grid_search.best_estimator_.feature_importances_

In [None]:
# Pair each feature name with the importance reported by the best estimator
# and rank them. The original discarded the result of `sort_values` (so the
# table stayed unsorted) and wrapped `display` in `print`, which emitted a
# stray "None".
feature_importance = (
    pd.DataFrame(
        {
            "Importance": grid_search.best_estimator_.feature_importances_,
            "Feature": df_features_prc.columns,
        }
    )
    .sort_values(by="Importance", ascending=False)
)

# Bare last expression: the notebook renders the sorted frame.
feature_importance

## Modeling (magic loop)

# *Notes*

## Finding labels of processed matrix

#### Manually counting the number of labels that should be in the model.

In [70]:
# Collect, column by column, every distinct value of the categorical columns
# of `df_tra` into one flat list (order: day of week, incident type, entry
# type, start year).
cat_list = []
for cat_col in ("dia_semana", "incidente_c4", "tipo_entrada", "anio_inicio"):
    cat_list.extend(df_tra[cat_col].unique())

In [71]:
print(len(cat_list))
cat_list

49


['Sábado',
 'Domingo',
 'Lunes',
 'Martes',
 'Miércoles',
 'Jueves',
 'Viernes',
 'accidente-choque sin lesionados',
 'accidente-choque con lesionados',
 'lesionado-accidente automovilístico',
 'accidente-volcadura',
 'lesionado-atropellado',
 'accidente-otros',
 'accidente-persona atrapada / desbarrancada',
 'cadáver-accidente automovilístico',
 'cadáver-atropellado',
 'accidente-motociclista',
 'accidente-vehiculo desbarrancado',
 'accidente-choque con prensados',
 'accidente-vehiculo atrapado',
 'accidente-ciclista',
 'accidente-monopatín',
 'sismo-choque con lesionados',
 'accidente-vehículo atrapado-varado',
 'detención ciudadana-atropellado',
 'accidente-ferroviario',
 'detención ciudadana-accidente automovilístico',
 'sismo-choque sin lesionados',
 'sismo-persona atropellada',
 'Detención ciudadana-accidente automovilístico',
 'sismo-choque con prensados',
 'mi ciudad-calle-incidente de tránsito',
 'mi ciudad-taxi-incidente de tránsito',
 'BOTÓN DE AUXILIO',
 'LLAMADA_911_066',


#### Example code from StackOverflow to get labels.

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Toy frame mixing numeric, categorical and free-text columns plus a target `y`.
train = pd.DataFrame({'age': [23,12, 12, np.nan],
                      'Gender': ['M','F', np.nan, 'F'],
                      'income': ['high','low','low','medium'],
                      'sales': [10000, 100020, 110000, 100],
                      'foo' : [1,0,0,1],
                      'text': ['I will test this',
                               'need to write more sentence',
                               'want to keep it simple',
                               'hope you got that these sentences are junk'],
                      'y': [0,1,1,1]})
numeric_columns = ['age']
cat_columns     = ['Gender','income']

# One sub-pipeline per column family.
numeric_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
cat_pipeline     = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())
text_pipeline = make_pipeline(CountVectorizer(), SelectKBest(k=5))

transformers = [
('num', numeric_pipeline, numeric_columns),
('cat', cat_pipeline, cat_columns),
('text', text_pipeline, 'text'),
('simple_transformer', MinMaxScaler(), ['sales']),
]

# Columns not listed above (here `foo`) are passed through untouched.
combined_pipe = ColumnTransformer(transformers, remainder='passthrough')

# `DataFrame.drop('y', 1)` relied on a positional `axis`, which pandas
# deprecated in 1.3 and rejects from 2.0; name the axis via `columns=`.
transformed_data = combined_pipe.fit_transform(train.drop(columns='y'), train['y'])

In [None]:
train

In [None]:
transformed_data.shape

In [None]:
def get_feature_out(estimator, feature_in):
    """Return the output feature names produced by one fitted transformer step.

    Parameters
    ----------
    estimator : object
        A fitted transformer (or any other object, e.g. a string placeholder).
    feature_in : sequence or str
        Name(s) of the features fed into ``estimator``.

    Returns
    -------
    sequence
        * vectorizers: their vocabulary, each term prefixed with ``vec_``;
        * selectors: the entries of ``feature_in`` kept by ``get_support()``;
        * other estimators exposing a feature-name accessor: whatever that
          accessor returns for ``feature_in``;
        * anything else: ``feature_in`` unchanged.
    """
    # scikit-learn 1.0 introduced `get_feature_names_out` and 1.2 removed
    # `get_feature_names`; probing only the old name made encoders fall
    # through to the last branch and return the *input* names. Prefer the
    # new accessor and keep the legacy one for older installations.
    names_fn = getattr(estimator, 'get_feature_names_out', None)
    if names_fn is None:
        names_fn = getattr(estimator, 'get_feature_names', None)

    if names_fn is not None and isinstance(estimator, _VectorizerMixin):
        # handling all vectorizers
        return [f'vec_{f}' for f in names_fn()]

    if isinstance(estimator, SelectorMixin):
        # Keep only the incoming names the selector retained.
        return np.array(feature_in)[estimator.get_support()]

    if names_fn is not None:
        return names_fn(feature_in)

    return feature_in


def get_ct_feature_names(ct):
    """Collect the output feature names of a fitted ``ColumnTransformer``.

    Walks ``ct.transformers_`` in order. Plain estimators are resolved with
    ``get_feature_out``; for a ``Pipeline`` the names are threaded through
    every step. Transformers given as ``'drop'`` contribute nothing. For a
    ``'passthrough'`` remainder the names are looked up among the input
    column names stored on ``ct``.

    Parameters
    ----------
    ct : ColumnTransformer
        A fitted column transformer.
        NOTE(review): the remainder branch presumably requires ``ct`` to have
        been fitted on a DataFrame so that input column names exist — confirm.

    Returns
    -------
    list
        Output feature names, in transformer order.
    """
    output_features = []

    for name, estimator, features in ct.transformers_:
        if name != 'remainder':
            if isinstance(estimator, str) and estimator == 'drop':
                # Dropped columns produce no output; the original still
                # appended their names.
                continue
            if isinstance(estimator, Pipeline):
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif isinstance(estimator, str) and estimator == 'passthrough':
            # Prefer the public `feature_names_in_`; the private
            # `_feature_names_in` is kept only as a fallback for older
            # scikit-learn releases.
            names_in = getattr(ct, 'feature_names_in_', None)
            if names_in is None:
                names_in = ct._feature_names_in
            output_features.extend(names_in[features])

    return output_features




pd.DataFrame(transformed_data, 
             columns=get_ct_feature_names(combined_pipe))

In [None]:
train

## Updating dictionary

In [46]:
x_dict = {}

In [47]:
var = "var_1"

In [49]:
def create_datadict_entry(var, relevant=True, data_type="not_specified", model_relevant="False"):
    """Register (or overwrite) the entry for ``var`` in the global ``x_dict``.

    Parameters
    ----------
    var : hashable
        Key under which the entry is stored.
    relevant : optional
        Stored under ``"relevant"`` (defaults to ``True``).
    data_type : optional
        Stored under ``"data_type"`` (defaults to ``"not_specified"``).
    model_relevant : optional
        Stored under ``"model_relevant"``. Note that the default is the
        *string* ``"False"``, not the boolean.

    Returns
    -------
    None
        The function only mutates ``x_dict``.
    """
    x_dict[var] = {
        "relevant": relevant,
        "data_type": data_type,
        "model_relevant": model_relevant,
    }

In [50]:
create_datadict_entry(var)

In [51]:
x_dict

{'var_1': {'relevant': True,
  'data_type': 'not_specified',
  'model_relevant': 'False'}}

---
---