# Rob's sandbox

---

# Imports

#### Python modules

In [1]:
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import (
    train_test_split,
    cross_val_score
)
from sklearn.inspection import permutation_importance

# import matplotlib.pyplot as plt
# %matplotlib inline

import pandas as pd

# Ancillary modules

#### Support adjustments

In [2]:
sys.path.append("../..")

#### Modules

In [3]:
%load_ext autoreload
%autoreload 2

from src.utils.utils import (
    json_dump_dict
)

from src.utils.params import (
#     param_grid,
#     max_features,
#     n_estimators,
    cv_rounds,
    evaluation_metric,
    feature_importance_theshold,
    tag_non_relevant_cats,
    ingestion_pickle_loc,
    transformation_pickle_loc,
    fe_pickle_loc,
)

from src.pipelines.ingestion import (
    ingest
)

from src.pipelines.transformation import (
    load_ingestion,
#     date_transformation,
#     hour_transformation,
#     categoric_trasformation,
    transform
)

from src.pipelines.feature_engineering import (
    load_transformation,
    feature_generation,
    feature_selection,
    save_fe,
    feature_engineering
)

---

# Tests

## Transformation

#### Loading ingestion pickle

In [None]:
df_ing = load_ingestion("../../" + ingestion_pickle_loc)

In [None]:
df_ing

In [None]:
transform(("../../" + ingestion_pickle_loc), ("../../" + transformation_pickle_loc))

## Feature engineering

#### Loading transformation pickles

In [4]:
df_tra = load_transformation("../../" + transformation_pickle_loc)
df_tra

Unnamed: 0,fecha_creacion,hora_creacion,dia_semana,codigo_cierre,incidente_c4,...,dia_inicio_cos,mes_inicio_sin,mes_inicio_cos,hora_inicio_sin,hora_inicio_cos
0,23/01/2016,22:35:04,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.041325,0.500000,0.866025,-5.000000e-01,0.866025
1,23/01/2016,22:50:49,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,...,0.041325,0.500000,0.866025,-5.000000e-01,0.866025
2,24/01/2016,09:40:11,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.245485,0.500000,0.866025,7.071068e-01,-0.707107
3,24/01/2016,22:40:57,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.245485,0.500000,0.866025,-5.000000e-01,0.866025
4,24/01/2016,04:25:15,Domingo,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.245485,0.500000,0.866025,8.660254e-01,0.500000
...,...,...,...,...,...,...,...,...,...,...,...
1383133,22/10/2020,16:17:05,Jueves,(A) La unidad de atención a emergencias fue de...,lesionado-atropellado,...,-0.164595,-0.866025,0.500000,-8.660254e-01,-0.500000
1383134,22/10/2020,16:29:13,Jueves,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,...,-0.164595,-0.866025,0.500000,-8.660254e-01,-0.500000
1383135,22/10/2020,12:00:26,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque sin lesionados,...,-0.164595,-0.866025,0.500000,1.224647e-16,-1.000000
1383136,22/10/2020,12:01:25,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque con lesionados,...,-0.164595,-0.866025,0.500000,1.224647e-16,-1.000000


In [5]:
feature_importance = load_transformation("../../" + "outputs/fe_df_feature_importance.pkl")
feature_importance

Unnamed: 0,Importance,Feature
10,0.318598,accidente-choque sin lesionados
37,0.190152,LLAMADA_911_066
34,0.110243,BOTÓN DE AUXILIO
38,0.101715,RADIO
8,0.100429,accidente-choque con lesionados
26,0.069934,lesionado-atropellado
20,0.022249,accidente-volcadura
42,0.015421,2014
53,0.014279,hora_inicio_sin
14,0.011829,accidente-motociclista


In [6]:
ohe_dict = load_transformation("../../" + "outputs/fe_ohe_dict.pkl")
# ohe_dict

##### Exploring transformation result

#### Processing data for model

In [None]:
# Feed the transformation frame loaded earlier in this section: the bare name
# `df` is never assigned in this notebook, so the previous call raised
# NameError on a fresh kernel.
df_features_prc, df_labels, df_features_prc_cols = feature_generation(df_tra)

In [None]:
print(len(df_features_prc_cols))
df_features_prc_cols

In [None]:
df_features_prc.shape

In [None]:
# NOTE(review): `enc_cat_features` is not assigned in any visible cell of this
# notebook, so this cell raises NameError on a fresh kernel — confirm where
# the name is supposed to come from before relying on this check.
print(len(enc_cat_features))
enc_cat_features

In [None]:
df_features_prc.shape

In [None]:
len(df_features_prc_cols)

#### Training, executing and evaluating model

In [None]:
# Pin the seed so the fitted forest (and therefore its OOB score) can be
# reproduced from one run of the notebook to the next.
model = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)

In [None]:
## Splitting data in train and test
# random_state pins the shuffle so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_features_prc,
    df_labels,
    test_size=0.3,
    random_state=42,
)

In [None]:
# NOTE(review): `param_grid` is used below, but its import from
# src.utils.params is commented out in the imports cell, so this cell raises
# NameError until that import is restored or a grid is defined here.
# NOTE(review): `cv_rounds` is imported but unused while cv is hard-coded to 2
# — confirm which one is intended.
grid_search = GridSearchCV(model,
                           param_grid,
                           cv=2,
                           scoring=evaluation_metric,
                           return_train_score=True,
                           n_jobs=-1
                           )

grid_search.fit(X_train, y_train)

best_estimator = grid_search.best_estimator_

# `n_features_` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `n_features_in_`; read whichever one the installed version has.
n_features = getattr(best_estimator, "n_features_in_", None)
if n_features is None:
    n_features = best_estimator.n_features_

print(grid_search.best_params_)
print(best_estimator)

print("\n++ Grid search results:\n")
print("    ++++ Best estimator: {}".format(best_estimator))
print("    ++++ Number of features in best estimator: {} \n".format(n_features))
print("    ++++ Best estimator oob score: {}\n".format(best_estimator.oob_score_))

#### Identifying most important variables

##### Creating dataframe with ranks

##### Obtaining features cleaning dict from ranked features

In [7]:
feature_importance.reset_index(inplace=True, drop=True)
feature_importance

Unnamed: 0,Importance,Feature
0,0.318598,accidente-choque sin lesionados
1,0.190152,LLAMADA_911_066
2,0.110243,BOTÓN DE AUXILIO
3,0.101715,RADIO
4,0.100429,accidente-choque con lesionados
5,0.069934,lesionado-atropellado
6,0.022249,accidente-volcadura
7,0.015421,2014
8,0.014279,hora_inicio_sin
9,0.011829,accidente-motociclista


In [8]:
# Flag every feature whose importance reaches the configured threshold.
feature_importance["Important"] = (
    feature_importance["Importance"] >= feature_importance_theshold
)

In [9]:
def find_mother(row, ohe_dict):
    """
    Map a feature name back to the key of ``ohe_dict`` that contains it.

    :param row: feature name to look up.
    :param ohe_dict: mapping of key -> collection of feature names
        (presumably parent column -> its one-hot encoded categories; confirm
        against the code that builds the dict).
    :return: the first key whose collection contains ``row``, or ``row``
        itself when no key matches.
    """
    parents = (parent for parent, children in ohe_dict.items() if row in children)

    # Fall back to the name itself when no parent claims it.
    return next(parents, row)

In [10]:
feature_importance["Mother_feature"] = feature_importance["Feature"].apply(lambda x: find_mother(x, ohe_dict))

In [11]:
feature_importance

Unnamed: 0,Importance,Feature,Important,Mother_feature
0,0.318598,accidente-choque sin lesionados,True,incidente_c4
1,0.190152,LLAMADA_911_066,True,tipo_entrada
2,0.110243,BOTÓN DE AUXILIO,True,tipo_entrada
3,0.101715,RADIO,True,tipo_entrada
4,0.100429,accidente-choque con lesionados,True,incidente_c4
5,0.069934,lesionado-atropellado,True,incidente_c4
6,0.022249,accidente-volcadura,True,incidente_c4
7,0.015421,2014,True,anio_inicio
8,0.014279,hora_inicio_sin,True,hora_inicio_sin
9,0.011829,accidente-motociclista,True,incidente_c4


In [12]:
def feature_cleaning_dict(feature_importance, ohe_dict):
    """
    Build the cleaning dictionary for every important mother feature.

    :param feature_importance: DataFrame with the columns ``"Feature"``,
        ``"Important"`` and ``"Mother_feature"``.
    :param ohe_dict: mapping whose keys are the features treated as categoric.
    :return: dict keyed by each mother feature that has at least one important
        row. Keys found in ``ohe_dict`` map to
        ``{"data_type": "categoric", "important_categories": [...]}`` listing
        their important ``"Feature"`` values; all other keys map to
        ``{"data_type": "non-categoric"}``.
    """
    is_important = feature_importance["Important"] == True
    mothers = feature_importance["Mother_feature"]

    cleaning = {}

    for mother in list(mothers.loc[is_important].unique()):

        if mother not in ohe_dict:
            cleaning[mother] = {
                "data_type": "non-categoric",
            }
            continue

        # Only the important rows belonging to this mother feature are kept.
        selected = is_important & (mothers == mother)
        cleaning[mother] = {
            "data_type": "categoric",
            "important_categories": list(feature_importance.loc[selected, "Feature"])
        }

    return cleaning

In [13]:
fe_cln_dict = feature_cleaning_dict(feature_importance, ohe_dict)
fe_cln_dict

{'incidente_c4': {'data_type': 'categoric',
  'important_categories': ['accidente-choque sin lesionados',
   'accidente-choque con lesionados',
   'lesionado-atropellado',
   'accidente-volcadura',
   'accidente-motociclista']},
 'tipo_entrada': {'data_type': 'categoric',
  'important_categories': ['LLAMADA_911_066',
   'BOTÓN DE AUXILIO',
   'RADIO',
   'REDES']},
 'anio_inicio': {'data_type': 'categoric', 'important_categories': ['2014']},
 'hora_inicio_sin': {'data_type': 'non-categoric'},
 'hora_inicio_cos': {'data_type': 'non-categoric'}}

##### Cleaning transformation dataframe based on features cleaning dict

In [14]:
df_tra

Unnamed: 0,fecha_creacion,hora_creacion,dia_semana,codigo_cierre,incidente_c4,...,dia_inicio_cos,mes_inicio_sin,mes_inicio_cos,hora_inicio_sin,hora_inicio_cos
0,23/01/2016,22:35:04,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.041325,0.500000,0.866025,-5.000000e-01,0.866025
1,23/01/2016,22:50:49,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,...,0.041325,0.500000,0.866025,-5.000000e-01,0.866025
2,24/01/2016,09:40:11,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.245485,0.500000,0.866025,7.071068e-01,-0.707107
3,24/01/2016,22:40:57,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.245485,0.500000,0.866025,-5.000000e-01,0.866025
4,24/01/2016,04:25:15,Domingo,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.245485,0.500000,0.866025,8.660254e-01,0.500000
...,...,...,...,...,...,...,...,...,...,...,...
1383133,22/10/2020,16:17:05,Jueves,(A) La unidad de atención a emergencias fue de...,lesionado-atropellado,...,-0.164595,-0.866025,0.500000,-8.660254e-01,-0.500000
1383134,22/10/2020,16:29:13,Jueves,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,...,-0.164595,-0.866025,0.500000,-8.660254e-01,-0.500000
1383135,22/10/2020,12:00:26,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque sin lesionados,...,-0.164595,-0.866025,0.500000,1.224647e-16,-1.000000
1383136,22/10/2020,12:01:25,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque con lesionados,...,-0.164595,-0.866025,0.500000,1.224647e-16,-1.000000


In [15]:
nr_f = [col for col in df_tra.columns if col not in fe_cln_dict]
df_tra.drop(nr_f, axis=1, inplace=True)

In [16]:
df_tra

Unnamed: 0,incidente_c4,tipo_entrada,anio_inicio,hora_inicio_sin,hora_inicio_cos
0,accidente-choque sin lesionados,BOTÓN DE AUXILIO,2016,-5.000000e-01,0.866025
1,accidente-choque con lesionados,BOTÓN DE AUXILIO,2016,-5.000000e-01,0.866025
2,accidente-choque sin lesionados,LLAMADA_911_066,2016,7.071068e-01,-0.707107
3,accidente-choque sin lesionados,LLAMADA_911_066,2016,-5.000000e-01,0.866025
4,accidente-choque sin lesionados,LLAMADA_911_066,2016,8.660254e-01,0.500000
...,...,...,...,...,...
1383133,lesionado-atropellado,BOTÓN DE AUXILIO,2020,-8.660254e-01,-0.500000
1383134,accidente-choque con lesionados,BOTÓN DE AUXILIO,2020,-8.660254e-01,-0.500000
1383135,accidente-choque sin lesionados,LLAMADA_911_066,2020,1.224647e-16,-1.000000
1383136,accidente-choque con lesionados,LLAMADA_911_066,2020,1.224647e-16,-1.000000


In [17]:
# For each categoric entry of the cleaning dict, overwrite every value that is
# not listed as an important category with the shared non-relevant tag.
for cat_key, spec in fe_cln_dict.items():
    if spec["data_type"] != "categoric":
        continue
    m1 = ~df_tra[cat_key].isin(spec["important_categories"])
    df_tra.loc[m1, cat_key] = tag_non_relevant_cats

In [18]:
df_tra

Unnamed: 0,incidente_c4,tipo_entrada,anio_inicio,hora_inicio_sin,hora_inicio_cos
0,accidente-choque sin lesionados,BOTÓN DE AUXILIO,other_nr_categories,-5.000000e-01,0.866025
1,accidente-choque con lesionados,BOTÓN DE AUXILIO,other_nr_categories,-5.000000e-01,0.866025
2,accidente-choque sin lesionados,LLAMADA_911_066,other_nr_categories,7.071068e-01,-0.707107
3,accidente-choque sin lesionados,LLAMADA_911_066,other_nr_categories,-5.000000e-01,0.866025
4,accidente-choque sin lesionados,LLAMADA_911_066,other_nr_categories,8.660254e-01,0.500000
...,...,...,...,...,...
1383133,lesionado-atropellado,BOTÓN DE AUXILIO,other_nr_categories,-8.660254e-01,-0.500000
1383134,accidente-choque con lesionados,BOTÓN DE AUXILIO,other_nr_categories,-8.660254e-01,-0.500000
1383135,accidente-choque sin lesionados,LLAMADA_911_066,other_nr_categories,1.224647e-16,-1.000000
1383136,accidente-choque con lesionados,LLAMADA_911_066,other_nr_categories,1.224647e-16,-1.000000


## Modeling (magic loop)

In [24]:
x_dict = {
    "a": {
        "score": 10,
        "code": 123
    },
    "b": {
        "score": 9,
        "code": 456
    }
}

In [44]:
def select_best_model(x_dict):
    """
    Pick the entry with the highest ``"score"`` from a dict of candidates.

    The winning key and score are printed and also returned, so the result
    can be used programmatically.

    :param x_dict: mapping of candidate name -> dict holding at least a
        ``"score"`` value.
    :return: tuple ``(best_key, best_score)``; ``("nothing_", 0)`` when
        ``x_dict`` is empty. On ties the first candidate seen wins.
    """

    res = "nothing_"
    bench = None

    for key, entry in x_dict.items():
        # No numeric floor: the first candidate always becomes the benchmark,
        # so zero or negative scores (e.g. "neg_*" metrics) can still win.
        if bench is None or entry["score"] > bench:
            res = key
            bench = entry["score"]

    if bench is None:
        # Keep the placeholder output for an empty input.
        bench = 0

    print(res)
    print(bench)

    return res, bench

In [45]:
select_best_model(x_dict)

a
10


# *Notes*

## Finding labels of processed matrix

#### Manually counting the number of labels that should be in the model.

In [None]:
# Count categories on a freshly loaded transformation frame: by this point
# `df_tra` has had "dia_semana" dropped and its remaining categoric columns
# collapsed by the cleaning cells, so it no longer holds the raw labels.
df_tra_raw = load_transformation("../../" + transformation_pickle_loc)

cat_list = [
    category
    for column in ("dia_semana", "incidente_c4", "tipo_entrada", "anio_inicio")
    for category in df_tra_raw[column].unique()
]

In [None]:
print(len(cat_list))
cat_list

#### Example code from StackOverflow to get labels.

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Toy frame mixing numeric, categorical and free-text columns; `y` is the target.
train = pd.DataFrame({'age': [23,12, 12, np.nan],
                      'Gender': ['M','F', np.nan, 'F'],
                      'income': ['high','low','low','medium'],
                      'sales': [10000, 100020, 110000, 100],
                      'foo' : [1,0,0,1],
                      'text': ['I will test this',
                               'need to write more sentence',
                               'want to keep it simple',
                               'hope you got that these sentences are junk'],
                      'y': [0,1,1,1]})
numeric_columns = ['age']
cat_columns     = ['Gender','income']

# One sub-pipeline per kind of column.
numeric_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
cat_pipeline     = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())
text_pipeline = make_pipeline(CountVectorizer(), SelectKBest(k=5))

transformers = [
    ('num', numeric_pipeline, numeric_columns),
    ('cat', cat_pipeline, cat_columns),
    ('text', text_pipeline, 'text'),
    ('simple_transformer', MinMaxScaler(), ['sales']),
]

# Columns not named above (here `foo`) are passed through untouched.
combined_pipe = ColumnTransformer(transformers, remainder='passthrough')

# Use the `columns=` keyword: the positional-axis form `drop('y', 1)` was
# deprecated in pandas 1.3 and removed in pandas 2.0.
transformed_data = combined_pipe.fit_transform(train.drop(columns='y'), train['y'])

In [None]:
train

In [None]:
transformed_data.shape

In [None]:
def get_feature_out(estimator, feature_in):
    """
    Return the feature names coming out of a single transformer step.

    :param estimator: the step to inspect (any object is accepted).
    :param feature_in: names of the features fed into ``estimator``.
    :return:
        - ``vec_``-prefixed names from ``get_feature_names()`` for vectorizers;
        - ``estimator.get_feature_names(feature_in)`` for other steps exposing
          that method;
        - the entries of ``feature_in`` selected by ``get_support()`` for
          selectors;
        - ``feature_in`` unchanged for anything else.
    """
    if not hasattr(estimator, 'get_feature_names'):
        if isinstance(estimator, SelectorMixin):
            # selectors keep only the columns flagged by their support mask
            return np.array(feature_in)[estimator.get_support()]
        return feature_in

    if isinstance(estimator, _VectorizerMixin):
        # handling all vectorizers
        vocabulary = estimator.get_feature_names()
        return [f'vec_{token}' for token in vocabulary]

    return estimator.get_feature_names(feature_in)


def get_ct_feature_names(ct):
    """
    Collect the output feature names of a fitted ColumnTransformer.

    Handles plain estimators and pipelines inside the ColumnTransformer.
    Entries configured as ``'drop'`` contribute no names. A ``'passthrough'``
    remainder is resolved through the input column names recorded on ``ct``,
    so ``ct`` must have been fitted on data that carries column names.
    A remainder that is itself an estimator contributes no names here.

    :param ct: fitted ColumnTransformer-like object exposing ``transformers_``.
    :return: list of feature names, in the order of ``ct.transformers_``.
    """
    output_features = []

    for name, estimator, features in ct.transformers_:
        if isinstance(estimator, str) and estimator == 'drop':
            # dropped columns never reach the output, so they add no names
            continue

        if name != 'remainder':
            if isinstance(estimator, Pipeline):
                # thread the names through every step of the pipeline
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif estimator == 'passthrough':
            # Public attribute on scikit-learn >= 1.0; older releases only
            # kept the private `_feature_names_in`.
            names_in = getattr(ct, 'feature_names_in_', None)
            if names_in is None:
                names_in = ct._feature_names_in
            output_features.extend(names_in[features])

    return output_features




pd.DataFrame(transformed_data, 
             columns=get_ct_feature_names(combined_pipe))

In [None]:
train

## Updating dictionary

In [None]:
x_dict = {}

In [None]:
var = "var_1"

In [None]:
def create_datadict_entry(var, relevant=True, data_type="not_specified", model_relevant="False", data_dict=None):
    """
    Register (or overwrite) one variable's entry in a data dictionary.

    :param var: name of the variable; used as the entry key.
    :param relevant: value stored under ``"relevant"``.
    :param data_type: value stored under ``"data_type"``.
    :param model_relevant: value stored under ``"model_relevant"``.
        NOTE(review): the default is the *string* ``"False"``, which is
        truthy; kept for compatibility — confirm whether the boolean
        ``False`` was intended.
    :param data_dict: dictionary to update. When omitted, the notebook-level
        ``x_dict`` is used.
    :return: None — the target dictionary is updated in place.
    """

    if data_dict is None:
        # Resolved at call time, so the default follows whatever `x_dict`
        # currently names in the notebook.
        data_dict = x_dict

    data_dict[var] = {
        "relevant": relevant,
        "data_type": data_type,
        "model_relevant": model_relevant
    }

    return

In [None]:
create_datadict_entry(var)

In [None]:
x_dict

---
---