# Rob's sandbox

---

# Imports

#### Python modules

In [1]:
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import (
    train_test_split,
    cross_val_score
)
from sklearn.inspection import permutation_importance

# import matplotlib.pyplot as plt
# %matplotlib inline

import pandas as pd
pd.set_option("display.max_columns", 50)

# Ancillary modules

#### Support adjustments

In [2]:
sys.path.append("../..")

#### Modules

In [6]:
%load_ext autoreload
%autoreload 2

from src.utils.data_dict import (
    data_dict
)

from src.utils.utils import (
    json_dump_dict,
    load_df
)

from src.utils.params import (
#     param_grid,
#     max_features,
#     n_estimators,
    cv_rounds,
    evaluation_metric,
    feature_importance_theshold,
    tag_non_relevant_cats,
    ingestion_pickle_loc,
    transformation_pickle_loc,
    models_pickle_loc
)

from src.pipelines.ingestion import (
    ingest
)

from src.pipelines.transformation import (
    load_ingestion,
#     date_transformation,
#     hour_transformation,
#     categoric_trasformation,
    transform
)

from src.pipelines.feature_engineering import (
    load_transformation,
    feature_generation,
    feature_selection,
    save_fe,
    feature_engineering,
    fe_pickle_loc_imp_features,
    fe_pickle_loc_feature_labs
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


---

# Original data

In [None]:
df_o = pd.read_csv("../../" + "data/incidentes-viales-c5.csv")
df_o

# Tests

## Transformation

#### Loading ingestion pickle

In [None]:
df_ing = load_ingestion("../../" + ingestion_pickle_loc)

In [None]:
df_ing

In [None]:
df_ing["label"].value_counts(normalize=True)

## Feature engineering

#### Loading transformation pickles

In [4]:
df_tra = load_transformation("../../" + transformation_pickle_loc)
df_tra

Unnamed: 0_level_0,fecha_creacion,hora_creacion,dia_semana,codigo_cierre,incidente_c4,...,dia_inicio_cos,mes_inicio_sin,mes_inicio_cos,hora_inicio_sin,hora_inicio_cos
folio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GA/160123/05714,23/01/2016,22:35:04,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.041325,0.500000,0.866025,-5.000000e-01,0.866025
AO/160123/05826,23/01/2016,22:50:49,Sábado,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,...,0.041325,0.500000,0.866025,-5.000000e-01,0.866025
C4/160124/02187,24/01/2016,09:40:11,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.245485,0.500000,0.866025,7.071068e-01,-0.707107
C4/160124/05723,24/01/2016,22:40:57,Domingo,(N) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.245485,0.500000,0.866025,-5.000000e-01,0.866025
C4/160124/01334,24/01/2016,04:25:15,Domingo,(A) La unidad de atención a emergencias fue de...,accidente-choque sin lesionados,...,0.245485,0.500000,0.866025,8.660254e-01,0.500000
...,...,...,...,...,...,...,...,...,...,...,...
GA/201022/03443,22/10/2020,16:17:05,Jueves,(A) La unidad de atención a emergencias fue de...,lesionado-atropellado,...,-0.164595,-0.866025,0.500000,-8.660254e-01,-0.500000
GA/201022/03492,22/10/2020,16:29:13,Jueves,(A) La unidad de atención a emergencias fue de...,accidente-choque con lesionados,...,-0.164595,-0.866025,0.500000,-8.660254e-01,-0.500000
C5/201022/02030,22/10/2020,12:00:26,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque sin lesionados,...,-0.164595,-0.866025,0.500000,1.224647e-16,-1.000000
C5/201022/02039,22/10/2020,12:01:25,Jueves,(D) El incidente reportado se registró en dos ...,accidente-choque con lesionados,...,-0.164595,-0.866025,0.500000,1.224647e-16,-1.000000


##### Exploring transformation result

#### Processing data for model

In [None]:
# Feed the transformation dataframe loaded above into feature generation.
# The previous argument, `df`, is never defined anywhere in this notebook, so the
# cell raised NameError on a fresh kernel.
df_features_prc, df_labels, df_features_prc_cols = feature_generation(df_tra)

In [None]:
print(len(df_features_prc_cols))
df_features_prc_cols

In [None]:
df_features_prc.shape

In [None]:
print(len(enc_cat_features))
enc_cat_features

In [None]:
df_features_prc.shape

In [None]:
len(df_features_prc_cols)

#### Training, executing and evaluating model

In [None]:
# Seed the forest so repeated runs (Restart Kernel -> Run All) give the same model.
# 1111 is the random_state shown in the repr of the stored best model further down
# in this notebook.
model = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=1111)

In [None]:
## Splitting data in train and test
# Seeded so the 70/30 partition is identical on every run; 1111 is the random_state
# shown in the repr of the stored best model further down in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df_features_prc,
    df_labels,
    test_size=0.3,
    random_state=1111,
)

In [None]:
# NOTE(review): `param_grid` is not defined in this notebook -- its import from
# src.utils.params is commented out in the imports cell -- so this cell raises
# NameError on a fresh kernel until that is resolved.
grid_search = GridSearchCV(model,
                           param_grid,
                           cv=2,
                           scoring=evaluation_metric,
                           return_train_score=True,
                           n_jobs=-1
                           )

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_estimator_)

# `n_features_` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# replacement is `n_features_in_`. Fall back to the old name on older installs.
best_rf = grid_search.best_estimator_
n_best_features = getattr(best_rf, "n_features_in_", None)
if n_best_features is None:
    n_best_features = best_rf.n_features_

print("\n++ Grid search results:\n")
print("    ++++ Best estimator: {}".format(best_rf))
print("    ++++ Number of features in best estimator: {} \n".format(n_best_features))
print("    ++++ Best estimator oob score: {}\n".format(best_rf.oob_score_))

#### Identifying most important variables

##### Creating dataframe with ranks

##### Obtaining features cleaning dict from ranked features

In [None]:
feature_importance.reset_index(inplace=True, drop=True)
feature_importance

In [None]:
# Flag every feature whose importance reaches the configured threshold.
# A vectorised comparison replaces the per-row lambda: same True/False values,
# and the column is bool-typed even when the frame is empty.
feature_importance["Important"] = feature_importance["Importance"] >= feature_importance_theshold

In [None]:
def find_mother(row, ohe_dict):
    """
    Look up the key of `ohe_dict` whose value contains `row`.

    :param row: value to search for (this notebook passes entries of the
        "Feature" column).
    :param ohe_dict: mapping from a key to a container that supports `in`.
    :return: the first key, in the mapping's iteration order, whose container
        holds `row`; `row` itself when no container holds it.
    """
    return next(
        (mother for mother, children in ohe_dict.items() if row in children),
        row,
    )

In [None]:
feature_importance["Mother_feature"] = feature_importance["Feature"].apply(lambda x: find_mother(x, ohe_dict))

In [None]:
feature_importance

In [None]:
def feature_cleaning_dict(feature_importance, ohe_dict):
    """
    Build the features cleaning dict from the ranked-features dataframe.

    :param feature_importance: dataframe with the columns "Feature",
        "Important" and "Mother_feature".
    :param ohe_dict: mapping whose keys are the mother features treated as
        categoric.
    :return: dict keyed by every mother feature that has at least one row
        flagged as important. Keys present in `ohe_dict` map to
        {"data_type": "categoric", "important_categories": [...]}, listing the
        important "Feature" values of that mother; all other keys map to
        {"data_type": "non-categoric"}.
    """
    flagged = feature_importance["Important"].eq(True)
    mothers = feature_importance["Mother_feature"]

    cleaning = {}
    for mother in mothers[flagged].unique():
        if mother not in ohe_dict:
            cleaning[mother] = {"data_type": "non-categoric"}
            continue

        # Keep only the rows of this mother that were themselves flagged as important.
        kept_rows = flagged & mothers.eq(mother)
        cleaning[mother] = {
            "data_type": "categoric",
            "important_categories": list(feature_importance.loc[kept_rows, "Feature"]),
        }

    return cleaning

In [None]:
fe_cln_dict = feature_cleaning_dict(feature_importance, ohe_dict)
fe_cln_dict

##### Cleaning transformation dataframe based on features cleaning dict

In [None]:
df_tra

In [None]:
# Columns of the transformation dataframe that have no entry in the cleaning dict
# are removed from `df_tra` in place.
nr_f = list(filter(lambda name: name not in fe_cln_dict, df_tra.columns))
df_tra.drop(columns=nr_f, inplace=True)

In [None]:
df_tra

In [None]:
# For each categoric column, overwrite every value that is not listed among its
# important categories with `tag_non_relevant_cats`.
categoric_keys = [key for key, spec in fe_cln_dict.items() if spec["data_type"] == "categoric"]

for cat_key in categoric_keys:
    m1 = ~df_tra[cat_key].isin(fe_cln_dict[cat_key]["important_categories"])
    df_tra.loc[m1, cat_key] = tag_non_relevant_cats

In [None]:
df_tra

#### Reviewing results

In [7]:
df_imp_features_prc = load_df("../../" + fe_pickle_loc_imp_features)
df_labels = load_df("../../" + fe_pickle_loc_feature_labs)

In [11]:
print(df_imp_features_prc.shape)
df_imp_features_prc.toarray()

(1383138, 7)


array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.]])

In [14]:
print(df_labels.shape)
df_labels

(1383138,)


folio
GA/160123/05714    0.0
AO/160123/05826    0.0
C4/160124/02187    1.0
C4/160124/05723    1.0
C4/160124/01334    0.0
                  ... 
GA/201022/03443    0.0
GA/201022/03492    0.0
C5/201022/02030    0.0
C5/201022/02039    0.0
C5/201022/03364    0.0
Name: label, Length: 1383138, dtype: float64

## Modeling (magic loop)

### Imports

In [15]:
best_model = load_df("../../" + models_pickle_loc)
X_test = load_df("../../" + "outputs/X_test.pkl")
y_test = load_df("../../" + "outputs/y_test.pkl")
test_predict_labs = load_df("../../" + "outputs/test_predict_labs.pkl")
test_predict_scores = load_df("../../" + "outputs/test_predict_scores.pkl")

In [16]:
best_model

RandomForestClassifier(max_features=6, max_leaf_nodes=10, min_samples_leaf=3,
                       n_jobs=-1, oob_score=True, random_state=1111)

In [17]:
print(X_test.shape)
X_test.toarray()

(414942, 7)


array([[0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.]])

In [19]:
print(y_test.shape)
y_test

(414942,)


folio
C5/180923/00715    0.0
C5/180125/06384    1.0
C5/180904/02619    1.0
C5/170327/07629    0.0
C5/170803/05943    0.0
                  ... 
C4/140210/00850    0.0
C5/180109/02335    1.0
C5/160527/01506    1.0
C5/191115/08861    0.0
C5/190306/07844    0.0
Name: label, Length: 414942, dtype: float64

In [20]:
print(test_predict_labs.shape)
test_predict_labs

(414942,)


array([0., 0., 0., ..., 0., 0., 0.])

In [21]:
print(test_predict_scores.shape)
test_predict_scores

(414942, 2)


array([[0.71259143, 0.28740857],
       [0.85976458, 0.14023542],
       [0.84693228, 0.15306772],
       ...,
       [0.71259143, 0.28740857],
       [0.71259143, 0.28740857],
       [0.71259143, 0.28740857]])

In [23]:
prds = best_model.predict(X_test)
prds

array([0., 0., 0., ..., 0., 0., 0.])

In [26]:
df_ypred = y_test.to_frame()
df_ypred["mod_pred"] = prds
df_ypred

Unnamed: 0_level_0,label,mod_pred
folio,Unnamed: 1_level_1,Unnamed: 2_level_1
C5/180923/00715,0.0,0.0
C5/180125/06384,1.0,0.0
C5/180904/02619,1.0,0.0
C5/170327/07629,0.0,0.0
C5/170803/05943,0.0,0.0
...,...,...
C4/140210/00850,0.0,0.0
C5/180109/02335,1.0,0.0
C5/160527/01506,1.0,0.0
C5/191115/08861,0.0,0.0


# *Notes*

## Finding labels of processed matrix

#### Manually counting the number of labels that should be in the model.

In [None]:
# Concatenate the distinct values of each listed column, in column order.
cat_list = [
    category
    for column in ("dia_semana", "incidente_c4", "tipo_entrada", "anio_inicio")
    for category in df_tra[column].unique()
]

In [None]:
print(len(cat_list))
cat_list

#### Example code from StackOverflow to get labels.

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

train = pd.DataFrame({'age': [23,12, 12, np.nan],
                      'Gender': ['M','F', np.nan, 'F'],
                      'income': ['high','low','low','medium'],
                      'sales': [10000, 100020, 110000, 100],
                      'foo' : [1,0,0,1],
                      'text': ['I will test this',
                               'need to write more sentence',
                               'want to keep it simple',
                               'hope you got that these sentences are junk'],
                      'y': [0,1,1,1]})
numeric_columns = ['age']
cat_columns     = ['Gender','income']

numeric_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
cat_pipeline     = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())
text_pipeline = make_pipeline(CountVectorizer(), SelectKBest(k=5))

transformers = [
('num', numeric_pipeline, numeric_columns),
('cat', cat_pipeline, cat_columns),
('text', text_pipeline, 'text'),
('simple_transformer', MinMaxScaler(), ['sales']),
]

combined_pipe = ColumnTransformer(transformers, remainder='passthrough')

# Fit on every column except the target. `columns='y'` replaces the positional
# `axis` argument (`drop('y', 1)`), which pandas 2.0 no longer accepts.
transformed_data = combined_pipe.fit_transform(train.drop(columns='y'), train['y'])

In [None]:
train

In [None]:
transformed_data.shape

In [None]:
def get_feature_out(estimator, feature_in):
    """
    Return the feature names produced by one fitted transformer step.

    :param estimator: fitted transformer (a single step, not a Pipeline).
    :param feature_in: names of the features fed into `estimator`.
    :return: the names coming out of `estimator`:
        - vectorizers: their vocabulary terms prefixed with "vec_";
        - other estimators exposing a feature-name getter: whatever that
          getter returns for `feature_in`;
        - selectors without a getter: `feature_in` filtered by `get_support()`;
        - anything else: `feature_in` unchanged.
    """
    # scikit-learn 1.0 introduced `get_feature_names_out` and 1.2 removed
    # `get_feature_names`; prefer the new name and keep the old one as a
    # fallback so the helper works on both sides of that change.
    if hasattr(estimator, 'get_feature_names_out'):
        names_getter = estimator.get_feature_names_out
    elif hasattr(estimator, 'get_feature_names'):
        names_getter = estimator.get_feature_names
    else:
        names_getter = None

    if names_getter is not None:
        if isinstance(estimator, _VectorizerMixin):
            # handling all vectorizers
            return [f'vec_{f}' for f in names_getter()]
        return names_getter(feature_in)
    elif isinstance(estimator, SelectorMixin):
        return np.array(feature_in)[estimator.get_support()]
    else:
        return feature_in


def get_ct_feature_names(ct):
    """
    Collect the output feature names of a fitted ColumnTransformer.

    Walks `ct.transformers_` in order. For a Pipeline entry the names are
    threaded through every step with `get_feature_out`; for any other
    non-remainder entry `get_feature_out` is called once. A remainder entry set
    to 'passthrough' contributes the input column names selected by its column
    indices; any other remainder contributes nothing.

    :param ct: fitted ColumnTransformer (must expose `transformers_`).
    :return: list of output feature names, in transformer order.
    """
    output_features = []

    for name, estimator, features in ct.transformers_:
        if name != 'remainder':
            if isinstance(estimator, Pipeline):
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif estimator == 'passthrough':
            # Passthrough columns keep their input names. Read the public
            # `feature_names_in_` attribute when present and fall back to the
            # private `_feature_names_in` that older releases used.
            input_names = getattr(ct, 'feature_names_in_', None)
            if input_names is None:
                input_names = ct._feature_names_in
            output_features.extend(input_names[features])

    return output_features




pd.DataFrame(transformed_data, 
             columns=get_ct_feature_names(combined_pipe))

In [None]:
train

## Updating dictionary

In [None]:
x_dict = {}

In [None]:
var = "var_1"

In [None]:
def create_datadict_entry(var, relevant=True, data_type="not_specified", model_relevant="False"):
    """
    Add (or overwrite) the entry for `var` in the notebook-level `x_dict`.

    :param var: key under which the entry is stored.
    :param relevant: value stored under "relevant".
    :param data_type: value stored under "data_type".
    :param model_relevant: value stored under "model_relevant".
    :return: None; `x_dict` is modified in place.
    """
    # NOTE(review): the default for `model_relevant` is the *string* "False"
    # (which is truthy), while `relevant` defaults to a real bool -- confirm
    # this is intended.
    x_dict[var] = {
        "relevant": relevant,
        "data_type": data_type,
        "model_relevant": model_relevant,
    }

In [None]:
create_datadict_entry(var)

In [None]:
x_dict

---
---