# reloading final model, columns, transformation and raw test set

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# imports
#imports
import os

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

import pickle
from treeinterpreter import treeinterpreter as ti

## Loading model

In [3]:
folder = "models"
path_to_load = f"../../src/{folder}/1.1 - df_train 01-18to12-18"
model_file = os.path.join(path_to_load, "1.1.b.2.f(Model) - Model Optimization recall - class_weight (1, 18) - time sorted - valid score (0.403, 0.972).pickle")

with open(model_file, "rb") as file:
    m = pickle.load(file)
m

ExtraTreesClassifier(bootstrap=False, class_weight={0: 1, 1: 18},
           criterion='gini', max_depth=15, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=55,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=40, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

## Loading columns

In [4]:
folder = "features"
path_to_load = f"../../src/{folder}/1.1 - df_train 01-18to12-18"
columns_file = os.path.join(path_to_load, "1.1.b.2.f (COLUMNS) - features: 17.pickle")

with open(columns_file, "rb") as file:
    columns = pickle.load(file)
print(columns)

['cond_C10', 'TIPO_EXPED', 'preg_15', 'es_gte_5', 'preg_31', 'preg_33', 'cond_32', 'preg_32', 'cond_C11', 'cond_C12', 'cond_C2', 'METRO', 'SEXO_TERC', 'OCUPACION_ASEG', 'FALTANTE', 'preg_34', 'cond_C5']


In [5]:
model_file = "../../src/features/1.1 - df_train 01-18to12-18/1.1 - dict categorical mappers.pickle"

with open(model_file, "rb") as f:
    mapper = pickle.load(f)

## Loading raw dataset

In [6]:
path = "../../data/interim/5. merged/merged_Condiciones_side + merged_DSS_SINIESTROS_AUTOS_side.feather"

if not os.path.isfile(path):
    raise Exception(f"file not founded: {path}")

In [7]:
df_pred = pd.read_feather(path)
df_pred = df_pred[(df_pred["FECHA_SINI"] >= "2019/04/01") & (df_pred["FECHA_SINI"] < "2019/05/01")]
df_pred.info()

  labels, = index.labels


<class 'pandas.core.frame.DataFrame'>
Int64Index: 34658 entries, 1149599 to 1228251
Columns: 147 entries, NUM_SECU_EXPED to HABILITADO
dtypes: bool(1), datetime64[ns](8), float64(63), int64(1), object(74)
memory usage: 38.9+ MB


In [8]:
# df_pred.iloc[:50].to_csv("test_example.csv")

## Applying transformations

In [9]:
# reduce columns
target = "EXISTE_FRAUDE"
X_pred = df_pred[columns].copy()
# not necessary in the pipe
y_pred = df_pred[target] == True  # fixing values for target
X_pred.head()

Unnamed: 0,cond_C10,TIPO_EXPED,preg_15,es_gte_5,preg_31,preg_33,cond_32,preg_32,cond_C11,cond_C12,cond_C2,METRO,SEXO_TERC,OCUPACION_ASEG,FALTANTE,preg_34,cond_C5
1149599,,10,,False,,,,,,,,S,,44.0,S,,
1149600,,10,,False,,,,,,,,S,,5.0,S,,
1149601,,50,,False,,,,2.0,,,3.0,S,,44.0,S,,0.0
1149603,,3,,False,,,,,,,,N,M,44.0,S,,
1149604,,10,,False,,,,,,,,N,,44.0,S,,


In [10]:
y_pred.value_counts()

False    34562
True        96
Name: EXISTE_FRAUDE, dtype: int64

In [11]:
y_pred.value_counts(normalize=True)

False    0.99723
True     0.00277
Name: EXISTE_FRAUDE, dtype: float64

In [12]:
# normalizing TIPO_EXPED 
X_pred["TIPO_EXPED"] = X_pred["TIPO_EXPED"].astype("str").str.zfill(3)

In [13]:
# replace None by np.nan
X_pred.fillna(np.nan, inplace=True)

In [14]:
X_pred.head()

Unnamed: 0,cond_C10,TIPO_EXPED,preg_15,es_gte_5,preg_31,preg_33,cond_32,preg_32,cond_C11,cond_C12,cond_C2,METRO,SEXO_TERC,OCUPACION_ASEG,FALTANTE,preg_34,cond_C5
1149599,,10,,False,,,,,,,,S,,44.0,S,,
1149600,,10,,False,,,,,,,,S,,5.0,S,,
1149601,,50,,False,,,,2.0,,,3.0,S,,44.0,S,,0.0
1149603,,3,,False,,,,,,,,N,M,44.0,S,,
1149604,,10,,False,,,,,,,,N,,44.0,S,,


In [15]:
cols_used = list(set(columns) - set(target))
cat_cols = list(set(mapper.keys()).intersection(cols_used))
for col in cat_cols:
    unique_values = X_pred.loc[:, col].unique()
    keys = mapper[col].keys()
    new_vals = list(set(unique_values) - keys)
    if(new_vals):
        # replace new values by np.nan
        print(col, new_vals)
        X_pred.replace(new_vals, np.nan, inplace=True)
    X_pred.loc[:,col] = X_pred.loc[:,col].replace(mapper[col])
X_pred.fillna(-1, inplace=True)

preg_33 [nan]
SEXO_TERC [nan]
preg_34 [nan]
preg_15 [nan]
METRO [nan]
preg_32 [nan]
preg_31 [nan]


In [16]:
X_pred.head()

Unnamed: 0,cond_C10,TIPO_EXPED,preg_15,es_gte_5,preg_31,preg_33,cond_32,preg_32,cond_C11,cond_C12,cond_C2,METRO,SEXO_TERC,OCUPACION_ASEG,FALTANTE,preg_34,cond_C5
1149599,-1.0,3,-1,False,-1,-1,-1.0,-1,-1.0,-1.0,-1.0,1,-1,44.0,1,-1,-1.0
1149600,-1.0,3,-1,False,-1,-1,-1.0,-1,-1.0,-1.0,-1.0,1,-1,5.0,1,-1,-1.0
1149601,-1.0,5,-1,False,-1,-1,-1.0,1,-1.0,-1.0,3.0,1,-1,44.0,1,-1,0.0
1149603,-1.0,2,-1,False,-1,-1,-1.0,-1,-1.0,-1.0,-1.0,0,1,44.0,1,-1,-1.0
1149604,-1.0,3,-1,False,-1,-1,-1.0,-1,-1.0,-1.0,-1.0,0,-1,44.0,1,-1,-1.0


### predict all examples

In [17]:
y_model_pred_prob = m.predict_proba(X_pred)[:, -1]
y_model_pred_prob

array([0.00421323, 0.00490136, 0.43368405, ..., 0.0040771 , 0.06984446,
       0.05567552])

In [18]:
confusion_matrix(y_pred, y_model_pred_prob > 0.5)

array([[34352,   210],
       [   30,    66]])

In [19]:
y_model_pred_prob[:5]

array([0.00421323, 0.00490136, 0.43368405, 0.00678954, 0.00572236])

### Saving predictions to analysis

In [20]:
df_pred["pred"] = y_model_pred_prob

In [21]:
# df_pred.to_csv("val_with_pred_feb_march.csv")

# Analysing model results

In [22]:
# normalizing TIPO_EXPED 
df_pred["TIPO_EXPED"] = df_pred["TIPO_EXPED"].astype("str").str.zfill(3)

In [25]:
df_pred["pred_bin"] = pd.cut(df_pred["pred"], [x/100 for x in range(0,110,10)])

In [27]:
# pd.crosstab(df_pred["pred_bin"], [df_pred["TIPO_EXPED"], df_pred["EXISTE_FRAUDE"]])
pd.crosstab(df_pred["pred"]> 0.5, [df_pred["TIPO_EXPED"], df_pred["EXISTE_FRAUDE"]])

TIPO_EXPED,001,002,003,010,010,020,050,050,060,060
EXISTE_FRAUDE,False,False,False,False,True,False,False,True,False,True
pred,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
False,3,17,6,8,1,6,130,10,46,19
True,0,0,0,0,0,0,8,0,104,66


In [28]:
# tronador
pd.crosstab(df_pred["es_gte_5"], [df_pred["TIPO_EXPED"], df_pred["EXISTE_FRAUDE"]])

TIPO_EXPED,001,002,003,010,010,020,050,050,060,060
EXISTE_FRAUDE,False,False,False,False,True,False,False,True,False,True
es_gte_5,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
False,3,17,6,8,1,6,46,5,47,22
True,0,0,0,0,0,0,92,5,103,63


### predict 1 row example

In [70]:
case = X_pred.iloc[1:5,:].head(1)
case

Unnamed: 0,cond_C10,TIPO_EXPED,preg_15,es_gte_5,preg_31,preg_33,cond_32,preg_32,cond_C11,cond_C12,cond_C2,METRO,SEXO_TERC,OCUPACION_ASEG,FALTANTE,preg_34,cond_C5
1149600,-1.0,3,-1,False,-1,-1,-1.0,-1,-1.0,-1.0,-1.0,1,-1,5.0,1,-1,-1.0


In [71]:
y_example_pred = m.predict_proba(case)[0, 1]
y_example_pred

0.004901363432583442

In [72]:
X_pred[:5]

Unnamed: 0,cond_C10,TIPO_EXPED,preg_15,es_gte_5,preg_31,preg_33,cond_32,preg_32,cond_C11,cond_C12,cond_C2,METRO,SEXO_TERC,OCUPACION_ASEG,FALTANTE,preg_34,cond_C5
1149599,-1.0,3,-1,False,-1,-1,-1.0,-1,-1.0,-1.0,-1.0,1,-1,44.0,1,-1,-1.0
1149600,-1.0,3,-1,False,-1,-1,-1.0,-1,-1.0,-1.0,-1.0,1,-1,5.0,1,-1,-1.0
1149601,-1.0,5,-1,False,-1,-1,-1.0,1,-1.0,-1.0,3.0,1,-1,44.0,1,-1,0.0
1149603,-1.0,2,-1,False,-1,-1,-1.0,-1,-1.0,-1.0,-1.0,0,1,44.0,1,-1,-1.0
1149604,-1.0,3,-1,False,-1,-1,-1.0,-1,-1.0,-1.0,-1.0,0,-1,44.0,1,-1,-1.0


In [43]:
mapper["TIPO_EXPED"]

{'None': 7,
 '003': 2,
 '060': 6,
 '010': 3,
 '020': 4,
 '002': 1,
 '050': 5,
 '001': 0}

Unnamed: 0,cond_C10,TIPO_EXPED,preg_15,es_gte_5,preg_31,preg_33,cond_32,preg_32,cond_C11,cond_C12,cond_C2,METRO,SEXO_TERC,OCUPACION_ASEG,FALTANTE,preg_34,cond_C5
