# Transforming column to numeric, create new cols & drop useless

## imports & configs

In [1]:
import os

import pickle
import pandas as pd
import numpy as np

In [2]:
# pandas configs
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

# working with date columns

## Loading the dataset

In [3]:
path = "../../../data/interim/5. merged/merged_Condiciones_side + merged_DSS_SINIESTROS_AUTOS_side.feather"

if not os.path.isfile(path):
    raise Exception(f"file not founded: {path}")

In [4]:
df = pd.read_feather(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Columns: 147 entries, NUM_SECU_EXPED to HABILITADO
dtypes: bool(1), datetime64[ns](8), float64(63), int64(1), object(74)
memory usage: 1.3+ GB


## Working with datetime columns

In [5]:
date_cols = list(df.select_dtypes("datetime64").columns)
date_cols

['FEC_ACT',
 'FECHA_SINI',
 'FECHA_PROCESO',
 'FECHA_VENC_POL',
 'FECHA_VIG_ORIG_POL',
 'FECHA_VIG_POL',
 'FECHA_DESDE',
 'FECHA_NACIMIENTO']

In [6]:
for col in df.columns:
    print(col)

NUM_SECU_EXPED
cond_04
cond_05
cond_06
cond_09
cond_11
cond_12
cond_32
cond_37
cond_C1
cond_C10
cond_C11
cond_C12
cond_C13
cond_C14
cond_C15
cond_C16
cond_C17
cond_C18
cond_C19
cond_C2
cond_C20
cond_C21
cond_C3
cond_C4
cond_C5
cond_C6
cond_C7
cond_C8
cond_C9
FEC_ACT
USR_ACT
total_condicion
es_gte_5
COD_FRAUDE
COD_RAMO
EXISTE_FRAUDE
EXISTE_INHABILITAR
EXISTE_INVEST
preg_1
preg_2
preg_3
preg_4
preg_5
preg_6
preg_7
preg_8
preg_9
preg_10
preg_11
preg_12
preg_15
preg_16
preg_18
preg_19
preg_22
preg_24
preg_27
preg_28
preg_30
preg_31
preg_32
preg_33
preg_34
preg_36
preg_37
CATASTROFICO
CERRADURA_BAUL
CERRADURA_DERECHA
CERRADURA_IZQUIERDA
CODIGO_BAJA
CODIGO_CARATULA
CODIGO_REAPERTURA
CODIGO_VEHICULO
COD_ACT_BENEF
COD_CAUSA_SINI
COD_POST_OCURRENCIA
COD_POST_POLIZA
COD_POST_TERC
COD_RAMO_sini
COD_RIES_sini
CONDICION_ROBO_EXP50
DANOS_MATERIALES
DESCRIPCION_TIPO
DESCRIPCION_VEHICULO
ESTADO_CIVIL
ESTADO_CIVIL_TERC
ESTAD_VEH_ASEG
FALTANTE
FECHA_SINI
MCA_COASEG
MCA_JUICIO
MCA_VIP
METRO
NUM_SECU_POL


## Creating new columns

In [7]:
# Age: FECHA_SINI - FECHA_NACIMIENTO
# (df["FECHA_SINI"] - df["FECHA_NACIMIENTO"]).tail()
df["FECHA_NACIMIENTO"].value_counts(dropna=False).head()

NaT           1135032
1900-01-01       2258
1911-11-01       2036
1980-01-01        964
1911-01-01        232
Name: FECHA_NACIMIENTO, dtype: int64

<h2 style="color:red;">Ignore for quality issues</h2>

In [8]:
# antiguedad cliente
df["FECHA_DESDE"].value_counts(dropna=False).head()

NaT           1134640
1997-05-02        120
1997-04-05        120
1997-03-08        119
1997-03-14        117
Name: FECHA_DESDE, dtype: int64

<h2 style="color:red;">Ignore for quality issues</h2>

## Explote SINI date

In [9]:
import pandas as pd
import re
import numpy as np


def expandDatetime(data, datefields, drop=True, time=False, inplace=False):
   """Create several features from every datetime column.

   Add new columns to the Dataframe('Year', 'Month', 'Week',
   'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end',
   'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
   'Is_year_end' and 'Is_year_start') for every feature
   containing the word "Date".

   This method is adapted from fastai.structured.add_datepart.

   Parameters
   ----------
   data: pandas.Dataframe
       The entire working dataset.
   
   datefields: list, optional
       List of datefields to expand.

   drop: boolean, optional
       Determines whether to drop the original datetime columns
       or not.

   time: boolean, optional
       If True adds aditional columns (Hour, Min and Sec).
   
   inplace: boolena, optional (default=False)
       If False modify a new object else modify the object pass
       int data.

   Returns
   -------
   new_data: pandas.Dataframe
       The entire dataframe with the new columns.

   """
   if(inplace):
       new_data = data
   else:
       new_data = data.copy(deep=True)
   fields_list = list(new_data)
   for field in fields_list:
       if field in datefields:
           fld = data[field]
           if not np.issubdtype(fld.dtype, np.datetime64):
               fld = pd.to_datetime(fld, infer_datetime_format=True)
               new_data[field] = fld
           targ_pre = re.sub('[Dd]ate$', '', field)
           attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek',
                   'Dayofyear', 'Is_month_end', 'Is_month_start',
                   'Is_quarter_end', 'Is_quarter_start', 'Is_year_end',
                   'Is_year_start']
           if time:
               attr = attr + ['Hour', 'Minute', 'Second']
           for n in attr:
               new_data[targ_pre + "_" + n] = getattr(fld.dt, n.lower()).astype(float)
           new_data[targ_pre + "_" + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
           if drop:
               new_data.drop(field, axis=1, inplace=True)
   return new_data


In [10]:
expandDatetime(df, ["FECHA_SINI"], drop=False, time=True, inplace=True).head()

Unnamed: 0,NUM_SECU_EXPED,cond_04,cond_05,cond_06,cond_09,cond_11,cond_12,cond_32,cond_37,cond_C1,cond_C10,cond_C11,cond_C12,cond_C13,cond_C14,cond_C15,cond_C16,cond_C17,cond_C18,cond_C19,cond_C2,cond_C20,cond_C21,cond_C3,cond_C4,cond_C5,cond_C6,cond_C7,cond_C8,cond_C9,FEC_ACT,USR_ACT,total_condicion,es_gte_5,COD_FRAUDE,COD_RAMO,EXISTE_FRAUDE,EXISTE_INHABILITAR,EXISTE_INVEST,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,preg_9,preg_10,preg_11,preg_12,preg_15,preg_16,preg_18,preg_19,preg_22,preg_24,preg_27,preg_28,preg_30,preg_31,preg_32,preg_33,preg_34,preg_36,preg_37,CATASTROFICO,CERRADURA_BAUL,CERRADURA_DERECHA,CERRADURA_IZQUIERDA,CODIGO_BAJA,CODIGO_CARATULA,CODIGO_REAPERTURA,CODIGO_VEHICULO,COD_ACT_BENEF,COD_CAUSA_SINI,COD_POST_OCURRENCIA,COD_POST_POLIZA,COD_POST_TERC,COD_RAMO_sini,COD_RIES_sini,CONDICION_ROBO_EXP50,DANOS_MATERIALES,DESCRIPCION_TIPO,DESCRIPCION_VEHICULO,ESTADO_CIVIL,ESTADO_CIVIL_TERC,ESTAD_VEH_ASEG,FALTANTE,FECHA_SINI,MCA_COASEG,MCA_JUICIO,MCA_VIP,METRO,NUM_SECU_POL,OCUPACION_ASEG,SEXO,SEXO_TERC,TELEFONO_TERC,TIPO,TIPO_EXPED,TIPO_LESION,TIPO_LESION_MAXIMA,TIPO_SINIESTRO,USO,dias_entre_denu_y_sini,dist_fformal_fsini,dist_fformal_fdenu,existe_FECHA_FORMAL,CANT_RENOVACION,CAPITAL_ACCESORIOS,CAPITAL_ASEGURADO_COTIZACION,CAPITAL_VEHICULO,COD_COBRO,COD_COBRO_ANTERIOR,COD_POSTAL,COD_PROD,COD_RAMO_vigabt_aseg_cif,COD_RIES_vigabt_aseg_cif,COD_ZONA_CASCO,COD_ZONA_RC,COD_ZONA_ROBO,CONV_COMISIONARIO,FECHA_PROCESO,FECHA_VENC_POL,FECHA_VIG_ORIG_POL,FECHA_VIG_POL,CIF_ID,MCA_AGRAVANTE,MCA_EMPLEADO,MCA_MOVIMIENTO,MCA_POLIZA_VIP,NEGOCIO,PRENDARIO,SEGMENTO,cambio_cobro,ANTIG_calc,COD_EST_CIVIL,FECHA_DESDE,FECHA_NACIMIENTO,SEXO_ASEG,TIPO_ACTIVIDAD,CLIENTE,CODIGO_NACION,DATECO_TIPO_ACTIVIDAD,DOMICILIO_CODIGO_POSTAL,HABILITADO,FECHA_SINI_Year,FECHA_SINI_Month,FECHA_SINI_Week,FECHA_SINI_Day,FECHA_SINI_Dayofweek,FECHA_SINI_Dayofyear,FECHA_SINI_Is_month_end,FECHA_SINI_Is_month_start,FECHA_SINI_Is_quarter_end,FECHA_SINI_Is_quarter_start,FECHA_SINI_Is_year_end,FECHA_SINI_Is_year_start,FECHA_SINI_Hour,FECHA_SINI_Minute,FECHA_SINI_Second,FECHA_SINI_Elapsed
0,516359994,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,1.0,3.0,2018-06-19,SALINASL,4.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,NaT,NaT,NaT,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037
1,1762619999,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,2017-03-20,COLOMBOM,0.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,NaT,NaT,NaT,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037
2,4768809999,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,0.0,3.0,2018-03-08,ALAIS,3.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,NaT,NaT,NaT,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037
3,6444209999,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,0.0,3.0,2018-02-26,KLEIN,3.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,NaT,NaT,NaT,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037
4,7529469970,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,2018-04-18,CHIPIAN,1.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,NaT,NaT,NaT,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037


In [11]:
df.columns

Index(['NUM_SECU_EXPED', 'cond_04', 'cond_05', 'cond_06', 'cond_09', 'cond_11',
       'cond_12', 'cond_32', 'cond_37', 'cond_C1',
       ...
       'FECHA_SINI_Is_month_end', 'FECHA_SINI_Is_month_start',
       'FECHA_SINI_Is_quarter_end', 'FECHA_SINI_Is_quarter_start',
       'FECHA_SINI_Is_year_end', 'FECHA_SINI_Is_year_start', 'FECHA_SINI_Hour',
       'FECHA_SINI_Minute', 'FECHA_SINI_Second', 'FECHA_SINI_Elapsed'],
      dtype='object', length=163)

### renaming columns

In [12]:
# antiguedad poliza: rename ANTIG_calc as ANTIG_pol
df.rename(columns={"ANTIG_calc": "ANTIG_pol"}, inplace=True)

## Drop useless datetime columns

In [13]:
date_cols.remove("FECHA_SINI")

In [14]:
df["ANTIG_pol"].value_counts(dropna=False).head()

NaN      95428
0.0      73588
92.0     29794
365.0    20512
61.0     18186
Name: ANTIG_pol, dtype: int64

In [15]:
df.drop(columns=date_cols, inplace=True)

## analyzing COD_POST_POLIZA y COD_POSTAL

In [16]:
df["COD_POST_POLIZA"].equals(df["COD_POSTAL"])

False

In [17]:
df["COD_POST_POLIZA"].value_counts(dropna=False)

NaN          89671
1900000.0    20817
1406000.0    17364
2000005.0    17224
1407000.0    15700
5000050.0    15011
7600002.0    14224
1425000.0    13986
1439000.0    13566
5000000.0    11544
1419000.0    10999
1426000.0    10906
4000028.0    10430
1431000.0    10232
1416000.0    10123
1824002.0    10077
1414000.0     9793
1704001.0     9070
1428000.0     9069
1417000.0     9009
1408000.0     8616
1828008.0     8556
9000010.0     8535
9420016.0     8042
1424000.0     7485
1878000.0     7460
1832007.0     7419
9410012.0     7376
4400000.0     7353
1429000.0     7151
1430000.0     7141
1712000.0     7048
3500019.0     6838
8300001.0     6822
1663015.0     6701
1744005.0     6696
1722006.0     6655
1714010.0     6584
1884018.0     6540
1405000.0     6532
1754001.0     6411
1440000.0     6370
1437000.0     6195
3000022.0     6018
1636002.0     5991
1888032.0     5849
1842010.0     5764
3600014.0     5742
1427000.0     5685
1757000.0     5647
1708007.0     5531
1602000.0     5451
1870011.0   

In [18]:
df["COD_POSTAL"].value_counts(dropna=False) # has more None

NaN          95428
1900000.0    20751
1406000.0    17255
2000005.0    17065
1407000.0    15579
1425000.0    15144
5000050.0    14831
7600002.0    14070
5000000.0    11584
1419000.0    10989
1426000.0    10839
1002000.0    10809
4000028.0    10361
1431000.0    10146
1416000.0    10083
1824002.0    10015
1414000.0     9521
1704001.0     9085
1428000.0     8957
1417000.0     8940
1408000.0     8550
9000010.0     8511
1828008.0     8490
9420016.0     7989
1439000.0     7526
4400000.0     7505
1878000.0     7433
1424000.0     7405
9410012.0     7341
1832007.0     7340
1429000.0     7198
1430000.0     7074
3500019.0     7071
1712000.0     6946
8300001.0     6734
1663015.0     6635
1744005.0     6622
1722006.0     6597
1714010.0     6499
1884018.0     6477
1405000.0     6476
1754001.0     6378
1440000.0     6364
1437000.0     6194
3000022.0     5985
1636002.0     5922
1888032.0     5823
1842010.0     5733
1427000.0     5704
3600014.0     5642
1757000.0     5603
1708007.0     5489
1602000.0   

In [19]:
# trying to fixed missing values
df[(df["COD_POST_POLIZA"].isna()) & (~df["COD_POSTAL"].isna())][["COD_POST_POLIZA", "COD_POSTAL"]]

Unnamed: 0,COD_POST_POLIZA,COD_POSTAL
95503,,9103002.0
163240,,9103002.0
211890,,9103002.0
225850,,1424000.0
225851,,1424000.0
225852,,1424000.0
453285,,9103002.0
453290,,9103002.0
540991,,9103002.0
632055,,9103002.0


In [20]:
idxs = df[(df["COD_POST_POLIZA"].isna()) & (~df["COD_POSTAL"].isna())].index
df.loc[idxs, "COD_POST_POLIZA"] = df.loc[idxs, "COD_POSTAL"]

In [21]:
# trying to fixed missing values
df[(df["COD_POST_POLIZA"].isna()) & (~df["COD_POSTAL"].isna())][["COD_POST_POLIZA", "COD_POSTAL"]]

Unnamed: 0,COD_POST_POLIZA,COD_POSTAL


## saving checkpoint

In [22]:
df.to_feather("../../../data/processed/1.1 - full merge - date cols transformed.feather")

# Loading from checkpoint - drop useless cols
**From quality analysis: Quality analysis full table merge**


In [23]:
path = "../../../data/processed/1.1 - full merge - date cols transformed.feather"

if not os.path.isfile(path):
    raise Exception(f"file not founded: {path}")

In [24]:
df = pd.read_feather(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Columns: 156 entries, NUM_SECU_EXPED to FECHA_SINI_Elapsed
dtypes: bool(1), datetime64[ns](1), float64(78), int64(2), object(74)
memory usage: 1.4+ GB


In [25]:
to_drop = ["COD_ACT_BENEF", "EXISTE_INHABILITAR", 'COD_FRAUDE', 'COD_RAMO', 'COD_RAMO_vigabt_aseg_cif', 'COD_RIES_vigabt_aseg_cif', 'DESCRIPCION_VEHICULO', 'FEC_ACT', 'FECHA_NACIMIENTO', 'FECHA_PROCESO', 'FECHA_VENC_POL', 'FECHA_VIG_ORIG_POL', 'FECHA_VIG_POL', 'SEXO_ASEG', 'USR_ACT', "COD_POSTAL"]
# 'FECHA_DESDE', dropped for quality reasons
#  'EXISTE_INVEST', Use to create cant_prev_inv

to_drop = list(set(to_drop).intersection(set(df.columns)))
to_drop

['DESCRIPCION_VEHICULO',
 'COD_FRAUDE',
 'COD_RAMO_vigabt_aseg_cif',
 'USR_ACT',
 'EXISTE_INHABILITAR',
 'COD_RAMO',
 'COD_POSTAL',
 'SEXO_ASEG',
 'COD_RIES_vigabt_aseg_cif',
 'COD_ACT_BENEF']

In [26]:
df.drop(columns=to_drop, inplace=True)

In [27]:
df.to_feather("../../../data/processed/1.1 - full merge - date cols transformed - dropped cols.feather")

# Loading from checkpoint - agruping values same meaning

In [28]:
path = "../../../data/processed/1.1 - full merge - date cols transformed - dropped cols.feather"

if not os.path.isfile(path):
    raise Exception(f"file not founded: {path}")

In [29]:
df = pd.read_feather(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Columns: 146 entries, NUM_SECU_EXPED to FECHA_SINI_Elapsed
dtypes: bool(1), datetime64[ns](1), float64(72), int64(2), object(70)
memory usage: 1.3+ GB


## TIPO_EXPED
00X is the same as only X. Example: 001 & 1, 060 & 60

In [30]:
df["TIPO_EXPED"].value_counts(dropna=False)

003    287834
010    246477
3      222318
10     185672
NaN     89647
002     51985
060     42060
2       41523
60      30199
020      7881
050      7494
20       5932
50       5578
001      3517
1        2898
Name: TIPO_EXPED, dtype: int64

In [31]:
# Using zfill to make all 3 len values with 0
df["TIPO_EXPED"] = df["TIPO_EXPED"].astype("str").str.zfill(3)
# other option
#.rjust(3, "0")
df["TIPO_EXPED"].value_counts(dropna=False)

003     510152
010     432149
002      93508
None     89647
060      72259
020      13813
050      13072
001       6415
Name: TIPO_EXPED, dtype: int64

### Save checkpoint

In [32]:
df.to_feather("../../../data/processed/1.1 - full merge - date cols transformed - dropped cols - clean vals.feather")

# TODO: Loading from checkpoint - creating columns
- cant_pol: Cantidad de NUM_SECU_POL por CIF_ID
- cant_prev_fraud: El cliente tiene un caso de fraude previo
- cant_prev_inv: Cantidad de investigaciones previas por cliente.
- cant_sini: Cantidad de siniestros por cliente.
- ratio_sini: Ratio de siniestros por cliente / meses que es cliente.


In [75]:
path = "../../../data/processed/1.1 - full merge - date cols transformed - dropped cols - clean vals.feather"

if not os.path.isfile(path):
    raise Exception(f"file not founded: {path}")

In [76]:
df = pd.read_feather(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Columns: 146 entries, NUM_SECU_EXPED to FECHA_SINI_Elapsed
dtypes: bool(1), datetime64[ns](1), float64(72), int64(2), object(70)
memory usage: 1.3+ GB


## cant_pol: Cantidad de NUM_SECU_POL por CIF_ID

In [33]:
df["CIF_ID"].value_counts(dropna=False)

NaN           1117841
10991015.0      10659
16188039.0       1142
16647307.0       1127
21504168.0       1124
19613011.0        676
21504203.0        626
18349880.0        475
20636433.0        219
22279462.0         71
21504187.0         46
11319603.0         33
11463248.0         28
21016106.0         26
3173895.0          26
21101078.0         23
19070238.0         22
9505138.0          21
3165882.0          21
19848247.0         21
10806326.0         21
3817809.0          20
995830.0           20
1072346.0          20
3543511.0          20
18404480.0         20
21467211.0         19
14495093.0         19
3990073.0          19
3148138.0          19
21408439.0         18
1702715.0          18
14612677.0         18
7904804.0          18
20748608.0         18
10995451.0         17
7875155.0          17
10787271.0         17
1890443.0          17
3178717.0          17
504138.0           17
16490639.0         17
2968960.0          17
8697960.0          17
777705.0           17
2935326.0 

In [34]:
import datetime
"""
cont = 0
# create function to calculate number of NUM_SECU_POL
def numb_pol_by_cifid(row, interval_days=31):
    global cont
    cont = cont + 1
    print(cont)
    if (row["CIF_ID"] is None) or (row["FECHA_SINI"] is None):
        return 0
    return df[(df["CIF_ID"] == row["CIF_ID"]) & ( (df["FECHA_SINI"] > row["FECHA_SINI"] - datetime.timedelta(interval_days)) & (df["FECHA_SINI"] < row["FECHA_SINI"]))]["NUM_SECU_POL"].nunique()
"""

'\ncont = 0\n# create function to calculate number of NUM_SECU_POL\ndef numb_pol_by_cifid(row, interval_days=31):\n    global cont\n    cont = cont + 1\n    print(cont)\n    if (row["CIF_ID"] is None) or (row["FECHA_SINI"] is None):\n        return 0\n    return df[(df["CIF_ID"] == row["CIF_ID"]) & ( (df["FECHA_SINI"] > row["FECHA_SINI"] - datetime.timedelta(interval_days)) & (df["FECHA_SINI"] < row["FECHA_SINI"]))]["NUM_SECU_POL"].nunique()\n'

In [35]:
# TOO SLOW!!!
# cols = ["CIF_ID", "FECHA_SINI"]
# df["cant_pol"] = df.apply(numb_pol_by_cifid, axis=1)
# df.head(100).apply(numb_pol_by_cifid, axis=1)

In [36]:
from tqdm import tqdm_notebook as tqdm

def quantity_by_range_dates(df, date_col, id_group, id_count, interval_months=1, new_col="quantity"):
    min_date = df[date_col].min()
    max_date = df[date_col].max()
    num_of_periods = ((max_date - min_date) / (30*interval_months) ).days + 1 # approx. TODO: make it better  
    df[new_col] = df[id_group]
    for i in tqdm(range(num_of_periods)):
        min_period = min_date + pd.DateOffset(months=(interval_months * i))
        max_period = min_period + pd.DateOffset(months=interval_months)
        tmp = df[((df[date_col] >= min_period) & (df[date_col] < max_period))]
        num_uniques = tmp.groupby(id_group)[id_count].nunique()
        mapper = num_uniques.to_dict()
        idxs = tmp[tmp[id_group].isin(num_uniques.index)].index
        df.loc[idxs, new_col] = df.loc[idxs, new_col].replace(mapper)
    # if CIF_ID IS NULL, replace by 0
    df[new_col].fillna(0, inplace=True)
    return df

In [37]:
df = quantity_by_range_dates(df, "FECHA_SINI", id_group="CIF_ID", id_count="NUM_SECU_POL", new_col="cant_pol")

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




In [38]:
df[df["cant_pol"] == 5.0].head()

Unnamed: 0,NUM_SECU_EXPED,cond_04,cond_05,cond_06,cond_09,cond_11,cond_12,cond_32,cond_37,cond_C1,cond_C10,cond_C11,cond_C12,cond_C13,cond_C14,cond_C15,cond_C16,cond_C17,cond_C18,cond_C19,cond_C2,cond_C20,cond_C21,cond_C3,cond_C4,cond_C5,cond_C6,cond_C7,cond_C8,cond_C9,total_condicion,es_gte_5,EXISTE_FRAUDE,EXISTE_INVEST,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,preg_9,preg_10,preg_11,preg_12,preg_15,preg_16,preg_18,preg_19,preg_22,preg_24,preg_27,preg_28,preg_30,preg_31,preg_32,preg_33,preg_34,preg_36,preg_37,CATASTROFICO,CERRADURA_BAUL,CERRADURA_DERECHA,CERRADURA_IZQUIERDA,CODIGO_BAJA,CODIGO_CARATULA,CODIGO_REAPERTURA,CODIGO_VEHICULO,COD_CAUSA_SINI,COD_POST_OCURRENCIA,COD_POST_POLIZA,COD_POST_TERC,COD_RAMO_sini,COD_RIES_sini,CONDICION_ROBO_EXP50,DANOS_MATERIALES,DESCRIPCION_TIPO,ESTADO_CIVIL,ESTADO_CIVIL_TERC,ESTAD_VEH_ASEG,FALTANTE,FECHA_SINI,MCA_COASEG,MCA_JUICIO,MCA_VIP,METRO,NUM_SECU_POL,OCUPACION_ASEG,SEXO,SEXO_TERC,TELEFONO_TERC,TIPO,TIPO_EXPED,TIPO_LESION,TIPO_LESION_MAXIMA,TIPO_SINIESTRO,USO,dias_entre_denu_y_sini,dist_fformal_fsini,dist_fformal_fdenu,existe_FECHA_FORMAL,CANT_RENOVACION,CAPITAL_ACCESORIOS,CAPITAL_ASEGURADO_COTIZACION,CAPITAL_VEHICULO,COD_COBRO,COD_COBRO_ANTERIOR,COD_PROD,COD_ZONA_CASCO,COD_ZONA_RC,COD_ZONA_ROBO,CONV_COMISIONARIO,CIF_ID,MCA_AGRAVANTE,MCA_EMPLEADO,MCA_MOVIMIENTO,MCA_POLIZA_VIP,NEGOCIO,PRENDARIO,SEGMENTO,cambio_cobro,ANTIG_pol,COD_EST_CIVIL,TIPO_ACTIVIDAD,CLIENTE,CODIGO_NACION,DATECO_TIPO_ACTIVIDAD,DOMICILIO_CODIGO_POSTAL,HABILITADO,FECHA_SINI_Year,FECHA_SINI_Month,FECHA_SINI_Week,FECHA_SINI_Day,FECHA_SINI_Dayofweek,FECHA_SINI_Dayofyear,FECHA_SINI_Is_month_end,FECHA_SINI_Is_month_start,FECHA_SINI_Is_quarter_end,FECHA_SINI_Is_quarter_start,FECHA_SINI_Is_year_end,FECHA_SINI_Is_year_start,FECHA_SINI_Hour,FECHA_SINI_Minute,FECHA_SINI_Second,FECHA_SINI_Elapsed,cant_pol
72644,1629607209999,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NO,,,,,,,36619.0,501.0,1987005.0,1038000.0,,9.0,299.0,,0,CAMIONES Y SEMITRACC,CA,,,S,2017-01-03,False,,N,S,1601619000000.0,99999.0,M,M,,11.0,3,,,B,6.0,1.0,,,False,,0,10390000,0,CC,,73189.0,1.0,1.0,3.0,,21504168.0,S,N,,,,,,False,0.0,,SinDato,,,,,,2017.0,1.0,1.0,3.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1483401600,5.0
73857,1629792329999,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NO,,,,,,,2022.0,501.0,6100002.0,1038000.0,,9.0,171.0,,0,"AUTOS, CAMIONE.RURAL",DI,,,S,2017-01-05,False,,N,S,1601619000000.0,99999.0,M,M,,3.0,3,,,B,6.0,0.0,,,False,,0,10390000,0,CC,,73189.0,1.0,1.0,3.0,,21504168.0,S,N,,,,,,False,0.0,,SinDato,,,,,,2017.0,1.0,1.0,5.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1483574400,5.0
73866,1629793429999,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NO,,,,,,,2022.0,501.0,6100002.0,1038000.0,,9.0,171.0,,0,"AUTOS, CAMIONE.RURAL",DI,,,S,2017-01-05,False,,N,S,1601619000000.0,99999.0,M,M,,3.0,3,,,B,6.0,0.0,,,False,,0,10390000,0,CC,,73189.0,1.0,1.0,3.0,,21504168.0,S,N,,,,,,False,0.0,,SinDato,,,,,,2017.0,1.0,1.0,5.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1483574400,5.0
74950,1630028439999,,,,,,,0.0,,,0.0,2.0,2.0,3.0,,,,,,,,,,,,,,,,,7.0,True,,,,,,,,,,,,,,,N,,,,,,,,,N,1.0,S,N,,,NO,N,N,N,,,,2067.0,209.0,1826005.0,1038000.0,,9.0,102.0,,0,"AUTOS, CAMIONE.RURAL",CA,,3.0,S,2017-01-05,False,,N,S,1613544000000.0,99999.0,M,,,3.0,60,,,I,6.0,1.0,,,False,,0,13440000,13960000,CC,,73189.0,1.0,1.0,3.0,,21504168.0,,N,,,,,,False,0.0,,SinDato,,,,,,2017.0,1.0,1.0,5.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1483574400,5.0
75039,1630036109999,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NO,,,,,,,18687.0,502.0,5000050.0,1038000.0,,9.0,244.0,,0,"AUTOS, CAMIONE.RURAL",SO,,,S,2017-01-05,False,,N,S,1601619000000.0,99999.0,M,M,,3.0,3,,,C,6.0,1.0,,,False,,0,10390000,0,CC,,73189.0,1.0,1.0,3.0,,21504168.0,S,N,,,,,,False,0.0,,SinDato,,,,,,2017.0,1.0,1.0,5.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1483574400,5.0


In [39]:
df["FECHA_SINI"].min(), df["FECHA_SINI"].min() + pd.DateOffset(months=1)

(Timestamp('2017-01-01 00:00:00'), Timestamp('2017-02-01 00:00:00'))

In [40]:
tmp = df[(df["CIF_ID"] == 21504168.0) & (df["FECHA_SINI"] >= df["FECHA_SINI"].min()) & (df["FECHA_SINI"] < df["FECHA_SINI"].min() + pd.DateOffset(months=1))]
tmp["NUM_SECU_POL"].nunique()

5

In [41]:
df["cant_pol"].value_counts(dropna=False)

0.0     1117841
1.0       96740
2.0        9109
3.0        3932
5.0        1064
4.0         545
29.0        145
8.0         142
6.0         134
7.0         116
34.0        104
12.0        101
39.0        100
9.0          98
38.0         87
31.0         85
11.0         82
32.0         82
28.0         82
13.0         74
25.0         68
30.0         66
23.0         54
24.0         54
19.0         50
10.0         31
16.0         29
Name: cant_pol, dtype: int64

### saving checkpoint

In [42]:
file = "../../../data/processed/1.1 - full merge - date cols transformed - dropped cols - clean vals - new cols.feather"

df.to_feather(file)

## cant_prev_fraud: El cliente tiene un caso de fraude previo

In [None]:
file = "../../../data/processed/1.1 - full merge - date cols transformed - dropped cols - clean vals - new cols.feather"

df = pd.read_feather(file)

**descartar inhabilitar no se puede usar**

## cant_prev_inv: Cantidad de investigaciones previas por cliente.

In [88]:
file = "../../../data/processed/1.1 - full merge - date cols transformed - dropped cols - clean vals - new cols.feather"

df = pd.read_feather(file)

In [89]:
df["EXISTE_INVEST"].value_counts()

True     8088
False    4654
Name: EXISTE_INVEST, dtype: int64

In [95]:
# df = quantity_by_range_dates(df, "FECHA_SINI", id_group="CIF_ID", id_count="EXISTE_INVEST", interval_months=18, new_col="cant_prev_inv")

**Target related**

In [96]:
#df["cant_prev_inv"].value_counts(dropna=False)

## cant_sini: Cantidad de siniestros por cliente.

In [43]:
file = "../../../data/processed/1.1 - full merge - date cols transformed - dropped cols - clean vals - new cols.feather"

df = pd.read_feather(file)

In [44]:
df = quantity_by_range_dates(df, "FECHA_SINI", id_group="CIF_ID", id_count="NUM_SECU_EXPED", interval_months=12, new_col="cant_sini_1y")

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




In [45]:
df["cant_sini_1y"].value_counts(dropna=False)

0.0       1117841
2.0         30012
1.0         28906
3.0         15315
4.0          9812
5.0          5260
4441.0       4441
3938.0       3938
6.0          3030
2280.0       2280
7.0          1820
8.0          1088
887.0         887
803.0         803
9.0           729
637.0         637
505.0         505
10.0          430
415.0         415
321.0         321
276.0         276
259.0         259
11.0          231
219.0         219
211.0         211
177.0         177
154.0         154
141.0         141
137.0         137
12.0          132
86.0           86
82.0           82
79.0           79
13.0           78
71.0           71
16.0           64
15.0           30
25.0           25
23.0           23
21.0           21
18.0           18
17.0           17
14.0           14
Name: cant_sini_1y, dtype: int64

In [46]:
df[df["cant_sini_1y"] == 4441.0].head()

Unnamed: 0,NUM_SECU_EXPED,cond_04,cond_05,cond_06,cond_09,cond_11,cond_12,cond_32,cond_37,cond_C1,cond_C10,cond_C11,cond_C12,cond_C13,cond_C14,cond_C15,cond_C16,cond_C17,cond_C18,cond_C19,cond_C2,cond_C20,cond_C21,cond_C3,cond_C4,cond_C5,cond_C6,cond_C7,cond_C8,cond_C9,total_condicion,es_gte_5,EXISTE_FRAUDE,EXISTE_INVEST,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,preg_9,preg_10,preg_11,preg_12,preg_15,preg_16,preg_18,preg_19,preg_22,preg_24,preg_27,preg_28,preg_30,preg_31,preg_32,preg_33,preg_34,preg_36,preg_37,CATASTROFICO,CERRADURA_BAUL,CERRADURA_DERECHA,CERRADURA_IZQUIERDA,CODIGO_BAJA,CODIGO_CARATULA,CODIGO_REAPERTURA,CODIGO_VEHICULO,COD_CAUSA_SINI,COD_POST_OCURRENCIA,COD_POST_POLIZA,COD_POST_TERC,COD_RAMO_sini,COD_RIES_sini,CONDICION_ROBO_EXP50,DANOS_MATERIALES,DESCRIPCION_TIPO,ESTADO_CIVIL,ESTADO_CIVIL_TERC,ESTAD_VEH_ASEG,FALTANTE,FECHA_SINI,MCA_COASEG,MCA_JUICIO,MCA_VIP,METRO,NUM_SECU_POL,OCUPACION_ASEG,SEXO,SEXO_TERC,TELEFONO_TERC,TIPO,TIPO_EXPED,TIPO_LESION,TIPO_LESION_MAXIMA,TIPO_SINIESTRO,USO,dias_entre_denu_y_sini,dist_fformal_fsini,dist_fformal_fdenu,existe_FECHA_FORMAL,CANT_RENOVACION,CAPITAL_ACCESORIOS,CAPITAL_ASEGURADO_COTIZACION,CAPITAL_VEHICULO,COD_COBRO,COD_COBRO_ANTERIOR,COD_PROD,COD_ZONA_CASCO,COD_ZONA_RC,COD_ZONA_ROBO,CONV_COMISIONARIO,CIF_ID,MCA_AGRAVANTE,MCA_EMPLEADO,MCA_MOVIMIENTO,MCA_POLIZA_VIP,NEGOCIO,PRENDARIO,SEGMENTO,cambio_cobro,ANTIG_pol,COD_EST_CIVIL,TIPO_ACTIVIDAD,CLIENTE,CODIGO_NACION,DATECO_TIPO_ACTIVIDAD,DOMICILIO_CODIGO_POSTAL,HABILITADO,FECHA_SINI_Year,FECHA_SINI_Month,FECHA_SINI_Week,FECHA_SINI_Day,FECHA_SINI_Dayofweek,FECHA_SINI_Dayofyear,FECHA_SINI_Is_month_end,FECHA_SINI_Is_month_start,FECHA_SINI_Is_quarter_end,FECHA_SINI_Is_quarter_start,FECHA_SINI_Is_year_end,FECHA_SINI_Is_year_start,FECHA_SINI_Hour,FECHA_SINI_Minute,FECHA_SINI_Second,FECHA_SINI_Elapsed,cant_pol,cant_sini_1y
556231,1767526989999,,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,3.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NO,N,N,N,,,,12797.0,502.0,1414000.0,1002000.0,,9.0,351.0,,0,"AUTOS, CAMIONE.RURAL",CA,,5.0,S,2018-01-01,True,,N,S,1701160000000.0,5.0,M,,,3.0,10,,,C,6.0,2.0,,,False,,0,,0,CC,,5570.0,1.0,1.0,3.0,,10991015.0,,N,,,,,,False,0.0,,SinDato,,,,,,2018.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1514764800,5.0,4441.0
556233,1767527079999,0.0,,,,0.0,,,,,,,,3.0,,,,,,,,,,,,,0.0,,0.0,0.0,3.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NO,,,,,,,12797.0,502.0,1414000.0,1439000.0,1293000.0,9.0,351.0,,0,"AUTOS, CAMIONE.RURAL",CA,SO,5.0,N,2018-01-01,True,,N,S,1701160000000.0,99999.0,M,M,,3.0,2,L,,C,6.0,2.0,,,False,,0,,0,CC,,5570.0,1.0,1.0,3.0,,10991015.0,,N,,,,,,False,0.0,,SinDato,,,,,,2018.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1514764800,5.0,4441.0
556889,1767566439999,,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,3.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NO,N,N,N,,,,6901.0,501.0,1414000.0,1002000.0,,9.0,359.0,,0,"AUTOS, CAMIONE.RURAL",SO,,,S,2018-01-01,True,,N,S,1701160000000.0,99999.0,M,,,3.0,10,,,C,6.0,3.0,,,False,,0,,0,CC,,5570.0,1.0,1.0,3.0,,10991015.0,,N,,,,,,False,0.0,,SinDato,,,,,,2018.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1514764800,5.0,4441.0
556890,1767566479999,0.0,,,,0.0,,,,,,,,3.0,,,,,,,,,,,,,0.0,,0.0,0.0,3.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NO,,,,,,,6901.0,501.0,1414000.0,1439000.0,,9.0,359.0,,0,"AUTOS, CAMIONE.RURAL",SO,,,S,2018-01-01,True,,N,S,1701160000000.0,99999.0,M,M,,3.0,2,L,,C,6.0,3.0,,,False,,0,,0,CC,,5570.0,1.0,1.0,3.0,,10991015.0,,N,,,,,,False,0.0,,SinDato,,,,,,2018.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1514764800,5.0,4441.0
557932,1767648999999,,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,3.0,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NO,N,N,N,,,,27826.0,505.0,1200000.0,1002000.0,,9.0,829.0,,0,JEEPS DE +4 CIL.Y PI,SO,,,S,2018-01-01,True,,N,S,1701160000000.0,99999.0,M,,,9.0,10,,,N,6.0,3.0,,,False,,0,,0,CC,,5570.0,1.0,1.0,3.0,,10991015.0,,N,,,,,,False,0.0,,SinDato,,,,,,2018.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1514764800,5.0,4441.0


In [47]:
tmp = df[(df["CIF_ID"] == 10991015.0) & (df["FECHA_SINI"] >= df["FECHA_SINI"].min()) & (df["FECHA_SINI"] < df["FECHA_SINI"].min() + pd.DateOffset(months=12))]
tmp["NUM_SECU_EXPED"].nunique()

3938

### Saving checkpoint

In [48]:
file = "../../../data/processed/1.1 - full merge - date cols transformed - dropped cols - clean vals - new cols.feather"

df.to_feather(file)

## ratio_sini: Ratio de siniestros por cliente / meses que es cliente.

In [26]:
df["ANTIG_pol"].head()

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: ANTIG_pol, dtype: float64

**Descarted for quality issues: FECHA_DESDE**

# Loading from checkpoint - working with object columns
**Date already pre-processed**

In [49]:
file = "../../../data/processed/1.1 - full merge - date cols transformed - dropped cols - clean vals - new cols.feather"

if not os.path.isfile(file):
    raise Exception(f"file not founded: {file}")

In [50]:
df = pd.read_feather(file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Columns: 148 entries, NUM_SECU_EXPED to cant_sini_1y
dtypes: bool(1), datetime64[ns](1), float64(74), int64(2), object(70)
memory usage: 1.3+ GB


In [51]:
obj_cols = list(df.select_dtypes("object").columns)
print(obj_cols)

['EXISTE_FRAUDE', 'EXISTE_INVEST', 'preg_1', 'preg_2', 'preg_3', 'preg_4', 'preg_5', 'preg_6', 'preg_7', 'preg_8', 'preg_9', 'preg_10', 'preg_11', 'preg_12', 'preg_15', 'preg_16', 'preg_18', 'preg_19', 'preg_22', 'preg_24', 'preg_27', 'preg_28', 'preg_30', 'preg_31', 'preg_32', 'preg_33', 'preg_34', 'preg_36', 'preg_37', 'CATASTROFICO', 'CERRADURA_BAUL', 'CERRADURA_DERECHA', 'CERRADURA_IZQUIERDA', 'DANOS_MATERIALES', 'DESCRIPCION_TIPO', 'ESTADO_CIVIL', 'ESTADO_CIVIL_TERC', 'FALTANTE', 'MCA_COASEG', 'MCA_JUICIO', 'MCA_VIP', 'METRO', 'SEXO', 'SEXO_TERC', 'TELEFONO_TERC', 'TIPO_EXPED', 'TIPO_LESION', 'TIPO_LESION_MAXIMA', 'TIPO_SINIESTRO', 'existe_FECHA_FORMAL', 'CAPITAL_ACCESORIOS', 'CAPITAL_ASEGURADO_COTIZACION', 'CAPITAL_VEHICULO', 'COD_COBRO', 'COD_COBRO_ANTERIOR', 'CONV_COMISIONARIO', 'MCA_AGRAVANTE', 'MCA_EMPLEADO', 'MCA_MOVIMIENTO', 'MCA_POLIZA_VIP', 'NEGOCIO', 'PRENDARIO', 'SEGMENTO', 'cambio_cobro', 'COD_EST_CIVIL', 'TIPO_ACTIVIDAD', 'CLIENTE', 'CODIGO_NACION', 'DATECO_TIPO_ACTIV

In [52]:
bin_cols = []
for col in obj_cols:
    print("**********--*****"*3)
    print(df[col].value_counts(dropna=False).head(6))
    if(df[col].nunique() <= 3):
        bin_cols.append(col)

**********--***************--***************--*****
NaN      1218273
False       8651
True        4091
Name: EXISTE_FRAUDE, dtype: int64
**********--***************--***************--*****
NaN      1218273
True        8088
False       4654
Name: EXISTE_INVEST, dtype: int64
**********--***************--***************--*****
NaN    1228474
ADD       1360
ATD        496
ATI        391
NO         152
ATM        142
Name: preg_1, dtype: int64
**********--***************--***************--*****
NaN    1178130
M        39272
A1        6517
B         3591
PE        2969
A2         249
Name: preg_2, dtype: int64
**********--***************--***************--*****
NaN    1179345
N        48070
S         3600
Name: preg_3, dtype: int64
**********--***************--***************--*****
NaN    1176789
NH       44820
DP        8643
DJ         763
Name: preg_4, dtype: int64
**********--***************--***************--*****
NaN    1164427
N        39892
S        26696
Name: preg_5, dtype: int64
*

NaN          120694
189000,00     11815
210600,00      9721
226800,00      9109
183600,00      8749
237600,00      8639
Name: CAPITAL_ASEGURADO_COTIZACION, dtype: int64
**********--***************--***************--*****
NaN          95428
0,00         35128
189000,00     8035
226800,00     6479
183600,00     6087
270000,00     5882
Name: CAPITAL_VEHICULO, dtype: int64
**********--***************--***************--*****
TM     424687
PP     367743
CC     205178
BA     131030
NaN     95428
TA       4376
Name: COD_COBRO, dtype: int64
**********--***************--***************--*****
TM     390436
PP     342899
NaN    322122
BA     115224
CC      53566
TA       4281
Name: COD_COBRO_ANTERIOR, dtype: int64
**********--***************--***************--*****
nan           568226
1000100       213011
NaN            95428
CS_1000155     42749
CS_1000111     39121
CS_1000112     39005
Name: CONV_COMISIONARIO, dtype: int64
**********--***************--***************--*****
N      676031
S    

## Analyzing each case

### Transform boolean None == False

In [53]:
df["EXISTE_FRAUDE"] = df["EXISTE_FRAUDE"] == True
# df["EXISTE_INHABILITAR"] = df["EXISTE_INHABILITAR"] == True

In [9]:
# IN preguntas None means it shouldn't be asked, so None != False
# transform letters like "S" to boolean
# df["preg_5"] = df["preg_5"] == "S"
# df["preg_24"] = df["preg_24"] == "S"
# df["preg_30"] = df["preg_30"] == "S"
# df["preg_31"] = df["preg_31"] == "S"
# df["preg_33"] = df["preg_33"] == "S"

In [10]:
# this variable is text
# df["preg_34"] is N & NO the same??
# idx_rep = df[df["preg_34"] == "NO"].index
# df.loc[idx_rep, "preg_34"] = "N"
# df["preg_34"].value_counts(dropna=False)

In [54]:
idx_rep = df[df["TIPO_ACTIVIDAD"]== "SinDato"].index
df.loc[idx_rep, "TIPO_ACTIVIDAD"] = np.nan
df["TIPO_ACTIVIDAD"].value_counts(dropna=False)

NaN        1168249
EMPL         18928
JUBI         14395
OTRO          5937
ADMI          4594
COME          3066
ADNI          1903
ABOG          1378
AMAD          1372
611010        1142
291000        1127
DOCE          1110
MEDI          1090
FUNC           475
INGE           466
EMPR           414
CONT           334
ARQU           277
ESCB           270
841100         260
DIPLO          198
VEND           157
MILI           133
ADEM           129
741101         123
ENCE           122
SICO           113
DESE            94
POLI            89
ESCR            85
ODON            83
ALBA            75
PERI            71
ELECTRI         66
CUEN            66
MECA            64
CONSO           61
CHOF            58
CONS            54
PSIC            52
JUEZ            48
INDU            48
BIOQ            46
AGRO            46
691001          46
PLOM            44
851120          43
ASES            43
TECN            43
ACTR            41
VIAJ            41
OPER            40
ENFE        

### Postal code to groups by zone

In [55]:
df["COD_POST_POLIZA"].value_counts()

1900000.0    20817
1406000.0    17364
2000005.0    17224
1407000.0    15700
5000050.0    15011
7600002.0    14224
1425000.0    13986
1439000.0    13566
5000000.0    11544
1419000.0    10999
1426000.0    10906
4000028.0    10430
1431000.0    10232
1416000.0    10123
1824002.0    10077
1414000.0     9793
1704001.0     9070
1428000.0     9069
1417000.0     9009
1408000.0     8616
1828008.0     8556
9000010.0     8535
9420016.0     8042
1424000.0     7488
1878000.0     7460
1832007.0     7419
9410012.0     7376
4400000.0     7353
1429000.0     7151
1430000.0     7141
1712000.0     7048
3500019.0     6838
8300001.0     6822
1663015.0     6701
1744005.0     6696
1722006.0     6655
1714010.0     6584
1884018.0     6540
1405000.0     6532
1754001.0     6411
1440000.0     6370
1437000.0     6195
3000022.0     6018
1636002.0     5991
1888032.0     5849
1842010.0     5764
3600014.0     5742
1427000.0     5685
1757000.0     5647
1708007.0     5531
1602000.0     5451
1870011.0     5427
1653000.0   

In [56]:
df["COD_POST_POLIZA"].min(), df["COD_POST_POLIZA"].max()

(5012.0, 9420017.0)

In [57]:
tmp = df[df["COD_POST_POLIZA"] >= 1000000]
tmp.shape

(1141366, 148)

In [58]:
df["COD_POST_POLIZA"] = df["COD_POST_POLIZA"].astype(str)
df["COD_POST_POLIZA"] = df["COD_POST_POLIZA"].str.replace(".0","", regex=False)

In [59]:
df.loc[tmp.index, "COD_POST_POLIZA"].head()

67628    3500019
67631    1723001
67632    1101000
67633    1894002
67634    1894002
Name: COD_POST_POLIZA, dtype: object

In [60]:
# removing last 3 digits
df.loc[tmp.index, "COD_POST_POLIZA"] = df.loc[tmp.index, "COD_POST_POLIZA"].str[:-3]

In [61]:
# returning to float for the mapper
df["COD_POST_POLIZA"] = df["COD_POST_POLIZA"].astype(float)

In [62]:
df["COD_POST_POLIZA"].min(), df["COD_POST_POLIZA"].max()

(1000.0, 9420.0)

In [63]:
# COD_POST_OCURRENCIA
col = "COD_POST_OCURRENCIA"
tmp = df[df[col] >= 1000000]
df[col] = df[col].astype(str)
df[col] = df[col].str.replace(".0","", regex=False)
# removing last 3 digits
df.loc[tmp.index, col] = df.loc[tmp.index, col].str[:-3]
# returning to float for the mapper
df[col] = df[col].astype(float)
df[col].min(), df[col].max()

(1.0, 9420.0)

In [64]:
(df["COD_POST_OCURRENCIA"] < 1000).sum()

2730

In [65]:
# COD_POST_TERC
col = "COD_POST_TERC"
tmp = df[df[col] >= 1000000]
df[col] = df[col].astype(str)
df[col] = df[col].str.replace(".0","", regex=False)
# removing last 3 digits
df.loc[tmp.index, col] = df.loc[tmp.index, col].str[:-3]
# returning to float for the mapper
df[col] = df[col].astype(float)
df[col].min(), df[col].max()

(1000.0, 9420.0)

### COD_POSTAL mapper to groups

In [66]:
path_mapper = "../../../src/features/cod_postal_to_cluster_mapper.pickle"

if not os.path.isfile(path_mapper):
    raise Exception("File doesn't exists")

In [67]:
with open(path_mapper, "rb") as handle:
    mapper = pickle.load(handle)

In [68]:
mapper

{'DP_CLUSTER_2019': {1343: 17.0,
  1344: 17.0,
  1345: 17.0,
  1347: 17.0,
  1348: 17.0,
  1349: 17.0,
  1350: 17.0,
  1351: 17.0,
  1352: 17.0,
  1353: 17.0,
  1354: 17.0,
  1355: 17.0,
  1356: 17.0,
  1357: 17.0,
  1358: 17.0,
  1359: 17.0,
  1360: 17.0,
  1361: 17.0,
  1362: 17.0,
  1363: 17.0,
  1364: 17.0,
  1365: 17.0,
  1366: 17.0,
  1367: 17.0,
  1368: 17.0,
  1370: 17.0,
  1371: 17.0,
  1372: 17.0,
  1373: 17.0,
  1374: 17.0,
  1375: 17.0,
  1376: 17.0,
  1377: 17.0,
  1378: 17.0,
  1379: 17.0,
  1380: 17.0,
  1382: 17.0,
  1383: 17.0,
  1384: 17.0,
  1385: 17.0,
  1386: 17.0,
  1387: 17.0,
  1388: 17.0,
  1389: 17.0,
  1390: 17.0,
  1391: 17.0,
  1392: 17.0,
  1393: 17.0,
  1394: 17.0,
  1395: 17.0,
  1396: 17.0,
  1397: 17.0,
  1398: 17.0,
  1399: 17.0,
  1400: 17.0,
  1401: 17.0,
  1402: 17.0,
  1403: 17.0,
  1404: 17.0,
  1405: 11.0,
  1406: 15.0,
  1407: 25.0,
  1408: 24.0,
  1409: 17.0,
  1410: 17.0,
  1411: 17.0,
  1412: 17.0,
  1413: 17.0,
  1414: 21.0,
  1415: 17.0,
 

In [28]:
cols_to_map = ["COD_POST_POLIZA", "COD_POST_OCURRENCIA", "COD_POST_TERC"]

In [35]:
for col in cols_to_map:
    for k, v in mapper.items():
        new_col = col + "_" + k
        df[new_col] = df[col]
        df[new_col] = df[new_col].map(v)

In [41]:
df.head()

Unnamed: 0,NUM_SECU_EXPED,cond_04,cond_05,cond_06,cond_09,cond_11,cond_12,cond_32,cond_37,cond_C1,cond_C10,cond_C11,cond_C12,cond_C13,cond_C14,cond_C15,cond_C16,cond_C17,cond_C18,cond_C19,cond_C2,cond_C20,cond_C21,cond_C3,cond_C4,cond_C5,cond_C6,cond_C7,cond_C8,cond_C9,total_condicion,es_gte_5,EXISTE_FRAUDE,EXISTE_INVEST,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,preg_9,preg_10,preg_11,preg_12,preg_15,preg_16,preg_18,preg_19,preg_22,preg_24,preg_27,preg_28,preg_30,preg_31,preg_32,preg_33,preg_34,preg_36,preg_37,CATASTROFICO,CERRADURA_BAUL,CERRADURA_DERECHA,CERRADURA_IZQUIERDA,CODIGO_BAJA,CODIGO_CARATULA,CODIGO_REAPERTURA,CODIGO_VEHICULO,COD_CAUSA_SINI,COD_POST_OCURRENCIA,COD_POST_POLIZA,COD_POST_TERC,COD_RAMO_sini,COD_RIES_sini,CONDICION_ROBO_EXP50,DANOS_MATERIALES,DESCRIPCION_TIPO,ESTADO_CIVIL,ESTADO_CIVIL_TERC,ESTAD_VEH_ASEG,FALTANTE,FECHA_SINI,MCA_COASEG,MCA_JUICIO,MCA_VIP,METRO,NUM_SECU_POL,OCUPACION_ASEG,SEXO,SEXO_TERC,TELEFONO_TERC,TIPO,TIPO_EXPED,TIPO_LESION,TIPO_LESION_MAXIMA,TIPO_SINIESTRO,USO,dias_entre_denu_y_sini,dist_fformal_fsini,dist_fformal_fdenu,existe_FECHA_FORMAL,CANT_RENOVACION,CAPITAL_ACCESORIOS,CAPITAL_ASEGURADO_COTIZACION,CAPITAL_VEHICULO,COD_COBRO,COD_COBRO_ANTERIOR,COD_PROD,COD_ZONA_CASCO,COD_ZONA_RC,COD_ZONA_ROBO,CONV_COMISIONARIO,CIF_ID,MCA_AGRAVANTE,MCA_EMPLEADO,MCA_MOVIMIENTO,MCA_POLIZA_VIP,NEGOCIO,PRENDARIO,SEGMENTO,cambio_cobro,ANTIG_pol,COD_EST_CIVIL,TIPO_ACTIVIDAD,CLIENTE,CODIGO_NACION,DATECO_TIPO_ACTIVIDAD,DOMICILIO_CODIGO_POSTAL,HABILITADO,FECHA_SINI_Year,FECHA_SINI_Month,FECHA_SINI_Week,FECHA_SINI_Day,FECHA_SINI_Dayofweek,FECHA_SINI_Dayofyear,FECHA_SINI_Is_month_end,FECHA_SINI_Is_month_start,FECHA_SINI_Is_quarter_end,FECHA_SINI_Is_quarter_start,FECHA_SINI_Is_year_end,FECHA_SINI_Is_year_start,FECHA_SINI_Hour,FECHA_SINI_Minute,FECHA_SINI_Second,FECHA_SINI_Elapsed,cant_pol,cant_sini_1y,COD_POST_POLIZA_DP_CLUSTER_2019,COD_POST_POLIZA_RT_CLUSTER_2019,COD_POST_POLIZA_RC_CLUSTER_2019,COD_POST_OCURRENCIA_DP_CLUSTER_2019,COD_POST_OCURRENCIA_RT_CLUSTER_2019,COD_POST_OCURRENCIA_RC_CLUSTER_2019,COD_POST_TERC_DP_CLUSTER_2019,COD_POST_TERC_RT_CLUSTER_2019,COD_POST_TERC_RC_CLUSTER_2019
0,516359994,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,1.0,3.0,4.0,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037,0.0,0.0,,,,,,,,,
1,1762619999,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037,0.0,0.0,,,,,,,,,
2,4768809999,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,0.0,3.0,3.0,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037,0.0,0.0,,,,,,,,,
3,6444209999,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,0.0,3.0,3.0,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037,0.0,0.0,,,,,,,,,
4,7529469970,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,1.0,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037,0.0,0.0,,,,,,,,,


In [43]:
df.loc[:, "COD_POST_TERC_RC_CLUSTER_2019"].value_counts(dropna=False)

NaN     1038407
21.0      20732
24.0      19956
18.0      17068
25.0      16279
17.0       9459
6.0        8202
26.0       7939
22.0       7682
20.0       6824
16.0       6448
23.0       5694
10.0       5592
8.0        5592
11.0       5502
30.0       5319
19.0       5290
28.0       4405
13.0       4152
29.0       3724
3.0        3664
9.0        3420
5.0        3164
7.0        2971
15.0       2900
27.0       2803
12.0       2562
4.0        2302
14.0       2054
2.0         639
1.0         270
Name: COD_POST_TERC_RC_CLUSTER_2019, dtype: int64

## Convert object cols to categorical improving the order

In [69]:
obj_cols = df.select_dtypes("object").columns
print(obj_cols)

Index(['EXISTE_INVEST', 'preg_1', 'preg_2', 'preg_3', 'preg_4', 'preg_5',
       'preg_6', 'preg_7', 'preg_8', 'preg_9', 'preg_10', 'preg_11', 'preg_12',
       'preg_15', 'preg_16', 'preg_18', 'preg_19', 'preg_22', 'preg_24',
       'preg_27', 'preg_28', 'preg_30', 'preg_31', 'preg_32', 'preg_33',
       'preg_34', 'preg_36', 'preg_37', 'CATASTROFICO', 'CERRADURA_BAUL',
       'CERRADURA_DERECHA', 'CERRADURA_IZQUIERDA', 'DANOS_MATERIALES',
       'DESCRIPCION_TIPO', 'ESTADO_CIVIL', 'ESTADO_CIVIL_TERC', 'FALTANTE',
       'MCA_COASEG', 'MCA_JUICIO', 'MCA_VIP', 'METRO', 'SEXO', 'SEXO_TERC',
       'TELEFONO_TERC', 'TIPO_EXPED', 'TIPO_LESION', 'TIPO_LESION_MAXIMA',
       'TIPO_SINIESTRO', 'existe_FECHA_FORMAL', 'CAPITAL_ACCESORIOS',
       'CAPITAL_ASEGURADO_COTIZACION', 'CAPITAL_VEHICULO', 'COD_COBRO',
       'COD_COBRO_ANTERIOR', 'CONV_COMISIONARIO', 'MCA_AGRAVANTE',
       'MCA_EMPLEADO', 'MCA_MOVIMIENTO', 'MCA_POLIZA_VIP', 'NEGOCIO',
       'PRENDARIO', 'SEGMENTO', 'cambio_cobro', '

In [70]:
df[obj_cols] = df[obj_cols].astype("category")

In [71]:
bin_cols = []
for col in obj_cols:
    print("**********--*****"*3)
    print(col)
    print(df[col].cat.categories)
    if(df[col].nunique() <= 3):
        bin_cols.append(col)

**********--***************--***************--*****
EXISTE_INVEST
Index([False, True], dtype='object')
**********--***************--***************--*****
preg_1
Index(['ADD', 'ATD', 'ATI', 'ATM', 'NO'], dtype='object')
**********--***************--***************--*****
preg_2
Index(['A1', 'A2', 'B', 'C', 'M', 'NO', 'PE'], dtype='object')
**********--***************--***************--*****
preg_3
Index(['N', 'S'], dtype='object')
**********--***************--***************--*****
preg_4
Index(['DJ', 'DP', 'NH'], dtype='object')
**********--***************--***************--*****
preg_5
Index(['N', 'S'], dtype='object')
**********--***************--***************--*****
preg_6
Index(['D', 'FA', 'FR', 'G', 'PC'], dtype='object')
**********--***************--***************--*****
preg_7
Index(['11', '12', '13', '14', '15', '16', '18', '21', '31', '32', '33', '34',
       '38', '41', '42', '43', '44', '45', '46', '47', '48', '51', '52', '53',
       '54', '55', '56', '57', '58', '71', 

**********--***************--***************--*****
NEGOCIO
Index(['AMV1', 'AMV2', 'AMV3', 'AMV4', 'AMV5', 'BBVPP', 'BFYPF', 'CETE',
       'COSAN', 'D10B', 'D10C', 'D10D', 'D10E', 'D15B', 'D15C', 'D15D', 'D15E',
       'D5B', 'D5C', 'D5D', 'D5E', 'DEF', 'DEFB', 'DEFC', 'DEFD', 'DEFE',
       'DTO10', 'DTO20', 'DTO30', 'EMP', 'EMPB', 'EMPBS', 'EMPCO', 'EMPGA',
       'FINAN', 'GERE', 'M10', 'M15', 'M20', 'MOSTC', 'MOSTG', 'MSHOT', 'NCLG',
       'NCLM', 'OLINE', 'P2BBV', 'PBBVA', 'PP', 'PPEMP', 'PRES1', 'PREST',
       'PRMM', 'PSAPP', 'PVWP', 'PVWPP', 'R10B', 'R5B', 'RE10C', 'REC10',
       'REC5', 'STDMO', 'TCFAP', 'TIE1G', 'TIE1M', 'TIE2G', 'TIE2M', 'TIE3G',
       'TIE3M', 'TIE4G', 'TIE4M', 'TIE5G', 'TIE5M', 'TLMK', 'TLMKA', 'TMKIN',
       'TMKOU'],
      dtype='object')
**********--***************--***************--*****
PRENDARIO
Index(['N', 'S'], dtype='object')
**********--***************--***************--*****
SEGMENTO
Index(['ALTO', 'BAJO', 'BBAJO', 'MALTO', 'MBAJO', 'MEDIO

### setting logical order

In [72]:
# preg_37 : ['0', '1', '2', 'N', 'S']
print(df["preg_37"].cat.categories)
df["preg_37"] = pd.Categorical(df["preg_37"], ['N', '0', '1', 'S', '2'])
print(df["preg_37"].cat.categories)

Index(['0', '1', '2', 'N', 'S'], dtype='object')
Index(['N', '0', '1', 'S', '2'], dtype='object')


In [73]:
# SEGMENTO : ['ALTO', 'BAJO', 'BBAJO', 'MALTO', 'MBAJO', 'MEDIO', 'MMBAJ']
print(df["SEGMENTO"].cat.categories)
df["SEGMENTO"] = pd.Categorical(df["SEGMENTO"], ['MMBAJ', 'MBAJO', 'BBAJO', 'BAJO', 'MEDIO', 'ALTO', 'MALTO',])
print(df["SEGMENTO"].cat.categories)

Index(['ALTO', 'BAJO', 'BBAJO', 'MALTO', 'MBAJO', 'MEDIO', 'MMBAJ'], dtype='object')
Index(['MMBAJ', 'MBAJO', 'BBAJO', 'BAJO', 'MEDIO', 'ALTO', 'MALTO'], dtype='object')


In [74]:
# COD_EST_CIVIL : ['CA', 'CO', 'DI', 'NC', 'NO', 'SE', 'SO', 'UC', 'UL', 'VI']
print(df["COD_EST_CIVIL"].cat.categories)
df["COD_EST_CIVIL"] = pd.Categorical(df["SEGMENTO"], ['NC', 'SO', 'NO', 'CO', 'CA', 'UC', 'UL', 'SE', 'DI', 'VI'])
print(df["COD_EST_CIVIL"].cat.categories)

Index(['CA', 'CO', 'DI', 'NC', 'NO', 'SE', 'SO', 'UC', 'UL', 'VI'], dtype='object')
Index(['NC', 'SO', 'NO', 'CO', 'CA', 'UC', 'UL', 'SE', 'DI', 'VI'], dtype='object')


In [75]:
def categorical_mapper(df):
    map_dict = {}
    cat_cols = df.select_dtypes("category").columns
    for col in cat_cols:
        tmp_dict = {}
        for i, v in zip(df[col].cat.codes.unique(), df[col].unique()):
            tmp_dict[v] = i
        map_dict[col] = tmp_dict
    return map_dict

In [76]:
categorical_mapper = categorical_mapper(df)
categorical_mapper

{'EXISTE_INVEST': {nan: -1, True: 1, False: 0},
 'preg_1': {nan: -1, 'ADD': 0, 'NO': 4, 'ATM': 3, 'ATD': 1, 'ATI': 2},
 'preg_2': {nan: -1,
  'NO': 5,
  'M': 4,
  'A1': 0,
  'PE': 6,
  'B': 2,
  'A2': 1,
  'C': 3},
 'preg_3': {nan: -1, 'N': 0, 'S': 1},
 'preg_4': {nan: -1, 'NH': 2, 'DP': 1, 'DJ': 0},
 'preg_5': {nan: -1, 'N': 0, 'S': 1},
 'preg_6': {nan: -1, 'D': 0, 'FA': 1, 'G': 3, 'PC': 4, 'FR': 2},
 'preg_7': {nan: -1,
  '91': 34,
  '54': 24,
  '47': 19,
  '11': 0,
  '41': 13,
  '71': 29,
  '51': 21,
  '55': 25,
  '46': 18,
  '78': 33,
  '12': 1,
  '42': 14,
  '53': 23,
  '45': 17,
  '33': 10,
  '14': 3,
  '43': 15,
  '44': 16,
  '21': 7,
  '48': 20,
  '18': 6,
  '58': 28,
  '31': 8,
  '38': 12,
  '56': 26,
  '16': 5,
  '13': 2,
  '52': 22,
  '32': 9,
  '34': 11,
  '15': 4,
  '57': 27,
  '73': 31,
  '74': 32,
  '72': 30},
 'preg_8': {nan: -1, 'D': 2, 'SR': 3, 'CM': 0, 'CO': 1},
 'preg_9': {nan: -1, 'D': 2, 'AMB': 0, 'N': 4, 'ASEG': 1, 'OTROS': 5, 'MP': 3},
 'preg_10': {nan: -1, 'D':

#### save mapper as pickle

In [77]:
model_file = "../../../src/features/1.1 - dict categorical mappers.pickle"

with open(model_file, "wb") as f:
    pickle.dump(categorical_mapper, f, pickle.HIGHEST_PROTOCOL)

### Replace category for index value

In [78]:
df.loc[:, obj_cols] = df.loc[:, obj_cols].apply(lambda col: col.cat.codes)

In [79]:
df.tail()

Unnamed: 0,NUM_SECU_EXPED,cond_04,cond_05,cond_06,cond_09,cond_11,cond_12,cond_32,cond_37,cond_C1,cond_C10,cond_C11,cond_C12,cond_C13,cond_C14,cond_C15,cond_C16,cond_C17,cond_C18,cond_C19,cond_C2,cond_C20,cond_C21,cond_C3,cond_C4,cond_C5,cond_C6,cond_C7,cond_C8,cond_C9,total_condicion,es_gte_5,EXISTE_FRAUDE,EXISTE_INVEST,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,preg_9,preg_10,preg_11,preg_12,preg_15,preg_16,preg_18,preg_19,preg_22,preg_24,preg_27,preg_28,preg_30,preg_31,preg_32,preg_33,preg_34,preg_36,preg_37,CATASTROFICO,CERRADURA_BAUL,CERRADURA_DERECHA,CERRADURA_IZQUIERDA,CODIGO_BAJA,CODIGO_CARATULA,CODIGO_REAPERTURA,CODIGO_VEHICULO,COD_CAUSA_SINI,COD_POST_OCURRENCIA,COD_POST_POLIZA,COD_POST_TERC,COD_RAMO_sini,COD_RIES_sini,CONDICION_ROBO_EXP50,DANOS_MATERIALES,DESCRIPCION_TIPO,ESTADO_CIVIL,ESTADO_CIVIL_TERC,ESTAD_VEH_ASEG,FALTANTE,FECHA_SINI,MCA_COASEG,MCA_JUICIO,MCA_VIP,METRO,NUM_SECU_POL,OCUPACION_ASEG,SEXO,SEXO_TERC,TELEFONO_TERC,TIPO,TIPO_EXPED,TIPO_LESION,TIPO_LESION_MAXIMA,TIPO_SINIESTRO,USO,dias_entre_denu_y_sini,dist_fformal_fsini,dist_fformal_fdenu,existe_FECHA_FORMAL,CANT_RENOVACION,CAPITAL_ACCESORIOS,CAPITAL_ASEGURADO_COTIZACION,CAPITAL_VEHICULO,COD_COBRO,COD_COBRO_ANTERIOR,COD_PROD,COD_ZONA_CASCO,COD_ZONA_RC,COD_ZONA_ROBO,CONV_COMISIONARIO,CIF_ID,MCA_AGRAVANTE,MCA_EMPLEADO,MCA_MOVIMIENTO,MCA_POLIZA_VIP,NEGOCIO,PRENDARIO,SEGMENTO,cambio_cobro,ANTIG_pol,COD_EST_CIVIL,TIPO_ACTIVIDAD,CLIENTE,CODIGO_NACION,DATECO_TIPO_ACTIVIDAD,DOMICILIO_CODIGO_POSTAL,HABILITADO,FECHA_SINI_Year,FECHA_SINI_Month,FECHA_SINI_Week,FECHA_SINI_Day,FECHA_SINI_Dayofweek,FECHA_SINI_Dayofyear,FECHA_SINI_Is_month_end,FECHA_SINI_Is_month_start,FECHA_SINI_Is_quarter_end,FECHA_SINI_Is_quarter_start,FECHA_SINI_Is_year_end,FECHA_SINI_Is_year_start,FECHA_SINI_Hour,FECHA_SINI_Minute,FECHA_SINI_Second,FECHA_SINI_Elapsed,cant_pol,cant_sini_1y
1231010,1961339409999,,,,,,,,,0.0,,,,,,0.0,0.0,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,False,False,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,,,,,,,,,,,-1,-1,-1,-1,,-1,NaT,-1,-1,-1,-1,,,-1,-1,-1,,7,-1,-1,-1,,,,,-1,,-1,-1,-1,-1,-1,,,,,-1,,-1,-1,-1,-1,-1,-1,-1,-1,,-1,-1,-1,-1,-1,,-1,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037,0.0,0.0
1231011,1961339439999,,,,,,,,,0.0,,,,,,0.0,0.0,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,False,False,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,,,,,,,,,,,-1,-1,-1,-1,,-1,NaT,-1,-1,-1,-1,,,-1,-1,-1,,7,-1,-1,-1,,,,,-1,,-1,-1,-1,-1,-1,,,,,-1,,-1,-1,-1,-1,-1,-1,-1,-1,,-1,-1,-1,-1,-1,,-1,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037,0.0,0.0
1231012,1961339449999,,,,,,,,,,,,,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,False,False,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,,,,,,,,,,,-1,-1,-1,-1,,-1,NaT,-1,-1,-1,-1,,,-1,-1,-1,,7,-1,-1,-1,,,,,-1,,-1,-1,-1,-1,-1,,,,,-1,,-1,-1,-1,-1,-1,-1,-1,-1,,-1,-1,-1,-1,-1,,-1,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037,0.0,0.0
1231013,1961339479999,,,,,,,,,,,,,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,False,False,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,,,,,,,,,,,,-1,-1,-1,-1,,-1,NaT,-1,-1,-1,-1,,,-1,-1,-1,,7,-1,-1,-1,,,,,-1,,-1,-1,-1,-1,-1,,,,,-1,,-1,-1,-1,-1,-1,-1,-1,-1,,-1,-1,-1,-1,-1,,-1,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037,0.0,0.0
1231014,1961339499999,,,,,,,0.0,,,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,,,,,,,,,0.0,False,False,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,1,-1,-1,-1,-1,-1,-1,-1,,,,,,,,,,,,-1,-1,-1,-1,,-1,NaT,-1,-1,-1,-1,,,-1,-1,-1,,7,-1,-1,-1,,,,,-1,,-1,-1,-1,-1,-1,,,,,-1,,-1,-1,-1,-1,-1,-1,-1,-1,,-1,-1,-1,-1,-1,,-1,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,-9223372037,0.0,0.0


### Replace null by -1

In [80]:
df.fillna(-1, inplace=True)

In [81]:
df["FECHA_SINI"].replace(-1, np.nan, inplace=True)

### Saving checkpoint all numeric with na = -1

In [82]:
df.to_feather("../../../data/processed/1.1 - full merge - date cols transformed - dropped cols - clean vals - col transf na to -1.feather")