# Split dataset: training, validation & test

## imports & config

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Notebook-wide configuration.
# Raise the display limits to 500 columns / 500 rows so wide frames are not truncated.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, 500)

# Identifier embedded in the name of every split file written by this notebook.
model_id = "1.2.b - group zone"

# Loading dataset

In [3]:
# Location of the processed feather file that the rest of the notebook splits.
# (Plain string: there are no placeholders, so no f-prefix is needed.)
path = "../../../data/processed/1.2 - full merge - date cols transformed - dropped cols - clean vals - col transf na to -1.feather"

# Fail fast, with a specific exception type, when the input is missing.
# FileNotFoundError is still an Exception, so broad handlers keep working.
if not os.path.isfile(path):
    raise FileNotFoundError(f"{path} is not a file.")

In [4]:
# Read the processed dataset from `path` and print a summary of its
# index, columns, dtypes and memory usage.
df = pd.read_feather(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Columns: 157 entries, NUM_SECU_EXPED to COD_POST_TERC_RC_CLUSTER_2019
dtypes: bool(2), datetime64[ns](1), float64(83), int16(10), int32(2), int64(2), int8(57)
memory usage: 909.8 MB


In [5]:
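# Convert target into bool (removing -1 values).
# NOTE: left disabled; the value_counts outputs below only show True/False,
# so EXISTE_FRAUDE appears to be boolean already (confirm before deleting this cell).
# df["EXISTE_FRAUDE"] = df["EXISTE_FRAUDE"] == True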

## Making splits

- Training: (01/2017, 01/2019)
- Validation: (02/2019, 03/2019)
- Test: (04/2019, 05/2019)


### Training
**Including rows where FECHA_SINI is null**

In [6]:
# Distribution of the fraud flag among rows whose FECHA_SINI is missing.
fecha_sini_missing = df["FECHA_SINI"].isna()
df.loc[fecha_sini_missing, "EXISTE_FRAUDE"].value_counts(dropna=False)

False    89541
True       106
Name: EXISTE_FRAUDE, dtype: int64

In [7]:
# Training split: every row dated before February 2019, plus the rows
# whose FECHA_SINI is missing.
fecha_sini = df["FECHA_SINI"]
in_train_window = fecha_sini < "2019/02/01"
df_train = df[in_train_window | fecha_sini.isna()]
df_train.shape

(1091402, 157)

In [8]:
# Share of fraud / non-fraud rows in the training split (null dates included).
train_with_nan_fraud_ratio = df_train["EXISTE_FRAUDE"].value_counts(normalize=True)
train_with_nan_fraud_ratio

False    0.996654
True     0.003346
Name: EXISTE_FRAUDE, dtype: float64

In [9]:
# Persist the split. The filtered frame still carries the row labels of `df`,
# so they are replaced by a fresh 0..n-1 index first (to_feather requires a default index).
train_with_nan_path = f"../../../data/split/{model_id} - 01-17 to 01-19 - df_train with nan.feather"
df_train.reset_index(drop=True).to_feather(train_with_nan_path)

#### Training without None

In [10]:
# Training split without null dates: a missing FECHA_SINI never satisfies
# the "<" comparison, so those rows drop out of the selection.
is_before_feb_2019 = df["FECHA_SINI"] < "2019/02/01"
df_train = df[is_before_feb_2019]
df_train.shape

(1001755, 157)

In [11]:
# Persist the split. The filtered frame still carries the row labels of `df`,
# so they are replaced by a fresh 0..n-1 index first (to_feather requires a default index).
train_without_nulls_path = f"../../../data/split/{model_id} - 01-17 to 01-19 - df_train without nulls.feather"
df_train.reset_index(drop=True).to_feather(train_without_nulls_path)

In [12]:
# Share of fraud / non-fraud rows in the training split without null dates.
train_without_nulls_fraud_ratio = df_train["EXISTE_FRAUDE"].value_counts(normalize=True)
train_without_nulls_fraud_ratio

False    0.99646
True     0.00354
Name: EXISTE_FRAUDE, dtype: float64

#### Training with only 2018 (plus January 2019)

In [13]:
# Training split limited to 2018-01-01 (inclusive) up to 2019-02-01 (exclusive).
fecha_sini = df["FECHA_SINI"]
from_2018 = fecha_sini >= "2018/01/01"
before_feb_2019 = fecha_sini < "2019/02/01"
df_train = df[from_2018 & before_feb_2019]
df_train.shape

(516465, 157)

In [14]:
# Persist the split. The filtered frame still carries the row labels of `df`,
# so they are replaced by a fresh 0..n-1 index first (to_feather requires a default index).
train_only_2018_path = f"../../../data/split/{model_id} - 01-17 to 01-19 - df_train only 2018 without nulls.feather"
df_train.reset_index(drop=True).to_feather(train_only_2018_path)

In [15]:
# Share of fraud / non-fraud rows in the 2018-onwards training split.
train_only_2018_fraud_ratio = df_train["EXISTE_FRAUDE"].value_counts(normalize=True)
train_only_2018_fraud_ratio

False    0.996162
True     0.003838
Name: EXISTE_FRAUDE, dtype: float64

### Validation

In [16]:
# Validation split: 2019-02-01 (inclusive) up to 2019-04-01 (exclusive).
fecha_sini = df["FECHA_SINI"]
from_feb_2019 = fecha_sini >= "2019/02/01"
before_apr_2019 = fecha_sini < "2019/04/01"
df_val = df[from_feb_2019 & before_apr_2019]
df_val.shape

(71813, 157)

In [17]:
# Share of fraud / non-fraud rows in the validation split.
val_fraud_ratio = df_val["EXISTE_FRAUDE"].value_counts(normalize=True)
val_fraud_ratio

False    0.995683
True     0.004317
Name: EXISTE_FRAUDE, dtype: float64

In [18]:
# Persist the split. The filtered frame still carries the row labels of `df`,
# so they are replaced by a fresh 0..n-1 index first (to_feather requires a default index).
val_path = f"../../../data/split/{model_id} - 01-17 to 01-19 - df_val 02-19to03-19.feather"
df_val.reset_index(drop=True).to_feather(val_path)

### Test

In [19]:
# Test split: April and May 2019.
# The window is closed on both sides so it matches the documented
# 04/2019-05/2019 range and the "04-19to05-19" output file name. With only a
# lower bound, any row dated June 2019 or later would silently join the test set.
fecha_sini = df["FECHA_SINI"]
df_test = df[(fecha_sini >= "2019/04/01") & (fecha_sini < "2019/06/01")]
df_test.shape

(67800, 157)

In [20]:
# Share of fraud / non-fraud rows in the test split.
test_fraud_ratio = df_test["EXISTE_FRAUDE"].value_counts(normalize=True)
test_fraud_ratio

False    0.998097
True     0.001903
Name: EXISTE_FRAUDE, dtype: float64

In [21]:
# Persist the split. The filtered frame still carries the row labels of `df`,
# so they are replaced by a fresh 0..n-1 index first (to_feather requires a default index).
test_path = f"../../../data/split/{model_id} - 01-17 to 01-19 - df_test 04-19to05-19.feather"
df_test.reset_index(drop=True).to_feather(test_path)

#### Test only month 4

In [22]:
# Test split restricted to April 2019: 2019-04-01 (inclusive) up to 2019-05-01 (exclusive).
fecha_sini = df["FECHA_SINI"]
from_apr_2019 = fecha_sini >= "2019/04/01"
before_may_2019 = fecha_sini < "2019/05/01"
df_test = df[from_apr_2019 & before_may_2019]
df_test.shape

(34658, 157)

In [23]:
# Share of fraud / non-fraud rows in the April-only test split.
test_april_fraud_ratio = df_test["EXISTE_FRAUDE"].value_counts(normalize=True)
test_april_fraud_ratio

False    0.99723
True     0.00277
Name: EXISTE_FRAUDE, dtype: float64

In [24]:
# Persist the split. The filtered frame still carries the row labels of `df`,
# so they are replaced by a fresh 0..n-1 index first (to_feather requires a default index).
test_april_path = f"../../../data/split/{model_id} - 01-17 to 01-19 - df_test 04-19to04-19.feather"
df_test.reset_index(drop=True).to_feather(test_april_path)