# Split dataset: training, validation & test

## imports & config

In [2]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Display configuration: let pandas render up to 500 columns and 500 rows
# before truncating a frame's repr.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

# Loading dataset

In [4]:
# Input dataset (Feather file). The path is relative, so it is resolved against
# the kernel's current working directory.
path = "../../../data/processed/1.1 - full merge - date cols transformed - dropped cols - clean vals - col transf na to -1.feather"

# Fail fast, before the load cell runs, if the input is missing.
# FileNotFoundError is the specific built-in for this condition; it is a subclass
# of Exception, so any handler written for the previous bare Exception still matches.
if not os.path.isfile(path):
    raise FileNotFoundError(f"{path} is not a file.")

In [5]:
# Read the Feather file at `path` into a DataFrame, then print its structural
# summary (index range, number of columns, dtype counts, memory usage).
df = pd.read_feather(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Columns: 157 entries, NUM_SECU_EXPED to COD_POST_TERC_RC_CLUSTER_2019
dtypes: bool(2), datetime64[ns](1), float64(83), int16(10), int32(2), int64(2), int8(57)
memory usage: 909.8 MB


  labels, = index.labels


In [6]:
# Convert target into bool (removing -1 values).
# NOTE(review): this conversion is disabled. The EXISTE_FRAUDE counts displayed further
# down contain only True/False, so the column is presumably already boolean — confirm,
# then delete this cell rather than keeping commented-out code.
# df["EXISTE_FRAUDE"] = df["EXISTE_FRAUDE"] == True

## Making splits

- Training: (01/2017, 12/2018)
- Validation: (01/2019, 03/2019)
- Test: (04/2019, 05/2019)


### Training
**Including rows where FECHA_SINI is null**

In [5]:
# Counts of each EXISTE_FRAUDE value among the rows whose FECHA_SINI is null
# (dropna=False so a missing target would be counted as well).
missing_date = df["FECHA_SINI"].isna()
df.loc[missing_date, "EXISTE_FRAUDE"].value_counts(dropna=False)

False    89541
True       106
Name: EXISTE_FRAUDE, dtype: int64

In [6]:
# Training split, null dates included: rows whose FECHA_SINI is before 2019-01-01,
# plus rows where FECHA_SINI is null.
sini_date = df["FECHA_SINI"]
is_pre_2019 = sini_date < "2019/01/01"
is_undated = sini_date.isna()
df_train = df[is_pre_2019 | is_undated]
df_train.shape

(1055951, 157)

In [7]:
# Relative frequency of each EXISTE_FRAUDE value in the current df_train
# (normalize=True returns fractions instead of counts).
df_train["EXISTE_FRAUDE"].value_counts(normalize=True)

False    0.99676
True     0.00324
Name: EXISTE_FRAUDE, dtype: float64

In [8]:
# Write the current df_train to Feather. reset_index(drop=True) discards the row labels
# inherited from df and renumbers from 0; it does NOT make NUM_SECU_EXPED the index —
# that field stays an ordinary column.
# NOTE(review): to_feather presumably requires this default index — confirm for the pandas version in use.
df_train.reset_index(drop=True).to_feather("../../../data/split/1.1.a - df_train with nan.feather")

#### Training without null FECHA_SINI

In [9]:
# Training split without null dates: only rows whose FECHA_SINI is before 2019-01-01.
# A null date never satisfies the comparison, so those rows fall out without an explicit filter.
# Note that this rebinds df_train, replacing the split built earlier.
is_pre_2019 = df["FECHA_SINI"] < "2019/01/01"
df_train = df[is_pre_2019]
df_train.shape

(966304, 157)

In [10]:
# Write the current df_train to Feather. reset_index(drop=True) discards the row labels
# inherited from df and renumbers from 0; it does NOT make NUM_SECU_EXPED the index —
# that field stays an ordinary column.
# NOTE(review): to_feather presumably requires this default index — confirm for the pandas version in use.
df_train.reset_index(drop=True).to_feather("../../../data/split/1.1.a - df_train without nulls.feather")

In [11]:
# Relative frequency of each EXISTE_FRAUDE value in the current df_train.
# The name df_train is reused across sections, so this reflects whichever split
# was assigned last — run the cells in order.
df_train["EXISTE_FRAUDE"].value_counts(normalize=True)

False    0.996569
True     0.003431
Name: EXISTE_FRAUDE, dtype: float64

#### Training with only 2018

In [12]:
# Training split restricted to 2018: FECHA_SINI in [2018-01-01, 2019-01-01).
# Note that this rebinds df_train, replacing the split built earlier.
from_2018 = df["FECHA_SINI"] >= "2018/01/01"
before_2019 = df["FECHA_SINI"] < "2019/01/01"
df_train = df[from_2018 & before_2019]
df_train.shape

(481014, 157)

In [13]:
# Write the current df_train to Feather. reset_index(drop=True) discards the row labels
# inherited from df and renumbers from 0; it does NOT make NUM_SECU_EXPED the index —
# that field stays an ordinary column.
# NOTE(review): to_feather presumably requires this default index — confirm for the pandas version in use.
df_train.reset_index(drop=True).to_feather("../../../data/split/1.1.a - df_train only 2018 without nulls.feather")

In [14]:
# Relative frequency of each EXISTE_FRAUDE value in the current df_train.
# The name df_train is reused across sections, so this reflects whichever split
# was assigned last — run the cells in order.
df_train["EXISTE_FRAUDE"].value_counts(normalize=True)

False    0.99636
True     0.00364
Name: EXISTE_FRAUDE, dtype: float64

### Validation

In [15]:
# Validation split: FECHA_SINI in [2019-01-01, 2019-04-01), i.e. January to March 2019.
on_or_after_jan = df["FECHA_SINI"] >= "2019/01/01"
before_apr = df["FECHA_SINI"] < "2019/04/01"
df_val = df[on_or_after_jan & before_apr]
df_val.shape

(107264, 157)

In [16]:
# Relative frequency of each EXISTE_FRAUDE value in the current df_val
# (normalize=True returns fractions instead of counts).
df_val["EXISTE_FRAUDE"].value_counts(normalize=True)

False    0.994956
True     0.005044
Name: EXISTE_FRAUDE, dtype: float64

In [17]:
# Write the current df_val to Feather. reset_index(drop=True) discards the row labels
# inherited from df and renumbers from 0; it does NOT make NUM_SECU_EXPED the index —
# that field stays an ordinary column.
# NOTE(review): to_feather presumably requires this default index — confirm for the pandas version in use.
df_val.reset_index(drop=True).to_feather("../../../data/split/1.1.a - df_val 01-19to03-19.feather")

#### Validation: 02/2019 to 03/2019 only

In [6]:
# Narrower validation split: FECHA_SINI in [2019-02-01, 2019-04-01), i.e. February and March 2019.
# This rebinds df_val, replacing the split built earlier.
# NOTE(review): no cell in this section writes this variant to disk — confirm that is intended.
on_or_after_feb = df["FECHA_SINI"] >= "2019/02/01"
before_apr = df["FECHA_SINI"] < "2019/04/01"
df_val = df[on_or_after_feb & before_apr]
df_val.shape

(71813, 157)

In [7]:
# Relative frequency of each EXISTE_FRAUDE value in the current df_val.
# The name df_val is reused across sections, so this reflects whichever split
# was assigned last — run the cells in order.
df_val["EXISTE_FRAUDE"].value_counts(normalize=True)

False    0.995683
True     0.004317
Name: EXISTE_FRAUDE, dtype: float64

### Test

In [18]:
# Test split: FECHA_SINI in [2019-04-01, 2019-06-01), i.e. April and May 2019.
# The upper bound is explicit so the frame matches the 04/2019-05/2019 window stated in
# the split plan and in the "04-19to05-19" file name it is saved under, instead of
# silently absorbing any later-dated rows that may be present in df.
# Rows with a null FECHA_SINI never satisfy these comparisons and are excluded.
df_test = df[(df["FECHA_SINI"] >= "2019/04/01") & (df["FECHA_SINI"] < "2019/06/01")]
df_test.shape

(67800, 157)

In [19]:
# Relative frequency of each EXISTE_FRAUDE value in the current df_test
# (normalize=True returns fractions instead of counts).
df_test["EXISTE_FRAUDE"].value_counts(normalize=True)

False    0.998097
True     0.001903
Name: EXISTE_FRAUDE, dtype: float64

In [20]:
# Write the current df_test to Feather. reset_index(drop=True) discards the row labels
# inherited from df and renumbers from 0; it does NOT make NUM_SECU_EXPED the index —
# that field stays an ordinary column.
# NOTE(review): to_feather presumably requires this default index — confirm for the pandas version in use.
df_test.reset_index(drop=True).to_feather("../../../data/split/1.1.a - df_test 04-19to05-19.feather")

#### Test only month 4

In [21]:
# Test split restricted to April 2019: FECHA_SINI in [2019-04-01, 2019-05-01).
# This rebinds df_test, replacing the split built earlier.
on_or_after_apr = df["FECHA_SINI"] >= "2019/04/01"
before_may = df["FECHA_SINI"] < "2019/05/01"
df_test = df[on_or_after_apr & before_may]
df_test.shape

(34658, 157)

In [22]:
# Relative frequency of each EXISTE_FRAUDE value in the current df_test.
# The name df_test is reused across sections, so this reflects whichever split
# was assigned last — run the cells in order.
df_test["EXISTE_FRAUDE"].value_counts(normalize=True)

False    0.99723
True     0.00277
Name: EXISTE_FRAUDE, dtype: float64

In [23]:
# Write the current df_test to Feather. reset_index(drop=True) discards the row labels
# inherited from df and renumbers from 0; it does NOT make NUM_SECU_EXPED the index —
# that field stays an ordinary column.
# NOTE(review): to_feather presumably requires this default index — confirm for the pandas version in use.
df_test.reset_index(drop=True).to_feather("../../../data/split/1.1.a - df_test 04-19to04-19.feather")