# merging cond, invest, preg into a dataset

In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# general configs: raise pandas' display limits so wide / long frames are shown in full
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

# Merging CONDICIONES & INVESTIGACIONES

## Loading CONDICIONES

In [3]:
# path to dataset
path_cond = "../../../data/interim/4. transformed/CONDICIONES-red_col_with_fec_and_user-new_cols.feather"

# Fail fast, before attempting the load, when the input is missing.
# os.path.isfile() already returns False for a non-existent path, so a separate
# os.path.exists() check is not needed. FileNotFoundError is a subclass of Exception.
if not os.path.isfile(path_cond):
    raise FileNotFoundError("File {} doesn't exist.".format(path_cond))

In [4]:
# Load CONDICIONES from the feather file and print its schema
# (column names, non-null counts and dtypes).
df_cond = pd.read_feather(path_cond)
df_cond.info()

  labels, = index.labels


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Data columns (total 34 columns):
NUM_SECU_EXPED     1231015 non-null int64
cond_04            108057 non-null float64
cond_05            33644 non-null float64
cond_06            33644 non-null float64
cond_09            34337 non-null float64
cond_11            131464 non-null float64
cond_12            14119 non-null float64
cond_32            74055 non-null float64
cond_37            11644 non-null float64
cond_C1            191102 non-null float64
cond_C10           75339 non-null float64
cond_C11           75339 non-null float64
cond_C12           75339 non-null float64
cond_C13           670758 non-null float64
cond_C14           10667 non-null float64
cond_C15           261730 non-null float64
cond_C16           261730 non-null float64
cond_C17           35174 non-null float64
cond_C18           17275 non-null float64
cond_C19           17275 non-null float64
cond_C2            14229 non-null float64

In [5]:
# checking ID is unique: distinct IDs / number of rows equals 1.0 when no ID repeats
# (and none is missing, since nunique() skips nulls)
df_cond["NUM_SECU_EXPED"].nunique() / df_cond.shape[0]

1.0

## Loading INVESTIGACIONES

In [6]:
# path to dataset
path_inv = "../../../data/interim/3. row_red/investigaciones_col_full_uniques.feather"

# Fail fast, before attempting the load, when the input is missing.
# os.path.isfile() already returns False for a non-existent path, so a separate
# os.path.exists() check is not needed. FileNotFoundError is a subclass of Exception.
if not os.path.isfile(path_inv):
    raise FileNotFoundError("File {} doesn't exist.".format(path_inv))

In [7]:
# Load INVESTIGACIONES, discard the "index" column stored in the file,
# then print the resulting schema.
df_inv = pd.read_feather(path_inv).drop(columns="index")
df_inv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13092 entries, 0 to 13091
Data columns (total 6 columns):
COD_FRAUDE            12415 non-null float64
COD_RAMO              13092 non-null int64
EXISTE_FRAUDE         13092 non-null bool
EXISTE_INHABILITAR    13092 non-null bool
EXISTE_INVEST         13092 non-null bool
NUM_SECU_EXPED        13092 non-null int64
dtypes: bool(3), float64(1), int64(2)
memory usage: 345.3 KB


  labels, = index.labels


In [8]:
# checking ID is unique: distinct IDs / number of rows equals 1.0 when no ID repeats
# (and none is missing, since nunique() skips nulls)
df_inv["NUM_SECU_EXPED"].nunique() / df_inv.shape[0]

1.0

## Comparing CONDICIONES vs INVESTIGACIONES
Checking NUM_SECU_EXPED intersection

In [9]:
# Checking intersection between both datasets
id_common = set(df_cond["NUM_SECU_EXPED"].unique()) & set(df_inv["NUM_SECU_EXPED"].unique())
# (number of shared IDs, % of CONDICIONES rows, % of INVESTIGACIONES rows)
(
    len(id_common),
    len(id_common) / len(df_cond) * 100,
    len(id_common) / len(df_inv) * 100,
)

(12742, 1.035080807301292, 97.32661167124962)

As expected, only a small share (about 1%) of the NUM_SECU_EXPED in CONDICIONES also appear in INVESTIGACIONES. <br>
However, almost 3% of the rows in INVESTIGACIONES have no match in CONDICIONES. <br>

In [10]:
# INVESTIGACIONES IDs that have no matching row in CONDICIONES (the ~3% seen above)
idx = set(df_inv["NUM_SECU_EXPED"].unique()).difference(df_cond["NUM_SECU_EXPED"].unique())
len(idx)

350

In [11]:
# Checking distribution on target (EXISTE_FRAUDE), restricted to the IDs collected in `idx`
df_inv.loc[df_inv["NUM_SECU_EXPED"].isin(idx), "EXISTE_FRAUDE"].value_counts(dropna=False)

False    236
True     114
Name: EXISTE_FRAUDE, dtype: int64

<b>114 records are fraud.</b>

In [13]:
# See rows detail in file:
# ../../1. Analysis/investigaciones sin aparecer en condiciones.csv
# NOTE(review): nothing visible in this notebook writes this CSV — presumably it was exported by
# a cell that has since been removed (the execution count skips 12); confirm the file exists
# before a Restart & Run All.
tmp = pd.read_csv("../../1. Analysis/investigaciones sin aparecer en condiciones.csv", index_col=0)
tmp.head()
# Leftover check, disabled. NOTE(review): FECHA_SINI is not among the columns shown in the
# head() output, so this would likely raise KeyError — confirm and delete.
# tmp["FECHA_SINI"].value_counts(dropna=False)

Unnamed: 0,COD_FRAUDE,COD_RAMO,EXISTE_FRAUDE,EXISTE_INHABILITAR,EXISTE_INVEST,NUM_SECU_EXPED
6,99.0,1,False,False,False,1626306469999
7,99.0,8,False,False,False,1622593359999
8,99.0,8,False,False,False,1622555639999
9,99.0,8,False,False,False,1626456429999
10,,1,False,False,True,1616251899999


## Merging CONDICIONES & INVESTIGACIONES

In [14]:
# Left merge: keep every CONDICIONES row and attach the INVESTIGACIONES columns where
# NUM_SECU_EXPED matches. INVESTIGACIONES rows whose ID is not in CONDICIONES are dropped.
# validate="one_to_one" makes pandas raise MergeError if the key repeats on either side,
# so a duplicated ID can never silently multiply rows.
df_cond_inv = pd.merge(
    df_cond,
    df_inv,
    on="NUM_SECU_EXPED",
    how="left",
    suffixes=("_cond", "_inv"),
    validate="one_to_one",
)
# Row counts: left input, right input, merged result (result must equal the left input).
print(len(df_cond), len(df_inv), len(df_cond_inv))

1231015 13092 1231015


In [15]:
# Print the merged frame's schema to confirm the INVESTIGACIONES columns were appended.
df_cond_inv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1231015 entries, 0 to 1231014
Data columns (total 39 columns):
NUM_SECU_EXPED        1231015 non-null int64
cond_04               108057 non-null float64
cond_05               33644 non-null float64
cond_06               33644 non-null float64
cond_09               34337 non-null float64
cond_11               131464 non-null float64
cond_12               14119 non-null float64
cond_32               74055 non-null float64
cond_37               11644 non-null float64
cond_C1               191102 non-null float64
cond_C10              75339 non-null float64
cond_C11              75339 non-null float64
cond_C12              75339 non-null float64
cond_C13              670758 non-null float64
cond_C14              10667 non-null float64
cond_C15              261730 non-null float64
cond_C16              261730 non-null float64
cond_C17              35174 non-null float64
cond_C18              17275 non-null float64
cond_C19              1727

### temporary saving (checkpoint)

In [16]:
# Checkpoint: persist the CONDICIONES + INVESTIGACIONES merge so later sections can
# restart from disk instead of recomputing it.
path_to_save = "../../../data/interim/5. merged"
check_filename = "condiciones_Investigacion.feather"
# Create the output folder when it is missing so the save does not fail on a fresh checkout.
os.makedirs(path_to_save, exist_ok=True)
df_cond_inv.to_feather(os.path.join(path_to_save, check_filename))

# MERGING condiciones_Investigacion & PREGUNTAS

## Restart from checkpoint (condiciones_Investigacion)

In [17]:
# Reload the CONDICIONES + INVESTIGACIONES checkpoint from disk, so this section can be run
# without re-executing the merge above, and print its schema.
path = "../../../data/interim/5. merged"
# Note: here check_filename holds the full path (folder + file name), not just the file name.
check_filename = os.path.join(path, "condiciones_Investigacion.feather")
df_cond_inv = pd.read_feather(check_filename)
df_cond_inv.info()

  labels, = index.labels


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Data columns (total 39 columns):
NUM_SECU_EXPED        1231015 non-null int64
cond_04               108057 non-null float64
cond_05               33644 non-null float64
cond_06               33644 non-null float64
cond_09               34337 non-null float64
cond_11               131464 non-null float64
cond_12               14119 non-null float64
cond_32               74055 non-null float64
cond_37               11644 non-null float64
cond_C1               191102 non-null float64
cond_C10              75339 non-null float64
cond_C11              75339 non-null float64
cond_C12              75339 non-null float64
cond_C13              670758 non-null float64
cond_C14              10667 non-null float64
cond_C15              261730 non-null float64
cond_C16              261730 non-null float64
cond_C17              35174 non-null float64
cond_C18              17275 non-null float64
cond_C19              1727

## Loading PREGUNTAS

In [18]:
# path to dataset
path_preg = "../../../data/interim/4. transformed/PREGUNTAS-red_col_with_fec_and_user.feather"

# Fail fast, before attempting the load, when the input is missing.
# os.path.isfile() already returns False for a non-existent path, so a separate
# os.path.exists() check is not needed. FileNotFoundError is a subclass of Exception.
if not os.path.isfile(path_preg):
    raise FileNotFoundError("File {} doesn't exist.".format(path_preg))

In [19]:
# Load PREGUNTAS, discard the "index" column stored in the file,
# then print the resulting schema.
df_preg = pd.read_feather(path_preg).drop(columns="index")
df_preg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147342 entries, 0 to 147341
Data columns (total 29 columns):
NUM_SECU_EXPED    147342 non-null int64
preg_1            2544 non-null object
preg_2            52954 non-null object
preg_3            51706 non-null object
preg_4            54292 non-null object
preg_5            66654 non-null object
preg_6            66654 non-null object
preg_7            54292 non-null object
preg_8            57010 non-null object
preg_9            66654 non-null object
preg_10           54292 non-null object
preg_11           66654 non-null object
preg_12           12995 non-null object
preg_15           62152 non-null object
preg_16           12995 non-null object
preg_18           10762 non-null object
preg_19           10762 non-null object
preg_22           7923 non-null object
preg_24           10763 non-null object
preg_27           12995 non-null object
preg_28           12995 non-null object
preg_30           12995 non-null object
preg_31    

  labels, = index.labels


In [20]:
# Dropping FEC_ACT
# Rebind instead of mutating in place, and tolerate the column already being gone, so that
# re-running this cell does not raise KeyError.
df_preg = df_preg.drop(columns=["FEC_ACT"], errors="ignore")

## Comparing merged vs PREGUNTAS
Checking NUM_SECU_EXPED intersection

In [21]:
# Checking intersection between both datasets
id_common = set(df_cond_inv["NUM_SECU_EXPED"].unique()) & set(df_preg["NUM_SECU_EXPED"].unique())
# (number of shared IDs, % of merged-frame rows, % of PREGUNTAS rows)
(
    len(id_common),
    len(id_common) / len(df_cond_inv) * 100,
    len(id_common) / len(df_preg) * 100,
)

(146595, 11.908465778239908, 99.49301624791302)

About 88% of the NUM_SECU_EXPED in the merged dataset don't have PREGUNTAS.<br/>
Only 0.5% of the PREGUNTAS rows aren't in the merged dataset.

## Merging merged & PREGUNTAS

In [22]:
# Left merge: keep every row of the CONDICIONES + INVESTIGACIONES frame and attach the
# PREGUNTAS columns where NUM_SECU_EXPED matches. PREGUNTAS rows without a match are dropped.
df_cond_inv_preg = pd.merge(df_cond_inv, df_preg, on="NUM_SECU_EXPED", how="left", suffixes=("", "_preg"))
# A repeated NUM_SECU_EXPED in PREGUNTAS would duplicate left-hand rows; fail loudly instead
# of silently inflating the dataset.
assert len(df_cond_inv_preg) == len(df_cond_inv), "merge with PREGUNTAS changed the row count"
# Row counts: left input, right input, merged result.
print(len(df_cond_inv), len(df_preg), len(df_cond_inv_preg))
df_cond_inv_preg.info()

1231015 147342 1231015
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1231015 entries, 0 to 1231014
Data columns (total 66 columns):
NUM_SECU_EXPED        1231015 non-null int64
cond_04               108057 non-null float64
cond_05               33644 non-null float64
cond_06               33644 non-null float64
cond_09               34337 non-null float64
cond_11               131464 non-null float64
cond_12               14119 non-null float64
cond_32               74055 non-null float64
cond_37               11644 non-null float64
cond_C1               191102 non-null float64
cond_C10              75339 non-null float64
cond_C11              75339 non-null float64
cond_C12              75339 non-null float64
cond_C13              670758 non-null float64
cond_C14              10667 non-null float64
cond_C15              261730 non-null float64
cond_C16              261730 non-null float64
cond_C17              35174 non-null float64
cond_C18              17275 non-null float64
con

### Saving merged file

In [23]:
# Persist the final CONDICIONES + INVESTIGACIONES + PREGUNTAS dataset.
path_to_save = "../../../data/interim/5. merged"
check_filename = "condiciones_Investigacion_preguntas.feather"
# Create the output folder when it is missing so the save does not fail on a fresh checkout.
os.makedirs(path_to_save, exist_ok=True)
df_cond_inv_preg.to_feather(os.path.join(path_to_save, check_filename))