# Investigaciones Analysis

In [1]:
import os

import pandas as pd

In [2]:
# Notebook-wide display settings: show up to 500 columns when rendering a DataFrame.
pd.options.display.max_columns = 500

In [3]:
# Configuration and constants.
# Location of the column-reduced raw dataset, relative to this notebook.
PATH = "../../../data/interim/1. col_red"
FILENAME = "INVESTIGACION-red_col.feather"
RAW_FILE = os.path.join(PATH, FILENAME)

# Fail fast with a specific error if the input is missing or is not a regular file.
# os.path.isfile() already returns False for paths that do not exist, so a separate
# os.path.exists() check is unnecessary. FileNotFoundError is a subclass of Exception,
# so any caller catching Exception keeps working.
if not os.path.isfile(RAW_FILE):
    raise FileNotFoundError("File doesn't exist: {}".format(RAW_FILE))

## Loading database

In [4]:
# Read the feather file located at RAW_FILE into `df`, then print its schema summary
# (column names, non-null counts, dtypes and memory usage).
df = pd.read_feather(RAW_FILE)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22468 entries, 0 to 22467
Data columns (total 7 columns):
COD_FRAUDE            13711 non-null float64
COD_RAMO              22468 non-null int64
EXISTE_FRAUDE         22468 non-null object
EXISTE_INHABILITAR    22468 non-null object
EXISTE_INVEST         22468 non-null object
NRO_EXPED             22468 non-null int64
NUM_SECU_EXPED        22468 non-null int64
dtypes: float64(1), int64(3), object(3)
memory usage: 1.2+ MB


  labels, = index.labels


In [8]:
# Absolute number of rows for each EXISTE_FRAUDE value.
df["EXISTE_FRAUDE"].value_counts()

N    17609
S     4859
Name: EXISTE_FRAUDE, dtype: int64

In [9]:
# Same breakdown as relative frequencies (each value's share of the rows).
df["EXISTE_FRAUDE"].value_counts(normalize=True)

N    0.783737
S    0.216263
Name: EXISTE_FRAUDE, dtype: float64

In [6]:
# Count the distinct NUM_SECU_EXPED values. This cell only measures the repetition
# (a result below the row count means some ids appear on several rows); it changes nothing.
df["NUM_SECU_EXPED"].nunique()

13092

**Fixing cases whose last record is not marked as fraud although an earlier record of the same NUM_SECU_EXPED was identified as fraud**

In [7]:
# Distinct NUM_SECU_EXPED values that have at least one row flagged as fraud ("S").
is_fraud_row = df["EXISTE_FRAUDE"] == "S"
idx_frauds = df.loc[is_fraud_row, "NUM_SECU_EXPED"].unique()
len(idx_frauds)

4205

In [14]:
# Propagate the fraud flag: every row whose NUM_SECU_EXPED was flagged as fraud on
# any of its rows is marked "S".
belongs_to_fraud_exped = df["NUM_SECU_EXPED"].isin(idx_frauds)
idx_to_replace = df.index[belongs_to_fraud_exped]
df.loc[idx_to_replace, "EXISTE_FRAUDE"] = "S"

In [15]:
# Row counts per EXISTE_FRAUDE value after the flag was propagated to all rows of a flagged id.
df["EXISTE_FRAUDE"].value_counts()

N    13058
S     9410
Name: EXISTE_FRAUDE, dtype: int64

<h3 style="color:red">Keeping only the last record of each NUM_SECU_EXPED.</h3>

In [16]:
# Collapse the frame to one row per NUM_SECU_EXPED, keeping the last occurrence in the
# current row order, then print the resulting schema summary.
# NOTE(review): "last" is positional; this presumably relies on the file being ordered
# so that the latest record comes last - confirm.
df.drop_duplicates(subset=["NUM_SECU_EXPED"], keep="last", inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13092 entries, 7 to 22467
Data columns (total 7 columns):
COD_FRAUDE            12415 non-null float64
COD_RAMO              13092 non-null int64
EXISTE_FRAUDE         13092 non-null object
EXISTE_INHABILITAR    13092 non-null object
EXISTE_INVEST         13092 non-null object
NRO_EXPED             13092 non-null int64
NUM_SECU_EXPED        13092 non-null int64
dtypes: float64(1), int64(3), object(3)
memory usage: 818.2+ KB


In [18]:
# Share of each EXISTE_FRAUDE value after deduplication; dropna=False would also list NaN if any were present.
df["EXISTE_FRAUDE"].value_counts(dropna=False, normalize=True)

N    0.678811
S    0.321189
Name: EXISTE_FRAUDE, dtype: float64

In [19]:
# Formatting variables: turn the "S"/"N" flag columns into booleans (True means "S").
# The dtype guard keeps this cell idempotent: without it, re-running the cell would
# compare already-boolean values against "S" and silently set every flag to False.
for flag_col in ("EXISTE_FRAUDE", "EXISTE_INHABILITAR", "EXISTE_INVEST"):
    if not pd.api.types.is_bool_dtype(df[flag_col]):
        df[flag_col] = df[flag_col] == "S"

In [20]:
# Eight most frequent COD_FRAUDE values, counting NaN as its own category, to see
# how common missing codes are compared with actual code values.
df["COD_FRAUDE"].value_counts(dropna=False).head(8)

0.0     3742
90.0    2683
99.0    1983
27.0    1713
30.0     826
20.0     781
NaN      677
81.0     143
Name: COD_FRAUDE, dtype: int64

In [21]:
# NRO_EXPED is no longer needed from this point on, so remove the column in place.
df.drop("NRO_EXPED", axis=1, inplace=True)

In [22]:
# Schema summary after the flag conversion and the column drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13092 entries, 7 to 22467
Data columns (total 6 columns):
COD_FRAUDE            12415 non-null float64
COD_RAMO              13092 non-null int64
EXISTE_FRAUDE         13092 non-null bool
EXISTE_INHABILITAR    13092 non-null bool
EXISTE_INVEST         13092 non-null bool
NUM_SECU_EXPED        13092 non-null int64
dtypes: bool(3), float64(1), int64(2)
memory usage: 447.5 KB


## Save the deduplicated dataset

In [23]:
# Feather cannot store a non-default index, so the current row labels are first moved
# into a regular column by reset_index() and the result is written to disk.
# NOTE(review): the old labels end up as an extra "index" column in the saved file -
# confirm that downstream readers expect it.
df_to_save = df.reset_index()
df_to_save.to_feather("../../../data/interim/3. row_red/investigaciones_col_full_uniques.feather")

# Analysing EXISTE_INHABILITAR in the dataset

In [24]:
# Share of each EXISTE_INHABILITAR value among the rows flagged as fraud.
var = "EXISTE_INHABILITAR"
fraud_subset = df[df["EXISTE_FRAUDE"].eq(True)]
fraud_subset[var].value_counts() / len(fraud_subset)

False    0.922949
True     0.077051
Name: EXISTE_INHABILITAR, dtype: float64

In [25]:
# Share of each value of `var` among the rows that are not flagged as fraud.
non_fraud_subset = df[df["EXISTE_FRAUDE"].eq(False)]
non_fraud_subset[var].value_counts() / len(non_fraud_subset)

False    0.999775
True     0.000225
Name: EXISTE_INHABILITAR, dtype: float64