In [2]:
import os

import pandas as pd

In [3]:
# Notebook-wide display settings: allow wide frames to render up to 500 columns.
pd.options.display.max_columns = 500

# condiciones - analysis

In [3]:
# Location of the raw (column-reduced) CONDICIONES dataset.
PATH = "../../../data/interim/1. col_red"
FILENAME = "CONDICIONES-red_col.feather"
RAW_FILE = os.path.join(PATH, FILENAME)

# Fail fast before the expensive load. os.path.isfile() is already False for a
# missing path, so a separate exists() check is redundant. FileNotFoundError is
# a subclass of Exception, so any existing `except Exception` still catches it.
if not os.path.isfile(RAW_FILE):
    raise FileNotFoundError("File doesn't exist: {}".format(RAW_FILE))

## Loading database

In [6]:
# Read the raw feather file into memory and print its schema / memory footprint.
# NOTE: the name `df` is re-bound to other datasets further down the notebook.
df = pd.read_feather(RAW_FILE)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10230858 entries, 0 to 10230857
Data columns (total 5 columns):
CONDICION          object
FEC_ACT            datetime64[ns]
NUM_SECU_EXPED     int64
USR_ACT            object
VALOR_CONDICION    float64
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 390.3+ MB


  labels, = index.labels


In [7]:
# Number of distinct NUM_SECU_EXPED ids in the raw table; the pivoted table
# built below should end up with exactly this many rows.
df["NUM_SECU_EXPED"].nunique()

1231015

## pivoting table

In [8]:
# Column roles used to reshape the long table into one row per id.
id_col = "NUM_SECU_EXPED"
condition = "CONDICION"
value_col = "VALOR_CONDICION"

# Long -> wide: one column per CONDICION value, prefixed with "cond_".
# When an (id, condition) pair appears more than once, aggfunc="last" keeps the
# last non-null value in the current row order of `df`.
# NOTE(review): `df` is not sorted in this notebook, so "last" means last in
# file order -- confirm that matches the most recent FEC_ACT.
pv_df = (
    df.pivot_table(
        values=value_col,
        index=id_col,
        columns=condition,
        aggfunc="last",
    )
    .add_prefix("cond_")
)
pv_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1231015 entries, 516359994 to 1961339499999
Data columns (total 29 columns):
cond_04     108057 non-null float64
cond_05     33644 non-null float64
cond_06     33644 non-null float64
cond_09     34337 non-null float64
cond_11     131464 non-null float64
cond_12     14119 non-null float64
cond_32     74055 non-null float64
cond_37     11644 non-null float64
cond_C1     191102 non-null float64
cond_C10    75339 non-null float64
cond_C11    75339 non-null float64
cond_C12    75339 non-null float64
cond_C13    670758 non-null float64
cond_C14    10667 non-null float64
cond_C15    261730 non-null float64
cond_C16    261730 non-null float64
cond_C17    35174 non-null float64
cond_C18    17275 non-null float64
cond_C19    17275 non-null float64
cond_C2     14229 non-null float64
cond_C20    17275 non-null float64
cond_C21    35174 non-null float64
cond_C3     14229 non-null float64
cond_C4     13069 non-null float64
cond_C5     14229 non-null 

In [9]:
# Move NUM_SECU_EXPED from the index back into a regular column.
pv_df_res = pv_df.reset_index()

# The rest of the notebook relies on one row per id; fail loudly on a full
# re-run instead of depending on someone reading the displayed numbers.
assert pv_df_res["NUM_SECU_EXPED"].is_unique, "duplicated NUM_SECU_EXPED after pivot"

# (max occurrences of any id, number of rows) -- expected (1, <n unique ids>).
pv_df_res["NUM_SECU_EXPED"].value_counts().max(), len(pv_df_res)

(1, 1231015)

In [10]:
# Double-check for repeated ids: the highest occurrence count of any
# NUM_SECU_EXPED must be 1 (same check as in the previous cell).
pv_df_res["NUM_SECU_EXPED"].value_counts().max()

1

In [13]:
# Percentage of missing values per column, rounded to two decimals.
pv_df_res.isna().mean().mul(100).round(2)

CONDICION
NUM_SECU_EXPED     0.00
cond_04           91.22
cond_05           97.27
cond_06           97.27
cond_09           97.21
cond_11           89.32
cond_12           98.85
cond_32           93.98
cond_37           99.05
cond_C1           84.48
cond_C10          93.88
cond_C11          93.88
cond_C12          93.88
cond_C13          45.51
cond_C14          99.13
cond_C15          78.74
cond_C16          78.74
cond_C17          97.14
cond_C18          98.60
cond_C19          98.60
cond_C2           98.84
cond_C20          98.60
cond_C21          97.14
cond_C3           98.84
cond_C4           98.94
cond_C5           98.84
cond_C6           44.70
cond_C7           55.78
cond_C8           43.40
cond_C9           44.70
dtype: float64

## Adding the date (FEC_ACT) and user (USR_ACT) of the last record per NUM_SECU_EXPED

In [11]:
# Ids present in the pivoted table; used below to select and validate the
# matching FEC_ACT / USR_ACT rows. Printing the max occurrence count confirms
# each id appears exactly once.
index = pv_df_res["NUM_SECU_EXPED"]
print(index.value_counts().max())

1


In [12]:
# Keep only the id, date and user columns for rows whose id is in the pivot,
# selecting rows and columns in a single .loc call.
df_last = df.loc[
    df["NUM_SECU_EXPED"].isin(index),
    ["NUM_SECU_EXPED", "FEC_ACT", "USR_ACT"],
]
df_last.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10230858 entries, 0 to 10230857
Data columns (total 3 columns):
NUM_SECU_EXPED    int64
FEC_ACT           datetime64[ns]
USR_ACT           object
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 312.2+ MB


In [13]:
# Collapse to one row per id, keeping the last occurrence in row order.
# NOTE(review): "last" is positional, not chronological -- confirm the raw
# file is ordered so that the last row carries the latest FEC_ACT.
df_last = df_last.drop_duplicates(subset="NUM_SECU_EXPED", keep="last")
df_last.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1231015 entries, 26 to 10230857
Data columns (total 3 columns):
NUM_SECU_EXPED    1231015 non-null int64
FEC_ACT           1231015 non-null datetime64[ns]
USR_ACT           1231015 non-null object
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 37.6+ MB


In [14]:
# After de-duplication every id must occur exactly once (expected output: 1).
df_last["NUM_SECU_EXPED"].value_counts().max()

1

In [15]:
# (ids shared by both tables, rows in df_last, rows in pv_df_res); all three
# numbers must match for the merge below to be lossless.
(len(set(df_last["NUM_SECU_EXPED"]) & set(index)), len(df_last), len(pv_df_res))

(1231015, 1231015, 1231015)

In [16]:
# Ids present in df_last but missing from the pivot (expected: empty set).
set(df_last["NUM_SECU_EXPED"]).difference(index)

set()

In [17]:
# "CONDICION" in the header corner is not a data column: it is the name of the
# columns axis left over from pivot_table(columns="CONDICION").
pv_df_res.head()

CONDICION,NUM_SECU_EXPED,cond_04,cond_05,cond_06,cond_09,cond_11,cond_12,cond_32,cond_37,cond_C1,cond_C10,cond_C11,cond_C12,cond_C13,cond_C14,cond_C15,cond_C16,cond_C17,cond_C18,cond_C19,cond_C2,cond_C20,cond_C21,cond_C3,cond_C4,cond_C5,cond_C6,cond_C7,cond_C8,cond_C9
0,516359994,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,1.0,3.0
1,1762619999,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0
2,4768809999,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,0.0,3.0
3,6444209999,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,0.0,3.0
4,7529469970,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0


In [18]:
# Pre-merge sanity check: the key must be unique on both sides (expected (1, 1)).
pv_df_res["NUM_SECU_EXPED"].value_counts().max(), df_last["NUM_SECU_EXPED"].value_counts().max()

(1, 1)

In [19]:
# Attach FEC_ACT / USR_ACT to the pivoted conditions.
# validate="one_to_one" makes pandas raise MergeError if NUM_SECU_EXPED is
# duplicated on either side, so the merge can never silently multiply rows
# (previously this was only verified by eye in the surrounding cells).
df_concat = pd.merge(
    pv_df_res,
    df_last,
    on="NUM_SECU_EXPED",
    how="inner",
    validate="one_to_one",
)
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1231015 entries, 0 to 1231014
Data columns (total 32 columns):
NUM_SECU_EXPED    1231015 non-null int64
cond_04           108057 non-null float64
cond_05           33644 non-null float64
cond_06           33644 non-null float64
cond_09           34337 non-null float64
cond_11           131464 non-null float64
cond_12           14119 non-null float64
cond_32           74055 non-null float64
cond_37           11644 non-null float64
cond_C1           191102 non-null float64
cond_C10          75339 non-null float64
cond_C11          75339 non-null float64
cond_C12          75339 non-null float64
cond_C13          670758 non-null float64
cond_C14          10667 non-null float64
cond_C15          261730 non-null float64
cond_C16          261730 non-null float64
cond_C17          35174 non-null float64
cond_C18          17275 non-null float64
cond_C19          17275 non-null float64
cond_C2           14229 non-null float64
cond_C20          17

In [20]:
# The merged table must still have one row per id (expected output: 1).
df_concat["NUM_SECU_EXPED"].value_counts().max()

1

In [21]:
# Preview the merged table: cond_* columns followed by FEC_ACT and USR_ACT.
df_concat.head()

Unnamed: 0,NUM_SECU_EXPED,cond_04,cond_05,cond_06,cond_09,cond_11,cond_12,cond_32,cond_37,cond_C1,cond_C10,cond_C11,cond_C12,cond_C13,cond_C14,cond_C15,cond_C16,cond_C17,cond_C18,cond_C19,cond_C2,cond_C20,cond_C21,cond_C3,cond_C4,cond_C5,cond_C6,cond_C7,cond_C8,cond_C9,FEC_ACT,USR_ACT
0,516359994,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,1.0,3.0,2018-06-19,SALINASL
1,1762619999,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,2017-03-20,COLOMBOM
2,4768809999,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,0.0,3.0,2018-03-08,ALAIS
3,6444209999,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,0.0,3.0,2018-02-26,KLEIN
4,7529469970,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,2018-04-18,CHIPIAN


In [22]:
# Compare the number of distinct NUM_SECU_EXPED in the raw table (`df`) with
# the number in the merged table; equal counts mean no id was lost on the way.
(df["NUM_SECU_EXPED"].nunique() ,  df_concat["NUM_SECU_EXPED"].nunique())

(1231015, 1231015)

In [75]:
# Disabled: convert the float condition columns to int.
# Plain int cannot hold nulls and the cond_* columns are mostly null, so
# astype(int) would raise; the conversion is kept here for reference only.
# cols_to_int = df_concat.dtypes[df_concat.dtypes == "float64"].index
# df_concat[cols_to_int] = df_concat[cols_to_int].astype(int)
# df_concat.info()

# saving into feather

In [23]:
# Persist the merged table as an intermediate feather file.
# reset_index() is called without drop=True, so the previous index is written
# out as an extra "index" column; the loading cell further down drops it again.
to_save = "../../../data/interim/4. transformed/CONDICIONES-red_col_with_fec_and_user.feather"
df_concat.reset_index().to_feather(to_save)

# Loading dataset to create new columns

In [6]:
# Path to the intermediate file written by the merge section.
file = "../../../data/interim/4. transformed/CONDICIONES-red_col_with_fec_and_user.feather"

# os.path.isfile() is already False for a missing path, so exists() is
# redundant. FileNotFoundError (a subclass of Exception) names the path.
if not os.path.isfile(file):
    raise FileNotFoundError("File doesn't exist: {}".format(file))

In [11]:
# Reload the merged table and discard the "index" column that was written
# alongside the data when the file was saved.
df = pd.read_feather(file).drop(columns="index")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Data columns (total 32 columns):
NUM_SECU_EXPED    1231015 non-null int64
cond_04           108057 non-null float64
cond_05           33644 non-null float64
cond_06           33644 non-null float64
cond_09           34337 non-null float64
cond_11           131464 non-null float64
cond_12           14119 non-null float64
cond_32           74055 non-null float64
cond_37           11644 non-null float64
cond_C1           191102 non-null float64
cond_C10          75339 non-null float64
cond_C11          75339 non-null float64
cond_C12          75339 non-null float64
cond_C13          670758 non-null float64
cond_C14          10667 non-null float64
cond_C15          261730 non-null float64
cond_C16          261730 non-null float64
cond_C17          35174 non-null float64
cond_C18          17275 non-null float64
cond_C19          17275 non-null float64
cond_C2           14229 non-null float64
cond_C20          17

### total_condicion

In [12]:
# Row-wise sum of all condition columns (nulls are skipped by sum()).
# Columns are selected by their "cond_" prefix instead of by position: the old
# positional slice iloc[:, 1:-2] shifted once total_condicion existed, so
# re-running this cell pulled FEC_ACT into the sum.
df["total_condicion"] = df.filter(regex=r"^cond_").sum(axis=1)
df.head()

Unnamed: 0,NUM_SECU_EXPED,cond_04,cond_05,cond_06,cond_09,cond_11,cond_12,cond_32,cond_37,cond_C1,cond_C10,cond_C11,cond_C12,cond_C13,cond_C14,cond_C15,cond_C16,cond_C17,cond_C18,cond_C19,cond_C2,cond_C20,cond_C21,cond_C3,cond_C4,cond_C5,cond_C6,cond_C7,cond_C8,cond_C9,FEC_ACT,USR_ACT,total_condicion
0,516359994,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,1.0,3.0,2018-06-19,SALINASL,4.0
1,1762619999,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,2017-03-20,COLOMBOM,0.0
2,4768809999,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,0.0,3.0,2018-03-08,ALAIS,3.0
3,6444209999,0.0,,,,0.0,,,,,,,,0.0,,,,,,,,,,,,,0.0,,0.0,3.0,2018-02-26,KLEIN,3.0
4,7529469970,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,2018-04-18,CHIPIAN,1.0


### es_gte_5

In [13]:
# Boolean flag: True where the summed conditions reach 5 or more.
df["es_gte_5"] = df["total_condicion"].ge(5)

In [14]:
# Schema check after adding total_condicion and es_gte_5.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Data columns (total 34 columns):
NUM_SECU_EXPED     1231015 non-null int64
cond_04            108057 non-null float64
cond_05            33644 non-null float64
cond_06            33644 non-null float64
cond_09            34337 non-null float64
cond_11            131464 non-null float64
cond_12            14119 non-null float64
cond_32            74055 non-null float64
cond_37            11644 non-null float64
cond_C1            191102 non-null float64
cond_C10           75339 non-null float64
cond_C11           75339 non-null float64
cond_C12           75339 non-null float64
cond_C13           670758 non-null float64
cond_C14           10667 non-null float64
cond_C15           261730 non-null float64
cond_C16           261730 non-null float64
cond_C17           35174 non-null float64
cond_C18           17275 non-null float64
cond_C19           17275 non-null float64
cond_C2            14229 non-null float64

In [15]:
# Persist the table with the derived columns as a new intermediate file.
# NOTE: `to_save` is re-bound here; it previously pointed at the merged file.
to_save = "../../../data/interim/4. transformed/CONDICIONES-red_col_with_fec_and_user-new_cols.feather"
df.to_feather(to_save)

# Checking data quality

In [4]:
# Path to the intermediate file that includes the derived columns.
file = "../../../data/interim/4. transformed/CONDICIONES-red_col_with_fec_and_user-new_cols.feather"

# os.path.isfile() is already False for a missing path, so exists() is
# redundant. FileNotFoundError (a subclass of Exception) names the path.
if not os.path.isfile(file):
    raise FileNotFoundError("File doesn't exist: {}".format(file))

In [6]:
# Reload the table with derived columns for the data-quality checks below.
df = pd.read_feather(file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231015 entries, 0 to 1231014
Data columns (total 34 columns):
NUM_SECU_EXPED     1231015 non-null int64
cond_04            108057 non-null float64
cond_05            33644 non-null float64
cond_06            33644 non-null float64
cond_09            34337 non-null float64
cond_11            131464 non-null float64
cond_12            14119 non-null float64
cond_32            74055 non-null float64
cond_37            11644 non-null float64
cond_C1            191102 non-null float64
cond_C10           75339 non-null float64
cond_C11           75339 non-null float64
cond_C12           75339 non-null float64
cond_C13           670758 non-null float64
cond_C14           10667 non-null float64
cond_C15           261730 non-null float64
cond_C16           261730 non-null float64
cond_C17           35174 non-null float64
cond_C18           17275 non-null float64
cond_C19           17275 non-null float64
cond_C2            14229 non-null float64

In [13]:
# Rows in which every condition column is null (expected: none).
# Columns are selected by their "cond_" prefix rather than by iloc[:, 1:-4],
# which silently depended on exactly four non-condition columns trailing the
# frame.
df[df.filter(regex=r"^cond_").isna().all(axis=1)]

Unnamed: 0,NUM_SECU_EXPED,cond_04,cond_05,cond_06,cond_09,cond_11,cond_12,cond_32,cond_37,cond_C1,cond_C10,cond_C11,cond_C12,cond_C13,cond_C14,cond_C15,cond_C16,cond_C17,cond_C18,cond_C19,cond_C2,cond_C20,cond_C21,cond_C3,cond_C4,cond_C5,cond_C6,cond_C7,cond_C8,cond_C9,FEC_ACT,USR_ACT,total_condicion,es_gte_5


<h3 style="color:green">No NUM_SECU_EXPED has all of its condition columns null</h3>