# preguntas - analysis & transform

In [23]:
import os

import pandas as pd

In [24]:
# general configs
pd.set_option("display.max_columns", 500)

In [25]:
# setting configs and constants
# path to raw dataset
PATH = "../../../data/interim/1. col_red"
FILENAME = "PREGUNTAS-red_col.feather"
RAW_FILE = os.path.join(PATH, FILENAME)

if not(os.path.exists(RAW_FILE) and os.path.isfile(RAW_FILE)):
    raise Exception("File doesn't exists.")

## Loading database

In [26]:
df = pd.read_feather(RAW_FILE)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1808628 entries, 0 to 1808627
Data columns (total 5 columns):
COD_PREGUNTA      int64
FEC_ACT           datetime64[ns]
NUM_SECU_EXPED    int64
TIPO_EXPED        int64
VALOR_PREGUNTA    object
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 69.0+ MB


  labels, = index.labels


In [27]:
df.head()

Unnamed: 0,COD_PREGUNTA,FEC_ACT,NUM_SECU_EXPED,TIPO_EXPED,VALOR_PREGUNTA
0,9,2017-03-07,1651669769999,2,AMB
1,18,2017-03-08,1605473309999,50,T2
2,15,2017-03-08,1605473309999,50,N
3,37,2017-03-08,1605473309999,50,N
4,22,2017-03-08,1605473309999,50,12000


In [28]:
# number of unique NUM_SECU_EXPED
df["NUM_SECU_EXPED"].nunique()

153634

## pivoting table

In [29]:
id_col = "NUM_SECU_EXPED"
condition = "COD_PREGUNTA"
value_col = "VALOR_PREGUNTA"
# getting oldest repeated record
pv_df = df.pivot_table(index=id_col, columns=condition, values=value_col, aggfunc="last")
#renaming columns
pv_df = pv_df.add_prefix("preg_")
pv_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147342 entries, 728135189999 to 1961339499999
Data columns (total 27 columns):
preg_1     2544 non-null object
preg_2     52954 non-null object
preg_3     51706 non-null object
preg_4     54292 non-null object
preg_5     66654 non-null object
preg_6     66654 non-null object
preg_7     54292 non-null object
preg_8     57010 non-null object
preg_9     66654 non-null object
preg_10    54292 non-null object
preg_11    66654 non-null object
preg_12    12995 non-null object
preg_15    62152 non-null object
preg_16    12995 non-null object
preg_18    10762 non-null object
preg_19    10762 non-null object
preg_22    7923 non-null object
preg_24    10763 non-null object
preg_27    12995 non-null object
preg_28    12995 non-null object
preg_30    12995 non-null object
preg_31    65008 non-null object
preg_32    67798 non-null object
preg_33    65008 non-null object
preg_34    33748 non-null object
preg_36    16536 non-null object
preg_37    1069

In [30]:
# checking the NUM_SECU_EXPED that are not in pivot_table
pv_index = pv_df.index
# NUM_SECU_EXPED with all Nan are the ones missing
df[~df["NUM_SECU_EXPED"].isin(pv_index)]["NUM_SECU_EXPED"].nunique()

6292

In [31]:
pv_df.head()

COD_PREGUNTA,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,preg_9,preg_10,preg_11,preg_12,preg_15,preg_16,preg_18,preg_19,preg_22,preg_24,preg_27,preg_28,preg_30,preg_31,preg_32,preg_33,preg_34,preg_36,preg_37
NUM_SECU_EXPED,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
728135189999,,,,,,,,,,,,D,O,O,O,O,100000.0,N,07/06/2009 00:00,08/06/2009 00:00,S,,,,,,0.0
848582179999,,NO,N,NH,N,D,91.0,D,D,D,N,,,,,,,,,,,,,,,,
858568169999,,NO,N,NH,N,D,91.0,D,D,D,N,,,,,,,,,,,,,,,,
1094129479999,,,,,S,FA,,,AMB,,N,,,,,,,,,,,,,,,,
1104460669999,,,,,,,,,,,,O,O,O,O,O,0.0,N,23/09/2012 00:45,23/09/2012 00:46,S,,,,,,


In [32]:
pv_df.index.value_counts().max()

1

In [33]:
pv_df_res = pv_df.reset_index()
pv_df_res.NUM_SECU_EXPED.value_counts().max(), len(pv_df_res)

(1, 147342)

In [34]:
round((pv_df_res.isna().mean()) * 100, 2)

COD_PREGUNTA
NUM_SECU_EXPED     0.00
preg_1            98.27
preg_2            64.06
preg_3            64.91
preg_4            63.15
preg_5            54.76
preg_6            54.76
preg_7            63.15
preg_8            61.31
preg_9            54.76
preg_10           63.15
preg_11           54.76
preg_12           91.18
preg_15           57.82
preg_16           91.18
preg_18           92.70
preg_19           92.70
preg_22           94.62
preg_24           92.70
preg_27           91.18
preg_28           91.18
preg_30           91.18
preg_31           55.88
preg_32           53.99
preg_33           55.88
preg_34           77.10
preg_36           88.78
preg_37           92.74
dtype: float64

## adding date and user act

In [35]:
# getting the index of the pivot_table to concat
index = pv_df_res["NUM_SECU_EXPED"]
print(index.value_counts().max())

1


In [37]:
df_last = df[df["NUM_SECU_EXPED"].isin(index)][["NUM_SECU_EXPED", "FEC_ACT"]]
df_last.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1772083 entries, 0 to 1808627
Data columns (total 2 columns):
NUM_SECU_EXPED    int64
FEC_ACT           datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 40.6 MB


In [38]:
df_last = df_last[~df_last["NUM_SECU_EXPED"].duplicated(keep="last")]
#df_last.set_index("NUM_SECU_EXPED", inplace=True)
df_last.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147342 entries, 33 to 1808627
Data columns (total 2 columns):
NUM_SECU_EXPED    147342 non-null int64
FEC_ACT           147342 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 3.4 MB


In [39]:
# df_last_pv = df_last[df_last["NUM_SECU_EXPED"].isin(index)]
df_last["NUM_SECU_EXPED"].value_counts().max()

1

In [40]:
len(set(df_last.NUM_SECU_EXPED).intersection(set(index))), len(df_last), len(pv_df_res)

(147342, 147342, 147342)

In [41]:
set(df_last.NUM_SECU_EXPED) - (set(index))

set()

In [42]:
# COD_PREGUNTA it's the index column name
pv_df_res.head()

COD_PREGUNTA,NUM_SECU_EXPED,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,preg_9,preg_10,preg_11,preg_12,preg_15,preg_16,preg_18,preg_19,preg_22,preg_24,preg_27,preg_28,preg_30,preg_31,preg_32,preg_33,preg_34,preg_36,preg_37
0,728135189999,,,,,,,,,,,,D,O,O,O,O,100000.0,N,07/06/2009 00:00,08/06/2009 00:00,S,,,,,,0.0
1,848582179999,,NO,N,NH,N,D,91.0,D,D,D,N,,,,,,,,,,,,,,,,
2,858568169999,,NO,N,NH,N,D,91.0,D,D,D,N,,,,,,,,,,,,,,,,
3,1094129479999,,,,,S,FA,,,AMB,,N,,,,,,,,,,,,,,,,
4,1104460669999,,,,,,,,,,,,O,O,O,O,O,0.0,N,23/09/2012 00:45,23/09/2012 00:46,S,,,,,,


In [43]:
pv_df_res["NUM_SECU_EXPED"].value_counts().max(), df_last["NUM_SECU_EXPED"].value_counts().max()

(1, 1)

In [44]:
df_concat = pd.merge(pv_df_res, df_last, on="NUM_SECU_EXPED", how="inner")
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147342 entries, 0 to 147341
Data columns (total 29 columns):
NUM_SECU_EXPED    147342 non-null int64
preg_1            2544 non-null object
preg_2            52954 non-null object
preg_3            51706 non-null object
preg_4            54292 non-null object
preg_5            66654 non-null object
preg_6            66654 non-null object
preg_7            54292 non-null object
preg_8            57010 non-null object
preg_9            66654 non-null object
preg_10           54292 non-null object
preg_11           66654 non-null object
preg_12           12995 non-null object
preg_15           62152 non-null object
preg_16           12995 non-null object
preg_18           10762 non-null object
preg_19           10762 non-null object
preg_22           7923 non-null object
preg_24           10763 non-null object
preg_27           12995 non-null object
preg_28           12995 non-null object
preg_30           12995 non-null object
preg_31    

In [45]:
df_concat["NUM_SECU_EXPED"].value_counts().max()

1

In [46]:
df_concat.head()

Unnamed: 0,NUM_SECU_EXPED,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,preg_9,preg_10,preg_11,preg_12,preg_15,preg_16,preg_18,preg_19,preg_22,preg_24,preg_27,preg_28,preg_30,preg_31,preg_32,preg_33,preg_34,preg_36,preg_37,FEC_ACT
0,728135189999,,,,,,,,,,,,D,O,O,O,O,100000.0,N,07/06/2009 00:00,08/06/2009 00:00,S,,,,,,0.0,2017-11-07
1,848582179999,,NO,N,NH,N,D,91.0,D,D,D,N,,,,,,,,,,,,,,,,,2018-02-14
2,858568169999,,NO,N,NH,N,D,91.0,D,D,D,N,,,,,,,,,,,,,,,,,2018-02-14
3,1094129479999,,,,,S,FA,,,AMB,,N,,,,,,,,,,,,,,,,,2019-05-30
4,1104460669999,,,,,,,,,,,,O,O,O,O,O,0.0,N,23/09/2012 00:45,23/09/2012 00:46,S,,,,,,,2017-02-01


In [47]:
# checking unique num_secu_exped vs total number of rows
(df["NUM_SECU_EXPED"].nunique() ,  df_concat["NUM_SECU_EXPED"].nunique())

(153634, 147342)

In [48]:
no_id = list(set(df["NUM_SECU_EXPED"].unique())-set(df_concat.index.to_list()))
no_id[:5]

[1849669879999, 1867444649999, 1945268649999, 1844507049999, 1849094569999]

In [49]:
df[df["NUM_SECU_EXPED"] == 1849669879999]["COD_PREGUNTA"].unique()

array([34, 32, 15, 33, 31])

<h3 style="color:red">There are exped that have preg with nulls. That's the reason of fewer rows in pv_df.</h3>

In [50]:
df[df["NUM_SECU_EXPED"] == 1849669879999]

Unnamed: 0,COD_PREGUNTA,FEC_ACT,NUM_SECU_EXPED,TIPO_EXPED,VALOR_PREGUNTA
1271031,34,2018-08-07,1849669879999,60,
1271032,32,2018-08-07,1849669879999,60,1
1271033,15,2018-08-07,1849669879999,60,N
1271034,33,2018-08-07,1849669879999,60,N
1271035,31,2018-08-07,1849669879999,60,N
1306102,34,2018-08-15,1849669879999,60,
1306103,32,2018-08-15,1849669879999,60,1
1306104,15,2018-08-15,1849669879999,60,N
1306105,33,2018-08-15,1849669879999,60,N
1306106,31,2018-08-15,1849669879999,60,N


In [51]:
df[df["NUM_SECU_EXPED"] == 1854050729999]

Unnamed: 0,COD_PREGUNTA,FEC_ACT,NUM_SECU_EXPED,TIPO_EXPED,VALOR_PREGUNTA
1590449,34,2018-08-28,1854050729999,60,
1590450,32,2018-08-28,1854050729999,60,
1590451,15,2018-08-28,1854050729999,60,
1590452,33,2018-08-28,1854050729999,60,
1590453,31,2018-08-28,1854050729999,60,


# saving into feather

In [52]:
to_save = "../../../data/interim/4. transformed/PREGUNTAS-red_col_with_fec_and_user.feather"
df_concat.reset_index().to_feather(to_save)

# Loading dataset

In [92]:
# setting configs and constants
# path to raw dataset

if not(os.path.exists(RAW_FILE) and os.path.isfile(RAW_FILE)):
    raise Exception("File doesn't exists.")

In [None]:
df = pd.read_feather(RAW_FILE)
df.info()