# merging VIGABT_POLIZAS+BTT_ASEGURADOS+tb_cif

In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# notebook-wide pandas display limits (rows and columns shown before truncation)
DISPLAY_LIMIT = 500
pd.set_option("display.max_rows", DISPLAY_LIMIT)
pd.set_option("display.max_columns", DISPLAY_LIMIT)

# Merging VIGABT_POLIZAS & BTT_ASEGURADOS

## Loading VIGABT_POLIZAS

In [3]:
# Location of the cleaned VIGABT_POLIZAS dataset (feather file)
path_vigabt = "../../../data/interim/3. row_red/VIGABT_POLIZAS-red_col-red_rows-clean.feather"

# Fail fast with a specific exception when the input file is missing.
# os.path.isfile() already returns False for paths that do not exist,
# so a separate os.path.exists() check is unnecessary.
if not os.path.isfile(path_vigabt):
    raise FileNotFoundError("File {} doesn't exist.".format(path_vigabt))

In [4]:
# read the VIGABT_POLIZAS feather file and print its column / dtype summary
df_vigabt = pd.read_feather(path_vigabt)
df_vigabt.info()

  labels, = index.labels


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729756 entries, 0 to 729755
Data columns (total 29 columns):
CANT_RENOVACION                 709911 non-null float64
CAPITAL_ACCESORIOS              729756 non-null object
CAPITAL_ASEGURADO_COTIZACION    725415 non-null object
CAPITAL_VEHICULO                729756 non-null object
COD_COBRO                       729756 non-null object
COD_COBRO_ANTERIOR              604173 non-null object
COD_POSTAL                      729756 non-null int64
COD_PROD                        729756 non-null int64
COD_RAMO                        729756 non-null int64
COD_RIES                        729756 non-null int64
COD_ZONA_CASCO                  729754 non-null float64
COD_ZONA_RC                     729754 non-null float64
COD_ZONA_ROBO                   729754 non-null float64
CONV_COMISIONARIO               729756 non-null object
FECHA_PROCESO                   729756 non-null datetime64[ns]
FECHA_VENC_POL                  729756 non-null datetime

<b>VIGABT_POLIZAS has both keys: ID (renamed to CIF_ID below) and NUM_SECU_POL</b>

In [5]:
# Rename ID to CIF_ID so the key carries the same name used by the merges below.
# The result is reassigned instead of using inplace=True, which keeps the
# operation explicit; re-running the cell is harmless because rename() ignores
# labels that are no longer present.
df_vigabt = df_vigabt.rename(columns={"ID": "CIF_ID"})

In [6]:
# ratio of distinct NUM_SECU_POL values to row count (1.0 => one distinct value per row)
df_vigabt["NUM_SECU_POL"].nunique() / len(df_vigabt)

1.0

In [7]:
# ratio of distinct CIF_ID values to row count (below 1.0 => CIF_ID repeats across rows)
df_vigabt["CIF_ID"].nunique() / len(df_vigabt)

0.05869633137651489

In [8]:
# showing most repeated clients: row count per CIF_ID, first five entries
df_vigabt["CIF_ID"].value_counts().head()

16647307.0    152
19613011.0     79
10991015.0     16
10806326.0     13
3081635.0      13
Name: CIF_ID, dtype: int64

## Loading BTT_ASEGURADOS

In [9]:
# Location of the cleaned BTT_ASEGURADOS dataset (feather file)
path_aseg = "../../../data/interim/3. row_red/BTT_ASEGURADOS-red_col-red_cols-clean-unique_CIF_ID.feather"

# Fail fast with a specific exception when the input file is missing.
# os.path.isfile() already returns False for paths that do not exist,
# so a separate os.path.exists() check is unnecessary.
if not os.path.isfile(path_aseg):
    raise FileNotFoundError("File {} doesn't exist.".format(path_aseg))

In [10]:
# read the BTT_ASEGURADOS feather file and print its column / dtype summary
df_aseg = pd.read_feather(path_aseg)
df_aseg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3385271 entries, 0 to 3385270
Data columns (total 6 columns):
CIF_ID              float64
COD_EST_CIVIL       object
FECHA_DESDE         datetime64[ns]
FECHA_NACIMIENTO    datetime64[ns]
SEXO_ASEG           object
TIPO_ACTIVIDAD      object
dtypes: datetime64[ns](2), float64(1), object(3)
memory usage: 155.0+ MB


  labels, = index.labels


In [11]:
# ratio of distinct CIF_ID values to row count (1.0 => CIF_ID can serve as a unique key here)
df_aseg["CIF_ID"].nunique() / len(df_aseg)

1.0

## Comparing VIGABT_POLIZAS vs BTT_ASEGURADOS
Checking CIF_ID intersection

In [12]:
# CIF_ID overlap between VIGABT_POLIZAS and BTT_ASEGURADOS:
# (shared count, % of VIGABT_POLIZAS ids, % of BTT_ASEGURADOS ids)
col = "CIF_ID"
ids_vigabt = set(df_vigabt[col].unique())
ids_aseg = set(df_aseg[col].unique())
common = ids_vigabt & ids_aseg
n_common = len(common)
(
    n_common,
    n_common / df_vigabt[col].nunique() * 100,
    n_common / df_aseg[col].nunique() * 100,
)

(42824, 99.97665405985899, 1.2650095073629262)

In [13]:
# CIF_ID values present in VIGABT_POLIZAS but absent from BTT_ASEGURADOS
diff = set(df_vigabt[col].unique()).difference(set(df_aseg[col].unique()))
len(diff)

11

In [15]:
# display the CIF_ID values found only in VIGABT_POLIZAS
diff

{nan,
 1744654.0,
 1772415.0,
 2620576.0,
 2891556.0,
 5944087.0,
 6037762.0,
 6102951.0,
 7264544.0,
 8558972.0,
 23020616.0}

## Merging VIGABT_POLIZAS & BTT_ASEGURADOS

In [16]:
# Left merge on CIF_ID: every VIGABT_POLIZAS row is kept; rows whose CIF_ID is
# not found in BTT_ASEGURADOS get missing values in the BTT_ASEGURADOS columns.
# validate="m:1" makes pandas raise a MergeError if CIF_ID is not unique in
# df_aseg, which would otherwise silently duplicate policy rows.
df_vigabt_aseg = pd.merge(
    df_vigabt,
    df_aseg,
    on="CIF_ID",
    how="left",
    suffixes=("_vigabt", "_aseg"),
    validate="m:1",
)
# row counts: left input, right input, merged result (left and merged should match)
print(len(df_vigabt), len(df_aseg), len(df_vigabt_aseg))

729756 3385271 729756


In [17]:
# column / dtype summary of the merged frame
df_vigabt_aseg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729756 entries, 0 to 729755
Data columns (total 34 columns):
CANT_RENOVACION                 709911 non-null float64
CAPITAL_ACCESORIOS              729756 non-null object
CAPITAL_ASEGURADO_COTIZACION    725415 non-null object
CAPITAL_VEHICULO                729756 non-null object
COD_COBRO                       729756 non-null object
COD_COBRO_ANTERIOR              604173 non-null object
COD_POSTAL                      729756 non-null int64
COD_PROD                        729756 non-null int64
COD_RAMO                        729756 non-null int64
COD_RIES                        729756 non-null int64
COD_ZONA_CASCO                  729754 non-null float64
COD_ZONA_RC                     729754 non-null float64
COD_ZONA_ROBO                   729754 non-null float64
CONV_COMISIONARIO               729756 non-null object
FECHA_PROCESO                   729756 non-null datetime64[ns]
FECHA_VENC_POL                  729756 non-null datetime

### temporal saving (checkpoint)

In [18]:
# Checkpoint: persist the VIGABT_POLIZAS + BTT_ASEGURADOS merge as a feather file
path_to_save = "../../../data/interim/5. merged"
check_filename = "VIGABT_POLIZAS+BTT_ASEGURADOS.feather"
# make sure the output directory exists so the write does not fail on a fresh checkout
os.makedirs(path_to_save, exist_ok=True)
df_vigabt_aseg.to_feather(os.path.join(path_to_save, check_filename))

# Merging (VIGABT_POLIZAS & BTT_ASEGURADOS) & tb_cif

## Restart from checkpoint (VIGABT_POLIZAS+BTT_ASEGURADOS)

In [23]:
# Reload the VIGABT_POLIZAS + BTT_ASEGURADOS checkpoint written earlier
path = "../../../data/interim/5. merged"
check_filename = "VIGABT_POLIZAS+BTT_ASEGURADOS.feather"
path_file = os.path.join(path, check_filename)

# same fail-fast check as the other load cells, so a missing checkpoint gives a clear error
if not os.path.isfile(path_file):
    raise FileNotFoundError("File {} doesn't exist.".format(path_file))

df_vigabt_aseg = pd.read_feather(path_file)
df_vigabt_aseg.info()

  labels, = index.labels


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729756 entries, 0 to 729755
Data columns (total 34 columns):
CANT_RENOVACION                 709911 non-null float64
CAPITAL_ACCESORIOS              729756 non-null object
CAPITAL_ASEGURADO_COTIZACION    725415 non-null object
CAPITAL_VEHICULO                729756 non-null object
COD_COBRO                       729756 non-null object
COD_COBRO_ANTERIOR              604173 non-null object
COD_POSTAL                      729756 non-null int64
COD_PROD                        729756 non-null int64
COD_RAMO                        729756 non-null int64
COD_RIES                        729756 non-null int64
COD_ZONA_CASCO                  729754 non-null float64
COD_ZONA_RC                     729754 non-null float64
COD_ZONA_ROBO                   729754 non-null float64
CONV_COMISIONARIO               729756 non-null object
FECHA_PROCESO                   729756 non-null datetime64[ns]
FECHA_VENC_POL                  729756 non-null datetime

## Loading tb_cif

In [24]:
# Location of the cleaned tb_cif dataset (feather file)
# NOTE(review): this file is read from "2. col_red_min" although its name contains
# "red_rows" and the other inputs come from "3. row_red" - confirm the directory.
path_cif = "../../../data/interim/2. col_red_min/tb_cif-red_col-red_rows-clean.feather"

# Fail fast with a specific exception when the input file is missing.
# os.path.isfile() already returns False for paths that do not exist,
# so a separate os.path.exists() check is unnecessary.
if not os.path.isfile(path_cif):
    raise FileNotFoundError("File {} doesn't exist.".format(path_cif))

In [25]:
# read the tb_cif feather file and print its column / dtype summary
df_cif = pd.read_feather(path_cif)
df_cif.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174922 entries, 0 to 174921
Data columns (total 6 columns):
CLIENTE                    129422 non-null object
CODIGO_NACION              172133 non-null object
DATECO_TIPO_ACTIVIDAD      112412 non-null object
DOMICILIO_CODIGO_POSTAL    148152 non-null float64
HABILITADO                 38716 non-null object
CIF_ID                     174922 non-null int64
dtypes: float64(1), int64(1), object(4)
memory usage: 8.0+ MB


  labels, = index.labels


In [26]:
# checking unique key: ratio of distinct CIF_ID values to row count (1.0 => unique)
df_cif["CIF_ID"].nunique() / len(df_cif)

1.0

## Comparing merged vs tb_cif
Checking CIF_ID intersection

In [27]:
# CIF_ID overlap between the merged frame and tb_cif:
# (shared count, % of merged-frame ids, % of tb_cif ids)
ids_merged = set(df_vigabt_aseg["CIF_ID"].unique())
ids_cif = set(df_cif["CIF_ID"].unique())
id_common = ids_merged & ids_cif
n_id_common = len(id_common)
(
    n_id_common,
    n_id_common / df_vigabt_aseg["CIF_ID"].nunique() * 100,
    n_id_common / df_cif["CIF_ID"].nunique() * 100,
)

(12241, 28.5777653266097, 6.997976240838774)

<b>Few CIF_IDs in common: only ~28.6% of the merged dataset's CIF_IDs are found in tb_cif (~7.0% of tb_cif's CIF_IDs)</b>

## Merging merged & tb_cif


In [28]:
# Left merge on CIF_ID: every row of the VIGABT_POLIZAS + BTT_ASEGURADOS frame is
# kept; rows whose CIF_ID is not found in tb_cif get missing values in the tb_cif
# columns. validate="m:1" makes pandas raise a MergeError if CIF_ID is not unique
# in df_cif, which would otherwise silently duplicate rows.
df_vigabt_aseg_cif = pd.merge(
    df_vigabt_aseg,
    df_cif,
    on="CIF_ID",
    how="left",
    suffixes=("", "_tbcif"),
    validate="m:1",
)
# row counts: merged result, right input, left input (merged and left should match)
print(len(df_vigabt_aseg_cif), len(df_cif), len(df_vigabt_aseg))

729756 174922 729756


### saving merged dataset

In [30]:
# Persist the final VIGABT_POLIZAS + BTT_ASEGURADOS + tb_cif merge as a feather file
path_to_save = "../../../data/interim/5. merged"
check_filename = "VIGABT_POLIZAS+BTT_ASEGURADOS+tb_cif.feather"
# make sure the output directory exists so the write does not fail on a fresh checkout
os.makedirs(path_to_save, exist_ok=True)
df_vigabt_aseg_cif.to_feather(os.path.join(path_to_save, check_filename))