# PREPROCESAMIENTO - TRAIN

El objetivo del presente notebook es eliminar outliers con base en variables previamente seleccionadas mediante el análisis del diccionario de datos y del tipo de dato de cada variable, y aplicar la dummización (codificación one-hot) de las variables categóricas.

## SET UP

In [1]:
!pip install findspark

import findspark
findspark.init()



## LIBRERÍAS

In [2]:
# Cargar Pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Test_spark")
    .getOrCreate()
)

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## CARGAR DATOS

In [5]:
# Explicit schema for the training CSV. Each pair is (column name, Spark type);
# the order must match the column order of the file, and every field is nullable.
schema = StructType([
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in (
        ("periodo", IntegerType),
        ("id_cli", IntegerType),
        ("fecha_nacimiento", IntegerType),
        ("edad", DoubleType),
        ("genero", StringType),
        ("estado_civil", StringType),
        ("nivel_academico", StringType),
        ("profesion", StringType),
        ("ocupacion", StringType),
        ("tipo_vivienda", StringType),
        ("ult_actual", IntegerType),
        ("categoria", DoubleType),
        ("codigo_ciiu", IntegerType),
        ("ind_mora_vigente", StringType),
        ("cartera_castigada", StringType),
        ("ciudad_residencia", StringType),
        ("departamento_residencia", StringType),
        ("ciudad_laboral", StringType),
        ("departamento_laboral", StringType),
        ("rechazo_credito", StringType),
        ("mora_max", DoubleType),
        ("cant_moras_30_ult_12_meses", DoubleType),
        ("cant_moras_60_ult_12_meses", DoubleType),
        ("cant_moras_90_ult_12_meses", DoubleType),
        ("cupo_total_tc", DoubleType),
        ("tenencia_tc", StringType),
        ("cuota_tc_bancolombia", DoubleType),
        ("tiene_consumo", StringType),
        ("tiene_crediagil", StringType),
        ("nro_tot_cuentas", IntegerType),
        ("ctas_activas", IntegerType),
        ("tiene_ctas_activas", StringType),
        ("ctas_embargadas", IntegerType),
        ("tiene_ctas_embargadas", StringType),
        ("pension_fopep", StringType),
        ("cuota_cred_hipot", DoubleType),
        ("tiene_cred_hipo_1", StringType),
        ("tiene_cred_hipo_2", StringType),
        ("mediana_nom3", DoubleType),
        ("mediana_pen3", DoubleType),
        ("ingreso_nompen", DoubleType),
        ("cat_ingreso", StringType),
        ("ingreso_final", DoubleType),
        ("cant_mora_30_tdc_ult_3m_sf", DoubleType),
        ("cant_mora_30_consum_ult_3m_sf", DoubleType),
        ("cuota_de_vivienda", DoubleType),
        ("cuota_de_consumo", DoubleType),
        ("cuota_rotativos", DoubleType),
        ("cuota_tarjeta_de_credito", DoubleType),
        ("cuota_de_sector_solidario", DoubleType),
        ("cuota_sector_real_comercio", DoubleType),
        ("cupo_tc_mdo", DoubleType),
        ("saldo_prom3_tdc_mdo", DoubleType),
        ("cuota_tc_mdo", DoubleType),
        ("saldo_no_rot_mdo", DoubleType),
        ("cuota_libranza_sf", DoubleType),
        ("cant_oblig_tot_sf", DoubleType),
        ("cant_cast_ult_12m_sr", DoubleType),
        ("ind", DoubleType),
        ("rep_calif_cred", StringType),
        ("pol_centr_ext", DoubleType),
        ("convenio_lib", StringType),
        ("ingreso_nomina", DoubleType),
        ("ingreso_segurida_social", DoubleType),
        ("gasto_familiar", DoubleType),
    )
])

In [6]:
# Read the semicolon-delimited training file with the explicit schema;
# the literal \N in the file is interpreted as null.
df = (
    spark.read
    .schema(schema)
    .option("nullValue", "\\N")
    .option("delimiter", ";")
    .csv("input/Dataton_train_semicolon.csv")
)

In [7]:
# Row count of the raw training data (count() is a Spark action and scans the file).
df.count()

20988748

## PREPROCESAMIENTO

### Variables seleccionadas para reducir columnas

In [8]:
# Columns removed from the dataset before any further processing.
lista = [
    "tiene_ctas_embargadas",
    "rechazo_credito",
    "pension_fopep",
    "tiene_cred_hipo_2",
    "tiene_cred_hipo_1",
    "cuota_cred_hipot",
    "tiene_consumo",
    "tiene_crediagil",
    "convenio_lib",
    "ingreso_nomina",
    "ingreso_segurida_social",
    "ciudad_residencia",
    "ciudad_laboral",
    "departamento_laboral",
]

### Reducción de dimensionalidad

In [9]:
# Remove every column named in `lista` from the DataFrame.
df = df.drop(*lista)

### Variables seleccionadas para reducir registros

In [10]:
# Numeric variables whose outliers determine which rows are removed.
variables = [
    "ind",
    "ingreso_final",
    "cupo_total_tc",
    "cuota_tc_bancolombia",
    "mediana_nom3",
    "cuota_de_consumo",
    "cuota_tarjeta_de_credito",
    "cupo_tc_mdo",
    "saldo_prom3_tdc_mdo",
    "cuota_tc_mdo",
    "saldo_no_rot_mdo",
]

### Resumen estadístico por variable para obtener las reglas a aplicar a los outliers

In [11]:
# Load the precomputed per-variable statistics. The first CSV column has no
# header (pandas names it "Unnamed: 0"); it is renamed and used as the index.
summary = (
    pd.read_csv("summary.csv")
    .rename(columns={"Unnamed: 0": "summary"})
    .set_index("summary")
)

In [12]:
# Preview the first rows of the statistics table, including the upper/lower bound columns.
summary.head()

Unnamed: 0_level_0,count,mean,stddev,min,25%,50%,75%,max,IQR,upper,lower
summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
mora_max,15017120,5.008723,23.71134,0.0,0.0,0.0,1.0,999.0,1.0,2.5,-1.5
cupo_total_tc,20988748,4426551.0,10936240.0,0.0,0.0,0.0,4700000.0,9999999.0,4700000.0,11750000.0,-7050000.0
cuota_tc_bancolombia,20988748,194987.5,607943.1,0.0,0.0,0.0,197200.0,999999.999,197200.0,493000.0,-295800.0
nro_tot_cuentas,20988520,1.077879,0.4400288,0.0,1.0,1.0,1.0,9.0,0.0,1.0,1.0
ctas_activas,20988520,1.020634,0.406999,0.0,1.0,1.0,1.0,9.0,0.0,1.0,1.0


### Eliminación de outliers

In [13]:
# Working name for the outlier filtering: `data` is rebound by each filter
# below, while `df` keeps referring to the unfiltered frame.
data = df

In [14]:
# Drop outliers: for every selected variable keep only the rows whose value lies
# strictly between the "lower" and "upper" bounds stored in `summary`.
# NOTE: rows where the variable is null are removed too, because a null
# comparison does not evaluate to true in the filter.
# Variables that cannot be processed (no row in `summary`, or no such column in
# `data`) are collected in `error`; any other failure is allowed to propagate
# instead of being silently swallowed.
error = []
for var in variables:
    bounds = summary[summary.index == var]
    if bounds.empty or var not in data.columns:
        # Record the missing variable and leave `data` untouched.
        error.append(var)
        continue
    # Use the first matching row of the statistics table.
    upper = float(bounds.iloc[0]["upper"])
    lower = float(bounds.iloc[0]["lower"])
    data = data.where((col(var) > lower) & (col(var) < upper))
    # count() runs a Spark job per variable; kept to report the remaining rows.
    print("variable: " + str(var),",rows: " + str(data.count()))

variable: ind ,rows: 18354590
variable: ingreso_final ,rows: 17872401
variable: cupo_total_tc ,rows: 16243361
variable: cuota_tc_bancolombia ,rows: 15736006
variable: mediana_nom3 ,rows: 14799177
variable: cuota_de_consumo ,rows: 13865267
variable: cuota_tarjeta_de_credito ,rows: 13531170
variable: cupo_tc_mdo ,rows: 12870610
variable: saldo_prom3_tdc_mdo ,rows: 11994388
variable: cuota_tc_mdo ,rows: 11421656
variable: saldo_no_rot_mdo ,rows: 10665003


In [15]:
# Variables that were skipped by the outlier loop (empty list means none).
print(error)

[]


In [16]:
# Age filter: round edad to zero decimals, then keep rows with 18 <= edad < 99.
# `round` is applied to a Column, so it is presumably pyspark.sql.functions.round
# brought in by the wildcard import rather than the Python builtin.
data = (
    data
    .withColumn("edad", round(col("edad"), 0))
    .where((col("edad") >= 18) & (col("edad") < 99))
)
data.count()

10660715

In [17]:
# Name the outlier-filtered frame for the encoding stage.
data_preprocessed = data

## One hot encoding

In [18]:
# Categorical columns that are indexed and then one-hot encoded.
categorical_variables = [
    "genero",
    "estado_civil",
    "nivel_academico",
    "profesion",
    "ocupacion",
    "tipo_vivienda",
    "departamento_residencia",
    "cat_ingreso",
    "rep_calif_cred",
    "ind_mora_vigente",
    "cartera_castigada",
    "tenencia_tc",
    "tiene_ctas_activas",
]

In [19]:
# Replace nulls with the literal string "NA" so the categorical columns have no
# missing values before indexing.
# NOTE(review): PySpark's fillna with a string value is documented to ignore
# non-string columns, so numeric nulls would remain — confirm this is intended.
data_preprocessed = data_preprocessed.fillna("NA")

In [20]:
# Name of the one-hot output column for each categorical variable: "<variable>_dummy".
dummy = "_dummy"
dumies = [name + dummy for name in categorical_variables]

In [21]:
# Name of the intermediate index column for each categorical variable: "<variable>_indexer".
indexer = "_indexer"
indexers = [name + indexer for name in categorical_variables]

In [22]:
# Map every categorical column to a numeric index column ("<variable>_indexer").
# A single multi-column StringIndexer (available in the Spark 3 API this notebook
# already relies on) fits all columns together instead of running one fit per
# column over the data. The estimator also gets its own name so it no longer
# overwrites the `indexer` suffix string defined earlier.
indexed = data_preprocessed

string_indexer = StringIndexer(inputCols=categorical_variables, outputCols=indexers)
indexed = string_indexer.fit(indexed).transform(indexed)

In [23]:
# One-hot encode each index column into its "<variable>_dummy" vector column,
# fitting and applying one encoder per column in sequence.
df_encode = indexed

for index_col, dummy_col in zip(indexers, dumies):
    fitted_encoder = OneHotEncoder(inputCol=index_col, outputCol=dummy_col).fit(df_encode)
    df_encode = fitted_encoder.transform(df_encode)

In [24]:
# Keep only the encoded vectors: remove the raw categorical columns and the
# intermediate index columns in one call.
df_encode = df_encode.drop(*categorical_variables, *indexers)

In [25]:
# List the remaining columns to check that only numeric and *_dummy columns are left.
df_encode.columns

['periodo',
 'id_cli',
 'fecha_nacimiento',
 'edad',
 'ult_actual',
 'categoria',
 'codigo_ciiu',
 'mora_max',
 'cant_moras_30_ult_12_meses',
 'cant_moras_60_ult_12_meses',
 'cant_moras_90_ult_12_meses',
 'cupo_total_tc',
 'cuota_tc_bancolombia',
 'nro_tot_cuentas',
 'ctas_activas',
 'ctas_embargadas',
 'mediana_nom3',
 'mediana_pen3',
 'ingreso_nompen',
 'ingreso_final',
 'cant_mora_30_tdc_ult_3m_sf',
 'cant_mora_30_consum_ult_3m_sf',
 'cuota_de_vivienda',
 'cuota_de_consumo',
 'cuota_rotativos',
 'cuota_tarjeta_de_credito',
 'cuota_de_sector_solidario',
 'cuota_sector_real_comercio',
 'cupo_tc_mdo',
 'saldo_prom3_tdc_mdo',
 'cuota_tc_mdo',
 'saldo_no_rot_mdo',
 'cuota_libranza_sf',
 'cant_oblig_tot_sf',
 'cant_cast_ult_12m_sr',
 'ind',
 'pol_centr_ext',
 'gasto_familiar',
 'genero_dummy',
 'estado_civil_dummy',
 'nivel_academico_dummy',
 'profesion_dummy',
 'ocupacion_dummy',
 'tipo_vivienda_dummy',
 'departamento_residencia_dummy',
 'cat_ingreso_dummy',
 'rep_calif_cred_dummy',
 'in

In [26]:
# Row count after encoding, for comparison with the count after the age filter.
df_encode.count()

10660715

In [28]:
# Persist the preprocessed data as Parquet. Overwrite mode lets the notebook be
# re-run end to end; with the default save mode the write raises as soon as the
# output path already exists from a previous run.
df_encode.write.mode("overwrite").parquet("output/preprocessing/preprocessing_data.parquet")