# ENTENDIMIENTO DE LOS DATOS

El objetivo del presente notebook consiste en obtener las métricas de variables previamente seleccionadas mediante análisis de negocio, para generar reglas que ayuden a identificar outliers.

## SET UP

In [24]:
!pip install findspark

import findspark
findspark.init()



## LIBRERÍAS

In [25]:
# Load PySpark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

# Spark session used by the rest of the notebook:
# application name "Test_spark", master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Test_spark")
    .getOrCreate()
)

In [26]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *

In [145]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import math
# Install a global 'ignore' filter with no category/message/module restriction,
# so warnings raised after this point are not shown.
# NOTE(review): this presumably also hides deprecation warnings coming from
# pandas/pyspark -- confirm that is intended.
warnings.filterwarnings('ignore')

## CARGAR DATOS

In [28]:
# Column layout of the input file as (column name, Spark type) pairs.
# The order of the pairs is the order of the fields in the schema.
_SCHEMA_COLUMNS = [
    ('periodo', IntegerType),
    ('id_cli', IntegerType),
    ('fecha_nacimiento', IntegerType),
    ('edad', DoubleType),
    ('genero', StringType),
    ('estado_civil', StringType),
    ('nivel_academico', StringType),
    ('profesion', StringType),
    ('ocupacion', StringType),
    ('tipo_vivienda', StringType),
    ('ult_actual', IntegerType),
    ('categoria', DoubleType),
    ('codigo_ciiu', IntegerType),
    ('ind_mora_vigente', StringType),
    ('cartera_castigada', StringType),
    ('ciudad_residencia', StringType),
    ('departamento_residencia', StringType),
    ('ciudad_laboral', StringType),
    ('departamento_laboral', StringType),
    ('rechazo_credito', StringType),
    ('mora_max', DoubleType),
    ('cant_moras_30_ult_12_meses', DoubleType),
    ('cant_moras_60_ult_12_meses', DoubleType),
    ('cant_moras_90_ult_12_meses', DoubleType),
    ('cupo_total_tc', DoubleType),
    ('tenencia_tc', StringType),
    ('cuota_tc_bancolombia', DoubleType),
    ('tiene_consumo', StringType),
    ('tiene_crediagil', StringType),
    ('nro_tot_cuentas', IntegerType),
    ('ctas_activas', IntegerType),
    ('tiene_ctas_activas', StringType),
    ('ctas_embargadas', IntegerType),
    ('tiene_ctas_embargadas', StringType),
    ('pension_fopep', StringType),
    ('cuota_cred_hipot', DoubleType),
    ('tiene_cred_hipo_1', StringType),
    ('tiene_cred_hipo_2', StringType),
    ('mediana_nom3', DoubleType),
    ('mediana_pen3', DoubleType),
    ('ingreso_nompen', DoubleType),
    ('cat_ingreso', StringType),
    ('ingreso_final', DoubleType),
    ('cant_mora_30_tdc_ult_3m_sf', DoubleType),
    ('cant_mora_30_consum_ult_3m_sf', DoubleType),
    ('cuota_de_vivienda', DoubleType),
    ('cuota_de_consumo', DoubleType),
    ('cuota_rotativos', DoubleType),
    ('cuota_tarjeta_de_credito', DoubleType),
    ('cuota_de_sector_solidario', DoubleType),
    ('cuota_sector_real_comercio', DoubleType),
    ('cupo_tc_mdo', DoubleType),
    ('saldo_prom3_tdc_mdo', DoubleType),
    ('cuota_tc_mdo', DoubleType),
    ('saldo_no_rot_mdo', DoubleType),
    ('cuota_libranza_sf', DoubleType),
    ('cant_oblig_tot_sf', DoubleType),
    ('cant_cast_ult_12m_sr', DoubleType),
    ('ind', DoubleType),
    ('rep_calif_cred', StringType),
    ('pol_centr_ext', DoubleType),
    ('convenio_lib', StringType),
    ('ingreso_nomina', DoubleType),
    ('ingreso_segurida_social', DoubleType),
    ('gasto_familiar', DoubleType),
]

# Every field is declared nullable (third StructField argument is True).
schema = StructType(
    [
        StructField(column_name, column_type(), True)
        for column_name, column_type in _SCHEMA_COLUMNS
    ]
)

In [29]:
# Read the CSV with the explicit schema: ";" is the field delimiter and the
# two-character token \N is the configured null marker.
df = (
    spark.read
    .schema(schema)
    .option("delimiter", ";")
    .option("nullValue", "\\N")
    .csv("input/Dataton_train_semicolon.csv")
)

In [30]:
# Number of rows in df (shown as the cell output)
df.count()

20988748

## ANÁLISIS EXPLORATORIO

### Variables seleccionadas para realizar los resúmenes estadísticos

In [31]:
# Columns for which summary statistics are computed, in processing order.
variables = [
    'mora_max',
    'cupo_total_tc',
    'cuota_tc_bancolombia',
    'nro_tot_cuentas',
    'ctas_activas',
    'mediana_nom3',
    'mediana_pen3',
    'ingreso_final',
    'cuota_de_vivienda',
    'cuota_de_consumo',
    'cuota_rotativos',
    'cuota_tarjeta_de_credito',
    'cuota_de_sector_solidario',
    'cuota_sector_real_comercio',
    'cupo_tc_mdo',
    'saldo_prom3_tdc_mdo',
    'cuota_tc_mdo',
    'saldo_no_rot_mdo',
    'cuota_libranza_sf',
    'cant_oblig_tot_sf',
    'cant_cast_ult_12m_sr',
    'ind',
    'gasto_familiar',
]

### Obtención de límites para determinar outliers

In [32]:
# Build one row of summary statistics per selected variable and add the
# IQR-based outlier fences:
#   IQR   = Q3 - Q1
#   upper = Q3 + 1.5 * IQR
#   lower = Q1 - 1.5 * IQR
# Results: `collector` (seed row + one row per variable), `error` (names of
# the variables that failed) and `error_details` (variable -> failure reason).
columns = ['count','mean','stddev','min','25%','50%','75%','max','IQR','upper','lower']
# Seed row that fixes the column layout of the collector. It is indexed
# "start" so it can be removed by label once the real rows have been added.
data = [20988748,201959.3282136219,49.69330198063793,201902,201907.0,202001.0,202007.0,202011,100.0,202157.0,201757.0]
index = ["start"]
collector = pd.DataFrame(data=[data],index=index,columns=columns)
# Set index name
collector.index.name = "summary"
# Variables whose summary could not be computed, and why
error = []
error_details = {}
# One single-row frame per variable that was summarised successfully
frames = []
for var in variables:
    try:
        # summary() gives one row per statistic; after transposing, the
        # variable is the row and the statistics are the columns.
        df_pandas = df.select(var).summary().toPandas()
        df_pandas = df_pandas.set_index("summary").T
        # Cast explicitly before doing arithmetic: the quartiles are not
        # assumed to arrive as numeric values.
        q1 = df_pandas["25%"].astype("float64")
        q3 = df_pandas["75%"].astype("float64")
        df_pandas["IQR"] = q3 - q1
        df_pandas["upper"] = q3 + 1.5 * df_pandas["IQR"]
        df_pandas["lower"] = q1 - 1.5 * df_pandas["IQR"]
    except Exception as exc:
        # Best effort: a failing variable must not abort the remaining ones.
        # Catching Exception (instead of a bare except) lets KeyboardInterrupt
        # and SystemExit stop the loop, and the reason is kept for inspection.
        error.append(var)
        error_details[var] = repr(exc)
    else:
        frames.append(df_pandas)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# appending inside the loop copies the whole collector on every iteration.
collector = pd.concat([collector, *frames])

In [33]:
# Show the variables collected in `error`, i.e. those for which the summary
# loop raised an exception (an empty list means none failed).
print(error)

[]


In [34]:
# Remove the row labelled "start" from the collected statistics
summary = collector.drop(index="start")

In [35]:
# Remove the cap on the number of rows pandas renders, then show the table
pd.options.display.max_rows = None
display(summary)

summary,count,mean,stddev,min,25%,50%,75%,max,IQR,upper,lower
mora_max,15017120,5.008722644555014,23.71134088246277,0.0,0.0,0.0,1.0,3367.0,1.0,2.5,-1.5
cupo_total_tc,20988748,4426551.108144218,10936236.909264078,0.0,0.0,0.0,4700000.0,966200000.0,4700000.0,11750000.0,-7050000.0
cuota_tc_bancolombia,20988748,194987.48602254904,607943.0880680818,0.0,0.0,0.0,197200.0,242766616.95,197200.0,493000.0,-295800.0
nro_tot_cuentas,20988520,1.0778785736202456,0.4400287935782586,0.0,1.0,1.0,1.0,36.0,0.0,1.0,1.0
ctas_activas,20988520,1.0206335653967025,0.4069990095936185,0.0,1.0,1.0,1.0,36.0,0.0,1.0,1.0
mediana_nom3,20988748,1065056.9633973849,2491489.0416699634,0.0,0.0,0.0,1488485.0,582629596.0,1488485.0,3721212.5,-2232727.5
mediana_pen3,20988748,154719.52523034596,875768.2804332058,0.0,0.0,0.0,0.0,825295278.0,0.0,0.0,0.0
ingreso_final,20834697,4952928.470613839,13803849.167957649,0.0,1600000.0,2735000.0,5012939.0,6646637302.04,3412939.0,10132347.5,-3519408.5
cuota_de_vivienda,20988748,160509.97694142075,890804.3113849775,-21225012.72428308,0.0,0.0,0.0,572309000.0,0.0,0.0,0.0
cuota_de_consumo,20988748,636921.0889199376,4907928.535098017,-687131500.0,0.0,0.0,480000.0,3461069000.0,480000.0,1200000.0,-720000.0


In [36]:
# Save point: write the summary table to the relative path "summary.csv"
summary.to_csv("summary.csv")