# ENTENDIMIENTO DE LOS DATOS

El presente notebook busca ayudar a entender cuál es la tipologia de datos de las variables independientes, para estructurar el esquema de la base de datos y definir tratamientos posteriores.

## SET UP

In [1]:
!pip install findspark

import findspark
findspark.init()



## LIBRERIAS

In [2]:
# Cargar Pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("Test_spark").master("local[*]").getOrCreate()

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## CARGAR DATOS

In [5]:
schema = StructType([
    StructField('periodo',StringType(), True),
    StructField('id_cli',StringType(), True),
    StructField('fecha_nacimiento',StringType(), True),
    StructField('edad',StringType(), True),
    StructField('genero',StringType(), True),
    StructField('estado_civil',StringType(), True),
    StructField('nivel_academico',StringType(), True),
    StructField('profesion',StringType(), True),
    StructField('ocupacion',StringType(), True),
    StructField('tipo_vivienda',StringType(), True),
    StructField('ult_actual',StringType(), True),
    StructField('categoria',StringType(), True),
    StructField('codigo_ciiu',StringType(), True),
    StructField('ind_mora_vigente',StringType(), True),
    StructField('cartera_castigada',StringType(), True),
    StructField('ciudad_residencia',StringType(), True),
    StructField('departamento_residencia',StringType(), True),
    StructField('ciudad_laboral',StringType(), True),
    StructField('departamento_laboral',StringType(), True),
    StructField('rechazo_credito',StringType(), True),
    StructField('mora_max',StringType(), True),
    StructField('cant_moras_30_ult_12_meses',StringType(), True),
    StructField('cant_moras_60_ult_12_meses',StringType(), True),
    StructField('cant_moras_90_ult_12_meses',StringType(), True),
    StructField('cupo_total_tc',StringType(), True),
    StructField('tenencia_tc',StringType(), True),
    StructField('cuota_tc_bancolombia',StringType(), True),
    StructField('tiene_consumo',StringType(), True),
    StructField('tiene_crediagil',StringType(), True),
    StructField('nro_tot_cuentas',StringType(), True),
    StructField('ctas_activas',StringType(), True),
    StructField('tiene_ctas_activas',StringType(), True),
    StructField('ctas_embargadas',StringType(), True),
    StructField('tiene_ctas_embargadas',StringType(), True),
    StructField('pension_fopep',StringType(), True),
    StructField('cuota_cred_hipot',StringType(), True),
    StructField('tiene_cred_hipo_1',StringType(), True),
    StructField('tiene_cred_hipo_2',StringType(), True),
    StructField('mediana_nom3',StringType(), True),
    StructField('mediana_pen3',StringType(), True),
    StructField('ingreso_nompen',StringType(), True),
    StructField('cat_ingreso',StringType(), True),
    StructField('ingreso_final',StringType(), True),
    StructField('cant_mora_30_tdc_ult_3m_sf',StringType(), True),
    StructField('cant_mora_30_consum_ult_3m_sf',StringType(), True),
    StructField('cuota_de_vivienda',StringType(), True),
    StructField('cuota_de_consumo',StringType(), True),
    StructField('cuota_rotativos',StringType(), True),
    StructField('cuota_tarjeta_de_credito',StringType(), True),
    StructField('cuota_de_sector_solidario',StringType(), True),
    StructField('cuota_sector_real_comercio',StringType(), True),
    StructField('cupo_tc_mdo',StringType(), True),
    StructField('saldo_prom3_tdc_mdo',StringType(), True),
    StructField('cuota_tc_mdo',StringType(), True),
    StructField('saldo_no_rot_mdo',StringType(), True),
    StructField('cuota_libranza_sf',StringType(), True),
    StructField('cant_oblig_tot_sf',StringType(), True),
    StructField('cant_cast_ult_12m_sr',StringType(), True),
    StructField('ind',StringType(), True),
    StructField('rep_calif_cred',StringType(), True),
    StructField('pol_centr_ext',StringType(), True),
    StructField('convenio_lib',StringType(), True),
    StructField('ingreso_nomina',StringType(), True),
    StructField('ingreso_segurida_social',StringType(), True),
    StructField('gasto_familiar',StringType(), True)
])

In [6]:
df = spark.read.schema(schema).option("delimiter", ";").csv("input/Dataton_train_semicolon.csv")

## ANÁLISIS EXPLORATORIO

### Sampling

In [7]:
deseados = 1000

In [8]:
df_pandas = df.limit(deseados).toPandas()

In [9]:
df_pandas.head()

Unnamed: 0,periodo,id_cli,fecha_nacimiento,edad,genero,estado_civil,nivel_academico,profesion,ocupacion,tipo_vivienda,...,cuota_libranza_sf,cant_oblig_tot_sf,cant_cast_ult_12m_sr,ind,rep_calif_cred,pol_centr_ext,convenio_lib,ingreso_nomina,ingreso_segurida_social,gasto_familiar
0,201908,2089776,19840630,35.05817932922655,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,...,0,1,0,311715.5,C,0,\N,1255032,\N,232526
1,201909,2089776,19840630,35.14305270362765,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,...,0,1,0,311715.5,C,0,\N,1255032,\N,265900
2,201907,2089776,19840630,34.96235455167693,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,...,0,1,0,311306.0,C,0,\N,1255032,\N,243052
3,201903,2089776,19840630,34.63928815879535,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,...,0,1,0,311306.0,C,0,\N,1172612,\N,276014
4,201911,2089776,19840630,35.30732375085558,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,...,0,0,0,677516.0,C,0,\N,1255032,\N,418866


In [10]:
df_pandas.to_csv("output/sample.csv")

### Grupos y Frecuencias por columna

In [6]:
header = spark.read.text("input/header.txt").collect()[0][:][0].split(",")

In [9]:
registros = df.count()

In [13]:
# Getting groups and frequencies
# Initialize missing variables
missing = []
for var in header:
    try:
        df_var = df.groupby(col(var).alias("categoria")).count()
        df_var = df_var.withColumn("variable",lit(var))
        df_var = df_var.withColumn("frecuencia_relativa",col("count")/registros)
        df_var = df_var.withColumnRenamed("count","frecuencia_absoluta")
        df_var = df_var.select("variable","categoria","frecuencia_absoluta","frecuencia_relativa")
        path = 'output/' + var + '.csv'
        df_var.toPandas().to_csv(path)          
    except:
        # Getting missing variables
        missing.append(var)
        pass 

In [None]:
# Missing Variables
print(missing)

In [7]:
# Getting summary of groups and frequencies
# Initialize empty dataframe 
collector = pd.DataFrame([["start",0,[0]]], columns = ['variables', 'numero_categorias','muestra'])
# Initialize missing variables
missing = []
# Get the path
for var in header:
    try:
        path = "output/" + var + ".csv"
        # Getting data
        data = pd.read_csv(path)
        # Getting collection
        df_collection = spark.read.option("header",True).csv(path)
        df_collection = df_collection.sort("frecuencia_absoluta",ascending=False).limit(10)
        df_collection = df_collection.groupby("variable").agg(collect_list("categoria").alias("sample"))
        collection = df_collection.collect()[0][1]
        # Summarizing
        new_row = pd.DataFrame([[var,len(data.index),collection]], columns = ['variables', 'numero_categorias','muestra'])
        # Apend rows
        collector = collector.append(new_row)
    except:
        # Getting missing variables
        missing.append(var)
        pass 

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [8]:
# Missing Variables
print(missing)

['ind', 'rep_calif_cred', 'pol_centr_ext', 'convenio_lib', 'ingreso_nomina', 'ingreso_segurida_social', 'gasto_familiar']


In [9]:
# Cleaning summary dataframe
summary = (collector
           .set_index("variables")
           .sort_values(by="numero_categorias",ascending=False)
           .drop("start",axis=0)
           .reset_index())

In [10]:
# Setting max number of rows to display in a dataframe
pd.set_option('display.max_rows', None)
display(summary)

Unnamed: 0,variables,numero_categorias,muestra
0,ingreso_final,5252621,"[855000, 3930000, 5039809, 2345000, 4090000, 1..."
1,cuota_tc_bancolombia,2921647,"[134850, 79750, 1260000, 183750, 389300, 13545..."
2,ingreso_nompen,2846752,"[4468000, 1676400, 2856600, 1453054, 2884000, ..."
3,mediana_nom3,2679651,"[3680000, 3248700, 13500000, 3330000, 1453054,..."
4,cuota_de_consumo,1209174,"[1473000, 1412000, 1472000, 1372000, 1405000, ..."
5,id_cli,1126662,"[1412022, 412979, 1299855, 2955989, 5648023, 4..."
6,cuota_de_vivienda,473714,"[627000, 985000, 302000, 641000, 1505000, 5840..."
7,cuota_cred_hipot,455560,"[709973.17, 755924.88, 272264.74, 238076.12, 8..."
8,saldo_no_rot_mdo,290978,"[889000, 1034000, 25000000, 942000, 988000, 89..."
9,mediana_pen3,223588,"[794916, 835303, 8463643, 728616, 1076183, 122..."


In [None]:
#['ind', 'rep_calif_cred', 'pol_centr_ext', 'convenio_lib', 'ingreso_nomina', 'ingreso_segurida_social', 'gasto_familiar']

In [10]:
# New Header
var = 'ingreso_segurida_social'

df_var = df.groupby(col(var).alias("categoria")).count()
df_var = df_var.withColumn("variable",lit(var))
df_var = df_var.withColumn("frecuencia_relativa",col("count")/registros)
df_var = df_var.withColumnRenamed("count","frecuencia_absoluta")
df_var = df_var.select("variable","categoria","frecuencia_absoluta","frecuencia_relativa")
path = 'output/' + var + '.csv'
df_var.toPandas().to_csv(path)          