# ENTENDIMIENTO DE LOS DATOS

El presente notebook busca analizar la incompletitud de los datos con el objetivo de seleccionar variables tentativas a ser eliminadas o que requieren un mayor tratamiento en caso de que desde la definición de negocio sean relevantes.

## SET UP

In [11]:
!pip install findspark

import findspark
findspark.init()



## LIBRERIAS

In [12]:
# Cargar Pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("Test_spark").master("local[*]").getOrCreate()

In [13]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *

In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## ANÁLISIS EXPLORATORIO

In [15]:
header = spark.read.text("input/header.txt").collect()[0][:][0].split(",")

In [16]:
# Getting summary of groups and frequencies
# Initialize empty dataframe 
collector = pd.DataFrame([["start","start",0,0.0]], columns = ["variable","categoria","frecuencia_absoluta","frecuencia_relativa"])
# Initialize missing variables
missing = []
# Get the path
for var in header:
    try:
        #
        path = "output/" + var + ".csv"
        df = spark.read.option("header",True).csv(path)
        df = df.where(col("categoria") == "\\N")
        #
        if df.count() != 0:
            # 
            new_row = df.select("variable","categoria","frecuencia_absoluta","frecuencia_relativa").toPandas()
            # Apend rows
            collector = collector.append(new_row)
    except:
        # Getting missing variables
        missing.append(var)
        pass 

In [17]:
# Missing Variables
print(missing)

['ind', 'gasto_familiar']


In [18]:
# Cleaning incomplete dataframe
incomplete = (collector
              .set_index("variable")
              .drop(["start"],axis=0)
              .sort_values(by="frecuencia_relativa",ascending=False)
              .reset_index())

In [19]:
# Setting max number of rows to display in a dataframe
pd.set_option('display.max_rows', None)
display(incomplete)

Unnamed: 0,variable,categoria,frecuencia_absoluta,frecuencia_relativa
0,nro_tot_cuentas,\N,228,1.086296333635527e-05
1,ctas_embargadas,\N,228,1.086296333635527e-05
2,ctas_activas,\N,228,1.086296333635527e-05
3,tiene_ctas_embargadas,\N,20929152,0.99716057384652
4,rechazo_credito,\N,20874444,0.9945540343807072
5,pension_fopep,\N,20546842,0.97894557598195
6,tiene_cred_hipo_2,\N,19355492,0.922184210320692
7,tiene_cred_hipo_1,\N,19355492,0.922184210320692
8,cuota_cred_hipot,\N,19355492,0.922184210320692
9,tiene_consumo,\N,18315025,0.8726116012255709


## Seleccionar variables a eliminar:

In [35]:
drop_variables =incomplete.astype({"variable":'category',"categoria":'category', "frecuencia_absoluta":'int64', "frecuencia_relativa":'float64'}) 
drop_variables = drop_variables[drop_variables["frecuencia_relativa"] >= 0.7]
lista = drop_variables["variable"].tolist()
print(lista)

['tiene_ctas_embargadas', 'rechazo_credito', 'pension_fopep', 'tiene_cred_hipo_2', 'tiene_cred_hipo_1', 'cuota_cred_hipot', 'tiene_consumo', 'tiene_crediagil', 'convenio_lib', 'ingreso_nomina', 'ingreso_segurida_social']


In [38]:
pd.set_option('display.max_rows', None)
display(drop_variables)

Unnamed: 0,variable,categoria,frecuencia_absoluta,frecuencia_relativa
3,tiene_ctas_embargadas,\N,20929152,0.997161
4,rechazo_credito,\N,20874444,0.994554
5,pension_fopep,\N,20546842,0.978946
6,tiene_cred_hipo_2,\N,19355492,0.922184
7,tiene_cred_hipo_1,\N,19355492,0.922184
8,cuota_cred_hipot,\N,19355492,0.922184
9,tiene_consumo,\N,18315025,0.872612
10,tiene_crediagil,\N,18116033,0.863131
11,convenio_lib,\N,17942961,0.854885
12,ingreso_nomina,\N,15472674,0.737189


In [36]:
len(lista)

11