# TRAINING - GRADIENT BOOSTING TREES

The objective of this notebook is to train a Gradient Boosting Trees regression model and apply it to the test data.

## SET UP

In [1]:
!pip install findspark

import findspark
findspark.init()



## LIBRARIES

In [2]:
# Load PySpark
# NOTE(review): the wildcard import below also rebinds Python built-ins that
# share a name with a Spark SQL function (e.g. sum, min, max) — keep that in
# mind when using those names later in the notebook.
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

# Get or create the Spark session used by the whole notebook:
# application name "model", master "local[*]".
spark = SparkSession.builder.appName("model").master("local[*]").getOrCreate()

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor 

In [4]:
import math
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence all Python warnings from this point on (no category or module filter is given).
warnings.filterwarnings('ignore')

## LOAD DATA

In [5]:
# Path to the preprocessed training dataset
path = "output/preprocessing/preprocessing_data"

In [6]:
# Read the training dataframe. Parquet files carry their own schema, so the
# CSV-only `header` / `inferSchema` options are not passed to the reader.
train = spark.read.parquet(path)

In [7]:
# Sanity check: number of rows loaded into `train`
train.count()

9670308

In [8]:
# Cast the binary indicator columns of the training set to integer
binary_columns = [
    "ind_mora_vigente",
    "cartera_castigada",
    "tenencia_tc",
    "tiene_consumo",
    "tiene_crediagil",
    "tiene_ctas_activas",
    "tiene_ctas_embargadas",
    "tiene_cred_hipo_1",
    "tiene_cred_hipo_2",
    "pension_fopep",
]

# Each column is replaced in place (same name) by its integer-cast version,
# in the same order as listed above.
for binary_column in binary_columns:
    train = train.withColumn(binary_column, col(binary_column).cast("integer"))

In [9]:
# Path to the preprocessed test dataset
path = "output/preprocessing/preprocessing_data_test"
# Read the test dataframe. Parquet files carry their own schema, so the
# CSV-only `header` / `inferSchema` options are not passed to the reader.
test = spark.read.parquet(path)
# Sanity check: number of rows loaded into `test`
test.count()

281666

In [10]:
# Cast the binary indicator columns of the test set to integer
binary_columns = [
    "ind_mora_vigente",
    "cartera_castigada",
    "tenencia_tc",
    "tiene_consumo",
    "tiene_crediagil",
    "tiene_ctas_activas",
    "tiene_ctas_embargadas",
    "tiene_cred_hipo_1",
    "tiene_cred_hipo_2",
    "pension_fopep",
]

# Each column is replaced in place (same name) by its integer-cast version,
# in the same order as listed above.
for binary_column in binary_columns:
    test = test.withColumn(binary_column, col(binary_column).cast("integer"))

## FEATURE ENGINEERING

In [11]:
# Add two aggregate features to the training set:
#   "gastos"   = sum of the cuota_* columns listed below
#   "ingresos" = sum of the ingreso_* columns listed below
cuota_columns = [
    'cuota_cred_hipot',
    'cuota_de_vivienda',
    'cuota_de_consumo',
    'cuota_rotativos',
    'cuota_tarjeta_de_credito',
    'cuota_de_sector_solidario',
    'cuota_sector_real_comercio',
    'cuota_tc_mdo',
    'cuota_libranza_sf',
]
ingreso_columns = [
    'ingreso_nompen',
    'ingreso_final',
    'ingreso_nomina',
    'ingreso_segurida_social',
]

# Build each total by adding the columns left to right. An explicit loop is
# used because the wildcard pyspark import rebinds the built-in `sum`.
gastos_total = col(cuota_columns[0])
for column_name in cuota_columns[1:]:
    gastos_total = gastos_total + col(column_name)

ingresos_total = col(ingreso_columns[0])
for column_name in ingreso_columns[1:]:
    ingresos_total = ingresos_total + col(column_name)

train = (
    train
    .withColumn("gastos", gastos_total)
    .withColumn("ingresos", ingresos_total)
)

In [12]:
# Add two aggregate features to the test set:
#   "gastos"   = sum of the cuota_* columns listed below
#   "ingresos" = sum of the ingreso_* columns listed below
cuota_columns = [
    'cuota_cred_hipot',
    'cuota_de_vivienda',
    'cuota_de_consumo',
    'cuota_rotativos',
    'cuota_tarjeta_de_credito',
    'cuota_de_sector_solidario',
    'cuota_sector_real_comercio',
    'cuota_tc_mdo',
    'cuota_libranza_sf',
]
ingreso_columns = [
    'ingreso_nompen',
    'ingreso_final',
    'ingreso_nomina',
    'ingreso_segurida_social',
]

# Build each total by adding the columns left to right. An explicit loop is
# used because the wildcard pyspark import rebinds the built-in `sum`.
gastos_total = col(cuota_columns[0])
for column_name in cuota_columns[1:]:
    gastos_total = gastos_total + col(column_name)

ingresos_total = col(ingreso_columns[0])
for column_name in ingreso_columns[1:]:
    ingresos_total = ingresos_total + col(column_name)

test = (
    test
    .withColumn("gastos", gastos_total)
    .withColumn("ingresos", ingresos_total)
)

In [13]:
# Names of the 35 input columns that are assembled into the model's feature
# vector. The list order fixes each feature's position in that vector.
input_features = [
    'ingreso_final',
    'cuota_cred_hipot',
    'gastos',
    'ingresos',
    'Independiente',
    'EXTERIOR',
    'edad',
    'estado_civil',
    'ctas_activas',
    'tipo_vivienda',
    'ingreso_segurida_social',
    'tiene_cred_hipo_2',
    'ind_mora_vigente',
    'cuota_de_consumo',
    'cant_moras_90_ult_12_meses',
    'cant_moras_30_ult_12_meses',
    'tiene_crediagil',
    'cant_oblig_tot_sf',
    'ANDINA',
    'pension_fopep',
    'pol_centr_ext',
    'cant_cast_ult_12m_sr',
    'cuota_de_sector_solidario',
    'cuota_libranza_sf',
    'saldo_no_rot_mdo',
    'cuota_tc_mdo',
    'saldo_prom3_tdc_mdo',
    'cuota_tarjeta_de_credito',
    'mediana_pen3',
    'tenencia_tc',
    'Pensionado',
    'nivel_academico',
    'Empleado',
    'Estudiante',
    'cat_ingreso'
]

## MODELING

In [14]:
# Assembler that packs the selected input columns into a single vector
# column named "features"
feat_vector = VectorAssembler(
    inputCols=input_features,
    outputCol="features",
)

In [15]:
# Apply the vectorization to the training set
transTrain = feat_vector.transform(train)

In [16]:
# Keep only the identifiers, the feature vector and the label for training
train_model = transTrain.select(
    "id_cli",
    "periodo",
    "features",
    "gasto_familiar",
)

In [17]:
# Apply the vectorization to the test set
transTest = feat_vector.transform(test)

In [18]:
# Keep only the identifiers and the feature vector for scoring
# (no label column is selected for the test set)
test_model = transTest.select(
    "id_cli",
    "periodo",
    "features",
)

In [19]:
# Preview the first 5 rows of the scoring input
test_model.show(5)

+-------+-------+--------------------+
| id_cli|periodo|            features|
+-------+-------+--------------------+
|1165927| 201908|(35,[0,1,2,3,4,6,...|
|1172919| 201908|(35,[0,2,3,6,7,8,...|
|1538512| 201909|(35,[0,1,2,3,6,7,...|
|3371270| 202004|(35,[0,2,3,6,7,8,...|
|2784853| 202003|(35,[0,2,3,6,7,8,...|
+-------+-------+--------------------+
only showing top 5 rows



In [20]:
# Train the Gradient-Boosted Trees regressor on the assembled training data
# (label: gasto_familiar, 15 boosting iterations).
# An explicit seed is passed so that repeated runs are reproducible; without
# it PySpark falls back to a default seed that is not guaranteed to be the
# same from one Python session to the next.
gbt = GBTRegressor(
    featuresCol='features',
    labelCol='gasto_familiar',
    maxIter=15,
    seed=42,
)
gbt_model = gbt.fit(train_model)

In [21]:
# Score the test set with the fitted model; the result is read back through
# its "prediction" column further down
predictions = gbt_model.transform(test_model)

In [22]:
# Sanity check: number of scored rows
predictions.count()

281666

In [23]:
# Build the final output frame:
#   id_registro    = "<id_cli>#<periodo>"
#   gasto_familiar = model prediction
id_registro = concat(col("id_cli"), lit('#'), col("periodo")).alias("id_registro")
gasto_familiar_pred = col("prediction").alias("gasto_familiar")

df_final = predictions.select(id_registro, gasto_familiar_pred)
df_final.show()

+--------------+------------------+
|   id_registro|    gasto_familiar|
+--------------+------------------+
|1165927#201908|1736145.5984100662|
|1172919#201908|1131582.0031835462|
|1538512#201909| 545073.4598844848|
|3371270#202004| 575438.6233280987|
|2784853#202003| 668426.5306929757|
|2219310#202001|  473558.558804195|
|2220638#202001| 614897.5241146946|
|1568926#201909| 625914.8455061114|
|1573039#201909| 675365.1414532105|
|2430420#202002|1236472.2552926138|
|1455662#201909| 850456.2860925905|
|1461386#201909|1101112.2463947332|
|1463134#201909|1061031.4693321986|
|1477680#201909| 581021.0071983351|
|1361632#201909|1500121.7197952813|
|1362813#201909| 658178.7082106702|
|1379397#201909| 891884.4841366474|
| 834901#201905|  595549.942841727|
| 839363#201905| 947215.8302712908|
| 842749#201905|2142878.3229153035|
+--------------+------------------+
only showing top 20 rows



In [24]:
# Output file name carrying the current local time as YYYYmmddHHMMSS
ts = time.strftime('%Y%m%d%H%M%S')
path = f"output/implementations/model_GBT_{ts}.csv"

In [25]:
# Make sure the destination directory exists so the export does not fail
# when the notebook runs in a fresh working directory.
output_dir = os.path.dirname(path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Collect the predictions to the driver as a pandas DataFrame and write them
# to CSV without the index column.
df_final.toPandas().to_csv(path, index=False)