# ENTRENAMIENTO - GRADIENT BOOSTING TREES

El objetivo del presente notebook consiste en entrenar un modelo de Gradient Boosting Trees para aplicarlo a los datos de prueba.

## SET UP

In [1]:
!pip install findspark

import findspark
findspark.init()



## LIBRERÍAS

In [2]:
# Cargar Pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

# Start (or reuse) a local Spark session named "model"; "local[*]" runs Spark
# inside this machine instead of on a cluster.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("model")
    .getOrCreate()
)

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor 

In [4]:
import math
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress all Python warnings raised in this kernel from here on, to keep cell output compact.
warnings.filterwarnings('ignore')

## CARGAR DATOS

In [5]:
# Relative path of the preprocessed training data set (Parquet), read in the next cell.
path = "output/preprocessing/preprocessing_data.parquet"

In [6]:
# Load the training data. Parquet files embed their own schema, so the CSV-only
# `header` / `inferSchema` options are not needed (and the keyword arguments are
# rejected by Spark 2.x, whose `parquet()` accepts paths only).
train = spark.read.parquet(path)

In [7]:
# Sanity check: number of rows loaded into the training DataFrame.
train.count()

9670308

In [8]:
# Cast the binary indicator columns to integer, one column at a time.
# Each withColumn call replaces the existing column of the same name in place.
binary_columns = [
    "ind_mora_vigente",
    "cartera_castigada",
    "tenencia_tc",
    "tiene_consumo",
    "tiene_crediagil",
    "tiene_ctas_activas",
    "tiene_ctas_embargadas",
    "tiene_cred_hipo_1",
    "tiene_cred_hipo_2",
    "pension_fopep",
]
for binary_column in binary_columns:
    train = train.withColumn(binary_column, col(binary_column).cast("integer"))

In [9]:
# Candidate inputs: every column currently present in the training DataFrame.
input_values = train.columns

# Columns that must not enter the feature vector. 'periodo' and 'id_cli' are
# used later to build the record id and 'gasto_familiar' is the label;
# the remaining names are excluded as well (reason not stated in this notebook).
drop_values = [
    'periodo',
    'id_cli',
    'fecha_nacimiento',
    'ult_actual',
    'gasto_familiar',
    'genero_dummy',
    'codigo_ciiu',
    'NI',
    'rep_calif_cred',
]

In [10]:
# Model inputs: all training columns except the excluded ones, in their original order.
input_features = [
    column_name
    for column_name in input_values
    if column_name not in drop_values
]

In [11]:
# Relative path of the preprocessed test data set (Parquet).
path = "output/preprocessing/preprocessing_data_test.parquet"
# Load the test data. Parquet files embed their own schema, so the CSV-only
# `header` / `inferSchema` options are not needed (and the keyword arguments are
# rejected by Spark 2.x, whose `parquet()` accepts paths only).
test = spark.read.parquet(path)
# Sanity check: number of rows loaded into the test DataFrame.
test.count()

281666

In [12]:
# Cast the binary indicator columns of the test set to integer, one column at a time.
# Each withColumn call replaces the existing column of the same name in place.
test_binary_columns = [
    "ind_mora_vigente",
    "cartera_castigada",
    "tenencia_tc",
    "tiene_consumo",
    "tiene_crediagil",
    "tiene_ctas_activas",
    "tiene_ctas_embargadas",
    "tiene_cred_hipo_1",
    "tiene_cred_hipo_2",
    "pension_fopep",
]
for test_binary_column in test_binary_columns:
    test = test.withColumn(test_binary_column, col(test_binary_column).cast("integer"))

## MODELADO

### Vectorizar - Train y Test

In [13]:
# Assembler that packs every model input column into a single vector column named "features".
feat_vector = VectorAssembler(
    outputCol="features",
    inputCols=input_features,
)

In [14]:
# Apply the assembler to the training data, adding the "features" vector column.
transTrain = feat_vector.transform(train)

In [15]:
# Keep only what the training step needs: row identifiers, feature vector and label.
train_model = transTrain.select(
    "id_cli",
    "periodo",
    "features",
    "gasto_familiar",
)

In [16]:
# Apply the same assembler to the test data, adding the "features" vector column.
transTest = feat_vector.transform(test)

In [17]:
# Keep only what the prediction step needs: row identifiers and feature vector
# (no label column is selected for the test set).
test_model = transTest.select(
    "id_cli",
    "periodo",
    "features",
)

In [18]:
# Preview the first 5 rows of the vectorized test set.
test_model.show(5)

+-------+-------+--------------------+
| id_cli|periodo|            features|
+-------+-------+--------------------+
|1165927| 201908|(62,[0,1,3,4,8,12...|
|1172919| 201908|(62,[0,1,2,4,17,1...|
|1538512| 201909|(62,[0,1,2,3,4,17...|
|3371270| 202004|(62,[0,1,2,4,5,8,...|
|2784853| 202003|(62,[0,1,2,4,12,1...|
+-------+-------+--------------------+
only showing top 5 rows



In [19]:
# Configure a gradient-boosted trees regressor (10 boosting iterations) that
# learns 'gasto_familiar' from the assembled 'features' vector.
gbt = GBTRegressor(
    maxIter=10,
    labelCol='gasto_familiar',
    featuresCol='features',
)

# Fit the regressor on the training set.
gbt_model = gbt.fit(train_model)

In [20]:
# Score the test set with the fitted model; the result is read back below through its "prediction" column.
predictions = gbt_model.transform(test_model)

In [21]:
# Sanity check: the number of scored rows should match the test set row count.
predictions.count()

281666

In [22]:
# Build the output frame: "<id_cli>#<periodo>" as the record id and the model
# prediction exposed under the name "gasto_familiar".
# The columns are referenced with col() and cast to string explicitly; the
# previous str("id_cli") was only a no-op wrapper around the column-name literal.
df_final = predictions.select(
    concat(
        col("id_cli").cast("string"),
        lit("#"),
        col("periodo").cast("string"),
    ).alias("id_registro"),
    col("prediction").alias("gasto_familiar"),
)
df_final.show()

+--------------+------------------+
|   id_registro|    gasto_familiar|
+--------------+------------------+
|1165927#201908|1908153.4082411348|
|1172919#201908|1043598.1025032308|
|1538512#201909| 533523.6525717461|
|3371270#202004| 559804.2486788722|
|2784853#202003| 545032.3047431569|
|2219310#202001|484943.65334678243|
|2220638#202001| 637330.7321198647|
|1568926#201909| 645213.3154352331|
|1573039#201909| 715729.7898393531|
|2430420#202002| 1261099.398877888|
|1455662#201909| 965390.3112039503|
|1461386#201909|1215270.7181463738|
|1463134#201909|  897654.904836935|
|1477680#201909| 566263.5597818812|
|1361632#201909|1385525.0527353412|
|1362813#201909| 644618.1801736299|
|1379397#201909| 986222.7562428153|
| 834901#201905| 600845.9716702942|
| 839363#201905| 971400.3325544841|
| 842749#201905|1934200.0683489041|
+--------------+------------------+
only showing top 20 rows



In [23]:
# Export the predictions to CSV through pandas (collects the result on the driver).
# The target folder is created first so the export cannot fail on a missing
# directory after the whole pipeline has already run.
output_csv = "output/implementations/model_GBT_20200128.csv"
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df_final.toPandas().to_csv(output_csv, index=False)