# ENTRENAMIENTO - GRADIENT BOOSTING TREES

El objetivo de este notebook es entrenar un modelo de Gradient Boosting Trees con los datos de entrenamiento y aplicarlo a los datos de prueba para generar las predicciones.

## SET UP

In [17]:
!pip install findspark

import findspark
findspark.init()



## LIBRERÍAS

In [18]:
# Cargar Pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

# Start (or reuse) a Spark session called "model" that runs locally on all cores
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("model")
    .getOrCreate()
)

In [19]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor 

In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import math
# Silence every Python warning for the rest of the session
warnings.filterwarnings(action='ignore')

## CARGAR DATOS

In [21]:
# Location of the preprocessed training set (Parquet), relative to the
# notebook's working directory
path = (
    "output/preprocessing/"
    "preprocessing_data.parquet"
)

In [22]:
# Read the preprocessed training set. Parquet files carry their own schema and
# column names, so the CSV-only reader options `header` and `inferSchema` are
# not passed.
df = spark.read.parquet(path)

In [23]:
# Verify rows: count() is a Spark action that returns the number of rows in the
# training DataFrame
df.count()

10660715

In [24]:
# Every column of the training DataFrame is a candidate model input
input_values = df.columns
# Columns that must stay out of the feature vector; the list includes the
# label (gasto_familiar) and the record identifiers (id_cli, periodo)
drop_values = [
    'periodo',
    'id_cli',
    'fecha_nacimiento',
    'ult_actual',
    'gasto_familiar',
    'genero_dummy',
]

In [25]:
# Keep, in their original order, the columns that are not in the drop list.
# An explicit loop is used instead of builtins such as filter(), which the
# wildcard import from pyspark.sql.functions may shadow.
input_features = []
for column_name in input_values:
    if column_name not in drop_values:
        input_features.append(column_name)

In [26]:
# Training set with nulls replaced by 0
train = df.fillna(value=0)

In [27]:
# Set testing path.
# NOTE: this rebinds `path`, the same variable used by the training read; if
# the training read cell is re-run after this one it will load the test set.
path = "output/preprocessing/preprocessing_data_test.parquet"
# Read dataframe. Parquet files carry their own schema and column names, so the
# CSV-only reader options `header` and `inferSchema` are not passed.
df_2 = spark.read.parquet(path)
# Replace null with 0, the same treatment applied to the training set
test = df_2.fillna(0)
# Verify rows
df_2.count()

281666

## MODELADO

### Vectorizar - Train y Test

In [28]:
# Assembler that packs all selected input columns into a single vector column
# named "features"
feat_vector = VectorAssembler(
    outputCol="features",
    inputCols=input_features,
)

In [29]:
# Apply vectorization to train: the assembler's output column "features" is
# added to the training DataFrame
transTrain = feat_vector.transform(train)

In [30]:
# Keep only the identifiers, the feature vector and the label for training,
# then fill the remaining nulls with the literal "NA"
train_columns = ["id_cli", "periodo", "features", "gasto_familiar"]
train_model = transTrain.select(*train_columns).fillna("NA")

In [31]:
# Apply vectorization to test, reusing the assembler that was applied to the
# training set
transTest = feat_vector.transform(test)

In [32]:
# Keep only the identifiers and the feature vector for scoring (the test set
# has no label), then fill the remaining nulls with the literal "NA"
test_columns = ["id_cli", "periodo", "features"]
test_model = transTest.select(*test_columns).fillna("NA")

In [33]:
# Preview the first 5 rows of the frame that will be scored
test_model.show(n=5)

+-------+-------+--------------------+
| id_cli|periodo|            features|
+-------+-------+--------------------+
|2311966| 202001|(210,[0,1,2,7,8,9...|
|3161308| 202003|(210,[0,1,2,9,10,...|
|3175733| 202003|(210,[0,1,2,7,8,9...|
|3180301| 202003|(210,[0,1,2,9,10,...|
|3189113| 202004|(210,[0,1,2,9,10,...|
+-------+-------+--------------------+
only showing top 5 rows



In [35]:
# Fit a Gradient Boosted Trees regressor on the assembled feature vector to
# predict gasto_familiar, using 10 boosting iterations.
# An explicit seed is passed so the fit is repeatable across kernel restarts;
# when it is omitted PySpark falls back to a default seed that can differ
# between sessions.
GBT_SEED = 42
gbt = GBTRegressor(
    featuresCol='features',
    labelCol='gasto_familiar',
    maxIter=10,
    seed=GBT_SEED,
)
gbt_model = gbt.fit(train_model)

In [36]:
# Predict: score the test set with the fitted model; the result carries a
# "prediction" column that the following cells read
predictions = gbt_model.transform(test_model)

In [37]:
# Sanity check: the number of scored rows should equal the test-set row count
predictions.count()

281666

In [38]:
# Build the output frame:
#   id_registro    -> "<id_cli>#<periodo>"
#   gasto_familiar -> model prediction
# Columns are referenced explicitly with col(); wrapping the column names in
# str() is a no-op on a string literal and does not cast anything.
df_final = predictions.select(
    concat(col("id_cli"), lit('#'), col("periodo")).alias("id_registro"),
    col("prediction").alias("gasto_familiar"),
)
# Show prediction
df_final.show()

+--------------+------------------+
|   id_registro|    gasto_familiar|
+--------------+------------------+
|2311966#202001| 595871.1489608595|
|3161308#202003| 558908.0614379189|
|3175733#202003|1087309.9321845034|
|3180301#202003| 475973.2770270521|
|3189113#202004|1253690.1152068963|
|3200186#202004|1099354.6912965178|
|8135607#202009| 1289468.422120331|
|4891981#202007| 1073536.923961848|
|4264978#202007| 554154.5817756066|
|3848030#202005|  470194.388444068|
|3849659#202005|409231.67288724787|
|8150594#202009| 498916.0703908676|
|8163742#202009|  888659.557870885|
|8172268#202009| 411837.8782343126|
|8174312#202009|2118319.0940481625|
|4319165#202007| 581229.7657017352|
|4344726#202007| 506063.4926351437|
|4363030#202007|395401.16342389246|
|8175960#202009| 2697446.742138687|
|8177985#202009|1174634.4508090934|
+--------------+------------------+
only showing top 20 rows



In [39]:
# Export the predictions to a single CSV file. The Spark frame is collected
# into pandas on the driver before it is written.
from pathlib import Path

output_csv = Path("output/implementations/df_final_20200128.csv")
# to_csv does not create missing folders, so make sure the destination exists
output_csv.parent.mkdir(parents=True, exist_ok=True)
df_final.toPandas().to_csv(output_csv, index=False)