# ENTRENAMIENTO - RANDOM FOREST

El objetivo del presente notebook consiste en entrenar un modelo de Random Forest para aplicarlo a los datos de prueba.

## SET UP

In [25]:
!pip install findspark

import findspark
findspark.init()



## LIBRERIAS

In [26]:
# Cargar Pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("model").master("local[*]").getOrCreate()

In [27]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import math
warnings.filterwarnings('ignore')

## CARGAR DATOS

In [29]:
# Path definition
path = "output/preprocessing/preprocessing_data.parquet"

In [30]:
# Read dataframe
train = spark.read.parquet(path, header= True, inferSchema=True)

In [31]:
# Verify rows
train.count()

9670308

In [32]:
# Cast binary variables
train = (train
         .withColumn("ind_mora_vigente",col("ind_mora_vigente").cast("integer"))
         .withColumn("cartera_castigada",col("cartera_castigada").cast("integer"))
         .withColumn("tenencia_tc",col("tenencia_tc").cast("integer"))
         .withColumn("tiene_consumo",col("tiene_consumo").cast("integer"))
         .withColumn("tiene_crediagil",col("tiene_crediagil").cast("integer"))
         .withColumn("tiene_ctas_activas",col("tiene_ctas_activas").cast("integer"))
         .withColumn("tiene_ctas_embargadas",col("tiene_ctas_embargadas").cast("integer"))
         .withColumn("tiene_cred_hipo_1",col("tiene_cred_hipo_1").cast("integer"))
         .withColumn("tiene_cred_hipo_2",col("tiene_cred_hipo_2").cast("integer"))
         .withColumn("pension_fopep",col("pension_fopep").cast("integer"))
        )

In [33]:
# Set input and drop variables
input_values = train.columns
drop_values = ['periodo','id_cli','fecha_nacimiento','ult_actual','gasto_familiar','genero_dummy','codigo_ciiu','NI','rep_calif_cred']

In [34]:
# Clean input variables
input_features = [x for x in input_values if x not in drop_values]

In [35]:
# Set testing path
path = "output/preprocessing/preprocessing_data_test.parquet"
# Read dataframe
test = spark.read.parquet(path, header= True, inferSchema=True)
# Verify rows
test.count()

281666

In [36]:
test = (test
         .withColumn("ind_mora_vigente",col("ind_mora_vigente").cast("integer"))
         .withColumn("cartera_castigada",col("cartera_castigada").cast("integer"))
         .withColumn("tenencia_tc",col("tenencia_tc").cast("integer"))
         .withColumn("tiene_consumo",col("tiene_consumo").cast("integer"))
         .withColumn("tiene_crediagil",col("tiene_crediagil").cast("integer"))
         .withColumn("tiene_ctas_activas",col("tiene_ctas_activas").cast("integer"))
         .withColumn("tiene_ctas_embargadas",col("tiene_ctas_embargadas").cast("integer"))
         .withColumn("tiene_cred_hipo_1",col("tiene_cred_hipo_1").cast("integer"))
         .withColumn("tiene_cred_hipo_2",col("tiene_cred_hipo_2").cast("integer"))
         .withColumn("pension_fopep",col("pension_fopep").cast("integer"))
        )

## MODELADO

### Vectorizar - Train

In [37]:
# Vectorize variables to define the features column
feat_vector = VectorAssembler(inputCols=input_features, outputCol= "features")

In [38]:
# Appply vectorization to train
transTrain = feat_vector.transform(train)

In [39]:
# select variables for the train model
train_model = transTrain.select("id_cli","periodo","features","gasto_familiar")

In [40]:
# Appply vectorization to test
transTest = feat_vector.transform(test)

In [41]:
# select variables for the test model
test_model = transTest.select("id_cli","periodo","features")

In [42]:
test_model.show(5)

+-------+-------+--------------------+
| id_cli|periodo|            features|
+-------+-------+--------------------+
|1165927| 201908|(62,[0,1,3,4,8,12...|
|1172919| 201908|(62,[0,1,2,4,17,1...|
|1538512| 201909|(62,[0,1,2,3,4,17...|
|3371270| 202004|(62,[0,1,2,4,5,8,...|
|2784853| 202003|(62,[0,1,2,4,12,1...|
+-------+-------+--------------------+
only showing top 5 rows



In [43]:
# Homologate target variable to label
trainingData = train_model.withColumnRenamed("gasto_familiar","label")

In [44]:
# Homologate target variable to label
testData = test_model.withColumnRenamed("gasto_familiar","label")

In [45]:
# https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-regression
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(trainingData)

In [46]:
# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")

In [47]:
# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])

In [48]:
# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

In [49]:
# Make predictions.
predictions = model.transform(testData)

In [50]:
# Select example rows to display.
predictions = predictions.withColumnRenamed("label","gasto_familiar")

In [51]:
predictions.count()

281666

In [52]:
# Show prediction
df_final = predictions.select(concat(str("id_cli"),lit('#'),str("periodo")).alias("id_registro"),col("prediction").alias("gasto_familiar"))
df_final.show()

+--------------+------------------+
|   id_registro|    gasto_familiar|
+--------------+------------------+
|1165927#201908|2245627.2577155763|
|1172919#201908| 928273.0586991375|
|1538512#201909| 519093.5845291334|
|3371270#202004| 518277.5145250812|
|2784853#202003| 589125.6672007655|
|2219310#202001| 546253.2360973016|
|2220638#202001| 586685.4051289329|
|1568926#201909| 646398.5598656738|
|1573039#201909| 709633.2268067767|
|2430420#202002| 991626.9503412399|
|1455662#201909| 857854.5563419787|
|1461386#201909|1081357.2737375868|
|1463134#201909| 815969.0614879329|
|1477680#201909| 512339.2879311177|
|1361632#201909|  1465039.83426048|
|1362813#201909| 641757.9560151295|
|1379397#201909|1393013.6347733508|
| 834901#201905| 593604.9020455598|
| 839363#201905| 1507261.971837223|
| 842749#201905|1733074.7318330582|
+--------------+------------------+
only showing top 20 rows



In [53]:
df_final.toPandas().to_csv("output/implementations/model_RF_20200129.csv", index=False)