# ENTRENAMIENTO - LINEAR REGRESSION

El objetivo del presente notebook consiste en entrenar un modelo de regression multiple para aplicarlo a los datos de prueba.

## SET UP

In [13]:
!pip install findspark

import findspark
findspark.init()



## LIBRERIAS

In [14]:
# Cargar Pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("model").master("local[*]").getOrCreate()

In [15]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression 
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import math
warnings.filterwarnings('ignore')

## CARGAR DATOS

In [17]:
# Path definition
path = "output/preprocessing/preprocessing_data.parquet"

In [18]:
# Read dataframe
train = spark.read.parquet(path, header= True, inferSchema=True)

In [19]:
# Cast binary variables
train = (train
         .withColumn("ind_mora_vigente",col("ind_mora_vigente").cast("integer"))
         .withColumn("cartera_castigada",col("cartera_castigada").cast("integer"))
         .withColumn("tenencia_tc",col("tenencia_tc").cast("integer"))
         .withColumn("tiene_consumo",col("tiene_consumo").cast("integer"))
         .withColumn("tiene_crediagil",col("tiene_crediagil").cast("integer"))
         .withColumn("tiene_ctas_activas",col("tiene_ctas_activas").cast("integer"))
         .withColumn("tiene_ctas_embargadas",col("tiene_ctas_embargadas").cast("integer"))
         .withColumn("tiene_cred_hipo_1",col("tiene_cred_hipo_1").cast("integer"))
         .withColumn("tiene_cred_hipo_2",col("tiene_cred_hipo_2").cast("integer"))
         .withColumn("pension_fopep",col("pension_fopep").cast("integer"))
        )

In [20]:
# Verify rows
train.count()

9670308

In [21]:
# Set input and drop variables
input_values = train.columns
drop_values = ['periodo','id_cli','fecha_nacimiento','ult_actual','gasto_familiar','genero_dummy','codigo_ciiu','NI','rep_calif_cred']

In [22]:
# Clean input variables
input_features = [x for x in input_values if x not in drop_values]

## MODELADO

### Vectorizar - Train

In [23]:
# Vectorize variables to define the features column
feat_vector = VectorAssembler(inputCols=input_features, outputCol= "features")

In [24]:
# Appply vectorization to train
transTrain = feat_vector.transform(train)

In [25]:
# select variables for the train model
train_model = transTrain.select("id_cli","periodo","features","gasto_familiar")

In [26]:
# Split
splits = train_model.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [27]:
# Apply statistical learning
regression = LinearRegression(labelCol='gasto_familiar')
regression = regression.fit(train_df)

In [28]:
# Predict
predictions = regression.transform(test_df)

In [29]:
# Show prediction
predictions.select("id_cli","periodo","features","gasto_familiar","prediction").show()

+------+-------+--------------------+--------------+------------------+
|id_cli|periodo|            features|gasto_familiar|        prediction|
+------+-------+--------------------+--------------+------------------+
|   192| 201905|(62,[0,1,2,3,4,17...|      418000.0| 1090093.431740284|
|   192| 201908|(62,[0,1,2,3,4,17...|      333581.0| 1090093.431740284|
|   192| 201909|(62,[0,1,2,3,4,17...|      347490.0| 1090093.431740284|
|   192| 201911|(62,[0,1,2,3,4,17...|           0.0| 1090093.431740284|
|   213| 202010|(62,[0,1,2,3,4,17...|     527998.67|498967.22339105606|
|   332| 201910|(62,[0,4,8,12,13,...|     152067.95| 583714.2970309258|
|   332| 202007|(62,[0,4,8,12,13,...|     154427.03| 673544.5618057251|
|   332| 202008|(62,[0,4,5,12,13,...|     156373.35| 690064.5860490799|
|   371| 201903|(62,[0,1,4,17,18,...|      595836.0|1208236.0570220947|
|   400| 201903|(62,[0,1,2,3,4,8,...|        6700.0| 281990.0342140198|
|   400| 201904|(62,[0,1,2,3,4,8,...|       74100.0| 354339.5066

In [30]:
# Count rows
predictions.count()

2899779

## EVALUACIÓN

In [31]:
# Show mean (MAPE)
evaluation = predictions.withColumn("difference",abs(col("gasto_familiar")-col("prediction"))/col("gasto_familiar"))
evaluation[["difference"]].describe().show()

+-------+-------------------+
|summary|         difference|
+-------+-------------------+
|  count|            2720434|
|   mean| 169.15687409550472|
| stddev|  79086.14484680894|
|    min|-3.0094063191267795|
|    max|8.860992813050652E7|
+-------+-------------------+



In [32]:
# Show evaluation metrics
regression_summary = regression.summary
print("RMSE: %f" % regression_summary.rootMeanSquaredError)
print("r2: %f" % regression_summary.r2)

RMSE: 1338594.816451
r2: 0.041836


In [40]:
coefficients = regression.coefficients.values
intercept = regression.intercept

In [38]:
index_values = train.columns.remove("gasto_familiar")

In [41]:
df_equation = pd.DataFrame(coefficients,index=index_values,columns=["coefficients"])

In [42]:
df_equation.to_csv("output/equations/equation_analysis.csv")

In [43]:
intercept

-2320221901.6527786

In [44]:
# Check mape levels
revision = (evaluation.withColumn("categoria",
                                   when(col("difference")>10,10)
                                   .when(col("difference")>9,9)
                                   .when(col("difference")>8,8)
                                   .when(col("difference")>7,7)
                                   .when(col("difference")>6,6)
                                   .when(col("difference")>5,5)
                                   .when(col("difference")>4,4)
                                   .when(col("difference")>3,3)
                                   .when(col("difference")>2,2)
                                   .when(col("difference")>1,1)
                                   .otherwise(0)))

In [45]:
# get total of prediction row
rows = revision.count()

In [46]:
# Estimate frecuency and percentage per category
participation = revision.groupby("categoria").count()
participation = participation.withColumn("%",round(col("count")/rows,2))

In [47]:
# Show results
participation.orderBy("%").show()

+---------+-------+----+
|categoria|  count|   %|
+---------+-------+----+
|        9|  16698|0.01|
|        6|  35960|0.01|
|        8|  21232|0.01|
|        7|  27251|0.01|
|        5|  49191|0.02|
|        4|  72545|0.03|
|        3| 114845|0.04|
|       10| 131910|0.05|
|        2| 196002|0.07|
|        1| 369979|0.13|
|        0|1864166|0.64|
+---------+-------+----+



## IMPLEMENTACIÓN

In [83]:
# Set testing path
path = "output/preprocessing/preprocessing_data_test.parquet"
# Read dataframe
test = spark.read.parquet(path, header= True, inferSchema=True)
# Verify rows
test.count()

281666

In [85]:
# Cast binary variables
test = (test
         .withColumn("ind_mora_vigente",col("ind_mora_vigente").cast("integer"))
         .withColumn("cartera_castigada",col("cartera_castigada").cast("integer"))
         .withColumn("tenencia_tc",col("tenencia_tc").cast("integer"))
         .withColumn("tiene_consumo",col("tiene_consumo").cast("integer"))
         .withColumn("tiene_crediagil",col("tiene_crediagil").cast("integer"))
         .withColumn("tiene_ctas_activas",col("tiene_ctas_activas").cast("integer"))
         .withColumn("tiene_ctas_embargadas",col("tiene_ctas_embargadas").cast("integer"))
         .withColumn("tiene_cred_hipo_1",col("tiene_cred_hipo_1").cast("integer"))
         .withColumn("tiene_cred_hipo_2",col("tiene_cred_hipo_2").cast("integer"))
         .withColumn("pension_fopep",col("pension_fopep").cast("integer"))
        )

In [86]:
# Appply vectorization to test
transTest = feat_vector.transform(test)

In [87]:
# select variables for the test model
test_model = transTest.select("id_cli","periodo","features")

In [88]:
# Apply statistical learning
regression = LinearRegression(labelCol='gasto_familiar',maxIter=10,regParam=0.3, elasticNetParam=0.8)
regression = regression.fit(train_model)

In [89]:
test_model.show(5)

+-------+-------+--------------------+
| id_cli|periodo|            features|
+-------+-------+--------------------+
|1165927| 201908|(62,[0,1,3,4,8,12...|
|1172919| 201908|(62,[0,1,2,4,17,1...|
|1538512| 201909|(62,[0,1,2,3,4,17...|
|3371270| 202004|(62,[0,1,2,4,5,8,...|
|2784853| 202003|(62,[0,1,2,4,12,1...|
+-------+-------+--------------------+
only showing top 5 rows



In [90]:
# Predict
predictionsImpl = regression.transform(test_model)

In [91]:
predictionsImpl.count()

281666

In [92]:
# Show prediction
df_final = predictionsImpl.select(concat(str("id_cli"),lit('#'),str("periodo")).alias("id_registro"),col("prediction").alias("gasto_familiar"))
df_final.show()

+--------------+------------------+
|   id_registro|    gasto_familiar|
+--------------+------------------+
|1165927#201908| 6697111.195188613|
|1172919#201908|1071127.8537389168|
|1538512#201909|397318.42602455005|
|3371270#202004| 504734.8045365114|
|2784853#202003| 723898.7143897307|
|2219310#202001| 404036.4295673901|
|2220638#202001|   697352.35682594|
|1568926#201909| 624878.8911242125|
|1573039#201909| 754588.9431789878|
|2430420#202002| 4458586.199835714|
|1455662#201909| 825592.6695296242|
|1461386#201909|1411427.3541204634|
|1463134#201909|1034323.6044531914|
|1477680#201909| 627417.0205473587|
|1361632#201909|1579337.2494138726|
|1362813#201909| 677649.8049222067|
|1379397#201909| 1230200.768053591|
| 834901#201905| 544779.1370414208|
| 839363#201905|1078801.2176850596|
| 842749#201905| 2691585.463587974|
+--------------+------------------+
only showing top 20 rows



In [93]:
df_final.toPandas().to_csv("output/implementations/model_LR_20200128.csv", index=False)

## ANÁLISIS

In [94]:
coefficients = regression.coefficients.values
intercept = regression.intercept

In [95]:
df_equation = pd.DataFrame(coefficients,index=index_values,columns=["coefficients"])

In [96]:
df_equation.to_csv("output/equations/equation_implementation.csv")

In [97]:
intercept

220941.60284442044