# ENTRENAMIENTO - LINEAR REGRESSION

El objetivo del presente notebook consiste en entrenar un modelo de regression multiple para aplicarlo a los datos de prueba.

## SET UP

In [1]:
!pip install findspark

import findspark
findspark.init()



## LIBRERIAS

In [2]:
# Cargar Pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("model").master("local[*]").getOrCreate()

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression 
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import math
warnings.filterwarnings('ignore')

## CARGAR DATOS

In [5]:
# Path definition
path = "output/preprocessing/preprocessing_data.parquet"

In [6]:
# Read dataframe
df = spark.read.parquet(path, header= True, inferSchema=True)

In [7]:
# Verify rows
df.count()

10660715

In [8]:
# Set input and drop variables
input_values = df.columns
drop_values = ['periodo','id_cli','fecha_nacimiento','ult_actual','gasto_familiar']

In [9]:
# Clean input variables
input_features = [x for x in input_values if x not in drop_values]

In [10]:
# Replace null
train = df.fillna(0)

## MODELADO

### Vectorizar - Train

In [11]:
# Vectorize variables to define the features column
feat_vector = VectorAssembler(inputCols=input_features, outputCol= "features")

In [12]:
# Appply vectorization to train
transTrain = feat_vector.transform(train)

In [13]:
# select variables for the train model
train_model = transTrain.select("id_cli","periodo","features","gasto_familiar")
train_model = train_model.fillna("NA")

In [14]:
# Split
splits = train_model.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [34]:
# Apply statistical learning
regression = LinearRegression(labelCol='gasto_familiar',maxIter=10,regParam=0.3, elasticNetParam=0.8)
regression = regression.fit(train_df)

In [35]:
# Predict
predictions = regression.transform(test_df)

In [36]:
# Show prediction
predictions.select("id_cli","periodo","features","gasto_familiar","prediction").show()

+------+-------+--------------------+--------------+------------------+
|id_cli|periodo|            features|gasto_familiar|        prediction|
+------+-------+--------------------+--------------+------------------+
|    17| 201903|(212,[0,1,2,7,8,1...|       33350.0| 557001.8999391494|
|    17| 201910|(212,[0,1,2,3,4,7...|      456092.0|  777464.283014857|
|    17| 201911|(212,[0,1,2,3,4,7...|      722670.0| 795243.0997271333|
|    17| 202004|(212,[0,1,2,3,4,7...|      338407.0| 864202.9904496818|
|    17| 202008|(212,[0,1,2,7,8,9...|      294113.0| 802104.4736187836|
|    17| 202009|(212,[0,1,2,7,8,9...|      172379.0| 728024.7795735411|
|   213| 201904|(212,[0,1,2,3,9,1...|      918071.0| 522370.7110925317|
|   213| 201905|(212,[0,1,2,3,9,1...|      662900.0| 522370.7110925317|
|   213| 201911|(212,[0,1,2,9,10,...|      196736.0| 593066.3217000682|
|   213| 202001|(212,[0,1,2,9,10,...|           0.0| 617089.4188316514|
|   213| 202004|(212,[0,1,2,9,10,...|      352859.0| 625418.5439

In [18]:
# Count rows
predictions.count()

3198308

## EVALUACIÓN

In [19]:
# Show mean (MAPE)
evaluation = predictions.withColumn("difference",abs(col("gasto_familiar")-col("prediction"))/col("gasto_familiar"))
evaluation[["difference"]].describe().show()

+-------+--------------------+
|summary|          difference|
+-------+--------------------+
|  count|             2996400|
|   mean|  189.57669576803443|
| stddev|   115252.4795068138|
|    min| -17.461297507113862|
|    max|1.7960568374397865E8|
+-------+--------------------+



In [20]:
# Show evaluation metrics
regression_summary = regression.summary
print("RMSE: %f" % regression_summary.rootMeanSquaredError)
print("r2: %f" % regression_summary.r2)

RMSE: 1368251.616789
r2: 0.049794


## ANÁLISIS

In [21]:
# Check mape levels
revision = (evaluation.withColumn("categoria",
                                   when(col("difference")>10,10)
                                   .when(col("difference")>9,9)
                                   .when(col("difference")>8,8)
                                   .when(col("difference")>7,7)
                                   .when(col("difference")>6,6)
                                   .when(col("difference")>5,5)
                                   .when(col("difference")>4,4)
                                   .when(col("difference")>3,3)
                                   .when(col("difference")>2,2)
                                   .when(col("difference")>1,1)
                                   .otherwise(0)))

In [22]:
# get total of prediction row
rows = revision.count()

In [23]:
# Estimate frecuency and percentage per category
participation = revision.groupby("categoria").count()
participation = participation.withColumn("%",round(col("count")/rows,2))

In [24]:
# Show results
participation.orderBy("%").show()

+---------+-------+----+
|categoria|  count|   %|
+---------+-------+----+
|        9|  17978|0.01|
|        7|  29163|0.01|
|        8|  22667|0.01|
|        6|  38496|0.01|
|        4|  77345|0.02|
|        5|  53123|0.02|
|        3| 124347|0.04|
|       10| 144534|0.05|
|        2| 211805|0.07|
|        1| 402271|0.13|
|        0|2076579|0.65|
+---------+-------+----+



## IMPLEMENTACIÓN

In [25]:
# Set testing path
path = "output/preprocessing/preprocessing_data_test.parquet"
# Read dataframe
df_2 = spark.read.parquet(path, header= True, inferSchema=True)
# Replace null
test = df_2.fillna(0)
# Verify rows
df_2.count()

281666

In [26]:
# Appply vectorization to test
transTest = feat_vector.transform(test)

In [27]:
# select variables for the test model
test_model = transTest.select("id_cli","periodo","features")
test_model = test_model.fillna("NA")

In [28]:
# Apply statistical learning
regression = LinearRegression(labelCol='gasto_familiar',maxIter=10,regParam=0.3, elasticNetParam=0.8)
regression = regression.fit(train_model)

In [33]:
test_model.show(5)

+-------+-------+--------------------+
| id_cli|periodo|            features|
+-------+-------+--------------------+
|2311966| 202001|(211,[0,1,2,7,8,9...|
|3161308| 202003|(211,[0,1,2,9,10,...|
|3175733| 202003|(211,[0,1,2,7,8,9...|
|3180301| 202003|(211,[0,1,2,9,10,...|
|3189113| 202004|(211,[0,1,2,9,10,...|
+-------+-------+--------------------+
only showing top 5 rows



In [29]:
# Predict
predictions = regression.transform(test_model)

In [45]:
# Show prediction
df_final = predictions.select(concat(str("id_cli"),lit('#'),str("periodo")).alias("id_registro"),col("prediction").alias("gasto_familiar"))
df_final.show()

+-----------+------------------+
|id_registro|    gasto_familiar|
+-----------+------------------+
|  17#201903| 557001.8999391494|
|  17#201910|  777464.283014857|
|  17#201911| 795243.0997271333|
|  17#202004| 864202.9904496818|
|  17#202008| 802104.4736187836|
|  17#202009| 728024.7795735411|
| 213#201904| 522370.7110925317|
| 213#201905| 522370.7110925317|
| 213#201911| 593066.3217000682|
| 213#202001| 617089.4188316514|
| 213#202004| 625418.5439857439|
| 213#202005|   559115.04601236|
| 213#202008| 550272.6431850082|
| 274#202009| 860235.3384508649|
| 332#201907| 812542.9226471953|
| 332#201910| 847538.4959882563|
| 332#201911| 831575.1416064955|
| 332#202010|  871165.690379369|
| 345#201904| 614038.1129008438|
| 400#201905|372798.63809345086|
+-----------+------------------+
only showing top 20 rows



In [47]:
df_final.count()

3198308

In [48]:
df_final.toPandas().to_csv("output/implementations/df_final_20200127.csv", index=False)