# ENTRENAMIENTO - LINEAR REGRESSION

El objetivo del presente notebook consiste en entrenar un modelo de regression multiple para aplicarlo a los datos de prueba.

## SET UP

In [1]:
!pip install findspark

import findspark
findspark.init()



## LIBRERIAS

In [2]:
# Cargar Pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("model").master("local[*]").getOrCreate()

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression 
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import math
warnings.filterwarnings('ignore')

## CARGAR DATOS

In [5]:
# Path definition
path = "output/preprocessing/preprocessing_data.parquet"

In [6]:
# Read dataframe
df = spark.read.parquet(path, header= True, inferSchema=True)

In [7]:
# Verify rows
df.count()

10660715

In [8]:
# Set input and drop variables
input_values = df.columns
drop_values = ['periodo','id_cli','fecha_nacimiento','ult_actual','gasto_familiar','genero_dummy']

In [9]:
# Clean input variables
input_features = [x for x in input_values if x not in drop_values]

In [10]:
# Replace null
train = df.fillna(0)

## MODELADO

### Vectorizar - Train

In [11]:
# Vectorize variables to define the features column
feat_vector = VectorAssembler(inputCols=input_features, outputCol= "features")

In [12]:
# Appply vectorization to train
transTrain = feat_vector.transform(train)

In [13]:
# select variables for the train model
train_model = transTrain.select("id_cli","periodo","features","gasto_familiar")
train_model = train_model.fillna("NA")

In [14]:
# Split
splits = train_model.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [15]:
# Apply statistical learning
regression = LinearRegression(labelCol='gasto_familiar',maxIter=10,regParam=0.3, elasticNetParam=0.8)
regression = regression.fit(train_df)

In [16]:
# Predict
predictions = regression.transform(test_df)

In [17]:
# Show prediction
predictions.select("id_cli","periodo","features","gasto_familiar","prediction").show()

+------+-------+--------------------+--------------+------------------+
|id_cli|periodo|            features|gasto_familiar|        prediction|
+------+-------+--------------------+--------------+------------------+
|    17| 201904|(210,[0,1,2,3,7,8...|           0.0|  635749.283659599|
|    17| 201905|(210,[0,1,2,3,7,8...|       90550.0| 636413.2333022021|
|    17| 201907|(210,[0,1,2,3,4,7...|      491932.0| 729680.8458136437|
|    17| 201911|(210,[0,1,2,3,4,7...|      722670.0| 767489.2260929323|
|    17| 202005|(210,[0,1,2,3,4,7...|      291559.0| 679996.2674725873|
|    17| 202011|(210,[0,1,2,7,8,9...|     1003560.0| 920798.7259064047|
|   213| 201902|(210,[0,1,2,3,9,1...|       86000.0|384249.67284133384|
|   213| 202003|(210,[0,1,2,9,10,...|      328400.0| 554520.0731553443|
|   213| 202009|(210,[0,1,2,9,10,...|     907114.57| 437735.3577401772|
|   213| 202011|(210,[0,1,2,9,10,...|     436720.97|  439075.273971147|
|   274| 202009|(210,[0,1,2,9,10,...|      505399.0| 816131.6112

In [18]:
# Count rows
predictions.count()

3196920

## EVALUACIÓN

In [19]:
# Show mean (MAPE)
evaluation = predictions.withColumn("difference",abs(col("gasto_familiar")-col("prediction"))/col("gasto_familiar"))
evaluation[["difference"]].describe().show()

+-------+-------------------+
|summary|         difference|
+-------+-------------------+
|  count|            2995817|
|   mean| 164.73770576616883|
| stddev| 57830.769061963554|
|    min| -4998.143697679372|
|    max|4.233149601566107E7|
+-------+-------------------+



In [20]:
# Show evaluation metrics
regression_summary = regression.summary
print("RMSE: %f" % regression_summary.rootMeanSquaredError)
print("r2: %f" % regression_summary.r2)

RMSE: 1371274.918418
r2: 0.048868


## ANÁLISIS

In [21]:
#https://stackoverflow.com/questions/42935914/how-to-map-features-from-the-output-of-a-vectorassembler-back-to-the-column-name
train_features = [x["name"] for x in sorted(train_model.schema["features"].metadata["ml_attr"]["attrs"]["binary"]+
   train_model.schema["features"].metadata["ml_attr"]["attrs"]["numeric"], 
   key=lambda x: x["idx"])]

In [22]:
#coefficients = regression.coefficients
#intercept = regression.intercept

In [23]:
##df_features = pd.DataFrame(coefficients,index=train_features)

ValueError: DataFrame constructor not properly called!

In [24]:
# Setting max number of rows to display in a dataframe
#pd.set_option('display.max_rows', None)
#display(df_features)

NameError: name 'df_featuresy' is not defined

In [25]:
# Check mape levels
revision = (evaluation.withColumn("categoria",
                                   when(col("difference")>10,10)
                                   .when(col("difference")>9,9)
                                   .when(col("difference")>8,8)
                                   .when(col("difference")>7,7)
                                   .when(col("difference")>6,6)
                                   .when(col("difference")>5,5)
                                   .when(col("difference")>4,4)
                                   .when(col("difference")>3,3)
                                   .when(col("difference")>2,2)
                                   .when(col("difference")>1,1)
                                   .otherwise(0)))

In [26]:
# get total of prediction row
rows = revision.count()

In [27]:
# Estimate frecuency and percentage per category
participation = revision.groupby("categoria").count()
participation = participation.withColumn("%",round(col("count")/rows,2))

In [28]:
# Show results
participation.orderBy("%").show()

+---------+-------+----+
|categoria|  count|   %|
+---------+-------+----+
|        8|  22815|0.01|
|        6|  38444|0.01|
|        9|  17923|0.01|
|        7|  29282|0.01|
|        5|  52525|0.02|
|        4|  77447|0.02|
|        3| 122866|0.04|
|       10| 144911|0.05|
|        2| 212511|0.07|
|        1| 402469|0.13|
|        0|2075727|0.65|
+---------+-------+----+



## IMPLEMENTACIÓN

In [29]:
# Set testing path
path = "output/preprocessing/preprocessing_data_test.parquet"
# Read dataframe
df_2 = spark.read.parquet(path, header= True, inferSchema=True)
# Replace null
test = df_2.fillna(0)
# Verify rows
df_2.count()

281666

In [30]:
# Appply vectorization to test
transTest = feat_vector.transform(test)

In [31]:
# select variables for the test model
test_model = transTest.select("id_cli","periodo","features")
test_model = test_model.fillna("NA")

In [32]:
# Apply statistical learning
regression = LinearRegression(labelCol='gasto_familiar',maxIter=10,regParam=0.3, elasticNetParam=0.8)
regression = regression.fit(train_model)

In [33]:
test_model.show(5)

+-------+-------+--------------------+
| id_cli|periodo|            features|
+-------+-------+--------------------+
|2311966| 202001|(210,[0,1,2,7,8,9...|
|3161308| 202003|(210,[0,1,2,9,10,...|
|3175733| 202003|(210,[0,1,2,7,8,9...|
|3180301| 202003|(210,[0,1,2,9,10,...|
|3189113| 202004|(210,[0,1,2,9,10,...|
+-------+-------+--------------------+
only showing top 5 rows



In [35]:
# Predict
predictionsImpl = regression.transform(test_model)

In [36]:
predictionsImpl.count()

281666

In [37]:
# Show prediction
df_final = predictionsImpl.select(concat(str("id_cli"),lit('#'),str("periodo")).alias("id_registro"),col("prediction").alias("gasto_familiar"))
df_final.show()

+--------------+------------------+
|   id_registro|    gasto_familiar|
+--------------+------------------+
|2311966#202001| 713560.2180773198|
|3161308#202003| 572967.9176994722|
|3175733#202003|1994151.2095735557|
|3180301#202003|510161.84091302764|
|3189113#202004| 1317703.659861727|
|3200186#202004|1695499.3669784688|
|8135607#202009|1840753.1755526222|
|4891981#202007|1428376.1432914748|
|4264978#202007| 590675.5301604049|
|3848030#202005| 563875.8496283952|
|3849659#202005|  348792.518939722|
|8150594#202009| 594547.7431794183|
|8163742#202009| 913649.0463479689|
|8172268#202009|  538016.566532518|
|8174312#202009| 2609517.889639274|
|4319165#202007| 621638.1877557415|
|4344726#202007| 414518.7441844414|
|4363030#202007| 419716.6502884845|
|8175960#202009| 2221309.507268043|
|8177985#202009|2535889.3298435165|
+--------------+------------------+
only showing top 20 rows



In [38]:
df_final.toPandas().to_csv("output/implementations/df_final_20200127.csv", index=False)