# TRAIN - LINEAR REGRESSION

The objective of this notebook is to train a multiple regression model to apply it to the test data.

## SET UP

In [1]:
!pip install findspark

import findspark
findspark.init()



## LIBRARIES

In [2]:
# Cargar Pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("model").master("local[*]").getOrCreate()

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression 
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import math
import time

warnings.filterwarnings('ignore')

## LOAD DATA

In [1]:
# Path definition
path = "output/preprocessing/preprocessing_data"

In [6]:
# Read dataframe
train = spark.read.parquet(path, header= True, inferSchema=True)

In [7]:
# Cast binary variables
train = (train
         .withColumn("ind_mora_vigente",col("ind_mora_vigente").cast("integer"))
         .withColumn("cartera_castigada",col("cartera_castigada").cast("integer"))
         .withColumn("tenencia_tc",col("tenencia_tc").cast("integer"))
         .withColumn("tiene_consumo",col("tiene_consumo").cast("integer"))
         .withColumn("tiene_crediagil",col("tiene_crediagil").cast("integer"))
         .withColumn("tiene_ctas_activas",col("tiene_ctas_activas").cast("integer"))
         .withColumn("tiene_ctas_embargadas",col("tiene_ctas_embargadas").cast("integer"))
         .withColumn("tiene_cred_hipo_1",col("tiene_cred_hipo_1").cast("integer"))
         .withColumn("tiene_cred_hipo_2",col("tiene_cred_hipo_2").cast("integer"))
         .withColumn("pension_fopep",col("pension_fopep").cast("integer"))
        )

In [8]:
# Verify rows
train.count()

9670308

## FEATURE ENGINEER

In [9]:
train = (train
        .withColumn("gastos"
                    , col('cuota_cred_hipot') 
                    + col('cuota_de_vivienda') 
                    + col('cuota_de_consumo')
                    + col('cuota_rotativos')
                    + col('cuota_tarjeta_de_credito')
                    + col('cuota_de_sector_solidario')
                    + col('cuota_sector_real_comercio')
                    + col('cuota_tc_mdo')
                    + col('cuota_libranza_sf')
                   )
        .withColumn("ingresos" 
                    ,col('ingreso_nompen')
                    + col('ingreso_final')
                    + col('ingreso_nomina')
                    + col('ingreso_segurida_social')
                   )
       )

In [10]:
# Clean input variables
input_features = [
    'ingreso_final',
    'cuota_cred_hipot',
    'gastos',
    'ingresos',
    'Independiente',
    'EXTERIOR',
    'edad',
    'estado_civil',
    'ctas_activas',
    'tipo_vivienda',
    'ingreso_segurida_social',
    'tiene_cred_hipo_2',
    'ind_mora_vigente',
    'cuota_de_consumo',
    'cant_moras_90_ult_12_meses',
    'cant_moras_30_ult_12_meses',
    'tiene_crediagil',
    'cant_oblig_tot_sf',
    'ANDINA',
    'pension_fopep',
    'pol_centr_ext',
    'cant_cast_ult_12m_sr',
    'cuota_de_sector_solidario',
    'cuota_libranza_sf',
    'saldo_no_rot_mdo',
    'cuota_tc_mdo',
    'saldo_prom3_tdc_mdo',
    'cuota_tarjeta_de_credito',
    'mediana_pen3',
    'tenencia_tc',
    'Pensionado',
    'nivel_academico',
    'Empleado',
    'Estudiante',
    'cat_ingreso'
]

## MODELING

In [11]:
# Vectorize variables to define the features column
feat_vector = VectorAssembler(inputCols=input_features, outputCol= "features")

In [12]:
# Appply vectorization to train
transTrain = feat_vector.transform(train)

In [13]:
# select variables for the train model
train_model = transTrain.select("id_cli","periodo","features","gasto_familiar")

In [14]:
# Split
splits = train_model.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [15]:
# Apply statistical learning
regression = LinearRegression(labelCol='gasto_familiar')
regression = regression.fit(train_df)

In [16]:
# Predict
predictions = regression.transform(test_df)

In [17]:
# Show prediction
predictions.select("id_cli","periodo","features","gasto_familiar","prediction").show()

+------+-------+--------------------+--------------+------------------+
|id_cli|periodo|            features|gasto_familiar|        prediction|
+------+-------+--------------------+--------------+------------------+
|   192| 201908|(35,[0,3,4,6,7,8,...|      333581.0| 1056716.367480738|
|   192| 201910|(35,[0,3,4,6,7,8,...|      803590.0| 1056716.367480738|
|   192| 201911|(35,[0,3,4,6,7,8,...|           0.0| 1056716.367480738|
|   213| 201911|(35,[0,2,3,6,7,8,...|      196736.0| 556577.9833860581|
|   274| 202011|(35,[0,3,4,6,7,8,...|     1201628.0|  934212.336194071|
|   332| 201907|(35,[0,1,2,3,6,8,...|     152067.95| 703019.8025187006|
|   332| 201909|(35,[0,1,2,3,6,8,...|     152067.95| 703146.1537436687|
|   400| 201905|(35,[0,2,3,6,7,8,...|      135138.0|419268.98690919403|
|   400| 201910|(35,[0,2,3,6,7,9,...|      273541.0|438116.49097787787|
|   400| 202003|(35,[0,3,6,7,8,9,...|     457000.98|  522401.459744308|
|   400| 202007|(35,[0,2,3,6,7,8,...|      673909.0| 544999.7306

In [18]:
# Count rows
predictions.count()

2899767

## EVALUATION

In [19]:
# Show mean (MAPE)
evaluation = predictions.withColumn("difference",abs(col("gasto_familiar")-col("prediction"))/col("gasto_familiar"))
evaluation[["difference"]].describe().show()

+-------+-------------------+
|summary|         difference|
+-------+-------------------+
|  count|            2720706|
|   mean| 132.68873241530306|
| stddev|   53124.6520251922|
|    min| -8.411494796379241|
|    max|4.667591396242523E7|
+-------+-------------------+



In [20]:
# Show evaluation metrics
regression_summary = regression.summary
print("RMSE: %f" % regression_summary.rootMeanSquaredError)
print("r2: %f" % regression_summary.r2)

RMSE: 1310282.750686
r2: 0.041740


In [21]:
coefficients = regression.coefficients.values
intercept = regression.intercept

In [22]:
index_values = train.columns.remove("gasto_familiar")

In [23]:
df_equation = pd.DataFrame(coefficients,index=index_values,columns=["coefficients"])

In [24]:
#df_equation.to_csv("output/equations/equation_analysis.csv")

In [25]:
intercept

327957.080005972

In [26]:
# Check mape levels
revision = (evaluation.withColumn("categoria",
                                   when(col("difference")>10,10)
                                   .when(col("difference")>9,9)
                                   .when(col("difference")>8,8)
                                   .when(col("difference")>7,7)
                                   .when(col("difference")>6,6)
                                   .when(col("difference")>5,5)
                                   .when(col("difference")>4,4)
                                   .when(col("difference")>3,3)
                                   .when(col("difference")>2,2)
                                   .when(col("difference")>1,1)
                                   .otherwise(0)))

In [27]:
# get total of prediction row
rows = revision.count()

In [28]:
# Estimate frecuency and percentage per category
participation = revision.groupby("categoria").count()
participation = participation.withColumn("%",round(col("count")/rows,2))

In [29]:
# Show results
participation.orderBy("%").show()

+---------+-------+----+
|categoria|  count|   %|
+---------+-------+----+
|        6|  35758|0.01|
|        7|  26982|0.01|
|        8|  21309|0.01|
|        9|  16705|0.01|
|        4|  72058|0.02|
|        5|  48991|0.02|
|        3| 116226|0.04|
|       10| 132117|0.05|
|        2| 198261|0.07|
|        1| 372289|0.13|
|        0|1859071|0.64|
+---------+-------+----+



## IMPLEMENTATION

In [30]:
# Set testing path
path = "output/preprocessing/preprocessing_data_test"
# Read dataframe
test = spark.read.parquet(path, header= True, inferSchema=True)
# Verify rows
test.count()

281666

In [31]:
# Cast binary variables
test = (test
         .withColumn("ind_mora_vigente",col("ind_mora_vigente").cast("integer"))
         .withColumn("cartera_castigada",col("cartera_castigada").cast("integer"))
         .withColumn("tenencia_tc",col("tenencia_tc").cast("integer"))
         .withColumn("tiene_consumo",col("tiene_consumo").cast("integer"))
         .withColumn("tiene_crediagil",col("tiene_crediagil").cast("integer"))
         .withColumn("tiene_ctas_activas",col("tiene_ctas_activas").cast("integer"))
         .withColumn("tiene_ctas_embargadas",col("tiene_ctas_embargadas").cast("integer"))
         .withColumn("tiene_cred_hipo_1",col("tiene_cred_hipo_1").cast("integer"))
         .withColumn("tiene_cred_hipo_2",col("tiene_cred_hipo_2").cast("integer"))
         .withColumn("pension_fopep",col("pension_fopep").cast("integer"))
        )

In [32]:
test = (test
        .withColumn("gastos"
                    , col('cuota_cred_hipot') 
                    + col('cuota_de_vivienda') 
                    + col('cuota_de_consumo')
                    + col('cuota_rotativos')
                    + col('cuota_tarjeta_de_credito')
                    + col('cuota_de_sector_solidario')
                    + col('cuota_sector_real_comercio')
                    + col('cuota_tc_mdo')
                    + col('cuota_libranza_sf')
                   )
        .withColumn("ingresos" 
                    ,col('ingreso_nompen')
                    + col('ingreso_final')
                    + col('ingreso_nomina')
                    + col('ingreso_segurida_social')
                   )
       )

In [33]:
# Appply vectorization to test
transTest = feat_vector.transform(test)

In [34]:
# select variables for the test model
test_model = transTest.select("id_cli","periodo","features")

In [35]:
# Apply statistical learning
regression = LinearRegression(labelCol='gasto_familiar',maxIter=10,regParam=0.3, elasticNetParam=0.8)
regression = regression.fit(train_model)

In [36]:
test_model.show(5)

+-------+-------+--------------------+
| id_cli|periodo|            features|
+-------+-------+--------------------+
|1165927| 201908|(35,[0,1,2,3,4,6,...|
|1172919| 201908|(35,[0,2,3,6,7,8,...|
|1538512| 201909|(35,[0,1,2,3,6,7,...|
|3371270| 202004|(35,[0,2,3,6,7,8,...|
|2784853| 202003|(35,[0,2,3,6,7,8,...|
+-------+-------+--------------------+
only showing top 5 rows



In [37]:
# Predict
predictionsImpl = regression.transform(test_model)

In [38]:
predictionsImpl.count()

281666

In [39]:
# Show prediction
df_final = predictionsImpl.select(concat(str("id_cli"),lit('#'),str("periodo")).alias("id_registro"),col("prediction").alias("gasto_familiar"))
df_final.show()

+--------------+------------------+
|   id_registro|    gasto_familiar|
+--------------+------------------+
|1165927#201908| 6655819.418661765|
|1172919#201908| 948302.5570941041|
|1538512#201909| 585794.6239035076|
|3371270#202004|451142.66094330663|
|2784853#202003| 703600.4682528283|
|2219310#202001|424675.23457814334|
|2220638#202001|  780116.984847947|
|1568926#201909| 576971.8169634055|
|1573039#201909| 716146.3453313763|
|2430420#202002| 2332585.678762756|
|1455662#201909| 827507.7547779795|
|1461386#201909| 1415758.370124515|
|1463134#201909|1155552.4174266548|
|1477680#201909| 606432.0771466385|
|1361632#201909| 1602208.591673243|
|1362813#201909| 641402.3415134074|
|1379397#201909|1343564.2055354752|
| 834901#201905| 546603.3825196422|
| 839363#201905|1133867.9198002308|
| 842749#201905|2490104.7897326313|
+--------------+------------------+
only showing top 20 rows



In [None]:
ts = time.strftime('%Y%m%d%H%M%S')
path = "output/implementations/model_LR_" + str(ts) + ".csv"

In [40]:
df_final.toPandas().to_csv(path, index=False)

## ANALYSIS

In [41]:
coefficients = regression.coefficients.values
intercept = regression.intercept

In [42]:
df_equation = pd.DataFrame(coefficients,index=index_values,columns=["coefficients"])

In [43]:
#df_equation.to_csv("output/equations/equation_implementation.csv")

In [44]:
intercept

339138.22780574876