# PARTE 3 - Treino do modelo, alternativa B

Para este modelo, vai ser usado o dataset criado na parte 2 designado **df_corr_category_and_category_success.csv.gz** que resulta do join das tabelas trainHistory.csv.gz com offers.csv.gz para cada offer.

In [43]:
# Basic imports

from pyspark.sql import SparkSession
from dotenv import load_dotenv
load_dotenv('.env')
import os
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorAssembler, OneHotEncoder
from pyspark.ml.classification import LinearSVC
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [3]:
# Build SparkSession
spark = SparkSession.builder.appName("DataPreparation").getOrCreate()
base_path = os.getenv('BASE_PATH')

In [8]:
df_dataset = spark.read.csv(
    f"{base_path}-ml/df_corr_category_and_category_success.csv.gz",
    header=True,
    inferSchema=True
)

In [9]:
df_dataset.printSchema()

root
 |-- offer: integer (nullable = true)
 |-- id: long (nullable = true)
 |-- chain: integer (nullable = true)
 |-- market: integer (nullable = true)
 |-- repeattrips: integer (nullable = true)
 |-- repeater: integer (nullable = true)
 |-- offerdate: date (nullable = true)
 |-- category: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- company: integer (nullable = true)
 |-- offervalue: double (nullable = true)
 |-- brand: integer (nullable = true)
 |-- offer_count: integer (nullable = true)
 |-- offer_success_percentage: double (nullable = true)



Tendo em conta o schema acima do dataset, as features escolhidas foram as:
- **repeattrips** -> valor númerico que diz o número de vezes que o cliente voltou a comprar a oferta
- **repeater** -> valor binário onde 1 representa um cliente voltar a comprar a oferta e 0 representa o cliente não voltar a comprar a oferta.
- **offervalue** -> valor binário com o valor da oferta.
- **offer_count** -> número de vezes que a oferta foi feita.
- **offer_success_percentage** -> probabilidade de successo da oferta.
- **category** -> category of the made offer
- **quantity** -> quantity of the made offer

Bellow we exclude and identify the columns that were not chosen as features

In [34]:
# cols_not_feature = ['id', 'chain', 'market', 'offerdate', 'quantity', 'company', 'brand', 'offer']
cols_feature = ['repeattrips', 'repeater', 'offervalue', 'offer_count', 'offer_success_percentage', 'category', 'quantity']

# # As all the columns are numerical we won't need the StringIndexer
# index_output_cols = [x + ' Index' for x in df_dataset.columns if x not in cols_not_feature]
# one_output_cols = [x + ' OHE' for x in df_dataset.columns if x not in cols_not_feature]

# ohe_encoder = OneHotEncoder(inputCols=one_output_cols, outputCols=one_output_cols)
vec_assembler = VectorAssembler(
    inputCols=cols_feature,
    outputCol='features'
)

**Treino do modelo**

In [21]:
df_train, df_validation = df_dataset.randomSplit([0.8, 0.2], seed=42)

df_train.write.mode('overwrite').option('header', 'true').option('compression', 'gzip').csv(f"{base_path}-ml/model_B/df_train.csv.gz")

print(f'There are {df_train.count()} rows in the training set and {df_validation.count()} rows in the validation set.')

There are 127878 rows in the training set and 32179 rows in the validation set.


In [22]:
df_train.write.mode('overwrite').parquet(f"{base_path}-ml/model_B/df_train.parquet")
df_validation.write.mode('overwrite').parquet(f"{base_path}-ml/model_B/df_validation.parquet")

In [35]:
# Linear SVC algorithm
lsvc = LinearSVC(maxIter=10, regParam=0.1, labelCol='repeater')

In [37]:
pipeline = Pipeline(stages=[vec_assembler, lsvc])

pipeline.save('data-ml/model_B/pipeline_model_2')

In [38]:
# Save in the pipeline for further use, should it be required
pipeline.save('data-ml/model_B/pipeline-LinearSVM-2')

In [39]:
model = pipeline.fit(df_train)

In [40]:
# Save the model for further use, should it be required.
model.save('data-ml/model_B/model-LinearSVM-2')

**Model evaluation**

In [41]:
df_predictions = model.transform(df_validation)

df_predictions.printSchema()

root
 |-- offer: integer (nullable = true)
 |-- id: long (nullable = true)
 |-- chain: integer (nullable = true)
 |-- market: integer (nullable = true)
 |-- repeattrips: integer (nullable = true)
 |-- repeater: integer (nullable = true)
 |-- offerdate: date (nullable = true)
 |-- category: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- company: integer (nullable = true)
 |-- offervalue: double (nullable = true)
 |-- brand: integer (nullable = true)
 |-- offer_count: integer (nullable = true)
 |-- offer_success_percentage: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [44]:
df_predictions_eval = df_predictions.select('features', 'rawPrediction', 'prediction', 'repeater')

binary_evaluator = BinaryClassificationEvaluator(
    labelCol='repeater',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC'
)

area_under_roc = binary_evaluator.evaluate(df_predictions_eval)

print(f"Area Under ROC: {area_under_roc}")
df_predictions_eval.count()

Area Under ROC: 1.0


32179

In [45]:
df_confusion_matrix = df_predictions_eval.groupBy('repeater', 'prediction').count()
df_confusion_matrix.show()

+--------+----------+-----+
|repeater|prediction|count|
+--------+----------+-----+
|       0|       0.0|23454|
|       1|       1.0| 8725|
+--------+----------+-----+



In [48]:
# Compute the confusion matrix
tp = df_confusion_matrix.filter((df_confusion_matrix.repeater == 1) & (df_confusion_matrix.prediction == 1)).select('count').first()
tn = df_confusion_matrix.filter((df_confusion_matrix.repeater == 0) & (df_confusion_matrix.prediction == 0)).select('count').first()
fp = df_confusion_matrix.filter((df_confusion_matrix.repeater == 0) & (df_confusion_matrix.prediction == 1)).select('count').first()
fn = df_confusion_matrix.filter((df_confusion_matrix.repeater == 1) & (df_confusion_matrix.prediction == 0)).select('count').first()

confmat = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}

if (tp):
    confmat['TP'] = tp['count'] * 1
if (tn):
    confmat['TN'] = tn['count'] * 1
if (fp):    
    confmat['FP'] = fp['count'] * 1
if (fn):
    confmat['FN'] = fn['count'] * 1
    
print(f"Confusion Matrix: {confmat}")

Confusion Matrix: {'TP': 8725, 'TN': 23454, 'FP': 0, 'FN': 0}


In [49]:
accuracy = (confmat['TP'] + confmat['TN']) / (confmat['TP'] + confmat['TN'] + confmat['FP'] + confmat['FN'])
precision = (confmat['TP']) / (confmat['TP'] + confmat['FP']) if (confmat['TP'] + confmat['FP']) > 0 else 0
recall = confmat['TP'] / (confmat['TP'] + confmat['FN']) if (confmat['TP'] + confmat['FN']) > 0 else 0
specificity = confmat['TN'] / (confmat['TN'] + confmat['FP']) if (confmat['TN'] + confmat['FP']) > 0 else 0
fiscore = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print("Evaluation Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Specificity: {specificity}")
print(f"F1 Score: {fiscore}")


Evaluation Metrics:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
Specificity: 1.0
F1 Score: 1.0
