# Construção de um modelo 1

Para este modelo, é usado o dataset criado na parte 2, designado **df_offers_and_trainHistory.csv**, que resulta do join das tabelas trainHistory.csv.gz e offers.csv.gz para cada offer.

In [3]:
# Basic imports


# Basic imports

import json
from pathlib import Path
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from dotenv import load_dotenv
load_dotenv('.env')
import os
import plotly.graph_objects as go

from dotenv import load_dotenv
import os
import csv
import os
import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, when, lit, to_date, datediff
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, LinearSVC, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
import plotly.express as px
import plotly.express as px
import pandas as pd


In [4]:
# Build (or reuse) the SparkSession shared by every cell of this notebook.
spark = SparkSession.builder.appName("DataPreparation").getOrCreate()

# Root directory of the project data, read from the environment
# (the import cell loads the `.env` file before this point).
base_path = os.getenv('BASE_PATH')
if not base_path:
    # Fail fast: with BASE_PATH unset, the f-string paths built from it would
    # silently become "None/improved/..." and fail later with a confusing error.
    raise EnvironmentError(
        "BASE_PATH is not set; define it in the environment or in the .env file."
    )

In [10]:
# Load the joined offers / train-history CSV and turn the textual target into 0/1.
dataset_path = f"{base_path}/improved/df_offers_and_trainHistory.csv"

df_dataset = (
    spark.read
    .csv(dataset_path, header=True, inferSchema=True)
    # "t" marks a repeat buyer; every other value (including null) becomes 0.
    .withColumn("repeater", F.when(F.col("repeater") == "t", 1).otherwise(0))
)

In [None]:
# Inspect the inferred column types, then preview the first five rows.
df_dataset.printSchema()
df_dataset.show(n=5)

root
 |-- offer: integer (nullable = true)
 |-- id: long (nullable = true)
 |-- chain: integer (nullable = true)
 |-- market: integer (nullable = true)
 |-- repeattrips: integer (nullable = true)
 |-- repeater: integer (nullable = false)
 |-- offerdate: date (nullable = true)
 |-- category: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- company: integer (nullable = true)
 |-- offervalue: double (nullable = true)
 |-- brand: integer (nullable = true)

+-------+--------+-----+------+-----------+--------+----------+--------+--------+---------+----------+------+
|  offer|      id|chain|market|repeattrips|repeater| offerdate|category|quantity|  company|offervalue| brand|
+-------+--------+-----+------+-----------+--------+----------+--------+--------+---------+----------+------+
|1208251|   86246|  205|    34|          5|       1|2013-04-24|    2202|       1|104460040|       2.0|  3718|
|1197502|   86252|  205|    34|         16|       1|2013-03-27|    3203|       1

Tendo em conta o schema do dataset apresentado acima, as features escolhidas foram:
- **offervalue** -> valor numérico (double) da oferta.
- **category** -> categoria da oferta feita.
- **quantity** -> quantidade da oferta feita.
- **brand** -> marca da oferta feita.
- **company** -> empresa de onde a oferta provém.

In [11]:
# 80/20 train/validation split; the fixed seed keeps the partition repeatable.
split_weights = [0.8, 0.2]
df_train, df_validation = df_dataset.randomSplit(split_weights, seed=42)

# Each count() triggers a Spark job, so compute them once and reuse the values.
n_train = df_train.count()
n_validation = df_validation.count()
print(f'There are {n_train} rows in the training set and {n_validation} rows in the validation set.')

There are 127878 rows in the training set and 32179 rows in the validation set.


# Preparação de Dados e Configuração Inicial

In [None]:
# Columns fed to every classifier.
cols_feature = ['offervalue', 'category', 'quantity', 'brand', 'company']

# Pack the feature columns into the single vector column ('features')
# that the estimators below read through featuresCol.
vec_assembler = VectorAssembler(inputCols=cols_feature, outputCol='features')
train_assembled, validation_assembled = (
    vec_assembler.transform(split) for split in (df_train, df_validation)
)

# Candidate classifiers: all share the same label/feature columns and otherwise
# start from their library defaults.
shared_cols = dict(labelCol="repeater", featuresCol="features")
modelos = {
    "LogisticRegression": LogisticRegression(**shared_cols),
    "RandomForest": RandomForestClassifier(**shared_cols),
    "GradientBoosting": GBTClassifier(**shared_cols),
    "LinearSVC": LinearSVC(**shared_cols),
    "DecisionTree": DecisionTreeClassifier(**shared_cols),
}

# Area under the ROC curve is the metric used to score and compare the models.
evaluator = BinaryClassificationEvaluator(labelCol="repeater", metricName="areaUnderROC")

# Filled in by the training cell: AUC per model, and the full metric set per model.
results, detailed_results = {}, {}

def calculate_metrics(predictions):
    """Compute confusion-matrix based classification metrics from predictions.

    The predictions are grouped by ('repeater', 'prediction') and the grouped
    counts are collected in a single pass, so only one job runs over the
    predictions instead of one per confusion-matrix cell. The grouped result
    has one row per distinct (label, prediction) pair.

    Args:
        predictions: DataFrame-like object exposing
            ``groupBy(...).count().collect()`` with a 0/1 label column
            'repeater' and a 0/1 column 'prediction'.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall', 'specificity',
        'f1_score' and 'confusion_matrix' (a dict with 'TP', 'TN', 'FP', 'FN').
        A ratio whose denominator is zero is reported as 0.0.
    """
    # Tally keyed by (label, prediction); combinations that never occur stay at 0.
    tally = {(1, 1): 0, (0, 0): 0, (0, 1): 0, (1, 0): 0}
    for row in predictions.groupBy('repeater', 'prediction').count().collect():
        label, predicted = row['repeater'], row['prediction']
        if label is None or predicted is None:
            # Null labels/predictions match none of the four cells.
            continue
        # Values other than exactly 0/1 (e.g. 0.5) must not be truncated into a
        # cell: they simply fall outside the tally, as in an equality filter.
        cell = (label, predicted)
        if cell in tally:
            # Index with ['count']: attribute access would hit the tuple method.
            tally[cell] += row['count']

    tp_val, tn_val = tally[(1, 1)], tally[(0, 0)]
    fp_val, fn_val = tally[(0, 1)], tally[(1, 0)]

    def _ratio(numerator, denominator):
        # Guard against empty denominators (e.g. no positive predictions at all).
        return numerator / denominator if denominator > 0 else 0.0

    accuracy = _ratio(tp_val + tn_val, tp_val + tn_val + fp_val + fn_val)
    precision = _ratio(tp_val, tp_val + fp_val)
    recall = _ratio(tp_val, tp_val + fn_val)
    specificity = _ratio(tn_val, tn_val + fp_val)
    f1_score = _ratio(2 * (precision * recall), precision + recall)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'specificity': specificity,
        'f1_score': f1_score,
        'confusion_matrix': {'TP': tp_val, 'TN': tn_val, 'FP': fp_val, 'FN': fn_val}
    }

## Treino e Avaliação de Modelos de Classificação com Validação Cruzada


In [13]:
# Hyper-parameter values per model, keyed by the same names used in `modelos`.
# Every list holds a single value, so each grid contains exactly one candidate;
# cross-validation is used here for scoring, not for searching.
param_grid_specs = {
    "LogisticRegression": {"maxIter": [100], "regParam": [0.1]},
    "RandomForest": {"numTrees": [50], "maxDepth": [10], "seed": [42]},
    "GradientBoosting": {"maxIter": [50], "maxDepth": [5], "seed": [42]},
    "LinearSVC": {"maxIter": [100], "regParam": [0.1]},
    "DecisionTree": {"maxDepth": [10], "seed": [42]},
}

for nome, modelo in modelos.items():
    print(f"\n{'='*50}")
    print(f"Training {nome}...")
    print(f"{'='*50}")

    # Build the grid from the spec table; a model without an entry gets the
    # empty grid (a single param map with the estimator's current settings).
    grid_builder = ParamGridBuilder()
    for param_name, values in param_grid_specs.get(nome, {}).items():
        grid_builder.addGrid(getattr(modelo, param_name), values)
    paramGrid = grid_builder.build()

    crossval = CrossValidator(
        estimator=modelo,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=3
    )

    cv_model = crossval.fit(train_assembled)
    # overwrite(): a plain save() raises when the target path already exists,
    # which would make this cell fail the second time the notebook is run.
    cv_model.write().overwrite().save(f"modelos/{nome}")

    # Score the fitted model on the held-out validation split.
    predictions = cv_model.transform(validation_assembled)
    auc = evaluator.evaluate(predictions)
    results[nome] = auc

    metrics = calculate_metrics(predictions)
    detailed_results[nome] = {**metrics, 'auc_roc': auc}

    print(f"{nome} AUC-ROC: {auc:.4f}")
    print(f"{nome} Accuracy: {metrics['accuracy']:.4f}")
    print(f"{nome} Precision: {metrics['precision']:.4f}")
    print(f"{nome} Recall: {metrics['recall']:.4f}")
    print(f"{nome} Specificity: {metrics['specificity']:.4f}")
    print(f"{nome} F1 Score: {metrics['f1_score']:.4f}")

# The best model is the one with the highest validation AUC-ROC.
melhor_modelo_nome = max(results, key=results.get)
print(f"Melhor modelo: {melhor_modelo_nome} com AUC-ROC {results[melhor_modelo_nome]}")


Training LogisticRegression...
LogisticRegression AUC-ROC: 0.5730
LogisticRegression Accuracy: 0.7289
LogisticRegression Precision: 0.0000
LogisticRegression Recall: 0.0000
LogisticRegression Specificity: 1.0000
LogisticRegression F1 Score: 0.0000

Training RandomForest...
RandomForest AUC-ROC: 0.6472
RandomForest Accuracy: 0.7289
RandomForest Precision: 0.0000
RandomForest Recall: 0.0000
RandomForest Specificity: 1.0000
RandomForest F1 Score: 0.0000

Training GradientBoosting...
GradientBoosting AUC-ROC: 0.6760
GradientBoosting Accuracy: 0.7299
GradientBoosting Precision: 0.5123
GradientBoosting Recall: 0.0763
GradientBoosting Specificity: 0.9730
GradientBoosting F1 Score: 0.1329

Training LinearSVC...
LinearSVC AUC-ROC: 0.5190
LinearSVC Accuracy: 0.7289
LinearSVC Precision: 0.0000
LinearSVC Recall: 0.0000
LinearSVC Specificity: 1.0000
LinearSVC F1 Score: 0.0000

Training DecisionTree...
DecisionTree AUC-ROC: 0.3829
DecisionTree Accuracy: 0.7299
DecisionTree Precision: 0.5123
Decisio

# Representação gráfica dos resultados obtidos

In [15]:

# Plot the validation AUC-ROC of every model straight from `results` (filled by
# the training cell) instead of hand-copied numbers, so the chart always matches
# the latest run and uses the same model names as the rest of the notebook.
# Requires the training cell to have been executed in this kernel.
data = {
    "Model": list(results.keys()),
    "AUC-ROC": list(results.values())
}
df = pd.DataFrame(data)

fig = px.bar(
    df,
    x="Model",
    y="AUC-ROC",
    title="Comparação dos modelos (AUC-ROC)",
    labels={"AUC-ROC": "AUC-ROC Score", "Model": "Model Name"},
    color="Model",
    color_discrete_sequence=px.colors.qualitative.Plotly
)

fig.update_layout(
    xaxis_title="Modelo",
    yaxis_title="Pontuação AUC-ROC",
    yaxis_range=[0, 1],  # AUC-ROC is bounded to [0, 1]
    showlegend=False,    # bars are already labelled on the x axis
    title_x=0.5
)

fig.show()

In [17]:
# Grouped bar chart of every metric per model, built from `detailed_results`
# (filled by the training cell) rather than hand-copied numbers, so it cannot
# drift from the actual run. plotly.express (px) and pandas (pd) come from the
# import cell at the top of the notebook.

# Key in `detailed_results[model]` -> label shown in the chart legend.
metric_labels = {
    'auc_roc': 'AUC-ROC',
    'accuracy': 'Accuracy',
    'precision': 'Precision',
    'recall': 'Recall',
    'specificity': 'Specificity',
    'f1_score': 'F1 Score',
}

# Long format: one row per (model, metric) pair, as expected by px.bar.
data = [
    {'Modelo': nome, 'Métrica': label, 'Valor': model_metrics[key]}
    for nome, model_metrics in detailed_results.items()
    for key, label in metric_labels.items()
]
df = pd.DataFrame(data, columns=['Modelo', 'Métrica', 'Valor'])

fig = px.bar(df, x='Modelo', y='Valor', color='Métrica', barmode='group',
             title='Desempenho dos Modelos por Métrica',
             labels={'Valor': 'Valor da Métrica', 'Modelo': 'Modelos'},
             color_discrete_sequence=px.colors.qualitative.Plotly)
fig.update_layout(yaxis_range=[0, 1])  # every metric lies between 0 and 1
fig.show()