# Construção de modelos de aprendizagem automática 3
Este modelo foi treinado com o resultado do join do trainHistory com o customer_features.csv, que consiste na agregação, numa tabela por cliente, de métricas extraídas a partir de todos os registos de compra do cliente encontrados na tabela transactions.
O join foi feito pelo id e, do trainHistory, usa apenas a indicação de o cliente ter ou não retornado após a oferta (coluna repeater).


In [30]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
load_dotenv('.env')
import os
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, when, lit, to_date, datediff
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, LinearSVC, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
import plotly.express as px
import pandas as pd
import time
from datetime import datetime

In [20]:
# Create (or reuse) the Spark session used by every cell below.
spark = SparkSession.builder.appName("DataPreparation").getOrCreate()

# Root folder of the datasets, taken from the environment (the .env file is
# loaded in the imports cell). Fail fast when it is missing: otherwise the
# f-string paths below would silently become "None/..." and the error would
# only surface later as a confusing "path does not exist" on read.
base_path = os.getenv('BASE_PATH')
if base_path is None:
    raise RuntimeError("BASE_PATH is not set; define it in .env or in the environment")

**Data Ingestion**

Customer_features

In [21]:
# Exploratory load: let Spark infer the column types from the CSV, then
# preview the first rows and the inferred schema.
print("Info for customer_features.csv:")
df_customer_features = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(f"{base_path}/improved/customer_features.csv")
)
df_customer_features.show(n=5)
df_customer_features.printSchema()


Info for customer_features.csv:
+---------+------------------+-----------+---------+--------------+-------------+-----------------+---------------+
|       id|total_transactions|total_spent|avg_spent|first_purchase|last_purchase|unique_categories|unique_products|
+---------+------------------+-----------+---------+--------------+-------------+-----------------+---------------+
|100007447|              1096|    6644.88|     6.06|    2012-03-06|   2013-04-21|              235|            226|
|100010021|               349|    1419.96|     4.07|    2012-03-12|   2013-05-12|              123|            109|
|100012115|               118|     553.02|     4.69|    2012-03-12|   2013-04-17|               62|             54|
|100017875|               348|     992.29|     2.85|    2012-03-04|   2013-03-16|              109|            113|
|100022923|               617|    3771.86|     6.11|    2012-03-02|   2013-05-14|              202|            170|
+---------+------------------+----------

trainHistory

In [22]:
# Explicit schema for trainHistory: every column is read as a nullable
# string except repeattrips, which is a nullable integer.
schema_history = (
    StructType()
    .add("id", StringType(), True)
    .add("chain", StringType(), True)
    .add("offer", StringType(), True)
    .add("market", StringType(), True)
    .add("repeattrips", IntegerType(), True)
    .add("repeater", StringType(), True)
    .add("offerdate", StringType(), True)
)

# The file has a header row; the schema above is applied instead of inference.
df_train_history = (
    spark.read
    .schema(schema_history)
    .option("header", True)
    .csv(f"{base_path}/trainHistory.csv.gz")
)

Join da customer_features com a trainHistory e conversão da coluna repeater de boolean para binário.

In [23]:
# Re-read customer_features with an explicit schema so that id is a string
# (the same type used for id in trainHistory) and the purchase dates arrive
# as raw text to be parsed below.
schema_features = (
    StructType()
    .add("id", StringType(), True)
    .add("total_transactions", IntegerType(), True)
    .add("total_spent", DoubleType(), True)
    .add("avg_spent", DoubleType(), True)
    .add("first_purchase", StringType(), True)
    .add("last_purchase", StringType(), True)
    .add("unique_categories", IntegerType(), True)
    .add("unique_products", IntegerType(), True)
)

df_customer_features = (
    spark.read
    .schema(schema_features)
    .option("header", True)
    .csv(f"{base_path}/improved/customer_features.csv")
)

# Recency features: number of days between each purchase date and a fixed
# reference date.
# NOTE(review): the reference date is a hard-coded literal - confirm it is
# the intended anchor for these day counts.
reference_date = to_date(lit("2025-05-29"))
df_customer_features = (
    df_customer_features
    .withColumn(
        "days_since_first_purchase",
        datediff(reference_date, to_date(col("first_purchase"))),
    )
    .withColumn(
        "days_since_last_purchase",
        datediff(reference_date, to_date(col("last_purchase"))),
    )
)

# Only the customer id and the repeater flag are taken from trainHistory.
_labels = df_train_history.select("id", "repeater")
# "t" marks a repeater; any other value is encoded as 0.
_is_repeater = when(col("repeater") == "t", 1).otherwise(0)

# Modelling table: numeric features plus the binary label in "target".
df_train_data = (
    df_customer_features
    .join(_labels, on="id", how="inner")
    .withColumn("target", _is_repeater)
    .drop("repeater", "first_purchase", "last_purchase")
)

# Same join, but the label keeps the name "repeater" (used for the
# correlation matrix).
df_train_data_with_repeater = (
    df_customer_features
    .join(_labels, on="id", how="inner")
    .withColumn("repeater", _is_repeater)
    .drop("first_purchase", "last_purchase")
)

df_train_data.printSchema()
df_train_data.show(5)

root
 |-- id: string (nullable = true)
 |-- total_transactions: integer (nullable = true)
 |-- total_spent: double (nullable = true)
 |-- avg_spent: double (nullable = true)
 |-- unique_categories: integer (nullable = true)
 |-- unique_products: integer (nullable = true)
 |-- days_since_first_purchase: integer (nullable = true)
 |-- days_since_last_purchase: integer (nullable = true)
 |-- target: integer (nullable = false)

+---------+------------------+-----------+---------+-----------------+---------------+-------------------------+------------------------+------+
|       id|total_transactions|total_spent|avg_spent|unique_categories|unique_products|days_since_first_purchase|days_since_last_purchase|target|
+---------+------------------+-----------+---------+-----------------+---------------+-------------------------+------------------------+------+
|100007447|              1096|    6644.88|     6.06|              235|            226|                     4832|                    4421|

Correlações para averiguar as features mais significativas.

In [24]:
# Columns included in the correlation matrix; the binary label "repeater"
# is part of the list so its correlation with each feature is shown too.
cols_corr = [
    "total_transactions",
    "total_spent",
    "avg_spent",
    "unique_categories",
    "unique_products",
    "days_since_first_purchase",
    "days_since_last_purchase",
    "repeater",
]

col_features = "features"

# Correlation.corr works on a single vector column, so pack the selected
# columns into one first.
assembler = VectorAssembler(inputCols=cols_corr, outputCol=col_features)
df_features = assembler.transform(df_train_data_with_repeater).select(col_features)

# The result is a one-row DataFrame whose first cell holds the matrix;
# bring it to the driver as nested Python lists for plotting.
_corr_df = Correlation.corr(df_features, col_features)
corr_matrix = _corr_df.collect()[0][0].toArray().tolist()

# Heat-map with a fixed [-1, 1] colour range and the values printed in
# each cell. Alternative colour scales: Sunsetdark, RdBu_r.
fig = px.imshow(
    corr_matrix,
    title='Correlations',
    x=cols_corr,
    y=cols_corr,
    color_continuous_scale='Sunsetdark',
    text_auto='.2f',
    zmin=-1,
    zmax=1,
)
fig.update_layout(
    width=700,
    height=600,
    xaxis_title="Funcionalidades",
    yaxis_title="Funcionalidades",
)
fig.show()

Divisão de 80% do dataset para treino e 20% para validação.

In [25]:
# Cache the joined table before splitting: both splits are reused by every
# model fit and evaluation further down, and without caching each Spark
# action would re-read the CSV files and redo the join.
# NOTE(review): randomSplit evaluates its input once per split; keeping the
# input cached is presumably also what keeps the two splits consistent
# between actions - confirm against the Spark documentation.
df_train_data = df_train_data.cache()

# 80% of the rows for training, 20% for validation, with a fixed seed.
train, validation = df_train_data.randomSplit([0.8, 0.2], seed=42)

**Treino e Validação**

Para as features usamos todas as colunas exceto o id, por não se aplicar como variável preditora, e as colunas days_since_first_purchase e days_since_last_purchase, por não apresentarem correlações negativas.
Os modelos escolhidos são:
- LogisticRegression
- RandomForest
- GradientBoosting
- LinearSVC
- DecisionTree

In [26]:
# Columns fed to the models (id and the two days_since_* columns are left out).
feature_cols = [
    "total_transactions", "total_spent", "avg_spent",
    "unique_categories", "unique_products",
]

# Pack the feature columns into the single "features" vector column that
# the estimators read.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_assembled = assembler.transform(train)
validation_assembled = assembler.transform(validation)

# Candidate estimators, all reading "features" and predicting "target".
# The dict keeps this order when the models are trained.
_estimators = (
    ("LogisticRegression", LogisticRegression),
    ("RandomForest", RandomForestClassifier),
    ("GradientBoosting", GBTClassifier),
    ("LinearSVC", LinearSVC),
    ("DecisionTree", DecisionTreeClassifier),
)
modelos = {
    name: estimator_cls(labelCol="target", featuresCol="features")
    for name, estimator_cls in _estimators
}

# Area under the ROC curve is the comparison metric.
evaluator = BinaryClassificationEvaluator(labelCol="target", metricName="areaUnderROC")

# results: model name -> AUC; detailed_results: model name -> metrics dict.
results, detailed_results = {}, {}

def calcular_metricas(predictions):
    """Compute confusion-matrix based metrics for binary predictions.

    Args:
        predictions: Spark DataFrame with a ``target`` column (true label)
            and a ``prediction`` column; the cells counted are the
            combinations of the values 0 and 1 in those two columns.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``,
        ``specificity``, ``f1_score`` and ``confusion_matrix`` (a dict with
        the ``TP``, ``TN``, ``FP`` and ``FN`` counts). A ratio whose
        denominator is zero is reported as 0.
    """
    # Single Spark job: the grouped frame has one row per distinct
    # (target, prediction) pair, so collect it once and look the four
    # cells up locally instead of running a filter + first() job per cell.
    counts = {
        (row['target'], row['prediction']): row['count']
        for row in predictions.groupBy('target', 'prediction').count().collect()
    }

    # A combination that never occurs has no row, so it counts as 0.
    # The keys compare equal whether prediction is stored as 1 or 1.0.
    tp_val = counts.get((1, 1), 0)
    tn_val = counts.get((0, 0), 0)
    fp_val = counts.get((0, 1), 0)
    fn_val = counts.get((1, 0), 0)

    def _ratio(numerator, denominator):
        # Guarded division: an empty denominator yields 0 instead of raising.
        return numerator / denominator if denominator > 0 else 0

    accuracy = _ratio(tp_val + tn_val, tp_val + tn_val + fp_val + fn_val)
    precision = _ratio(tp_val, tp_val + fp_val)
    recall = _ratio(tp_val, tp_val + fn_val)
    specificity = _ratio(tn_val, tn_val + fp_val)
    f1_score = _ratio(2 * (precision * recall), precision + recall)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'specificity': specificity,
        'f1_score': f1_score,
        'confusion_matrix': {'TP': tp_val, 'TN': tn_val, 'FP': fp_val, 'FN': fn_val}
    }

Na avaliação do modelo produzido com cada algoritmo, as métricas obtidas são:
- Precisão
- Exatidão
- Especificidade
- Matriz de confusão
- Pontuação F1
- Sensibilidade

In [38]:
# Hyper-parameter values per model, keyed by the names used in `modelos`.
# Every list holds a single value, so each grid contains exactly one
# configuration.
param_grids = {
    "RandomForest": {"numTrees": [50], "maxDepth": [10], "seed": [42]},
    "GradientBoosting": {"maxIter": [50], "maxDepth": [5], "seed": [42]},
    "LinearSVC": {"maxIter": [100], "regParam": [0.1]},
    "DecisionTree": {"maxDepth": [10], "seed": [42]},
    "LogisticRegression": {"maxIter": [100], "regParam": [0.1]},
}

# Train, save and evaluate every candidate model, recording its AUC in
# `results` and the full metric set in `detailed_results`.
for nome, modelo in modelos.items():
    print(f"\n{'='*50}")
    print(f"Treinando {nome}...")
    print(f"{'='*50}")

    start_time = time.time()
    training_start = datetime.now()
    print(f"Início do treino: {training_start.strftime('%Y-%m-%d %H:%M:%S')}")

    # Build the grid from the table above; a model without an entry gets
    # the builder's empty grid, i.e. the estimator's own settings.
    grid_builder = ParamGridBuilder()
    for param_name, values in param_grids.get(nome, {}).items():
        grid_builder.addGrid(getattr(modelo, param_name), values)
    paramGrid = grid_builder.build()

    crossval = CrossValidator(
        estimator=modelo,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=3
    )

    training_time_start = time.time()
    cv_model = crossval.fit(train_assembled)
    training_time_end = time.time()
    training_duration = training_time_end - training_time_start
    # Overwrite a model saved by a previous run: a plain save() raises when
    # the target path already exists, which breaks re-running this cell.
    cv_model.write().overwrite().save(f"modelos_melhoria/{nome}")

    eval_time_start = time.time()
    predictions = cv_model.transform(validation_assembled)
    auc = evaluator.evaluate(predictions)
    results[nome] = auc
    eval_time_end = time.time()
    evaluation_duration = eval_time_end - eval_time_start

    metrics = calcular_metricas(predictions)
    detailed_results[nome] = {**metrics, 'auc_roc': auc}

    # Total time also covers saving the model and computing the metrics.
    end_time = time.time()
    total_duration = end_time - start_time

    print(f"{nome} AUC-ROC: {auc:.4f}")
    print(f"{nome} Accuracy: {metrics['accuracy']:.4f}")
    print(f"{nome} Precision: {metrics['precision']:.4f}")
    print(f"{nome} Recall: {metrics['recall']:.4f}")
    print(f"{nome} Specificity: {metrics['specificity']:.4f}")
    print(f"{nome} F1 Score: {metrics['f1_score']:.4f}")
    print(f"Duração do treinamento: {training_duration:.2f} segundos")
    print(f"Duração da avaliação: {evaluation_duration:.2f} segundos")
    print(f"Duração total: {total_duration:.2f} segundos")

# The best model is the one with the highest validation AUC-ROC.
melhor_modelo_nome = max(results, key=results.get)
print(f"Melhor modelo: {melhor_modelo_nome} com AUC-ROC {results[melhor_modelo_nome]}")


Treinando LogisticRegression...
Início do treino: 2025-06-01 19:45:24
LogisticRegression AUC-ROC: 0.5184
LogisticRegression Accuracy: 0.7278
LogisticRegression Precision: 0.8750
LogisticRegression Recall: 0.0008
LogisticRegression Specificity: 1.0000
LogisticRegression F1 Score: 0.0016
Duração do treinamento: 20.19 segundos
Duração da avaliação: 1.83 segundos
Duração total: 35.94 segundos

Treinando RandomForest...
Início do treino: 2025-06-01 19:46:00
RandomForest AUC-ROC: 0.5660
RandomForest Accuracy: 0.7271
RandomForest Precision: 0.4286
RandomForest Recall: 0.0059
RandomForest Specificity: 0.9971
RandomForest F1 Score: 0.0116
Duração do treinamento: 114.29 segundos
Duração da avaliação: 2.53 segundos
Duração total: 137.39 segundos

Treinando GradientBoosting...
Início do treino: 2025-06-01 19:48:18
GradientBoosting AUC-ROC: 0.5649
GradientBoosting Accuracy: 0.7272
GradientBoosting Precision: 0.4434
GradientBoosting Recall: 0.0054
GradientBoosting Specificity: 0.9975
GradientBoosti

**Comparação de Resultados**

Gráfico das performances de cada modelo

In [28]:
# Plot the AUC-ROC values measured by the training loop (the `results`
# dict) rather than a hand-typed copy. The previous literal list had
# drifted from the computed numbers - it listed 0.88 for gradient boosting
# while the training output printed 0.5649 - and used model names that do
# not match the keys of `modelos`.
# Requires the training cell to have been run so that `results` is filled.
results_data = [(nome, auc, None) for nome, auc in results.items()]

data = {
    "Modelos": [name for name, _, _ in results_data],
    "AUC-ROC": [auc for _, auc, _ in results_data]
}
df = pd.DataFrame(data)

# One bar per model; the label keys must match the DataFrame column names
# to take effect.
fig = px.bar(
    df,
    x="Modelos",
    y="AUC-ROC",
    title="Comparação da performance dos modelos (AUC-ROC)",
    labels={"AUC-ROC": "AUC-ROC Pontuação", "Modelos": "Modelo"},
    color="Modelos",
    color_discrete_sequence=px.colors.qualitative.Plotly
)

# Fixed 0-1 range so the bars are comparable on the AUC scale.
fig.update_layout(
    xaxis_title="Modelo",
    yaxis_title="AUC-ROC Pontuação",
    yaxis_range=[0, 1],
    showlegend=False,
    title_x=0.5
)

fig.show()

Gráfico com o tamanho de cada modelo

In [40]:
# Size of each model in KB, entered by hand.
# NOTE(review): presumably measured from the saved model folders - confirm
# the values whenever the models are retrained.
results_data_resources = [
    ("RandomForest", 800, None),
    ("LogisticRegression", 20, None),
    ("DecisionTree", 44, None),
    ("GradientBoostedTrees", 168, None),
    ("LinearSVC", 12, None)
]

data = {
    "Modelo": [name for name, _, _ in results_data_resources],
    "Memoria": [size_kb for _, size_kb, _ in results_data_resources]
}
df = pd.DataFrame(data)

# The label for "Memoria" is also shown in the hover text, so it names the
# quantity and unit actually plotted, matching the y-axis title below.
fig = px.bar(
    df,
    x="Modelo",
    y="Memoria",
    title="Comparação do tamanho dos modelos",
    labels={"Memoria": "Tamanho (KB)", "Modelo": "Modelo"},
    color="Modelo",
    color_discrete_sequence=px.colors.qualitative.Plotly
)

fig.update_layout(
    xaxis_title="Modelo",
    yaxis_title="Tamanho (KB)",
    showlegend=False,
    title_x=0.5
)

fig.show()

Gráfico com o tempo de treino de cada modelo

In [None]:
# Average training time of each model in seconds, entered by hand.
# NOTE(review): presumably averaged over several runs of the training cell
# - confirm the values whenever the models are retrained.
results_data_resources = [
    ("RandomForest", 128.79, None),
    ("LogisticRegression", 16.68, None),
    ("DecisionTree", 19.75, None),
    ("GradientBoostedTrees", 140.55, None),
    ("LinearSVC", 50.29, None)
]

data = {
    "Modelo": [name for name, _, _ in results_data_resources],
    "Tempo médio de treino": [seconds for _, seconds, _ in results_data_resources]
}
df = pd.DataFrame(data)

fig = px.bar(
    df,
    x="Modelo",
    y="Tempo médio de treino",
    title="Comparação do tempo médio de treino dos modelos",
    labels={"Tempo médio de treino": "Tempo médio de treino (s)", "Modelo": "Modelo"},
    color="Modelo",
    color_discrete_sequence=px.colors.qualitative.Plotly
)

# The y-axis title uses the same wording as the column label above (the
# previous text had a stray "de").
fig.update_layout(
    xaxis_title="Modelo",
    yaxis_title="Tempo médio de treino (s)",
    showlegend=False,
    title_x=0.5
)

fig.show()