# Modelo 1
Este modelo foi treinado com o resultado do join do trainHistory com o customer_features.csv que consiste na aggregação numa tabela por cliente de métricas extraidas a partir de todos os registos de compra do cliente encontrados na tabela transactions.
Este join foi feito usando o id e com apenas dados de ofertas feitas com o cliente a retornar.


In [24]:
# Basic imports

import json
from pathlib import Path
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from dotenv import load_dotenv
load_dotenv('.env')
import os
import plotly.graph_objects as go

from dotenv import load_dotenv
import os
import csv
import os
import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, when, lit, to_date, datediff
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
import plotly.express as px

In [13]:
# Build SparkSession
spark = SparkSession.builder.appName("DataPreparation").getOrCreate()
base_path = os.getenv('BASE_PATH')

In [14]:
print("Info for customer_features.csv:")
df_customer_features = spark.read.csv(f"{base_path}/customer_features.csv",
                                      header=True, inferSchema=True)
df_customer_features.show(5)
df_customer_features.printSchema()


Info for customer_features.csv:
+---------+------------------+-----------+---------+--------------+-------------+-----------------+---------------+
|       id|total_transactions|total_spent|avg_spent|first_purchase|last_purchase|unique_categories|unique_products|
+---------+------------------+-----------+---------+--------------+-------------+-----------------+---------------+
|100007447|              1096|    6644.88|     6.06|    2012-03-06|   2013-04-21|              235|            226|
|100010021|               349|    1419.96|     4.07|    2012-03-12|   2013-05-12|              123|            109|
|100012115|               118|     553.02|     4.69|    2012-03-12|   2013-04-17|               62|             54|
|100017875|               348|     992.29|     2.85|    2012-03-04|   2013-03-16|              109|            113|
|100022923|               617|    3771.86|     6.11|    2012-03-02|   2013-05-14|              202|            170|
+---------+------------------+----------

In [16]:
# Set schema for trainHistory.csv
schema_history = StructType([
    StructField("id", StringType(), True),
    StructField("chain", StringType(), True),
    StructField("offer", StringType(), True),
    StructField("market", StringType(), True),
    StructField("repeattrips", IntegerType(), True),
    StructField("repeater", StringType(), True),
    StructField("offerdate", StringType(), True)
])

df_train_history = spark.read.csv(f"{base_path}/trainHistory.csv.gz",
                                   header=True, schema=schema_history)

In [32]:
# Conversão do schema

schema_features = StructType([
    StructField("id", StringType(), True),
    StructField("total_transactions", IntegerType(), True),
    StructField("total_spent", DoubleType(), True),
    StructField("avg_spent", DoubleType(), True),
    StructField("first_purchase", StringType(), True),
    StructField("last_purchase", StringType(), True),
    StructField("unique_categories", IntegerType(), True),
    StructField("unique_products", IntegerType(), True)
])

df_customer_features = spark.read.csv(f"{base_path}/customer_features.csv",
                                      header=True, schema=schema_features)

reference_date = to_date(lit("2025-05-29"))
df_customer_features = df_customer_features.withColumn("days_since_first_purchase",
                                                       datediff(reference_date, to_date(col("first_purchase")))) \
                                            .withColumn("days_since_last_purchase",
                                                       datediff(reference_date, to_date(col("last_purchase"))))
# Criação de data set para treino
df_train_data = df_customer_features.join(
    df_train_history.select("id", "repeater"),
    on="id",
    how='inner'
).withColumn("target", when(col("repeater") == "t", 1).otherwise(0)) \
    .drop("repeater", "first_purchase", "last_purchase")

df_train_data_with_repeater = df_customer_features.join(
    df_train_history.select("id", "repeater"),
    on="id",
    how='inner'
).withColumn("repeater", when(col("repeater") == "t", 1).otherwise(0)) \
    .drop("first_purchase", "last_purchase")
    
df_train_data.printSchema()
df_train_data.show(5)

root
 |-- id: string (nullable = true)
 |-- total_transactions: integer (nullable = true)
 |-- total_spent: double (nullable = true)
 |-- avg_spent: double (nullable = true)
 |-- unique_categories: integer (nullable = true)
 |-- unique_products: integer (nullable = true)
 |-- days_since_first_purchase: integer (nullable = true)
 |-- days_since_last_purchase: integer (nullable = true)
 |-- target: integer (nullable = false)

+---------+------------------+-----------+---------+-----------------+---------------+-------------------------+------------------------+------+
|       id|total_transactions|total_spent|avg_spent|unique_categories|unique_products|days_since_first_purchase|days_since_last_purchase|target|
+---------+------------------+-----------+---------+-----------------+---------------+-------------------------+------------------------+------+
|100007447|              1096|    6644.88|     6.06|              235|            226|                     4832|                    4421|

In [28]:
cols_corr = ["total_transactions", "total_spent", "avg_spent",
             "unique_categories", "unique_products", "days_since_first_purchase", "days_since_last_purchase", "repeater"]

col_features = "features"

assembler = VectorAssembler(inputCols=cols_corr, outputCol=col_features)
df_features = assembler.transform(df_train_data_with_repeater).select(col_features)

corr_matrix = Correlation.corr(df_features, col_features).collect()[0][0].toArray().tolist()

print(f'Computed correlations among {cols_corr}:')
fig = px.imshow(corr_matrix, title='Correlations',
                x = cols_corr, y = cols_corr,
                color_continuous_scale='Sunsetdark',  # Sunsetdark, RdBu_r
                text_auto=True)
fig.show()

Computed correlations among ['total_transactions', 'total_spent', 'avg_spent', 'unique_categories', 'unique_products', 'days_since_first_purchase', 'days_since_last_purchase', 'repeater']:


In [None]:
train, validation = df_train_data.randomSplit([0.8, 0.2], seed=42)

**Model Training and Evaluation**
Para as features escolhemos usar todas as colunas exceto o id, days_since_first_purchase e o days_since_last_purchase por não se aplicar e não apresentarem correlações negativas respetivamente.

In [31]:
# Combinar features em um vetor

feature_cols = [
    "total_transactions",
    "total_spent",
    "avg_spent",
    "unique_categories",
    "unique_products"
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_assembled = assembler.transform(train)
validation_assembled = assembler.transform(validation)

modelos = {
    "LogisticRegression": LogisticRegression(labelCol="target", featuresCol="features"),
    "RandomForest": RandomForestClassifier(labelCol="target", featuresCol="features"),
    "GradientBoosting": GBTClassifier(labelCol="target", featuresCol="features")
}

evaluator = BinaryClassificationEvaluator(labelCol="target", metricName="areaUnderROC")
results = {}

for nome, modelo in modelos.items():
    # Configurar validação cruzada
    paramGrid = ParamGridBuilder().build()
    if nome == "RandomForest":
        paramGrid = ParamGridBuilder() \
            .addGrid(modelo.numTrees, [50, 100]) \
            .addGrid(modelo.maxDepth, [5, 10]) \
            .build()
    elif nome == "GradientBoosting":
        paramGrid = ParamGridBuilder() \
            .addGrid(modelo.maxIter, [10, 20]) \
            .addGrid(modelo.maxDepth, [5, 7]) \
            .build()

    crossval = CrossValidator(
        estimator=modelo,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=3
    )
    
    # Treinar modelo
    cv_model = crossval.fit(train_assembled)
    
    # Avaliar na validação
    predictions = cv_model.transform(validation_assembled)
    auc = evaluator.evaluate(predictions)
    results[nome] = auc
    print(f"{nome} AUC-ROC: {auc}")

melhor_modelo_nome = max(results, key=results.get)
print(f"Melhor modelo: {melhor_modelo_nome} com AUC-ROC {results[melhor_modelo_nome]}")

IllegalArgumentException: target does not exist. Available: id, total_transactions, total_spent, avg_spent, first_purchase, last_purchase, unique_categories, unique_products, days_since_first_purchase, days_since_last_purchase, features, CrossValidator_e42282111f91_rand