# Modelo 1
Este modelo foi treinado com o resultado do join do trainHistory com o customer_features.csv, que consiste na agregação, numa tabela com uma linha por cliente, de métricas extraídas a partir de todos os registos de compra desse cliente encontrados na tabela transactions.
Este join foi feito pelo id (inner join), pelo que só entram clientes que constam em ambas as tabelas; a variável-alvo indica se o cliente voltou a comprar depois da oferta (repeater).


In [43]:

# Basic imports

import json
from pathlib import Path
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from dotenv import load_dotenv
load_dotenv('.env')
import os
import plotly.graph_objects as go

from dotenv import load_dotenv
import os
import csv
import os
import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, when, lit, to_date, datediff
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, LinearSVC, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
import plotly.express as px
import plotly.express as px
import pandas as pd


In [44]:
# Build the SparkSession shared by every cell below.
spark = SparkSession.builder.appName("DataPreparation").getOrCreate()

# BASE_PATH is the root folder of the input files (it can come from the .env
# file loaded in the imports cell). Fail here with a clear message instead of
# letting later cells try to read from the literal path "None/...".
base_path = os.getenv('BASE_PATH')
if not base_path:
    raise RuntimeError(
        "BASE_PATH is not set; define it in .env or in the environment "
        "before running this notebook."
    )

In [45]:
# First look at the per-customer feature table: let Spark infer the column
# types, then preview a few rows and the schema it came up with.
print("Info for customer_features.csv:")
customer_features_csv = f"{base_path}/improved/customer_features.csv"
df_customer_features = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(customer_features_csv)
)
df_customer_features.show(n=5)
df_customer_features.printSchema()


Info for customer_features.csv:
+---------+------------------+-----------+---------+--------------+-------------+-----------------+---------------+
|       id|total_transactions|total_spent|avg_spent|first_purchase|last_purchase|unique_categories|unique_products|
+---------+------------------+-----------+---------+--------------+-------------+-----------------+---------------+
|100007447|              1096|    6644.88|     6.06|    2012-03-06|   2013-04-21|              235|            226|
|100010021|               349|    1419.96|     4.07|    2012-03-12|   2013-05-12|              123|            109|
|100012115|               118|     553.02|     4.69|    2012-03-12|   2013-04-17|               62|             54|
|100017875|               348|     992.29|     2.85|    2012-03-04|   2013-03-16|              109|            113|
|100022923|               617|    3771.86|     6.11|    2012-03-02|   2013-05-14|              202|            170|
+---------+------------------+----------

In [46]:
# Explicit schema for trainHistory.csv: every column is read as a string except
# repeattrips, which is an integer. All columns are nullable.
_history_columns = [
    ("id", StringType()),
    ("chain", StringType()),
    ("offer", StringType()),
    ("market", StringType()),
    ("repeattrips", IntegerType()),
    ("repeater", StringType()),
    ("offerdate", StringType()),
]
schema_history = StructType(
    [StructField(column_name, column_type, True)
     for column_name, column_type in _history_columns]
)

history_csv = f"{base_path}/trainHistory.csv.gz"
df_train_history = spark.read.csv(history_csv, header=True, schema=schema_history)

In [47]:
# Explicit schema for customer_features.csv. `id` is read as a string so it has
# the same type as `id` in trainHistory for the join below; the purchase dates
# are read as strings and parsed with to_date().
schema_features = StructType([
    StructField("id", StringType(), True),
    StructField("total_transactions", IntegerType(), True),
    StructField("total_spent", DoubleType(), True),
    StructField("avg_spent", DoubleType(), True),
    StructField("first_purchase", StringType(), True),
    StructField("last_purchase", StringType(), True),
    StructField("unique_categories", IntegerType(), True),
    StructField("unique_products", IntegerType(), True)
])

df_customer_features = spark.read.csv(f"{base_path}/improved/customer_features.csv",
                                      header=True, schema=schema_features)

# Recency features are day counts up to one fixed reference date.
# NOTE(review): the date is an arbitrary constant; measuring against each
# customer's `offerdate` may be more meaningful - confirm with the authors.
REFERENCE_DATE = "2025-05-29"
reference_date = to_date(lit(REFERENCE_DATE))
df_customer_features = (
    df_customer_features
    .withColumn("days_since_first_purchase",
                datediff(reference_date, to_date(col("first_purchase"))))
    .withColumn("days_since_last_purchase",
                datediff(reference_date, to_date(col("last_purchase"))))
)

# Join features with the offer history and derive the 0/1 label ONCE; both
# training frames below are views of this single pipeline, so they cannot
# drift apart. The inner join keeps only ids present in both tables.
df_labelled = (
    df_customer_features
    .join(df_train_history.select("id", "repeater"), on="id", how="inner")
    .withColumn("target", when(col("repeater") == "t", 1).otherwise(0))
    .drop("first_purchase", "last_purchase")
)

# Modelling table: label lives in `target`, the raw "t"/other flag is removed.
df_train_data = df_labelled.drop("repeater")

# Same rows, but with the 0/1 label stored under the name `repeater`
# (this is the frame the correlation cell reads).
df_train_data_with_repeater = (
    df_labelled
    .withColumn("repeater", col("target"))
    .drop("target")
)

df_train_data.printSchema()
df_train_data.show(5)

root
 |-- id: string (nullable = true)
 |-- total_transactions: integer (nullable = true)
 |-- total_spent: double (nullable = true)
 |-- avg_spent: double (nullable = true)
 |-- unique_categories: integer (nullable = true)
 |-- unique_products: integer (nullable = true)
 |-- days_since_first_purchase: integer (nullable = true)
 |-- days_since_last_purchase: integer (nullable = true)
 |-- target: integer (nullable = false)

+---------+------------------+-----------+---------+-----------------+---------------+-------------------------+------------------------+------+
|       id|total_transactions|total_spent|avg_spent|unique_categories|unique_products|days_since_first_purchase|days_since_last_purchase|target|
+---------+------------------+-----------+---------+-----------------+---------------+-------------------------+------------------------+------+
|100007447|              1096|    6644.88|     6.06|              235|            226|                     4832|                    4421|

In [48]:
# Correlation (Correlation.corr default method) between every numeric feature
# and the 0/1 `repeater` label, drawn as an annotated heatmap.
col_features = "features"
cols_corr = [
    "total_transactions",
    "total_spent",
    "avg_spent",
    "unique_categories",
    "unique_products",
    "days_since_first_purchase",
    "days_since_last_purchase",
    "repeater",
]

# Correlation.corr works on a single vector column, so pack the columns first.
assembler = VectorAssembler(inputCols=cols_corr, outputCol=col_features)
df_features = assembler.transform(df_train_data_with_repeater).select(col_features)

# The result is a one-row DataFrame whose first field is the matrix itself.
corr_row = Correlation.corr(df_features, col_features).first()
corr_matrix = corr_row[0].toArray().tolist()

print(f'Computed correlations among {cols_corr}:')
heatmap_options = dict(
    title='Correlations',
    x=cols_corr,
    y=cols_corr,
    color_continuous_scale='Sunsetdark',  # alternative: RdBu_r
    text_auto='.2f',
    zmin=-1,
    zmax=1,
)
fig = px.imshow(corr_matrix, **heatmap_options)
fig.update_layout(
    xaxis_title="Funcionalidades",
    yaxis_title="Funcionalidades",
    width=700,
    height=600,
)
fig.show()

Computed correlations among ['total_transactions', 'total_spent', 'avg_spent', 'unique_categories', 'unique_products', 'days_since_first_purchase', 'days_since_last_purchase', 'repeater']:


In [49]:
# 80/20 train/validation split of the labelled customers, with a fixed seed.
split_weights = [0.8, 0.2]
train, validation = df_train_data.randomSplit(split_weights, seed=42)

**Model Training and Evaluation**
Para as features, escolhemos usar todas as colunas exceto o id, por não se aplicar, e as colunas days_since_first_purchase e days_since_last_purchase, por não apresentarem correlações negativas.

In [50]:
# Combine the selected feature columns into a single vector column.
feature_cols = [
    "total_transactions",
    "total_spent",
    "avg_spent",
    "unique_categories",
    "unique_products"
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Cache the assembled splits: they are reused by every model and every
# cross-validation fold, and without caching each use re-reads the CSV files
# and repeats the join.
train_assembled = assembler.transform(train).cache()
validation_assembled = assembler.transform(validation).cache()

# All classifiers read the same label and feature columns.
_common_cols = {"labelCol": "target", "featuresCol": "features"}
modelos = {
    "LogisticRegression": LogisticRegression(**_common_cols),
    "RandomForest": RandomForestClassifier(**_common_cols),
    "GradientBoosting": GBTClassifier(**_common_cols),
    "LinearSVC": LinearSVC(**_common_cols),
    "DecisionTree": DecisionTreeClassifier(**_common_cols),
}

# Models are compared on area under the ROC curve.
evaluator = BinaryClassificationEvaluator(labelCol="target", metricName="areaUnderROC")

# Filled by the training loop: model name -> AUC, and model name -> all metrics.
results = {}
detailed_results = {}

def calculate_metrics(predictions):
    """Compute classification metrics from a predictions DataFrame.

    Args:
        predictions: DataFrame with a ``target`` column (true label, 0/1) and a
            ``prediction`` column (predicted label, 0/1).

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``specificity``,
        ``f1_score`` and ``confusion_matrix`` (``TP``/``TN``/``FP``/``FN``
        counts). A ratio whose denominator is zero is reported as 0.
    """
    # Bring the (at most 2x2) confusion table to the driver with ONE Spark job,
    # instead of running a separate filter + first() job per cell.
    cells = {(1, 1): 0, (0, 0): 0, (0, 1): 0, (1, 0): 0}
    for row in predictions.groupBy('target', 'prediction').count().collect():
        if row['target'] is None or row['prediction'] is None:
            continue
        # `prediction` may be a float such as 1.0; normalise it to an int key.
        key = (int(row['target']), int(row['prediction']))
        if key in cells:  # ignore any label outside {0, 1}
            cells[key] += row['count']

    tp_val = cells[(1, 1)]
    tn_val = cells[(0, 0)]
    fp_val = cells[(0, 1)]
    fn_val = cells[(1, 0)]
    total = tp_val + tn_val + fp_val + fn_val

    accuracy = (tp_val + tn_val) / total if total > 0 else 0
    precision = tp_val / (tp_val + fp_val) if (tp_val + fp_val) > 0 else 0
    recall = tp_val / (tp_val + fn_val) if (tp_val + fn_val) > 0 else 0
    specificity = tn_val / (tn_val + fp_val) if (tn_val + fp_val) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'specificity': specificity,
        'f1_score': f1_score,
        'confusion_matrix': {'TP': tp_val, 'TN': tn_val, 'FP': fp_val, 'FN': fn_val}
    }

In [52]:
# Hyper-parameter values per model. Every grid holds a single combination, so
# cross-validation does not select between candidates here; add more values to
# a list to turn it into a real search. A model without an entry gets the
# empty grid (its default parameters).
GRID_VALUES = {
    "RandomForest": {"numTrees": [50], "maxDepth": [10], "seed": [42]},
    "GradientBoosting": {"maxIter": [50], "maxDepth": [5], "seed": [42]},
    "LinearSVC": {"maxIter": [100], "regParam": [0.1]},
    "DecisionTree": {"maxDepth": [10], "seed": [42]},
    "LogisticRegression": {"maxIter": [100], "regParam": [0.1]},
}

for nome, modelo in modelos.items():
    print(f"\n{'='*50}")
    print(f"Training {nome}...")
    print(f"{'='*50}")

    # Build the cross-validation grid for this model from the table above.
    grid_builder = ParamGridBuilder()
    for param_name, values in GRID_VALUES.get(nome, {}).items():
        grid_builder = grid_builder.addGrid(modelo.getParam(param_name), values)
    paramGrid = grid_builder.build()

    crossval = CrossValidator(
        estimator=modelo,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=3,
        seed=42  # fixed fold assignment so reruns use the same folds
    )

    # Train the model.
    cv_model = crossval.fit(train_assembled)
    # overwrite(): a plain save() raises when the directory already exists,
    # which made the notebook fail on every rerun.
    cv_model.write().overwrite().save(f"modelos_melhoria/{nome}")

    # Evaluate on the held-out validation split.
    predictions = cv_model.transform(validation_assembled)
    auc = evaluator.evaluate(predictions)
    results[nome] = auc

    metrics = calculate_metrics(predictions)
    detailed_results[nome] = {**metrics, 'auc_roc': auc}

    print(f"{nome} AUC-ROC: {auc:.4f}")
    print(f"{nome} Accuracy: {metrics['accuracy']:.4f}")
    print(f"{nome} Precision: {metrics['precision']:.4f}")
    print(f"{nome} Recall: {metrics['recall']:.4f}")
    print(f"{nome} Specificity: {metrics['specificity']:.4f}")
    print(f"{nome} F1 Score: {metrics['f1_score']:.4f}")

# The best model is the one with the highest validation AUC-ROC.
melhor_modelo_nome = max(results, key=results.get)
print(f"Melhor modelo: {melhor_modelo_nome} com AUC-ROC {results[melhor_modelo_nome]}")


Training LogisticRegression...
LogisticRegression AUC-ROC: 0.5184
LogisticRegression Accuracy: 0.7278
LogisticRegression Precision: 0.8750
LogisticRegression Recall: 0.0008
LogisticRegression Specificity: 1.0000
LogisticRegression F1 Score: 0.0016

Training RandomForest...
RandomForest AUC-ROC: 0.5660
RandomForest Accuracy: 0.7271
RandomForest Precision: 0.4286
RandomForest Recall: 0.0059
RandomForest Specificity: 0.9971
RandomForest F1 Score: 0.0116

Training GradientBoosting...
GradientBoosting AUC-ROC: 0.5649
GradientBoosting Accuracy: 0.7272
GradientBoosting Precision: 0.4434
GradientBoosting Recall: 0.0054
GradientBoosting Specificity: 0.9975
GradientBoosting F1 Score: 0.0107

Training LinearSVC...
LinearSVC AUC-ROC: 0.5597
LinearSVC Accuracy: 0.7276
LinearSVC Precision: 0.0000
LinearSVC Recall: 0.0000
LinearSVC Specificity: 1.0000
LinearSVC F1 Score: 0.0000

Training DecisionTree...
DecisionTree AUC-ROC: 0.4819
DecisionTree Accuracy: 0.7238
DecisionTree Precision: 0.3484
Decisio

Adicionar comparação dos modelos num plot

In [None]:
# Compare the validation AUC-ROC that the training loop actually measured.
# The chart reads the `results` dict (model name -> AUC) instead of hand-typed
# example numbers, so it can never disagree with the printed metrics.
if not results:
    raise RuntimeError("`results` is empty; run the training cell before plotting.")

# (name, auc, None) keeps the three-field layout of the original placeholder list.
results_data = [(model_name, model_auc, None) for model_name, model_auc in results.items()]

# Create a DataFrame for Plotly.
data = {
    "Model": [name for name, _, _ in results_data],
    "AUC-ROC": [auc_value for _, auc_value, _ in results_data]
}
df = pd.DataFrame(data)

# Create a bar chart using Plotly Express.
fig = px.bar(
    df,
    x="Model",
    y="AUC-ROC",
    title="Comparison of Model Performance (AUC-ROC)",
    labels={"AUC-ROC": "AUC-ROC Score", "Model": "Model Name"},
    color="Model",
    color_discrete_sequence=px.colors.qualitative.Plotly
)

# Update layout for better readability.
fig.update_layout(
    xaxis_title="Model",
    yaxis_title="AUC-ROC Score",
    yaxis_range=[0, 1],  # AUC-ROC lies between 0 and 1
    showlegend=False,
    title_x=0.5
)

# Show the plot.
fig.show()