In [1]:
# Basic imports

import json
from pathlib import Path
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from dotenv import load_dotenv
load_dotenv('.env')
import os
import plotly.graph_objects as go

from dotenv import load_dotenv
import os


In [2]:
# Build SparkSession
spark = SparkSession.builder.appName("DataPreparation").getOrCreate()
base_path = os.getenv('BASE_PATH')

## Data Ingestion

In [3]:
# Data to read - offers.csv
data_dir_offers = f'{base_path}/offers.csv.gz'
data_file_offers = data_dir_offers

! head $data_file_offers

# Data to read - sampleSubmission.csv
data_dir_sampleSubmission = f'{base_path}/sampleSubmission.csv.gz'
data_file_sampleSubmission = data_dir_sampleSubmission

! head $data_file_sampleSubmission

# Data to read - testHistory.csv
data_dir_testHistory = f'{base_path}/testHistory.csv.gz'
data_file_testHistory = data_dir_testHistory

! head $data_file_testHistory

# Data to read - trainHistory.csv
data_dir_trainHistory = f'{base_path}/trainHistory.csv.gz'
data_file_trainHistory = data_dir_trainHistory

! head $data_file_trainHistory

# Data to read - transactions.csv
data_dir_transactions = f'{base_path}/transactions.csv.gz'
data_file_transactions = data_dir_transactions

! head $data_file_transactions;

�}ESoffers.csv ���n�0��=Q�����u�0`K� ���������Y)�^__��|��|?�����p���\������x���=�|{�������fW�q.g�brjm��Q7� )��u�X�t)XG&fa�O�$j9�(���׶#A��䜘�Bw��
,d�2�3;�b4[&� ̞E��f�l<�Ȉ���D�"��#u���V҃��Y�@���ʎ��W;XKU�C�6� ���5j�T���g��̛�ܮA�U=@AV0',%�W��M�=Sp!��^�����9�{�h�Ȭ��o�F#F�|l�W�Yխ�X���#
fʥ�+}�p��<�������B!�Q�E"��p�2M�^�*0K�c�������@@�4��e�I�Tb�����A��Kˆ��ч�ˡɿl��PH�Υ���,�6�Yoe0�d�����4�}���۰��c����Z�2+�]�v�t������l����#9�^�G���H��2�,-	��1{FGJ��� �:�U.j)��?��8z��2�:�`l�A��Rz�t���>tXw��r:u�q��ݢqS�2� �9�,��VF.��RQ��Y�^6Ȟf��Mf���\�Y'����2��ud��#�����"���0}B����Y��
��%Q�Y*Չ���j�H"����>s,w*�c�,e���+ �=�<��X�b��I��XV��Q�����:�5s����P��exw�1ǕG����1�L����h;����êʗf�exG����V�5�X-%8�8�=�I/��@�!���\��aK/Q86���4�&:�dt�,�ɹ�5��]�v�	IX���l�.��b-C֭c{��L,+h�� ����9������;��;���c��۸w��Y�'3��1����Bm���:N�Ē	�/����G�A-�0>ˁYƑ���ٌK7�9f<V�-Is̲�:�����:���
��ޱ���1��c!|�YM5X:���:��ޚƼէ#cff���\����F��r��؆��Q�z$-�m��ģ�w=*�s���x��^R`ײ痍��

**Offers**

In [4]:
# Reading data - offers.csv
df_offers = spark.read.csv(
        data_file_offers, 
        header=True, sep=',', inferSchema=True
    )

**TESTHISTORY**

In [5]:
# Reading data - testHistory.csv
df_testHistory = spark.read.csv(
        data_file_testHistory, 
        header=True, sep=',', inferSchema=True
    )

**TRAINHISTORY**

In [6]:
# Reading data - trainHistory.csv
df_trainHistory = spark.read.csv(
        data_file_trainHistory, 
        header=True, sep=',', inferSchema=True
    )

**TRANSACTIONS**

In [7]:
df_transactions = spark.read.csv(
    data_file_transactions, 
    header=True, sep=',', inferSchema=False
).sample(fraction=0.001, seed=42).limit(1000000)

In [9]:
import csv
import os
import datetime

# Input and output paths
input_dir = "/home/jovyan/code/data/output_files"
output_file = "/home/jovyan/code/data/customer_features.csv"

# Dictionary to store features for all customers
features = {}

# Process each customer file
for filename in os.listdir(input_dir):
    if filename.endswith(".csv"):
        customer_id = filename.split(".")[0]
        file_path = os.path.join(input_dir, filename)

        total_transactions = 0
        total_spent = 0.0
        first_purchase = None
        last_purchase = None
        unique_categories = set()
        unique_products = set()

        # Open and read the file
        with open(file_path, 'r', newline='') as infile:
            reader = csv.reader(infile)
            next(reader)  # Skip header

            for row in reader:
                total_transactions += 1
                amount = float(row[10])  # purchaseamount is the 11th column (index 10)
                total_spent += amount

                # Parse date for temporal features
                date_str = row[6]  # date is the 7th column (index 6)
                date = datetime.datetime.strptime(date_str, '%Y-%m-%d')
                if first_purchase is None or date < first_purchase:
                    first_purchase = date
                if last_purchase is None or date > last_purchase:
                    last_purchase = date

                # Track unique categories and products (using company as proxy for product)
                unique_categories.add(row[3])  # category is the 4th column (index 3)
                unique_products.add(row[4])    # company is the 5th column (index 4)

        # Calculate average spent
        avg_spent = total_spent / total_transactions if total_transactions > 0 else 0.0

        # Store features
        features[customer_id] = {
            "total_transactions": total_transactions,
            "total_spent": round(total_spent, 2),
            "avg_spent": round(avg_spent, 2),
            "first_purchase": first_purchase.strftime('%Y-%m-%d') if first_purchase else "",
            "last_purchase": last_purchase.strftime('%Y-%m-%d') if last_purchase else "",
            "unique_categories": len(unique_categories),
            "unique_products": len(unique_products)
        }

# Write features to output CSV
with open(output_file, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    # Write header
    writer.writerow([
        "id",
        "total_transactions",
        "total_spent",
        "avg_spent",
        "first_purchase",
        "last_purchase",
        "unique_categories",
        "unique_products"
    ])
    # Write data for each customer
    for customer_id, feature_dict in features.items():
        writer.writerow([
            customer_id,
            feature_dict["total_transactions"],
            feature_dict["total_spent"],
            feature_dict["avg_spent"],
            feature_dict["first_purchase"],
            feature_dict["last_purchase"],
            feature_dict["unique_categories"],
            feature_dict["unique_products"]
        ])

print(f"Features saved to {output_file}")

Features saved to /home/jovyan/code/data/customer_features.csv


In [8]:
# Data to read - transactions.csv
data_dir_customer_features = '/home/jovyan/code/data/customer_features.csv'
data_file_customer_features= data_dir_customer_features

! head $data_file_customer_features;
# Reading data - offers.csv
df_customer_features = spark.read.csv(
        data_file_customer_features, 
        header=True, sep=',', inferSchema=True
    )

# Checking data that has been read - testHistory.csv
print(f'df_customer_features - number of rows: {df_customer_features.count()}')
df_customer_features.printSchema()
df_customer_features.show(10, truncate=False)


id,total_transactions,total_spent,avg_spent,first_purchase,last_purchase,unique_categories,unique_products
100007447,1096,6644.88,6.06,2012-03-06,2013-04-21,235,226
100010021,349,1419.96,4.07,2012-03-12,2013-05-12,123,109
100012115,118,553.02,4.69,2012-03-12,2013-04-17,62,54
100017875,348,992.29,2.85,2012-03-04,2013-03-16,109,113
100022923,617,3771.86,6.11,2012-03-02,2013-05-14,202,170
100025687,367,2091.19,5.7,2012-03-04,2013-06-11,120,114
100027361,696,3767.32,5.41,2012-03-09,2013-06-16,191,212
100029473,1644,7003.57,4.26,2012-03-04,2013-06-18,346,284
100033247,1596,6834.73,4.28,2012-03-04,2013-03-28,265,237
df_customer_features - number of rows: 311541
root
 |-- id: long (nullable = true)
 |-- total_transactions: integer (nullable = true)
 |-- total_spent: double (nullable = true)
 |-- avg_spent: double (nullable = true)
 |-- first_purchase: date (nullable = true)
 |-- last_purchase: date (nullable = true)
 |-- unique_categories: integer (nullable = true)
 |-- unique_products: integ

In [10]:
# Dividir em treino e validação
train, validation = train_data.randomSplit([0.8, 0.2], seed=42)

NameError: name 'train_data' is not defined

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, when, lit, to_date, datediff

# Configurar sessão Spark
spark = SparkSession.builder \
    .appName("ShoppersChallenge") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

# Definir esquema para customer_features.csv
schema_features = StructType([
    StructField("id", StringType(), True),
    StructField("total_transactions", IntegerType(), True),
    StructField("total_spent", DoubleType(), True),
    StructField("avg_spent", DoubleType(), True),
    StructField("first_purchase", StringType(), True),
    StructField("last_purchase", StringType(), True),
    StructField("unique_categories", IntegerType(), True),
    StructField("unique_products", IntegerType(), True)
])

# Carregar customer_features.csv
features_df = spark.read.csv("/home/jovyan/code/data/customer_features.csv", header=True, schema=schema_features)

# Converter first_purchase e last_purchase para features numéricas
reference_date = to_date(lit("2025-05-29"))
features_df = features_df.withColumn("days_since_first", datediff(reference_date, to_date("first_purchase"))) \
                        .withColumn("days_since_last", datediff(reference_date, to_date("last_purchase")))

# Definir esquema para trainHistory.csv
schema_history = StructType([
    StructField("id", StringType(), True),
    StructField("chain", StringType(), True),
    StructField("offer", StringType(), True),
    StructField("market", StringType(), True),
    StructField("repeattrips", IntegerType(), True),
    StructField("repeater", StringType(), True),
    StructField("offerdate", StringType(), True)
])


# Juntar features com a variável alvo
train_data = features_df.join(
    df_trainHistory.select("id", "repeater"),
    features_df.id == df_trainHistory.id,
    "inner"
).withColumn("target", when(col("repeater") == "t", 1).otherwise(0)).drop("repeater", "first_purchase", "last_purchase")

In [17]:
# Dividir em treino e validação
train, validation = train_data.randomSplit([0.8, 0.2], seed=42)

In [18]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Combinar features em um vetor
feature_cols = [
    "total_transactions",
    "total_spent",
    "avg_spent",
    "unique_categories",
    "unique_products",
    "days_since_first",
    "days_since_last"
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_assembled = assembler.transform(train)
validation_assembled = assembler.transform(validation)

# Inicializar modelos
models = {
    "LogisticRegression": LogisticRegression(labelCol="target", featuresCol="features"),
    "RandomForest": RandomForestClassifier(labelCol="target", featuresCol="features"),
    "GradientBoosting": GBTClassifier(labelCol="target", featuresCol="features")
}

# Avaliar modelos
evaluator = BinaryClassificationEvaluator(labelCol="target", metricName="areaUnderROC")
results = {}

for name, model in models.items():
    # Configurar validação cruzada
    paramGrid = ParamGridBuilder().build()
    if name == "RandomForest":
        paramGrid = ParamGridBuilder() \
            .addGrid(model.numTrees, [50, 100]) \
            .addGrid(model.maxDepth, [5, 10]) \
            .build()
    elif name == "GradientBoosting":
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxIter, [10, 20]) \
            .addGrid(model.maxDepth, [5, 7]) \
            .build()
    
    crossval = CrossValidator(
        estimator=model,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=3
    )
    
    # Treinar modelo
    cv_model = crossval.fit(train_assembled)
    
    # Avaliar na validação
    predictions = cv_model.transform(validation_assembled)
    auc = evaluator.evaluate(predictions)
    results[name] = auc
    print(f"{name} AUC-ROC: {auc}")

# Selecionar o melhor modelo
best_model_name = max(results, key=results.get)
print(f"Melhor modelo: {best_model_name} com AUC-ROC {results[best_model_name]}")

LogisticRegression AUC-ROC: 0.5927542816089806
RandomForest AUC-ROC: 0.6365021366961058
GradientBoosting AUC-ROC: 0.6333764174816299
Melhor modelo: RandomForest com AUC-ROC 0.6365021366961058


In [9]:
from pyspark.sql.functions import col, count, sum, avg, min, datediff, to_date

# Step 1: Join transactions with offers to identify relevant transactions
df_transactions = df_transactions.withColumnRenamed("category", "trans_category") \
                                .withColumnRenamed("company", "trans_company") \
                                .withColumnRenamed("brand", "trans_brand")

df_offer_transactions = df_transactions.join(
    df_offers.select("offer", "category", "company", "brand"),
    (df_transactions.trans_category == df_offers.category) &
    (df_transactions.trans_company == df_offers.company) &
    (df_transactions.trans_brand == df_offers.brand),
    "inner"
)

In [10]:
# Step 2: Join with trainHistory to associate transactions with customers and offerdates
# Include 'id' and 'chain' in the join to ensure correct matching
df_offer_transactions = df_offer_transactions.join(
    df_trainHistory.select("id", "chain", "offer", "offerdate"),
    ["id", "offer", "chain"],  # Join on id, offer, and chain to avoid ambiguity
    "inner"
)

In [11]:
# Step 3: Filter transactions before offerdate
df_offer_transactions = df_offer_transactions.filter(
    to_date(df_offer_transactions.date) <= to_date(df_offer_transactions.offerdate)
)

In [12]:
# Step 4: Aggregate offer-specific features
offer_features = df_offer_transactions.groupBy("id").agg(
    count("*").alias("offer_purchase_count"),
    sum("purchaseamount").alias("offer_total_spent"),
    avg("purchaseamount").alias("offer_avg_spent"),
    min(datediff(to_date("offerdate"), to_date("date"))).alias("days_since_last_offer_purchase")
)

In [13]:
# Step 5: Join with existing customer features
df_customer_features = df_customer_features.join(
    offer_features,
    "id",
    "left"
).na.fill(0)  # Fill nulls for customers with no offer-related transactions

In [None]:
# Drop date columns to avoid issues
# df_customer_features = df_customer_features.drop("first_purchase", "last_purchase")

# Verify updated schema
df_customer_features.printSchema()
# df_customer_features.show(2, truncate=False)

root
 |-- id: long (nullable = true)
 |-- total_transactions: integer (nullable = true)
 |-- total_spent: double (nullable = false)
 |-- avg_spent: double (nullable = false)
 |-- first_purchase: date (nullable = true)
 |-- last_purchase: date (nullable = true)
 |-- unique_categories: integer (nullable = true)
 |-- unique_products: integer (nullable = true)
 |-- offer_purchase_count: long (nullable = true)
 |-- offer_total_spent: double (nullable = false)
 |-- offer_avg_spent: double (nullable = false)
 |-- days_since_last_offer_purchase: integer (nullable = true)



In [19]:
feature_cols = [
    "total_transactions", "total_spent","avg_spent", "unique_categories", "unique_products",
    "offer_purchase_count", "offer_total_spent", "offer_avg_spent","days_since_last_offer_purchase"
]
# Cache df_customer_features
df_customer_features.cache()

# Compute summary statistics
df_customer_features.select(feature_cols).summary().show(truncate=False)

# Unpersist when done to free memory
df_customer_features.unpersist()

+-------+------------------+------------------+------------------+-----------------+------------------+--------------------+---------------------+---------------------+------------------------------+
|summary|total_transactions|total_spent       |avg_spent         |unique_categories|unique_products   |offer_purchase_count|offer_total_spent    |offer_avg_spent      |days_since_last_offer_purchase|
+-------+------------------+------------------+------------------+-----------------+------------------+--------------------+---------------------+---------------------+------------------------------+
|count  |311541            |311541            |311541            |311541           |311541            |311541              |311541               |311541               |311541                        |
|mean   |1122.3427702934766|5038.425828478443 |4.359585672511802 |192.4385297601279|198.97684734914506|3.306145900539576E-4|0.0023614548325902535|0.0020724238543241504|0.03375799653978128           |


DataFrame[id: bigint, total_transactions: int, total_spent: double, avg_spent: double, first_purchase: date, last_purchase: date, unique_categories: int, unique_products: int, offer_purchase_count: bigint, offer_total_spent: double, offer_avg_spent: double, days_since_last_offer_purchase: int]

As funcionalidades específicas de oferta, como offer_purchase_count, são muito escassas. As estatísticas mostram uma média de ~0.00033 e 75% dos clientes com valor 0, indica que poucos clientes têm transações relacionadas com as ofertas.
Esta escassez deve-se à junção restritiva entre transactions.csv e offers.csv, que exige correspondência exata em três colunas: category, company e brand. Por exemplo, uma oferta para "Bebidas, Coca-Cola, Sprite" só inclui transações de Sprite da Coca-Cola na categoria Bebidas.
A restrição date <= offerdate e o número limitado de combinações em offers.csv reduzem ainda mais as transações válidas, tornando estas funcionalidades pouco informativas para prever o estado de repeater.

In [14]:
from pyspark.sql.functions import count, sum, avg, min, datediff, to_date

# Looser join on category only
df_offer_transactions = df_transactions.join(
    df_offers.select("offer", "category"),
    df_transactions.trans_category == df_offers.category,
    "inner"
).join(
    df_trainHistory.select("id", "chain", "offer", "offerdate"),
    ["id", "offer", "chain"],
    "inner"
).filter(to_date("date") <= to_date("offerdate"))

# Aggregate category-based offer features
offer_features = df_offer_transactions.groupBy("id").agg(
    count("*").alias("offer_purchase_count_cat"),
    sum("purchaseamount").alias("offer_total_spent_cat"),
    avg("purchaseamount").alias("offer_avg_spent_cat"),
    min(datediff(to_date("offerdate"), to_date("date"))).alias("days_since_last_offer_purchase_cat")
)

# Join with df_customer_features
df_customer_features = df_customer_features.join(offer_features, "id", "left").na.fill(0)

# Check summary of new features
df_customer_features.select(
    "offer_purchase_count_cat", "offer_total_spent_cat", "offer_avg_spent_cat"
).summary().show(truncate=False)

+-------+------------------------+---------------------+--------------------+
|summary|offer_purchase_count_cat|offer_total_spent_cat|offer_avg_spent_cat |
+-------+------------------------+---------------------+--------------------+
|count  |311541                  |311541               |311541              |
|mean   |0.0029017047515415306   |0.014571918302887791 |0.013192694082457313|
|stddev |0.06058118879316296     |0.47635459796674195  |0.3207507011622313  |
|min    |0                       |-1.09                |-1.09               |
|25%    |0                       |0.0                  |0.0                 |
|50%    |0                       |0.0                  |0.0                 |
|75%    |0                       |0.0                  |0.0                 |
|max    |8                       |167.03999999999996   |32.94               |
+-------+------------------------+---------------------+--------------------+



In [17]:
from pyspark.sql.functions import lag, datediff, avg
from pyspark.sql.window import Window

window_spec = Window.partitionBy("id").orderBy("date")
trans_with_diff = df_transactions.withColumn(
    "prev_date",
    lag("date").over(window_spec)
).withColumn(
    "days_diff",
    datediff(to_date(col("date")), to_date(col("prev_date")))
)

freq_features = trans_with_diff.groupBy("id").agg(
    avg("days_diff").alias("avg_days_between_purchases")
)
df_customer_features = df_customer_features.join(
    freq_features,
    "id",
    "left"
).na.fill(999)

df_customer_features.select("avg_days_between_purchases").summary().show(truncate=False)

+-------+--------------------------+
|summary|avg_days_between_purchases|
+-------+--------------------------+
|count  |311541                    |
|mean   |762.6616668467402         |
|stddev |392.6921829827085         |
|min    |0.0                       |
|25%    |274.0                     |
|50%    |999.0                     |
|75%    |999.0                     |
|max    |999.0                     |
+-------+--------------------------+



In [16]:
offer_value = df_trainHistory.select("id", "offer").join(
    df_offers.select("offer", "offervalue"),
    "offer"
)
df_customer_features = df_customer_features.join(
    offer_value.select("id", "offervalue"),
    "id",
    "left"
).withColumn(
    "offer_value_per_transaction",
    col("offervalue") / col("offer_purchase_count_cat")
).na.fill(0)

df_customer_features.select("offer_value_per_transaction").summary().show(truncate=False)

+-------+---------------------------+
|summary|offer_value_per_transaction|
+-------+---------------------------+
|count  |311541                     |
|mean   |0.0029271714752518246      |
|stddev |0.05757790593524346        |
|min    |0.0                        |
|25%    |0.0                        |
|50%    |0.0                        |
|75%    |0.0                        |
|max    |3.0                        |
+-------+---------------------------+



In [15]:
from pyspark.sql.functions import countDistinct

chain_features = df_transactions.groupBy("id").agg(
    countDistinct("chain").alias("unique_chains")
)
df_customer_features = df_customer_features.join(
    chain_features,
    "id",
    "left"
).na.fill(0)

df_customer_features.select("unique_chains").summary().show(truncate=False)

+-------+------------------+
|summary|unique_chains     |
+-------+------------------+
|count  |311541            |
|mean   |0.5538693141512674|
|stddev |0.4970968817692563|
|min    |0                 |
|25%    |0                 |
|50%    |1                 |
|75%    |1                 |
|max    |2                 |
+-------+------------------+



In [19]:
df_customer_features.printSchema()

root
 |-- id: long (nullable = true)
 |-- total_transactions: integer (nullable = true)
 |-- total_spent: double (nullable = false)
 |-- avg_spent: double (nullable = false)
 |-- first_purchase: date (nullable = true)
 |-- last_purchase: date (nullable = true)
 |-- unique_categories: integer (nullable = true)
 |-- unique_products: integer (nullable = true)
 |-- offer_purchase_count: long (nullable = true)
 |-- offer_total_spent: double (nullable = false)
 |-- offer_avg_spent: double (nullable = false)
 |-- days_since_last_offer_purchase: integer (nullable = true)
 |-- offer_purchase_count_cat: long (nullable = true)
 |-- offer_total_spent_cat: double (nullable = false)
 |-- offer_avg_spent_cat: double (nullable = false)
 |-- days_since_last_offer_purchase_cat: integer (nullable = true)
 |-- unique_chains: long (nullable = true)
 |-- offervalue: double (nullable = false)
 |-- offer_value_per_transaction: double (nullable = false)
 |-- avg_days_between_purchases: double (nullable = false

**valiar Funcionalidades - Verificar Correlações**

In [21]:
feature_cols = [
    "total_transactions", "total_spent", "avg_spent",
    "unique_categories", "unique_products",
    "offer_purchase_count", "offer_total_spent", "offer_avg_spent",
    "days_since_last_offer_purchase",
    "offer_purchase_count_cat", "offer_total_spent_cat", "offer_avg_spent_cat",
    "days_since_last_offer_purchase_cat",
    "unique_chains", "offervalue", "offer_value_per_transaction",
    "avg_days_between_purchases"
]

In [22]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler



# Amostrar 10% para eficiência (opcional, remover se o dataset for gerível)
sample_df = df_customer_features.sample(fraction=0.1, seed=42)

# Combinar funcionalidades num vetor
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_for_corr")
corr_df = assembler.transform(sample_df).select("features_for_corr")

# Calcular matriz de correlação
correlation_matrix = Correlation.corr(corr_df, "features_for_corr").head()[0]
print("Matriz de Correlação:")
for i, row in enumerate(correlation_matrix.toArray()):
    print(f"{feature_cols[i]}: {row}")

Matriz de Correlação:
total_transactions: [ 1.          0.98660498  0.02908827  0.13359913  0.40951923  0.19240689
  0.30405577  0.28483415  0.02215508  0.69727804  0.54578781  0.31086169
  0.04812621  0.04185929  0.01323587  0.04020481 -0.05566961]
total_spent: [ 0.98660498  1.          0.04480806  0.08016611  0.33660929  0.16225113
  0.25658141  0.24029869  0.01850099  0.69602142  0.50597579  0.27159314
  0.04418412  0.01796431  0.00886273  0.02877265 -0.02782393]
avg_spent: [ 2.90882733e-02  4.48080598e-02  1.00000000e+00 -2.51785504e-01
 -9.45975817e-02  1.35204929e-02  2.15331053e-02  1.98103669e-02
  8.45101578e-04  2.72097217e-02  2.98551227e-02  1.39704380e-02
 -5.60719144e-03 -1.32031014e-01 -6.70670098e-02 -5.34870401e-03
  1.22979856e-01]
unique_categories: [ 0.13359913  0.08016611 -0.2517855   1.          0.8653264   0.02822381
  0.03826858  0.03739519  0.0111758   0.06362297  0.05483902  0.05309871
  0.02937216  0.45540342  0.08290755  0.03161104 -0.44742767]
unique_produc

In [24]:
import plotly.express as px
import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

# Definir feature_cols
feature_cols = [
    "total_transactions", "total_spent", "avg_spent",
    "unique_categories", "unique_products",
    "offer_purchase_count", "offer_total_spent", "offer_avg_spent",
    "days_since_last_offer_purchase",
    "offer_purchase_count_cat", "offer_total_spent_cat", "offer_avg_spent_cat",
    "days_since_last_offer_purchase_cat",
    "unique_chains", "offervalue", "offer_value_per_transaction",
    "avg_days_between_purchases"
]

# Verificar se todas as colunas existem
missing_cols = [col for col in feature_cols if col not in df_customer_features.columns]
if missing_cols:
    print(f"Colunas ausentes: {missing_cols}")
    feature_cols = [col for col in feature_cols if col not in missing_cols]

# Combinar funcionalidades num vetor
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_for_corr")
corr_df = assembler.transform(df_customer_features.sample(fraction=0.1, seed=42)).select("features_for_corr")

# Calcular matriz de correlação
correlation_matrix = Correlation.corr(corr_df, "features_for_corr").head()[0].toArray()

# Plotar correlações
print(f'Correlações calculadas entre {feature_cols}:')
fig = px.imshow(
    correlation_matrix,
    title='Correlações entre Funcionalidades',
    x=feature_cols,
    y=feature_cols,
    color_continuous_scale='Sunsetdark',
    text_auto='.2f',
    zmin=-1, zmax=1
)
fig.update_layout(
    xaxis_title="Funcionalidades",
    yaxis_title="Funcionalidades",
    width=800,
    height=800
)
fig.show()

Correlações calculadas entre ['total_transactions', 'total_spent', 'avg_spent', 'unique_categories', 'unique_products', 'offer_purchase_count', 'offer_total_spent', 'offer_avg_spent', 'days_since_last_offer_purchase', 'offer_purchase_count_cat', 'offer_total_spent_cat', 'offer_avg_spent_cat', 'days_since_last_offer_purchase_cat', 'unique_chains', 'offervalue', 'offer_value_per_transaction', 'avg_days_between_purchases']:


Redundâncias Eliminadas: Excluímos total_spent, unique_categories, offer_total_spent, offer_avg_spent, offer_avg_spent_cat.

Importância Mantida: total_transactions, avg_spent, unique_products, offer_total_spent_cat.

Informações Únicas: days_since_last_offer_purchase, days_since_last_offer_purchase_cat, offervalue, offer_value_per_transaction, unique_chains, avg_days_between_purchases.

Relevância: Foco em volume (total_transactions), gasto médio (avg_spent), recência (days_since_last_offer_purchase_cat), e oferta (offervalue).

In [26]:
feature_cols = [
    "total_transactions",           # Alta importância (0.3538)
    "avg_spent",                    # Alta importância (0.2218)
    "unique_products",              # Moderada importância (0.1271)
    "offer_purchase_count",         # Comportamento na oferta
    "days_since_last_offer_purchase", # Recência
    "offer_purchase_count_cat",     # Comportamento na categoria
    "offer_total_spent_cat",        # Moderada importância (0.0999)
    "days_since_last_offer_purchase_cat", # Recência na categoria
    "unique_chains",                # Diversidade de lojas
    "offervalue",                   # Valor da oferta
    "offer_value_per_transaction",  # Incentivo por transação
    "avg_days_between_purchases"    # Frequência
]

In [None]:
# Preparar dados de treino (já criado anteriormente)
from pyspark.sql.functions import when

train_df = df_customer_features.join(
    df_trainHistory.select("id", "repeater"),
    "id",
    "inner"
).withColumn("target", when(col("repeater") == "t", 1).otherwise(0))

# Dividir em treino (70%) e validação (30%)
train_split, val_split = train_df.randomSplit([0.7, 0.3], seed=42)

# Cache para eficiência
train_split.cache()
val_split.cache()

# Verificar tamanhos
print(f"Treino: {train_split.count()} linhas")
print(f"Validação: {val_split.count()} linhas")

In [27]:
# Preparar dados de treino (já criado anteriormente)
from pyspark.sql.functions import when

train_df = df_customer_features.join(
    df_trainHistory.select("id", "repeater"),
    "id",
    "inner"
).withColumn("target", when(col("repeater") == "t", 1).otherwise(0))

# Dividir em treino (70%) e validação (30%)
train_split, val_split = train_df.randomSplit([0.7, 0.3], seed=42)

# Cache para eficiência
train_split.cache()
val_split.cache()

# Verificar tamanhos
print(f"Treino: {train_split.count()} linhas")
print(f"Validação: {val_split.count()} linhas")

Treino: 112283 linhas
Validação: 47774 linhas


In [32]:
from pyspark.sql.functions import when
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LinearSVC

# 1. Dividir em treino (70%) e validação (30%)
train_df = df_customer_features.join(
    df_trainHistory.select("id", "repeater"),
    "id",
    "inner"
).withColumn("target", when(col("repeater") == "t", 1).otherwise(0))

train_split, val_split = train_df.randomSplit([0.7, 0.3], seed=42)

# Cache para eficiência
train_split.cache()
val_split.cache()

# Verificar tamanhos
print(f"Treino: {train_split.count()} linhas")
print(f"Validação: {val_split.count()} linhas")

# 2. Liberar memória do train_df
train_df.unpersist()

# 3. Definir feature_cols
feature_cols = [
    "total_transactions", "avg_spent",
    "unique_products",
    "offer_purchase_count",
    "days_since_last_offer_purchase",
    "offer_purchase_count_cat", "offer_total_spent_cat",
    "days_since_last_offer_purchase_cat",
    "unique_chains", "offervalue", "offer_value_per_transaction",
    "avg_days_between_purchases"
]

# 4. Configurar pipeline
# Definir colunas categóricas (vazia, pois todas são numéricas)
categorical_cols = []

# StringIndexer
indexers = [
    StringIndexer(inputCol=col, outputCol=f"{col}_index", handleInvalid="skip")
    for col in categorical_cols
]

# OneHotEncoder
encoders = [
    OneHotEncoder(inputCols=[f"{col}_index"], outputCols=[f"{col}_ohe"])
    for col in categorical_cols
]

# VectorAssembler
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# LinearSVC
svc = LinearSVC(labelCol="target", featuresCol="features", maxIter=100, regParam=0.1)

# Criar pipeline
pipeline = Pipeline(stages=indexers + encoders + [assembler, svc])

# 5. Treinar modelo
model = pipeline.fit(train_split)

Treino: 112283 linhas
Validação: 47774 linhas


In [33]:
from pyspark.sql.functions import col
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Fazer previsões
val_predictions = model.transform(val_split)

# Mostrar esquema e colunas
print("Esquema do DataFrame de previsões:")
val_predictions.printSchema()
print("Amostra das colunas features, rawPrediction, prediction, target:")
val_predictions.select("features", "rawPrediction", "prediction", "target").show(10, truncate=False)

# Calcular AUC-ROC
evaluator = BinaryClassificationEvaluator(labelCol="target", metricName="areaUnderROC")
auc_roc = evaluator.evaluate(val_predictions)
print(f"AUC-ROC (Validação): {auc_roc:.4f}")

# Computar matriz de confusão
confusion_matrix = val_predictions.groupBy("target", "prediction").count().collect()
TP = TN = FP = FN = 0
for row in confusion_matrix:
    actual = row["target"]
    predicted = row["prediction"]
    count = row["count"]
    if actual == 1 and predicted == 1:
        TP = count
    elif actual == 0 and predicted == 0:
        TN = count
    elif actual == 0 and predicted == 1:
        FP = count
    elif actual == 1 and predicted == 0:
        FN = count

print("Matriz de Confusão:")
print(f"{'':>10} {'Predicted 0':>12} {'Predicted 1':>12}")
print(f"{'Actual 0':>10} {TN:>12} {FP:>12}")
print(f"{'Actual 1':>10} {FN:>12} {TP:>12}")

# Calcular métricas
total = TP + TN + FP + FN
accuracy = (TP + TN) / total if total > 0 else 0.0
precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

print("Métricas de Avaliação:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"F1 Score: {f1_score:.4f}")

Esquema do DataFrame de previsões:
root
 |-- id: long (nullable = true)
 |-- total_transactions: integer (nullable = true)
 |-- total_spent: double (nullable = false)
 |-- avg_spent: double (nullable = false)
 |-- first_purchase: date (nullable = true)
 |-- last_purchase: date (nullable = true)
 |-- unique_categories: integer (nullable = true)
 |-- unique_products: integer (nullable = true)
 |-- offer_purchase_count: long (nullable = true)
 |-- offer_total_spent: double (nullable = false)
 |-- offer_avg_spent: double (nullable = false)
 |-- days_since_last_offer_purchase: integer (nullable = true)
 |-- offer_purchase_count_cat: long (nullable = true)
 |-- offer_total_spent_cat: double (nullable = false)
 |-- offer_avg_spent_cat: double (nullable = false)
 |-- days_since_last_offer_purchase_cat: integer (nullable = true)
 |-- unique_chains: long (nullable = true)
 |-- offervalue: double (nullable = false)
 |-- offer_value_per_transaction: double (nullable = false)
 |-- avg_days_between_

In [37]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Definir colunas categóricas (vazia)
categorical_cols = []

# StringIndexer
indexers = [
    StringIndexer(inputCol=col, outputCol=f"{col}_index", handleInvalid="skip")
    for col in categorical_cols
]

# OneHotEncoder
encoders = [
    OneHotEncoder(inputCols=[f"{col}_index"], outputCols=[f"{col}_ohe"])
    for col in categorical_cols
]

# VectorAssembler
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [38]:
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, DecisionTreeClassifier, GBTClassifier

# Dicionário para armazenar resultados
results = []

# 1. RandomForestClassifier
rf = RandomForestClassifier(labelCol="target", featuresCol="features", numTrees=50, maxDepth=10, seed=42)
rf_pipeline = Pipeline(stages=indexers + encoders + [assembler, rf])
rf_model = rf_pipeline.fit(train_split)
rf_auc_roc, rf_val_predictions = evaluate_model(rf_model, val_split, "RandomForest")
results.append(("RandomForest", rf_auc_roc, rf_model))

# 2. LogisticRegression
lr = LogisticRegression(labelCol="target", featuresCol="features", maxIter=100, regParam=0.1)
lr_pipeline = Pipeline(stages=indexers + encoders + [assembler, lr])
lr_model = lr_pipeline.fit(train_split)
lr_auc_roc, lr_val_predictions = evaluate_model(lr_model, val_split, "LogisticRegression")
results.append(("LogisticRegression", lr_auc_roc, lr_model))

# 3. DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="target", featuresCol="features", maxDepth=10, seed=42)
dt_pipeline = Pipeline(stages=indexers + encoders + [assembler, dt])
dt_model = dt_pipeline.fit(train_split)
dt_auc_roc, dt_val_predictions = evaluate_model(dt_model, val_split, "DecisionTree")
results.append(("DecisionTree", dt_auc_roc, dt_model))

# 4. GBTClassifier
gbt = GBTClassifier(labelCol="target", featuresCol="features", maxIter=50, maxDepth=5, seed=42)
gbt_pipeline = Pipeline(stages=indexers + encoders + [assembler, gbt])
gbt_model = gbt_pipeline.fit(train_split)
gbt_auc_roc, gbt_val_predictions = evaluate_model(gbt_model, val_split, "GradientBoostedTrees")
results.append(("GradientBoostedTrees", gbt_auc_roc, gbt_model))


Esquema do DataFrame de previsões (RandomForest):
root
 |-- id: long (nullable = true)
 |-- total_transactions: integer (nullable = true)
 |-- total_spent: double (nullable = false)
 |-- avg_spent: double (nullable = false)
 |-- first_purchase: date (nullable = true)
 |-- last_purchase: date (nullable = true)
 |-- unique_categories: integer (nullable = true)
 |-- unique_products: integer (nullable = true)
 |-- offer_purchase_count: long (nullable = true)
 |-- offer_total_spent: double (nullable = false)
 |-- offer_avg_spent: double (nullable = false)
 |-- days_since_last_offer_purchase: integer (nullable = true)
 |-- offer_purchase_count_cat: long (nullable = true)
 |-- offer_total_spent_cat: double (nullable = false)
 |-- offer_avg_spent_cat: double (nullable = false)
 |-- days_since_last_offer_purchase_cat: integer (nullable = true)
 |-- unique_chains: long (nullable = true)
 |-- offervalue: double (nullable = false)
 |-- offer_value_per_transaction: double (nullable = false)
 |-- a

In [39]:
# Adicionar LinearSVC (assumindo que você tem o auc_roc do resultado anterior)
# Substitua linear_svc_auc_roc pelo valor real obtido
linear_svc_auc_roc = auc_roc  # Use o valor do seu LinearSVC anterior
results.append(("LinearSVC", linear_svc_auc_roc, model))

# Comparar AUC-ROC
print("\nComparação de Modelos (AUC-ROC):")
for name, auc, _ in results:
    print(f"{name}: {auc:.4f}")

# Escolher o melhor modelo
best_model = max(results, key=lambda x: x[1])
print(f"\nMelhor modelo: {best_model[0]} com AUC-ROC: {best_model[1]:.4f}")


Comparação de Modelos (AUC-ROC):
RandomForest: 0.6559
LogisticRegression: 0.5397
DecisionTree: 0.5245
GradientBoostedTrees: 0.6600
LinearSVC: 0.5546

Melhor modelo: GradientBoostedTrees com AUC-ROC: 0.6600


In [54]:
import plotly.express as px
import pandas as pd

# Extract model names and AUC-ROC scores from the results list
# Replace these placeholder AUC-ROC values with actual values from your results
results = [
    ("RandomForest", 0.85, None),  # Example AUC-ROC value
    ("LogisticRegression", 0.80, None),  # Example AUC-ROC value
    ("DecisionTree", 0.75, None),  # Example AUC-ROC value
    ("GradientBoostedTrees", 0.88, None),  # Example AUC-ROC value
    ("LinearSVC", 0.82, None)  # Example AUC-ROC value (replace with your linear_svc_auc_roc)
]

# Create a DataFrame for Plotly
data = {
    "Model": [name for name, _, _ in results],
    "AUC-ROC": [auc for _, auc, _ in results]
}
df = pd.DataFrame(data)

# Create a bar chart using Plotly Express
fig = px.bar(
    df,
    x="Model",
    y="AUC-ROC",
    title="Comparison of Model Performance (AUC-ROC)",
    labels={"AUC-ROC": "AUC-ROC Score", "Model": "Model Name"},
    color="Model",
    color_discrete_sequence=px.colors.qualitative.Plotly
)

# Update layout for better readability
fig.update_layout(
    xaxis_title="Model",
    yaxis_title="AUC-ROC Score",
    yaxis_range=[0, 1],  # AUC-ROC scores are typically between 0 and 1
    showlegend=False,
    title_x=0.5
)

# Show the plot
fig.show()

In [42]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def evaluate_additional_metrics(predictions, model_name):
    metrics = ["precisionByLabel", "recallByLabel", "f1"]
    results = {}
    for metric in metrics:
        evaluator = MulticlassClassificationEvaluator(
            labelCol="target", predictionCol="prediction", metricName=metric
        )
        score = evaluator.evaluate(predictions)
        results[metric] = score
    print(f"{model_name} Metrics: {results}")
    return results

# Evaluate GBT predictions
gbt_metrics = evaluate_additional_metrics(gbt_val_predictions, "GradientBoostedTrees")

GradientBoostedTrees Metrics: {'precisionByLabel': 0.7357055372913041, 'recallByLabel': 0.983173007896626, 'f1': 0.6383092038321906}


In [43]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

gbt = GBTClassifier(labelCol="target", featuresCol="features", seed=42)
pipeline = Pipeline(stages=indexers + encoders + [assembler, gbt])
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxIter, [50, 100])
             .addGrid(gbt.maxDepth, [5, 10])
             .build())
evaluator = BinaryClassificationEvaluator(labelCol="target", metricName="areaUnderROC")
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)
cv_model = crossval.fit(train_split)
best_gbt_auc_roc = evaluator.evaluate(cv_model.transform(val_split))
print(f"Tuned GBT AUC-ROC: {best_gbt_auc_roc:.4f}")

Tuned GBT AUC-ROC: 0.6600


In [44]:
feature_importance = rf_model.stages[-1].featureImportances
print("Feature Importances:", feature_importance)

Feature Importances: (12,[0,1,2,3,4,5,6,7,8,9,10,11],[0.11992866560490009,0.08564657780440894,0.0871899084376784,0.001889119403551761,0.0017065875583536863,0.0034986972411792153,0.013662801247842717,0.011311968311518058,0.007449036622919843,0.6305072006354122,0.005764126570196182,0.0314453105620388])


In [49]:
test_split, val_split = train_df.randomSplit([0.7, 0.3], seed=42)

In [50]:
test_auc_roc = evaluate_model(gbt_model, test_split, "GradientBoostedTrees")[0]
print(f"Test Set AUC-ROC for GBT: {test_auc_roc:.4f}")


Esquema do DataFrame de previsões (GradientBoostedTrees):
root
 |-- id: long (nullable = true)
 |-- total_transactions: integer (nullable = true)
 |-- total_spent: double (nullable = false)
 |-- avg_spent: double (nullable = false)
 |-- first_purchase: date (nullable = true)
 |-- last_purchase: date (nullable = true)
 |-- unique_categories: integer (nullable = true)
 |-- unique_products: integer (nullable = true)
 |-- offer_purchase_count: long (nullable = true)
 |-- offer_total_spent: double (nullable = false)
 |-- offer_avg_spent: double (nullable = false)
 |-- days_since_last_offer_purchase: integer (nullable = true)
 |-- offer_purchase_count_cat: long (nullable = true)
 |-- offer_total_spent_cat: double (nullable = false)
 |-- offer_avg_spent_cat: double (nullable = false)
 |-- days_since_last_offer_purchase_cat: integer (nullable = true)
 |-- unique_chains: long (nullable = true)
 |-- offervalue: double (nullable = false)
 |-- offer_value_per_transaction: double (nullable = false

In [52]:
gbt_model.save("path/to/save/gbt_model")