In [1]:
# Basic imports

import json
from pathlib import Path
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from dotenv import load_dotenv
load_dotenv('.env')
import os
import plotly.graph_objects as go

from dotenv import load_dotenv
import os


In [2]:
# Build SparkSession
spark = SparkSession.builder.appName("DataPreparation").getOrCreate()
base_path = os.getenv('BASE_PATH')

## Data Ingestion

In [3]:
# Data to read - offers.csv
data_dir_offers = f'{base_path}/offers.csv.gz'
data_file_offers = data_dir_offers

! head $data_file_offers

# Data to read - sampleSubmission.csv
data_dir_sampleSubmission = f'{base_path}/sampleSubmission.csv.gz'
data_file_sampleSubmission = data_dir_sampleSubmission

! head $data_file_sampleSubmission

# Data to read - testHistory.csv
data_dir_testHistory = f'{base_path}/testHistory.csv.gz'
data_file_testHistory = data_dir_testHistory

! head $data_file_testHistory

# Data to read - trainHistory.csv
data_dir_trainHistory = f'{base_path}/trainHistory.csv.gz'
data_file_trainHistory = data_dir_trainHistory

! head $data_file_trainHistory

# Data to read - transactions.csv
data_dir_transactions = f'{base_path}/transactions.csv.gz'
data_file_transactions = data_dir_transactions

! head $data_file_transactions;

�}ESoffers.csv ���n�0��=Q�����u�0`K� ���������Y)�^__��|��|?�����p���\������x���=�|{�������fW�q.g�brjm��Q7� )��u�X�t)XG&fa�O�$j9�(���׶#A��䜘�Bw��
,d�2�3;�b4[&� ̞E��f�l<�Ȉ���D�"��#u���V҃��Y�@���ʎ��W;XKU�C�6� ���5j�T���g��̛�ܮA�U=@AV0',%�W��M�=Sp!��^�����9�{�h�Ȭ��o�F#F�|l�W�Yխ�X���#
fʥ�+}�p��<�������B!�Q�E"��p�2M�^�*0K�c�������@@�4��e�I�Tb�����A��Kˆ��ч�ˡɿl��PH�Υ���,�6�Yoe0�d�����4�}���۰��c����Z�2+�]�v�t������l����#9�^�G���H��2�,-	��1{FGJ��� �:�U.j)��?��8z��2�:�`l�A��Rz�t���>tXw��r:u�q��ݢqS�2� �9�,��VF.��RQ��Y�^6Ȟf��Mf���\�Y'����2��ud��#�����"���0}B����Y��
��%Q�Y*Չ���j�H"����>s,w*�c�,e���+ �=�<��X�b��I��XV��Q�����:�5s����P��exw�1ǕG����1�L����h;����êʗf�exG����V�5�X-%8�8�=�I/��@�!���\��aK/Q86���4�&:�dt�,�ɹ�5��]�v�	IX���l�.��b-C֭c{��L,+h�� ����9������;��;���c��۸w��Y�'3��1����Bm���:N�Ē	�/����G�A-�0>ˁYƑ���ٌK7�9f<V�-Is̲�:�����:���
��ޱ���1��c!|�YM5X:���:��ޚƼէ#cff���\����F��r��؆��Q�z$-�m��ģ�w=*�s���x��^R`ײ痍��

**Offers**

In [4]:
# Reading data - offers.csv
df_offers = spark.read.csv(
        data_file_offers, 
        header=True, sep=',', inferSchema=True
    )

**TESTHISTORY**

In [5]:
# Reading data - testHistory.csv
df_testHistory = spark.read.csv(
        data_file_testHistory, 
        header=True, sep=',', inferSchema=True
    )

**TRAINHISTORY**

In [6]:
# Reading data - trainHistory.csv
df_trainHistory = spark.read.csv(
        data_file_trainHistory, 
        header=True, sep=',', inferSchema=True
    )

**TRANSACTIONS**

In [7]:
df_transactions = spark.read.csv(
    data_file_transactions, 
    header=True, sep=',', inferSchema=False
).sample(fraction=0.001, seed=42).limit(1000000)

In [9]:
import csv
import os
import datetime

# Input and output paths
input_dir = "/home/jovyan/code/data/output_files"
output_file = "/home/jovyan/code/data/customer_features.csv"

# Dictionary to store features for all customers
features = {}

# Process each customer file
for filename in os.listdir(input_dir):
    if filename.endswith(".csv"):
        customer_id = filename.split(".")[0]
        file_path = os.path.join(input_dir, filename)

        total_transactions = 0
        total_spent = 0.0
        first_purchase = None
        last_purchase = None
        unique_categories = set()
        unique_products = set()

        # Open and read the file
        with open(file_path, 'r', newline='') as infile:
            reader = csv.reader(infile)
            next(reader)  # Skip header

            for row in reader:
                total_transactions += 1
                amount = float(row[10])  # purchaseamount is the 11th column (index 10)
                total_spent += amount

                # Parse date for temporal features
                date_str = row[6]  # date is the 7th column (index 6)
                date = datetime.datetime.strptime(date_str, '%Y-%m-%d')
                if first_purchase is None or date < first_purchase:
                    first_purchase = date
                if last_purchase is None or date > last_purchase:
                    last_purchase = date

                # Track unique categories and products (using company as proxy for product)
                unique_categories.add(row[3])  # category is the 4th column (index 3)
                unique_products.add(row[4])    # company is the 5th column (index 4)

        # Calculate average spent
        avg_spent = total_spent / total_transactions if total_transactions > 0 else 0.0

        # Store features
        features[customer_id] = {
            "total_transactions": total_transactions,
            "total_spent": round(total_spent, 2),
            "avg_spent": round(avg_spent, 2),
            "first_purchase": first_purchase.strftime('%Y-%m-%d') if first_purchase else "",
            "last_purchase": last_purchase.strftime('%Y-%m-%d') if last_purchase else "",
            "unique_categories": len(unique_categories),
            "unique_products": len(unique_products)
        }

# Write features to output CSV
with open(output_file, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    # Write header
    writer.writerow([
        "id",
        "total_transactions",
        "total_spent",
        "avg_spent",
        "first_purchase",
        "last_purchase",
        "unique_categories",
        "unique_products"
    ])
    # Write data for each customer
    for customer_id, feature_dict in features.items():
        writer.writerow([
            customer_id,
            feature_dict["total_transactions"],
            feature_dict["total_spent"],
            feature_dict["avg_spent"],
            feature_dict["first_purchase"],
            feature_dict["last_purchase"],
            feature_dict["unique_categories"],
            feature_dict["unique_products"]
        ])

print(f"Features saved to {output_file}")

Features saved to /home/jovyan/code/data/customer_features.csv


In [10]:
# Dividir em treino e validação
train, validation = train_data.randomSplit([0.8, 0.2], seed=42)

NameError: name 'train_data' is not defined

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, when, lit, to_date, datediff

# Configurar sessão Spark
spark = SparkSession.builder \
    .appName("ShoppersChallenge") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

# Definir esquema para customer_features.csv
schema_features = StructType([
    StructField("id", StringType(), True),
    StructField("total_transactions", IntegerType(), True),
    StructField("total_spent", DoubleType(), True),
    StructField("avg_spent", DoubleType(), True),
    StructField("first_purchase", StringType(), True),
    StructField("last_purchase", StringType(), True),
    StructField("unique_categories", IntegerType(), True),
    StructField("unique_products", IntegerType(), True)
])

# Carregar customer_features.csv
features_df = spark.read.csv("/home/jovyan/code/data/customer_features.csv", header=True, schema=schema_features)

# Converter first_purchase e last_purchase para features numéricas
reference_date = to_date(lit("2025-05-29"))
features_df = features_df.withColumn("days_since_first", datediff(reference_date, to_date("first_purchase"))) \
                        .withColumn("days_since_last", datediff(reference_date, to_date("last_purchase")))

# Definir esquema para trainHistory.csv
schema_history = StructType([
    StructField("id", StringType(), True),
    StructField("chain", StringType(), True),
    StructField("offer", StringType(), True),
    StructField("market", StringType(), True),
    StructField("repeattrips", IntegerType(), True),
    StructField("repeater", StringType(), True),
    StructField("offerdate", StringType(), True)
])


# Juntar features com a variável alvo
train_data = features_df.join(
    df_trainHistory.select("id", "repeater"),
    features_df.id == df_trainHistory.id,
    "inner"
).withColumn("target", when(col("repeater") == "t", 1).otherwise(0)).drop("repeater", "first_purchase", "last_purchase")

In [17]:
# Dividir em treino e validação
train, validation = train_data.randomSplit([0.8, 0.2], seed=42)

In [18]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Combinar features em um vetor
feature_cols = [
    "total_transactions",
    "total_spent",
    "avg_spent",
    "unique_categories",
    "unique_products",
    "days_since_first",
    "days_since_last"
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_assembled = assembler.transform(train)
validation_assembled = assembler.transform(validation)

# Inicializar modelos
models = {
    "LogisticRegression": LogisticRegression(labelCol="target", featuresCol="features"),
    "RandomForest": RandomForestClassifier(labelCol="target", featuresCol="features"),
    "GradientBoosting": GBTClassifier(labelCol="target", featuresCol="features")
}

# Avaliar modelos
evaluator = BinaryClassificationEvaluator(labelCol="target", metricName="areaUnderROC")
results = {}

for name, model in models.items():
    # Configurar validação cruzada
    paramGrid = ParamGridBuilder().build()
    if name == "RandomForest":
        paramGrid = ParamGridBuilder() \
            .addGrid(model.numTrees, [50, 100]) \
            .addGrid(model.maxDepth, [5, 10]) \
            .build()
    elif name == "GradientBoosting":
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxIter, [10, 20]) \
            .addGrid(model.maxDepth, [5, 7]) \
            .build()
    
    crossval = CrossValidator(
        estimator=model,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=3
    )
    
    # Treinar modelo
    cv_model = crossval.fit(train_assembled)
    
    # Avaliar na validação
    predictions = cv_model.transform(validation_assembled)
    auc = evaluator.evaluate(predictions)
    results[name] = auc
    print(f"{name} AUC-ROC: {auc}")

# Selecionar o melhor modelo
best_model_name = max(results, key=results.get)
print(f"Melhor modelo: {best_model_name} com AUC-ROC {results[best_model_name]}")

LogisticRegression AUC-ROC: 0.5927542816089806
RandomForest AUC-ROC: 0.6365021366961058
GradientBoosting AUC-ROC: 0.6333764174816299
Melhor modelo: RandomForest com AUC-ROC 0.6365021366961058
