# Feature Engineering

In [1]:
# Basic imports
# NOTE(review): several names below are imported more than once; the repeats are
# redundant but harmless.

import json
from pathlib import Path
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from dotenv import load_dotenv
# Load environment variables from the local .env file (presumably including BASE_PATH,
# which a later cell reads via os.getenv — confirm against the .env contents).
load_dotenv('.env')
import os
import plotly.graph_objects as go

from dotenv import load_dotenv
import os
# NOTE: `sum` and `min` imported from pyspark.sql.functions shadow the Python builtins
# of the same name for the rest of the notebook.
from pyspark.sql.functions import col, count, sum, avg, min, datediff, to_date
from pyspark.sql.functions import lag, datediff, avg
from pyspark.sql.window import Window
from pyspark.sql.functions import count, sum, avg, min, datediff, to_date
from pyspark.sql.functions import col, count, sum, avg, min, datediff, to_date
from pyspark.sql.functions import countDistinct



In [2]:
# Build (or reuse) the SparkSession used by every cell below.
spark = SparkSession.builder.appName("DataPreparation").getOrCreate()

# Root directory for all input/output data, taken from the BASE_PATH environment variable.
# Fail fast when it is missing: otherwise every f-string path below would silently
# become "None/..." and the error would only surface at the first read.
base_path = os.getenv('BASE_PATH')
if not base_path:
    raise RuntimeError("BASE_PATH is not set; define it in the environment or in .env")

In [3]:
# Load offers.csv.gz: comma-separated, first row is the header, column types inferred.
df_offers = (
    spark.read
    .options(header=True, sep=',', inferSchema=True)
    .csv(f"{base_path}/offers.csv.gz")
)

In [None]:
# Load transactions.csv.gz, then keep a seeded 0.1% random sample capped at 1,000,000 rows
# so the feature cells below stay cheap to run.
df_transactions = (
    spark.read
    .options(header=True, sep=',', inferSchema=True)
    .csv(f"{base_path}/transactions.csv.gz")
    .sample(fraction=0.001, seed=42)
    .limit(1_000_000)
)

In [None]:
# Read a previously saved transactions sample from the "improved" folder.
# NOTE(review): the recorded output of this cell is a PATH_NOT_FOUND error, and
# df_transactions_sample is not referenced by any other cell visible in this notebook —
# confirm whether the file must be generated first or whether this cell can be dropped.
df_transactions_sample = spark.read.csv(
    f"{base_path}/improved/transactions_sample.csv", 
    header=True, sep=',', inferSchema=True
)

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/home/jovyan/code/data/improved/transactions_sample.csv.

In [None]:
# Load trainHistory.csv.gz: comma-separated, first row is the header, column types inferred.
df_trainHistory = (
    spark.read
    .options(header=True, sep=',', inferSchema=True)
    .csv(f"{base_path}/trainHistory.csv.gz")
)

In [None]:
# Load the existing per-customer feature table; the cells below add columns to it.
df_customer_features = (
    spark.read
    .options(header=True, sep=',', inferSchema=True)
    .csv(f"{base_path}/improved/customer_features.csv")
)

Esta etapa de feature engineering parte da tabela `customer_features` já existente e acrescenta-lhe novas features.

In [None]:
# Prefix the transaction-level category/company/brand columns; without this the
# transaction `category` would collide with the offer `category` in the join below.
df_transactions = (
    df_transactions
    .withColumnRenamed("category", "trans_category")
    .withColumnRenamed("company", "trans_company")
    .withColumnRenamed("brand", "trans_brand")
)

# If needed, reuse the correlation analysis done earlier to argue here that
# brand and company are not relevant for the repeater.
#
# Keep only transactions whose category matches an offer's category, restricted to the
# offer the customer actually received (trainHistory match on id/offer/chain), and that
# happened on or before the offer date.
df_offer_transactions = (
    df_transactions
    .join(
        df_offers.select("offer", "category"),
        (df_transactions.trans_category == df_offers.category),
        "inner",
    )
    .join(
        df_trainHistory.select("id", "chain", "offer", "offerdate"),
        ["id", "offer", "chain"],  # Join on id, offer, and chain to avoid ambiguity
        "inner",
    )
    .filter(to_date("date") <= to_date("offerdate"))
)

# Step 4: Aggregate offer-specific features (one row per customer).
# `sum` and `min` here are the pyspark.sql.functions aggregates, not the builtins.
offer_features = df_offer_transactions.groupBy("id").agg(
    count("*").alias("offer_purchase_count"),
    sum("purchaseamount").alias("offer_total_spent"),
    avg("purchaseamount").alias("offer_avg_spent"),
    # Smallest gap between the offer date and a matching purchase = most recent purchase.
    min(datediff(to_date("offerdate"), to_date("date"))).alias("days_since_last_offer_purchase"),
)

# Sentinel for customers with no offer-related purchase at all. Filling the recency
# column with 0 would make them look like they bought on the offer day itself, so it
# gets the same "never" value (999) the notebook uses for avg_days_between_purchases.
NO_OFFER_PURCHASE_DAYS = 999

# Step 5: Join with existing customer features.
# The recency column is filled first with the sentinel; every remaining numeric null
# (counts and amounts for customers with no offer-related transactions) becomes 0.
df_customer_features = (
    df_customer_features
    .join(offer_features, "id", "left")
    .na.fill({"days_since_last_offer_purchase": NO_OFFER_PURCHASE_DAYS})
    .na.fill(0)
)

In [None]:
# Per-customer purchase cadence: average number of days between consecutive transactions.
window_spec = Window.partitionBy("id").orderBy("date")

# For each transaction, look up the customer's previous transaction date and compute
# the gap in days. A customer's first transaction has no predecessor, so its gap is null.
trans_with_diff = (
    df_transactions
    .withColumn("prev_date", lag("date").over(window_spec))
    .withColumn("days_diff", datediff(to_date(col("date")), to_date(col("prev_date"))))
)

# avg() skips nulls, so customers with a single transaction end up with a null average.
freq_features = trans_with_diff.groupBy("id").agg(
    avg("days_diff").alias("avg_days_between_purchases")
)

# 999 marks "no measurable gap" (no transactions in the sample, or only one).
# The fill is limited to the new column so the sentinel can never leak into other
# numeric features that happen to contain nulls.
df_customer_features = (
    df_customer_features
    .join(freq_features, "id", "left")
    .na.fill(999, subset=["avg_days_between_purchases"])
)

In [None]:
# Attach the value of the offer each customer received (trainHistory -> offers on `offer`).
# NOTE(review): assumes trainHistory has at most one row per id; otherwise the left join
# below duplicates customer rows — confirm.
offer_value = df_trainHistory.select("id", "offer").join(
    df_offers.select("offer", "offervalue"),
    "offer"
)

# Offer value relative to how many offer-category purchases the customer made.
# offer_purchase_count is 0 for most customers, so the division is guarded explicitly:
# with ANSI mode off a divide-by-zero silently yields null, but with ANSI mode on it
# raises. Customers with no such purchases get 0, as they did after the original fill.
df_customer_features = (
    df_customer_features
    .join(offer_value.select("id", "offervalue"), "id", "left")
    .withColumn(
        "offer_value_per_transaction",
        F.when(
            col("offer_purchase_count") > 0,
            col("offervalue") / col("offer_purchase_count"),
        ).otherwise(F.lit(0.0)),
    )
    .na.fill(0)  # customers absent from trainHistory: offervalue and the ratio become 0
)

In [None]:
# Number of distinct store chains each customer has transacted in.
chain_features = (
    df_transactions
    .groupBy("id")
    .agg(countDistinct("chain").alias("unique_chains"))
)

# Left join keeps every customer; remaining numeric nulls are set to 0.
df_customer_features = (
    df_customer_features
    .join(chain_features, "id", "left")
    .na.fill(0)
)

In [None]:
# Summary statistics for the features added in this notebook.
(
    df_customer_features
    .select(
        "unique_chains",
        "offer_purchase_count",
        "offer_total_spent",
        "offer_avg_spent",
        "avg_days_between_purchases",
        "offer_value_per_transaction",
    )
    .summary()
    .show(truncate=False)
)

+-------+------------------+---------------------+--------------------+--------------------+--------------------------+---------------------------+
|summary|unique_chains     |offer_purchase_count |offer_total_spent   |offer_avg_spent     |avg_days_between_purchases|offer_value_per_transaction|
+-------+------------------+---------------------+--------------------+--------------------+--------------------------+---------------------------+
|count  |311541            |311541               |311541              |311541              |311541                    |311541                     |
|mean   |0.5538693141512674|0.0029017047515415306|0.014571918302887791|0.013192694082457313|762.6616668467402         |0.0029271714752518246      |
|stddev |0.4970968817692563|0.06058118879316296  |0.47635459796674195 |0.3207507011622313  |392.6921829827085         |0.05757790593524346        |
|min    |0                 |0                    |-1.09               |-1.09               |0.0                 

In [None]:
# Show the final column list and types before persisting.
df_customer_features.printSchema()

# Persist the enriched customer features twice: as CSV (with a header row) and as
# Parquet. Both writes replace any existing output at the target path.
(
    df_customer_features.write
    .mode("overwrite")
    .option("header", True)
    .csv(f"{base_path}/improved/customer_features_improved.csv")
)

(
    df_customer_features.write
    .mode("overwrite")
    .parquet(f"{base_path}/parquet/customer_features_improved.parquet")
)

root
 |-- id: long (nullable = true)
 |-- total_transactions: integer (nullable = true)
 |-- total_spent: double (nullable = false)
 |-- avg_spent: double (nullable = false)
 |-- first_purchase: date (nullable = true)
 |-- last_purchase: date (nullable = true)
 |-- unique_categories: integer (nullable = true)
 |-- unique_products: integer (nullable = true)
 |-- offer_purchase_count: long (nullable = true)
 |-- offer_total_spent: double (nullable = false)
 |-- offer_avg_spent: double (nullable = false)
 |-- days_since_last_offer_purchase: integer (nullable = true)
 |-- avg_days_between_purchases: double (nullable = false)
 |-- offervalue: double (nullable = false)
 |-- offer_value_per_transaction: double (nullable = false)
 |-- unique_chains: long (nullable = true)

