In [2]:
import pandas as pd
import numpy as np
import random
# River modules
from river import compose, base, preprocessing, feature_extraction, ensemble, tree, metrics, evaluate, stream
from river import forest
from collections import defaultdict

pd.set_option("display.max_columns", 100)
data = pd.read_parquet('/Users/leolebuhotel/Documents/DSBA/T2/Hackaton Eleven/hackathon/final_df.parquet')
data.drop(columns=['Rolling90Pct_Football', 'Rolling90Pct_Handball',
       'Rolling90Pct_Badminton', 'Rolling90Pct_Baseball',
       'Rolling90Pct_Tennis', 'Rolling90Pct_Hockey', 'Rolling90Pct_Cricket',
       'Rolling90Pct_Beach', 'Rolling90Pct_Basketball', 'Rolling90Pct_Rugby',
       'Rolling90Pct_Golf', 'Rolling90Pct_Softball', 'Rolling90Pct_Cycling',
       'Rolling90Pct_Volleyball', 'Rolling90Pct_Running',
       'Rolling90Pct_Skiing'], inplace=True)

In [3]:
# =============================================================================
# 1. Custom Transformer for Missing Indicators
# =============================================================================
class MissingIndicator(base.Transformer):
    """A transformer that adds missing indicators for selected keys."""
    def __init__(self, keys):
        self.keys = keys

    def transform_one(self, x):
        for key in self.keys:
            # Check if the value is None or np.nan
            if x.get(key) is None or (isinstance(x.get(key), float) and np.isnan(x.get(key))):
                x[f'{key}_missing'] = 1
            else:
                x[f'{key}_missing'] = 0
        return x

    def learn_one(self, x):
        # For a transformer, learn_one is often a no-op.
        return self.transform_one(x)

In [4]:
# =============================================================================
# 2. Define Feature Groups
# =============================================================================
# These lists can be adjusted. Here we assume that you want to use most of your features.
# We drop ID and date fields as they are identifiers (unless you decide to keep them).
CATEGORICAL_FEATURES = [
    'StoreCountry', 'Category', 'FamilyLevel1', 'FamilyLevel2', 'Universe',
    'ClientSegment', 'ClientGender', 'ClientCountry', 'ClientOptINEmail', 'ClientOptINPhone',
    'Weekday', 'Brand', 'DayOfWeek', 'Month', 'Season'
]

NUMERICAL_FEATURES = [
    'Age', 'Quantity_sold', 'SalesNetAmountEuro', 'product_avg_price_order', 'avg_price',
    'DaysSinceLastTransaction', 'CumulativeSpent', 'CumulativeQuantity', 'PercentageMaleProductsSoFar',
    'UniqueProductsSoFar', 'AverageAmountPerTransactionSoFar', 'AverageFrequencySoFar', 'AveragePrice',
    'Frequency_30', 'Monetary_30', 'Recency_30', 'Frequency_60', 'Monetary_60', 'Recency_60',
    'Frequency_90', 'Monetary_90', 'Recency_90'
]

In [5]:
from collections import defaultdict

# =============================================================================
# 3. Build the Feature Processing Pipeline
# =============================================================================
# For categorical features, we use one-hot encoding.
cat_pipeline = compose.Select(*CATEGORICAL_FEATURES) | preprocessing.OneHotEncoder()

# For numerical features, we apply a standard scaler.
num_pipeline = compose.Select(*NUMERICAL_FEATURES) | preprocessing.StandardScaler()

# Combine the two pipelines in parallel:
feature_pipeline = compose.TransformerUnion(
    cat_pipeline,
    num_pipeline
)

# =============================================================================
# Precompute the List of Available Products for Negative Sampling
# =============================================================================
# Define the product-related columns. (We assume these columns identify a product.)
product_cols = ['ProductID', 'StoreCountry', 'Category', 'FamilyLevel1', 'FamilyLevel2', 'Universe']

# Extract a unique list of products from the merged dataframe.
all_products = data[product_cols].drop_duplicates().to_dict('records')

# Build a dictionary mapping country -> list of products.
products_by_country = defaultdict(list)
for prod in all_products:
    country = prod.get('StoreCountry')
    if country is not None:
        products_by_country[country].append(prod)

In [6]:
from river import forest
from river.compose import Pipeline

# =============================================================================
# 4. Build Two Model Pipelines
# =============================================================================
# Create an instance of the MissingIndicator and wrap it for pipeline composition.
missing_indicator = MissingIndicator(keys=['Age', 'ClientGender'])
missing_indicator_transformer = compose.FuncTransformer(missing_indicator.transform_one)

# Using River 0.22.0, the Adaptive Random Forest classifier is ARFClassifier.
pipeline_arf = Pipeline(
    missing_indicator_transformer,
    feature_pipeline,
    forest.ARFClassifier()
)

pipeline_hat = Pipeline(
    missing_indicator_transformer,
    feature_pipeline,
    tree.HoeffdingAdaptiveTreeClassifier()
)

In [7]:
# =============================================================================
# 5. Convert DataFrame to a River Stream with Negative Sampling
# =============================================================================
def df_to_stream_with_negative_sampling(df, negative_ratio=2):
    """
    Convert a DataFrame into a stream of (x, y) examples with negative sampling.
    Each row in df is assumed to be a positive example (i.e., a transaction with a purchase).
    For each positive, generate negative samples by replacing the product information with
    products available in the client's country.
    
    Parameters:
        df: DataFrame containing positive transactions.
        negative_ratio: Number of negative samples to generate per positive sample.
    
    Yields:
        (x, y) tuples, where y is 1 for positive and 0 for negative examples.
    """
    # Drop only identifier columns that we don't want as features.
    # We keep TransactionDate because you want it as a raw feature.
    drop_cols = ['ClientID', 'StoreID']
    df = df.drop(columns=drop_cols, errors='ignore')
    
    for _, row in df.iterrows():
        # Convert row to a dictionary.
        pos_example = row.to_dict()
        
        # If TransactionDate is a datetime, convert it to ISO string.
        if 'TransactionDate' in pos_example and hasattr(pos_example['TransactionDate'], 'isoformat'):
            pos_example['TransactionDate'] = pos_example['TransactionDate'].isoformat()
        
        # For the positive sample, set label to 1.
        yield pos_example, 1
        
        # Negative sampling: sample negative examples from products available in the client's country.
        client_country = row.get('ClientCountry', None)
        if client_country is None:
            continue  # Cannot sample negatives without client country.
        
        # Get candidate products for this country.
        candidates = products_by_country.get(client_country, [])
        if not candidates:
            continue
        
        # Exclude the positive product.
        pos_product_id = row.get('ProductID', None)
        candidate_negatives = [prod for prod in candidates if prod.get('ProductID') != pos_product_id]
        if not candidate_negatives:
            candidate_negatives = candidates  # Fallback if no alternative is found.
        
        # Generate the negative samples.
        for _ in range(negative_ratio):
            neg_product = random.choice(candidate_negatives)
            # Start with a copy of the original row (client and context features).
            neg_example = row.to_dict()
            # Override product-related fields with the negative product info.
            for col in product_cols:
                if col in neg_product:
                    neg_example[col] = neg_product[col]
            # Set transaction-specific fields for a negative sample.
            neg_example['Quantity_sold'] = 0
            neg_example['SalesNetAmountEuro'] = 0
            # If needed, you might adjust other fields here.
            yield neg_example, 0

In [8]:
# =============================================================================
# 6. Split the Data into Training and Testing Sets and Create Streams
# =============================================================================
# For example, split the data chronologically: 80% for training, 20% for testing.
train_size = int(0.8 * len(data))
df_train = data.iloc[:train_size]
df_test = data.iloc[train_size:]

# Use our negative sampling function to produce the streams.
train_stream = df_to_stream_with_negative_sampling(df_train, negative_ratio=2)
test_stream = df_to_stream_with_negative_sampling(df_test, negative_ratio=2)

In [9]:
# =============================================================================
# 7. Train the Model Pipelines
# =============================================================================
for i,(x, y) in enumerate(train_stream):
    if i % 1000 == 0:
        print(f"Itération {i}")
    pipeline_arf.learn_one(x, y)
    pipeline_hat.learn_one(x, y)

Itération 0
Itération 1000
Itération 2000
Itération 3000
Itération 4000
Itération 5000
Itération 6000
Itération 7000
Itération 8000
Itération 9000
Itération 10000
Itération 11000
Itération 12000
Itération 13000
Itération 14000
Itération 15000
Itération 16000
Itération 17000
Itération 18000
Itération 19000
Itération 20000
Itération 21000
Itération 22000
Itération 23000
Itération 24000
Itération 25000
Itération 26000
Itération 27000
Itération 28000
Itération 29000
Itération 30000
Itération 31000
Itération 32000
Itération 33000
Itération 34000
Itération 35000
Itération 36000
Itération 37000
Itération 38000
Itération 39000
Itération 40000
Itération 41000
Itération 42000
Itération 43000
Itération 44000
Itération 45000
Itération 46000
Itération 47000
Itération 48000
Itération 49000
Itération 50000
Itération 51000
Itération 52000
Itération 53000
Itération 54000
Itération 55000
Itération 56000
Itération 57000
Itération 58000
Itération 59000
Itération 60000
Itération 61000
Itération 62000
Itéra

In [11]:
import pickle

# Save the ARF pipeline
with open('pipeline_arf.pkl', 'wb') as f:
    pickle.dump(pipeline_arf, f)

# Save the HAT pipeline
with open('pipeline_hat.pkl', 'wb') as f:
    pickle.dump(pipeline_hat, f)

In [53]:
y_pred_arf = pipeline_arf.predict_proba_one(x)
y_pred_arf

{0: 0.9323944064759272, 1: 0.06760559352407287}

In [32]:
# =============================================================================
# 8. Evaluate the Model Pipelines
# =============================================================================
# Use LogLoss as the evaluation metric.
metric_arf = metrics.LogLoss()
metric_hat = metrics.LogLoss()

test_stream = df_to_stream_with_negative_sampling(df_test, negative_ratio=2)

for i, (x, y) in enumerate(test_stream):
    if i % 1000 == 0:
        print(f"Itération {i}")
        
    y_pred_arf = pipeline_arf.predict_proba_one(x)
    y_pred_hat = pipeline_hat.predict_proba_one(x)
    
    metric_arf.update(y, y_pred_arf)
    metric_hat.update(y, y_pred_hat)

print("Test LogLoss for ARFClassifier:", metric_arf.get())
print("Test LogLoss for HoeffdingAdaptiveTreeClassifier:", metric_hat.get())

Itération 0
Itération 1000
Itération 2000
Itération 3000
Itération 4000
Itération 5000
Itération 6000
Itération 7000
Itération 8000
Itération 9000
Itération 10000
Itération 11000
Itération 12000
Itération 13000
Itération 14000
Itération 15000
Itération 16000
Itération 17000
Itération 18000
Itération 19000
Itération 20000
Itération 21000
Itération 22000
Itération 23000
Itération 24000
Itération 25000
Itération 26000
Itération 27000
Itération 28000
Itération 29000
Itération 30000
Itération 31000
Itération 32000
Itération 33000
Itération 34000
Itération 35000
Itération 36000
Itération 37000
Itération 38000
Itération 39000
Itération 40000
Itération 41000
Itération 42000
Itération 43000
Itération 44000
Itération 45000
Itération 46000
Itération 47000
Itération 48000
Itération 49000
Itération 50000
Itération 51000
Itération 52000
Itération 53000
Itération 54000
Itération 55000
Itération 56000
Itération 57000
Itération 58000
Itération 59000
Itération 60000
Itération 61000
Itération 62000
Itéra

In [34]:
# =============================================================================
# 9. Real-Time Recommendation Example
# =============================================================================
def recommend_for_client(client_features, available_products, model_pipeline, top_k=5):
    """
    Given a client (with features) and a list of available products (each as a dict of product features),
    compute the purchase probability for each (client, product) pair and return the top_k recommendations.
    
    Parameters:
        client_features: dict with client-specific features (e.g., demographics, context).
        available_products: list of dicts, each containing product-specific features.
        model_pipeline: the trained River pipeline.
        top_k: number of recommendations to return.
    
    Returns:
        List of tuples (ProductID, PurchaseProbability) sorted by descending probability.
    """
    scores = []
    for product in available_products:
        # Merge client and product features.
        x = {**client_features, **product}
        # Compute probability (assuming positive class label is 1).
        prob = model_pipeline.predict_proba_one(x).get(1, 0)
        product_id = product.get('ProductID', None)
        scores.append((product_id, prob))
    # Sort by descending probability.
    scores.sort(key=lambda tup: tup[1], reverse=True)
    return scores[:top_k]

In [57]:
def clean_client_features(row):
    """Remove transaction-specific fields from the client's feature dictionary."""
    features = row.to_dict()
    for key in ['ProductID', 'Quantity_sold', 'SalesNetAmountEuro']:
        features.pop(key, None)
    return features

In [None]:
import numpy as np

# Set the number of recommendations to consider
k = 5

# List to store binary hit values (1 if ground truth is in top-k, else 0)
hits = []

# Loop over each transaction in the test set (without negative sampling)
for i, row in df_test.iterrows():
    # Convert the row to a dictionary of features
    client_features = clean_client_features(row)
    
    # Ground-truth product purchased in this transaction
    ground_truth = row.get('ProductID')
    
    # Get the client's country (assumed to be in the row)
    client_country = row.get('ClientCountry')
    if client_country is None:
        # Skip this transaction if there's no country information
        continue
    
    # Filter candidate products to those available in the client's country
    candidates = [prod for prod in all_products if prod.get('StoreCountry') == client_country]
    
    # Generate top-k recommendations using the trained model (choose pipeline_arf or pipeline_hat)
    recommendations = recommend_for_client(client_features, candidates, pipeline_hat, top_k=k)

    # Extract the recommended product IDs (the function returns tuples of (ProductID, score))
    recommended_ids = [prod_id for prod_id, score in recommendations]
    
    # Check if the ground truth product is in the top-k recommendations
    hit = 1 if ground_truth in recommended_ids else 0
    hits.append(hit)
    print(recommendations)

# Compute the mean hit rate (i.e. the proportion of transactions where the purchased product was recommended)
mean_hit_rate = np.mean(hits)
print(f"Mean Hit Rate @ {k}: {mean_hit_rate:.4f}")

KeyError: 'SalesNetAmountEuro'

In [103]:
# =============================================================================
# 2. Helper: Fill Missing Features with Defaults
# =============================================================================
def fill_missing_features(client_features, default_values):
    """
    Fill in any missing keys in client_features with the corresponding default.
    """
    for key, default in default_values.items():
        if key not in client_features:
            client_features[key] = default
    return client_features

# Define default values for all expected features.
default_values = {
    'StoreCountry': 'USA',        # Default if unknown
    'Category': 'Unknown',
    'FamilyLevel1': 'Unknown',
    'FamilyLevel2': 'Unknown',
    'Universe': 'Unknown',
    'ClientSegment': 'Unknown',
    'ClientGender': 'Unknown',
    'ClientCountry': 'USA',
    'ClientOptINEmail': False,
    'ClientOptINPhone': False,
    'Weekday': 'Unknown',
    'Brand': 'Unknown',
    'DayOfWeek': 'Unknown',
    'Month': 'January',           # Default month
    'Season': 'Unknown',
    # Numerical defaults:
    'Age': 0,
    'Quantity_sold': 0,
    'SalesNetAmountEuro': 0,
    'product_avg_price_order': 0,
    'avg_price': 0,
    'DaysSinceLastTransaction': 0,
    'CumulativeSpent': 0,
    'CumulativeQuantity': 0,
    'PercentageMaleProductsSoFar': 0,
    'UniqueProductsSoFar': 0,
    'AverageAmountPerTransactionSoFar': 0,
    'AverageFrequencySoFar': 0,
    'AveragePrice': 0,
    'Frequency_30': 0,
    'Monetary_30': 0,
    'Recency_30': 0,
    'Frequency_60': 0,
    'Monetary_60': 0,
    'Recency_60': 0,
    'Frequency_90': 0,
    'Monetary_90': 0,
    'Recency_90': 0
}

# =============================================================================
# 3. Recommendation Function
# =============================================================================
def recommend_for_client(client_features, available_products, model_pipeline, top_k=5):
    """
    Given a client (with demographic information) and a list of candidate products,
    compute the purchase probability for each (client, product) pair and return the top_k recommendations.
    
    Parameters:
        client_features: dict with client-specific features.
        available_products: list of dicts with product-specific features.
        model_pipeline: the trained River pipeline.
        top_k: number of recommendations to return.
    
    Returns:
        List of tuples (ProductID, PurchaseProbability) sorted by descending probability.
    """
    # Fill missing keys using default values.
    client_features = fill_missing_features(client_features, default_values)
    
    scores = []
    for product in available_products:
        # Merge client and product features.
        x = {**client_features, **product}
        # Compute probability (assuming the positive class label is 1)
        prob = model_pipeline.predict_proba_one(x).get(1, 0)
        product_id = product.get('ProductID', None)
        scores.append((product_id, prob))
    # Sort candidates by descending probability.
    scores.sort(key=lambda tup: tup[1], reverse=True)
    return scores[:top_k]

# =============================================================================
# 4. Use Rows from Test Data for Recommendation
# =============================================================================

# List to accumulate binary hit values (1 if the ground_truth is in the recommendations, else 0)
hits = []

# Iterate over each row in the test DataFrame
for idx, row in df_test.iterrows():
    # Ground-truth product purchased in this transaction
    ground_truth = row.get('ProductID')
    
    # Convert the row to a dictionary and remove transaction-specific fields 
    # so that we use only demographic (client) information.
    client_features = row.to_dict()
    for col in ['ProductID', 'Quantity_sold', 'SalesNetAmountEuro']:
        client_features.pop(col, None)
    
    # Get the client's country (or default if missing) and filter candidate products accordingly.
    client_country = client_features.get('ClientCountry', default_values['ClientCountry'])
    
    candidates = [prod for prod in all_products if prod.get('StoreCountry') == client_country] 
    candidates_ids = [prod.get('ProductID') for prod in candidates if prod.get('StoreCountry') == client_country]
    # Compute recommendations using the trained pipeline (e.g., pipeline_arf)
    # so that we can check if the ground truth is anywhere in the list.
    recommendations = recommend_for_client(client_features, candidates, pipeline_arf, top_k=100)
    
    # Extract only the product IDs from the recommendations.
    recommended_ids = [prod_id for prod_id, prob in recommendations]
    
    # Check if the ground-truth product is in the recommendations
    hit = 1 if ground_truth in recommended_ids else 0
    hits.append(hit)
    
    # Optional: print progress every 100 rows
    if idx % 1 == 0:
        print(f"Processed row {idx}")

# Compute the mean hit rate (accuracy)
mean_accuracy = np.mean(hits)
print(f"Mean Hit Rate (Accuracy): {mean_accuracy:.4f}")

Processed row 717570
Processed row 717571
Processed row 717572
Processed row 717573
Processed row 717574
Processed row 717575
Processed row 717576
Processed row 717577
Processed row 717578
Processed row 717579
Processed row 717580
Processed row 717581
Processed row 717582
Processed row 717583
Processed row 717584
Processed row 717585
Processed row 717586
Processed row 717587
Processed row 717588
Processed row 717589
Processed row 717590
Processed row 717591
Processed row 717592
Processed row 717593
Processed row 717594
Processed row 717595
Processed row 717596
Processed row 717597
Processed row 717598
Processed row 717599
Processed row 717600
Processed row 717601
Processed row 717602
Processed row 717603
Processed row 717604
Processed row 717605
Processed row 717606
Processed row 717607
Processed row 717608
Processed row 717609
Processed row 717610
Processed row 717611
Processed row 717612
Processed row 717613
Processed row 717614
Processed row 717615
Processed row 717616
Processed row

KeyboardInterrupt: 

In [104]:
hits

[0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,


In [105]:
np.mean(hits)

0.056291390728476824