# Tutorial Assignment 2

In [1]:
import os

import numpy as np
import pandas as pd
import sklearn.linear_model
import sklearn.metrics  # use existing libraries!

In [2]:
# score function: binary cross entropy loss
def score_yp(y, p): # y, p are numpy arrays
    """Binary cross-entropy (log loss) between labels and predicted probabilities.

    Parameters
    ----------
    y : numpy array of 0/1 labels.
    p : numpy array of predicted probabilities for label 1, aligned with `y`.

    Returns
    -------
    The value computed by `sklearn.metrics.log_loss`.
    """
    # Pass the label set explicitly: otherwise log_loss infers it from `y` and
    # raises a ValueError when `y` contains only one class (e.g. all zeros).
    return sklearn.metrics.log_loss(y, p, labels=[0, 1])

In [3]:
# score wrapper, for data frames. we need this when using the `truth` data frame
def score(x, y): # x, y are data frames
    """Join two frames on (customer, product, week) and return their log loss.

    After the join, the column `y` must hold the labels and the column
    `probability` the predictions. Fails with an AssertionError if the join does
    not yield exactly as many rows as `x` has.
    """
    join_keys = ["customer", "product", "week"]
    joined = x.merge(y, on=join_keys)
    # the row count of the join has to equal the row count of `x`
    assert len(joined) == len(x)
    labels = joined["y"].values
    probabilities = joined["probability"].values
    return score_yp(labels, probabilities)

<br>

## Input

In [4]:
# the path that contains the data
# expandvars only substitutes $VAR / ${VAR} references; the leading `~` has to be
# resolved with expanduser, otherwise the path keeps a literal "~"
path = os.path.expanduser(
    os.path.expandvars("~/Dropbox_RSM/teaching/big-data-23-24/data/dr-s")
)

In [5]:
# INPUT
# consecutive target weeks of the time split
training_week = 88  # for model training
validation_week = training_week + 1  # 89: for model and baseline validation
test_week = validation_week + 1  # 90: for the final prediction (one week in the future, beyond our data)
# ids that span the prediction grid: 2000 customers x 250 products
target_customers = [*range(2000)]
target_products = [*range(250)]

<br>

## Load data

In [6]:
# purchase data; the preview below shows the columns week, customer, product, price
baskets = pd.read_parquet(f"{path}/upload/baskets-s.parquet")
# we only keep the 2000 customers required for the test set
# NOTE(review): no filtering happens in this cell -- presumably `baskets-s.parquet` is already the reduced file; confirm
# a complex model might require more data, e.g., see Boosted Tree model in Gabel & Timoshenko (2022)
baskets.head()

Unnamed: 0,week,customer,product,price
0,0,0,71,629
1,0,0,91,605
2,0,0,116,715
3,0,0,123,483
4,0,0,157,592


In [7]:
# presumably the (week, customer, product) rows a prediction is required for -- the preview shows week 90; confirm
prediction_index = pd.read_parquet(f"{path}/raw/prediction_index.parquet")
prediction_index.head()

Unnamed: 0,week,customer,product
0,90,0,0
1,90,0,1
2,90,0,2
3,90,0,3
4,90,0,4


<br>

## Reminder

<img src="https://raw.githubusercontent.com/sbstn-gbl/learning-from-big-data/master/source/_static/img/time-split.png" width="900"/>

<br>

## SOLUTION 2: simple machine learning model   

### Example for constructing the features

In [8]:
def build_frequency_feature(baskets, week_start, week_end, feature_name):
    """Purchase frequency per (customer, product) within a window of weeks.

    Parameters
    ----------
    baskets : data frame with the columns `week`, `customer` and `product`.
    week_start, week_end : bounds of the window, both inclusive.
    feature_name : name of the resulting frequency column.

    Returns
    -------
    Data frame with the columns `customer`, `product` and `feature_name`. The
    frequency is the number of basket rows of the pair inside the window divided
    by the number of distinct weeks found in the window. Pairs without any row
    in the window are not part of the result.

    Also prints the number of distinct weeks in the window.
    """
    weeks = baskets["week"]
    in_window = (weeks >= week_start) & (weeks <= week_end)
    window = baskets[in_window]

    # denominator: distinct weeks actually present in the window
    n_weeks = window["week"].nunique()
    print(n_weeks)

    purchase_counts = window.groupby(["customer", "product"])[["week"]].count()
    frequency = purchase_counts / n_weeks
    frequency = frequency.rename(columns={"week": feature_name})
    return frequency.reset_index()

In [9]:
# demo: frequency over all weeks up to (and including) the week before the training week;
# week_start=-1 lies below the first week shown in the data, so no lower cut-off applies
build_frequency_feature(baskets, -1, training_week - 1, "frequency_full")

88


Unnamed: 0,customer,product,frequency_full
0,0,4,0.090909
1,0,11,0.102273
2,0,15,0.011364
3,0,16,0.034091
4,0,21,0.011364
...,...,...,...
141147,1999,237,0.045455
141148,1999,242,0.397727
141149,1999,243,0.011364
141150,1999,245,0.125000


### Example for building the base table

In [10]:
def build_target(baskets, week, customers=None, products=None):
    """Build the full customer x product grid for one week with a 0/1 purchase label.

    Parameters
    ----------
    baskets : data frame with the columns `week`, `customer` and `product`.
    week : the target week.
    customers : optional sequence of customer ids; defaults to the notebook-level
        `target_customers`.
    products : optional sequence of product ids; defaults to the notebook-level
        `target_products`.

    Returns
    -------
    Data frame with the columns `week`, `customer`, `product`, `y` and exactly
    len(customers) * len(products) rows, ordered by customer and then by product.
    `y` is 1 if the pair has at least one basket row in `week`, else 0.
    """
    if customers is None:
        customers = target_customers
    if products is None:
        products = target_products
    customers = list(customers)
    products = list(products)

    # Collapse repeated rows of the same (week, customer, product): without this
    # the left merge below would duplicate grid rows and inflate the base table.
    baskets_week = baskets.loc[
        baskets["week"] == week, ["week", "customer", "product"]
    ].drop_duplicates()
    baskets_week["y"] = 1

    # full grid: every customer repeated for every product
    df = pd.DataFrame(
        {
            "week": week,
            "customer": np.repeat(customers, len(products), axis=0),
            "product": products * len(customers),
        }
    )

    # pairs without a purchase get no match and therefore y = 0
    df = df.merge(baskets_week, on=["week", "customer", "product"], how="left")
    df["y"] = df["y"].fillna(0).astype(int)

    return df

In [11]:
def build_base_table(baskets, week):
    """Combine the purchase label of `week` with three purchase-frequency features.

    Features use baskets up to `week - 1` only, i.e. data BEFORE the target week:
    `frequency_full` (window starts at week -1), `frequency_l30` (starts at
    `week - 30`) and `frequency_l5` (starts at `week - 5`). Missing values left
    by the merges are replaced with 0.

    Ideas for extension: consider using multiple weeks for training! More data
    might lead to better results, and different weeks might carry different
    information.
    """
    # target variable (product purchase)
    base_table_yx = build_target(baskets, week)

    # (first week of the window, name of the feature column)
    feature_windows = [
        (-1, "frequency_full"),
        (week - 30, "frequency_l30"),
        (week - 5, "frequency_l5"),
    ]
    for first_week, feature_name in feature_windows:
        feature = build_frequency_feature(baskets, first_week, week - 1, feature_name)
        base_table_yx = base_table_yx.merge(
            feature, on=["customer", "product"], how="left"
        )

    return base_table_yx.fillna(0)

In [12]:
# demo call: the table is only displayed here and not stored in a variable
build_base_table(baskets, training_week)

88
30
5


Unnamed: 0,week,customer,product,y,frequency_full,frequency_l30,frequency_l5
0,88,0,0,0,0.000000,0.000000,0.0
1,88,0,1,0,0.000000,0.000000,0.0
2,88,0,2,0,0.000000,0.000000,0.0
3,88,0,3,0,0.000000,0.000000,0.0
4,88,0,4,0,0.090909,0.100000,0.0
...,...,...,...,...,...,...,...
499995,88,1999,245,0,0.125000,0.133333,0.0
499996,88,1999,246,0,0.000000,0.000000,0.0
499997,88,1999,247,0,0.000000,0.000000,0.0
499998,88,1999,248,0,0.000000,0.000000,0.0


### Training: Train model (week < 89)

In [13]:
# labels come from the training week, features from the weeks before it
base_table_train = build_base_table(baskets, training_week)

88
30
5


In [14]:
# target vector: one entry per (customer, product) row of the training base table
y = base_table_train["y"].values  # 1s and 0s

In [15]:
# feature matrix; the column order here is the order of the fitted coefficients
X = base_table_train[["frequency_full", "frequency_l30", "frequency_l5"]].values  # purchase frequencies

In [16]:
# logistic regression with the library defaults (no hyperparameter is overridden here)
log_reg = sklearn.linear_model.LogisticRegression().fit(X, y)

In [17]:
# fitted parameters; coefficients follow the columns of X: frequency_full, frequency_l30, frequency_l5
log_reg.intercept_, log_reg.coef_

(array([-4.48545291]), array([[10.74680489,  0.30805447, -0.98091252]]))

In [18]:
# use model to predict purchase probabilities
# [:, 1] keeps the second column of predict_proba -- presumably P(y = 1), given labels 0/1; confirm via log_reg.classes_
base_table_train["probability"] = log_reg.predict_proba(X)[:, 1]

In [19]:
# in-sample log loss: evaluated on the same rows the model was fitted on
score_yp(
    base_table_train["y"].values,
    base_table_train["probability"].values,
)

0.09008063416504795

### Validation: Test model performance (week 89)

In [20]:
# same construction shifted to the validation week: labels from that week, features from the weeks before it
base_table_validation = build_base_table(baskets, validation_week)

89
30
5


In [21]:
# same feature columns in the same order as the training matrix X
X_validation = base_table_validation[
    ["frequency_full", "frequency_l30", "frequency_l5"]
].values

In [22]:
# the model fitted on the training week is applied as is (no refit)
base_table_validation["probability"] = log_reg.predict_proba(X_validation)[:, 1]

In [23]:
# log loss on the validation week, i.e. on rows that were not used for fitting
score_yp(
    base_table_validation["y"].values,
    base_table_validation["probability"].values,
)

0.089622346979186

### Test: Produce final result for submission (week 90)

We can't evaluate this prediction because we don't have this data -- that's why we need the validation set! 

In [24]:
# NOTE(review): the INPUT cell describes the test week as beyond our data, so the `y` column
# built here is presumably all 0 and only a placeholder -- only the features are used below
base_table_test = build_base_table(baskets, test_week)

90
30
5


In [25]:
# same feature columns in the same order as the training matrix X
X_test = base_table_test[["frequency_full", "frequency_l30", "frequency_l5"]].values

In [26]:
# final predictions for the test week, from the model fitted on the training week
base_table_test["probability"] = log_reg.predict_proba(X_test)[:, 1]

<br>

## Score on test set

In real life, never load your test/validation data during model training!

<img src="https://raw.githubusercontent.com/sbstn-gbl/learning-from-big-data/master/source/_static/img/danger-zone.gif" width="800"/>

In [27]:
# true purchases in week 90
# `score` below joins on customer, product, week and reads the label column `y`,
# so this frame presumably carries those four columns -- confirm against the file
truth = pd.read_parquet(f"{path}/private/truth.parquet")

In [28]:
# argument order matters: the first frame (truth) fixes the expected row count in `score`,
# the second frame supplies the `probability` column
score(
    truth,
    base_table_test[["customer", "product", "week", "probability"]],
)

0.08998744561223695

<br>
<br>

<b>Learning from Big Data</b> <br>
Sebastian Gabel <br>