# Tutorial Assignment 2

In [1]:
import os

import numpy as np
import pandas as pd
import sklearn.linear_model
import sklearn.metrics

In [2]:
# benchmark function
def benchmark_yp(y, p):
    """Return the binary log loss of predicted probabilities against 0/1 labels.

    Parameters
    ----------
    y : array-like
        Observed labels, 0 (no purchase) or 1 (purchase).
    p : array-like
        Predicted probability of label 1, one value per element of ``y``.

    Returns
    -------
    float
        The log loss as computed by ``sklearn.metrics.log_loss``.
    """
    # Passing the label set explicitly keeps the metric defined when ``y``
    # happens to contain a single class (e.g. a slice without any purchase);
    # without it scikit-learn infers the labels from ``y`` and raises.
    return sklearn.metrics.log_loss(y, p, labels=[0, 1])

In [3]:
# benchmark wrapper, for data frames
def benchmark(x, y):
    """Join two data frames on their keys and score the result with the log loss.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame keyed by customer, product and week; after the join the column
        ``y`` (the labels) is read from the merged result.
    y : pandas.DataFrame
        Frame keyed by customer, product and week; after the join the column
        ``probability`` is read from the merged result.

    Returns
    -------
    float
        Value returned by ``benchmark_yp`` for the joined labels and probabilities.
    """
    keys = ["customer", "product", "week"]
    joined = x.merge(y, on=keys)
    # the join must yield exactly as many rows as x has
    assert len(joined) == len(x)
    labels = joined["y"].values
    probabilities = joined["probability"].values
    return benchmark_yp(labels, probabilities)

## Input

In [4]:
# the path that contains the data
# expanduser resolves the leading "~"; expandvars alone only substitutes
# $VAR-style references and would leave the "~" in place
path = os.path.expanduser(
    os.path.expandvars("~/Dropbox/teaching/big-data/data/dr-s/upload")
)

# note that you don't have access to the truth data set
path_truth = os.path.expanduser(
    os.path.expandvars("~/Dropbox/teaching/big-data/data/dr-s/private")
)

In [5]:
# week whose purchases are the labels when the model is fitted
training_week = 88  # for model training
# week whose purchases are the labels when baseline and model are validated
target_week = 89  # for model and baseline validation
# ids spanning the prediction grid: every customer is paired with every product
target_customers = list(range(2000))
target_products = list(range(250))

## Load data

In [6]:
# purchase records; the preview shows the columns week, customer, product, price
baskets = pd.read_parquet(f"{path}/baskets.parquet")
baskets.head()

Unnamed: 0,week,customer,product,price
0,0,0,71,629
1,0,0,91,605
2,0,0,116,715
3,0,0,123,483
4,0,0,157,592


In [7]:
# index with the columns week, customer, product (the preview shows week 90);
# presumably the rows for which predictions are requested -- TODO confirm
prediction_index = pd.read_parquet(f"{path}/prediction_index.parquet")
prediction_index.head()

Unnamed: 0,week,customer,product
0,90,0,0
1,90,0,1
2,90,0,2
3,90,0,3
4,90,0,4


<br>

## SOLUTION 1: Descriptive feature (past purchase rates) = Baseline

### Validation

In [8]:
# function to define target variable for all customer-product combinations (in a given week)
def build_target(baskets, week, customers=None, products=None):
    """Build the 0/1 purchase label for every customer-product pair in one week.

    Parameters
    ----------
    baskets : pandas.DataFrame
        Purchase records with at least the columns ``week``, ``customer``
        and ``product``.
    week : int
        Week for which the labels are built.
    customers : sequence of int, optional
        Customer ids of the grid. Defaults to the notebook-level
        ``target_customers``.
    products : sequence of int, optional
        Product ids of the grid. Defaults to the notebook-level
        ``target_products``.

    Returns
    -------
    pandas.DataFrame
        One row per customer-product pair (customers vary slowest) with the
        columns ``week``, ``customer``, ``product`` and ``y``, where ``y`` is 1
        if the pair occurs in ``baskets`` in ``week`` and 0 otherwise.
    """
    if customers is None:
        customers = target_customers
    if products is None:
        products = target_products
    customers = list(customers)
    products = list(products)

    # De-duplicate so that a pair recorded more than once in the week cannot
    # multiply grid rows in the left merge below.
    purchases = (
        baskets.loc[baskets["week"] == week, ["week", "customer", "product"]]
        .drop_duplicates()
        .assign(y=1)
    )

    # full grid: each customer repeated for all products, products cycled
    df = pd.DataFrame(
        {
            "week": week,
            "customer": np.repeat(customers, len(products)),
            "product": np.tile(products, len(customers)),
        }
    )

    df = df.merge(purchases, on=["week", "customer", "product"], how="left")
    # pairs without a purchase have no match and therefore get label 0
    df["y"] = df["y"].fillna(0).astype(int)

    return df

In [9]:
# labels for the validation week: one row per target customer-product pair
baseline_target = build_target(baskets, target_week)
baseline_target.head()

Unnamed: 0,week,customer,product,y
0,89,0,0,0
1,89,0,1,0
2,89,0,2,0
3,89,0,3,0
4,89,0,4,0


In [10]:
# baseline = purchase rates for customer-product combinations before the target week
def baseline_prediction(baskets, week, customers=None, products=None):
    """Predict purchase probabilities from purchase rates before ``week``.

    The probability of a customer-product pair is the number of distinct weeks
    before ``week`` in which the pair was bought, divided by the number of
    distinct weeks present in the data before ``week``.

    Parameters
    ----------
    baskets : pandas.DataFrame
        Purchase records with at least the columns ``week``, ``customer``
        and ``product``.
    week : int
        Week to predict; only records with a smaller week number are used.
    customers : sequence of int, optional
        Customer ids of the grid. Defaults to the notebook-level
        ``target_customers``.
    products : sequence of int, optional
        Product ids of the grid. Defaults to the notebook-level
        ``target_products``.

    Returns
    -------
    pandas.DataFrame
        One row per customer-product pair with the columns ``week``,
        ``customer``, ``product`` and ``probability``; pairs never bought
        before ``week`` get probability 0.
    """
    if customers is None:
        customers = target_customers
    if products is None:
        products = target_products
    customers = list(customers)
    products = list(products)

    # Keep one record per (week, customer, product): counting raw rows would
    # push the rate above 1 when a pair is recorded twice in the same week.
    history = baskets.loc[
        baskets["week"] < week, ["week", "customer", "product"]
    ].drop_duplicates()
    n_weeks = history["week"].nunique()
    print(n_weeks)

    purchase_frequency_ij = (
        history.groupby(["customer", "product"])["week"]
        .count()
        .div(n_weeks)
        .rename("probability")
        .reset_index()
    )

    # full grid: each customer repeated for all products, products cycled
    df = pd.DataFrame(
        {
            "week": week,
            "customer": np.repeat(customers, len(products)),
            "product": np.tile(products, len(customers)),
        }
    )

    result_baseline = df.merge(
        purchase_frequency_ij,
        on=["customer", "product"],
        how="left",
    )
    # pairs without any purchase history have no match in the merge
    result_baseline["probability"] = result_baseline["probability"].fillna(0)

    return result_baseline

In [11]:
# baseline prediction for the validation week (purchase rates from all earlier weeks);
# the number printed below is the count of distinct weeks that enter the rate
baseline_validation = baseline_prediction(baskets, target_week)
baseline_validation.head()

89


Unnamed: 0,week,customer,product,probability
0,89,0,0,0.0
1,89,0,1,0.0
2,89,0,2,0.0
3,89,0,3,0.0
4,89,0,4,0.089888


In [12]:
# log loss of the baseline on the validation week (labels first, predictions second)
benchmark(baseline_target, baseline_validation)

0.10034939532139737

### Test

In [13]:
# baseline prediction for the test week, i.e. the week after the validation week
baseline_test = baseline_prediction(baskets, target_week + 1)
baseline_test.head()

90


Unnamed: 0,week,customer,product,probability
0,90,0,0,0.0
1,90,0,1,0.0
2,90,0,2,0.0
3,90,0,3,0.0
4,90,0,4,0.088889


<br>

## SOLUTION 2: Simple machine learning model

### Example for constructing the features

In [14]:
def build_frequency_feature(baskets, week_start, week_end, feature_name):
    """Compute per customer-product purchase rates within a window of weeks.

    The rate of a pair is the number of distinct weeks in the window in which
    the pair was bought, divided by the number of distinct weeks that occur in
    the window at all.

    Parameters
    ----------
    baskets : pandas.DataFrame
        Purchase records with at least the columns ``week``, ``customer``
        and ``product``.
    week_start : int
        First week of the window (inclusive).
    week_end : int
        Last week of the window (inclusive).
    feature_name : str
        Name of the column that holds the rate in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer``, ``product`` and ``feature_name``; only pairs
        with at least one purchase in the window are listed.
    """
    in_window = baskets["week"].between(week_start, week_end)
    # Keep one record per (week, customer, product): counting raw rows would
    # push the rate above 1 when a pair is recorded twice in the same week.
    purchases = baskets.loc[
        in_window, ["week", "customer", "product"]
    ].drop_duplicates()

    # computed once and reused for both the log line and the denominator
    n_weeks = purchases["week"].nunique()
    print(n_weeks)

    purchase_frequency_ij = (
        purchases.groupby(["customer", "product"])["week"]
        .count()
        .div(n_weeks)
        .rename(feature_name)
        .reset_index()
    )

    return purchase_frequency_ij

In [15]:
# purchase rates over all weeks up to target_week - 1; the lower bound -1 is
# presumably below every week number in the data (the preview starts at week 0)
build_frequency_feature(baskets, -1, target_week - 1, "frequency_full")

89


Unnamed: 0,customer,product,frequency_full
0,0,4,0.089888
1,0,11,0.101124
2,0,15,0.011236
3,0,16,0.044944
4,0,21,0.011236
...,...,...,...
7065816,99999,226,0.292135
7065817,99999,231,0.123596
7065818,99999,234,0.089888
7065819,99999,243,0.011236


### Example for building the base table

In [16]:
def build_base_table(baskets, week):
    """Assemble labels and purchase-rate features for one label week.

    Parameters
    ----------
    baskets : pandas.DataFrame
        Purchase records passed on to ``build_target`` and
        ``build_frequency_feature``.
    week : int
        Week whose purchases form the label ``y``; every feature window ends
        at ``week - 1``.

    Returns
    -------
    pandas.DataFrame
        The frame from ``build_target`` extended by the columns
        ``frequency_full``, ``frequency_l30`` and ``frequency_l5``, with
        missing values replaced by 0.
    """
    keys = ["customer", "product"]
    # (first week of the window, name of the feature column)
    windows = [
        (-1, "frequency_full"),
        (week - 30, "frequency_l30"),
        (week - 5, "frequency_l5"),
    ]

    table = build_target(baskets, week)
    for first_week, column in windows:
        feature = build_frequency_feature(baskets, first_week, week - 1, column)
        # left join keeps every grid row; pairs without purchases stay missing
        table = table.merge(feature, on=keys, how="left")

    return table.fillna(0)

In [17]:
# example: base table with training_week as the label week (result only displayed)
build_base_table(baskets, training_week)

88
30
5


Unnamed: 0,week,customer,product,y,frequency_full,frequency_l30,frequency_l5
0,88,0,0,0,0.000000,0.000000,0.0
1,88,0,1,0,0.000000,0.000000,0.0
2,88,0,2,0,0.000000,0.000000,0.0
3,88,0,3,0,0.000000,0.000000,0.0
4,88,0,4,0,0.090909,0.100000,0.0
...,...,...,...,...,...,...,...
499995,88,1999,245,0,0.125000,0.133333,0.0
499996,88,1999,246,0,0.000000,0.000000,0.0
499997,88,1999,247,0,0.000000,0.000000,0.0
499998,88,1999,248,0,0.000000,0.000000,0.0


### Training: Train model (week < 89)

In [18]:
# labels from training_week, features from the weeks before it
base_table_train = build_base_table(baskets, training_week)

88
30
5


In [19]:
# label vector for the fit
y = base_table_train["y"].values

In [20]:
# feature matrix; the validation and test cells select the same columns in the same order
X = base_table_train[["frequency_full", "frequency_l30", "frequency_l5"]].values

In [21]:
# logistic regression with scikit-learn's default settings
log_reg = sklearn.linear_model.LogisticRegression().fit(X, y)

In [22]:
# fitted intercept and coefficients (one coefficient per column of X, in column order)
log_reg.intercept_, log_reg.coef_

(array([-4.48545291]), array([[10.74680489,  0.30805447, -0.98091252]]))

In [23]:
# in-sample prediction; column 1 of predict_proba belongs to the second class (y = 1)
base_table_train["probability"] = log_reg.predict_proba(X)[:, 1]

In [24]:
# in-sample log loss on the training week
benchmark_yp(
    base_table_train["y"].values,
    base_table_train["probability"].values,
)

0.09008063416504797

### Validation: Test model performance (week 89)

In [25]:
# labels from target_week, features from the weeks before it
base_table_validation = build_base_table(baskets, target_week)

89
30
5


In [26]:
# same feature columns, in the same order, as used for fitting
X_validation = base_table_validation[
    ["frequency_full", "frequency_l30", "frequency_l5"]
].values

In [27]:
# out-of-sample prediction with the model fitted on the training week
base_table_validation["probability"] = log_reg.predict_proba(X_validation)[:, 1]

In [28]:
# log loss on the validation week
benchmark_yp(
    base_table_validation["y"].values,
    base_table_validation["probability"].values,
)

0.08962234697918603

### Test: Produce final result (week 90)

In [29]:
# features for the test week (target_week + 1); the y column of this table is not used below
base_table_test = build_base_table(baskets, target_week + 1)

90
30
5


In [30]:
# same feature columns, in the same order, as used for fitting
X_test = base_table_test[["frequency_full", "frequency_l30", "frequency_l5"]].values

In [31]:
# final prediction for the test week
base_table_test["probability"] = log_reg.predict_proba(X_test)[:, 1]

<br>

## Benchmark

<img src="https://raw.githubusercontent.com/sbstn-gbl/learning-from-big-data/master/source/_static/img/danger-zone.gif" width="700"/>

In [32]:
# truth data read from the private path (path_truth)
truth = pd.read_parquet(f"{path_truth}/truth.parquet")

In [33]:
# log loss of the baseline prediction against the truth
benchmark(truth, baseline_test)

0.10008904979918823

In [34]:
# log loss of the logistic regression against the truth; only the join keys and
# the probability are passed on, so the table's own y column does not enter the merge
benchmark(
    truth,
    base_table_test[["customer", "product", "week", "probability"]],
)

0.089987445612237

<br>
<br>
&mdash; <br>
Sebastian Gabel <br>
`Learning from Big Data`, Module 2, Extra Tutorial <br>
2021/22 <br>