In [51]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.metrics import log_loss, roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression

from joblib import dump, load

# Load Data

The multi-core preprocessing script calculates expanding purchase frequencies and rolling purchase sums per product. It takes two positional arguments:
- The first argument is the number of random shoppers
- The second argument is the core count

The result is saved as a Parquet file in the working directory. Data for 2000 random shoppers occupies ~180MB.

Running the script from the command line displays a progress bar.

In [52]:
# !preprocess_logit_mp.py 2000 10

In [55]:
# Read the preprocessed feature table from its Parquet file.
logitprep = pd.read_parquet(path="data/logitprep_random_2000.parquet")

In [56]:
# Preview the first five rows of the loaded table.
logitprep.head(n=5)

Unnamed: 0_level_0,week,shopper,product,bought,freqs,purchases_last_5_weeks,purchases_last_15_weeks,purchases_last_25_weeks,discount
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,30,0,0,0.0,0,0,0,0
1,0,30,1,0,0.0,0,0,0,0
2,0,30,2,0,0.0,0,0,0,0
3,0,30,3,0,0.0,0,0,0,0
4,0,30,4,1,0.0,0,0,0,0


# Train Logistic Regression

In [8]:
# Columns of `logitprep` fed to the model. The order matters: the fitted
# coefficients are later paired with these names positionally.
features = [
    'freqs',
    'purchases_last_5_weeks',
    'purchases_last_15_weeks',
    'purchases_last_25_weeks',
    'discount',
]

In [13]:
# Split by week: rows with week < 80 train the model, rows with week >= 80 validate it.
is_train = logitprep.week < 80
is_val = logitprep.week >= 80

x_train, y_train = logitprep.loc[is_train, features], logitprep.loc[is_train, "bought"]
x_val, y_val = logitprep.loc[is_val, features], logitprep.loc[is_val, "bought"]

In [11]:
# Unfitted estimator carrying the hyper-parameters used when the model has to
# be trained from scratch.
logModel = LogisticRegression(penalty='l2',
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    random_state=69,
    max_iter=100,
    verbose=1,
    solver="saga")

# Reuse the persisted fit when it exists; otherwise train once and persist the
# fitted model to the same path it is read back from, so the notebook still
# runs end to end without a pre-existing artifact.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts you produced yourself or otherwise trust.
try:
    logModel = load('data/logModel.joblib')
except FileNotFoundError:
    logModel.fit(x_train, y_train)
    dump(logModel, 'data/logModel.joblib')

In [14]:
# Class probabilities for the validation rows; column 1 is used downstream
# as the score for the positive class.
pred = logModel.predict_proba(X=x_val)

In [15]:
# Log loss of the column-1 probabilities against the validation labels.
log_loss(y_true=y_val, y_pred=pred[:, 1])

0.09011332066244462

In [16]:
# ROC AUC of the column-1 probabilities against the validation labels.
roc_auc_score(y_true=y_val, y_score=pred[:, 1])

0.9411856950782126

In [17]:
# Average precision of the column-1 probabilities against the validation labels.
average_precision_score(y_true=y_val, y_score=pred[:, 1])

0.44152795477836504

---
Get intercept and coefficients:

In [49]:
# One row for the intercept followed by one row per feature, in the same
# order as `features`. Building the frame in a single constructor call keeps
# `coefficient` numeric (row-wise .iloc assignment into an empty frame leaves
# it as object dtype) and raises if the number of names and coefficients differ
# instead of silently leaving rows unfilled.
coef_df = pd.DataFrame({
    "feature": ["intercept", *features],
    "coefficient": [logModel.intercept_[0], *logModel.coef_[0]],
})

coef_df

Unnamed: 0,feature,coefficient
0,intercept,-4.38685
1,freqs,5.38777
2,purchases_last_5_weeks,-0.0759897
3,purchases_last_15_weeks,0.0930723
4,purchases_last_25_weeks,0.128638
5,discount,0.036721


# Calculate Elasticities

In [50]:
# For each product: relative change in the mean predicted purchase probability
# when `discount` is moved from 0 to DISCOUNT_LEVEL on the validation rows,
# divided by the discount expressed as a fraction.
VAL_START_WEEK = 80    # first week of the validation period
N_PRODUCTS = 250       # product ids iterated are 0 .. N_PRODUCTS - 1
DISCOUNT_LEVEL = 30    # presumably percentage points -- TODO confirm units
DISCOUNT_FRACTION = DISCOUNT_LEVEL / 100

# The week filter does not depend on the product, so apply it once up front
# instead of re-scanning the full table on every iteration.
val_frame = logitprep.loc[logitprep.week >= VAL_START_WEEK, ["product"] + features]

own_product = []

for prod in tqdm(range(N_PRODUCTS)):
    # .copy() so overwriting `discount` never touches val_frame itself.
    newtest = val_frame.loc[val_frame["product"] == prod, features].copy()
    newtest["discount"] = 0
    base_prob = np.mean(logModel.predict_proba(newtest)[:,1])
    newtest["discount"] = DISCOUNT_LEVEL
    avg_30prob = np.mean(logModel.predict_proba(newtest)[:,1])
    own_product.append((avg_30prob - base_prob) / (DISCOUNT_FRACTION * base_prob))

np.mean(own_product)

100%|████████████████████████████████████████████████████████████████████████████████| 250/250 [00:29<00:00,  8.36it/s]


4.420810343666133