In [1]:
from sklearn.cluster import KMeans

import numpy as np
import orchest
import pandas as pd
import pickle
import os

from utils import compute_event_df

### Construct summary dataframe

The summary dataframe is built from the raw event data.

In [2]:
# Fetch the inputs handed to this step by Orchest, then pick out the
# 'product_data' entry and, from it, the 'events' table.
step_inputs = orchest.get_inputs()
product_data = step_inputs['product_data']

events_df = product_data['events']

In [3]:
# Inspect the loaded event log; the rendered table elides the middle rows.
events_df

Unnamed: 0,product,date,action
0,Product A,2021-03-30 23:23:46,buy
1,Product J,2021-04-01 02:34:59,buy
2,Product E,2020-01-22 18:39:14,buy
3,Product A,2020-02-14 00:31:33,buy
4,Product E,2021-08-11 00:30:40,buy
...,...,...,...
11995,Product F,2020-10-23 07:35:50,view
11996,Product J,2021-11-08 14:52:12,view
11997,Product D,2020-02-13 17:42:02,view
11998,Product C,2021-07-01 07:51:21,view


In [4]:
# Build the summary dataframe from the raw events with the project helper
# `compute_event_df` (imported from utils; its output schema is not shown here).
df = compute_event_df(events_df)

### Construct model

In [5]:
# Feature matrix for clustering: the summary dataframe as a plain NumPy array.
X = df.to_numpy()

In [6]:
# Two-cluster k-means over the feature matrix. `fit` returns the estimator
# itself, so `kmeans` is the fitted model; random_state is pinned to 0.
clusterer = KMeans(n_clusters=2, random_state=0)
kmeans = clusterer.fit(X)

In [7]:
# Cluster label assigned to each row of X, in row order.
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0], dtype=int32)

In [8]:
# Choose known attractive product: the first row of df is used as the
# reference, and the cluster it is assigned to becomes the high-seller label.
# NOTE(review): assumes row 0 of df really is a strong seller -- confirm
# against the row ordering produced by compute_event_df.
reference_features = df.iloc[0].values
reference_prediction = kmeans.predict([reference_features])
high_seller_label = reference_prediction[0]

In [9]:
# The model has exactly two clusters (labels 0 and 1), so the low-seller label
# is simply the other one: 1 - 0 == 1 and 1 - 1 == 0.
low_seller_label = 1 - high_seller_label

### Store model

In [10]:
# Destination of the pickled model: taken from the KMEANS_MODEL_LOC environment
# variable when it is set, otherwise the default path under /data.
model_storage_location = os.getenv("KMEANS_MODEL_LOC", "/data/ecom-kmeans-model.pickle")

In [11]:
# Bundle the fitted model together with both cluster labels so that all three
# are written to a single pickle file.
model_bundle = {
    "model": kmeans,
    "high_seller_label": high_seller_label,
    "low_seller_label": low_seller_label,
}
with open(model_storage_location, "wb") as model_file:
    pickle.dump(model_bundle, model_file)

In [12]:
# Verify storage worked: reload the pickle that was just written and confirm
# the restored model has exactly the same cluster centers as the fitted one.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(model_storage_location, "rb") as f:
    model = pickle.load(f)["model"]

# np.array_equal checks shape and every element. The earlier
# all((a == b).tolist()) form iterated over the rows of a nested list, and a
# non-empty row list is always truthy, so it passed even when centers differed.
assert np.array_equal(kmeans.cluster_centers_, model.cluster_centers_), \
    "reloaded model's cluster centers differ from the fitted model"