In [5]:
#download data
!apt-get install p7zip
!curl -Lo yoochoose-data.7z https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z
!7z x yoochoose-data.7z

Reading package lists... Done
Building dependency tree       
Reading state information... Done
p7zip is already the newest version (16.02+dfsg-6).
p7zip set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  273M  100  273M    0     0  27.1M      0  0:00:10  0:00:10 --:--:-- 31.1M

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Sca        1 file, 287211932 bytes (274 MiB)

Extracting archive: yoochoose-data.7z
--
Path = yoochoose-data.7z
Type = 7z
Physical Size = 287211932
Headers Size = 255
Method = LZMA:24
Solid = +
Blocks = 2

      0% - yoochoose-buys.da                          1% - yoochoose-buys.da

In [6]:
#installing packages
!pip install git+https://github.com/maciejkula/spotlight.git

Collecting git+https://github.com/maciejkula/spotlight.git
  Cloning https://github.com/maciejkula/spotlight.git to /tmp/pip-req-build-g7h_e3ty
  Running command git clone -q https://github.com/maciejkula/spotlight.git /tmp/pip-req-build-g7h_e3ty
Building wheels for collected packages: spotlight
  Building wheel for spotlight (setup.py) ... [?25l[?25hdone
  Created wheel for spotlight: filename=spotlight-0.1.6-cp36-none-any.whl size=33920 sha256=601a552eb8aeb365b691675f0b6d0376fe74f399d94c8d6428211a15bd1d16e8
  Stored in directory: /tmp/pip-ephem-wheel-cache-xw2bib14/wheels/0a/33/c8/e8510ea648aaacf6031e128dfa92bcd3750f02db2aaf0922fe
Successfully built spotlight
Installing collected packages: spotlight
Successfully installed spotlight-0.1.6


In [1]:
import os
import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import time
import h5py
import hashlib
import json
import shutil
import sys
import torch

from sklearn.model_selection import ParameterSampler
from sklearn.preprocessing import LabelEncoder
random_state = np.random.RandomState(100)

from spotlight.interactions import Interactions
from spotlight.evaluation import mrr_score
from spotlight.evaluation import precision_recall_score
from spotlight.sequence.implicit import ImplicitSequenceModel
from spotlight.sequence.representations import CNNNet
from spotlight.evaluation import sequence_mrr_score
from spotlight.cross_validation import random_train_test_split
from spotlight.cross_validation import user_based_train_test_split
from spotlight.factorization.implicit import ImplicitFactorizationModel

  import pandas.util.testing as tm


In [None]:
# !git clone https://github.com/sparsh9012/python-util.git
sys.path.append('./python-util')
sys.path.append('./python-util/recsys')

In [None]:
from IPython.display import SVG, display
from preprocess import encode_user_item, random_split, user_split

Clicks data

In [5]:
df_clicks = pd.read_csv('yoochoose-clicks.dat', sep=',', header=None,
                        dtype={0:np.int32, 1:str, 2:np.int64, 3:str},
                        names = ["SessionId", "TimeStr", "ItemId", "Item_Type"])
df_clicks.head()

Unnamed: 0,SessionId,TimeStr,ItemId,Item_Type
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0
3,1,2014-04-07T10:57:00.306Z,214577561,0
4,2,2014-04-07T13:56:37.614Z,214662742,0


In [None]:
#category types
'''The categories can be S (for promotion), 0 (when unknown), 
a number between 1-12 when it came from a category on the page
or any other that represents a brand'''

def assign_cat(x):
    if x == "S":
        return "PROMOTION"
    elif np.int(x) == 0:
        return "NONE"
    elif np.int(x) < 13:
        return "CATEGORY"
    else:
        return "BRAND"

df_clicks['Item_Type'] = df_clicks.loc[:,'Item_Type'].map(assign_cat)

Buy data

In [7]:
df_buys = pd.read_csv('yoochoose-buys.dat', sep=',', header=None,
                      dtype={0:np.int32, 1:str, 2:np.int64, 
                             3:np.int64, 4:np.int64},
                      names = ["SessionId", "TimeStr", "ItemId", "Price", "Quantity"])
df_buys.head()

Unnamed: 0,SessionId,TimeStr,ItemId,Price,Quantity
0,420374,2014-04-06T18:44:58.314Z,214537888,12462,1
1,420374,2014-04-06T18:44:58.325Z,214537850,10471,1
2,281626,2014-04-06T09:40:13.032Z,214535653,1883,1
3,420368,2014-04-04T06:13:28.848Z,214530572,6073,1
4,420368,2014-04-04T06:13:28.858Z,214835025,2617,1


In [8]:
df_buys.drop(["TimeStr"], inplace=True, axis=1)
df_buys["Action"] = "BUY"
df_buys.head()

Unnamed: 0,SessionId,ItemId,Price,Quantity,Action
0,420374,214537888,12462,1,BUY
1,420374,214537850,10471,1,BUY
2,281626,214535653,1883,1,BUY
3,420368,214530572,6073,1,BUY
4,420368,214835025,2617,1,BUY


In [9]:
df = pd.merge(left=df_clicks, right=df_buys, how="left", on=["SessionId", "ItemId"])
df.head()

Unnamed: 0,SessionId,TimeStr,ItemId,Item_Type,Price,Quantity,Action
0,1,2014-04-07T10:51:09.277Z,214536502,NONE,,,
1,1,2014-04-07T10:54:09.868Z,214536500,NONE,,,
2,1,2014-04-07T10:54:46.998Z,214536506,NONE,,,
3,1,2014-04-07T10:57:00.306Z,214577561,NONE,,,
4,2,2014-04-07T13:56:37.614Z,214662742,NONE,,,


Exploring data

In [None]:
query = "ItemId==@ItemId & SessionId==@SessionId"

In [None]:
ItemId = 214821371
SessionId = 11

In [12]:
df_clicks.query(query)

Unnamed: 0,SessionId,TimeStr,ItemId,Item_Type
26,11,2014-04-03T10:45:29.873Z,214821371,NONE
27,11,2014-04-03T10:46:12.162Z,214821371,NONE
28,11,2014-04-03T10:46:57.355Z,214821371,NONE


In [13]:
df_buys.query(query)

Unnamed: 0,SessionId,ItemId,Price,Quantity,Action
10,11,214821371,1046,1,BUY
11,11,214821371,1046,1,BUY


In [14]:
df.query(query)

Unnamed: 0,SessionId,TimeStr,ItemId,Item_Type,Price,Quantity,Action
26,11,2014-04-03T10:45:29.873Z,214821371,NONE,1046.0,1.0,BUY
27,11,2014-04-03T10:45:29.873Z,214821371,NONE,1046.0,1.0,BUY
28,11,2014-04-03T10:46:12.162Z,214821371,NONE,1046.0,1.0,BUY
29,11,2014-04-03T10:46:12.162Z,214821371,NONE,1046.0,1.0,BUY
30,11,2014-04-03T10:46:57.355Z,214821371,NONE,1046.0,1.0,BUY
31,11,2014-04-03T10:46:57.355Z,214821371,NONE,1046.0,1.0,BUY


In [None]:
# Drop duplicates
df.drop_duplicates(inplace=True)

Data subset selection based on thresholds

In [None]:
SESSION_THRESHOLD = 20
ITEM_THRESHOLD = 1000

In [None]:
session_lengths = df.groupby(["SessionId"]).size()
session_lengths_w_threshold = (session_lengths[session_lengths>SESSION_THRESHOLD]).reset_index()
df_with_session_threshold = df[df.SessionId.isin(session_lengths_w_threshold.SessionId)]

In [None]:
item_lengths = df.groupby(["ItemId"]).size()
item_lengths_w_threshold = item_lengths[item_lengths>ITEM_THRESHOLD].reset_index()
df_with_session_item_threshold = df_with_session_threshold[df_with_session_threshold.ItemId.isin(item_lengths_w_threshold.ItemId)]

In [None]:
session_lengths_2 = df_with_session_item_threshold.groupby(["SessionId"]).size()
session_lengths_2_w_threshold = (session_lengths_2[session_lengths_2 > SESSION_THRESHOLD]).reset_index()

In [21]:
df_final = df_with_session_item_threshold[df_with_session_item_threshold.SessionId.isin(session_lengths_2_w_threshold.SessionId)]
df_final.head()

Unnamed: 0,SessionId,TimeStr,ItemId,Item_Type,Price,Quantity,Action
282,87,2014-04-07T06:19:20.979Z,214840483,NONE,1674.0,1.0,BUY
283,87,2014-04-07T06:19:28.762Z,214840483,NONE,1674.0,1.0,BUY
284,87,2014-04-07T06:26:01.516Z,214717286,NONE,,,
285,87,2014-04-07T06:26:15.176Z,214558807,NONE,,,
286,87,2014-04-07T06:26:32.159Z,214821300,NONE,,,


In [22]:
df_final.Action.fillna(value="CLICK", inplace=True)
df_final.drop(["Price", "Quantity"], axis=1, inplace=True)
df_final.head()

Unnamed: 0,SessionId,TimeStr,ItemId,Item_Type,Action
282,87,2014-04-07T06:19:20.979Z,214840483,NONE,BUY
283,87,2014-04-07T06:19:28.762Z,214840483,NONE,BUY
284,87,2014-04-07T06:26:01.516Z,214717286,NONE,CLICK
285,87,2014-04-07T06:26:15.176Z,214558807,NONE,CLICK
286,87,2014-04-07T06:26:32.159Z,214821300,NONE,CLICK


Some more changes

In [23]:
df_final['Time'] = df_final.TimeStr.apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp())
del(df_final["TimeStr"])
df_final.sort_values(by=["SessionId", "Time"], inplace=True)
df_final["Rating"] = df_final.Action.apply(lambda x: 1 if (x == "CLICK") else 5)
df_final.head()

Unnamed: 0,SessionId,ItemId,Item_Type,Action,Time,Rating
282,87,214840483,NONE,BUY,1396852000.0,5
283,87,214840483,NONE,BUY,1396852000.0,5
284,87,214717286,NONE,CLICK,1396852000.0,1
285,87,214558807,NONE,CLICK,1396852000.0,1
286,87,214821300,NONE,CLICK,1396852000.0,1


In [None]:
df_final.to_csv('yoochoose_processed.csv')

Modeling

In [None]:
df = pd.read_csv("yoochoose_processed.csv")

In [26]:
# Data Encoding
DATA, user_encoder, item_encoder = encode_user_item(df, "SessionId", "ItemId", "Rating", "Time")

Number of users:  42144
Number of items:  5120


In [None]:
# Spotlight requires encoders to begin from 1 (instead of 0). We will add 1 to the encoders 
# When doing inverse transform, remember to subtract 1.

DATA.USER = DATA.USER + 1
DATA.ITEM = DATA.ITEM + 1

In [None]:
DATA.RATING = DATA.RATING.astype(np.int32)
DATA.USER = DATA.USER.astype(np.int32)
DATA.ITEM = DATA.ITEM.astype(np.int32)

In [29]:
DATA.head()

Unnamed: 0.1,Unnamed: 0,SessionId,ItemId,Item_Type,Action,TIMESTAMP,RATING,USER,ITEM
0,282,87,214840483,NONE,BUY,1396852000.0,5,1,3844
1,283,87,214840483,NONE,BUY,1396852000.0,5,1,3844
2,284,87,214717286,NONE,CLICK,1396852000.0,1,1,2574
3,285,87,214558807,NONE,CLICK,1396852000.0,1,1,703
4,286,87,214821300,NONE,CLICK,1396852000.0,1,1,2990


In [None]:
df_for_interaction_matrix = (DATA.USER.values,DATA.ITEM.values,DATA.RATING,DATA.TIMESTAMP)
df_interaction = Interactions(*df_for_interaction_matrix)

Train and Validation set

In [None]:
train_with_val, test = user_based_train_test_split(df_interaction,
                                                   random_state=random_state, 
                                                   test_percentage = 0.2)

train, val = user_based_train_test_split(train_with_val, test_percentage=0.2, 
                                         random_state=random_state)

Implicit Model

In [None]:
model_implicit = ImplicitFactorizationModel(n_iter=3, loss='bpr')
model_implicit.fit(train)

In [33]:
user_for_reco = test.user_ids[0]
pred_for_user = model_implicit.predict(user_for_reco)
pred_for_user

array([-3.0667593 ,  1.4037921 , -1.0657367 , ...,  0.24834053,
       -3.0151875 ,  1.5183879 ], dtype=float32)

In [35]:
rec_item_ids = (-pred_for_user).argsort()
rec_item_ids

array([2770,  858, 3302, ..., 1234,  606,  810])

In [36]:
# ground truth
target = test.item_ids[0]
target

3727

In [37]:
np.where(rec_item_ids == target)

(array([1040]),)

Evaluation

In [None]:
implicit_mrr_score = mrr_score(model_implicit, test)
(pk, rk) = precision_recall_score(model_implicit, test, k= 5)

Sequence Model

In [None]:
max_sequence_length = 200
min_sequence_length = 50
step_size = 200

In [None]:
train = train.to_sequence(max_sequence_length=max_sequence_length,
                          min_sequence_length=min_sequence_length,
                          step_size=step_size)
test = test.to_sequence(max_sequence_length=max_sequence_length,
                        min_sequence_length=min_sequence_length,
                        step_size=step_size)
val = val.to_sequence(max_sequence_length=max_sequence_length,
                                    min_sequence_length=min_sequence_length,
                                    step_size=step_size)

In [41]:
print(train.sequences.shape)
print(test.sequences.shape)
print(val.sequences.shape)

(1587, 200)
(477, 200)
(411, 200)


In [None]:
net = CNNNet(train.num_items,
             embedding_dim=128,
             kernel_width=3,
             dilation=[1,1,1,1],
             num_layers=2,
             nonlinearity="relu",
             residual_connections=False)

In [None]:
model = ImplicitSequenceModel(loss="bpr",
                              representation=net,
                              batch_size=32,
                              learning_rate=0.1,
                              l2=0.0,
                              n_iter=2,
                              random_state=random_state)

In [None]:
model.fit(train)

Prediction

In [45]:
query = test.sequences[1][0:199]
target = test.sequences[1][199]

print("Shape of query is : ",query.shape)
print("The value of target is : ",target)

Shape of query is :  (199,)
The value of target is :  3152


In [None]:
pred = model.predict(query)

In [47]:
rec_item_ids = (-pred).argsort()
np.where(rec_item_ids == target)

(array([312]),)

In [48]:
#Item ID that is to be recommended :
item_encoder.inverse_transform([rec_item_ids[0]-1])[0]

214851099