In [1]:
import pandas as pd

In [2]:
# Locations of the input CSV files, relative to the notebook's working directory.
PATH_CAND = 'datas/candidate_items.csv'
PATH_FEAT = 'datas/item_features.csv'
PATH_TRAINS = 'datas/train_sessions.csv'
PATH_TRAINP = 'datas/train_purchases.csv'
PATH_LEADER = 'datas/test_leaderboard_sessions.csv'

# Load the candidate items and show them as the cell output.
candidates = pd.read_csv(PATH_CAND)
candidates

In [3]:
# Load the item feature table; later cells read its item_id, feature_category_id
# and feature_value_id columns.
items = pd.read_csv(PATH_FEAT)
items

In [4]:
from datetime import datetime


def dateparse(date):
    """Convert a 'YYYY-MM-DD HH:MM:SS[.fraction]' string to whole epoch seconds.

    Everything after the first '.' is discarded before parsing. The parsed
    datetime is naive, so ``timestamp()`` interprets it in the local timezone
    of the machine running the notebook.
    """
    whole_seconds, _, _ = date.partition('.')
    parsed = datetime.strptime(whole_seconds, '%Y-%m-%d  %H:%M:%S')
    return int(parsed.timestamp())

# Load the purchases, parsing the third column (index 2) as datetimes.
train_purchases = pd.read_csv(PATH_TRAINP, parse_dates=[2])

# train_purchases['ts'] = train_purchases['date'].astype('int64')
train_purchases

In [5]:
# NOTE(review): reads the same file with the same arguments as the previous cell;
# the result is only displayed, not assigned to any variable.
pd.read_csv(PATH_TRAINP, parse_dates=[2])

In [6]:
# Load the training sessions, parsing the third column (index 2) as datetimes.
train_sessions = pd.read_csv(PATH_TRAINS, parse_dates=[2])
train_sessions

In [7]:
# Rank every interaction inside its session, most recent first
# (rank 1 = latest date; ties are broken by row order).
recency = train_sessions.groupby('session_id')['date'].rank(method="first", ascending=False)
train_sessions["rank"] = recency
# Keep only the 10 most recent interactions of each session.
is_recent = train_sessions["rank"] <= 10
train_sessions = train_sessions[is_recent]
train_sessions

In [8]:
# Parse the "date" column as datetimes, as is done for the training sessions, so
# that the per-session recency ranking applied later by add_rank compares
# timestamps instead of raw strings.
test_leaderboard_sessions = pd.read_csv(PATH_LEADER, parse_dates=["date"])
test_leaderboard_sessions

# Embedding items
Items are represented in a sparse format with the id of the feature and its value. We have no information on whether or not it is categorical or numerical, or the number of features. We want to explore them a little bit and find a compact representation.

First we want to know the coverage of each feature. Do a plot with, in x-axis, the feature id, and in y-axis, the number of non-null values.

In [9]:
from matplotlib import pyplot as plt
import numpy as np

In [10]:
# Number of rows (i.e. non-null values) recorded for each feature category.
feature_id_count = items.groupby(['feature_category_id']).size().reset_index(name='counts')
fig, ax = plt.subplots(figsize = (20,10))
ax.bar(feature_id_count['feature_category_id'], feature_id_count['counts'])
ax.set_xticks(np.arange(1, feature_id_count['feature_category_id'].max() + 1, step=1))
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('feature_category_id')
ax.set_ylabel('number of non-null values')
ax.set_title('Coverage of each feature category')
plt.show()

73 features is not that many, so let's see what each of them looks like. Display the count / min / max / std for each feature. You should see that one feature is useless; remove it!

In [11]:
# Summary statistics of feature_value_id per feature category, ordered by ascending count.
items.groupby("feature_category_id")["feature_value_id"].describe().sort_values(by=["count"])

In [12]:
# Discard every row that describes feature category 27.
is_feature_27 = items["feature_category_id"] == 27
items = items.drop(index=items.loc[is_feature_27].index)

Even if 73 features is not a lot, it is still a large number given the number of samples: a dense matrix may be too big for your poor laptop. It is also good practice to preprocess the data into a set of dense embeddings, because most ML algorithms do not handle sparse input. We want to do it now. Use TruncatedSVD from scikit-learn with n_components = 12 (or fewer, depending on your memory) and compute embeddings for your items.

In [13]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import coo_matrix


# Sparse item/feature matrix: row index = item_id, column index = feature_category_id,
# stored value = feature_value_id. No shape is passed, so it is inferred from the
# largest row and column indices.
items_matrix = coo_matrix((items["feature_value_id"], (items["item_id"], items["feature_category_id"])))
items_matrix

In [14]:
# Reduce the sparse item/feature matrix to 12 dense components per item.
# TruncatedSVD's default solver is randomized, so the seed is pinned to get the
# same embeddings on every "Restart & Run All".
svd = TruncatedSVD(n_components=12, random_state=42)
svd.fit(items_matrix)
# One row per matrix row (i.e. per item_id value), one column per component.
df = pd.DataFrame(svd.transform(items_matrix))
df.shape

In [15]:
# The row position of the embedding frame is the matrix row index, which was
# built from item_id, so expose it as an explicit column for later merges.
df["item_id"] = df.index

You now have a matrix of dimension (28144, n_components) representing the items.

In [16]:
# Display the item embeddings (component columns plus item_id).
df

# Embedding sessions

Sessions are defined by a series of items, we first want to see what they look like to be able to find a suitable representation for them.

My advice: start by sampling the train_sessions dataframe to keep 5% or 10% otherwise your memory may suffer.

Then plot the distribution of session sizes: x-axis is the size of the session, y-axis is the number of sessions of this size. Determine a threshold to contain completely 90% of the sessions. This is how we will compute the session representation.

Let $k$ be the length you have picked for your sessions. Create a vectorized representation of your sessions by either concatenating the embeddings of the corresponding items, or averaging them (if memory is scarce).

In [17]:
def add_rank(sessions, max_rank=10):
    """Rank the interactions of each session by recency and keep the latest ones.

    Parameters
    ----------
    sessions : pandas.DataFrame
        Must contain the columns ``session_id`` and ``date``.
    max_rank : int, default 10
        Number of most recent interactions to keep per session.

    Returns
    -------
    pandas.DataFrame
        A new frame with an extra ``rank`` column (1.0 = latest date in the
        session, ties broken by row order), restricted to rows with
        ``rank <= max_rank``. The input frame is left untouched.
    """
    recency = sessions.groupby('session_id')['date'].rank(method="first", ascending=False)
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a "rank" column as a side effect of calling this function.
    ranked = sessions.assign(rank=recency)
    return ranked[ranked["rank"] <= max_rank]

In [18]:
def embedding_session(sessions, frac=0.10, random_state=42):
    """Sample session rows, plot the session-size distribution and embed the sessions.

    Parameters
    ----------
    sessions : pandas.DataFrame
        Must contain the columns ``session_id``, ``item_id`` and ``rank``.
    frac : float, default 0.10
        Fraction of rows to sample before building the embeddings.
    random_state : int, default 42
        Seed used for the row sampling.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id``, with one column per
        (embedding component, rank) pair, flattened to tuples. Positions with
        no item are filled with 0.

    Notes
    -----
    Reads the notebook-level item embedding frame ``df`` (component columns
    plus an ``item_id`` column) and draws a bar chart as a side effect.
    """
    # NOTE(review): sample() draws individual rows, not whole sessions, so a
    # sampled session may keep only part of its items — confirm this is intended.
    sample = sessions.sample(frac=frac, random_state=random_state)
    distribution = sample.groupby('session_id').size().reset_index(name='counts')
    distribution = distribution.groupby('counts').size().reset_index(name='number')

    fig, ax = plt.subplots(figsize = (20,10))
    ax.bar(distribution['counts'], distribution['number'])
    ax.set_xticks(np.arange(1, distribution['counts'].max() + 1, step=1))
    ax.set_xlabel('session size (sampled rows per session)')
    ax.set_ylabel('number of sessions')
    ax.set_title('Distribution of session sizes')
    plt.show()

    # Pivot every embedding component of df instead of a hard-coded 12, so the
    # function keeps working when the SVD is run with fewer components.
    embedding_columns = [column for column in df.columns if column != "item_id"]
    merged = df.merge(sample, on="item_id")
    sample = merged.pivot(index="session_id", columns='rank', values=embedding_columns).fillna(0)
    # Flatten the (component, rank) MultiIndex into plain tuple column labels.
    sample.columns = sample.columns.to_flat_index()

    return sample

In [19]:
# Session embeddings for the (already ranked and truncated) training sessions.
sample = embedding_session(train_sessions)
sample

In [20]:
# Apply the same two steps to the leaderboard sessions: rank/truncate, then embed.
test_sessions = add_rank(test_leaderboard_sessions)
test_sessions = embedding_session(test_sessions)
test_sessions

# Embedding purchases
Each purchase is a single item, so we simply use its embedding. However, we also need to add negative samples so that our model does not learn only from positives (a one-class SVM can do this, but those models are not known to generalize well...). In order to generate negative samples, shuffle the item_id column of 4 copies of the purchase array. The original rows have a label of 1; the shuffled copies have a label of 0.

In [21]:
# Negative samples: four copies of the purchases whose item_id column is shuffled,
# so that each session is paired with items taken from other purchases. Without
# the shuffle the "negatives" would be exact duplicates of the positives.
negative_train_purchases = pd.concat([train_purchases] * 4)
negative_train_purchases["item_id"] = (
    negative_train_purchases["item_id"].sample(frac=1, random_state=42).to_numpy()
)
negative_train_purchases["y"] = 0
# NOTE(review): a shuffled item can coincide with the session's real purchase,
# which labels a true pair as 0 — filter those rows if the label noise matters.
train_purchases["y"] = 1
all_purchases = pd.concat([train_purchases, negative_train_purchases])

In [22]:
# Attach the item embedding to every purchase row. df carries item_id as a regular
# column, so merging on it avoids the mixed index/column join whose output column
# names (item_id, item_id_x, item_id_y) the previous drop list depended on.
all_purchases = df.merge(all_purchases, on='item_id').drop(columns=['item_id', 'date'])
all_purchases

In [23]:
# Release the intermediate purchase frames.
# NOTE(review): cells that read train_purchases cannot be re-run after this
# without first re-running the cell that loads it.
del train_purchases, negative_train_purchases

# Merge everything

We merge the whole dataset together to fit a model. For each session we concatenate:
* The history of the user
* The purchase embedding, with the label

We will have our training set!

In [24]:
# Join each leaderboard session embedding with the purchase rows that share its session_id.
df_test = test_sessions.merge(all_purchases, on='session_id')
y_test = df_test['y']
# NOTE(review): X_test and y_test are reassigned by the train_test_split call
# further down, so the values built here are not the ones the model is scored on.
X_test = df_test.drop("y", axis=1)

In [25]:
# Join each training session embedding with its labelled purchase rows.
df_train = sample.merge(all_purchases, on='session_id')
y = df_train['y']
# Besides the label, drop the session identifier when the merge exposes it as a
# column: it is only the join key and carries no signal the model should learn.
X = df_train.drop(columns=["y", "session_id"], errors="ignore")

# Learn a model and predict on test

This is it. You now have a dataset that you can directly pass to a LogisticRegression. For the testing part, perform your prediction on all candidates and pick the hundred that have the highest scores.

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [27]:
# 80/20 split of the merged training frame. This replaces the X_test / y_test
# previously built from the leaderboard sessions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [28]:
# Fit a logistic regression on the training split.
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
# NOTE: despite its name, y_pred_train holds the predictions for X_test (the held-out split).
y_pred_train = clf.predict(X_test)

In [29]:
from sklearn.metrics import (average_precision_score,
                             confusion_matrix,
                             ConfusionMatrixDisplay,
                             precision_score,
                             classification_report,
                             recall_score,
                             f1_score,
                             accuracy_score,
                             RocCurveDisplay,
                             PrecisionRecallDisplay)

In [30]:
print('Accuracy: ' + str(accuracy_score(y_test, y_pred_train)))
# Average precision summarises the precision/recall curve over all thresholds, so
# it needs a continuous score per sample; hard 0/1 predictions collapse the curve
# to a single point. Use the predicted probability of the positive class (label 1).
positive_column = list(clf.classes_).index(1)
y_score = clf.predict_proba(X_test)[:, positive_column]
print('Average precision score: ' + str(average_precision_score(y_test, y_score)))
print(classification_report(y_test, y_pred_train))

# Confusion matrix of the hard predictions on the held-out split.
cm = confusion_matrix(y_test, y_pred_train, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=clf.classes_)
disp.plot()
disp.ax_.set_title("confusion matrix");

# Going further

Your next task is to get an RNN running on this dataset. Take a look at [this notebook](https://github.com/oakfr/intro-to-reco/blob/master/application/part_2/RNN-next-item-prediction.ipynb) and adapt it to your use case!

You are now ready to start the project. Take a look at the packages [surprise](http://surpriselib.com/) and [Microsoft recommenders](https://github.com/microsoft/recommenders) to find models available out of the box.

In [31]:
EPOCHS = 1
BATCH_SIZE = 10000

# Rebuild the held-out 20% with exactly the arguments of the earlier train/test
# split (same data, same seed, hence the same rows), then halve it into validation
# and test sets. Splitting the full X, y again would put rows of X_train into both
# X_val and X_test and leak training data into the evaluation.
_, X_holdout, _, y_holdout = train_test_split(X, y, test_size=0.20, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.50, random_state=42)

In [32]:
# Shapes of the tabular splits before reshaping.
print(X_train.shape)
print(X_test.shape)

# Add a trailing axis of size 1: each column becomes one timestep carrying a
# single feature, giving the 3-D (samples, timesteps, features) input the LSTM takes.
X_valRNN = X_val.values.reshape((X_val.shape[0], X_val.shape[1], 1))
X_testRNN = X_test.values.reshape((X_test.shape[0], X_test.shape[1], 1))

print(X_val.shape)

print(X_valRNN.shape)
print(X_testRNN.shape)

In [33]:
# RNN with keras with X_train as input and y_train as output
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout

# Reshape the training split to (samples, timesteps, 1) for the recurrent layers.
X_trainRNN = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))

model = Sequential()

# First recurrent layer returns the full sequence so a second LSTM can be stacked on it.
model.add(LSTM(128, input_shape=(X_trainRNN.shape[1:]), return_sequences=True, activation='relu'))
model.add(Dropout(0.2))

model.add(LSTM(128, activation='relu'))

model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
# The labels are binary (0 = shuffled negative, 1 = real purchase), so the network
# must end in a single sigmoid unit; a 32-unit ReLU output is not a probability
# distribution and cannot be paired with a categorical cross-entropy loss.
model.add(Dense(1, activation='sigmoid'))

# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# fit the keras model on the dataset
# Trains for EPOCHS passes in batches of BATCH_SIZE rows, reporting metrics on the validation arrays.
model.fit(X_trainRNN, y_train, epochs=EPOCHS, validation_data=(X_valRNN, y_val), batch_size=BATCH_SIZE)


In [None]:
# evaluate the keras model
# evaluate() returns the loss followed by the compiled metrics; only the accuracy is kept.
_, accuracy = model.evaluate(X_testRNN, y_test)
print('Accuracy: %.2f' % (accuracy * 100))