## Application of BPR on Movielens

In [1]:
#%load_ext watermark
%load_ext autoreload 
%autoreload 2

In [2]:
import os
import sys
import itertools
import numpy as np
import pandas as pd
from subprocess import call
# import pybpr
import matplotlib.pyplot as plt
from functools import partial

In [3]:
import torch

class TinyModel(torch.nn.Module):

    def __init__(self):
        super(TinyModel, self).__init__()

        self.linear1 = torch.nn.Linear(100, 200)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(200, 10)
        self.softmax = torch.nn.Softmax()

    def forward(self, x):
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = self.softmax(x)
        return x

tinymodel = TinyModel()

In [5]:
isinstance(tinymodel)

__main__.TinyModel

In [3]:
fpath = os.path.join(DATA_DIR, 'movie_dataset_public_final', 'processed', 'features_r.csv')
df = pd.read_csv(fpath, engine='python')
df.head()

NameError: name 'DATA_DIR' is not defined

In [None]:
df.info()

In [None]:
df[df.item_id==2571]

In [None]:
df.tag.value_counts()

In [None]:
df.item_id.value_counts()

In [None]:
df.targets.value_counts()

In [None]:
df.tag_exists.value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(12,4))
for col in ['log_IMDB', 'log_IMDB_nostem','rating_similarity','avg_rating','lsi_tags_75', 'lsi_imdb_175','tag_prob']:
    df[col].hist(bins=100, ax=ax, histtype='step', label=col)
# df.log_IMDB_nostem.hist(bins=100, ax=ax, histtype='step')
ax.set_xlim([-5,5])
ax.legend()

In [None]:
fpath = os.path.join(DATA_DIR, 'movie_dataset_public_final', 'processed', '10folds', 'test0.csv')
df = pd.read_csv(fpath, engine='python')
df.head()

In [None]:
df.head()

In [None]:
%%time
#df = load_movielens_data('ml-1m')
DATA_DIR = '/projects/zazzle/rsandhu/pybpr/examples/data'
fpath = os.path.join(DATA_DIR, 'ml-1m', 'ratings.dat')
df = pd.read_csv(fpath, sep='::', header=None, engine='python')
df.columns=['user_id', 'item_id', 'rating','time']
df.head()

In [None]:
fpath = os.path.join(DATA_DIR, 'ml-1m', 'movies.dat')
df = pd.read_csv(fpath, sep='::', header=None, engine='python')

In [None]:
df['user_id'] = df['user_id'].astype('category')
df['item_id'] = df['item_id'].astype('category')
df['user_idx'] = df['user_id'].cat.codes
df['item_idx'] = df['item_id'].cat.codes
df.head()

In [None]:
df.user_idx.nunique(), df.item_idx.nunique()

In [None]:
dfp = df[df['rating'] > 3].copy()
dfn = df[df['rating'] <= 3].copy()

In [None]:
pos_count = dfp.groupby('user_idx')['item_idx'].count().sort_values(ascending=False)
pos_count = pos_count[pos_count > 5]
neg_count = dfn.groupby('user_idx')['item_idx'].count().sort_values(ascending=False)
neg_count = neg_count[neg_count > 5]

In [None]:
uvec = np.intersect1d(neg_count.index.values, pos_count.index.values, assume_unique=True)
dfp = dfp[dfp.user_idx.isin(uvec)]
dfn = dfn[dfn.user_idx.isin(uvec)]

In [None]:
dfp.user_idx.nunique(), dfn.user_idx.nunique()

In [None]:
uvec = np.intersect1d(dfp.user_idx.unique(), dfn.user_idx.unique(), assume_unique=True)
dfp = dfp[dfp.user_idx.isin(uvec)]
dfn = dfn[dfn.user_idx.isin(uvec)]
ivec = np.intersect1d(dfp.item_idx.unique(), dfn.item_idx.unique(), assume_unique=True)
dfp = dfp[dfp.item_idx.isin(ivec)]
dfn = dfn[dfn.item_idx.isin(ivec)]

In [None]:
dfp['user_idxx'] = 0
dfn['user_idxx'] = 0
for i, idx in enumerate(dfp.user_idx.unique()):
    dfp.loc[dfp.user_idx==idx,'user_idxx'] = i
    dfn.loc[dfn.user_idx==idx,'user_idxx'] = i
dfp['item_idxx'] = 0
dfn['item_idxx'] = 0
for i, idx in enumerate(dfp.item_idx.unique()):
    dfp.loc[dfp.item_idx==idx,'item_idxx'] = i
    dfn.loc[dfn.item_idx==idx,'item_idxx'] = i

In [None]:
dfp.user_idxx.nunique(), dfp.user_idxx.max()
dfn.user_idxx.nunique(), dfn.user_idxx.max()

In [None]:
cfp = UserItemInteractions(
    name='MovieLens-100k',
    users=dfp['user_idxx'],
    items=dfp['item_idxx'],
    min_num_rating_per_user=0,
    min_num_rating_per_item=0,
    num_cores = 1
)
cfp.generate_train_test(user_test_ratio=0.2)
cfn = UserItemInteractions(
    name='MovieLens-100k',
    users=dfn['user_idxx'],
    items=dfn['item_idxx'],
    min_num_rating_per_user=0,
    min_num_rating_per_item=0,
    num_cores = 1
)
cfn.generate_train_test(user_test_ratio=0.2)

In [None]:
cfp.R_train, cfp.R_test, dfp.shape
cfn.R_train, cfn.R_test, dfp.shape

In [None]:
cfn.R, dfn.shape

In [None]:
np.allclose(np.sort(dfp.user_id.unique()), np.sort(dfn.user_id.unique()))
#np.allclose(np.sort(dfp.item_id.unique()), np.sort(dfn.item_id.unique()))

In [None]:
bpr = BPR(
    num_features=40,
    reg_lambda=0.00,
    num_iters=500,
    learning_rate = 0.04,
    batch_size=10,
    initial_std=0.001,
    seed=None
)

In [None]:
#bpr.fit(train_mat=cfp.R, train_mat_neg=cfn.R, ndcg_func=ndcg_fun_p)
ndcg_fun_p = partial(
    cfp.get_ndcg_metric,
    num_items=5
)
bpr.fit(
    train_mat=cfp.R_train, 
    train_mat_neg=cfn.R, 
    ndcg_func=ndcg_fun_p)
ndcg_df = pd.DataFrame(bpr.ndcg_metric)

In [None]:
bpr.fit(train_mat=cfp.R_train, ndcg_func=ndcg_fun_p)
ndcg_df2 = pd.DataFrame(bpr.ndcg_metric)

In [None]:
fig, ax = plt.subplots(1,2,figsize=(10,3))
ax[0].plot(ndcg_df['train'],'-r', label='Explicit Neg')
ax[0].plot(ndcg_df2['train'],'-b', label='AMAN')
ax[1].plot(ndcg_df['test'],'-r', label='Explicit Neg')
ax[1].plot(ndcg_df2['test'],'-b', label='AMAN')
for iax in ax:
    iax.legend()

In [None]:
fig, ax = plt.subplots(1,2,figsize=(10,3))
ax[0].plot(ndcg_df['train'],'-r', label='Explicit Neg')
ax[0].plot(ndcg_df2['train'],'-b', label='AMAN')
ax[1].plot(ndcg_df['test'],'-r', label='Explicit Neg')
ax[1].plot(ndcg_df2['test'],'-b', label='AMAN')
for iax in ax:
    iax.legend()

In [None]:
bpr.fit(train_mat=cfp.R, ndcg_func=ndcg_fun_p)

In [None]:
rec1 = cfp.get_top_items_for_this_user(10, bpr.user_mat, bpr.item_mat,5)
rec2 = cfp.get_top_items_for_this_user(10, bpr2.user_mat, bpr.item_mat,5)