In [None]:
# default_exp models.spop

# S-Pop
> Session Popularity

Session popularity predictor that gives higher scores to items that occur more often in the current session. Ties are broken by adding a popularity score derived from the item's overall support.

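The score is given by $r_{s,i} = supp_{s,i} + \frac{supp_i}{(1+supp_i)}$, where $supp_{s,i}$ is the number of times item $i$ occurs in session $s$ and $supp_i$ is the number of occurrences of item $i$ in the training data. Because the second term is always below 1, it can only reorder items that have the same in-session count.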

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
import numpy as np
import pandas as pd

In [None]:
#export
class SessionPop:
    '''
    SessionPop(top_n=1000, session_key='SessionId', item_key='ItemId')
    Session popularity predictor that gives higher scores to items with a higher number of occurrences in the session.
    Ties are broken by adding the popularity score of the item.
    The score is given by:
    .. math::
        r_{s,i} = supp_{s,i} + \\frac{supp_i}{(1+supp_i)}
    Parameters
    --------
    top_n : int
        Only the top N most popular training items keep a non-zero popularity score. Should be higher or equal than the cut-off of your evaluation. (Default value: 1000)
    session_key : string
        The header of the session IDs in the training data. Stored on the instance but not read by fit or predict_next. (Default value: 'SessionId')
    item_key : string
        The header of the item IDs in the training data. (Default value: 'ItemId')
    '''
    def __init__(self, top_n = 1000, session_key = 'SessionId', item_key = 'ItemId'):
        self.top_n = top_n
        self.item_key = item_key
        self.session_id = session_key

    def fit(self, data):
        '''
        Trains the predictor.
        Parameters
        --------
        data: pandas.DataFrame
            Training data. It contains the transactions of the sessions.
            Only the item ID column (named by item_key) is read.
        '''
        # Every distinct item seen in training, in order of first appearance;
        # predict_next scores exactly these items.
        self.items = data[self.item_key].unique()
        item_counts = data.groupby(self.item_key).size()
        # supp_i / (1 + supp_i) is strictly below 1, so it can only break ties
        # between items that have the same in-session count.
        popularity = item_counts / (item_counts + 1)
        self.pop_list = popularity.sort_values(ascending=False).head(self.top_n)
        self.prev_session_id = -1

    def predict_next(self, last_items, k):
        '''
        Gives the k best-scoring items on how likely they are to be the next item in the session.
        Parameters
        --------
        last_items : list of items clicked in current session
        k : number of items to recommend and evaluate based on it
        Returns
        --------
        out : numpy.ndarray
            IDs of the k highest-scoring training items, best first.
        '''
        # supp_{s,i}: how many times each item occurs in the current session.
        session_counts = {}
        for item in last_items:
            session_counts[item] = session_counts.get(item, 0) + 1

        # Popularity part: items outside the top_n list contribute 0.
        scores = self.pop_list.reindex(self.items, fill_value=0.0).to_numpy(dtype=float)

        # Session part: skipped for an empty session so no empty, untyped
        # Series has to be built. Session items unseen in training are ignored
        # because only self.items are scored.
        if session_counts:
            session_support = pd.Series(session_counts, dtype=float)
            scores = scores + session_support.reindex(self.items, fill_value=0.0).to_numpy(dtype=float)

        series = pd.Series(data=scores, index=self.items)
        # Normalise by the best score, but never divide by zero: an all-zero
        # score vector would otherwise turn into NaN and upset the ranking.
        best = series.max()
        if best > 0:
            series = series / best
        return series.nlargest(k).index.values

In [None]:
import os
import time
import argparse
import pandas as pd
from recohut.utils.common_utils import download_url

In [None]:
# Folder that receives the dataset files; the evaluation cell reuses it as the
# default of its --data_folder option.
data_root = '/content/data'
# Fetch the training and validation files into data_root. Per the recorded output,
# each call logs "Downloading <url>" and the final call evaluates to the local
# path of the validation file, which the notebook displays as the cell result.
download_url('https://github.com/RecoHut-Datasets/yoochoose/raw/v4/yoochoose_train.txt', data_root)
download_url('https://github.com/RecoHut-Datasets/yoochoose/raw/v4/yoochoose_valid.txt', data_root)

Downloading https://github.com/RecoHut-Datasets/yoochoose/raw/v4/yoochoose_train.txt
Downloading https://github.com/RecoHut-Datasets/yoochoose/raw/v4/yoochoose_valid.txt


'/content/data/yoochoose_valid.txt'

In [None]:
# Command-line style configuration; parse_args([]) below means the defaults are
# always used when this cell runs inside the notebook.
parser = argparse.ArgumentParser()
parser.add_argument('--K', type=int, default=20, help="K items to be used in Recall@K and MRR@K")
parser.add_argument('--topn', type=int, default=100, help="Number of top items to return non zero scores for them (most popular)")
parser.add_argument('--itemid', default='sid', type=str)
parser.add_argument('--sessionid', default='uid', type=str)
parser.add_argument('--valid_data', default='yoochoose_valid.txt', type=str)
parser.add_argument('--train_data', default='yoochoose_train.txt', type=str)
parser.add_argument('--data_folder', default=data_root, type=str)

# Get the arguments
args = parser.parse_args([])
train_data = os.path.join(args.data_folder, args.train_data)
x_train = pd.read_csv(train_data)
valid_data = os.path.join(args.data_folder, args.valid_data)
x_valid = pd.read_csv(valid_data)
# Group the rows of each session together. A stable sort keeps the rows of a
# session in their file order; the default quicksort may shuffle them, which
# changes which item counts as "next".
# NOTE(review): assumes the file lists each session's events chronologically - confirm.
x_valid = x_valid.sort_values(args.sessionid, kind='mergesort')

print('Finished Reading Data \nStart Model Fitting...')
# Fitting the S-Pop model
t1 = time.time()
model = SessionPop(top_n = args.topn, session_key = args.sessionid, item_key = args.itemid)
model.fit(x_train)
t2 = time.time()
print('End Model Fitting with total time =', t2 - t1, '\n Start Predictions...')

# Test Set Evaluation
test_size = 0.0   # number of predictions made
hit = 0.0         # predictions whose target appeared in the top K
MRR = 0.0         # running sum of reciprocal ranks of the hits
cur_session = -1
last_items = []
t1 = time.time()
index_item = x_valid.columns.get_loc(args.itemid)
index_session = x_valid.columns.get_loc(args.sessionid)
# A set gives constant-time membership tests; scanning the NumPy array of
# training items for every validation row is linear in the catalogue size.
train_items = set(model.items.tolist())
counter = 0
for counter, row in enumerate(x_valid.itertuples(index=False), start=1):
    if counter % 5000 == 0:
        print('Finished Prediction for ', counter, 'items.')
    session_id, item_id = row[index_session], row[index_item]
    if session_id != cur_session:
        # A new session starts: forget the previous session's history.
        cur_session = session_id
        last_items = []

    # Items never seen in training can be neither predicted nor used as history.
    if item_id not in train_items:
        continue

    # The first known item of a session only seeds the history; every later
    # one is a prediction target given the items before it.
    if last_items:
        test_size += 1
        predictions = model.predict_next(last_items, k = args.K)
        # Evaluation: 1-based rank of the true next item within the top K.
        for rank, predicted_item in enumerate(predictions, start=1):
            if predicted_item == item_id:
                hit += 1.0
                MRR += 1/rank
                break

    last_items.append(item_id)
t2 = time.time()
# Without any prediction the metrics are undefined; report NaN instead of
# raising ZeroDivisionError.
recall_at_k = hit / test_size if test_size else float('nan')
mrr_at_k = MRR / test_size if test_size else float('nan')
print('Recall: {}'.format(recall_at_k))
print ('\nMRR: {}'.format(mrr_at_k))
print('End Model Predictions with total time =', t2 - t1)

Finished Reading Data 
Start Model Fitting...
End Model Fitting with total time = 0.10341858863830566 
 Start Predictions...
Finished Prediction for  5000 items.
Recall: 0.313485342019544

MRR: 0.11998186799961241
End Model Predictions with total time = 33.76607871055603


> **References:**
- [https://github.com/mmaher22/iCV-SBR/tree/master/Source Codes/S-POP_Python](https://github.com/mmaher22/iCV-SBR/tree/master/Source%20Codes/S-POP_Python)

In [None]:
#hide
# Reload the watermark extension and print an environment report. Per the recorded
# output this covers the author, last-updated timestamp, machine details and the
# versions of the imported packages.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2022-01-01 06:12:09

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pandas  : 1.1.5
numpy   : 1.19.5
argparse: 1.1
IPython : 5.5.0

