In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


In [0]:
train_link_1="https://drive.google.com/open?id=1vx9pDj-Av55AlUe10Bn3DjxDqzHEdl-m"
initial1, train_id1 = train_link_1.split('=')

downloaded1 = drive.CreateFile({'id':train_id1})
downloaded1.GetContentFile('sessions.csv')

In [0]:
import numpy as np
import pandas as pd

**Loading and precprocessing the data:**

In [0]:
import os
import calendar
import time
import datetime
def load_and_adapt(path, last_months=0):
    data = pd.read_csv(path, header=0)
    col_names = ['session_id', 'user_id', 'item_id', 'ts'] + data.columns.values.tolist()[4:]
    data.columns = col_names

    if last_months > 0:
        def add_months(sourcedate, months):
            month = sourcedate.month - 1 + months
            year = int(sourcedate.year + month / 12)
            month = month % 12 + 1
            day = min(sourcedate.day, calendar.monthrange(year, month)[1])
            return datetime.date(year, month, day)

        lastdate = datetime.datetime.fromtimestamp(data.ts.max())
        firstdate = add_months(lastdate, -last_months)
        initial_unix = time.mktime(firstdate.timetuple())

        # filter out older interactions
        data = data[data['ts'] >= initial_unix]

    return data

In [0]:
from collections import Counter

topk = 1000
file = load_and_adapt("sessions.csv", last_months=1)

c = Counter(list(file['item_id']))

if topk > 1:
    keeper = set([x[0] for x in c.most_common(topk)])
    file = file[file['item_id'].isin(keeper)]

# group by session id and concat song_id
groups = file.groupby('session_id')

# convert item ids to string, then aggregate them to lists
aggregated = groups['item_id'].aggregate([lambda x: list(map(str, x))])
init_ts = groups['ts'].min()
users = groups['user_id'].min()  

result = aggregated.join(init_ts).join(users)
result.reset_index(inplace=True)

In [6]:
result = result.rename(columns = {"<lambda>":"sequence"})
result

Unnamed: 0,session_id,sequence,ts,user_id
0,122,"[1762, 3700, 638]",1420059172,2432
1,223,"[3772, 3953]",1419418147,15861
2,226,"[245, 1271, 379]",1419433841,15861
3,243,"[245, 1197, 4307, 3868]",1421674741,15861
4,245,"[409, 234, 2334, 2431, 231, 4738, 219, 2403]",1421679507,15861
...,...,...,...,...
65875,2764052,"[419, 930, 419, 908, 3493, 5294, 5297, 5299, 5...",1421508739,4503
65876,2764159,"[528, 6475]",1421059220,12934
65877,2764161,"[6349, 2803]",1421141469,12934
65878,2764164,"[1485, 5733, 1482, 2445, 915]",1421430665,12934


**Splitting the data into Train and Test Datasets:**

In [0]:
def last_session_out_split(data, user_key='user_id', session_key='session_id', time_key='ts'):
    sessions = data.sort_values(by=[user_key, time_key]).groupby(user_key)[session_key]
    last_session = sessions.last()
    train = data[~data.session_id.isin(last_session.values)].copy()
    test = data[data.session_id.isin(last_session.values)].copy()
    train, test = clean_split(train, test)
    return train, test


def clean_split(train, test):
    train_items = set()
    train['sequence'].apply(lambda seq: train_items.update(set(seq)))
    test['sequence'] = test['sequence'].apply(lambda seq: [it for it in seq if it in train_items])
    return train, test

In [8]:
train_data, test_data = last_session_out_split(result)
print("Train sessions: {} - Test sessions: {}".format(len(train_data), len(test_data)))

Train sessions: 48068 - Test sessions: 17812


**Training the model:**

In [0]:

import gensim

class Prod2VecRecommender():

    def __init__(self, min_count=2, size=100, window=5, decay_alpha=0.9, workers=4):
        #super(Prod2VecRecommender, self).__init__()
        self.min_count = min_count
        self.size = size
        self.window = window
        self.decay_alpha = decay_alpha
        self.workers = workers

    def __str__(self):
        return 'Prod2VecRecommender(min_count={min_count}, ' \
               'size={size}, ' \
               'window={window}, ' \
               'decay_alpha={decay_alpha}, ' \
               'workers={workers})'.format(**self.__dict__)

    def fit(self, train_data):
        sequences = train_data['sequence'].values
        self.model = gensim.models.Word2Vec(sequences, min_count=self.min_count, window=self.window, hs=1, size=self.size, sg=1, workers=self.workers)

    def recommend(self, user_profile, user_id=None):
        user_profile = list(map(str, user_profile))
        rec = []
        try:
            # iterate the user profile backwards
            for i, item in enumerate(user_profile[::-1]):
                ms = self.model.most_similar(positive=item)
                # apply exponential decay to the similarity scores
                decay = self.decay_alpha ** i
                ms = [(x[0], decay * x[1]) for x in ms]
                rec.extend(ms)
            # sort items by similarity score
            rec = sorted(rec, key=lambda x: -x[1])
        except KeyError:
            rec = []
        return [([x[0]], x[1]) for x in rec]

    def get_recommendation_list(self, recommendation):
        return list(map(lambda x: x[0], recommendation))

    def get_recommendation_confidence_list(self, recommendation):
        return list(map(lambda x: x[1], recommendation))


In [0]:
recommender = Prod2VecRecommender(min_count=2, size=50, window=5, decay_alpha=0.9, workers=4)
recommender.fit(train_data)

**Evaluation:**

In [0]:
GIVEN_K = 1
LOOK_AHEAD = 1
STEP = 1

In [12]:
test_sequences = test_data.loc[test_data['sequence'].map(len) > abs(GIVEN_K), 'sequence'].values
print('{} sequences available for evaluation'.format(len(test_sequences)))


11776 sequences available for evaluation


In [0]:
from tqdm import tqdm    #showing the progress bar
#evaluation method defined 
def sequential_evaluation(recommender, test_sequences, evaluation_functions, users=None, given_k=1, look_ahead=1, top_n=10, scroll=True, step=1):
    if given_k == 0:
        raise ValueError('given_k must be != 0')

    metrics = np.zeros(len(evaluation_functions))
    with tqdm(total=len(test_sequences)) as pbar:
        for i, test_seq in enumerate(test_sequences):
            if users is not None:
                user = users[i]
            else:
                user = None
            if scroll:
                metrics += sequence_sequential_evaluation(recommender, test_seq, evaluation_functions, user, given_k, look_ahead, top_n, step)
            else:
                metrics += evaluate_sequence(recommender, test_seq, evaluation_functions, user, given_k, look_ahead, top_n)
            pbar.update(1)
    return metrics / len(test_sequences)


def evaluate_sequence(recommender, seq, evaluation_functions, user, given_k, look_ahead, top_n):
    # safety checks
    if given_k < 0:
        given_k = len(seq) + given_k

    user_profile = seq[:given_k]
    ground_truth = seq[given_k:]

    # restrict ground truth to look_ahead
    ground_truth = ground_truth[:look_ahead] if look_ahead != 'all' else ground_truth
    ground_truth = list(map(lambda x: [x], ground_truth))  # list of list format

    if not user_profile or not ground_truth:
        # if any of the two missing all evaluation functions are 0
        return np.zeros(len(evaluation_functions))

    r = recommender.recommend(user_profile, user)[:top_n]

    if not r:
        # no recommendation found
        return np.zeros(len(evaluation_functions))
    reco_list = recommender.get_recommendation_list(r)

    tmp_results = []
    for f in evaluation_functions:
        tmp_results.append(f(ground_truth, reco_list))
    return np.array(tmp_results)
  
def sequence_sequential_evaluation(recommender, seq, evaluation_functions, user, given_k, look_ahead, top_n, step):
    if given_k < 0:
        given_k = len(seq) + given_k

    eval_res = 0.0
    eval_cnt = 0
    for gk in range(given_k, len(seq), step):
        eval_res += evaluate_sequence(recommender, seq, evaluation_functions, user, gk, look_ahead, top_n)
        eval_cnt += 1
    return eval_res / eval_cnt

In [0]:
#metrics defined
def precision(ground_truth, prediction):
    ground_truth = remove_duplicates(ground_truth)
    prediction = remove_duplicates(prediction)
    precision_score = count_a_in_b_unique(prediction, ground_truth) / float(len(prediction))
    assert 0 <= precision_score <= 1
    return precision_score


def recall(ground_truth, prediction):
    ground_truth = remove_duplicates(ground_truth)
    prediction = remove_duplicates(prediction)
    if len(prediction) == 0:
      recall_score = 0  
    else:
      recall_score = count_a_in_b_unique(prediction, ground_truth) / float(len(ground_truth))
    assert 0 <= recall_score <= 1
    return recall_score


def mrr(ground_truth, prediction):
    rr = 0
    for rank, p in enumerate(prediction):
        if p in ground_truth:
            rr = 1 / (rank + 1)
            break
    return rr

def prec_r(ground_truth, prediction):
    ground_truth = remove_duplicates(ground_truth)
    prediction = remove_duplicates(prediction)
    pt = prediction[0:len(ground_truth)]
    pr = count_a_in_b_unique(pt, ground_truth) / float(len(pt))
    assert 0 <= pr <= 1
    return pr


def count_a_in_b_unique(a, b):

    #returns number of elements of a in b
    
    count = 0
    for el in a:
        if el in b:
            count += 1
    return count
    
def remove_duplicates(l):
    return [list(x) for x in set(tuple(x) for x in l)]

In [0]:
METRICS = {'Precision':precision, 
           'Recall':recall,
           'MRR': mrr,
           'r-Precision':prec_r}


**Evaluation for Sequentially revealed user profiles:**

In [16]:
TOPN = 10 
STEP = 5
r11 = sequential_evaluation(recommender, 
                            test_sequences=test_sequences, 
                            given_k=GIVEN_K, look_ahead=LOOK_AHEAD, evaluation_functions=METRICS.values(), top_n=TOPN, scroll=True, step=STEP)

  if np.issubdtype(vec.dtype, np.int):
100%|██████████| 11776/11776 [00:15<00:00, 778.30it/s]


In [17]:
met = list(METRICS.keys())
print("Sequentially revealed user profiles:")
for i in range(len(r11)):
  print(met[i]," = ",r11[i])

Sequentially revealed user profiles:
Precision  =  0.04570258808005073
Recall  =  0.4384538414231781
MRR  =  0.2045028013831103
r-Precision  =  0.05199296884004565


**Evaluation for next-item recommendAation with varying recommendation list lengths:**

In [0]:
topn_list = [1, 5, 10, 20, 50, 100]

In [19]:
res_list = []

for topn in topn_list:
    print('Evaluating recommendation lists with length: {}'.format(topn))
    res_tmp = sequential_evaluation(recommender,
                                               test_sequences=test_sequences,
                                               given_k=GIVEN_K,
                                               look_ahead=LOOK_AHEAD,
                                               evaluation_functions=METRICS.values(),
                                               top_n=topn,
                                               scroll=True,  # here we average over all profile lengths
                                               step=STEP)
    mvalues = list(zip(METRICS.keys(), res_tmp))
    res_list.append((topn, mvalues))

  if np.issubdtype(vec.dtype, np.int):
  0%|          | 33/11776 [00:00<00:38, 303.14it/s]

Evaluating recommendation lists with length: 1


100%|██████████| 11776/11776 [00:14<00:00, 811.27it/s]
  1%|          | 62/11776 [00:00<00:19, 615.70it/s]

Evaluating recommendation lists with length: 5


100%|██████████| 11776/11776 [00:14<00:00, 805.99it/s]
  0%|          | 55/11776 [00:00<00:21, 549.95it/s]

Evaluating recommendation lists with length: 10


100%|██████████| 11776/11776 [00:15<00:00, 783.46it/s]
  0%|          | 51/11776 [00:00<00:23, 508.61it/s]

Evaluating recommendation lists with length: 20


100%|██████████| 11776/11776 [00:15<00:00, 783.98it/s]
  0%|          | 33/11776 [00:00<00:36, 325.55it/s]

Evaluating recommendation lists with length: 50


100%|██████████| 11776/11776 [00:15<00:00, 766.79it/s]
  0%|          | 49/11776 [00:00<00:24, 485.71it/s]

Evaluating recommendation lists with length: 100


100%|██████████| 11776/11776 [00:15<00:00, 747.21it/s]


In [20]:
for i in res_list:
  print("For list size = ",i[0],":")
  for j in i[1]:
    print(j[0]," = ",j[1])
    

For list size =  1 :
Precision  =  0.10016255922785647
Recall  =  0.10016255922785647
MRR  =  0.10016255922785647
r-Precision  =  0.10016255922785647
For list size =  5 :
Precision  =  0.07160217618062927
Recall  =  0.35609378671688835
MRR  =  0.19320593116169596
r-Precision  =  0.07577799148010507
For list size =  10 :
Precision  =  0.04570258808005073
Recall  =  0.4384538414231781
MRR  =  0.2045028013831103
r-Precision  =  0.05199296884004565
For list size =  20 :
Precision  =  0.043975755630879125
Recall  =  0.44593918471439803
MRR  =  0.20502657312821992
r-Precision  =  0.04801187907006319
For list size =  50 :
Precision  =  0.04216252395606706
Recall  =  0.45396725605669763
MRR  =  0.20528269003256003
r-Precision  =  0.046284018973487086
For list size =  100 :
Precision  =  0.04133120672150731
Recall  =  0.4573902050873502
MRR  =  0.20533919209026832
r-Precision  =  0.045949924639944725
