In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sps
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import sys
sys.path.append("/Users/alessiorussointroito/Documents/GitHub/Structural-Perturbation-Method")

#from SPM_fast import SPM
from BiSPM import BiSPM

import similaripy as sim
import math

In [2]:
from tqdm.notebook import tqdm

In [3]:
np.random.seed(42)

In [4]:
df = pd.read_csv(
    "/Users/alessiorussointroito/Downloads/Telegram Desktop/recommender-system-2020-challenge-polimi/data_train.csv")

le = LabelEncoder()
df['new_col'] = le.fit_transform(df.col)

row_size = len(df.row.unique())
col_size = len(le.classes_)

In [5]:
group = df.groupby("row")['new_col'].apply(lambda x: list(x))

In [6]:
group = group.reset_index()

In [7]:
res = []
holdout_perc = 0.2
for l in group.new_col:
    res.append(math.ceil(len(l)*holdout_perc))

In [8]:
group['holdout'] = res

In [9]:
res = []
for l, h in zip(group.new_col, group.holdout):
    res.append(np.random.choice(l, h))
group['target'] = res

In [10]:
target_idx = np.random.choice(group.row.unique(), int(7947*0.2), replace=False)
target_idx = np.sort(target_idx)
group = group[group.row.isin(target_idx)]

In [11]:
row = []
col = []
for r, t in tqdm(zip(group.row, group.target)):
    for e in t:
        row.append(r)
        col.append(e)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [12]:
test_df = pd.DataFrame({"user_id":row, "item":col})
test_df

Unnamed: 0,user_id,item
0,15,18202
1,15,19427
2,15,19427
3,18,14901
4,18,21875
...,...,...
5101,7932,1501
5102,7936,24085
5103,7941,17998
5104,7941,7136


# Matrices Creation

In [13]:
urm_test = sps.csr_matrix((np.ones(len(row)), (row, col)), shape=(row_size, col_size))

In [14]:
urm_total = sps.csr_matrix((np.ones(df.shape[0]), (df.row, df.new_col)), shape=(row_size, col_size))

In [15]:
urm_train = urm_total - urm_test

# Model

In [16]:
test_indices = group.row.unique()

In [17]:
test_indices.shape

(1589,)

In [18]:
n_svs = 100
k = 10
p = 0.3

In [19]:
urm_train = sim.normalization.bm25(urm_train)

In [20]:
bspm = BiSPM(urm_train, target=test_indices, n_sv=n_svs, p=p)

In [21]:
rankings = bspm.k_runs(k=k)

Computing Perturbed B: 100%|██████████| 10/10 [01:38<00:00,  9.83s/it]


# Predictions

In [22]:
# Evaluation

def precision(is_relevant, relevant_items):
    # is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)
    return precision_score


def recall(is_relevant, relevant_items):
    # is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    recall_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)
    return recall_score


def MAP(is_relevant, relevant_items):
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    map_score = np.sum(p_at_k) / np.min([len(relevant_items), len(is_relevant)])
    return map_score

In [23]:
n_users = len(test_indices)

cumulative_precision = 0.0
cumulative_recall = 0.0
cumulative_MAP = 0.0
num_eval = 0

at = 10

for i, user_id in enumerate(tqdm(test_indices)):
    relevant_items = group[group.row == user_id]
    #recommended_items = rankings[user_id].argsort()[::-1][:at]
    recommended_items = rankings[i]

    # Filter Seen:
    # 1. Remove items already seen by the user
    seen_indices = sps.find(urm_train[user_id])[1]    # Con [1] Prendiamo solo le colonne d'interesse della matrice di adiacenza
    mask = np.zeros(urm_train.shape[1], dtype=bool)
    mask[seen_indices] = True
    
    recommended_items[mask] = -np.inf

    # Recommend
    recommended_items = recommended_items.argsort()[::-1][:at]

    num_eval += 1

    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

    cumulative_precision += precision(is_relevant, relevant_items)
    cumulative_recall += recall(is_relevant, relevant_items)
    cumulative_MAP += MAP(is_relevant, relevant_items)

cumulative_precision /= num_eval
cumulative_recall /= num_eval
cumulative_MAP /= num_eval

print(f"SPM n_eigen = {n_svs} , k = {k}  \n Precision = {cumulative_precision} \n Recall = {cumulative_recall} \n MAP = {cumulative_MAP}")

HBox(children=(FloatProgress(value=0.0, max=1589.0), HTML(value='')))


SPM n_eigen = 100 , k = 10  
 Precision = 0.0025173064820641924 
 Recall = 0.0025173064820641924 
 MAP = 0.011624062253388872
