In [10]:
import numpy as np
import pandas as pd
import scipy.sparse as sps
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import sys
sys.path.append("/Users/alessiorussointroito/Documents/GitHub/Structural-Perturbation-Method")

#from SPM_fast import SPM
from BiSPM import BiSPM

import os
os.environ["CC"] = "g++-10"

import similaripy as sim

In [11]:
# Evaluation

def precision(is_relevant, relevant_items):
    # is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)
    return precision_score


def recall(is_relevant, relevant_items):
    # is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    recall_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)
    return recall_score


def MAP(is_relevant, relevant_items):
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    map_score = np.sum(p_at_k) / np.min([len(relevant_items), len(is_relevant)])
    return map_score

In [12]:
n_svs = 100
k = 10
p = 0.3

In [13]:
df = pd.read_csv(
    "/Users/alessiorussointroito/Downloads/Telegram Desktop/recommender-system-2020-challenge-polimi/data_train.csv")

le = LabelEncoder()
df['new_col'] = le.fit_transform(df.col)

row_size = len(df.row.unique())
col_size = len(le.classes_)

X_train, X_test, y_train, y_test = train_test_split(df.row, df.new_col, test_size=0.20, random_state=3)

bip_adj = sps.csr_matrix((np.ones(X_train.shape[0]), (X_train, y_train)), shape=(row_size, col_size))

test_indices = X_test.unique()
test_indices = np.sort(test_indices)
test = pd.DataFrame({'row': X_test, 'target': y_test})

targets = test.groupby(test.row)['target'].apply(lambda x: list(x))

In [72]:
bspm = BiSPM(urm, target=test_indices, n_sv=n_svs, p=p)

rankings = bspm.k_runs(k=k)

Computing Perturbed B:   0%|          | 0/10 [00:19<?, ?it/s]


KeyboardInterrupt: 

In [71]:
n_users = len(test_indices)

cumulative_precision = 0.0
cumulative_recall = 0.0
cumulative_MAP = 0.0
num_eval = 0

at = 10

for i, user_id in enumerate(tqdm(test_indices)):
    relevant_items = targets[user_id]
    #recommended_items = rankings[user_id].argsort()[::-1][:at]
    recommended_items = rankings[i]

    # Filter Seen:
    # 1. Remove items already seen by the user
    seen_indices = sps.find(bip_adj[user_id])[1]    # Con [1] Prendiamo solo le colonne d'interesse della matrice di adiacenza
    mask = np.zeros(bip_adj.shape[1], dtype=bool)
    mask[seen_indices] = True
    
    recommended_items[mask] = -np.inf

    # Recommend
    recommended_items = recommended_items.argsort()[::-1][:at]

    num_eval += 1

    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

    cumulative_precision += precision(is_relevant, relevant_items)
    cumulative_recall += recall(is_relevant, relevant_items)
    cumulative_MAP += MAP(is_relevant, relevant_items)

cumulative_precision /= num_eval
cumulative_recall /= num_eval
cumulative_MAP /= num_eval

print(f"SPM n_eigen = {n_svs} , k = {k}  \n Precision = {cumulative_precision} \n Recall = {cumulative_recall} \n MAP = {cumulative_MAP}")

100%|██████████| 5644/5644 [00:15<00:00, 364.09it/s]

SPM n_eigen = 100 , k = 10  
 Precision = 0.01879872430900057 
 Recall = 0.01879872430900057 
 MAP = 0.02405776068236307





In [73]:
bspm = BiSPM(urm, target=test_indices, n_sv=2, p=p)
rankings = bspm.k_runs(k=2)

Computing Perturbed B: 100%|██████████| 2/2 [00:09<00:00,  4.52s/it]


In [69]:
urm = sim.normalization.bm25(bip_adj)

# Variance

In [62]:
n_components = 100
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=n_components, n_iter=7, random_state=42)
svd.fit(bip_adj)
svd.explained_variance_ratio_.sum() # Percentage of variance explained by each singular value

0.26483175906359896

## Variance of SVD (in my case)

In [24]:
%load_ext Cython

In [50]:
%%cython

from math import sqrt
from tqdm import tqdm

import numpy as np
cimport numpy as np

import cython
cimport cython

ctypedef np.float64_t DTYPE_t

@cython.boundscheck(False)
@cython.wraparound(False)
def outer_prod(int index,
               DTYPE_t[:] u,
               DTYPE_t s,
               DTYPE_t[:] v,
               DTYPE_t[:,:] out):
    
    cdef int n = u.shape[0]
    cdef int m = v.shape[0]
    cdef int i
    cdef int j
    cdef float tmp_u_i
    
    for i in range(n):
        tmp_u_i = u[i] * s
        for j in range(m):
            out[i,j] = tmp_u_i * v[j]
            
    #return out

In [59]:
k_components = 100
u,s,vt = sps.linalg.svds(urm, k=k_components)

In [64]:
total_var = 0.0
norm = sps.linalg.norm(urm)
out = np.zeros((7947, 24896), dtype=np.float64)
norm_i = np.zeros(k_components, dtype=np.float64)

for i in tqdm(range(k_components)):
    outer_prod(i, 
           u[:, i].astype(np.float64),
           s[i].astype(np.float64),
           vt[i].astype(np.float64),
           out)
    norm_i[i] = np.linalg.norm(out)
    
total_var = np.square(norm_i).sum() / norm**2
total_var

100%|██████████| 100/100 [00:36<00:00,  2.77it/s]


0.06318189863678542

In [16]:
from sklearn.utils.extmath import randomized_svd

s,v,d = randomized_svd(bip_adj, 2000, random_state=0)

In [17]:
print(s.shape)
print(v.shape)
print(d.shape)

(7947, 2000)
(2000,)
(2000, 24896)


In [10]:
u,s,vt = sps.linalg.svds(bip_adj, k=1000)

In [12]:
print(u.shape)
print(s.shape)
print(vt.shape)

(7947, 1000)
(1000,)
(1000, 24896)


In [7]:
group = df.groupby("row")['col'].apply(lambda x: list(x))

In [8]:
group

row
0                                          [10080, 19467]
1                [2665, 7494, 17068, 17723, 18131, 20146]
2                                          [19337, 21181]
3                                          [18736, 23037]
4          [477, 6927, 10204, 13707, 18999, 19838, 19851]
                              ...                        
7942                   [7830, 20122, 21662, 22275, 24405]
7943                                  [7883, 7888, 11754]
7944    [182, 259, 265, 385, 415, 426, 1026, 1224, 150...
7945                            [817, 2476, 12319, 21384]
7946                                        [8699, 19178]
Name: col, Length: 7947, dtype: object