In [108]:
import numpy as np
import time, sys
import scipy.sparse as sps

def check_matrix(X, format='csc', dtype=np.float32):
    if format == 'csc' and not isinstance(X, sps.csc_matrix):
        return X.tocsc().astype(dtype)
    elif format == 'csr' and not isinstance(X, sps.csr_matrix):
        return X.tocsr().astype(dtype)
    elif format == 'coo' and not isinstance(X, sps.coo_matrix):
        return X.tocoo().astype(dtype)
    elif format == 'dok' and not isinstance(X, sps.dok_matrix):
        return X.todok().astype(dtype)
    elif format == 'bsr' and not isinstance(X, sps.bsr_matrix):
        return X.tobsr().astype(dtype)
    elif format == 'dia' and not isinstance(X, sps.dia_matrix):
        return X.todia().astype(dtype)
    elif format == 'lil' and not isinstance(X, sps.lil_matrix):
        return X.tolil().astype(dtype)
    else:
        return X.astype(dtype)

class Cosine_Similarity(object):

    def __init__(self, ICM, k=100):
        self.diop = k
        self.ICM = ICM.copy()
        self._S = None

    def compute(self):
        check_matrix(self.ICM, 'csc')
        S = np.dot(self.ICM, self.ICM.transpose())
        S.setdiag(0)
        self._S = S
        self._weighted_S = S
        return S

    def assign_weights(self, w):
        self._weighted_S = np.dot(self._S, w )
        return  self._weighted_S

    def topK(self, k):

        topk_matrix = []
        values = sps.csr_matrix((self._weighted_S.shape[0], self._weighted_S.shape[1]))

        if (self._weighted_S != None):
            for row_index in range(self._weighted_S.shape[0]):
                row = self._weighted_S.getrow(row_index).toarray().squeeze()
                #item_data = self.ICM[row, :]
                #item_data = item_data.toarray.squeeze()

                # partition row placing at the k-th position element
                # that would occupy that position in an ordered array.
                # then, move all elements greater or equal than that
                # to the left partition and elements smaller to the
                # right partition. since we are interested only about
                # the top k elements, e.g. the left part of the array
                #  we want to select only those using [0:topK]

                topK_items = np.argpartition(row, k-1, axis=0)[0:k]

                # now we want to order the topK_items we found before
                # so that we can check the most similar items in order
                topK_items_sorted = np.argsort(row[topK_items])
                topk_matrix.append(topK_items_sorted)

            for topk_row_idx in range(len(topk_matrix)):
                for element in topk_matrix[topk_row_idx]:
                   values[topk_row_idx][element] = 1

        S_knn = np.dot(S, values)
        return S_knn


In [109]:
import pandas as p
from scipy.sparse import hstack

def build_icm():
    MAX_ALBUM = 12744
    MAX_ARTIST = 6668
    MAX_DURATION = 2115

    datafile = np.loadtxt('../data/tracks.csv', delimiter=',', skiprows=1, dtype=int)

    tracks_list, album_list, artist_list, duration_list = zip(*datafile)
    ratings = np.ones(len(album_list), dtype=int)

    ICM_album = sps.csc_matrix((ratings, (tracks_list, album_list)))
    ICM_artist = sps.csc_matrix((ratings, (tracks_list, artist_list)))
    ICM_duration = sps.csc_matrix((ratings, (tracks_list, duration_list)))

    ICM_partial = hstack((ICM_album, ICM_artist))
    ICM = hstack((ICM_partial, ICM_duration))

    #return ICM
    return ICM_partial



In [110]:
porcodio = build_icm()

sim = Cosine_Similarity(porcodio, 100)
cristo_il_porco = sim.compute()

print(cristo_il_porco)

  (7894, 0)	1
  (0, 0)	0
  (20294, 1)	1
  (20122, 1)	1
  (19702, 1)	1
  (18947, 1)	1
  (18481, 1)	1
  (17795, 1)	1
  (17671, 1)	1
  (17352, 1)	1
  (17278, 1)	1
  (17156, 1)	1
  (16730, 1)	1
  (15877, 1)	1
  (15874, 1)	1
  (14985, 1)	1
  (14508, 1)	1
  (14293, 1)	1
  (13237, 1)	1
  (13213, 1)	1
  (12382, 1)	1
  (12167, 1)	1
  (11585, 1)	1
  (11119, 1)	1
  (10047, 1)	1
  :	:
  (5887, 20632)	1
  (5812, 20632)	1
  (3262, 20632)	1
  (1052, 20632)	1
  (20632, 20632)	0
  (6441, 20632)	2
  (2103, 20632)	2
  (20633, 20633)	0
  (20038, 20634)	1
  (15780, 20634)	1
  (14230, 20634)	1
  (13214, 20634)	1
  (12755, 20634)	1
  (11601, 20634)	1
  (11132, 20634)	1
  (9774, 20634)	1
  (8492, 20634)	1
  (3110, 20634)	1
  (2539, 20634)	1
  (1908, 20634)	1
  (20634, 20634)	0
  (19514, 20634)	2
  (13326, 20634)	2
  (8299, 20634)	2
  (4972, 20634)	2


In [111]:
sim.topK(10)



IndexError: index (1) out of range (>= 1)

In [52]:
sim._weighted_S.shape[0]

20635