In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse as sps
import seaborn as sns
from sklearn.cluster import KMeans

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
ratings.rename(columns={'movieId':'item', 'userId':'user'}, inplace=True)
ratings.head()

Unnamed: 0,user,item,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
movies = pd.read_csv('data/ml-latest-small/movies.csv')
movies.rename(columns={'movieId':'item'}, inplace=True)
movies.head()

Unnamed: 0,item,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movie_titles = movies.set_index('item')['title']
movie_titles.head()

item
1                      Toy Story (1995)
2                        Jumanji (1995)
3               Grumpier Old Men (1995)
4              Waiting to Exhale (1995)
5    Father of the Bride Part II (1995)
Name: title, dtype: object

In [5]:
item_ratings = ratings.set_index(['item','user'])
item_ratings.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp
item,user,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,4.0,964982703
3,1,4.0,964981247
6,1,4.0,964982224
47,1,5.0,964983815
50,1,5.0,964982931


## GSLIM

### Algorithm 1 : GSLIM
g and g' are initialized with $0.5$ and $1-g$ respectively

![Algorithm 1](algo1.png "Algorithm 1")


In [6]:
ratings['rating'] = ratings['rating'].apply(lambda x: 1 if x > 0 else 0)

In [7]:
item_index = pd.Index(ratings['item'].unique())
user_index = pd.Index(ratings['user'].unique())
norm_mat = sps.csr_matrix((ratings['rating'].values,
 	(user_index.get_indexer(ratings['user']), item_index.get_indexer(ratings['item']))))

In [8]:
m = item_index.shape[0]
n = user_index.shape[0]

g = np.full(user_index.shape, 0.5)
g_prime = 1-g

beta_g = 5
beta_l = 5

lambda_g = 1
lambda_l = 1

R = norm_mat.A

S = np.random.rand(m,m)

In [9]:
def get_pu(mat_to_cluster, num_clusters = 5):
    k_means = KMeans(n_clusters=num_clusters)
    k_means.fit(mat_to_cluster)
    return [user_index[k_means.labels_ == i] for i in np.unique(k_means.labels_)]
pu = np.array(get_pu(norm_mat, 5))

In [10]:
def get_R_pu(pu):
    R_pu = []
    for i in range(pu.shape[0]):
        temp = ratings.set_index('user').loc[pu[i]].reset_index().copy()
        mat = sps.csr_matrix((temp['rating'].values, (user_index.get_indexer(temp['user']), item_index.get_indexer(temp['item']))), shape=(R.shape[0],R.shape[1]))
        R_pu.append(mat.todense())
    return np.array(R_pu)
R_pu = get_R_pu(pu)

In [16]:
#S_pu = np.array([np.random.rand(m,m) for i in range(pu.shape[0])])
S_pu = np.array([np.array(R_pu[i].T@R_pu[i]) for i in range(pu.shape[0])])

In [12]:
def regularize_global(S, i, beta_g, lambda_g):
    return (0.5*beta_g*pow(np.linalg.norm(S[:,i]), 2)) + (lambda_g*np.linalg.norm(S[:,i], ord=1))


def regularize_local(S_pu, i, beta_l, lambda_l):
    sum_l = 0.0
    for elm in S_pu:
        sum_l = sum_l + (0.5*beta_l*pow(np.linalg.norm(elm[:,i]), 2)) + (lambda_l*np.linalg.norm(elm[:,i], ord=1))
    return sum_l


def init_component(R, R_pu, S, S_pu, i, g, g_prime):
    """ 
    Parameters: 
    R (matrix): Rating matrix - users X items.
    R_pu (matrix): Rating matrices for all clusters of users.
    S (matri): Similarity matrix - item X item
    S_pu (matrix): Similarity matrices of all clusters of users.
    g : personalized weights for all users - users X 1
    g_prime : complement of personalized weights for all users - users X 1
    """
    sum_l = np.zeros(R.shape[0],)
    for idx, elm in enumerate(S_pu):
        sum_l = sum_l + np.array(R_pu[idx]@elm[:,i])
    np.multiply(g_prime, sum_l)
    temp = np.multiply(g, R@S[:,i])
    temp = R[:,i] - temp
    return 0.5*pow(np.linalg.norm(temp, 2),2)

In [27]:
S_test = np.array(R.T@R)

In [28]:
S_test

array([[215,  32,  58, ...,   1,   1,   1],
       [ 32,  52,  25, ...,   0,   0,   0],
       [ 58,  25, 102, ...,   1,   1,   1],
       ...,
       [  1,   0,   1, ...,   1,   1,   1],
       [  1,   0,   1, ...,   1,   1,   1],
       [  1,   0,   1, ...,   1,   1,   1]], dtype=int64)

In [25]:
S = S_test.copy()
alpha = 100
for i in range(0, R.shape[1]):
    val1 = 1000
    for j in range(100):
        val = init_component(R, R_pu, S, S_pu, i, g, g_prime) + regularize_local(S_pu, i, beta_l, lambda_l) + regularize_global(S, i, beta_g, lambda_g)
        print(val1, val)
        delta = val1 - val
        print(delta)
        val1 = val
        S[i] = S[i] - delta*alpha
        

1000 497282078.5
-497281078.5
497282078.5 2.2255965068891332e+22
-2.2255965068890833e+22
2.2255965068891332e+22 7.656353255721111e+38
-7.656353255721111e+38
7.656353255721111e+38 7.656353255721111e+38
0.0
7.656353255721111e+38 7.656353255721111e+38
0.0
7.656353255721111e+38 7.656353255721111e+38
0.0


KeyboardInterrupt: 