In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [9]:
# Toy user-by-item rating matrix. Rows are users, columns are items; the single
# NaN (Alice, i4) is the rating the rest of the notebook tries to predict.
table_21 = pd.DataFrame(
    [
        [5, 3, 4, 4, np.nan],
        [3, 1, 2, 3, 3],
        [4, 3, 4, 3, 5],
        [3, 3, 1, 5, 4],
        [1, 5, 5, 2, 1],
    ],
    index=['Alice'] + ['u%i' % k for k in range(1, 5)],
    columns=['i%i' % k for k in range(5)],
)

In [10]:
# Show the toy rating matrix (users as rows, items as columns).
table_21

Unnamed: 0,i0,i1,i2,i3,i4
Alice,5,3,4,4,
u1,3,1,2,3,3.0
u2,4,3,4,3,5.0
u3,3,3,1,5,4.0
u4,1,5,5,2,1.0


# User-Based

In [47]:
# Scratch walk-through of the filtering step that sim() performs:
# keep only the items for which neither of the two users has a missing rating.
df = table_21
a, b = 'Alice', 'u1'

# From here on `a` and `b` hold the users' rating rows, not their labels.
a, b = df.loc[a, :], df.loc[b, :]
anull = a.index[a.isnull()].tolist()
bnull = b.index[b.isnull()].tolist()
# Despite the name, this is every item missing for *either* user.
missing_both = set(anull) | set(bnull)
remain = list(set(df.columns) - missing_both)
a, b = a[remain], b[remain]

In [130]:
def sim(df,a,b):
    """Pearson correlation between the rating rows of users `a` and `b`.

    Only items rated by both users are used, and each user's mean is taken
    over those co-rated items.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating matrix with users as the index and items as columns;
        missing ratings are NaN.
    a, b : index labels of the two users in `df`.

    Returns
    -------
    float
        The correlation, or 0.0 when it is undefined (no co-rated items, or
        one of the users gave the same rating to every co-rated item).
    """
    row_a = df.loc[a,:]
    row_b = df.loc[b,:]
    # A boolean mask keeps the original column order; the result no longer
    # depends on set iteration order.
    co_rated = row_a.notnull() & row_b.notnull()
    row_a = row_a[co_rated]
    row_b = row_b[co_rated]
    dev_a = row_a - row_a.mean()
    dev_b = row_b - row_b.mean()
    num = (dev_a * dev_b).sum()
    denom = np.sqrt((dev_a**2).sum()) * np.sqrt((dev_b**2).sum())
    # Zero variance / empty overlap would otherwise produce 0/0 -> NaN.
    if denom == 0 or np.isnan(denom):
        return 0.0
    return num/denom

def nearest_neighbors(df,a,n,p):
    """Return the labels of the `n` users most similar to `a` who rated item `p`.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating matrix (users as index, items as columns, NaN = not rated).
    a : index label of the target user; never returned as its own neighbour.
    n : int
        Number of neighbours wanted. `n <= 0` yields an empty array.
    p : column label of the item to be predicted.

    Returns
    -------
    numpy.ndarray
        Up to `n` user labels ordered by increasing similarity (the most
        similar user is last). Users whose similarity is NaN are ignored.
    """
    if n <= 0:
        # A bare [-0:] slice would select every candidate.
        return np.array([])
    # Candidates: every user except `a` itself that has a rating for `p`.
    eligible = df[p].notnull().values & (df.index != a)
    candidates = list(df.index[eligible])
    scores = np.array([sim(df,a,u) for u in candidates], dtype=float)
    # argsort places NaN last, which would rank undefined similarities as best.
    comparable = ~np.isnan(scores)
    labels = np.array(candidates)[comparable]
    order = np.argsort(scores[comparable], kind='mergesort')[-n:]
    return labels[order]

In [141]:
def pred(df,a,p,n):
    """User-based prediction of the rating user `a` would give item `p`.

    Computes  mean(a) + sum_u sim(a,u) * (r[u,p] - mean(u)) / sum_u sim(a,u)
    over the `n` nearest neighbours of `a` that rated `p`.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating matrix (users as index, items as columns, NaN = not rated).
    a : index label of the target user.
    p : column label of the item to predict.
    n : int, number of neighbours to use.

    Returns
    -------
    float
        The predicted rating. Falls back to the mean rating of `a` when there
        are no neighbours or their similarities sum to zero.

    NOTE: relies on the 4-argument nearest_neighbors(df, a, n, p); a later cell
    in this notebook redefines that name with a different signature.
    """
    abar = df.loc[a,:].mean()
    nearest_users = nearest_neighbors(df,a,n,p)
    # Trace of the selected neighbours (kept so the cell output is unchanged).
    print(nearest_users)
    num = 0
    denom = 0
    for u in nearest_users:
        ratings_u = df.loc[u,:]
        s = sim(df,a,u)
        # Weight the neighbour's deviation from their own mean by similarity.
        num += s*(ratings_u[p] - ratings_u.mean())
        denom += s
    if denom == 0:
        # Nothing to adjust with; avoid 0/0.
        return abar
    return abar + num/denom

In [145]:
# Predict Alice's rating for item i4 from her 2 nearest neighbours.
pred(df,'Alice','i4',2)

['u1', 'u2', 'u3', 'u4']
['u2' 'u1']


4.8719798993705918

# Item-Based

In [7]:
# Small sample vectors; note these rebind the notebook-level names `a` and `b`.
a = [1,2,3]
b = [4,5,6]

def cosine_sim(a,b):
    """Cosine similarity of two vectors.

    Returns a 3-tuple ``(similarity, dot_product, norm_product)`` where
    ``similarity == dot_product / norm_product``.
    """
    dot_product = np.dot(a,b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product/norm_product,dot_product,norm_product

def cosine_sim_item(df,p,q):
    """Cosine similarity between item columns `p` and `q` of `df`.

    Rows where either of the two columns is null are left out before the
    vectors are handed to cosine_sim(); only the similarity is returned.
    """
    pair = df[[p,q]].dropna(axis=0,how='any')
    similarity,_,_ = cosine_sim(pair[p],pair[q])
    return similarity

def adjusted_cosine_sim_item(df,p,q):
    """Adjusted cosine similarity between item columns `p` and `q`.

    Every row of `df` is first centred on its own mean (taken across all
    columns), then the cosine is computed over the rows where both centred
    columns are non-null. Comparing an item with itself returns the int 1.
    """
    if p == q:
        return 1
    row_means = df.mean(axis=1)
    centred = df.subtract(row_means,axis=0)
    pair = centred[[p,q]].dropna(axis=0,how='any')
    score,_,_ = cosine_sim(pair[p],pair[q])
    return score

def nearest_neighbors(df,p,n):
    """Return the labels of the `n` items most similar to item `p`.

    Similarity is adjusted_cosine_sim_item(df, p, i) for every other column i.

    Parameters
    ----------
    df : pandas.DataFrame with items as columns.
    p : column label of the reference item (must exist in `df`).
    n : int, number of items wanted; `n <= 0` yields an empty array.

    Returns
    -------
    numpy.ndarray
        Up to `n` column labels ordered by increasing similarity (most similar
        last). Items whose similarity is NaN are ignored.

    NOTE: this definition replaces the earlier 4-argument, user-based
    nearest_neighbors(df, a, n, p) defined above in this notebook.
    """
    if n <= 0:
        # A bare [-0:] slice would select every candidate.
        return np.array([])
    candidates = list(df.drop(p,axis=1).columns)
    scores = np.array([adjusted_cosine_sim_item(df,p,i) for i in candidates], dtype=float)
    # argsort places NaN last, which would rank undefined similarities as best.
    comparable = ~np.isnan(scores)
    order = np.argsort(scores[comparable], kind='mergesort')[-n:]
    return np.array(candidates)[comparable][order]

def pred_item(df,u,p,n=2):
    """Item-based prediction of the rating user `u` would give item `p`.

    The prediction is the similarity-weighted average of the user's own
    ratings for the `n` items most similar to `p`. Neighbours are chosen only
    among items the user has actually rated, so the result is not turned into
    NaN by an unrated neighbour.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating matrix (users as index, items as columns, NaN = not rated).
    u : index label of the user.
    p : column label of the item to predict.
    n : int, default 2
        Number of neighbouring items to use.

    Returns
    -------
    tuple
        ``(prediction, numerator, denominator)``; the prediction is NaN when
        the denominator is zero (no usable neighbours).
    """
    df_u = df.loc[u,:]
    rated_items = [i for i in df_u[df_u.notnull()].index if i != p]

    # Score every rated item against p, skipping undefined similarities.
    scored = []
    for i in rated_items:
        s = adjusted_cosine_sim_item(df,p,i)
        if not np.isnan(s):
            scored.append((s,i))
    # Ascending order, so the most similar items are at the end.
    scored.sort(key=lambda pair: pair[0])
    top = scored[-n:] if n > 0 else []

    num = 0
    denom = 0
    for s,i in top:
        num += s * df_u[i]
        denom += s
    if denom == 0:
        return np.nan,num,denom
    return num/denom,num,denom

def item_similarity_matrix(df):
    """Build the full item-by-item adjusted-cosine similarity table.

    Every ordered pair of columns of `df` is scored with
    adjusted_cosine_sim_item(); the result is a square DataFrame whose index
    and columns are both `df.columns`.
    """
    items = df.columns
    scores = np.zeros((len(items),len(items)))
    for row,u in enumerate(items):
        for col,v in enumerate(items):
            scores[row][col] = adjusted_cosine_sim_item(df,u,v)
    return pd.DataFrame(scores,columns=items,index=items)
            

In [257]:
# Plain cosine similarity between items i0 and i4, over the rows where both are rated.
cosine_sim_item(df,'i0','i4')

0.99410024349541681

In [258]:
# Same item pair, scored with the adjusted cosine function for comparison.
adjusted_cosine_sim_item(df,'i0','i4')

0.80491448237912955

In [260]:
# Item-based prediction for (Alice, i4); pred_item returns the prediction together
# with the numerator and denominator it was computed from.
p,num,denom = pred_item(df,'Alice','i4')

In [273]:
# Full item-by-item similarity table for the toy data.
item_similarity_matrix(df)

Unnamed: 0,i0,i1,i2,i3,i4
i0,1.0,-0.939725,-0.547068,0.267841,0.804914
i1,-0.939725,1.0,0.620543,-0.360645,-0.908232
i2,-0.547068,0.620543,1.0,-0.88138,-0.76356
i3,0.267841,-0.360645,-0.88138,1.0,0.433063
i4,0.804914,-0.908232,-0.76356,0.433063,1.0


# SVD

In [286]:
from sklearn.decomposition import PCA

In [284]:
# Item-by-user matrix for the SVD demo: drop Alice's row, transpose so items
# become rows, drop item i4, then overwrite two of i1's ratings by hand.
sdf = df.drop('Alice',axis=0).T.drop('i4',axis=0)
sdf.at['i1','u3'] = 2
sdf.at['i1','u4'] = 6

In [285]:
# Show the item-by-user matrix that will be decomposed.
sdf

Unnamed: 0,u1,u2,u3,u4
i0,3.0,4.0,3.0,1.0
i1,1.0,3.0,2.0,6.0
i2,2.0,4.0,1.0,5.0
i3,3.0,3.0,5.0,2.0


In [401]:
from sklearn.utils.extmath import randomized_svd

# Plain ndarray copy of the item-by-user matrix.
sdf_np = np.array(sdf)
# Full SVD. The previous randomized_svd call was removed: its result was
# overwritten by this line before ever being used.
# np.linalg.svd returns the right singular vectors already transposed, so `V`
# here is V^T.
U, Sigma, V = np.linalg.svd(sdf_np)
# Singular values as a square diagonal matrix.
smat = np.diagflat(Sigma)
#Reconstruct with np.dot(np.dot(U,np.diagflat(Sigma)),V)

In [397]:
# Alice's known ratings for i0..i3 (same values as her row in table_21, without i4).
a = np.array([5,3,4,4])

In [402]:
# Inspect the left singular vectors from the decomposition.
U

array([[-0.43124523,  0.49315012, -0.55075835, -0.51719991],
       [-0.53273754, -0.53052572,  0.41966021, -0.50854546],
       [-0.52374556, -0.40520071, -0.48729169,  0.5692537 ],
       [-0.50587435,  0.5578152 ,  0.53206894,  0.38708653]])

In [409]:
# Project Alice's rating vector into the latent space: a . U . Sigma^-1
smat_inv = np.linalg.inv(smat)
a.dot(U).dot(smat_inv)

array([-0.64418534,  0.30125475, -0.63751178, -0.96174681])

# Applying the Approach to MovieLens

In [120]:
from scipy.sparse import csc_matrix

In [230]:
def cosine_sim(a,b):
    """Cosine similarity of two vectors.

    Returns ``(similarity, dot_product, norm_product)``; the similarity is the
    dot product divided by the product of the two Euclidean norms.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    dot_product = np.dot(a,b)
    return dot_product/norm_product,dot_product,norm_product

def cosine_sim_mat(a,b):
    """Cosine similarity for 2-D row vectors (anything with `.dot` and `.T`).

    The inner product is ``a.dot(b.T)``, so for 1 x n inputs the similarity and
    numerator come back as 1 x 1 containers. Returns
    ``(similarity, inner_product, norm_product)``.
    """
    inner = a.dot(b.T)
    scale = np.linalg.norm(a) * np.linalg.norm(b)
    return inner/scale,inner,scale

def cosine_sim_item(df,p,q):
    """Cosine similarity between columns `p` and `q` of a DataFrame.

    Only rows where both columns are non-null take part; the similarity
    component of cosine_sim() is returned.
    """
    both = df[[p,q]]
    both = both[both.notnull().all(axis=1)]
    return cosine_sim(both[p],both[q])[0]

def nearest_neighbors(df,p,n):
    """Return the labels of the `n` columns of `df` most similar to column `p`.

    Similarity is adjusted_cosine_sim_item(df, p, i) for every other column i.
    Items whose similarity is NaN (that function returns NaN when a pair has no
    common raters) are skipped instead of being ranked as the best match.

    Parameters
    ----------
    df : pandas.DataFrame with items as columns.
    p : column label of the reference item.
    n : int, number of items wanted; `n <= 0` yields an empty array.

    Returns
    -------
    numpy.ndarray of column labels, ordered by increasing similarity.
    """
    if n <= 0:
        # A bare [-0:] slice would select every candidate.
        return np.array([])
    candidates = list(df.drop(p,axis=1).columns)
    scores = np.array([adjusted_cosine_sim_item(df,p,i) for i in candidates], dtype=float)
    # argsort sorts NaN to the end, i.e. into the "most similar" slots.
    comparable = ~np.isnan(scores)
    order = np.argsort(scores[comparable], kind='mergesort')[-n:]
    return np.array(candidates)[comparable][order]

def pred_item(df,u,p,n=2):
    """Item-based prediction of the rating user `u` would give item `p`.

    Uses the similarity-weighted average of the user's ratings for the `n`
    items most similar to `p`, chosen only among items the user has rated
    (an unrated neighbour would make the whole prediction NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Rating matrix (users as index, items as columns, NaN = not rated).
    u : index label of the user.
    p : column label of the item to predict.
    n : int, default 2
        Number of neighbouring items to use.

    Returns
    -------
    tuple
        ``(prediction, numerator, denominator)``; the prediction is NaN when
        the denominator is zero.
    """
    df_u = df.loc[u,:]
    rated_items = [i for i in df_u[df_u.notnull()].index if i != p]

    # (similarity, item) pairs for the rated items with a defined similarity.
    scored = []
    for i in rated_items:
        s = adjusted_cosine_sim_item(df,p,i)
        if not np.isnan(s):
            scored.append((s,i))
    scored.sort(key=lambda pair: pair[0])
    top = scored[-n:] if n > 0 else []

    num = 0
    denom = 0
    for s,i in top:
        num += s * df_u[i]
        denom += s
    if denom == 0:
        return np.nan,num,denom
    return num/denom,num,denom

def adjusted_cosine_sim_item(df,p,q):
    """Cosine similarity between columns `p` and `q` of a sparse matrix.

    Only rows in which *both* columns hold a non-zero value are compared.
    The matrix is expected to contain already-centred ratings; a stored value
    of exactly 0 is indistinguishable from "not rated" and is ignored.

    Parameters
    ----------
    df : scipy sparse matrix supporting ``df[:, [p, q]]`` and ``.nonzero()``.
    p, q : int
        Column positions of the two items.

    Returns
    -------
    float
        The cosine similarity, or ``np.nan`` if no row has both values.
    """
    # Use the arguments, not the notebook globals df_mat / i / j.
    pair = df[:,[p,q]]
    rows,_ = pair.nonzero()
    # A row index that appears twice has a non-zero entry in both columns.
    row_ids,hits = np.unique(rows,return_counts=True)
    shared = row_ids[hits == 2]

    if shared.size == 0:
        return np.nan

    pair = pair[shared,:]
    vec_p = np.asarray(pair[:,0].todense()).reshape(1,-1)
    vec_q = np.asarray(pair[:,1].todense()).reshape(1,-1)
    cs,_,_ = cosine_sim_mat(vec_p,vec_q)
    return cs[0,0]

#Rewrite to use sparse matrix
def item_similarity_matrix(df):
    n = df.shape[1]
    sim_mat = np.zeros((n,n))
    
    for i in range(n):
        if i%10==0:
            print(i)
        u = df[:,i]
        for j in range(n):
            v = df[:,j]
            if i==j:
                acsi = 1
            elif i > j:
                acsi = sim_mat[j][i]
            else:
                acsi = adjusted_cosine_sim_item(df,i,j)
            sim_mat[i][j] = acsi
    sim_mat = pd.DataFrame(sim_mat,columns=df.columns,index=df.columns)
    return sim_mat

In [244]:
# Directory holding the MovieLens CSVs, relative to the notebook. A forward
# slash works on Windows, Linux and macOS; the trailing separator is kept so
# that `path + filename` concatenation still works.
path = 'ml-latest-small/'
# NOTE: this rebinds `df`, which earlier cells use for the toy rating table.
df = pd.read_csv(path + 'ratings.csv')
movie_xref = pd.read_csv(path + 'movies.csv')

In [245]:
# One row per movie: its mean rating joined with the movie metadata.
mean_by_movie = df.groupby('movieId')['rating'].mean()
movies = mean_by_movie.reset_index().merge(movie_xref,on='movieId')
users = df['userId'].unique()
data = df['rating']
# Abandoned first attempt, kept for reference:
#df_mat = coo_matrix((data,(users,movies)))


In [246]:
# Move the per-movie mean into a `mean_rating` column (appended last) and drop
# the original `rating` column. Re-running this cell fails once `rating` is gone.
movies = movies.assign(mean_rating=movies['rating']).drop('rating',axis=1)

In [247]:
# `mid`: the movie's row label in `movies`, used later as its column position
# in the sparse matrix.
movies['mid'] = movies.index

In [248]:
# Attach the movie columns (including mean_rating and mid) to every rating row.
# NOTE(review): `df` is overwritten with the merged result, so re-running this
# cell merges the movie columns in a second time.
df = df.merge(movies,left_on='movieId',right_on='movieId')

In [249]:
# uid: userId shifted down by one, used as the row position in the matrix.
# adjusted_rating: rating minus the movie's mean rating.
# NOTE(review): the toy adjusted-cosine code above centres on each *user's*
# mean, whereas this centres on the *movie's* mean - confirm this is intended.
df = (
    df
    .assign(uid=df['userId'] - 1)
    .assign(adjusted_rating=df['rating'] - df['mean_rating'])
)

In [250]:
# Row positions, column positions and values for the sparse matrix.
users_list, movies_list, ratings_list = df['uid'], df['mid'], df['adjusted_rating']

In [252]:
# Sparse matrix built from (value, (row, col)) triples: rows are uid, columns
# are mid, values are the adjusted ratings.
df_mat = csc_matrix((ratings_list,(users_list,movies_list)))

In [253]:
from sklearn.metrics.pairwise import cosine_similarity

In [254]:
# Pairwise cosine similarity between the rows of df_mat.T, i.e. between movies;
# dense_output=False keeps the result sparse.
s = cosine_similarity(df_mat.T,dense_output=False)

In [262]:
# Rank every movie by similarity to the movie at position 0 and keep the nine
# entries just below the top-ranked one (the [-10:-1] slice drops the last,
# highest-scoring position).
nearest_movies = np.argsort(s[0].toarray()[0])[-10:-1].tolist()
# The rows come back in `movies` order, not in similarity order.
movies[movies['mid'].isin(nearest_movies)]

Unnamed: 0,movieId,title,genres,mean_rating,mid
328,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,3.7775,328
521,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,3.674419,521
883,1092,Basic Instinct (1992),Crime|Mystery|Thriller,3.294643,883
1866,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,3.609524,1866
2506,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,3.844,2506
3419,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,3.847701,3419
3803,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,3.884615,3803
4081,5349,Spider-Man (2002),Action|Adventure|Sci-Fi|Thriller,3.522388,4081
6892,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,4.235537,6892


In [151]:
# Scratch: count how often each row index occurs in nz[0] and keep the indices
# that occur exactly twice (a non-zero entry in both selected columns).
# NOTE: this rebinds `df`, replacing the ratings frame; `nz` comes from another cell.
df = pd.DataFrame({'movies': nz[0], 'ind': 1})
ct = df.groupby('movies').count().reset_index()
ct = ct.loc[ct['ind'] == 2, 'movies']

In [153]:
# Row indices that have a non-zero entry in both columns, as a plain array.
np.array(ct)

array([ 18,  84, 114, 127, 164, 181, 284, 441, 467, 495, 533, 547, 563,
       602, 606, 623, 640, 664], dtype=int64)

In [98]:
# Scratch exploration of a single item pair taken from the sparse matrix.
# These module-level `i`, `j` (and `df_mat`) are the globals that the sparse
# adjusted_cosine_sim_item defined earlier reads instead of its arguments.
i = 1
j = 50
u = df_mat[:,i]
v = df_mat[:,j]
# NOTE(review): subtracting the dense per-row mean from a sparse matrix
# presumably produces a dense result in which almost every entry is non-zero,
# which would defeat the nonzero() filtering below - confirm.
temp_df = df_mat - df_mat.mean(axis=1)
df_pq = temp_df[:,[i,j]]
nz = df_pq.nonzero()
# Rows with a non-zero entry in at least one of the two columns.
nzu = np.unique(nz[0])
df_pq = df_pq[nzu,:]

In [183]:
# Inner product of the two item vectors.
# NOTE(review): df_p and df_q are not defined by any visible top-level cell;
# presumably left over from interactive debugging - confirm before re-running.
df_p.dot(df_q.T)

matrix([[ 1.83066688]])