In [4]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [5]:
def load_movielens(mean_adjust = 'user', path = 'ml-latest-small'):
    '''Load the MovieLens ratings joined with per-user and per-movie statistics.

    Parameters
    ----------
    mean_adjust : {'user', 'movie', None}
        Which mean is subtracted from ``rating`` to build ``adjusted_rating``.
        ``None`` skips the adjustment and no ``adjusted_rating`` column is added.
    path : str
        Directory that contains ``ratings.csv`` and ``movies.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per rating with the added columns ``user_mean_rating``, ``uid``
        (0-based position of the user), ``movie_mean_rating``, ``title``,
        ``genres``, ``mid`` (0-based position of the movie) and, unless
        ``mean_adjust`` is None, ``adjusted_rating``.

    Raises
    ------
    ValueError
        If ``mean_adjust`` is not one of the supported values.
    '''
    # Validate before touching the disk so a typo fails fast instead of
    # silently returning a frame without `adjusted_rating`.
    if mean_adjust not in ('user', 'movie', None):
        raise ValueError("mean_adjust must be 'user', 'movie' or None, got %r" % (mean_adjust,))

    # os.path.join keeps the loader portable; a hard-coded backslash only
    # works as a separator on Windows.
    df = pd.read_csv(os.path.join(path, 'ratings.csv'))
    movie_xref = pd.read_csv(os.path.join(path, 'movies.csv'))

    # Select the rating column *before* aggregating so unrelated columns are
    # never averaged.
    users = df.groupby('userId')['rating'].mean().reset_index()
    users = users.rename(columns={'rating': 'user_mean_rating'})
    users['uid'] = users.index
    df = df.merge(users, on='userId')

    movies = df.groupby('movieId')['rating'].mean().reset_index()
    movies = movies.rename(columns={'rating': 'movie_mean_rating'})
    movies = movies.merge(movie_xref, on='movieId')
    # `mid` is assigned after the merge so it indexes only the movies that
    # survive the join with movies.csv.
    movies['mid'] = movies.index
    df = df.merge(movies, on='movieId')

    if mean_adjust == 'user':
        df['adjusted_rating'] = df['rating'] - df['user_mean_rating']
    elif mean_adjust == 'movie':
        df['adjusted_rating'] = df['rating'] - df['movie_mean_rating']
    return df

In [3]:
# Toy user x item rating matrix: rows are users, columns are items, and the
# single NaN (Alice, i4) is the rating the cells below try to predict.
user_labels = ['Alice'] + ['u%i'%i for i in range(1,5)]
item_labels = ['i%i'%i for i in range(5)]
table_21 = pd.DataFrame(
    [
        [5,3,4,4,np.nan],
        [3,1,2,3,3],
        [4,3,4,3,5],
        [3,3,1,5,4],
        [1,5,5,2,1],
    ],
    index=user_labels,
    columns=item_labels,
)

In [4]:
table_21

Unnamed: 0,i0,i1,i2,i3,i4
Alice,5,3,4,4,
u1,3,1,2,3,3.0
u2,4,3,4,3,5.0
u3,3,3,1,5,4.0
u4,1,5,5,2,1.0


# User-Based

In [47]:
# Scratch version of the first half of `sim`: restrict two users' rating rows
# to the items that both of them have rated.
df = table_21
a = 'Alice'
b = 'u1'

# From here on `a` and `b` are rebound from user labels to rating rows (Series).
a = df.loc[a,:]
b = df.loc[b,:]
# Items with a missing rating for each user.
anull = list(a[a.isnull()].index)
bnull = list(b[b.isnull()].index)
# Despite the name, this is the union: items missing for *either* user.
missing_both = set(anull).union(set(bnull))
# Built from a set, so the order of the remaining items is not guaranteed.
remain = list(set(df.columns) - missing_both)
a = a[remain]
b = b[remain]

In [130]:
def sim(df,a,b):
    '''Pearson correlation between users `a` and `b` over their co-rated items.

    `df` is a user x item frame with NaN for missing ratings. Only items rated
    by both users are kept, and each user's mean is taken over that shared
    subset before correlating.
    '''
    row_a = df.loc[a,:]
    row_b = df.loc[b,:]

    # Items that at least one of the two users has not rated.
    unrated = set(row_a[row_a.isnull()].index) | set(row_b[row_b.isnull()].index)
    shared = list(set(df.columns) - unrated)

    row_a = row_a[shared]
    row_b = row_b[shared]
    centered_a = row_a - row_a.mean()
    centered_b = row_b - row_b.mean()

    covariance = (centered_a * centered_b).sum()
    scale = np.sqrt((centered_a**2).sum()) * np.sqrt((centered_b**2).sum())
    return covariance/scale

def nearest_neighbors(df,a,n,p):
    '''Return the `n` users most similar to `a` among those who rated item `p`.

    Parameters
    ----------
    df : user x item rating frame (NaN = not rated).
    a  : label of the target user; always excluded from the candidates.
    n  : number of neighbours to keep.
    p  : item label; only users with a rating for `p` are candidates.

    Returns
    -------
    numpy.ndarray of user labels ordered by ascending similarity (the most
    similar user is last). Empty when there is no candidate or `n <= 0`.

    NOTE: a later cell defines another `nearest_neighbors(df, p, n)` for the
    item-based approach; once that cell runs, this definition is shadowed.
    '''
    # Exclude the target user `a` itself (previously hard-coded as 'Alice').
    rated_p = df.loc[~df.index.isin([a]),p].notnull()
    check_u = list(rated_p[rated_p].index)
    # `[-n:]` with n == 0 would return *every* candidate, so guard explicitly.
    if not check_u or n <= 0:
        return np.array([])
    sim_a = {u:sim(df,a,u) for u in check_u}
    nearest_users_ind = np.argsort(list(sim_a.values()))[-n:]
    return np.array(list(sim_a.keys()))[nearest_users_ind]

In [141]:
def pred(df,a,p,n):
    '''Predict user `a`'s rating for item `p` from the `n` nearest users.

    The prediction is `a`'s mean rating plus the similarity-weighted average of
    the neighbours' deviations from their own mean rating on item `p`.

    Parameters
    ----------
    df : user x item rating frame (NaN = not rated).
    a  : label of the target user.
    p  : label of the item to predict.
    n  : neighbourhood size.

    Returns
    -------
    float. Falls back to `a`'s mean rating when there are no usable neighbours
    or their similarities sum to zero.
    '''
    abar = df.loc[a,:].mean()
    num = 0
    denom = 0
    for u in nearest_neighbors(df,a,n,p):
        df_u = df.loc[u,:]
        s = sim(df,a,u)
        # Neighbour's deviation from its own mean, weighted by similarity.
        num += s*(df_u[p] - df_u.mean())
        denom += s
    # No neighbours (or weights cancelling out) would otherwise divide by zero.
    if denom == 0:
        return abar
    return abar + num/denom

In [145]:
pred(df,'Alice','i4',2)

['u1', 'u2', 'u3', 'u4']
['u2' 'u1']


4.8719798993705918

# Item-Based

In [65]:
a = [1,2,3]
b = [4,5,6]

def cosine_sim(a,b):
    '''Cosine similarity of two 1-D vectors.

    Returns the tuple (similarity, dot product, product of the two norms).
    '''
    dot_ab = np.dot(a,b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_ab/norm_product,dot_ab,norm_product

def cosine_sim_item(df,p,q):
    '''Plain cosine similarity between item columns `p` and `q`.

    Only the rows (users) that have a rating for both items take part.
    '''
    pair = df.loc[:,[p,q]]
    # Keep the users who rated both items.
    co_rated = pair.loc[pair.notnull().all(axis=1),:]
    similarity,_,_ = cosine_sim(co_rated.loc[:,p],co_rated.loc[:,q])
    return similarity

def adjusted_cosine_sim_item(df,p,q):
    '''Adjusted cosine similarity between item columns `p` and `q`.

    Every rating is first centred on its user's (row) mean, then the cosine is
    taken over the users who rated both items. An item compared with itself
    returns 1 without any computation.
    '''
    if p==q:
        return 1

    # Subtract each user's mean rating from that user's row.
    centered = df.subtract(df.mean(axis=1),axis=0)
    pair = centered.loc[:,[p,q]]
    pair = pair.loc[pair.notnull().all(axis=1),:]
    return cosine_sim(pair.loc[:,p],pair.loc[:,q])[0]

def nearest_neighbors(df,p,n):
    '''Return the `n` items most similar to item `p` (adjusted cosine).

    The result is an array of column labels in ascending order of similarity,
    so the most similar item comes last. This definition replaces the
    user-based `nearest_neighbors(df, a, n, p)` defined in an earlier cell.
    '''
    sim_p = {}
    for i in df.drop(p,axis=1).columns:
        sim_p[i] = adjusted_cosine_sim_item(df,p,i)
    labels = np.array(list(sim_p.keys()))
    top = np.argsort(list(sim_p.values()))[-n:]
    return labels[top]

def pred_item(df,u,p,n=2):
    '''Item-based prediction of user `u`'s rating for item `p`.

    Parameters
    ----------
    df : user x item rating frame (NaN = not rated).
    u  : label of the target user.
    p  : label of the item to predict.
    n  : number of most-similar items to use (default 2, the previously
         hard-coded value).

    Returns
    -------
    (prediction, numerator, denominator). The prediction is the
    similarity-weighted mean of `u`'s ratings on the neighbouring items; it is
    NaN when no neighbouring item was rated by `u` or the weights sum to zero.
    '''
    ratings_u = df.loc[u,:]

    num = 0
    denom = 0
    for i in nearest_neighbors(df,p,n):
        rating = ratings_u[i]
        # A neighbour the user never rated carries no information; letting its
        # NaN through would turn the whole prediction into NaN.
        if pd.isnull(rating):
            continue
        # Named `weight` (not `sim`) to avoid shadowing the user-based `sim`.
        weight = adjusted_cosine_sim_item(df,p,i)
        num += weight * rating
        denom += weight
    if denom == 0:
        return np.nan,num,denom
    return num/denom,num,denom

def item_similarity_matrix(df):
    '''Build the full item x item adjusted-cosine similarity table.

    Returns a DataFrame whose rows and columns are both labelled with
    `df.columns`. Every pair is computed, including both (u, v) and (v, u).
    '''
    items = df.columns
    sim_mat = np.zeros((len(items),len(items)))
    for i,u in enumerate(items):
        for j,v in enumerate(items):
            sim_mat[i][j] = adjusted_cosine_sim_item(df,u,v)
    return pd.DataFrame(sim_mat,columns=items,index=items)
            
def item_similarity_matrix_2(df):
    '''Vectorised item x item cosine similarity.

    Missing ratings are replaced by 0 before the computation, so every user
    contributes to every pair. Returns a plain ndarray.
    '''
    items = np.array(df.fillna(0).T)           # one row per item
    gram = np.dot(items,items.T)               # pairwise dot products
    norms = np.sqrt((items**2).sum(axis=1))    # Euclidean norm of each item
    return gram/np.outer(norms.T,norms)

In [9]:
df = table_21

In [10]:
cosine_sim_item(df,'i0','i4')

0.99410024349541681

In [258]:
adjusted_cosine_sim_item(df,'i0','i4')

0.80491448237912955

In [260]:
p,num,denom = pred_item(df,'Alice','i4')

In [11]:
item_similarity_matrix(df)

Unnamed: 0,i0,i1,i2,i3,i4
i0,1.0,-0.939725,-0.547068,0.267841,0.804914
i1,-0.939725,1.0,0.620543,-0.360645,-0.908232
i2,-0.547068,0.620543,1.0,-0.88138,-0.76356
i3,0.267841,-0.360645,-0.88138,1.0,0.433063
i4,0.804914,-0.908232,-0.76356,0.433063,1.0


In [35]:
df.fillna(0)

Unnamed: 0,i0,i1,i2,i3,i4
Alice,5,3,4,4,0.0
u1,3,1,2,3,3.0
u2,4,3,4,3,5.0
u3,3,3,1,5,4.0
u4,1,5,5,2,1.0


In [62]:
# Step-by-step version of `item_similarity_matrix_2`, kept for inspection.
# Items become rows; missing ratings are replaced by 0.
ndf = np.array(df.fillna(0).T)
# A: pairwise dot products between item vectors.
A = np.dot(ndf,ndf.T)
B = ndf**2
# C: first the per-item Euclidean norm, then the outer product of the norms,
# i.e. the denominator of the cosine similarity (A / C).
C = np.sqrt(B.sum(axis=1))
C = np.outer(C.T,C)

In [56]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(df.fillna(0).T)

array([[ 1.        ,  0.78025959,  0.81978229,  0.94337007,  0.7592566 ],
       [ 0.78025959,  1.        ,  0.94201969,  0.84798442,  0.67320142],
       [ 0.81978229,  0.94201969,  1.        ,  0.78402509,  0.62242512],
       [ 0.94337007,  0.84798442,  0.78402509,  1.        ,  0.81152606],
       [ 0.7592566 ,  0.67320142,  0.62242512,  0.81152606,  1.        ]])

# SVD

In [286]:
from sklearn.decomposition import PCA

In [284]:
# Item x user matrix for the SVD example: Alice's row and item i4 are dropped,
# which leaves a 4 x 4 matrix without missing values.
sdf = df.drop('Alice',axis=0).T
sdf=sdf.drop('i4',axis=0)
# Two ratings are overwritten by hand (presumably to match a textbook example
# -- TODO confirm the source of these values).
sdf.loc['i1','u3'] = 2
sdf.loc['i1','u4'] = 6

In [285]:
sdf

Unnamed: 0,u1,u2,u3,u4
i0,3.0,4.0,3.0,1.0
i1,1.0,3.0,2.0,6.0
i2,2.0,4.0,1.0,5.0
i3,3.0,3.0,5.0,2.0


In [401]:
from sklearn.utils.extmath import randomized_svd

# Full SVD of the item x user matrix.
# The earlier `randomized_svd(...)` call was removed: its result was
# overwritten by the very next statement, so it never had any effect.
sdf_np = np.array(sdf)
# np.linalg.svd returns the right factor already transposed, so `V` holds V^T.
U, Sigma, V = np.linalg.svd(sdf_np)
# Singular values as a diagonal matrix.
smat = np.diagflat(Sigma)
#Reconstruct with np.dot(np.dot(U,np.diagflat(Sigma)),V)

In [397]:
a = np.array([5,3,4,4])

In [402]:
U

array([[-0.43124523,  0.49315012, -0.55075835, -0.51719991],
       [-0.53273754, -0.53052572,  0.41966021, -0.50854546],
       [-0.52374556, -0.40520071, -0.48729169,  0.5692537 ],
       [-0.50587435,  0.5578152 ,  0.53206894,  0.38708653]])

In [409]:
# Project the rating vector `a` into the latent space: a . U . Sigma^-1.
# np.linalg.inv raises if any singular value is exactly zero.
smat_inv = np.linalg.inv(smat)
np.dot(np.dot(a,U),smat_inv)

array([-0.64418534,  0.30125475, -0.63751178, -0.96174681])

# Try to Use - Movielens

In [1]:
from scipy.sparse import csc_matrix

In [2]:
def cosine_sim(a,b):
    '''Cosine similarity between vectors `a` and `b`.

    Returns (similarity, numerator, denominator), where the numerator is the
    dot product and the denominator the product of the two norms.
    '''
    numerator = np.dot(a,b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    similarity = numerator/denominator
    return similarity,numerator,denominator

def cosine_sim_mat(a,b):
    '''Cosine similarity for 2-D row vectors (e.g. 1 x n matrices).

    Uses `a.dot(b.T)` for the numerator, so the similarity and the numerator
    come back as 1 x 1 arrays/matrices. Returns (similarity, numerator,
    denominator).
    '''
    num = a.dot(b.T)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return num/denom,num,denom

def cosine_sim_item(df,p,q):
    '''Cosine similarity between item columns `p` and `q` of a rating frame.

    Rows with a missing rating in either column are discarded first.
    '''
    both = df.loc[:,[p,q]]
    has_gap = both.isnull().any(axis=1)
    both = both.loc[~has_gap,:]
    result = cosine_sim(both.loc[:,p],both.loc[:,q])
    return result[0]

def nearest_neighbors(df,p,n):
    '''Return the labels of the `n` items most similar to item `p`.

    Similarities come from `adjusted_cosine_sim_item`; the labels are returned
    in ascending order of similarity (best match last).
    '''
    sim_p = {}
    for i in list(df.drop(p,axis=1).columns):
        sim_p[i] = adjusted_cosine_sim_item(df,p,i)
    ranking = np.argsort(list(sim_p.values()))
    return np.array(list(sim_p.keys()))[ranking[-n:]]

def pred_item(df,u,p,n=2):
    '''Item-based prediction of user `u`'s rating for item `p`.

    Parameters
    ----------
    df : user x item rating frame (NaN = not rated).
    u  : label of the target user.
    p  : label of the item to predict.
    n  : number of most-similar items to use (default 2, the previously
         hard-coded value).

    Returns
    -------
    (prediction, numerator, denominator). The prediction is the
    similarity-weighted mean of `u`'s ratings on the neighbouring items; it is
    NaN when no neighbouring item was rated by `u` or the weights sum to zero.
    '''
    ratings_u = df.loc[u,:]

    num = 0
    denom = 0
    for i in nearest_neighbors(df,p,n):
        rating = ratings_u[i]
        # Skip neighbours the user never rated so their NaN does not
        # contaminate the weighted sum.
        if pd.isnull(rating):
            continue
        weight = adjusted_cosine_sim_item(df,p,i)
        num += weight * rating
        denom += weight
    if denom == 0:
        return np.nan,num,denom
    return num/denom,num,denom

def adjusted_cosine_sim_item(df,p,q):
    '''Cosine similarity between columns `p` and `q` of a sparse rating matrix.

    Parameters
    ----------
    df : scipy sparse matrix, users x items. Presumably already mean-adjusted
         by the caller (nothing is subtracted here) -- verify against caller.
    p, q : integer column positions of the two items.

    Returns
    -------
    float similarity computed over the rows where *both* columns hold a
    non-zero entry, or NaN when no such row exists. A value of exactly 0 is
    indistinguishable from "not rated" and is therefore ignored.
    '''
    # Use the arguments: the previous body read the globals `df_mat`, `i`, `j`
    # and silently ignored `df`, `p`, `q`.
    pair = df[:,[p,q]]
    rows = pair.nonzero()[0]
    # A row index appearing twice means both columns are non-zero in that row.
    row_ids,hits = np.unique(rows,return_counts=True)
    both = row_ids[hits == 2]
    if both.size == 0:
        return np.nan

    pair = pair[both,:]
    vec_p = np.asarray(pair[:,0].todense()).ravel()
    vec_q = np.asarray(pair[:,1].todense()).ravel()
    # Every kept entry is non-zero, so neither norm can be zero here.
    denom = np.linalg.norm(vec_p) * np.linalg.norm(vec_q)
    return np.dot(vec_p,vec_q)/denom

#Rewrite to use sparse matrix
def item_similarity_matrix(df):
    '''Item x item similarity table for a sparse users x items matrix.

    Only the upper triangle is computed with `adjusted_cosine_sim_item`; the
    lower triangle is mirrored and the diagonal is set to 1.

    Returns a DataFrame labelled with integer column positions. A sparse
    matrix has no `.columns`, which the previous version tried to read only
    after the whole loop had finished.
    '''
    n = df.shape[1]
    sim_mat = np.zeros((n,n))

    for i in range(n):
        # Progress indicator: the pairwise loop is quadratic in the item count.
        if i%10==0:
            print(i)
        sim_mat[i][i] = 1
        for j in range(i+1,n):
            acsi = adjusted_cosine_sim_item(df,i,j)
            sim_mat[i][j] = acsi
            sim_mat[j][i] = acsi
    return pd.DataFrame(sim_mat)

In [5]:
# Relative data directory. A forward slash works as a separator on Windows and
# on POSIX systems, whereas the previous backslash only worked on Windows.
path = 'ml-latest-small/'
df = pd.read_csv(path + 'ratings.csv')
movie_xref = pd.read_csv(path + 'movies.csv')

In [6]:
# Mean rating per movie, joined with the movie metadata.
movies = df.groupby('movieId').mean()['rating'].reset_index()
movies = movies.merge(movie_xref,left_on='movieId',right_on='movieId')
# `users` and `data` are only referenced by the commented-out line below.
users = df['userId'].unique()
data = df['rating']
#df_mat = coo_matrix((data,(users,movies)))


In [7]:
# Rename `rating` to `mean_rating` (copy, then drop the original column).
movies['mean_rating'] = movies['rating']
movies = movies.drop('rating',axis=1)

In [8]:
movies['mid'] = movies.index

In [9]:
df = df.merge(movies,left_on='movieId',right_on='movieId')

In [10]:
# 0-based user index; assumes userId values are contiguous and start at 1
# -- TODO confirm against the dataset.
df['uid'] = df['userId']-1
# Centre each rating on its movie's mean rating.
df['adjusted_rating'] = df['rating'] - df['mean_rating']

In [11]:
# Row indices, column indices and values used to build the sparse matrix.
users_list = df['uid']
movies_list = df['mid']
ratings_list = df['adjusted_rating']

In [12]:
df_mat = csc_matrix((ratings_list,(users_list,movies_list)))

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
s = cosine_similarity(df_mat.T,dense_output=False)

In [15]:
# Positions -10..-2 of the ascending sort of row 0 of `s`: nine high-similarity
# entries, with the single largest one dropped (presumably movie 0 itself
# -- verify).
nearest_movies = np.argsort(s[0].todense()).tolist()[0][-10:-1]
# `isin` filters the rows but does not preserve the similarity ranking.
movies[movies['mid'].isin(nearest_movies)]

Unnamed: 0,movieId,title,genres,mean_rating,mid
328,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,3.7775,328
521,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,3.674419,521
883,1092,Basic Instinct (1992),Crime|Mystery|Thriller,3.294643,883
1866,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,3.609524,1866
2506,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,3.844,2506
3419,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,3.847701,3419
3803,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,3.884615,3803
4081,5349,Spider-Man (2002),Action|Adventure|Sci-Fi|Thriller,3.522388,4081
6892,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,4.235537,6892


In [16]:
# NOTE(review): this cell reads `nz`, which is only assigned in a cell further
# down the notebook, and it overwrites the ratings frame `df`. The recorded
# output below is the resulting NameError.
# Intent: count how often each row index occurs in `nz[0]` and keep the indices
# that occur exactly twice.
df = pd.DataFrame(nz[0],columns=['movies'])
df['ind'] = 1
ct = df.groupby('movies').count().reset_index()
ct = ct[ct['ind']==2]['movies']

NameError: name 'nz' is not defined

In [153]:
np.array(ct)

array([ 18,  84, 114, 127, 164, 181, 284, 441, 467, 495, 533, 547, 563,
       602, 606, 623, 640, 664], dtype=int64)

In [98]:
# Scratch exploration for one item pair (columns i and j of `df_mat`).
i = 1
j = 50
# `u` and `v` are not used again in this cell.
u = df_mat[:,i]
v = df_mat[:,j]
# Subtract each row's mean from the whole matrix (presumably this produces a
# dense result -- verify memory use).
temp_df = df_mat - df_mat.mean(axis=1)
df_pq = temp_df[:,[i,j]]
nz = df_pq.nonzero()
# Rows with a non-zero entry in at least one of the two columns.
nzu = np.unique(nz[0])
df_pq = df_pq[nzu,:]

In [183]:
df_p.dot(df_q.T)

matrix([[ 1.83066688]])

# Association Rule Learning

In [15]:
# Load ratings and attach per-user / per-movie statistics.
# A forward slash works as a separator on Windows and POSIX alike; the previous
# backslash only worked on Windows.
path = 'ml-latest-small/'
df = pd.read_csv(path + 'ratings.csv')
movie_xref = pd.read_csv(path + 'movies.csv')

# Per-user mean rating and a 0-based user index `uid`. The rating column is
# selected before aggregating so no other column is averaged.
users = df.groupby('userId')['rating'].mean().reset_index()
users = users.rename(columns={'rating':'user_mean_rating'})
users['uid'] = users.index
df = df.merge(users,on='userId')

# Per-movie mean rating, movie metadata and a 0-based movie index `mid`.
movies = df.groupby('movieId')['rating'].mean().reset_index()
movies = movies.rename(columns={'rating':'movie_mean_rating'})
movies = movies.merge(movie_xref,on='movieId')
movies['mid'] = movies.index
df = df.merge(movies,on='movieId')
# NOTE(review): a later cell reads df['adjusted_rating'], which is not created
# here -- confirm which mean (user or movie) it should be centred on.

In [20]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,user_mean_rating,uid,movie_mean_rating,title,genres,mid
0,1,31,2.5,1260759144,2.55,0,3.178571,Dangerous Minds (1995),Drama,30
1,7,31,3.0,851868750,3.465909,6,3.178571,Dangerous Minds (1995),Drama,30
2,31,31,4.0,1273541953,4.166667,30,3.178571,Dangerous Minds (1995),Drama,30
3,32,31,4.0,834828440,3.666667,31,3.178571,Dangerous Minds (1995),Drama,30
4,36,31,3.0,847057202,3.615385,35,3.178571,Dangerous Minds (1995),Drama,30


In [74]:
# Turn the ratings into 0/1 flags: 1 where the adjusted rating is positive.
users_list = df['uid']
movies_list = df['mid']
ratings_list = (df['adjusted_rating'] > 0) * 1

In [75]:
df_mat = csc_matrix((ratings_list,(users_list,movies_list)))

In [76]:
df_mat

<671x9066 sparse matrix of type '<class 'numpy.int32'>'
	with 100004 stored elements in Compressed Sparse Column format>

In [113]:
# Wrap the sparse user x movie matrix in a DataFrame for mlxtend's `apriori`.
# Columns are the integer `mid` positions; `rename` maps them back to titles.
# NOTE(review): pd.SparseDataFrame is a legacy pandas API -- confirm it exists
# in the pandas version in use before re-running.
#cols = movies.sort_values(by='mid')['title']
sdf = pd.SparseDataFrame(df_mat)#,columns=cols)
sdf.fillna(0,inplace=True)

In [114]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [115]:
frequent_itemsets = apriori(sdf, min_support=0.07)

In [141]:
def rename(x):
    '''Map an iterable of `mid` values to the set of matching movie titles.

    Looks each id up in the module-level `movies` frame and takes the first
    matching title.
    '''
    return {movies[movies['mid'] == i]['title'].values[0] for i in x}

In [143]:
frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(rename)

In [147]:
# Derive association rules from the frequent itemsets, keeping rules with
# lift >= 1, and show the ten rules with the highest lift.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.sort_values(by='lift',ascending=False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
21081,(Lord of the Rings: The Fellowship of the Ring...,"(Star Wars: Episode IV - A New Hope (1977), St...",0.092399,0.090909,0.070045,0.758065,8.33871,0.061645,3.757576
21140,"(Star Wars: Episode IV - A New Hope (1977), St...",(Lord of the Rings: The Fellowship of the Ring...,0.090909,0.092399,0.070045,0.770492,8.33871,0.061645,3.954545
21080,"(Star Wars: Episode IV - A New Hope (1977), Lo...",(Raiders of the Lost Ark (Indiana Jones and th...,0.107303,0.078987,0.070045,0.652778,8.264413,0.061569,2.652519
21141,(Raiders of the Lost Ark (Indiana Jones and th...,"(Star Wars: Episode IV - A New Hope (1977), Lo...",0.078987,0.107303,0.070045,0.886792,8.264413,0.061569,7.885494
21120,"(Star Wars: Episode IV - A New Hope (1977), St...",(Lord of the Rings: The Fellowship of the Ring...,0.111773,0.076006,0.070045,0.626667,8.244967,0.061549,2.474984
21101,(Lord of the Rings: The Fellowship of the Ring...,"(Star Wars: Episode IV - A New Hope (1977), St...",0.076006,0.111773,0.070045,0.921569,8.244967,0.061549,11.324888
21085,(Lord of the Rings: The Fellowship of the Ring...,"(Star Wars: Episode IV - A New Hope (1977), Lo...",0.080477,0.105812,0.070045,0.87037,8.225613,0.061529,6.89802
21136,"(Star Wars: Episode IV - A New Hope (1977), Lo...",(Lord of the Rings: The Fellowship of the Ring...,0.105812,0.080477,0.070045,0.661972,8.225613,0.061529,2.720256
21109,"(Star Wars: Episode IV - A New Hope (1977), Ra...",(Lord of the Rings: The Fellowship of the Ring...,0.076006,0.113264,0.070045,0.921569,8.136481,0.061436,11.305887
21112,(Lord of the Rings: The Fellowship of the Ring...,"(Star Wars: Episode IV - A New Hope (1977), Ra...",0.113264,0.076006,0.070045,0.618421,8.136481,0.061436,2.421502


# Linear Regression Approach

In [55]:
def pairwise_sim_user(df):
    '''Cosine similarity between every pair of rows of a sparse matrix.

    Parameters
    ----------
    df : scipy sparse matrix with one row per user.

    Returns
    -------
    Dense (n_rows x n_rows) matrix of similarities; for scipy `*_matrix`
    inputs this is a `numpy.matrix`, which callers index as `sim[user]` and
    expect a 2-D row back. Rows with no stored ratings have a zero norm and
    yield NaN.
    '''
    # Use the sparse `.dot` method: np.dot is not aware of scipy sparse
    # matrices and is not a supported way to multiply them.
    dot = df.dot(df.T)
    norm = np.sqrt(np.asarray(df.power(2).sum(axis=1))).ravel()
    # norm[i] * norm[j] for every pair of rows.
    norm = np.outer(norm,norm)
    return dot.todense()/norm

def pairwise_sim_item(df):
    '''Cosine similarity between every pair of columns (items) of `df`.

    Implemented by transposing and reusing the row-wise `pairwise_sim_user`.
    '''
    items_as_rows = df.T
    return pairwise_sim_user(items_as_rows)

def cost_function(x,df,user,user_avgs):
    '''Objective for `scipy.optimize.minimize`: squared error of the peer model.

    Parameters
    ----------
    x : 1-D array of peer weights, one per nearest user.
    df : sparse users x items rating matrix.
    user : row index of the target user.
    user_avgs : mapping/Series from user index to that user's mean rating.

    Returns
    -------
    Sum of squared differences between the user's known ratings and the
    predictions (a sum, not a mean, despite the historical `mse` name).
    '''
    # Identical computation to `cost_function_predict`; delegate so the two
    # cannot drift apart.
    return cost_function_predict(x,df,user,user_avgs)[0]

def cost_function_predict(x,df,user,user_avgs,sim_mat=None):
    '''Squared error and predictions of the weighted-peer rating model.

    Parameters
    ----------
    x : 1-D array of peer weights, one per nearest user.
    df : sparse users x items rating matrix.
    user : row index of the target user.
    user_avgs : mapping/Series from user index to that user's mean rating.
    sim_mat : optional precomputed user similarity matrix (as returned by
        `pairwise_sim_user`). Pass it when calling repeatedly, e.g. from an
        optimiser, to avoid rebuilding the full matrix on every evaluation.

    Returns
    -------
    (error, predict): the sum of squared errors over the items the user has
    rated, and the predicted rating for each of those items.
    '''
    if sim_mat is None:
        sim_mat = pairwise_sim_user(df)
    weights = x.reshape(-1,1)
    user_rated_items = df[user].nonzero()[1]
    user_ratings = df[user,user_rated_items].todense()
    # Up to ten most similar users; the last position of the ascending sort is
    # skipped (presumably the user itself -- verify).
    nearest_users = np.argsort(sim_mat[user])[0,-11:-1].tolist()[0]
    peer_set = df[nearest_users,:][:,user_rated_items].todense()
    peer_set = np.asarray(peer_set)
    # Weighted sum of the peers' ratings per item, offset by the user's mean.
    predict = np.sum(peer_set * weights,axis=0) + user_avgs[user]
    mse = (np.asarray((user_ratings - predict))**2).sum()
    return mse,predict

In [56]:
from scipy.sparse import csc_matrix

# Build the raw (unadjusted) sparse users x movies rating matrix.
df = load_movielens()
users_list = df['uid']
# Select `rating` before aggregating: the frame also holds the string columns
# `title` and `genres`, which cannot be averaged.
user_avgs = df.groupby('uid')['rating'].mean()
movies_list = df['mid']
ratings_list = df['rating']
df_mat = csc_matrix((ratings_list,(users_list,movies_list)))
# Dense pairwise cosine similarities between users and between items.
user_sim_mat = pairwise_sim_user(df_mat)
item_sim_mat = pairwise_sim_item(df_mat)

In [57]:
user = 0
# Column indices of the items this user has rated.
user_rated_items = df_mat[user].nonzero()[1]
# Use `user_sim_mat` from the previous cell; the name `sim_mat` is never
# defined at notebook level. The slice keeps five neighbours and skips the
# last position of the ascending sort.
nearest_users = np.argsort(user_sim_mat[user])[0,-6:-1].tolist()[0]

mean_user = user_avgs[user]

In [211]:
import numpy.random

# Random starting weights, one per peer; no seed is set, so every run differs.
weights = np.random.rand(10,1)
m = cost_function(weights,df_mat,0,user_avgs)

# NOTE(review): `user_ratings` and `peer_set_sum` are not assigned at notebook
# level in any earlier cell -- this line relies on leftover kernel state.
mse = (np.asarray((user_ratings - (peer_set_sum + mean_user)))**2).sum()

In [59]:
from scipy.optimize import minimize

In [60]:
res = minimize(cost_function,weights,args=(df_mat,0,user_avgs))

In [61]:
res.x

array([-0.13220403,  0.06584494, -0.16886173,  0.04312117,  0.26526064,
       -0.24415956,  0.18311145,  0.17980334, -0.18981619,  0.07055322])

In [62]:
m,predict = cost_function_predict(res.x,df_mat,0,user_avgs)

In [63]:
# Items rated by user 0 and the corresponding known ratings, for comparison
# with `predict`.
user = 0
user_rated_items = df_mat[user].nonzero()[1]
user_ratings = df_mat[user,user_rated_items].todense()

In [64]:
user_rated_items

array([  30,  833,  859,  906,  931, 1017, 1041, 1047, 1083, 1087, 1111,
       1140, 1515, 1665, 1708, 1743, 1815, 1962, 2380, 2925])

In [65]:
predict

array([ 3.27582478,  2.94789096,  2.79693627,  2.23533226,  3.07878142,
        2.67936351,  1.98250592,  2.41641587,  2.69480944,  2.04841556,
        2.39114609,  1.37015253,  2.67936351,  3.87115551,  2.22417518,
        2.74991673,  2.9200195 ,  3.20531344,  2.05823835,  2.23293907])

Joint interpolation with similarity weighting

In [17]:
df_mat.nonzero()

(array([  0,   0,   0, ..., 670, 670, 670], dtype=int32),
 array([  30,  833,  859, ..., 4597, 4610, 4696], dtype=int32))

In [87]:
def predict(df,user,user_avgs,item,weights):
    '''Predict `user`'s rating for `item` from the ratings of the nearest users.

    The prediction is the user's mean rating plus the weighted sum of the
    peers' ratings on `item`; `weights` holds one weight per peer.
    '''
    weight_col = weights.reshape(-1,1)
    sims = pairwise_sim_user(df)
    # Up to ten most similar users, skipping the last position of the sort.
    peer_ids = np.argsort(sims[user])[0,-11:-1].tolist()[0]
    peer_ratings = np.asarray(df[peer_ids,:][:,item].todense())
    return user_avgs[user] + np.sum(peer_ratings * weight_col,axis=0)

def predict_2(df,user,user_avgs,weights):
    '''Predict `user`'s rating for every item that user has already rated.

    Parameters
    ----------
    df : sparse users x items rating matrix.
    user : row index of the target user.
    user_avgs : mapping/Series from user index to that user's mean rating.
    weights : array with one weight per peer (any shape; it is flattened to a
        column).

    Returns
    -------
    1-D array with one prediction per rated item, in the order given by
    `df[user].nonzero()[1]`.
    '''
    sim_mat = pairwise_sim_user(df)
    weights = weights.reshape(-1,1)
    user_rated_items = df[user].nonzero()[1]
    # Up to ten most similar users, skipping the last position of the sort.
    nearest_users = np.argsort(sim_mat[user])[0,-11:-1].tolist()[0]
    peer_set = np.asarray(df[nearest_users,:][:,user_rated_items].todense())
    # The unused `user_ratings` lookup was dropped; it did not affect the result.
    return np.sum(peer_set * weights,axis=0) + user_avgs[user]

def cost(x,df,user,user_avgs):
    '''Item-similarity-weighted squared error used as an optimisation target.

    Parameters
    ----------
    x : array of peer weights, forwarded to `predict_2`.
    df : sparse users x items rating matrix.
    user : row index of the target user.
    user_avgs : mapping/Series from user index to that user's mean rating.

    Returns
    -------
    Sum over all ordered pairs (s, j), s != j, of the user's rated items of
    item_sim[s, j] * (rating[s] - prediction[j]) ** 2.
    '''
    # The user-user similarity matrix and the reshaped weights were computed
    # here but never used; both were removed.
    item_sim_mat = pairwise_sim_item(df)
    user_items = df[user,:].nonzero()[1]
    predict_array = predict_2(df,user,user_avgs,x)

    ret_sum = 0
    for s,s_mat in enumerate(user_items):
        # Known rating of item s; invariant across the inner loop.
        actual = df[user,s_mat]
        for j,j_mat in enumerate(user_items):
            if j == s:
                continue
            # NOTE(review): compares the rating of item s with the prediction
            # for item j -- confirm this matches the intended formula.
            ret_sum += item_sim_mat[s_mat,j_mat] * ((actual - predict_array[j])**2)
    return ret_sum
    

In [90]:
# Random starting weights (shape 1 x 10); no seed is set, so results vary
# between runs.
weights = np.random.rand(1,10)
item = 30
user = 0
#predict(df_mat,user,item,weights)

#p = predict_2(df_mat,user,user_avgs,weights)
# Evaluate the objective once at the starting point, then minimise it for
# user 0.
c = cost(weights,df_mat,user,user_avgs)
res = minimize(cost,weights,args=(df_mat,0,user_avgs))

In [93]:
predict_2(df_mat,user,user_avgs,res.x)

array([ 2.42138407,  2.46850263,  2.48422235,  2.50957646,  2.55512793,
        2.55769952,  2.63112906,  2.58749674,  2.45456597,  2.56820569,
        2.48547992,  2.58669232,  2.55769952,  2.3813789 ,  2.60098318,
        2.53890591,  2.57064596,  2.48686678,  2.61392384,  2.58537509])