# Latent Factor Models for Collaborative Filtering

Load Pandas, we are going to need it for manipulating data

In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

Now load the data

In [2]:
user_ratings_df = pd.read_csv("user_ratings.csv")
user_features_df = pd.read_csv("user_features.csv")
item_features_df = pd.read_csv("item_features.csv")


user_features_df["key"] = 0
user_features_df["user_id"] = range(0,user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(0,item_features_df.shape[0])
print user_ratings_df.head()
print user_features_df.head()
print item_features_df.head()

merged_df = pd.merge(user_features_df, item_features_df,left_index=True,on="key")
merged_df[["item_id", "user_id"]]


print merged_df.head()

merged_df["rating"] = map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples())

train = merged_df.dropna()

test = merged_df[merged_df.isnull().any(axis=1)]

print test.to_latex()



   The Call of Cthulhu   Frankenstein   Dracula   Neuromancer   Space Odyssey
0                  8.0            2.0       NaN           5.0             4.0
1                  3.0            2.0       NaN           7.0             7.0
2                  9.0            NaN       7.0           8.0             5.0
3                  NaN            NaN       7.0           8.0             9.0
4                  NaN            1.0       8.0           3.0             7.0
   Sex   Over60  key  user_id
0  1.0      0.0    0        0
1  0.0      1.0    0        1
2  0.0      0.0    0        2
3  1.0      0.0    0        3
4  0.0      1.0    0        4
   Critic0   Critic1  key  item_id
0      0.3       0.9    0        0
1      0.9       0.3    0        1
2      0.6       0.4    0        2
3      0.2       0.1    0        3
4      0.7       0.8    0        4
   Sex   Over60  key  user_id  Critic0   Critic1  item_id
0  1.0      0.0    0        0      0.3       0.9        0
1  1.0      0.0    0      

In [4]:
n_latent_features = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_latent_features))

user_features = user_features_df.values
item_features = item_features_df.values

print item_features_df.to_latex()


user_features = np.concatenate([np.ones(shape = (user_features.shape[0],1)), user_features], axis = 1)
item_features = np.concatenate([np.ones(shape = (item_features.shape[0],1)), item_features], axis = 1)



user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1] ))
item_features_weights = np.random.random((user_ratings.shape[1],item_features.shape[1] ))



# print user_features

\begin{tabular}{lrrrr}
\toprule
{} &  Critic0 &   Critic1 &  key &  item\_id \\
\midrule
0 &      0.3 &       0.9 &    0 &        0 \\
1 &      0.9 &       0.3 &    0 &        1 \\
2 &      0.6 &       0.4 &    0 &        2 \\
3 &      0.2 &       0.1 &    0 &        3 \\
4 &      0.7 &       0.8 &    0 &        4 \\
\bottomrule
\end{tabular}



In [5]:
def predict_rating(user_id,item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print user_preference.dot(item_preference), user_score, item_score
    return user_preference.dot(item_preference) + user_score + item_score

def train(user_id, item_id, rating,alpha = 0.001, 
                                   latent_feature_weight_decay = 0.1, 
                                   user_weight_decay = 0.01,
                                   item_weight_decay = 0.0001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  ( prediction_rating - rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err *  ( latent_item_features[item_id] + latent_feature_weight_decay*latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * ( user_pref_values + latent_feature_weight_decay*latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err *(  user_features[user_id] + user_weight_decay* user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * ( item_features_weights[item_id] + item_weight_decay* item_features_weights[item_id])
    
    
    return err
    


def sgd(iterations = 30000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        error = []
        for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if(not np.isnan(rating)):
                    err = train(user_id,item_id,rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print mse
                    
                    
    
                    
                    
    


In [6]:
for _ in range(0,10): 
    sgd()

0.282365149806
0.277316788591
0.275765898638
0.274994491683
0.274532470124
0.274228559206
0.274018512263
0.273870242834
0.273765938628
0.273694969315


In [7]:
predictions = np.zeros(shape = (latent_user_preferences.shape[0], latent_item_features.shape[0]) )
#print latent_user_preferences
print user_features_weights
print item_features_weights
for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                predictions[user_id,item_id] =  predict_rating(user_id,item_id)

  

[[ 1.591  2.076  0.52   0.292  0.885]
 [-0.345  0.263 -0.511  0.125 -0.388]
 [ 0.932  0.501  0.349  0.734  1.696]
 [ 0.422  1.185  0.519  0.094  1.1  ]
 [ 0.045  0.64   0.42   0.975  0.297]
 [-0.171  0.532  0.214  0.025 -1.78 ]
 [ 0.708  0.208  0.761  0.735  0.002]
 [-0.047  0.274  0.118  0.373  0.344]
 [-0.012  0.614 -0.089  0.237 -0.164]
 [ 0.387  0.641  0.598  0.686 -0.101]]
[[ 1.984  2.486  1.931  2.358  2.87 ]
 [ 0.161  0.515  0.448  0.314  0.171]
 [ 0.069  0.098  0.667  0.594  0.647]
 [ 0.013  0.026  1.451  0.798  1.339]
 [ 0.074  0.138  0.006  0.175  0.082]]


In [8]:
values = [zip(user_ratings[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = data.columns
comparison_data.applymap(lambda (x,y): "(%2.3f|%2.3f)"%(x,y))

NameError: name 'data' is not defined

In [8]:
comparison_data

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 8.00753213201)","(2.0, 1.68132476702)","(nan, -0.816458005082)","(5.0, 5.24534140815)","(4.0, 4.07284713322)"
1,"(3.0, 2.9651437535)","(2.0, 2.77574260418)","(nan, 20.371555042)","(7.0, 6.42856542196)","(7.0, 6.82402220025)"
2,"(9.0, 9.17066467322)","(nan, 3.72857419036)","(7.0, 7.05097693395)","(8.0, 7.74902903692)","(5.0, 5.02693050564)"
3,"(nan, 8.89059466545)","(nan, 4.80400274168)","(7.0, 7.00071427519)","(8.0, 8.00026744083)","(9.0, 9.00146942092)"
4,"(nan, 2.04931531603)","(1.0, 0.605664829651)","(8.0, 7.99524753077)","(3.0, 3.31531949311)","(7.0, 7.09230434669)"
5,"(2.0, 2.01498637399)","(3.0, 2.96545438759)","(5.0, 5.00171715631)","(nan, 3.70906735537)","(nan, 16.1046429873)"
6,"(4.0, 4.10071806365)","(2.0, 0.584157794895)","(nan, -0.615769387347)","(2.0, 2.9904970354)","(7.0, 7.3316935937)"
7,"(7.0, 6.92484500343)","(1.0, 2.88968696942)","(2.0, 2.01830509523)","(7.0, 5.59586294301)","(9.0, 8.57455552152)"
8,"(3.0, 3.05046895637)","(3.0, 2.52948108941)","(nan, 26.4577215043)","(7.0, 7.30250562055)","(3.0, 3.11091030954)"
9,"(4.0, 3.7436458588)","(nan, -0.186539411277)","(5.0, 4.92523954912)","(3.0, 3.37640884275)","(3.0, 2.95913673392)"


In [9]:
d = comparison_data.to_latex()
text_file = open("comparison.txt", "w")
text_file.write(d)
text_file.close()