In [1]:
import tensorflow as tf
import sys
print(sys.version)
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


In [2]:
class MF_RS():
    def __init__(self, numUsers, numSongs, embedding_dim, reg_lambda=0.01, conf_lambda=1.0, conf_dim = 1):
        
        #hyper parameters
        self.batch_size = np.min([200, numUsers, numSongs]);
        self.numUsers = numUsers
        self.numSongs = numSongs
        self.epochs = 50
        self.reg_lambda = reg_lambda
        self.conf_lambda = conf_lambda
        
        #embedding matricies for users and songs
        self.userMat = tf.Variable(tf.random_normal([numUsers, embedding_dim]))
        self.songMat = tf.Variable(tf.random_normal([numSongs, embedding_dim]))
        self.userBias = tf.Variable(tf.random_normal([numUsers]))
        self.songBias = tf.Variable(tf.random_normal([numSongs]))
        self.overallBias = tf.Variable(tf.random_normal([1]))
        self.C_user = tf.Variable(.1*tf.ones([numUsers, conf_dim]))
        self.C_song = tf.Variable(.1*tf.ones([numSongs, conf_dim]))
        
        #input tensors for songs, usres, ratings
        self.users = tf.placeholder(tf.int32, shape =(self.batch_size))
        self.songs = tf.placeholder(tf.int32, shape =(self.batch_size))
        self.rating = tf.placeholder(tf.float32, shape = (self.batch_size))
        
        #map each user/song to its feature vector
        self.U = tf.nn.embedding_lookup(self.userMat, self.users)
        self.W = tf.nn.embedding_lookup(self.songMat, self.songs)
        # bias
        self.U_bias = tf.nn.embedding_lookup(self.userBias, self.users)
        self.W_bias = tf.nn.embedding_lookup(self.songBias, self.songs)
        # confidence params
        self.C_ui = tf.maximum(0.0, tf.nn.embedding_lookup(self.C_user, self.users))
        self.C_sj = tf.maximum(0.0, tf.nn.embedding_lookup(self.C_song, self.songs))

        
        #predicted rating is dot product of user and song
        bias = self.U_bias+self.W_bias+self.overallBias
        pq = tf.reduce_sum(tf.mul(self.U, self.W), 1)
        self.yhat = pq + bias
            
        # l2 reg
        self.confidence_reg = self.conf_lambda * tf.reduce_sum(tf.exp(-self.C_ui) + tf.exp(-self.C_sj))
        self.l2_reg = self.reg_lambda * ( tf.reduce_sum((tf.square(self.U) + tf.square(self.W))) + 
                                         tf.reduce_sum(tf.square(self.U_bias) + tf.square(self.W_bias)))
        self.reg = self.confidence_reg + self.l2_reg
        self.error = tf.reduce_mean(tf.reduce_sum(self.C_ui * self.C_sj, 1) * tf.nn.l2_loss(self.yhat - self.rating))
        self.cost = (self.error + self.reg)/1e7
        self.optimizer = tf.train.AdamOptimizer(learning_rate = .01).minimize(self.cost)
        
        self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())    
        
    def train(self, users, songs, ratings):
        
        for i in range(self.epochs):
            
            avg_cost = 0
            perm = np.random.permutation(len(ratings))
            num_batches = len(ratings) // self.batch_size
            
            for b_idx in range(num_batches):
                
                batch = perm[self.batch_size * b_idx:self.batch_size * (b_idx + 1)]
                users_batch = users[batch]
                songs_batch = songs[batch]
                ratings_batch = ratings[batch]
                                
                avg_cost += self.session.run([self.cost, self.optimizer],
                                  {self.users:users_batch, self.songs:songs_batch, self.rating:ratings_batch})[0]
                
            print(avg_cost/num_batches)
    def test(self, users, songs):
        yhat = np.zeros(len(users))
        num_batches = len(users) // self.batch_size
        for b_idx in range(num_batches):
            batch = range(self.batch_size * b_idx,self.batch_size * (b_idx + 1))
            users_batch = users[batch]
            songs_batch = songs[batch]
            yhat[batch] = self.session.run([self.yhat],
                      {self.users:users_batch, self.songs:songs_batch})[0]
        batch = range(-self.batch_size,0)
        users_batch = users[batch]
        songs_batch = songs[batch]
        yhat[batch] = self.session.run([self.yhat],
                      {self.users:users_batch, self.songs:songs_batch})[0]
        return yhat
    def evaluate(self, users, songs, ratings):
        yhat = self.test(users, songs)
        return np.mean((yhat - ratings)**2)

In [3]:
a = np.array([1, 2, 3, 4, 5])
b = np.array([1, 2, 3, 4, 5])
c = np.array([4, 3, 2, 5, 1])
#unique users / songs
uni_a = np.unique(a)
uni_b = np.unique(b)

#dict mapping the id to an index
a_map = dict(zip(uni_a,range(len(uni_a))))
b_map = dict(zip(uni_b,range(len(uni_b))))

user_idx =  np.array([ a_map[user] for user in a])
song_idx =  np.array([ b_map[song] for song in b])
model = MF_RS(len(uni_a), len(uni_b), 7)
np.random.seed(2)
model.train(user_idx, song_idx, c)


1.03209652025e-06
1.03125591977e-06
1.03020602182e-06
1.02907449673e-06
1.02793478618e-06
1.02683520709e-06
1.0258098655e-06
1.02488138509e-06
1.02406374936e-06
1.02336423424e-06
1.02278386294e-06
1.0223186564e-06
1.02196077023e-06
1.02169906313e-06
1.02151989267e-06
1.02140791114e-06
1.02134742974e-06
1.02132310076e-06
1.02131991753e-06
1.02132469237e-06
1.02132628399e-06
1.02131571111e-06
1.02128683466e-06
1.02123522083e-06
1.02115961909e-06
1.02105980204e-06
1.02093770238e-06
1.02079661701e-06
1.02064029761e-06
1.02047295059e-06
1.0202992371e-06
1.0201230225e-06
1.01994794477e-06
1.01977673239e-06
1.01961177279e-06
1.01945443021e-06
1.01930493202e-06
1.01916316453e-06
1.01902821825e-06
1.01889918369e-06
1.01877412817e-06
1.01865157376e-06
1.01852947409e-06
1.01840691968e-06
1.01828197785e-06
1.01815408016e-06
1.01802220343e-06
1.01788612028e-06
1.01774571704e-06
1.01760122106e-06


In [4]:
movieratings = pd.read_csv('ratings.csv')

In [5]:
def getDfSummary(input_data):
    output_data = input_data.describe(include = 'all').T
    var = pd.DataFrame(data = {'nanvals': pd.Series(), 'number_distinct': pd.Series()})
    for i in range(len(input_data.columns)):
        nanvals = input_data.ix[:,i].isnull().sum()
        number_distinct = len(input_data.ix[:,i].value_counts())
        var = var.append(pd.DataFrame([[nanvals, number_distinct]], columns = ['nanvals', 'number_distinct']))
    var.index = output_data.index.values
    output_data['nanvals'] = var['nanvals']
    output_data['number_distinct'] = var['number_distinct']
    return output_data
output_data = getDfSummary(movieratings)

In [6]:
users = movieratings.ix[:,0].values
songs = movieratings.ix[:,1].values
ratings = movieratings.ix[:,2].values

#unique users / songs
uni_users = movieratings['userId'].unique()
uni_songs = movieratings['movieId'].unique()

#dict mapping the id to an index
user_map = dict(zip(uni_users,range(len(uni_users))))
song_map = dict(zip(uni_songs,range(len(uni_songs))))

user_idx =  np.array([ user_map[user] for user in users])
song_idx =  np.array([ song_map[song] for song in songs])

print(len(uni_users),len(uni_songs))

perm = np.random.permutation(len(users))
trn_idx = perm[:(len(users)*2)//3]
val_idx = perm[(len(users)*2)//3:]
user_idx_trn, song_idx_trn, ratings_trn = user_idx[trn_idx], song_idx[trn_idx], ratings[trn_idx]
user_idx_val, song_idx_val, ratings_val = user_idx[val_idx], song_idx[val_idx], ratings[val_idx]

671 9066


In [7]:
songmodel = MF_RS(len (uni_users), len(uni_songs), 11, reg_lambda=0.001, conf_lambda=1000, conf_dim = 3)

In [8]:
songmodel.evaluate(user_idx_val, song_idx_val, ratings_val)

21.636080223107992

In [9]:
np.random.seed(1)
songmodel.train(user_idx, song_idx, ratings)

0.0532136166245
0.0218381272312
0.013121736791
0.00896964817308
0.00656625431217
0.00501724747056
0.00395213423762
0.003185456309
0.00262003202317
0.00219366226182
0.00186793873995
0.00161467132298
0.00141773087834
0.00126083238982
0.00113771994179
0.00103759308718
0.000958210394252
0.000896041884087
0.000844651742722
0.000802496855962
0.000767920808517
0.000739893552382
0.000717573277885
0.00069671014999
0.000682119958801
0.000667308451317
0.000657778616529
0.000648247414269
0.000639579784009
0.000632184081827
0.000627973630035
0.000621331769216
0.000617766687414
0.000613968412683
0.000611095111177
0.000607407675765
0.000606091080117
0.000603447377158
0.000602204355178
0.000601153037976
0.00059729701106
0.00059641037113
0.000595267835015
0.000594353262044
0.000593454447458
0.00059232681233
0.000590962748567
0.000590328781807
0.000590229666035
0.000589308253955


In [10]:
songmodel.evaluate(user_idx_val, song_idx_val, ratings_val)

0.28559138800035