In [1]:
import tensorflow as tf
import sys
print(sys.version)
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


In [2]:
class MF_RS():
    def __init__(self, numUsers, numSongs, embedding_dim, reg_lambda=0.01, conf_lambda=1.0, conf_dim = 1):
        
        #hyper parameters
        self.batch_size = np.min([200, numUsers, numSongs]);
        self.numUsers = numUsers
        self.numSongs = numSongs
        self.epochs = 5
        self.reg_lambda = reg_lambda
        self.conf_lambda = conf_lambda
        
        #embedding matricies for users and songs
        self.userMat = tf.Variable(tf.random_normal([numUsers, embedding_dim]))
        self.songMat = tf.Variable(tf.random_normal([numSongs, embedding_dim]))
        self.userBias = tf.Variable(tf.random_normal([numUsers]))
        self.songBias = tf.Variable(tf.random_normal([numSongs]))
        self.overallBias = tf.Variable(tf.random_normal([1]))
        if conf_dim > 0:
            self.C_user = tf.Variable(.1*tf.ones([numUsers, conf_dim]))
            self.C_song = tf.Variable(.1*tf.ones([numSongs, conf_dim]))
        
        #input tensors for songs, usres, ratings
        self.users = tf.placeholder(tf.int32, shape =(self.batch_size))
        self.songs = tf.placeholder(tf.int32, shape =(self.batch_size))
        self.rating = tf.placeholder(tf.float32, shape = (self.batch_size))
        
        #map each user/song to its feature vector
        self.U = tf.nn.embedding_lookup(self.userMat, self.users)
        self.W = tf.nn.embedding_lookup(self.songMat, self.songs)
        # bias
        self.U_bias = tf.nn.embedding_lookup(self.userBias, self.users)
        self.W_bias = tf.nn.embedding_lookup(self.songBias, self.songs)
        # confidence params
        if conf_dim > 0:
            self.C_ui = tf.maximum(0.0, tf.nn.embedding_lookup(self.C_user, self.users))
            self.C_sj = tf.maximum(0.0, tf.nn.embedding_lookup(self.C_song, self.songs))

        
        #predicted rating is dot product of user and song
        bias = self.U_bias+self.W_bias+self.overallBias
        pq = tf.reduce_sum(tf.mul(self.U, self.W), 1)
        self.yhat = pq + bias
            
        # l2 reg
        if conf_dim > 0:
            self.confidence_reg = self.conf_lambda * tf.reduce_sum(tf.exp(-self.C_ui) + tf.exp(-self.C_sj))
        self.l2_reg = self.reg_lambda * ( tf.reduce_sum((tf.square(self.U) + tf.square(self.W))) + 
                                         tf.reduce_sum(tf.square(self.U_bias) + tf.square(self.W_bias)))
        if conf_dim > 0:
            self.reg = self.l2_reg + self.confidence_reg
        else:
            self.reg = self.l2_reg
        if conf_dim > 0:
             self.error = tf.reduce_mean(tf.reduce_sum(self.C_ui * self.C_sj, 1) *
                                         tf.nn.l2_loss(self.yhat - self.rating))
        else:
            self.error = tf.reduce_mean(tf.nn.l2_loss(self.yhat - self.rating))
        self.cost = (self.error + self.reg)/1e7
        self.optimizer = tf.train.AdamOptimizer(learning_rate = .01).minimize(self.cost)
        
        self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())    
        
    def train(self, users, songs, ratings, verb = 0):
        
        for i in range(self.epochs):
            
            avg_cost = 0
            perm = np.random.permutation(len(ratings))
            num_batches = len(ratings) // self.batch_size
            
            for b_idx in range(num_batches):
                
                batch = perm[self.batch_size * b_idx:self.batch_size * (b_idx + 1)]
                users_batch = users[batch]
                songs_batch = songs[batch]
                ratings_batch = ratings[batch]
                                
                avg_cost += self.session.run([self.cost, self.optimizer],
                                  {self.users:users_batch, self.songs:songs_batch, self.rating:ratings_batch})[0]
            if verb > 0:
                print(avg_cost/num_batches)
    def test(self, users, songs):
        yhat = np.zeros(len(users))
        num_batches = len(users) // self.batch_size
        for b_idx in range(num_batches):
            batch = range(self.batch_size * b_idx,self.batch_size * (b_idx + 1))
            users_batch = users[batch]
            songs_batch = songs[batch]
            yhat[batch] = self.session.run([self.yhat],
                      {self.users:users_batch, self.songs:songs_batch})[0]
        batch = range(-self.batch_size,0)
        users_batch = users[batch]
        songs_batch = songs[batch]
        yhat[batch] = self.session.run([self.yhat],
                      {self.users:users_batch, self.songs:songs_batch})[0]
        return yhat
    def evaluate(self, users, songs, ratings):
        yhat = self.test(users, songs)
        return np.mean((yhat - ratings)**2)

In [4]:
a = np.array([1, 2, 3, 4, 5])
b = np.array([1, 2, 3, 4, 5])
c = np.array([4, 3, 2, 5, 1])
#unique users / songs
uni_a = np.unique(a)
uni_b = np.unique(b)

#dict mapping the id to an index
a_map = dict(zip(uni_a,range(len(uni_a))))
b_map = dict(zip(uni_b,range(len(uni_b))))

user_idx =  np.array([ a_map[user] for user in a])
song_idx =  np.array([ b_map[song] for song in b])
model = MF_RS(len(uni_a), len(uni_b), 7)
np.random.seed(2)
model.train(user_idx, song_idx, c)


In [5]:
movieratings = pd.read_csv('ratings.csv')

In [6]:
def getDfSummary(input_data):
    output_data = input_data.describe(include = 'all').T
    var = pd.DataFrame(data = {'nanvals': pd.Series(), 'number_distinct': pd.Series()})
    for i in range(len(input_data.columns)):
        nanvals = input_data.ix[:,i].isnull().sum()
        number_distinct = len(input_data.ix[:,i].value_counts())
        var = var.append(pd.DataFrame([[nanvals, number_distinct]], columns = ['nanvals', 'number_distinct']))
    var.index = output_data.index.values
    output_data['nanvals'] = var['nanvals']
    output_data['number_distinct'] = var['number_distinct']
    return output_data
output_data = getDfSummary(movieratings)

In [7]:
users = movieratings.ix[:,0].values
songs = movieratings.ix[:,1].values
ratings = movieratings.ix[:,2].values

#unique users / songs
uni_users = movieratings['userId'].unique()
uni_songs = movieratings['movieId'].unique()

#dict mapping the id to an index
user_map = dict(zip(uni_users,range(len(uni_users))))
song_map = dict(zip(uni_songs,range(len(uni_songs))))

user_idx =  np.array([ user_map[user] for user in users])
song_idx =  np.array([ song_map[song] for song in songs])

print(len(uni_users),len(uni_songs))

perm = np.random.permutation(len(users))
trn_idx = perm[:(len(users)*2)//3]
val_idx = perm[(len(users)*2)//3:]
user_idx_trn, song_idx_trn, ratings_trn = user_idx[trn_idx], song_idx[trn_idx], ratings[trn_idx]
user_idx_val, song_idx_val, ratings_val = user_idx[val_idx], song_idx[val_idx], ratings[val_idx]

671 9066


In [8]:
songmodel = MF_RS(len (uni_users), len(uni_songs), 30, reg_lambda=0.001, conf_lambda=1000, conf_dim = 1)

In [9]:
songmodel.evaluate(user_idx_val, song_idx_val, ratings_val)

48.707730090315792

In [10]:
np.random.seed(1)
songmodel.train(user_idx, song_idx, ratings)

In [11]:
songmodel.evaluate(user_idx_val, song_idx_val, ratings_val)

0.68658300424748842

In [12]:
songmodel = MF_RS(len (uni_users), len(uni_songs), 30, reg_lambda=0.001, conf_lambda=1000, conf_dim = 3)
print("accuracy before training", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))
np.random.seed(1)
songmodel.train(user_idx_trn, song_idx_trn, ratings_trn)
print("accuracy after training", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))


accuracy before training 53.5982724306
accuracy after training 2.70218726726


In [13]:
for edim in [10, 30, 50]:
    for d in [1, 2]:
        songmodel = MF_RS(len (uni_users), len(uni_songs), edim, reg_lambda=0.001, conf_lambda=1000, conf_dim = d)
        print("accuracy before training", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))
        np.random.seed(1)
        songmodel.train(user_idx_trn, song_idx_trn, ratings_trn)
        print("accuracy after training with edim ", edim, " and confidence dim ", d, ": ", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))


accuracy before training 21.7213648008
accuracy after training with edim  10  and confidence dim  1 :  1.24421486968
accuracy before training 31.8189235567
accuracy after training with edim  10  and confidence dim  2 :  1.24135128665
accuracy before training 47.165054391
accuracy after training with edim  30  and confidence dim  1 :  2.86092991239
accuracy before training 40.5815771732
accuracy after training with edim  30  and confidence dim  2 :  2.6936164997
accuracy before training 60.4461218798
accuracy after training with edim  50  and confidence dim  1 :  5.47256563569
accuracy before training 78.4892667578
accuracy after training with edim  50  and confidence dim  2 :  5.28099873171


In [14]:
for edim in [10, 30, 50]:
    songmodel = MF_RS(len (uni_users), len(uni_songs), edim, reg_lambda=0.001, conf_lambda=1000, conf_dim = 0)
    print("accuracy before training", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))
    np.random.seed(1)
    songmodel.train(user_idx_trn, song_idx_trn, ratings_trn)
    print("accuracy after training with edim ", edim, " and no confidence: ", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))


accuracy before training 22.7529616916
accuracy after training with edim  10  and no confidence:  1.48182291882
accuracy before training 52.6123953132
accuracy after training with edim  30  and no confidence:  4.79149006172
accuracy before training 92.6781853238
accuracy after training with edim  50  and no confidence:  11.1415986738


In [15]:
for edim in [10, 30, 60, 100]:
    for d in [1, 3, 10, 20]:
        songmodel = MF_RS(len (uni_users), len(uni_songs), edim, reg_lambda=0.001, conf_lambda=1000, conf_dim = d)
        print("accuracy before training", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))
        np.random.seed(1)
        songmodel.train(user_idx_trn, song_idx_trn, ratings_trn)
        print("accuracy after training with edim ", edim, " and confidence dim ", d, ": ", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))


accuracy before training 22.408170338
accuracy after training with edim  10  and confidence dim  1 :  1.24992822291
accuracy before training 25.2960808685
accuracy after training with edim  10  and confidence dim  3 :  1.2300610421
accuracy before training 48.6158868403
accuracy after training with edim  10  and confidence dim  10 :  1.29734930919
accuracy before training 23.9217704084
accuracy after training with edim  10  and confidence dim  20 :  1.28554023228
accuracy before training 51.0069626578
accuracy after training with edim  30  and confidence dim  1 :  2.9090727434
accuracy before training 45.1859372618
accuracy after training with edim  30  and confidence dim  3 :  2.72715812505
accuracy before training 55.8390342814
accuracy after training with edim  30  and confidence dim  10 :  2.6688337079
accuracy before training 45.957914889
accuracy after training with edim  30  and confidence dim  20 :  2.53673417528
accuracy before training 67.0480999159
accuracy after training wi

KeyboardInterrupt: 