# Implementación de una RBM (máquina de Boltzmann restringida) para recomendación de películas

Resultados de referencia:

http://www.mymedialite.net/examples/datasets.html
http://people.ischool.berkeley.edu/~nakov/selected_papers_list/ICML2013_CF_Botzmann_poster.pdf

Entrenamiento de la RBM:

https://www.cs.toronto.edu/~hinton/absps/guideTR.pdf

Baselines:

https://www.netflixprize.com/assets/GrandPrize2009_BPC_BellKor.pdf

Código de referencia en Theano:

https://github.com/felipecruz/CFRBM

Información sobre la competencia original:

https://en.wikipedia.org/wiki/Netflix_Prize


In [6]:
import numpy as np
def outer(x, y):
    """Return the row-wise (batched) outer product of two 2-D arrays.

    Row ``b`` of ``x`` is multiplied against row ``b`` of ``y`` through
    broadcasting, so inputs shaped ``(batch, n)`` and ``(batch, m)`` produce
    a ``(batch, n, m)`` result.
    """
    # ``None`` is the same object as ``np.newaxis``.
    column_form = x[:, :, None]
    row_form = y[:, None, :]
    return column_form * row_form

class CFRBM:
    """Restricted Boltzmann machine for collaborative filtering, in NumPy.

    The model holds a weight matrix of shape ``(num_visible, num_hidden)``
    and one bias vector per layer.  ``sample_visible`` treats the visible
    layer as consecutive groups of ``k`` units and normalises each group so
    that its activations sum to one.
    """

    def _logistic(self, x):
        """Return the element-wise logistic sigmoid ``1 / (1 + exp(-x))``."""
        return 1.0 / (1 + np.exp(-x))

    def __init__(self, num_visible, num_hidden, initial_v=None,
                 initial_weigths=None, debug=False):
        """Create the model, either from saved parameters or at random.

        Args:
            num_visible: number of visible units.
            num_hidden: number of hidden units.
            initial_v: optional initial visible bias (list or array).  Only
                used when ``initial_weigths`` is not given; ``None`` or an
                empty sequence selects an all-zero bias.
            initial_weigths: optional path prefix; when truthy, parameters
                are loaded from ``<prefix>.W.npy``, ``<prefix>.h.npy`` and
                ``<prefix>.b.npy``.  (The misspelt keyword is kept so that
                existing callers keep working.)
            debug: accepted for interface compatibility; not used.
        """
        self.dim = (num_visible, num_hidden)
        self.num_visible = num_visible
        self.num_hidden = num_hidden

        if initial_weigths:
            initial_weights = np.load('{}.W.npy'.format(initial_weigths))
            initial_hbias = np.load('{}.h.npy'.format(initial_weigths))
            initial_vbias = np.load('{}.b.npy'.format(initial_weigths))
        else:
            initial_weights = np.array(np.random.normal(0, 0.1, size=self.dim),
                                       dtype=np.float32)
            initial_hbias = np.zeros(num_hidden, dtype=np.float32)

            # Test against None/size explicitly: the truth value of a
            # multi-element NumPy array is ambiguous and raises ValueError.
            if initial_v is not None and np.size(initial_v) > 0:
                initial_vbias = np.array(initial_v, dtype=np.float32)
            else:
                initial_vbias = np.zeros(num_visible, dtype=np.float32)

        self.weights = initial_weights
        self.vbias = initial_vbias
        self.hbias = initial_hbias
        # Gradients of the previous update, consumed by the momentum term.
        self.prev_gw = np.zeros(shape=self.dim, dtype=np.float32)
        self.prev_gh = np.zeros(num_hidden, dtype=np.float32)
        self.prev_gv = np.zeros(num_visible, dtype=np.float32)

    def prop_up(self, vis):
        """Return hidden activation probabilities for visible batch ``vis``."""
        return self._logistic(np.dot(vis, self.weights) + self.hbias)

    def sample_hidden(self, vis):
        """Sample binary hidden states for ``vis``.

        Returns:
            ``(sample, activations)``: a boolean array drawn by comparing the
            activations with uniform noise, and the activations themselves.
        """
        activations = self.prop_up(vis)
        h1_sample = activations > np.random.rand(*activations.shape)
        return h1_sample, activations

    def prop_down(self, h):
        """Return visible activation probabilities for hidden batch ``h``."""
        return self._logistic(np.dot(h, self.weights.T) + self.vbias)

    def sample_visible(self, h, k=5):
        """Sample visible states for ``h``, normalising groups of ``k`` units.

        Each run of ``k`` consecutive visible units is divided by the sum of
        its activations, so every group sums to one.

        Returns:
            ``(sample, activations)``: a boolean array drawn by comparing the
            normalised activations with uniform noise, and the normalised
            activations.
        """
        activations = self.prop_down(h)
        # Per-group totals, repeated k times so they line up with the units.
        group_sums = activations.reshape((-1, k)).sum(axis=1).reshape((-1, 1))
        partitions = (group_sums * np.ones(k)).reshape(activations.shape)
        activations = activations / partitions
        v1_sample = activations > np.random.rand(*activations.shape)
        return v1_sample, activations

    def contrastive_divergence_1(self, v1):
        """Run one Gibbs step: ``v1 -> h1 -> v2 -> h2``.

        Returns:
            ``(v1, h1, v2, v2a, h2, h2a)`` where the ``a`` suffix marks
            activation probabilities and the rest are samples (``v1`` is the
            input, passed through unchanged).
        """
        h1, _ = self.sample_hidden(v1)
        v2, v2a = self.sample_visible(h1)
        h2, h2a = self.sample_hidden(v2)
        return (v1, h1, v2, v2a, h2, h2a)

    def gradient(self, v1, h1, v2, h2p, masks):
        """Return batch-averaged gradients ``(gw, gv, gh)``.

        ``masks`` has the same shape as ``v1`` and zeroes the contribution of
        visible units that should not be trained on.  Averages are taken over
        axis 0 (the batch axis).
        """
        v1h1_mask = outer(masks, h1)
        gw = ((outer(v1, h1) * v1h1_mask) - (outer(v2, h2p) * v1h1_mask)).mean(axis=0)
        gv = ((v1 * masks) - (v2 * masks)).mean(axis=0)
        gh = (h1 - h2p).mean(axis=0)
        return (gw, gv, gh)

    def cdk_fun(self, vis, masks, k=1, w_lr=0.000021, v_lr=0.000025,
                h_lr=0.000025, decay=0.0000, momentum=0.0):
        """Apply one CD-k parameter update for the batch ``vis``.

        Args:
            vis: visible batch.
            masks: mask with the same shape as ``vis`` (see ``gradient``).
            k: number of Gibbs steps.
            w_lr, v_lr, h_lr: learning rates for the weights, the visible
                bias and the hidden bias.
            decay: weight-decay factor; a falsy value disables decay.
            momentum: factor applied to the previous gradients.
        """
        v1, h1, v2, v2a, h2, h2a = self.contrastive_divergence_1(vis)

        # Only the negative-phase end of the chain advances for the
        # remaining k-1 steps.  v1/h1 stay tied to the data so that the
        # positive statistics (and the masks) still refer to the input batch.
        for _ in range(k - 1):
            _, _, v2, v2a, h2, h2a = self.contrastive_divergence_1(v2)

        (W, V, H) = self.gradient(v1, h1, v2, h2a, masks)

        if decay:
            W = W - decay * self.weights

        # NOTE(review): the momentum term uses the previous raw gradient and
        # is not scaled by the learning rate; kept as-is to preserve the
        # training behaviour -- confirm this is intended.
        self.weights = self.weights + (momentum * self.prev_gw) + (W * w_lr)
        self.vbias = self.vbias + (momentum * self.prev_gv) + (V * v_lr)
        self.hbias = self.hbias + (momentum * self.prev_gh) + (H * h_lr)
        self.prev_gw = W
        self.prev_gh = H
        self.prev_gv = V

    def predict(self, v1):
        """Return normalised visible activations after one pass through the
        hidden layer (hidden states are sampled, so results are stochastic)."""
        h1, _ = self.sample_hidden(v1)
        _, v2a = self.sample_visible(h1)
        return v2a


In [7]:
def _schedule_index(epoch, epochs, schedule):
    """Return the index into ``schedule`` that applies at ``epoch``.

    The schedule is spread evenly over ``epochs``; ``-1`` (the last entry)
    is returned if the computed position falls past the end.
    """
    position = epoch / (epochs / len(schedule))
    if position < len(schedule):
        return int(position)
    return -1


def _encode_batch(batch, size, profiles, movie_index, n_movies):
    """Build the expanded rating matrix and the observation mask for a batch.

    Returns ``(vis, vmasks)``, both shaped ``(size, n_movies * 5)``.  Users
    without an entry in ``profiles`` get an all-zero row and mask.
    """
    # Five visible units per movie, matching the ``len(all_movies) * 5``
    # width of the model; presumably ``expand`` one-hot encodes ratings into
    # five levels -- confirm in utils.
    levels = 5
    width = n_movies * levels

    examples = []
    masks = []
    for userid in batch:
        user_profile = [0.] * n_movies
        mask = [0] * width

        # ``.get`` instead of indexing: ``profiles`` is a defaultdict and
        # indexing would silently insert users that have no training data.
        for movie_id, rat in profiles.get(userid, ()):
            if movie_id not in movie_index:
                raise ValueError(
                    '{!r} is not in all_movies'.format(movie_id))
            col = movie_index[movie_id]
            user_profile[col] = rat
            start = levels * col
            mask[start:start + levels] = [1] * levels

        examples.append(expand(np.array([user_profile])).astype('float32'))
        masks.append(mask)

    vis = np.array(examples).reshape(size, width)
    vmasks = np.array(masks).reshape(size, width).astype('float32')
    return vis, vmasks


def run(name, dataset, config, all_movies, all_users, test, initial_v, sep):
    """Train a CFRBM on ``dataset`` and evaluate it after every epoch.

    Epoch 0 performs no training, so its metrics describe the freshly
    initialised model.  After each epoch the accumulated results are written
    to ``<config name>_<name>.json`` (overwriting the previous file) and a
    summary line is printed.

    Args:
        name: experiment name, used in the output file name.
        dataset: path of the training file; every line holds
            ``user, movie, rating, timestamp`` separated by ``sep``.
        config: dict with the keys ``name``, ``number_hidden``, ``epochs``,
            ``ks``, ``momentums``, ``l_w``, ``l_v``, ``l_h``, ``decay`` and
            ``batch_size``.  ``ks``, ``momentums`` and the three learning
            rate entries are schedules (sequences) spread evenly over the
            epochs.
        all_movies: sequence of movie ids; a movie's position determines its
            block of visible units.
        all_users: not used by this function.
        test: mapping from user id to an iterable of ``(movie, rating)``
            pairs used for evaluation.
        initial_v: not used by this function.
        sep: field separator of ``dataset``.
    """
    config_name = config['name']
    number_hidden = config['number_hidden']
    epochs = config['epochs']
    ks = config['ks']
    momentums = config['momentums']
    l_w = config['l_w']
    l_v = config['l_v']
    l_h = config['l_h']
    decay = config['decay']
    batch_size = config['batch_size']

    config_result = config.copy()
    config_result['results'] = []

    n_movies = len(all_movies)
    # One-off id -> position table replacing repeated O(n) ``list.index``
    # calls; ``setdefault`` keeps the first position, like ``index`` does.
    movie_index = {}
    for position, movie_id in enumerate(all_movies):
        movie_index.setdefault(movie_id, position)

    rbm = CFRBM(n_movies * 5, number_hidden)
    profiles = defaultdict(list)
    with open(dataset, 'rt') as data:
        for line in data:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            uid, mid, rat, _timestamp = line.split(sep)
            profiles[uid].append((mid, float(rat)))

    print("Users and ratings loaded")

    for j in range(epochs):
        k = ks[_schedule_index(j, epochs, ks)]
        momentum = momentums[_schedule_index(j, epochs, momentums)]
        current_l_w = l_w[_schedule_index(j, epochs, l_w)]
        current_l_v = l_v[_schedule_index(j, epochs, l_v)]
        current_l_h = l_h[_schedule_index(j, epochs, l_h)]

        for batch in chunker(profiles.keys(), batch_size):
            # Epoch 0 only measures the untrained model, so encoding the
            # batch would be wasted work; the progress dot is still printed.
            if j > 0:
                size = min(len(batch), batch_size)
                train_batch, train_masks = _encode_batch(
                    batch, size, profiles, movie_index, n_movies)
                rbm.cdk_fun(train_batch,
                            train_masks,
                            k=k,
                            w_lr=current_l_w,
                            v_lr=current_l_v,
                            h_lr=current_l_h,
                            decay=decay,
                            momentum=momentum)
            sys.stdout.write('.')
            sys.stdout.flush()

        ratings = []
        predictions = []

        for batch in chunker(test.keys(), batch_size):
            size = min(len(batch), batch_size)
            test_batch, _ = _encode_batch(
                batch, size, profiles, movie_index, n_movies)
            user_preds = revert_expected_value(rbm.predict(test_batch))

            for pos, profile_id in enumerate(batch):
                current_profile = user_preds[pos]
                for movie, rating in test[profile_id]:
                    # Skip just the rating that cannot be scored instead of
                    # silently dropping the rest of the user's test ratings.
                    col = movie_index.get(movie)
                    if col is None:
                        continue  # movie unknown to the model
                    try:
                        actual = float(rating)
                    except (TypeError, ValueError):
                        continue  # malformed rating
                    ratings.append(actual)
                    predictions.append(current_profile[col])

        distances = np.array(ratings) - np.array(predictions)

        # Plain floats keep the results JSON-serialisable.
        mae = float(np.abs(distances).mean())
        rmse = sqrt((distances ** 2).mean())

        iteration_result = {
            'iteration': j,
            'k': k,
            'momentum': momentum,
            'mae': mae,
            'rmse': rmse,
            'lrate': current_l_w
        }

        config_result['results'].append(iteration_result)

        print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse))

        # Rewritten after every epoch so partial results survive a crash.
        with open('{}_{}.json'.format(config_name, name), 'wt') as res_output:
            res_output.write(json.dumps(config_result, indent=4))


In [8]:
import json
import sys

from collections import defaultdict
from math import sqrt

import numpy as np
#import theano.tensor as T

#from rbm import CFRBM
from experiments import read_experiment
from utils import chunker, revert_expected_value, expand, iteration_str
from dataset import load_dataset



In [9]:
# Load the experiment definitions from "ubased.json" using the project's
# read_experiment helper (imported from the experiments module).
experiments = read_experiment("ubased.json")

In [10]:
# Only the first experiment in the file is used.
experiment=experiments[0]

In [12]:
# Unpack the settings of the selected experiment.
name, train_path, test_path, sep, configs = (
    experiment['name'],
    experiment['train_path'],
    experiment['test_path'],
    experiment['sep'],
    experiment['configs'],
)

# Load the user/movie id collections and the held-out test ratings.
all_users, all_movies, tests = load_dataset(
    train_path,
    test_path,
    sep,
    user_based=True,
)

In [13]:
# Only the first hyper-parameter configuration of the experiment is used.
config=configs[0]

In [14]:
# Train and evaluate.  Arguments are positional and follow run's parameter
# order (name, dataset, config, all_movies, all_users, test, initial_v, sep);
# None is passed for initial_v.
run(name, train_path, config, all_movies,all_users,tests,None, sep)

Users and ratings loaded
...............................................................................................
End iter 0 - k/lr: 1/0.0005 momentum: 0.5 - MAE/RMSE: 1.0363462217069113/1.2767873809020238
...............................................................................................
End iter 1 - k/lr: 1/0.0005 momentum: 0.5 - MAE/RMSE: 0.9117306044260725/1.1677817738392675
...............................................................................................

  import sys



End iter 2 - k/lr: 1/0.0005 momentum: 0.5 - MAE/RMSE: 0.8856177849473207/1.1441014334109338
...............................................................................................
End iter 3 - k/lr: 1/0.0005 momentum: 0.5 - MAE/RMSE: 0.8462469123355346/1.1084964727527817
...............................................................................................
End iter 4 - k/lr: 1/0.0005 momentum: 0.5 - MAE/RMSE: 0.8345011846549377/1.0991571284870039
...............................................................................................
End iter 5 - k/lr: 1/0.0005 momentum: 0.5 - MAE/RMSE: 0.8186217674043454/1.0851019244381228
...............................................................................................
End iter 6 - k/lr: 1/0.0005 momentum: 0.5 - MAE/RMSE: 0.8132278066239855/1.0808427273009935
...............................................................................................
End iter 7 - k/lr: 1/0.0005 momentum: 0.5 - MAE/RMSE: 0.808