From a4aabfe1e2a524365c6d14dbf62d99f81819c130 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 10 Nov 2019 20:29:41 +0100 Subject: [PATCH 01/75] Initial commit with functioning notebook, missing original files --- notebooks/02_model/rippleNet_deep_dive.ipynb | 292 ++++++++++++++++++ .../recommender/ripplenet/data_loader.py | 130 ++++++++ reco_utils/recommender/ripplenet/main.py | 45 +++ reco_utils/recommender/ripplenet/model.py | 167 ++++++++++ .../recommender/ripplenet/preprocess.py | 102 ++++++ reco_utils/recommender/ripplenet/train.py | 58 ++++ 6 files changed, 794 insertions(+) create mode 100644 notebooks/02_model/rippleNet_deep_dive.ipynb create mode 100644 reco_utils/recommender/ripplenet/data_loader.py create mode 100644 reco_utils/recommender/ripplenet/main.py create mode 100644 reco_utils/recommender/ripplenet/model.py create mode 100644 reco_utils/recommender/ripplenet/preprocess.py create mode 100644 reco_utils/recommender/ripplenet/train.py diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb new file mode 100644 index 0000000000..52ea01e57c --- /dev/null +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -0,0 +1,292 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RippleNet" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n", + "[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n", + "Pandas version: 0.25.1\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.append(\"../../\")\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import argparse \n", + "\n", + "from reco_utils.recommender.ripplenet.preprocess import (read_item_index_to_entity_id_file, \n", + " convert_rating, \n", + " convert_kg)\n", + "\n", + "from reco_utils.recommender.ripplenet.data_loader import (load_rating, \n", + " load_kg, \n", + " get_ripple_set)\n", + "\n", + "from reco_utils.recommender.ripplenet.train import (train)\n", + "\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"Pandas version: {}\".format(pd.__version__))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "MOVIELENS_DATASET = \"100k\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read original data and transform entity ids to numerical" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "kg_original = pd.read_csv(\"../../reco_utils/recommender/ripplenet/data/movielens_100k_wikidata.csv\")\n", + "ratings_original = pd.read_csv(\"../../reco_utils/recommender/ripplenet/data/ratings_movielens_100k.csv\")\n", + "path_out = \"../../reco_utils/recommender/ripplenet/data/movielens100k/\"\n", + "if(os.path.exists(path_out)==False):\n", + " os.mkdir(path_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def transform_id(df, entities_id, col_transform, col_name = \"unified_id\"):\n", + " df = df.merge(entities_id, left_on = col_transform, right_on = \"entity\")\n", + " df = df.rename(columns = {\"unified_id\": col_name})\n", + " return df.drop(columns = [col_transform, \"entity\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + 
"metadata": {}, + "outputs": [], + "source": [ + "var_id = \"movielens_id\"\n", + "entities_id = pd.DataFrame({\"entity\":list(set(kg_original.original_entity)) + list(set(kg_original.linked_entities))}).reset_index()\n", + "entities_id = entities_id.rename(columns = {\"index\": \"unified_id\"})\n", + "\n", + "item_to_entity = kg_original[[var_id, \"original_entity\"]].drop_duplicates().reset_index().drop(columns = \"index\")\n", + "item_to_entity = transform_id(item_to_entity, entities_id, \"original_entity\")\n", + "item_to_entity.to_csv(path_out+\"item_to_entity.csv\", index = False, header = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "kg = kg_original[[\"original_entity\", \"linked_entities\"]].drop_duplicates()\n", + "kg = transform_id(kg, entities_id, \"original_entity\", \"original_entity_id\")\n", + "kg = transform_id(kg, entities_id, \"linked_entities\", \"linked_entities_id\")\n", + "kg[\"relation\"] = 1\n", + "kg[[\"original_entity_id\",\"relation\", \"linked_entities_id\"]].to_csv(path_out+\"kg_wikidata.csv\", index = False, header = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "vars_movielens = [\"UserId\", \"ItemId\", \"Rating\", \"Timestamp\"]\n", + "ratings_original[vars_movielens].sort_values(vars_movielens[1]).to_csv(path_out+\"ratings.csv\", index = False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocess module from RippleNet" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reading item index to entity id file: ../../reco_utils/recommender/ripplenet/data/movielens100k/item_to_entity.csv ...\n" + ] + } + ], + "source": [ + "file = path_out+\"item_to_entity.csv\"\n", + "item_index_old2new, entity_id2index = read_item_index_to_entity_id_file(file, sep = \",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reading rating file ...\n", + "converting rating file ...\n", + "number of users: 943\n", + "number of items: 1677\n" + ] + } + ], + "source": [ + "file = path_out+\"ratings.csv\"\n", + "convert_rating(file, sep = \",\", item_index_old2new = item_index_old2new, threshold = 1, path_out=path_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "converting kg file ...\n", + "number of entities (containing items): 22994\n", + "number of relations: 1\n" + ] + } + ], + "source": [ + "file = path_out + \"kg_wikidata.csv\"\n", + "convert_kg(file, sep =\",\", entity_id2index = entity_id2index, path_out=path_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "parser = argparse.ArgumentParser() \n", + "parser.add_argument('--dataset', type=str, default='movielens100k', help='which dataset to use') \n", + "parser.add_argument('--dim', type=int, default=16, help='dimension of entity and relation embeddings') \n", + "parser.add_argument('--n_hop', type=int, default=2, help='maximum hops') \n", + "parser.add_argument('--kge_weight', type=float, default=0.01, help='weight of the KGE term') 
\n", + "parser.add_argument('--l2_weight', type=float, default=1e-7, help='weight of the l2 regularization term') \n", + "parser.add_argument('--lr', type=float, default=0.02, help='learning rate') \n", + "parser.add_argument('--batch_size', type=int, default=1024, help='batch size') \n", + "parser.add_argument('--n_epoch', type=int, default=10, help='the number of epochs') \n", + "parser.add_argument('--n_memory', type=int, default=32, help='size of ripple set for each hop') \n", + "parser.add_argument('--item_update_mode', type=str, default='plus_transform', help='how to update item at the end of each hop') \n", + "parser.add_argument('--using_all_hops', type=bool, default=True, help='whether using outputs of all hops or just the last hop when making prediction') \n", + "args = parser.parse_args(\"--dataset movielens100k\".split())" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reading rating file ...\n", + "splitting dataset ...\n", + "reading KG file ...\n", + "constructing knowledge graph ...\n", + "constructing ripple set ...\n" + ] + } + ], + "source": [ + "train_data, eval_data, test_data, user_history_dict = load_rating(path_out, args)\n", + "n_entity, n_relation, kg = load_kg(path_out, args)\n", + "ripple_set = get_ripple_set(args, kg, user_history_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "data_info = train_data, eval_data, test_data, n_entity, n_relation, ripple_set\n", + "show_loss = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train(args, data_info, show_loss)" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python (reco)", + "language": "python", + "name": "reco_base" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/reco_utils/recommender/ripplenet/data_loader.py b/reco_utils/recommender/ripplenet/data_loader.py new file mode 100644 index 0000000000..8e36334feb --- /dev/null +++ b/reco_utils/recommender/ripplenet/data_loader.py @@ -0,0 +1,130 @@ +import collections +import os +import numpy as np + + +def load_data(path_data, args): + train_data, eval_data, test_data, user_history_dict = load_rating(path_data, args) + n_entity, n_relation, kg = load_kg(path_data, args) + ripple_set = get_ripple_set(args, kg, user_history_dict) + return train_data, eval_data, test_data, n_entity, n_relation, ripple_set + + +def load_rating(path_data, args): + print('reading rating file ...') + + # reading rating file + rating_file = path_data + '/ratings_final' + if os.path.exists(rating_file + '.npy'): + rating_np = np.load(rating_file + '.npy') + else: + rating_np = np.loadtxt(rating_file + '.txt', dtype=np.int32) + np.save(rating_file + '.npy', rating_np) + + # n_user = len(set(rating_np[:, 0])) + # n_item = len(set(rating_np[:, 1])) + return dataset_split(rating_np) + + +def dataset_split(rating_np): + print('splitting dataset ...') + + # train:eval:test = 6:2:2 + eval_ratio = 0.2 + test_ratio = 0.2 + n_ratings = rating_np.shape[0] + + eval_indices = np.random.choice(n_ratings, size=int(n_ratings * eval_ratio), 
replace=False) + left = set(range(n_ratings)) - set(eval_indices) + test_indices = np.random.choice(list(left), size=int(n_ratings * test_ratio), replace=False) + train_indices = list(left - set(test_indices)) + # print(len(train_indices), len(eval_indices), len(test_indices)) + + # traverse training data, only keeping the users with positive ratings + user_history_dict = dict() + for i in train_indices: + user = rating_np[i][0] + item = rating_np[i][1] + rating = rating_np[i][2] + if rating == 1: + if user not in user_history_dict: + user_history_dict[user] = [] + user_history_dict[user].append(item) + + train_indices = [i for i in train_indices if rating_np[i][0] in user_history_dict] + eval_indices = [i for i in eval_indices if rating_np[i][0] in user_history_dict] + test_indices = [i for i in test_indices if rating_np[i][0] in user_history_dict] + # print(len(train_indices), len(eval_indices), len(test_indices)) + + train_data = rating_np[train_indices] + eval_data = rating_np[eval_indices] + test_data = rating_np[test_indices] + + return train_data, eval_data, test_data, user_history_dict + + +def load_kg(path_data, args): + print('reading KG file ...') + + # reading kg file + kg_file = path_data + '/kg_final' + if os.path.exists(kg_file + '.npy'): + kg_np = np.load(kg_file + '.npy') + else: + kg_np = np.loadtxt(kg_file + '.txt', dtype=np.int32) + np.save(kg_file + '.npy', kg_np) + + n_entity = len(set(kg_np[:, 0]) | set(kg_np[:, 2])) + n_relation = len(set(kg_np[:, 1])) + + kg = construct_kg(kg_np) + + return n_entity, n_relation, kg + + +def construct_kg(kg_np): + print('constructing knowledge graph ...') + kg = collections.defaultdict(list) + for head, relation, tail in kg_np: + kg[head].append((tail, relation)) + return kg + + +def get_ripple_set(args, kg, user_history_dict): + print('constructing ripple set ...') + + # user -> [(hop_0_heads, hop_0_relations, hop_0_tails), (hop_1_heads, hop_1_relations, hop_1_tails), ...] 
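+    # hop 0 seeds the heads with the user's positively rated items; every
+    # later hop uses the previous hop's tail entities as heads and expands
+    # them through all matching (tail, relation) pairs in the KG. Each hop is
+    # then sampled down (or up, with replacement, when fewer are available)
+    # to exactly n_memory triples so the per-hop memories stack into
+    # fixed-size batch tensors.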
+ ripple_set = collections.defaultdict(list) + + for user in user_history_dict: + for h in range(args.n_hop): + memories_h = [] + memories_r = [] + memories_t = [] + + if h == 0: + tails_of_last_hop = user_history_dict[user] + else: + tails_of_last_hop = ripple_set[user][-1][2] + + for entity in tails_of_last_hop: + for tail_and_relation in kg[entity]: + memories_h.append(entity) + memories_r.append(tail_and_relation[1]) + memories_t.append(tail_and_relation[0]) + + # if the current ripple set of the given user is empty, we simply copy the ripple set of the last hop here + # this won't happen for h = 0, because only the items that appear in the KG have been selected + # this only happens on 154 users in Book-Crossing dataset (since both BX dataset and the KG are sparse) + if len(memories_h) == 0: + ripple_set[user].append(ripple_set[user][-1]) + else: + # sample a fixed-size 1-hop memory for each user + replace = len(memories_h) < args.n_memory + indices = np.random.choice(len(memories_h), size=args.n_memory, replace=replace) + memories_h = [memories_h[i] for i in indices] + memories_r = [memories_r[i] for i in indices] + memories_t = [memories_t[i] for i in indices] + ripple_set[user].append((memories_h, memories_r, memories_t)) + + return ripple_set diff --git a/reco_utils/recommender/ripplenet/main.py b/reco_utils/recommender/ripplenet/main.py new file mode 100644 index 0000000000..deea001efd --- /dev/null +++ b/reco_utils/recommender/ripplenet/main.py @@ -0,0 +1,45 @@ +import argparse +import numpy as np +from data_loader import load_data +from train import train + +np.random.seed(555) + +parser = argparse.ArgumentParser() +parser.add_argument('--dataset', type=str, default='movie', help='which dataset to use') +parser.add_argument('--dim', type=int, default=16, help='dimension of entity and relation embeddings') +parser.add_argument('--n_hop', type=int, default=2, help='maximum hops') +parser.add_argument('--kge_weight', type=float, default=0.01, help='weight of the KGE term') +parser.add_argument('--l2_weight', type=float, default=1e-7, help='weight of the l2 regularization term') +parser.add_argument('--lr', type=float, default=0.02, help='learning rate') +parser.add_argument('--batch_size', type=int, default=1024, help='batch size') +parser.add_argument('--n_epoch', type=int, default=10, help='the number of epochs') +parser.add_argument('--n_memory', type=int, default=32, help='size of ripple set for each hop') +parser.add_argument('--item_update_mode', type=str, default='plus_transform', + help='how to update item at the end of each hop') +parser.add_argument('--using_all_hops', type=bool, default=True, + help='whether using outputs of all hops or just the last hop when making prediction') + +''' +# default settings for Book-Crossing +parser = argparse.ArgumentParser() +parser.add_argument('--dataset', type=str, default='book', help='which dataset to use') +parser.add_argument('--dim', type=int, default=4, help='dimension of entity and relation embeddings') +parser.add_argument('--n_hop', type=int, default=2, help='maximum hops') +parser.add_argument('--kge_weight', type=float, default=1e-2, help='weight of the KGE term') +parser.add_argument('--l2_weight', type=float, default=1e-5, help='weight of the l2 regularization term') +parser.add_argument('--lr', type=float, default=1e-3, help='learning rate') +parser.add_argument('--batch_size', type=int, default=1024, help='batch size') +parser.add_argument('--n_epoch', type=int, default=10, help='the number of epochs') 
+parser.add_argument('--n_memory', type=int, default=32, help='size of ripple set for each hop') +parser.add_argument('--item_update_mode', type=str, default='plus_transform', + help='how to update item at the end of each hop') +parser.add_argument('--using_all_hops', type=bool, default=True, + help='whether using outputs of all hops or just the last hop when making prediction') +''' + +args = parser.parse_args() + +show_loss = False +data_info = load_data(args) +train(args, data_info, show_loss) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py new file mode 100644 index 0000000000..1fb806833e --- /dev/null +++ b/reco_utils/recommender/ripplenet/model.py @@ -0,0 +1,167 @@ +import tensorflow as tf +import numpy as np +from sklearn.metrics import roc_auc_score + + +class RippleNet(object): + def __init__(self, args, n_entity, n_relation): + self._parse_args(args, n_entity, n_relation) + self._build_inputs() + self._build_embeddings() + self._build_model() + self._build_loss() + self._build_train() + + def _parse_args(self, args, n_entity, n_relation): + self.n_entity = n_entity + self.n_relation = n_relation + self.dim = args.dim + self.n_hop = args.n_hop + self.kge_weight = args.kge_weight + self.l2_weight = args.l2_weight + self.lr = args.lr + self.n_memory = args.n_memory + self.item_update_mode = args.item_update_mode + self.using_all_hops = args.using_all_hops + + def _build_inputs(self): + self.items = tf.placeholder(dtype=tf.int32, shape=[None], name="items") + self.labels = tf.placeholder(dtype=tf.float64, shape=[None], name="labels") + self.memories_h = [] + self.memories_r = [] + self.memories_t = [] + + for hop in range(self.n_hop): + self.memories_h.append( + tf.placeholder(dtype=tf.int32, shape=[None, self.n_memory], name="memories_h_" + str(hop))) + self.memories_r.append( + tf.placeholder(dtype=tf.int32, shape=[None, self.n_memory], name="memories_r_" + str(hop))) + self.memories_t.append( + tf.placeholder(dtype=tf.int32, shape=[None, self.n_memory], name="memories_t_" + str(hop))) + + def _build_embeddings(self): + self.entity_emb_matrix = tf.get_variable(name="entity_emb_matrix", dtype=tf.float64, + shape=[self.n_entity, self.dim], + initializer=tf.contrib.layers.xavier_initializer()) + self.relation_emb_matrix = tf.get_variable(name="relation_emb_matrix", dtype=tf.float64, + shape=[self.n_relation, self.dim, self.dim], + initializer=tf.contrib.layers.xavier_initializer()) + + def _build_model(self): + # transformation matrix for updating item embeddings at the end of each hop + self.transform_matrix = tf.get_variable(name="transform_matrix", shape=[self.dim, self.dim], dtype=tf.float64, + initializer=tf.contrib.layers.xavier_initializer()) + + # [batch size, dim] + self.item_embeddings = tf.nn.embedding_lookup(self.entity_emb_matrix, self.items) + + self.h_emb_list = [] + self.r_emb_list = [] + self.t_emb_list = [] + for i in range(self.n_hop): + # [batch size, n_memory, dim] + self.h_emb_list.append(tf.nn.embedding_lookup(self.entity_emb_matrix, self.memories_h[i])) + + # [batch size, n_memory, dim, dim] + self.r_emb_list.append(tf.nn.embedding_lookup(self.relation_emb_matrix, self.memories_r[i])) + + # [batch size, n_memory, dim] + self.t_emb_list.append(tf.nn.embedding_lookup(self.entity_emb_matrix, self.memories_t[i])) + + o_list = self._key_addressing() + + self.scores = tf.squeeze(self.predict(self.item_embeddings, o_list)) + self.scores_normalized = tf.sigmoid(self.scores) + + def _key_addressing(self): + o_list = [] 
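+        # Per hop: score each memory triple by (R_i h_i) . v, where v is the
+        # current item embedding; softmax the scores over the n_memory slots
+        # and emit o, the attention-weighted sum of the tail embeddings. The
+        # item embedding is updated with o before the next hop.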
+ for hop in range(self.n_hop): + # [batch_size, n_memory, dim, 1] + h_expanded = tf.expand_dims(self.h_emb_list[hop], axis=3) + + # [batch_size, n_memory, dim] + Rh = tf.squeeze(tf.matmul(self.r_emb_list[hop], h_expanded), axis=3) + + # [batch_size, dim, 1] + v = tf.expand_dims(self.item_embeddings, axis=2) + + # [batch_size, n_memory] + probs = tf.squeeze(tf.matmul(Rh, v), axis=2) + + # [batch_size, n_memory] + probs_normalized = tf.nn.softmax(probs) + + # [batch_size, n_memory, 1] + probs_expanded = tf.expand_dims(probs_normalized, axis=2) + + # [batch_size, dim] + o = tf.reduce_sum(self.t_emb_list[hop] * probs_expanded, axis=1) + + self.item_embeddings = self.update_item_embedding(self.item_embeddings, o) + o_list.append(o) + return o_list + + def update_item_embedding(self, item_embeddings, o): + if self.item_update_mode == "replace": + item_embeddings = o + elif self.item_update_mode == "plus": + item_embeddings = item_embeddings + o + elif self.item_update_mode == "replace_transform": + item_embeddings = tf.matmul(o, self.transform_matrix) + elif self.item_update_mode == "plus_transform": + item_embeddings = tf.matmul(item_embeddings + o, self.transform_matrix) + else: + raise Exception("Unknown item updating mode: " + self.item_update_mode) + return item_embeddings + + def predict(self, item_embeddings, o_list): + y = o_list[-1] + if self.using_all_hops: + for i in range(self.n_hop - 1): + y += o_list[i] + + # [batch_size] + scores = tf.reduce_sum(item_embeddings * y, axis=1) + return scores + + def _build_loss(self): + self.base_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=self.scores)) + + self.kge_loss = 0 + for hop in range(self.n_hop): + h_expanded = tf.expand_dims(self.h_emb_list[hop], axis=2) + t_expanded = tf.expand_dims(self.t_emb_list[hop], axis=3) + hRt = tf.squeeze(tf.matmul(tf.matmul(h_expanded, self.r_emb_list[hop]), t_expanded)) + self.kge_loss += tf.reduce_mean(tf.sigmoid(hRt)) + self.kge_loss = -self.kge_weight * self.kge_loss + + self.l2_loss = 0 + for hop in range(self.n_hop): + self.l2_loss += tf.reduce_mean(tf.reduce_sum(self.h_emb_list[hop] * self.h_emb_list[hop])) + self.l2_loss += tf.reduce_mean(tf.reduce_sum(self.t_emb_list[hop] * self.t_emb_list[hop])) + self.l2_loss += tf.reduce_mean(tf.reduce_sum(self.r_emb_list[hop] * self.r_emb_list[hop])) + if self.item_update_mode == "replace nonlinear" or self.item_update_mode == "plus nonlinear": + self.l2_loss += tf.nn.l2_loss(self.transform_matrix) + self.l2_loss = self.l2_weight * self.l2_loss + + self.loss = self.base_loss + self.kge_loss + self.l2_loss + + def _build_train(self): + self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.loss) + ''' + optimizer = tf.train.AdamOptimizer(self.lr) + gradients, variables = zip(*optimizer.compute_gradients(self.loss)) + gradients = [None if gradient is None else tf.clip_by_norm(gradient, clip_norm=5) + for gradient in gradients] + self.optimizer = optimizer.apply_gradients(zip(gradients, variables)) + ''' + + def train(self, sess, feed_dict): + return sess.run([self.optimizer, self.loss], feed_dict) + + def eval(self, sess, feed_dict): + labels, scores = sess.run([self.labels, self.scores_normalized], feed_dict) + auc = roc_auc_score(y_true=labels, y_score=scores) + predictions = [1 if i >= 0.5 else 0 for i in scores] + acc = np.mean(np.equal(predictions, labels)) + return auc, acc diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py new file mode 100644 index 
0000000000..68b7c3b3cf --- /dev/null +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -0,0 +1,102 @@ +import argparse +import numpy as np + +def read_item_index_to_entity_id_file(file, sep): + # file = '../data/' + DATASET + '/item_index2entity_id_rehashed.txt' + item_index_old2new = dict() + entity_id2index = dict() + print('reading item index to entity id file: ' + file + ' ...') + i = 0 + for line in open(file, encoding='utf-8').readlines(): + line = line.strip().split(sep) + item_index = str(line[0]) + satori_id = str(line[1]) + item_index_old2new[item_index] = i + entity_id2index[satori_id] = i + i += 1 + return item_index_old2new, entity_id2index + + +def convert_rating(file, sep, item_index_old2new, threshold, path_out): + + print('reading rating file ...') + item_set = set(item_index_old2new.values()) + user_pos_ratings = dict() + user_neg_ratings = dict() + + for line in open(file, encoding='utf-8').readlines()[1:]: + array = line.strip().split(sep) + + item_index_old = array[1] + if item_index_old not in item_index_old2new: # the item is not in the final item set + continue + item_index = item_index_old2new[item_index_old] + + user_index_old = int(array[0]) + + rating = float(array[2]) + if rating >= threshold: + if user_index_old not in user_pos_ratings: + user_pos_ratings[user_index_old] = set() + user_pos_ratings[user_index_old].add(item_index) + else: + if user_index_old not in user_neg_ratings: + user_neg_ratings[user_index_old] = set() + user_neg_ratings[user_index_old].add(item_index) + + print('converting rating file ...') + writer = open(path_out + '/ratings_final.txt', 'w', encoding='utf-8') + user_cnt = 0 + user_index_old2new = dict() + for user_index_old, pos_item_set in user_pos_ratings.items(): + if user_index_old not in user_index_old2new: + user_index_old2new[user_index_old] = user_cnt + user_cnt += 1 + user_index = user_index_old2new[user_index_old] + + for item in pos_item_set: + writer.write('%d\t%d\t1\n' % (user_index, item)) + unwatched_set = item_set - pos_item_set + if user_index_old in user_neg_ratings: + unwatched_set -= user_neg_ratings[user_index_old] + for item in np.random.choice(list(unwatched_set), size=len(pos_item_set), replace=False): + writer.write('%d\t%d\t0\n' % (user_index, item)) + writer.close() + print('number of users: %d' % user_cnt) + print('number of items: %d' % len(item_set)) + + +def convert_kg(file, sep, entity_id2index, path_out): + print('converting kg file ...') + entity_cnt = len(entity_id2index) + relation_cnt = 0 + relation_id2index = dict() + + writer = open(path_out + '/kg_final.txt', 'w', encoding='utf-8') + + for line in open(file, encoding='utf-8'): + array = line.strip().split(sep) + head_old = array[0] + relation_old = array[1] + tail_old = array[2] + + if head_old not in entity_id2index: + entity_id2index[head_old] = entity_cnt + entity_cnt += 1 + head = entity_id2index[head_old] + + if tail_old not in entity_id2index: + entity_id2index[tail_old] = entity_cnt + entity_cnt += 1 + tail = entity_id2index[tail_old] + + if relation_old not in relation_id2index: + relation_id2index[relation_old] = relation_cnt + relation_cnt += 1 + relation = relation_id2index[relation_old] + + writer.write('%d\t%d\t%d\n' % (head, relation, tail)) + + writer.close() + print('number of entities (containing items): %d' % entity_cnt) + print('number of relations: %d' % relation_cnt) diff --git a/reco_utils/recommender/ripplenet/train.py b/reco_utils/recommender/ripplenet/train.py new file mode 100644 index 0000000000..2786767398 --- 
/dev/null +++ b/reco_utils/recommender/ripplenet/train.py @@ -0,0 +1,58 @@ +import tensorflow as tf +import numpy as np +from reco_utils.recommender.ripplenet.model import RippleNet + + +def train(args, data_info, show_loss): + train_data = data_info[0] + eval_data = data_info[1] + test_data = data_info[2] + n_entity = data_info[3] + n_relation = data_info[4] + ripple_set = data_info[5] + + model = RippleNet(args, n_entity, n_relation) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + for step in range(args.n_epoch): + # training + np.random.shuffle(train_data) + start = 0 + while start < train_data.shape[0]: + _, loss = model.train( + sess, get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size)) + start += args.batch_size + if show_loss: + print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) + + # evaluation + train_auc, train_acc = evaluation(sess, args, model, train_data, ripple_set, args.batch_size) + eval_auc, eval_acc = evaluation(sess, args, model, eval_data, ripple_set, args.batch_size) + test_auc, test_acc = evaluation(sess, args, model, test_data, ripple_set, args.batch_size) + + print('epoch %d train auc: %.4f acc: %.4f eval auc: %.4f acc: %.4f test auc: %.4f acc: %.4f' + % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc)) + + +def get_feed_dict(args, model, data, ripple_set, start, end): + feed_dict = dict() + feed_dict[model.items] = data[start:end, 1] + feed_dict[model.labels] = data[start:end, 2] + for i in range(args.n_hop): + feed_dict[model.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] + feed_dict[model.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] + feed_dict[model.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] + return feed_dict + + +def evaluation(sess, args, model, data, ripple_set, batch_size): + start = 0 + auc_list = [] + acc_list = [] + while start < data.shape[0]: + auc, acc = model.eval(sess, get_feed_dict(args, model, data, ripple_set, start, start + batch_size)) + auc_list.append(auc) + acc_list.append(acc) + start += batch_size + return float(np.mean(auc_list)), float(np.mean(acc_list)) From bca8e06fcfe4532b7aa2a5495b1c650a35b9545b Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 10 Nov 2019 22:16:30 +0100 Subject: [PATCH 02/75] modified scripts to avoid intermeditate files --- notebooks/02_model/rippleNet_deep_dive.ipynb | 47 ++++++-------- .../recommender/ripplenet/data_loader.py | 61 ++++++------------- .../recommender/ripplenet/preprocess.py | 57 +++++++++-------- 3 files changed, 65 insertions(+), 100 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 52ea01e57c..99e62ffe36 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -34,7 +34,8 @@ " convert_rating, \n", " convert_kg)\n", "\n", - "from reco_utils.recommender.ripplenet.data_loader import (load_rating, \n", + "from reco_utils.recommender.ripplenet.data_loader import (\n", + " dataset_split,\n", " load_kg, \n", " get_ripple_set)\n", "\n", @@ -101,8 +102,7 @@ "entities_id = entities_id.rename(columns = {\"index\": \"unified_id\"})\n", "\n", "item_to_entity = kg_original[[var_id, \"original_entity\"]].drop_duplicates().reset_index().drop(columns = \"index\")\n", - "item_to_entity = transform_id(item_to_entity, entities_id, \"original_entity\")\n", - 
"item_to_entity.to_csv(path_out+\"item_to_entity.csv\", index = False, header = False)" + "item_to_entity = transform_id(item_to_entity, entities_id, \"original_entity\")" ] }, { @@ -115,17 +115,17 @@ "kg = transform_id(kg, entities_id, \"original_entity\", \"original_entity_id\")\n", "kg = transform_id(kg, entities_id, \"linked_entities\", \"linked_entities_id\")\n", "kg[\"relation\"] = 1\n", - "kg[[\"original_entity_id\",\"relation\", \"linked_entities_id\"]].to_csv(path_out+\"kg_wikidata.csv\", index = False, header = False)" + "kg_wikidata = kg[[\"original_entity_id\",\"relation\", \"linked_entities_id\"]]" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "vars_movielens = [\"UserId\", \"ItemId\", \"Rating\", \"Timestamp\"]\n", - "ratings_original[vars_movielens].sort_values(vars_movielens[1]).to_csv(path_out+\"ratings.csv\", index = False)" + "ratings = ratings_original[vars_movielens].sort_values(vars_movielens[1])" ] }, { @@ -137,32 +137,22 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "reading item index to entity id file: ../../reco_utils/recommender/ripplenet/data/movielens100k/item_to_entity.csv ...\n" - ] - } - ], + "outputs": [], "source": [ - "file = path_out+\"item_to_entity.csv\"\n", - "item_index_old2new, entity_id2index = read_item_index_to_entity_id_file(file, sep = \",\")" + "item_index_old2new, entity_id2index = read_item_index_to_entity_id_file(item_to_entity)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "reading rating file ...\n", "converting rating file ...\n", "number of users: 943\n", "number of items: 1677\n" @@ -170,13 +160,12 @@ } ], "source": [ - "file = path_out+\"ratings.csv\"\n", - "convert_rating(file, sep = \",\", item_index_old2new = item_index_old2new, threshold = 1, path_out=path_out)" + "ratings_final = convert_rating(ratings, item_index_old2new = item_index_old2new, threshold = 1)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "metadata": { "scrolled": true }, @@ -192,8 +181,7 @@ } ], "source": [ - "file = path_out + \"kg_wikidata.csv\"\n", - "convert_kg(file, sep =\",\", entity_id2index = entity_id2index, path_out=path_out)" + "kg_final = convert_kg(kg_wikidata, entity_id2index = entity_id2index)" ] }, { @@ -205,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -226,14 +214,13 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "reading rating file ...\n", "splitting dataset ...\n", "reading KG file ...\n", "constructing knowledge graph ...\n", @@ -242,8 +229,8 @@ } ], "source": [ - "train_data, eval_data, test_data, user_history_dict = load_rating(path_out, args)\n", - "n_entity, n_relation, kg = load_kg(path_out, args)\n", + "train_data, eval_data, test_data, user_history_dict = dataset_split(ratings_final)\n", + "n_entity, n_relation, kg = load_kg(kg_final)\n", "ripple_set = get_ripple_set(args, kg, user_history_dict)" ] }, diff --git a/reco_utils/recommender/ripplenet/data_loader.py b/reco_utils/recommender/ripplenet/data_loader.py index 8e36334feb..2837c8688f 100644 --- 
a/reco_utils/recommender/ripplenet/data_loader.py +++ b/reco_utils/recommender/ripplenet/data_loader.py @@ -3,29 +3,12 @@ import numpy as np -def load_data(path_data, args): - train_data, eval_data, test_data, user_history_dict = load_rating(path_data, args) - n_entity, n_relation, kg = load_kg(path_data, args) +def load_data(ratings_final, kg_final, args): + train_data, eval_data, test_data, user_history_dict = dataset_split(ratings_final) + n_entity, n_relation, kg = load_kg(kg_final) ripple_set = get_ripple_set(args, kg, user_history_dict) return train_data, eval_data, test_data, n_entity, n_relation, ripple_set - -def load_rating(path_data, args): - print('reading rating file ...') - - # reading rating file - rating_file = path_data + '/ratings_final' - if os.path.exists(rating_file + '.npy'): - rating_np = np.load(rating_file + '.npy') - else: - rating_np = np.loadtxt(rating_file + '.txt', dtype=np.int32) - np.save(rating_file + '.npy', rating_np) - - # n_user = len(set(rating_np[:, 0])) - # n_item = len(set(rating_np[:, 1])) - return dataset_split(rating_np) - - def dataset_split(rating_np): print('splitting dataset ...') @@ -43,41 +26,33 @@ def dataset_split(rating_np): # traverse training data, only keeping the users with positive ratings user_history_dict = dict() for i in train_indices: - user = rating_np[i][0] - item = rating_np[i][1] - rating = rating_np[i][2] + user = rating_np.iloc[i][0] + item = rating_np.iloc[i][1] + rating = rating_np.iloc[i][2] if rating == 1: if user not in user_history_dict: user_history_dict[user] = [] user_history_dict[user].append(item) - train_indices = [i for i in train_indices if rating_np[i][0] in user_history_dict] - eval_indices = [i for i in eval_indices if rating_np[i][0] in user_history_dict] - test_indices = [i for i in test_indices if rating_np[i][0] in user_history_dict] + train_indices = [i for i in train_indices if rating_np.iloc[i][0] in user_history_dict] + eval_indices = [i for i in eval_indices if rating_np.iloc[i][0] in user_history_dict] + test_indices = [i for i in test_indices if rating_np.iloc[i][0] in user_history_dict] # print(len(train_indices), len(eval_indices), len(test_indices)) - train_data = rating_np[train_indices] - eval_data = rating_np[eval_indices] - test_data = rating_np[test_indices] + train_data = rating_np.iloc[train_indices] + eval_data = rating_np.iloc[eval_indices] + test_data = rating_np.iloc[test_indices] return train_data, eval_data, test_data, user_history_dict -def load_kg(path_data, args): +def load_kg(kg_final): print('reading KG file ...') - # reading kg file - kg_file = path_data + '/kg_final' - if os.path.exists(kg_file + '.npy'): - kg_np = np.load(kg_file + '.npy') - else: - kg_np = np.loadtxt(kg_file + '.txt', dtype=np.int32) - np.save(kg_file + '.npy', kg_np) - - n_entity = len(set(kg_np[:, 0]) | set(kg_np[:, 2])) - n_relation = len(set(kg_np[:, 1])) + n_entity = len(set(kg_final.iloc[:, 0]) | set(kg_final.iloc[:, 2])) + n_relation = len(set(kg_final.iloc[:, 1])) - kg = construct_kg(kg_np) + kg = construct_kg(kg_final) return n_entity, n_relation, kg @@ -85,8 +60,8 @@ def load_kg(path_data, args): def construct_kg(kg_np): print('constructing knowledge graph ...') kg = collections.defaultdict(list) - for head, relation, tail in kg_np: - kg[head].append((tail, relation)) + for index, row in kg_np.iterrows(): + kg[row["head"]].append((row["tail"], row["relation"])) return kg diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py index 
68b7c3b3cf..4ef53a8bff 100644 --- a/reco_utils/recommender/ripplenet/preprocess.py +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -1,40 +1,36 @@ import argparse import numpy as np +import pandas as pd -def read_item_index_to_entity_id_file(file, sep): +def read_item_index_to_entity_id_file(item_to_entity): # file = '../data/' + DATASET + '/item_index2entity_id_rehashed.txt' item_index_old2new = dict() entity_id2index = dict() - print('reading item index to entity id file: ' + file + ' ...') i = 0 - for line in open(file, encoding='utf-8').readlines(): - line = line.strip().split(sep) - item_index = str(line[0]) - satori_id = str(line[1]) + for index, row in item_to_entity.iterrows(): + item_index = str(row[0]) + satori_id = str(row[1]) item_index_old2new[item_index] = i entity_id2index[satori_id] = i i += 1 return item_index_old2new, entity_id2index -def convert_rating(file, sep, item_index_old2new, threshold, path_out): +def convert_rating(ratings, item_index_old2new, threshold): - print('reading rating file ...') item_set = set(item_index_old2new.values()) user_pos_ratings = dict() user_neg_ratings = dict() - for line in open(file, encoding='utf-8').readlines()[1:]: - array = line.strip().split(sep) - - item_index_old = array[1] + for index, row in ratings.iterrows(): + item_index_old = str(int(row[1])) if item_index_old not in item_index_old2new: # the item is not in the final item set continue item_index = item_index_old2new[item_index_old] - user_index_old = int(array[0]) + user_index_old = int(row[0]) - rating = float(array[2]) + rating = float(row[2]) if rating >= threshold: if user_index_old not in user_pos_ratings: user_pos_ratings[user_index_old] = set() @@ -45,7 +41,7 @@ def convert_rating(file, sep, item_index_old2new, threshold, path_out): user_neg_ratings[user_index_old].add(item_index) print('converting rating file ...') - writer = open(path_out + '/ratings_final.txt', 'w', encoding='utf-8') + writer = [] user_cnt = 0 user_index_old2new = dict() for user_index_old, pos_item_set in user_pos_ratings.items(): @@ -55,30 +51,34 @@ def convert_rating(file, sep, item_index_old2new, threshold, path_out): user_index = user_index_old2new[user_index_old] for item in pos_item_set: - writer.write('%d\t%d\t1\n' % (user_index, item)) + writer.append({"user_index": user_index, + "item": item, + "rating": 1}) unwatched_set = item_set - pos_item_set if user_index_old in user_neg_ratings: unwatched_set -= user_neg_ratings[user_index_old] for item in np.random.choice(list(unwatched_set), size=len(pos_item_set), replace=False): - writer.write('%d\t%d\t0\n' % (user_index, item)) - writer.close() + writer.append({"user_index": user_index, + "item": item, + "rating": 0}) + ratings_final = pd.DataFrame(writer) print('number of users: %d' % user_cnt) print('number of items: %d' % len(item_set)) + return ratings_final -def convert_kg(file, sep, entity_id2index, path_out): +def convert_kg(kg, entity_id2index): print('converting kg file ...') entity_cnt = len(entity_id2index) relation_cnt = 0 relation_id2index = dict() - writer = open(path_out + '/kg_final.txt', 'w', encoding='utf-8') + writer = [] - for line in open(file, encoding='utf-8'): - array = line.strip().split(sep) - head_old = array[0] - relation_old = array[1] - tail_old = array[2] + for index, row in kg.iterrows(): + head_old = str(int(row[0])) + relation_old = row[1] + tail_old = str(int(row[2])) if head_old not in entity_id2index: entity_id2index[head_old] = entity_cnt @@ -95,8 +95,11 @@ def convert_kg(file, sep, entity_id2index, 
path_out): relation_cnt += 1 relation = relation_id2index[relation_old] - writer.write('%d\t%d\t%d\n' % (head, relation, tail)) + writer.append({"head": head, + "relation": relation, + "tail": tail}) - writer.close() + kg_final = pd.DataFrame(writer) print('number of entities (containing items): %d' % entity_cnt) print('number of relations: %d' % relation_cnt) + return kg_final From b025e6816960dbdf1f50335a99ac1a84f3c7accd Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 10 Nov 2019 22:19:20 +0100 Subject: [PATCH 03/75] added reference to RippleNet --- reco_utils/recommender/ripplenet/data_loader.py | 3 +++ reco_utils/recommender/ripplenet/main.py | 3 +++ reco_utils/recommender/ripplenet/model.py | 4 +++- reco_utils/recommender/ripplenet/preprocess.py | 3 +++ reco_utils/recommender/ripplenet/train.py | 4 +++- 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/reco_utils/recommender/ripplenet/data_loader.py b/reco_utils/recommender/ripplenet/data_loader.py index 2837c8688f..be4ef71506 100644 --- a/reco_utils/recommender/ripplenet/data_loader.py +++ b/reco_utils/recommender/ripplenet/data_loader.py @@ -1,3 +1,6 @@ +# This code is modified from RippleNet +# Online code of RippleNet: https://github.com/hwwang55/RippleNet + import collections import os import numpy as np diff --git a/reco_utils/recommender/ripplenet/main.py b/reco_utils/recommender/ripplenet/main.py index deea001efd..26f3e3708f 100644 --- a/reco_utils/recommender/ripplenet/main.py +++ b/reco_utils/recommender/ripplenet/main.py @@ -1,3 +1,6 @@ +# This code is modified from RippleNet +# Online code of RippleNet: https://github.com/hwwang55/RippleNet + import argparse import numpy as np from data_loader import load_data diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 1fb806833e..cf69782b93 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -1,8 +1,10 @@ +# This code is modified from RippleNet +# Online code of RippleNet: https://github.com/hwwang55/RippleNet + import tensorflow as tf import numpy as np from sklearn.metrics import roc_auc_score - class RippleNet(object): def __init__(self, args, n_entity, n_relation): self._parse_args(args, n_entity, n_relation) diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py index 4ef53a8bff..328f17b4ef 100644 --- a/reco_utils/recommender/ripplenet/preprocess.py +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -1,3 +1,6 @@ +# This code is modified from RippleNet +# Online code of RippleNet: https://github.com/hwwang55/RippleNet + import argparse import numpy as np import pandas as pd diff --git a/reco_utils/recommender/ripplenet/train.py b/reco_utils/recommender/ripplenet/train.py index 2786767398..b546a7e522 100644 --- a/reco_utils/recommender/ripplenet/train.py +++ b/reco_utils/recommender/ripplenet/train.py @@ -1,8 +1,10 @@ +# This code is modified from RippleNet +# Online code of RippleNet: https://github.com/hwwang55/RippleNet + import tensorflow as tf import numpy as np from reco_utils.recommender.ripplenet.model import RippleNet - def train(args, data_info, show_loss): train_data = data_info[0] eval_data = data_info[1] From 27421f80281b2ef9cc805a567aab4ddbe977143f Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 30 Nov 2019 13:41:00 +0100 Subject: [PATCH 04/75] read directly from online source files --- notebooks/02_model/rippleNet_deep_dive.ipynb | 49 ++++++++++++-------- 1 file changed, 
29 insertions(+), 20 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 99e62ffe36..b06192e1f6 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -29,6 +29,7 @@ "import numpy as np\n", "import os\n", "import argparse \n", + "from reco_utils.dataset import movielens\n", "\n", "from reco_utils.recommender.ripplenet.preprocess import (read_item_index_to_entity_id_file, \n", " convert_rating, \n", @@ -48,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": { "tags": [ "parameters" @@ -56,7 +57,8 @@ }, "outputs": [], "source": [ - "MOVIELENS_DATASET = \"100k\"" + "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", + "MOVIELENS_DATA_SIZE = '100k'" ] }, { @@ -68,20 +70,29 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.19kKB/s]\n" + ] + } + ], "source": [ - "kg_original = pd.read_csv(\"../../reco_utils/recommender/ripplenet/data/movielens_100k_wikidata.csv\")\n", - "ratings_original = pd.read_csv(\"../../reco_utils/recommender/ripplenet/data/ratings_movielens_100k.csv\")\n", - "path_out = \"../../reco_utils/recommender/ripplenet/data/movielens100k/\"\n", - "if(os.path.exists(path_out)==False):\n", - " os.mkdir(path_out)" + "kg_original = pd.read_csv(\"https://recostorage.blob.core.windows.net/movielens/movielens_{}_wikidata.csv\".format(MOVIELENS_DATA_SIZE))\n", + "ratings_original = movielens.load_pandas_df(MOVIELENS_DATA_SIZE,\n", + " ('UserId', 'ItemId', 'Rating', 'Timestamp'),\n", + " title_col='Title',\n", + " genres_col='Genres',\n", + " year_col='Year')" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -93,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -107,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -120,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -137,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -146,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -165,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "scrolled": true }, @@ -174,9 +185,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "converting kg file ...\n", - "number of entities (containing items): 22994\n", - "number of relations: 1\n" + "converting kg file ...\n" ] } ], From 6629614db6cfd0a21a5963ddfba666f553906e53 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 30 Nov 2019 14:21:53 +0100 Subject: [PATCH 05/75] train method expects numpy arrays --- notebooks/02_model/rippleNet_deep_dive.ipynb | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index b06192e1f6..77ae208b75 
100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -176,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "scrolled": true }, @@ -185,7 +185,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "converting kg file ...\n" + "converting kg file ...\n", + "number of entities (containing items): 22994\n", + "number of relations: 1\n" ] } ], @@ -202,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -223,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -249,7 +251,7 @@ "metadata": {}, "outputs": [], "source": [ - "data_info = train_data, eval_data, test_data, n_entity, n_relation, ripple_set\n", + "data_info = train_data.to_numpy(), eval_data.to_numpy(), test_data.to_numpy(), n_entity, n_relation, ripple_set\n", "show_loss = False" ] }, From 73c3641212801a3dce434a1cd5260038ee5ffb68 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 30 Nov 2019 14:34:24 +0100 Subject: [PATCH 06/75] updated blob name to find all files --- notebooks/02_model/rippleNet_deep_dive.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 77ae208b75..950bc211f0 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -82,7 +82,7 @@ } ], "source": [ - "kg_original = pd.read_csv(\"https://recostorage.blob.core.windows.net/movielens/movielens_{}_wikidata.csv\".format(MOVIELENS_DATA_SIZE))\n", + "kg_original = pd.read_csv(\"https://recodatasets.blob.core.windows.net/wikidata/movielens_{}_wikidata.csv\".format(MOVIELENS_DATA_SIZE))\n", "ratings_original = movielens.load_pandas_df(MOVIELENS_DATA_SIZE,\n", " ('UserId', 'ItemId', 'Rating', 'Timestamp'),\n", " title_col='Title',\n", From e7baa3041246ee676f2116752ddc21963f1b18d1 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 30 Nov 2019 14:50:55 +0100 Subject: [PATCH 07/75] updated threshold value --- notebooks/02_model/rippleNet_deep_dive.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 950bc211f0..d3d4eb4252 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -87,7 +87,8 @@ " ('UserId', 'ItemId', 'Rating', 'Timestamp'),\n", " title_col='Title',\n", " genres_col='Genres',\n", - " year_col='Year')" + " year_col='Year')\n", + "rating_threshold = 4" ] }, { @@ -171,7 +172,7 @@ } ], "source": [ - "ratings_final = convert_rating(ratings, item_index_old2new = item_index_old2new, threshold = 1)" + "ratings_final = convert_rating(ratings, item_index_old2new = item_index_old2new, threshold = rating_threshold)" ] }, { From 0f175c79043e9caf6452e469d2ad198d2413a329 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Tue, 3 Dec 2019 17:04:31 +0100 Subject: [PATCH 08/75] new function to retrieve scores --- reco_utils/recommender/ripplenet/model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index cf69782b93..0caa569324 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -167,3 +167,7 @@ def eval(self, sess, 
feed_dict): predictions = [1 if i >= 0.5 else 0 for i in scores] acc = np.mean(np.equal(predictions, labels)) return auc, acc + + def return_scores(self, sess, feed_dict): + labels, scores = sess.run([self.labels, self.scores_normalized], feed_dict) + return labels, scores From e4856c4be87e701d5278c4077e683f2808a228b7 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Tue, 3 Dec 2019 17:06:36 +0100 Subject: [PATCH 09/75] new functions to train and predict --- reco_utils/recommender/ripplenet/train.py | 30 +++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/reco_utils/recommender/ripplenet/train.py b/reco_utils/recommender/ripplenet/train.py index b546a7e522..5d6dae49b6 100644 --- a/reco_utils/recommender/ripplenet/train.py +++ b/reco_utils/recommender/ripplenet/train.py @@ -36,6 +36,25 @@ def train(args, data_info, show_loss): print('epoch %d train auc: %.4f acc: %.4f eval auc: %.4f acc: %.4f test auc: %.4f acc: %.4f' % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc)) +def fit(args, model, train_data, ripple_set, show_loss): + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + for step in range(args.n_epoch): + # training + np.random.shuffle(train_data) + start = 0 + while start < train_data.shape[0]: + _, loss = model.train( + sess, get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size)) + start += args.batch_size + if show_loss: + print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) + + train_auc, train_acc = evaluation(sess, args, model, train_data, ripple_set, args.batch_size) + + print('epoch %d train auc: %.4f acc: %.4f' + % (step, train_auc, train_acc)) + return model, sess def get_feed_dict(args, model, data, ripple_set, start, end): feed_dict = dict() @@ -58,3 +77,14 @@ def evaluation(sess, args, model, data, ripple_set, batch_size): acc_list.append(acc) start += batch_size return float(np.mean(auc_list)), float(np.mean(acc_list)) + +def predict(sess, args, model, data, ripple_set, batch_size): + start = 0 + labels_list = [] + scores_list = [] + while start < data.shape[0]: + labels, scores = model.return_scores(sess, get_feed_dict(args, model, data, ripple_set, start, start + batch_size)) + labels_list.append(labels) + scores_list.append(scores) + predictions_list = [1 if i >= 0.5 else 0 for i in scores_list] + return labels_list, scores_list, predictions_list \ No newline at end of file From 8bf6917ebd365c0e0a75794b5ef738a3ceee477e Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Tue, 3 Dec 2019 17:26:31 +0100 Subject: [PATCH 10/75] externalise tf session --- reco_utils/recommender/ripplenet/train.py | 33 +++++++++++------------ 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/reco_utils/recommender/ripplenet/train.py b/reco_utils/recommender/ripplenet/train.py index 5d6dae49b6..51c2bad8ba 100644 --- a/reco_utils/recommender/ripplenet/train.py +++ b/reco_utils/recommender/ripplenet/train.py @@ -36,25 +36,24 @@ def train(args, data_info, show_loss): print('epoch %d train auc: %.4f acc: %.4f eval auc: %.4f acc: %.4f test auc: %.4f acc: %.4f' % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc)) -def fit(args, model, train_data, ripple_set, show_loss): - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - for step in range(args.n_epoch): - # training - np.random.shuffle(train_data) - start = 0 - while start < train_data.shape[0]: - _, loss = model.train( - sess, get_feed_dict(args, model, train_data, ripple_set, start, start + 
args.batch_size)) - start += args.batch_size - if show_loss: - print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) +def fit(sess, args, model, train_data, ripple_set, show_loss): + sess.run(tf.global_variables_initializer()) + for step in range(args.n_epoch): + # training + np.random.shuffle(train_data) + start = 0 + while start < train_data.shape[0]: + _, loss = model.train( + sess, get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size)) + start += args.batch_size + if show_loss: + print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) - train_auc, train_acc = evaluation(sess, args, model, train_data, ripple_set, args.batch_size) + train_auc, train_acc = evaluation(sess, args, model, train_data, ripple_set, args.batch_size) - print('epoch %d train auc: %.4f acc: %.4f' - % (step, train_auc, train_acc)) - return model, sess + print('epoch %d train auc: %.4f acc: %.4f' + % (step, train_auc, train_acc)) + return model def get_feed_dict(args, model, data, ripple_set, start, end): feed_dict = dict() From 50b164614094a5547bb1f8f123b319e784ea17d3 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Tue, 3 Dec 2019 17:39:18 +0100 Subject: [PATCH 11/75] updated batch_size --- reco_utils/recommender/ripplenet/train.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/reco_utils/recommender/ripplenet/train.py b/reco_utils/recommender/ripplenet/train.py index 51c2bad8ba..7d0eb7cfdc 100644 --- a/reco_utils/recommender/ripplenet/train.py +++ b/reco_utils/recommender/ripplenet/train.py @@ -77,7 +77,8 @@ def evaluation(sess, args, model, data, ripple_set, batch_size): start += batch_size return float(np.mean(auc_list)), float(np.mean(acc_list)) -def predict(sess, args, model, data, ripple_set, batch_size): +def predict(sess, args, model, data, ripple_set): + batch_size = args.batch_size start = 0 labels_list = [] scores_list = [] @@ -85,5 +86,6 @@ def predict(sess, args, model, data, ripple_set, batch_size): labels, scores = model.return_scores(sess, get_feed_dict(args, model, data, ripple_set, start, start + batch_size)) labels_list.append(labels) scores_list.append(scores) + start += batch_size predictions_list = [1 if i >= 0.5 else 0 for i in scores_list] return labels_list, scores_list, predictions_list \ No newline at end of file From 865b305bc1c0012156a72874def4605f9b999d49 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Tue, 3 Dec 2019 17:48:22 +0100 Subject: [PATCH 12/75] remove predictions return --- reco_utils/recommender/ripplenet/train.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/reco_utils/recommender/ripplenet/train.py b/reco_utils/recommender/ripplenet/train.py index 7d0eb7cfdc..c1ca5c3366 100644 --- a/reco_utils/recommender/ripplenet/train.py +++ b/reco_utils/recommender/ripplenet/train.py @@ -87,5 +87,4 @@ def predict(sess, args, model, data, ripple_set): labels_list.append(labels) scores_list.append(scores) start += batch_size - predictions_list = [1 if i >= 0.5 else 0 for i in scores_list] - return labels_list, scores_list, predictions_list \ No newline at end of file + return labels_list, scores_list \ No newline at end of file From 5177055d770aa8a4509d3102450f93140bd3f040 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Tue, 3 Dec 2019 18:14:32 +0100 Subject: [PATCH 13/75] predict to output lists --- reco_utils/recommender/ripplenet/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reco_utils/recommender/ripplenet/train.py 
b/reco_utils/recommender/ripplenet/train.py index c1ca5c3366..b80f1ff8d2 100644 --- a/reco_utils/recommender/ripplenet/train.py +++ b/reco_utils/recommender/ripplenet/train.py @@ -87,4 +87,5 @@ def predict(sess, args, model, data, ripple_set): labels_list.append(labels) scores_list.append(scores) start += batch_size - return labels_list, scores_list \ No newline at end of file + + return list(np.concatenate(labels_list)), list(np.concatenate(scores_list)) \ No newline at end of file From e7d544ecf63d0fce616f0573adc3ac9217ce90ee Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Tue, 3 Dec 2019 18:14:50 +0100 Subject: [PATCH 14/75] updated notebook for new structure and functions --- notebooks/02_model/rippleNet_deep_dive.ipynb | 82 +++++++++++++++++--- 1 file changed, 72 insertions(+), 10 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index d3d4eb4252..e9154c61d5 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -27,9 +27,12 @@ "sys.path.append(\"../../\")\n", "import pandas as pd\n", "import numpy as np\n", + "import tensorflow as tf\n", "import os\n", "import argparse \n", + "from reco_utils.evaluation.python_evaluation import auc\n", "from reco_utils.dataset import movielens\n", + "from reco_utils.dataset.python_splitters import python_stratified_split\n", "\n", "from reco_utils.recommender.ripplenet.preprocess import (read_item_index_to_entity_id_file, \n", " convert_rating, \n", @@ -40,8 +43,9 @@ " load_kg, \n", " get_ripple_set)\n", "\n", - "from reco_utils.recommender.ripplenet.train import (train)\n", + "from reco_utils.recommender.ripplenet.train import (fit, predict)\n", "\n", + "from reco_utils.recommender.ripplenet.model import RippleNet\n", "\n", "print(\"System version: {}\".format(sys.version))\n", "print(\"Pandas version: {}\".format(pd.__version__))" @@ -77,7 +81,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.19kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.27kKB/s]\n" ] } ], @@ -166,7 +170,7 @@ "output_type": "stream", "text": [ "converting rating file ...\n", - "number of users: 943\n", + "number of users: 942\n", "number of items: 1677\n" ] } @@ -200,7 +204,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Load data" + "## Split Data" ] }, { @@ -208,6 +212,25 @@ "execution_count": 14, "metadata": {}, "outputs": [], + "source": [ + "train_data, test_eval_data = python_stratified_split(ratings_final, ratio=0.6, col_user='user_index', col_item='item', seed=42)\n", + "test_data, eval_data = python_stratified_split(ratings_final, ratio=0.5, col_user='user_index', col_item='item', seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "user_history_dict = train_data.loc[train_data.rating == 1].groupby('user_index')['item'].apply(list).to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], "source": [ "parser = argparse.ArgumentParser() \n", "parser.add_argument('--dataset', type=str, default='movielens100k', help='which dataset to use') \n", @@ -226,14 +249,13 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "splitting dataset ...\n", "reading KG file ...\n", "constructing knowledge graph ...\n", "constructing ripple set ...\n" @@ 
-241,28 +263,68 @@ } ], "source": [ - "train_data, eval_data, test_data, user_history_dict = dataset_split(ratings_final)\n", "n_entity, n_relation, kg = load_kg(kg_final)\n", "ripple_set = get_ripple_set(args, kg, user_history_dict)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build model" + ] + }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "data_info = train_data.to_numpy(), eval_data.to_numpy(), test_data.to_numpy(), n_entity, n_relation, ripple_set\n", "show_loss = False" ] }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "data_info = train_data.to_numpy(), eval_data.to_numpy(), test_data.to_numpy(), n_entity, n_relation, ripple_set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train and test and eval metrics at once\n", + "# train(args, data_info, show_loss)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ripple = RippleNet(args, n_entity, n_relation)\n", + "\n", + "with tf.Session() as sess:\n", + " model = fit(sess, args, ripple, train_data.to_numpy(), ripple_set, show_loss)\n", + " labels, scores = predict(sess, args, model, test_data.to_numpy(), ripple_set)\n", + "\n", + "predictions = [1 if i >= 0.5 else 0 for i in scores]" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "train(args, data_info, show_loss)" + "from sklearn.metrics import roc_auc_score\n", + "roc_auc_score(y_true=labels, y_score=scores)" ] } ], From 134a3a9e449c18d735febecf9e67e133f9c2a7cc Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Wed, 4 Dec 2019 11:59:03 +0100 Subject: [PATCH 15/75] substituted args by explicit parameters --- reco_utils/recommender/ripplenet/data_loader.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/reco_utils/recommender/ripplenet/data_loader.py b/reco_utils/recommender/ripplenet/data_loader.py index be4ef71506..f41167ffaa 100644 --- a/reco_utils/recommender/ripplenet/data_loader.py +++ b/reco_utils/recommender/ripplenet/data_loader.py @@ -6,10 +6,10 @@ import numpy as np -def load_data(ratings_final, kg_final, args): +def load_data(ratings_final, kg_final, n_hop, n_memory): train_data, eval_data, test_data, user_history_dict = dataset_split(ratings_final) n_entity, n_relation, kg = load_kg(kg_final) - ripple_set = get_ripple_set(args, kg, user_history_dict) + ripple_set = get_ripple_set(kg, user_history_dict, n_hop, n_memory) return train_data, eval_data, test_data, n_entity, n_relation, ripple_set def dataset_split(rating_np): @@ -68,14 +68,14 @@ def construct_kg(kg_np): return kg -def get_ripple_set(args, kg, user_history_dict): +def get_ripple_set(kg, user_history_dict, n_hop, n_memory): print('constructing ripple set ...') # user -> [(hop_0_heads, hop_0_relations, hop_0_tails), (hop_1_heads, hop_1_relations, hop_1_tails), ...] 
ripple_set = collections.defaultdict(list) for user in user_history_dict: - for h in range(args.n_hop): + for h in range(n_hop): memories_h = [] memories_r = [] memories_t = [] @@ -98,8 +98,8 @@ def get_ripple_set(args, kg, user_history_dict): ripple_set[user].append(ripple_set[user][-1]) else: # sample a fixed-size 1-hop memory for each user - replace = len(memories_h) < args.n_memory - indices = np.random.choice(len(memories_h), size=args.n_memory, replace=replace) + replace = len(memories_h) < n_memory + indices = np.random.choice(len(memories_h), size=n_memory, replace=replace) memories_h = [memories_h[i] for i in indices] memories_r = [memories_r[i] for i in indices] memories_t = [memories_t[i] for i in indices] From f7fbfb22f53260002c934cbcfe13988599e923b9 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Wed, 4 Dec 2019 11:59:28 +0100 Subject: [PATCH 16/75] substituted args by explicit parameters --- reco_utils/recommender/ripplenet/train.py | 46 +++++++++++++---------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/reco_utils/recommender/ripplenet/train.py b/reco_utils/recommender/ripplenet/train.py index b80f1ff8d2..0afe9d701a 100644 --- a/reco_utils/recommender/ripplenet/train.py +++ b/reco_utils/recommender/ripplenet/train.py @@ -5,7 +5,11 @@ import numpy as np from reco_utils.recommender.ripplenet.model import RippleNet -def train(args, data_info, show_loss): +def train(n_epoch, batch_size, + dim, n_hop, kge_weight, l2_weight, lr, + n_memory, item_update_mode, using_all_hops, + data_info, show_loss): + train_data = data_info[0] eval_data = data_info[1] test_data = data_info[2] @@ -13,77 +17,79 @@ def train(args, data_info, show_loss): n_relation = data_info[4] ripple_set = data_info[5] - model = RippleNet(args, n_entity, n_relation) + model = RippleNet(dim, n_hop, kge_weight, l2_weight, lr, + n_memory, item_update_mode, using_all_hops, n_entity, n_relation) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) - for step in range(args.n_epoch): + for step in range(n_epoch): # training np.random.shuffle(train_data) start = 0 while start < train_data.shape[0]: _, loss = model.train( - sess, get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size)) - start += args.batch_size + sess, get_feed_dict(n_hop, model, train_data, ripple_set, start, start + batch_size)) + start += batch_size if show_loss: print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) # evaluation - train_auc, train_acc = evaluation(sess, args, model, train_data, ripple_set, args.batch_size) - eval_auc, eval_acc = evaluation(sess, args, model, eval_data, ripple_set, args.batch_size) - test_auc, test_acc = evaluation(sess, args, model, test_data, ripple_set, args.batch_size) + train_auc, train_acc = evaluation(sess, n_hop, model, train_data, ripple_set, batch_size) + eval_auc, eval_acc = evaluation(sess, n_hop, model, eval_data, ripple_set, batch_size) + test_auc, test_acc = evaluation(sess, n_hop, model, test_data, ripple_set, batch_size) print('epoch %d train auc: %.4f acc: %.4f eval auc: %.4f acc: %.4f test auc: %.4f acc: %.4f' % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc)) -def fit(sess, args, model, train_data, ripple_set, show_loss): +def fit(sess, + n_epoch, batch_size,n_hop, + model, train_data, ripple_set, show_loss): sess.run(tf.global_variables_initializer()) - for step in range(args.n_epoch): + for step in range(n_epoch): # training np.random.shuffle(train_data) start = 0 while start < train_data.shape[0]: _, 
loss = model.train( - sess, get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size)) - start += args.batch_size + sess, get_feed_dict(n_hop, model, train_data, ripple_set, start, start + batch_size)) + start += batch_size if show_loss: print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) - train_auc, train_acc = evaluation(sess, args, model, train_data, ripple_set, args.batch_size) + train_auc, train_acc = evaluation(sess, n_hop, model, train_data, ripple_set, batch_size) print('epoch %d train auc: %.4f acc: %.4f' % (step, train_auc, train_acc)) return model -def get_feed_dict(args, model, data, ripple_set, start, end): +def get_feed_dict(n_hop, model, data, ripple_set, start, end): feed_dict = dict() feed_dict[model.items] = data[start:end, 1] feed_dict[model.labels] = data[start:end, 2] - for i in range(args.n_hop): + for i in range(n_hop): feed_dict[model.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] feed_dict[model.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] feed_dict[model.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] return feed_dict -def evaluation(sess, args, model, data, ripple_set, batch_size): +def evaluation(sess, n_hop, model, data, ripple_set, batch_size): start = 0 auc_list = [] acc_list = [] while start < data.shape[0]: - auc, acc = model.eval(sess, get_feed_dict(args, model, data, ripple_set, start, start + batch_size)) + auc, acc = model.eval(sess, get_feed_dict(n_hop, model, data, ripple_set, start, start + batch_size)) auc_list.append(auc) acc_list.append(acc) start += batch_size return float(np.mean(auc_list)), float(np.mean(acc_list)) -def predict(sess, args, model, data, ripple_set): - batch_size = args.batch_size +def predict(sess, batch_size, n_hop, model, data, ripple_set): start = 0 labels_list = [] scores_list = [] while start < data.shape[0]: - labels, scores = model.return_scores(sess, get_feed_dict(args, model, data, ripple_set, start, start + batch_size)) + labels, scores = model.return_scores(sess, get_feed_dict(n_hop, model, data, ripple_set, start, start + batch_size)) labels_list.append(labels) scores_list.append(scores) start += batch_size From 355e2f1a62200720cf97928522610c047a54c509 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Wed, 4 Dec 2019 11:59:52 +0100 Subject: [PATCH 17/75] substituted args by explicit parameters in init --- reco_utils/recommender/ripplenet/model.py | 26 +++++++++++++---------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 0caa569324..0530ade8eb 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -6,25 +6,29 @@ from sklearn.metrics import roc_auc_score class RippleNet(object): - def __init__(self, args, n_entity, n_relation): - self._parse_args(args, n_entity, n_relation) + def __init__(self, dim, n_hop, kge_weight, l2_weight, lr, + n_memory, item_update_mode, using_all_hops, n_entity, n_relation): + self._parse_args(dim, n_hop, kge_weight, l2_weight, lr, + n_memory, item_update_mode, using_all_hops, n_entity, n_relation) self._build_inputs() self._build_embeddings() self._build_model() self._build_loss() self._build_train() - def _parse_args(self, args, n_entity, n_relation): + def _parse_args(self, dim, n_hop, kge_weight, l2_weight, lr, + n_memory, item_update_mode, using_all_hops, + n_entity, n_relation): self.n_entity = n_entity
self.n_relation = n_relation - self.dim = args.dim - self.n_hop = args.n_hop - self.kge_weight = args.kge_weight - self.l2_weight = args.l2_weight - self.lr = args.lr - self.n_memory = args.n_memory - self.item_update_mode = args.item_update_mode - self.using_all_hops = args.using_all_hops + self.dim = dim + self.n_hop = n_hop + self.kge_weight = kge_weight + self.l2_weight = l2_weight + self.lr = lr + self.n_memory = n_memory + self.item_update_mode = item_update_mode + self.using_all_hops = using_all_hops def _build_inputs(self): self.items = tf.placeholder(dtype=tf.int32, shape=[None], name="items") From ea069c933cc9afe78f0d579fecd1502a71bea949 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Wed, 4 Dec 2019 12:00:04 +0100 Subject: [PATCH 18/75] functions expect explicit parameters --- notebooks/02_model/rippleNet_deep_dive.ipynb | 99 ++++++++------------ 1 file changed, 40 insertions(+), 59 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index e9154c61d5..32b1df136c 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": { "tags": [ "parameters" @@ -62,7 +62,18 @@ "outputs": [], "source": [ "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = '100k'" + "MOVIELENS_DATA_SIZE = '100k'\n", + "# Ripple parameters\n", + "n_epoch = 10 #the number of epochs\n", + "batch_size = 1024 #batch size\n", + "dim = 16 #dimension of entity and relation embeddings\n", + "n_hop = 2 #maximum hops\n", + "kge_weight = 0.01 #weight of the KGE term\n", + "l2_weight = 1e-7 #weight of the l2 regularization term\n", + "lr = 0.02 #learning rate\n", + "n_memory = 32 #size of ripple set for each hop\n", + "item_update_mode = 'plus_transform' #how to update item at the end of each hop\n", + "using_all_hops = True #whether using outputs of all hops or just the last hop when making prediction" ] }, { @@ -74,14 +85,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.27kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.20kKB/s]\n" ] } ], @@ -97,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -109,7 +120,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -123,7 +134,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -136,7 +147,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -153,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -162,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -181,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "metadata": { "scrolled": true }, @@ -209,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": {}, "outputs": [], 
"source": [ @@ -219,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -228,28 +239,7 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "parser = argparse.ArgumentParser() \n", - "parser.add_argument('--dataset', type=str, default='movielens100k', help='which dataset to use') \n", - "parser.add_argument('--dim', type=int, default=16, help='dimension of entity and relation embeddings') \n", - "parser.add_argument('--n_hop', type=int, default=2, help='maximum hops') \n", - "parser.add_argument('--kge_weight', type=float, default=0.01, help='weight of the KGE term') \n", - "parser.add_argument('--l2_weight', type=float, default=1e-7, help='weight of the l2 regularization term') \n", - "parser.add_argument('--lr', type=float, default=0.02, help='learning rate') \n", - "parser.add_argument('--batch_size', type=int, default=1024, help='batch size') \n", - "parser.add_argument('--n_epoch', type=int, default=10, help='the number of epochs') \n", - "parser.add_argument('--n_memory', type=int, default=32, help='size of ripple set for each hop') \n", - "parser.add_argument('--item_update_mode', type=str, default='plus_transform', help='how to update item at the end of each hop') \n", - "parser.add_argument('--using_all_hops', type=bool, default=True, help='whether using outputs of all hops or just the last hop when making prediction') \n", - "args = parser.parse_args(\"--dataset movielens100k\".split())" - ] - }, - { - "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -264,7 +254,7 @@ ], "source": [ "n_entity, n_relation, kg = load_kg(kg_final)\n", - "ripple_set = get_ripple_set(args, kg, user_history_dict)" + "ripple_set = get_ripple_set(kg, user_history_dict, n_hop=n_hop, n_memory=n_memory)" ] }, { @@ -283,36 +273,27 @@ "show_loss = False" ] }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "data_info = train_data.to_numpy(), eval_data.to_numpy(), test_data.to_numpy(), n_entity, n_relation, ripple_set" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Train and test and eval metrics at once\n", - "# train(args, data_info, show_loss)" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "ripple = RippleNet(args, n_entity, n_relation)\n", + "ripple = RippleNet(dim=dim,n_hop=n_hop,\n", + " kge_weight=kge_weight, l2_weight=l2_weight, lr=lr,\n", + " n_memory=n_memory,\n", + " item_update_mode=item_update_mode, using_all_hops=using_all_hops,\n", + " n_entity=n_entity,n_relation=n_relation)\n", "\n", "with tf.Session() as sess:\n", - " model = fit(sess, args, ripple, train_data.to_numpy(), ripple_set, show_loss)\n", - " labels, scores = predict(sess, args, model, test_data.to_numpy(), ripple_set)\n", + " model = fit(sess=sess, \n", + " n_epoch=n_epoch, batch_size=batch_size,n_hop=n_hop,\n", + " model=ripple, train_data=train_data.to_numpy(), \n", + " ripple_set=ripple_set, show_loss=show_loss)\n", + " labels, scores = predict(sess=sess, \n", + " batch_size=batch_size, n_hop=n_hop, \n", + " model=model, data=test_data.to_numpy(),\n", + " ripple_set=ripple_set)\n", "\n", "predictions = [1 if i >= 0.5 else 0 for i in scores]" ] From 1499f3d913bfd162adfe19182f8c0cda48352c55 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Wed, 4 Dec 2019 13:34:02 
+0100 Subject: [PATCH 19/75] add exception for lack of data --- reco_utils/recommender/ripplenet/train.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/reco_utils/recommender/ripplenet/train.py b/reco_utils/recommender/ripplenet/train.py index 0afe9d701a..c513b57a71 100644 --- a/reco_utils/recommender/ripplenet/train.py +++ b/reco_utils/recommender/ripplenet/train.py @@ -67,9 +67,12 @@ def get_feed_dict(n_hop, model, data, ripple_set, start, end): feed_dict[model.items] = data[start:end, 1] feed_dict[model.labels] = data[start:end, 2] for i in range(n_hop): - feed_dict[model.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] - feed_dict[model.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] - feed_dict[model.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] + try: + feed_dict[model.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] + feed_dict[model.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] + feed_dict[model.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] + except: + print("Skipping user for lack of data") return feed_dict From 195f1cf539d4a3faa8e4baceb9b6f3521537d2ff Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Wed, 4 Dec 2019 13:40:40 +0100 Subject: [PATCH 20/75] undo last commit --- reco_utils/recommender/ripplenet/train.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/reco_utils/recommender/ripplenet/train.py b/reco_utils/recommender/ripplenet/train.py index c513b57a71..0afe9d701a 100644 --- a/reco_utils/recommender/ripplenet/train.py +++ b/reco_utils/recommender/ripplenet/train.py @@ -67,12 +67,9 @@ def get_feed_dict(n_hop, model, data, ripple_set, start, end): feed_dict[model.items] = data[start:end, 1] feed_dict[model.labels] = data[start:end, 2] for i in range(n_hop): - try: - feed_dict[model.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] - feed_dict[model.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] - feed_dict[model.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] - except: - print("Skipping user for lack of data") + feed_dict[model.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] + feed_dict[model.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] + feed_dict[model.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] return feed_dict From 9de04d14ee7dd022f81090b4a408226c0160e83a Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Wed, 4 Dec 2019 14:02:09 +0100 Subject: [PATCH 21/75] format and updated seed --- notebooks/02_model/rippleNet_deep_dive.ipynb | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 32b1df136c..057da363b7 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -224,8 +224,8 @@ "metadata": {}, "outputs": [], "source": [ - "train_data, test_eval_data = python_stratified_split(ratings_final, ratio=0.6, col_user='user_index', col_item='item', seed=42)\n", - "test_data, eval_data = python_stratified_split(ratings_final, ratio=0.5, col_user='user_index', col_item='item', seed=42)" + "train_data, test_eval_data = python_stratified_split(ratings_final, ratio=0.6, col_user='user_index', col_item='item', seed=12)\n", + "test_data, 
eval_data = python_stratified_split(ratings_final, ratio=0.5, col_user='user_index', col_item='item', seed=12)" ] }, { @@ -257,6 +257,17 @@ "ripple_set = get_ripple_set(kg, user_history_dict, n_hop=n_hop, n_memory=n_memory)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feed_dict = dict()\n", + "for user in data[start:end, 0]:\n", + " feed_dict[model.memories_h[i]] = ripple_set[user][i][0]" + ] + }, { "cell_type": "markdown", "metadata": {}, From 5d5baacf21b1d8370ab04c38e35c6773757bdfb2 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 14 Dec 2019 12:46:09 +0100 Subject: [PATCH 22/75] remove unused file --- reco_utils/recommender/ripplenet/main.py | 48 ------------------------ 1 file changed, 48 deletions(-) delete mode 100644 reco_utils/recommender/ripplenet/main.py diff --git a/reco_utils/recommender/ripplenet/main.py b/reco_utils/recommender/ripplenet/main.py deleted file mode 100644 index 26f3e3708f..0000000000 --- a/reco_utils/recommender/ripplenet/main.py +++ /dev/null @@ -1,48 +0,0 @@ -# This code is modified from RippleNet -# Online code of RippleNet: https://github.com/hwwang55/RippleNet - -import argparse -import numpy as np -from data_loader import load_data -from train import train - -np.random.seed(555) - -parser = argparse.ArgumentParser() -parser.add_argument('--dataset', type=str, default='movie', help='which dataset to use') -parser.add_argument('--dim', type=int, default=16, help='dimension of entity and relation embeddings') -parser.add_argument('--n_hop', type=int, default=2, help='maximum hops') -parser.add_argument('--kge_weight', type=float, default=0.01, help='weight of the KGE term') -parser.add_argument('--l2_weight', type=float, default=1e-7, help='weight of the l2 regularization term') -parser.add_argument('--lr', type=float, default=0.02, help='learning rate') -parser.add_argument('--batch_size', type=int, default=1024, help='batch size') -parser.add_argument('--n_epoch', type=int, default=10, help='the number of epochs') -parser.add_argument('--n_memory', type=int, default=32, help='size of ripple set for each hop') -parser.add_argument('--item_update_mode', type=str, default='plus_transform', - help='how to update item at the end of each hop') -parser.add_argument('--using_all_hops', type=bool, default=True, - help='whether using outputs of all hops or just the last hop when making prediction') - -''' -# default settings for Book-Crossing -parser = argparse.ArgumentParser() -parser.add_argument('--dataset', type=str, default='book', help='which dataset to use') -parser.add_argument('--dim', type=int, default=4, help='dimension of entity and relation embeddings') -parser.add_argument('--n_hop', type=int, default=2, help='maximum hops') -parser.add_argument('--kge_weight', type=float, default=1e-2, help='weight of the KGE term') -parser.add_argument('--l2_weight', type=float, default=1e-5, help='weight of the l2 regularization term') -parser.add_argument('--lr', type=float, default=1e-3, help='learning rate') -parser.add_argument('--batch_size', type=int, default=1024, help='batch size') -parser.add_argument('--n_epoch', type=int, default=10, help='the number of epochs') -parser.add_argument('--n_memory', type=int, default=32, help='size of ripple set for each hop') -parser.add_argument('--item_update_mode', type=str, default='plus_transform', - help='how to update item at the end of each hop') -parser.add_argument('--using_all_hops', type=bool, default=True, - help='whether using outputs of 
all hops or just the last hop when making prediction') -''' - -args = parser.parse_args() - -show_loss = False -data_info = load_data(args) -train(args, data_info, show_loss) From bf1eaddcdd4fdea233e2d2b8d20bed18d050ee9e Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 14 Dec 2019 12:46:28 +0100 Subject: [PATCH 23/75] function documentation --- .../recommender/ripplenet/preprocess.py | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py index 328f17b4ef..10e33b2b9b 100644 --- a/reco_utils/recommender/ripplenet/preprocess.py +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -6,7 +6,15 @@ import pandas as pd def read_item_index_to_entity_id_file(item_to_entity): - # file = '../data/' + DATASET + '/item_index2entity_id_rehashed.txt' + """Standardize indexes for items and entities + + Args: + item_to_entity: KG dataframe with original item and entity IDs + + Returns: + item_index_old2new: dictionary, conversion from original item ID to internal item ID + entity_id2index: dictionary, conversion from original entity ID to internal entity ID + """ item_index_old2new = dict() entity_id2index = dict() i = 0 @@ -20,7 +28,18 @@ def read_item_index_to_entity_id_file(item_to_entity): def convert_rating(ratings, item_index_old2new, threshold): - + """Apply item standardization to ratings dataset. + Use rating threshold to determine positive ratings + + Args: + ratings: dataframe, ratings with columns ["UserId", "ItemId", "Rating"] + item_index_old2new: dictionary, conversion from original item ID to internal item ID + threshold: minimum value for the rating to be considered positive + + Returns: + ratings_final: dataframe, ratings converted with columns userID, + internal item ID and binary rating (1, 0) + """ item_set = set(item_index_old2new.values()) user_pos_ratings = dict() user_neg_ratings = dict() @@ -71,6 +90,15 @@ def convert_rating(ratings, item_index_old2new, threshold): def convert_kg(kg, entity_id2index): + """Apply entity standardization to KG dataset + Args: + kg: dataframe, knowledge graph with columns ["original_entity_id", "relation", "linked_entities_id"] + entity_id2index: dictionary, conversion from original entity ID to internal entity ID + + Returns: + kg_final: dataframe, knowledge graph converted with columns head, + relation and tail, with internal entity IDs + """ print('converting kg file ...') entity_cnt = len(entity_id2index) relation_cnt = 0 From 19e5a94d76e4c36f715e686586b4f6b28d525793 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 14 Dec 2019 13:01:42 +0100 Subject: [PATCH 24/75] added documentation --- .../recommender/ripplenet/data_loader.py | 76 ++++++------------- 1 file changed, 25 insertions(+), 51 deletions(-) diff --git a/reco_utils/recommender/ripplenet/data_loader.py b/reco_utils/recommender/ripplenet/data_loader.py index f41167ffaa..542f44334b 100644 --- a/reco_utils/recommender/ripplenet/data_loader.py +++ b/reco_utils/recommender/ripplenet/data_loader.py @@ -6,69 +6,43 @@ import numpy as np -def load_data(ratings_final, kg_final, n_hop, n_memory): - train_data, eval_data, test_data, user_history_dict = dataset_split(ratings_final) - n_entity, n_relation, kg = load_kg(kg_final) - ripple_set = get_ripple_set(kg, user_history_dict, n_hop, n_memory) - return train_data, eval_data, test_data, n_entity, n_relation, ripple_set - -def dataset_split(rating_np): - print('splitting dataset ...') - - # train:eval:test = 6:2:2 -
eval_ratio = 0.2 - test_ratio = 0.2 - n_ratings = rating_np.shape[0] - - eval_indices = np.random.choice(n_ratings, size=int(n_ratings * eval_ratio), replace=False) - left = set(range(n_ratings)) - set(eval_indices) - test_indices = np.random.choice(list(left), size=int(n_ratings * test_ratio), replace=False) - train_indices = list(left - set(test_indices)) - # print(len(train_indices), len(eval_indices), len(test_indices)) - - # traverse training data, only keeping the users with positive ratings - user_history_dict = dict() - for i in train_indices: - user = rating_np.iloc[i][0] - item = rating_np.iloc[i][1] - rating = rating_np.iloc[i][2] - if rating == 1: - if user not in user_history_dict: - user_history_dict[user] = [] - user_history_dict[user].append(item) - - train_indices = [i for i in train_indices if rating_np.iloc[i][0] in user_history_dict] - eval_indices = [i for i in eval_indices if rating_np.iloc[i][0] in user_history_dict] - test_indices = [i for i in test_indices if rating_np.iloc[i][0] in user_history_dict] - # print(len(train_indices), len(eval_indices), len(test_indices)) - - train_data = rating_np.iloc[train_indices] - eval_data = rating_np.iloc[eval_indices] - test_data = rating_np.iloc[test_indices] - - return train_data, eval_data, test_data, user_history_dict +def load_kg(kg_final): + """Load the knowledge graph and build its dictionary representation + Args: + kg_final: dataframe, knowledge graph converted with columns head, + relation and tail, with internal entity IDs - -def load_kg(kg_final): + Returns: + n_entity: int, number of entities in KG + n_relation: int, number of relations in KG + kg: KG in dictionary shape + """ print('reading KG file ...') n_entity = len(set(kg_final.iloc[:, 0]) | set(kg_final.iloc[:, 2])) n_relation = len(set(kg_final.iloc[:, 1])) - kg = construct_kg(kg_final) - - return n_entity, n_relation, kg - - -def construct_kg(kg_np): - print('constructing knowledge graph ...') kg = collections.defaultdict(list) - for index, row in kg_np.iterrows(): + for index, row in kg_final.iterrows(): kg[row["head"]].append((row["tail"], row["relation"])) - return kg + + return n_entity, n_relation, kg def get_ripple_set(kg, user_history_dict, n_hop, n_memory): + """Build Ripple Set, dictionary for the related entities in the KG + given the paths of users, number of hops and memory + + Args: + kg: KG in dictionary shape + user_history_dict: dataframe, train rating data with positive ratings + n_hop: int, maximum hops in the KG + n_memory: int, size of ripple set for each hop + + Returns: + ripple_set: set of knowledge triples per user positive rating, from 0 until n_hop + """ print('constructing ripple set ...') # user -> [(hop_0_heads, hop_0_relations, hop_0_tails), (hop_1_heads, hop_1_relations, hop_1_tails), ...]
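Patches 23 and 24 together document the whole preprocessing chain, from raw ratings and knowledge graph to the per-user ripple sets consumed by the model. As orientation between these two commits and the next, here is a minimal sketch of how the documented functions chain together at this point in the series. It is written under stated assumptions: the CSV paths are hypothetical placeholders, `threshold=1` plus the `n_hop`/`n_memory` values simply mirror the notebook defaults, and the train/eval/test split the notebook performs with `python_stratified_split` is omitted for brevity.

```python
import pandas as pd

from reco_utils.recommender.ripplenet.preprocess import (
    read_item_index_to_entity_id_file,
    convert_rating,
    convert_kg,
)
from reco_utils.recommender.ripplenet.data_loader import load_kg, get_ripple_set

# Hypothetical input files; any dataframes with the documented columns work.
item_to_entity = pd.read_csv("item_to_entity.csv")  # item ID -> entity ID mapping
ratings = pd.read_csv("ratings.csv")                # ["UserId", "ItemId", "Rating"]
kg_original = pd.read_csv("kg.csv")                 # ["original_entity_id", "relation", "linked_entities_id"]

# 1. Build internal, contiguous indexes for items and entities.
item_index_old2new, entity_id2index = read_item_index_to_entity_id_file(item_to_entity)

# 2. Binarize ratings (>= threshold -> 1) and sample one negative per positive.
ratings_final = convert_rating(ratings, item_index_old2new, threshold=1)

# 3. Re-index the knowledge graph with the same entity dictionary.
kg_final = convert_kg(kg_original, entity_id2index)

# 4. Load the KG as a head -> [(tail, relation), ...] dictionary.
n_entity, n_relation, kg = load_kg(kg_final)

# 5. Collect each user's positively rated items (the notebook does this on the
#    training split only) and expand them into per-hop ripple sets.
user_history_dict = (
    ratings_final[ratings_final.rating == 1]
    .groupby("user_index")["item"]
    .apply(list)
    .to_dict()
)
ripple_set = get_ripple_set(kg, user_history_dict, n_hop=2, n_memory=32)
```

Note that `get_ripple_set` returns a fixed-size memory block per hop: it samples exactly `n_memory` triples, drawing with replacement whenever a user's hop neighbourhood contains fewer than `n_memory` triples.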
From 5c6cd3bda32c3f232817d9d7e72fc6e0d714d50d Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 14 Dec 2019 13:08:57 +0100 Subject: [PATCH 25/75] added intro and acc calculation --- notebooks/02_model/rippleNet_deep_dive.ipynb | 74 +++++++++++++------- 1 file changed, 48 insertions(+), 26 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 057da363b7..c1ae27089e 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 15, "metadata": { "tags": [ "parameters" @@ -76,6 +76,28 @@ "using_all_hops = True #whether using outputs of all hops or just the last hop when making prediction" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> RippleNet: Propagating User Preferences on the Knowledge Graph for Recommender Systems\n", + "> Hongwei Wang, Fuzheng Zhang, Jialin Wang, Miao Zhao, Wenjie Li, Xing Xie, Minyi Guo\n", + "> The 27th ACM International Conference on Information and Knowledge Management (CIKM 2018)\n", + "\n", + "Online code of RippleNet: https://github.com/hwwang55/RippleNet\n", + "\n", + "To address the sparsity and cold start problem of collaborative filtering, researchers usually make use of side information, such as social networks or item attributes, to improve recommendation performance. This paper considers the knowledge graph as the source of side information. To address the limitations of existing embedding-based and path-based methods for knowledge-graph-aware recommendation, we propose RippleNet, an end-to-end framework that naturally incorporates the knowledge graph into recommender systems. Similar to actual ripples propagating on the water, RippleNet stimulates the propagation of user preferences over the set of knowledge entities by automatically and iteratively extending a user’s potential interests along links in the knowledge graph. The multiple \"ripples\" activated by a user’s historically clicked items are thus superposed to form the preference distribution of the user with respect to a candidate item, which could be used for predicting the final clicking probability. 
Through extensive experiments on real-world datasets, we demonstrate that RippleNet achieves substantial gains in a variety of scenarios, including movie, book and news recommendation, over several state-of-the-art baselines.\n", "\n", "![alt text](https://github.com/hwwang55/RippleNet/raw/master/framework.jpg)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -85,14 +107,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.20kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.04kKB/s]\n" ] } ], @@ -108,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -120,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -134,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -147,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -164,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -173,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -192,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 28, "metadata": { "scrolled": true }, @@ -220,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -230,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -239,7 +261,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -257,17 +279,6 @@ "ripple_set = get_ripple_set(kg, user_history_dict, n_hop=n_hop, n_memory=n_memory)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "feed_dict = dict()\n", - "for user in data[start:end, 0]:\n", - " feed_dict[model.memories_h[i]] = ripple_set[user][i][0]" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -316,7 +327,18 @@ "outputs": [], "source": [ "from sklearn.metrics import roc_auc_score\n", - "roc_auc_score(y_true=labels, y_score=scores)" + "auc = roc_auc_score(y_true=labels, y_score=scores)\n", + "acc = np.mean(np.equal(predictions, labels))" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"AUC: \", auc)\n", + "print(\"ACC: \", acc)" ] } ], From c5d87c51145d46357288e7277f83fe3b0bd7a847 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 14 Dec 2019 13:09:54 +0100 Subject: [PATCH 26/75] modified eval to use internal function --- reco_utils/recommender/ripplenet/model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 0530ade8eb..35d0cc049d 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -164,14 +164,14 @@ def _build_train(self): def train(self, sess, feed_dict): return sess.run([self.optimizer, self.loss], feed_dict) + + def return_scores(self, sess,
feed_dict): + labels, scores = sess.run([self.labels, self.scores_normalized], feed_dict) + return labels, scores def eval(self, sess, feed_dict): - labels, scores = sess.run([self.labels, self.scores_normalized], feed_dict) + labels, scores = return_scores(self, sess, feed_dict) auc = roc_auc_score(y_true=labels, y_score=scores) predictions = [1 if i >= 0.5 else 0 for i in scores] acc = np.mean(np.equal(predictions, labels)) return auc, acc - - def return_scores(self, sess, feed_dict): - labels, scores = sess.run([self.labels, self.scores_normalized], feed_dict) - return labels, scores From eaff00e249d95dc9e99526907d8c38a1991308c8 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 14 Dec 2019 13:10:15 +0100 Subject: [PATCH 27/75] reordered functions --- reco_utils/recommender/ripplenet/train.py | 111 +++++++++++----------- 1 file changed, 55 insertions(+), 56 deletions(-) diff --git a/reco_utils/recommender/ripplenet/train.py b/reco_utils/recommender/ripplenet/train.py index 0afe9d701a..b020cac240 100644 --- a/reco_utils/recommender/ripplenet/train.py +++ b/reco_utils/recommender/ripplenet/train.py @@ -5,41 +5,15 @@ import numpy as np from reco_utils.recommender.ripplenet.model import RippleNet -def train(n_epoch, batch_size, - dim, n_hop, kge_weight, l2_weight, lr, - n_memory, item_update_mode, using_all_hops, - data_info, show_loss): - - train_data = data_info[0] - eval_data = data_info[1] - test_data = data_info[2] - n_entity = data_info[3] - n_relation = data_info[4] - ripple_set = data_info[5] - - model = RippleNet(dim, n_hop, kge_weight, l2_weight, lr, - n_memory, item_update_mode, using_all_hops, n_entity, n_relation) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - for step in range(n_epoch): - # training - np.random.shuffle(train_data) - start = 0 - while start < train_data.shape[0]: - _, loss = model.train( - sess, get_feed_dict(n_hop, model, train_data, ripple_set, start, start + batch_size)) - start += batch_size - if show_loss: - print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) - - # evaluation - train_auc, train_acc = evaluation(sess, n_hop, model, train_data, ripple_set, batch_size) - eval_auc, eval_acc = evaluation(sess, n_hop, model, eval_data, ripple_set, batch_size) - test_auc, test_acc = evaluation(sess, n_hop, model, test_data, ripple_set, batch_size) - - print('epoch %d train auc: %.4f acc: %.4f eval auc: %.4f acc: %.4f test auc: %.4f acc: %.4f' - % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc)) +def get_feed_dict(n_hop, model, data, ripple_set, start, end): + feed_dict = dict() + feed_dict[model.items] = data[start:end, 1] + feed_dict[model.labels] = data[start:end, 2] + for i in range(n_hop): + feed_dict[model.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] + feed_dict[model.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] + feed_dict[model.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] + return feed_dict def fit(sess, n_epoch, batch_size,n_hop, @@ -62,16 +36,17 @@ def fit(sess, % (step, train_auc, train_acc)) return model -def get_feed_dict(n_hop, model, data, ripple_set, start, end): - feed_dict = dict() - feed_dict[model.items] = data[start:end, 1] - feed_dict[model.labels] = data[start:end, 2] - for i in range(n_hop): - feed_dict[model.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] - feed_dict[model.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] - 
feed_dict[model.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] - return feed_dict - +def predict(sess, batch_size, n_hop, model, data, ripple_set): + start = 0 + labels_list = [] + scores_list = [] + while start < data.shape[0]: + labels, scores = model.return_scores(sess, get_feed_dict(n_hop, model, data, ripple_set, start, start + batch_size)) + labels_list.append(labels) + scores_list.append(scores) + start += batch_size + + return list(np.concatenate(labels_list)), list(np.concatenate(scores_list)) def evaluation(sess, n_hop, model, data, ripple_set, batch_size): start = 0 @@ -84,14 +59,38 @@ def evaluation(sess, n_hop, model, data, ripple_set, batch_size): start += batch_size return float(np.mean(auc_list)), float(np.mean(acc_list)) -def predict(sess, batch_size, n_hop, model, data, ripple_set): - start = 0 - labels_list = [] - scores_list = [] - while start < data.shape[0]: - labels, scores = model.return_scores(sess, get_feed_dict(n_hop, model, data, ripple_set, start, start + batch_size)) - labels_list.append(labels) - scores_list.append(scores) - start += batch_size - - return list(np.concatenate(labels_list)), list(np.concatenate(scores_list)) \ No newline at end of file +def train(n_epoch, batch_size, + dim, n_hop, kge_weight, l2_weight, lr, + n_memory, item_update_mode, using_all_hops, + data_info, show_loss): + + train_data = data_info[0] + eval_data = data_info[1] + test_data = data_info[2] + n_entity = data_info[3] + n_relation = data_info[4] + ripple_set = data_info[5] + + model = RippleNet(dim, n_hop, kge_weight, l2_weight, lr, + n_memory, item_update_mode, using_all_hops, n_entity, n_relation) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + for step in range(n_epoch): + # training + np.random.shuffle(train_data) + start = 0 + while start < train_data.shape[0]: + _, loss = model.train( + sess, get_feed_dict(n_hop, model, train_data, ripple_set, start, start + batch_size)) + start += batch_size + if show_loss: + print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) + + # evaluation + train_auc, train_acc = evaluation(sess, n_hop, model, train_data, ripple_set, batch_size) + eval_auc, eval_acc = evaluation(sess, n_hop, model, eval_data, ripple_set, batch_size) + test_auc, test_acc = evaluation(sess, n_hop, model, test_data, ripple_set, batch_size) + + print('epoch %d train auc: %.4f acc: %.4f eval auc: %.4f acc: %.4f test auc: %.4f acc: %.4f' + % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc)) \ No newline at end of file From 701c9e913ae5f53228360ba03f2d85288f20d663 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 14 Dec 2019 14:46:54 +0100 Subject: [PATCH 28/75] return scores --- reco_utils/recommender/ripplenet/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 35d0cc049d..5d7299c424 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -170,7 +170,7 @@ def return_scores(self, sess, feed_dict): return labels, scores def eval(self, sess, feed_dict): - labels, scores = return_scores(self, sess, feed_dict) + labels, scores = sess.run([self.labels, self.scores_normalized], feed_dict) auc = roc_auc_score(y_true=labels, y_score=scores) predictions = [1 if i >= 0.5 else 0 for i in scores] acc = np.mean(np.equal(predictions, labels)) From a617d77eb0fbec647a93ef4597be18cc7632f1ff Mon Sep 17 00:00:00 2001 From: almudenasanz 
Date: Sat, 14 Dec 2019 14:54:40 +0100 Subject: [PATCH 29/75] added data types --- .../recommender/ripplenet/data_loader.py | 18 ++++++++--------- .../recommender/ripplenet/preprocess.py | 20 +++++++++---------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/reco_utils/recommender/ripplenet/data_loader.py b/reco_utils/recommender/ripplenet/data_loader.py index 542f44334b..fb9b988f20 100644 --- a/reco_utils/recommender/ripplenet/data_loader.py +++ b/reco_utils/recommender/ripplenet/data_loader.py @@ -10,13 +10,13 @@ def load_kg(kg_final): """Load the knowledge graph and build its dictionary representation Args: - kg_final: dataframe, knowledge graph converted with columns head, + kg_final (pd.DataFrame): knowledge graph converted with columns head, relation and tail, with internal entity IDs Returns: - n_entity: int, number of entities in KG - n_relation: int, number of relations in KG - kg: KG in dictionary shape + n_entity (int): number of entities in KG + n_relation (int): number of relations in KG + kg (dictionary): KG in dictionary shape """ print('reading KG file ...') @@ -35,13 +35,13 @@ def get_ripple_set(kg, user_history_dict, n_hop, n_memory): given the paths of users, number of hops and memory Args: - kg: KG in dictionary shape - user_history_dict: dataframe, train rating data with positive ratings - n_hop: int, maximum hops in the KG - n_memory: int, size of ripple set for each hop + kg (dictionary): KG in dictionary shape + user_history_dict (dictionary): train rating data with positive ratings + n_hop (int): maximum hops in the KG + n_memory (int): size of ripple set for each hop Returns: - ripple_set: set of knowledge triples per user positive rating, from 0 until n_hop + ripple_set (dictionary): set of knowledge triples per user positive rating, from 0 until n_hop """ print('constructing ripple set ...') diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py index 10e33b2b9b..1f85efc5c6 100644 --- a/reco_utils/recommender/ripplenet/preprocess.py +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -9,11 +9,11 @@ def read_item_index_to_entity_id_file(item_to_entity): """Standardize indexes for items and entities Args: - item_to_entity: KG dataframe with original item and entity IDs + item_to_entity (pd.DataFrame): KG dataframe with original item and entity IDs Returns: - item_index_old2new: dictionary, conversion from original item ID to internal item ID - entity_id2index: dictionary, conversion from original entity ID to internal entity ID + item_index_old2new (dictionary): conversion from original item ID to internal item ID + entity_id2index (dictionary): conversion from original entity ID to internal entity ID """ item_index_old2new = dict() entity_id2index = dict() @@ -32,12 +32,12 @@ def convert_rating(ratings, item_index_old2new, threshold): Use rating threshold to determine positive ratings Args: - ratings: dataframe, ratings with columns ["UserId", "ItemId", "Rating"] - item_index_old2new: dictionary, conversion from original item ID to internal item ID - threshold: minimum value for the rating to be considered positive + ratings (pd.DataFrame): ratings with columns ["UserId", "ItemId", "Rating"] + item_index_old2new (dictionary): conversion from original item ID to internal item ID + threshold (int): minimum value for the rating to be considered positive Returns: - ratings_final: dataframe, ratings converted with columns userID, + ratings_final (pd.DataFrame): ratings
converted with columns userID, internal item ID and binary rating (1, 0) """ item_set = set(item_index_old2new.values()) @@ -92,11 +92,11 @@ def convert_kg(kg, entity_id2index): """Apply entity standardization to KG dataset Args: - kg: dataframe, knowledge graph with columns ["original_entity_id", "relation", "linked_entities_id"] - entity_id2index: dictionary, conversion from original entity ID to internal entity ID + kg (pd.DataFrame): knowledge graph with columns ["original_entity_id", "relation", "linked_entities_id"] + entity_id2index (dictionary): conversion from original entity ID to internal entity ID Returns: - kg_final: dataframe, knowledge graph converted with columns head, + kg_final (pd.DataFrame): knowledge graph converted with columns head, relation and tail, with internal entity IDs """ print('converting kg file ...') From 2a61af536b9804ecbc00b2b8893ae81553f294d1 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 14 Dec 2019 15:09:03 +0100 Subject: [PATCH 30/75] keep original rating value --- reco_utils/recommender/ripplenet/preprocess.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py index 1f85efc5c6..26fd0e02f7 100644 --- a/reco_utils/recommender/ripplenet/preprocess.py +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -56,11 +56,11 @@ def convert_rating(ratings, item_index_old2new, threshold): if rating >= threshold: if user_index_old not in user_pos_ratings: user_pos_ratings[user_index_old] = set() - user_pos_ratings[user_index_old].add(item_index) + user_pos_ratings[user_index_old].add((item_index, rating)) else: if user_index_old not in user_neg_ratings: user_neg_ratings[user_index_old] = set() - user_neg_ratings[user_index_old].add(item_index) + user_neg_ratings[user_index_old].add((item_index, rating)) print('converting rating file ...') writer = [] @@ -71,18 +71,19 @@ def convert_rating(ratings, item_index_old2new, threshold): user_index_old2new[user_index_old] = user_cnt user_cnt += 1 user_index = user_index_old2new[user_index_old] - - for item in pos_item_set: + for item, original_rating in pos_item_set: writer.append({"user_index": user_index, "item": item, - "rating": 1}) + "rating": 1, + "original_rating": original_rating}) unwatched_set = item_set - pos_item_set if user_index_old in user_neg_ratings: unwatched_set -= user_neg_ratings[user_index_old] for item in np.random.choice(list(unwatched_set), size=len(pos_item_set), replace=False): writer.append({"user_index": user_index, "item": item, - "rating": 0}) + "rating": 0, + "original_rating": 0}) ratings_final = pd.DataFrame(writer) print('number of users: %d' % user_cnt) print('number of items: %d' % len(item_set)) From 03b5c3ff28bdb19cab2756c94ff127cebc83bb59 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 14 Dec 2019 15:27:15 +0100 Subject: [PATCH 31/75] undo last commit for trial --- notebooks/02_model/rippleNet_deep_dive.ipynb | 1 - reco_utils/recommender/ripplenet/preprocess.py | 13 ++++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index c1ae27089e..43fe820a55 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -39,7 +39,6 @@ " convert_kg)\n", "\n", "from reco_utils.recommender.ripplenet.data_loader import (\n", "
dataset_split,\n", " load_kg, \n", " get_ripple_set)\n", "\n", diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py index 26fd0e02f7..1f85efc5c6 100644 --- a/reco_utils/recommender/ripplenet/preprocess.py +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -56,11 +56,11 @@ def convert_rating(ratings, item_index_old2new, threshold): if rating >= threshold: if user_index_old not in user_pos_ratings: user_pos_ratings[user_index_old] = set() - user_pos_ratings[user_index_old].add((item_index, rating)) + user_pos_ratings[user_index_old].add(item_index) else: if user_index_old not in user_neg_ratings: user_neg_ratings[user_index_old] = set() - user_neg_ratings[user_index_old].add((item_index, rating)) + user_neg_ratings[user_index_old].add(item_index) print('converting rating file ...') writer = [] @@ -71,19 +71,18 @@ def convert_rating(ratings, item_index_old2new, threshold): user_index_old2new[user_index_old] = user_cnt user_cnt += 1 user_index = user_index_old2new[user_index_old] - for item, original_rating in pos_item_set: + + for item in pos_item_set: writer.append({"user_index": user_index, "item": item, - "rating": 1, - "original_rating": original_rating}) + "rating": 1}) unwatched_set = item_set - pos_item_set if user_index_old in user_neg_ratings: unwatched_set -= user_neg_ratings[user_index_old] for item in np.random.choice(list(unwatched_set), size=len(pos_item_set), replace=False): writer.append({"user_index": user_index, "item": item, - "rating": 0, - "original_rating": 0}) + "rating": 0}) ratings_final = pd.DataFrame(writer) print('number of users: %d' % user_cnt) print('number of items: %d' % len(item_set)) From c6091afad529eadb07f0fe6fd390df36fb544051 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 14 Dec 2019 16:06:18 +0100 Subject: [PATCH 32/75] ratings output with original rating --- reco_utils/recommender/ripplenet/preprocess.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py index 1f85efc5c6..c13ce853d3 100644 --- a/reco_utils/recommender/ripplenet/preprocess.py +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -56,11 +56,11 @@ def convert_rating(ratings, item_index_old2new, threshold): if rating >= threshold: if user_index_old not in user_pos_ratings: user_pos_ratings[user_index_old] = set() - user_pos_ratings[user_index_old].add(item_index) + user_pos_ratings[user_index_old].add((item_index, rating)) else: if user_index_old not in user_neg_ratings: user_neg_ratings[user_index_old] = set() - user_neg_ratings[user_index_old].add(item_index) + user_neg_ratings[user_index_old].add((item_index, rating)) print('converting rating file ...') writer = [] @@ -71,18 +71,20 @@ def convert_rating(ratings, item_index_old2new, threshold): user_index_old2new[user_index_old] = user_cnt user_cnt += 1 user_index = user_index_old2new[user_index_old] - - for item in pos_item_set: + for item, original_rating in pos_item_set: writer.append({"user_index": user_index, "item": item, - "rating": 1}) + "rating": 1, + "original_rating": original_rating}) + pos_item_set = set(i[0] for i in pos_item_set) unwatched_set = item_set - pos_item_set if user_index_old in user_neg_ratings: unwatched_set -= user_neg_ratings[user_index_old] for item in np.random.choice(list(unwatched_set), size=len(pos_item_set), replace=False): writer.append({"user_index": user_index, "item": item, - "rating": 0}) + "rating": 0, 
+ "original_rating": 0}) ratings_final = pd.DataFrame(writer) print('number of users: %d' % user_cnt) print('number of items: %d' % len(item_set)) @@ -93,7 +95,7 @@ def convert_kg(kg, entity_id2index): """Apply entity standarization to KG dataset Args: kg (pd.DataFrame): knowledge graph with columns ["original_entity_id", "relation", "linked_entities_id"] - entity_id2index (dictionary): dictionary, conversion from original entity ID to internal entity ID + entity_id2index (pd.DataFrame): dictionary, conversion from original entity ID to internal entity ID Returns: kg_final (pd.DataFrame): knowledge graph converted with columns head, From a1984dd875a9839983c88ef1cb3609a401dd8766 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 15 Dec 2019 11:44:51 +0100 Subject: [PATCH 33/75] added seed to random choice --- reco_utils/recommender/ripplenet/preprocess.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py index c13ce853d3..ec479c8ee9 100644 --- a/reco_utils/recommender/ripplenet/preprocess.py +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -27,7 +27,7 @@ def read_item_index_to_entity_id_file(item_to_entity): return item_index_old2new, entity_id2index -def convert_rating(ratings, item_index_old2new, threshold): +def convert_rating(ratings, item_index_old2new, threshold, seed): """Apply item standarization to ratings dataset. Use rating threshold to determite positive ratings @@ -80,11 +80,12 @@ def convert_rating(ratings, item_index_old2new, threshold): unwatched_set = item_set - pos_item_set if user_index_old in user_neg_ratings: unwatched_set -= user_neg_ratings[user_index_old] + np.random.seed(seed) for item in np.random.choice(list(unwatched_set), size=len(pos_item_set), replace=False): writer.append({"user_index": user_index, "item": item, "rating": 0, - "original_rating": 0}) + "original_rating": 0}) ratings_final = pd.DataFrame(writer) print('number of users: %d' % user_cnt) print('number of items: %d' % len(item_set)) From 5a0621b02a0706ab3ad1a6164a0bac3ba6c94c93 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 15 Dec 2019 11:47:02 +0100 Subject: [PATCH 34/75] remove negative ratings --- reco_utils/recommender/ripplenet/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py index ec479c8ee9..83fd537bf3 100644 --- a/reco_utils/recommender/ripplenet/preprocess.py +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -60,7 +60,7 @@ def convert_rating(ratings, item_index_old2new, threshold, seed): else: if user_index_old not in user_neg_ratings: user_neg_ratings[user_index_old] = set() - user_neg_ratings[user_index_old].add((item_index, rating)) + user_neg_ratings[user_index_old].add(item_index) print('converting rating file ...') writer = [] From ad6aa19ddae092875ce89b414209fcae075b8a54 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 15 Dec 2019 11:57:54 +0100 Subject: [PATCH 35/75] added scores for precision_at_k --- notebooks/02_model/rippleNet_deep_dive.ipynb | 100 +++++++++++++++++-- 1 file changed, 91 insertions(+), 9 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 43fe820a55..7fd807689e 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -30,7 +30,7 @@ "import tensorflow as tf\n", "import 
os\n", "import argparse \n", - "from reco_utils.evaluation.python_evaluation import auc\n", + "from reco_utils.evaluation.python_evaluation import auc, precision_at_k, recall_at_k\n", "from reco_utils.dataset import movielens\n", "from reco_utils.dataset.python_splitters import python_stratified_split\n", "\n", @@ -72,7 +72,9 @@ "lr = 0.02 #learning rate\n", "n_memory = 32 #size of ripple set for each hop\n", "item_update_mode = 'plus_transform' #how to update item at the end of each hop\n", - "using_all_hops = True #whether using outputs of all hops or just the last hop when making prediction" + "using_all_hops = True #whether using outputs of all hops or just the last hop when making prediction\n", + "#Evaluation parameters\n", + "k = 5" ] }, { @@ -309,11 +311,11 @@ "with tf.Session() as sess:\n", " model = fit(sess=sess, \n", " n_epoch=n_epoch, batch_size=batch_size,n_hop=n_hop,\n", - " model=ripple, train_data=train_data.to_numpy(), \n", + " model=ripple, train_data=train_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(), \n", " ripple_set=ripple_set, show_loss=show_loss)\n", " labels, scores = predict(sess=sess, \n", " batch_size=batch_size, n_hop=n_hop, \n", - " model=model, data=test_data.to_numpy(),\n", + " model=model, data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(),\n", " ripple_set=ripple_set)\n", "\n", "predictions = [1 if i >= 0.5 else 0 for i in scores]" @@ -325,9 +327,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.metrics import roc_auc_score\n", - "auc = roc_auc_score(y_true=labels, y_score=scores)\n", - "acc = np.mean(np.equal(predictions, labels))" + "test_data['scores'] = scores" ] }, { @@ -336,8 +336,90 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"AUC: \", auc)\n", - "print(\"ACC: \", acc)" + "auc_score = auc(test_data, test_data, \n", + " col_user=\"user_index\",\n", + " col_item=\"item\",\n", + " col_rating=\"rating\",\n", + " col_prediction=\"scores\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"The auc score is {}\".format(auc_score))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "acc_score = acc(test_data, test_data, \n", + " col_user=\"user_index\",\n", + " col_item=\"item\",\n", + " col_rating=\"rating\",\n", + " col_prediction=\"scores\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"The acc score is {}\".format(acc_score))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "precision_k_score = precision_at_k(test_data, test_data, \n", + " col_user=\"user_index\",\n", + " col_item=\"item\",\n", + " col_rating=\"original_rating\",\n", + " col_prediction=\"scores\",\n", + " relevancy_method=\"top_k\",\n", + " k=k)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"The precision_k_score score at k = {}, is {}\".format(k, precision_k_score))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "recall_k_score = recall_at_k(test_data, test_data, \n", + " col_user=\"user_index\",\n", + " col_item=\"item\",\n", + " col_rating=\"original_rating\",\n", + " col_prediction=\"scores\",\n", + " relevancy_method=\"top_k\",\n", + " k=k)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "print(\"The recall_k_score score at k = {}, is {}\".format(k, recall_k_score))" ] } ], From 6eb10bee3670434336bea2051774b72954b739f8 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 15 Dec 2019 11:58:24 +0100 Subject: [PATCH 36/75] import acc evaluation --- notebooks/02_model/rippleNet_deep_dive.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 7fd807689e..a6ab93f15c 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -30,7 +30,7 @@ "import tensorflow as tf\n", "import os\n", "import argparse \n", - "from reco_utils.evaluation.python_evaluation import auc, precision_at_k, recall_at_k\n", + "from reco_utils.evaluation.python_evaluation import auc, acc, precision_at_k, recall_at_k\n", "from reco_utils.dataset import movielens\n", "from reco_utils.dataset.python_splitters import python_stratified_split\n", "\n", From 1ec39090afbec810e853b33e8a37e78f87ae2c4b Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 15 Dec 2019 12:14:25 +0100 Subject: [PATCH 37/75] accuracy score and seed to function --- notebooks/02_model/rippleNet_deep_dive.ipynb | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index a6ab93f15c..a12c8a5942 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -210,7 +210,8 @@ } ], "source": [ - "ratings_final = convert_rating(ratings, item_index_old2new = item_index_old2new, threshold = rating_threshold)" + "ratings_final = convert_rating(ratings, item_index_old2new = item_index_old2new,\n", + " threshold = rating_threshold, seed = 12)" ] }, { @@ -340,7 +341,7 @@ " col_user=\"user_index\",\n", " col_item=\"item\",\n", " col_rating=\"rating\",\n", - " col_prediction=\"scores\")\n" + " col_prediction=\"scores\")" ] }, { @@ -358,11 +359,7 @@ "metadata": {}, "outputs": [], "source": [ - "acc_score = acc(test_data, test_data, \n", - " col_user=\"user_index\",\n", - " col_item=\"item\",\n", - " col_rating=\"rating\",\n", - " col_prediction=\"scores\")" + "acc_score = np.mean(np.equal(predictions, labels))" ] }, { From 287b46d014ca2dd88e87129042fb5ac95bce526f Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 15 Dec 2019 12:32:53 +0100 Subject: [PATCH 38/75] bug fix --- notebooks/02_model/rippleNet_deep_dive.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index a12c8a5942..9022b48f1f 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -249,7 +249,7 @@ "outputs": [], "source": [ "train_data, test_eval_data = python_stratified_split(ratings_final, ratio=0.6, col_user='user_index', col_item='item', seed=12)\n", - "test_data, eval_data = python_stratified_split(ratings_final, ratio=0.5, col_user='user_index', col_item='item', seed=12)" + "test_data, eval_data = python_stratified_split(test_eval_data, ratio=0.5, col_user='user_index', col_item='item', seed=12)" ] }, { From b0b6ac52b1a35eded3e8d5f8ca6edec6543f5180 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 15 Dec 2019 12:35:58 +0100 Subject: [PATCH 39/75] removed unexisting import --- notebooks/02_model/rippleNet_deep_dive.ipynb | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 9022b48f1f..1e4e722fd8 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -30,7 +30,7 @@ "import tensorflow as tf\n", "import os\n", "import argparse \n", - "from reco_utils.evaluation.python_evaluation import auc, acc, precision_at_k, recall_at_k\n", + "from reco_utils.evaluation.python_evaluation import auc, precision_at_k, recall_at_k\n", "from reco_utils.dataset import movielens\n", "from reco_utils.dataset.python_splitters import python_stratified_split\n", "\n", From b485561b19edef1e342570d9d04d21ce6e0080bd Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 21 Dec 2019 18:20:10 +0100 Subject: [PATCH 40/75] change model structure to make fit and predict internal methods --- notebooks/02_model/rippleNet_deep_dive.ipynb | 42 ++++----- reco_utils/recommender/ripplenet/model.py | 72 ++++++++++++++- reco_utils/recommender/ripplenet/train.py | 96 -------------------- 3 files changed, 88 insertions(+), 122 deletions(-) delete mode 100644 reco_utils/recommender/ripplenet/train.py diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 1e4e722fd8..45bbdb9b89 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -42,8 +42,6 @@ " load_kg, \n", " get_ripple_set)\n", "\n", - "from reco_utils.recommender.ripplenet.train import (fit, predict)\n", - "\n", "from reco_utils.recommender.ripplenet.model import RippleNet\n", "\n", "print(\"System version: {}\".format(sys.version))\n", @@ -52,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": { "tags": [ "parameters" @@ -108,14 +106,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.04kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:04<00:00, 1.05kKB/s]\n" ] } ], @@ -131,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -143,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -157,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -170,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -187,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -196,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -216,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 10, "metadata": { "scrolled": true }, @@ -244,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -254,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -263,7 +261,7 @@ }, { "cell_type": "code", - 
"execution_count": 33, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -271,7 +269,6 @@ "output_type": "stream", "text": [ "reading KG file ...\n", - "constructing knowledge graph ...\n", "constructing ripple set ...\n" ] } @@ -290,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -310,13 +307,14 @@ " n_entity=n_entity,n_relation=n_relation)\n", "\n", "with tf.Session() as sess:\n", - " model = fit(sess=sess, \n", + " ripple.fit(sess=sess, \n", " n_epoch=n_epoch, batch_size=batch_size,n_hop=n_hop,\n", - " model=ripple, train_data=train_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(), \n", + " train_data=train_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(), \n", " ripple_set=ripple_set, show_loss=show_loss)\n", - " labels, scores = predict(sess=sess, \n", + " \n", + " labels, scores = ripple.predict(sess=sess, \n", " batch_size=batch_size, n_hop=n_hop, \n", - " model=model, data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(),\n", + " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(),\n", " ripple_set=ripple_set)\n", "\n", "predictions = [1 if i >= 0.5 else 0 for i in scores]" diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 5d7299c424..a026754ff4 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -12,7 +12,7 @@ def __init__(self, dim, n_hop, kge_weight, l2_weight, lr, n_memory, item_update_mode, using_all_hops, n_entity, n_relation) self._build_inputs() self._build_embeddings() - self._build_model() + self._build_self() self._build_loss() self._build_train() @@ -53,7 +53,7 @@ def _build_embeddings(self): shape=[self.n_relation, self.dim, self.dim], initializer=tf.contrib.layers.xavier_initializer()) - def _build_model(self): + def _build_self(self): # transformation matrix for updating item embeddings at the end of each hop self.transform_matrix = tf.get_variable(name="transform_matrix", shape=[self.dim, self.dim], dtype=tf.float64, initializer=tf.contrib.layers.xavier_initializer()) @@ -76,7 +76,7 @@ def _build_model(self): o_list = self._key_addressing() - self.scores = tf.squeeze(self.predict(self.item_embeddings, o_list)) + self.scores = tf.squeeze(self.predict_scores(self.item_embeddings, o_list)) self.scores_normalized = tf.sigmoid(self.scores) def _key_addressing(self): @@ -120,7 +120,7 @@ def update_item_embedding(self, item_embeddings, o): raise Exception("Unknown item updating mode: " + self.item_update_mode) return item_embeddings - def predict(self, item_embeddings, o_list): + def predict_scores(self, item_embeddings, o_list): y = o_list[-1] if self.using_all_hops: for i in range(self.n_hop - 1): @@ -175,3 +175,67 @@ def eval(self, sess, feed_dict): predictions = [1 if i >= 0.5 else 0 for i in scores] acc = np.mean(np.equal(predictions, labels)) return auc, acc + + def get_feed_dict(self, n_hop, data, ripple_set, start, end): + feed_dict = dict() + feed_dict[self.items] = data[start:end, 1] + feed_dict[self.labels] = data[start:end, 2] + for i in range(n_hop): + feed_dict[self.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] + feed_dict[self.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] + feed_dict[self.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] + return feed_dict + + def print_metrics_evaluation(self, sess, n_hop, data, ripple_set, batch_size): + 
start = 0 + auc_list = [] + acc_list = [] + while start < data.shape[0]: + auc, acc = self.eval(sess, self.get_feed_dict(n_hop, + data, ripple_set, + start, start + batch_size)) + auc_list.append(auc) + acc_list.append(acc) + start += batch_size + return float(np.mean(auc_list)), float(np.mean(acc_list)) + + def fit(self, + sess, + n_epoch, batch_size, n_hop, + train_data, ripple_set, show_loss): + sess.run(tf.global_variables_initializer()) + for step in range(n_epoch): + # training + np.random.shuffle(train_data) + start = 0 + while start < train_data.shape[0]: + _, loss = self.train(sess, + self.get_feed_dict(n_hop, + train_data, ripple_set, + start, start + batch_size)) + start += batch_size + if show_loss: + print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) + + train_auc, train_acc = self.print_metrics_evaluation(sess, n_hop, + train_data, ripple_set, + batch_size) + + print('epoch %d train auc: %.4f acc: %.4f' + % (step, train_auc, train_acc)) + + def predict(self, sess, + batch_size, n_hop, data, ripple_set): + start = 0 + labels_list = [] + scores_list = [] + while start < data.shape[0]: + labels, scores = self.return_scores(sess, + self.get_feed_dict(n_hop, + data, ripple_set, + start, start + batch_size)) + labels_list.append(labels) + scores_list.append(scores) + start += batch_size + + return list(np.concatenate(labels_list)), list(np.concatenate(scores_list)) diff --git a/reco_utils/recommender/ripplenet/train.py b/reco_utils/recommender/ripplenet/train.py deleted file mode 100644 index b020cac240..0000000000 --- a/reco_utils/recommender/ripplenet/train.py +++ /dev/null @@ -1,96 +0,0 @@ -# This code is modified from RippleNet -# Online code of RippleNet: https://github.com/hwwang55/RippleNet - -import tensorflow as tf -import numpy as np -from reco_utils.recommender.ripplenet.model import RippleNet - -def get_feed_dict(n_hop, model, data, ripple_set, start, end): - feed_dict = dict() - feed_dict[model.items] = data[start:end, 1] - feed_dict[model.labels] = data[start:end, 2] - for i in range(n_hop): - feed_dict[model.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] - feed_dict[model.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] - feed_dict[model.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] - return feed_dict - -def fit(sess, - n_epoch, batch_size,n_hop, - model, train_data, ripple_set, show_loss): - sess.run(tf.global_variables_initializer()) - for step in range(n_epoch): - # training - np.random.shuffle(train_data) - start = 0 - while start < train_data.shape[0]: - _, loss = model.train( - sess, get_feed_dict(n_hop, model, train_data, ripple_set, start, start + batch_size)) - start += batch_size - if show_loss: - print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) - - train_auc, train_acc = evaluation(sess, n_hop, model, train_data, ripple_set, batch_size) - - print('epoch %d train auc: %.4f acc: %.4f' - % (step, train_auc, train_acc)) - return model - -def predict(sess, batch_size, n_hop, model, data, ripple_set): - start = 0 - labels_list = [] - scores_list = [] - while start < data.shape[0]: - labels, scores = model.return_scores(sess, get_feed_dict(n_hop, model, data, ripple_set, start, start + batch_size)) - labels_list.append(labels) - scores_list.append(scores) - start += batch_size - - return list(np.concatenate(labels_list)), list(np.concatenate(scores_list)) - -def evaluation(sess, n_hop, model, data, ripple_set, batch_size): - start = 0 - auc_list = [] - 
acc_list = [] - while start < data.shape[0]: - auc, acc = model.eval(sess, get_feed_dict(n_hop, model, data, ripple_set, start, start + batch_size)) - auc_list.append(auc) - acc_list.append(acc) - start += batch_size - return float(np.mean(auc_list)), float(np.mean(acc_list)) - -def train(n_epoch, batch_size, - dim, n_hop, kge_weight, l2_weight, lr, - n_memory, item_update_mode, using_all_hops, - data_info, show_loss): - - train_data = data_info[0] - eval_data = data_info[1] - test_data = data_info[2] - n_entity = data_info[3] - n_relation = data_info[4] - ripple_set = data_info[5] - - model = RippleNet(dim, n_hop, kge_weight, l2_weight, lr, - n_memory, item_update_mode, using_all_hops, n_entity, n_relation) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - for step in range(n_epoch): - # training - np.random.shuffle(train_data) - start = 0 - while start < train_data.shape[0]: - _, loss = model.train( - sess, get_feed_dict(n_hop, model, train_data, ripple_set, start, start + batch_size)) - start += batch_size - if show_loss: - print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) - - # evaluation - train_auc, train_acc = evaluation(sess, n_hop, model, train_data, ripple_set, batch_size) - eval_auc, eval_acc = evaluation(sess, n_hop, model, eval_data, ripple_set, batch_size) - test_auc, test_acc = evaluation(sess, n_hop, model, test_data, ripple_set, batch_size) - - print('epoch %d train auc: %.4f acc: %.4f eval auc: %.4f acc: %.4f test auc: %.4f acc: %.4f' - % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc)) \ No newline at end of file From 724689e3b09c0629ec4ffd29e17a4187326f2978 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 21 Dec 2019 18:28:02 +0100 Subject: [PATCH 41/75] function name change build_self to build_model --- reco_utils/recommender/ripplenet/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index a026754ff4..7298b7cfad 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -12,7 +12,7 @@ def __init__(self, dim, n_hop, kge_weight, l2_weight, lr, n_memory, item_update_mode, using_all_hops, n_entity, n_relation) self._build_inputs() self._build_embeddings() - self._build_self() + self._build_model() self._build_loss() self._build_train() @@ -53,7 +53,7 @@ def _build_embeddings(self): shape=[self.n_relation, self.dim, self.dim], initializer=tf.contrib.layers.xavier_initializer()) - def _build_self(self): + def _build_model(self): # transformation matrix for updating item embeddings at the end of each hop self.transform_matrix = tf.get_variable(name="transform_matrix", shape=[self.dim, self.dim], dtype=tf.float64, initializer=tf.contrib.layers.xavier_initializer()) From e290b26d56151033b079025aa6cdc5052719e064 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 28 Dec 2019 12:32:39 +0100 Subject: [PATCH 42/75] clarified param for rippleset --- reco_utils/recommender/ripplenet/data_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reco_utils/recommender/ripplenet/data_loader.py b/reco_utils/recommender/ripplenet/data_loader.py index fb9b988f20..a8dd98234f 100644 --- a/reco_utils/recommender/ripplenet/data_loader.py +++ b/reco_utils/recommender/ripplenet/data_loader.py @@ -30,13 +30,13 @@ def load_kg(kg_final): return n_entity, n_relation, kg -def get_ripple_set(kg, user_history_dict, n_hop, n_memory): +def 
get_ripple_set(kg, user_history_dict, n_hop=2, n_memory=36): """Build Ripple Set, dictionary for the related entities in the KG given the paths of users, number of hops and memory Args: kg (dictionary): KG in dictionary shape - user_history_dict (dictionary): train rating data with positive ratings + user_history_dict (dictionary): positive ratings from train data, to build ripple structure n_hop (int): int, maximum hops in the KG n_memory (int): int, size of ripple set for each hop From c2dfad88c3a53aa11ea7b8a10811281a95cdcdf7 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 28 Dec 2019 12:35:26 +0100 Subject: [PATCH 43/75] improved notebook descriptions --- notebooks/02_model/rippleNet_deep_dive.ipynb | 545 +++++++++++++++++-- 1 file changed, 494 insertions(+), 51 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 45bbdb9b89..241af100b0 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -4,12 +4,58 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# RippleNet" + "# RippleNet on MovieLens using Wikidata (Python, GPU)¶" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we will walk through each step of the RippleNet algorithm.\n", + "RippleNet is an end-to-end framework that naturally incorporates the knowledge graphs into recommender systems.\n", + "To make the results of the paper reproducible we have used MovieLens as our dataset and Wikidata as our Knowledge Graph.\n", + "\n", + "> RippleNet: Propagating User Preferences on the Knowledge Graph for Recommender Systems\n", + "> Hongwei Wang, Fuzheng Zhang, Jialin Wang, Miao Zhao, Wenjie Li, Xing Xie, Minyi Guo\n", + "> The 27th ACM International Conference on Information and Knowledge Management (CIKM 2018)\n", + "\n", + "Online code of RippleNet: https://github.com/hwwang55/RippleNet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To address the sparsity and cold start problem of collaborative filtering, researchers usually make use of side information, such as social networks or item attributes, to improve recommendation performance. This paper considers the knowledge graph as the source of side information. To address the limitations of existing embedding-based and path-based methods for knowledge-graph-aware recommendation, we propose RippleNet, an end-to-end framework that naturally incorporates the knowledge graph into recommender systems. Similar to actual ripples propagating on the water, RippleNet stimulates the propagation of user preferences over the set of knowledge entities by automatically and iteratively extending a user’s potential interests along links in the knowledge graph. The multiple \"ripples\" activated by a user’s historically clicked items are thus superposed to form the preference distribution of the user with respect to a candidate item, which could be used for predicting the final clicking probability. 
Through extensive experiments on real-world datasets, we demonstrate that RippleNet achieves substantial gains in a variety of scenarios, including movie, book and news recommendation, over several state-of-the-art baselines.\n", + "\n", + "![alt text](https://github.com/hwwang55/RippleNet/raw/master/framework.jpg)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Implementation\n", + "Details of the python implementation can be found [here](https://github.com/microsoft/recommenders/tree/rippleNet/reco_utils/recommender/ripplenet). The implementation is based on the original code of RippleNet: https://github.com/hwwang55/RippleNet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RippleNet Movie Recommender" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -50,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 44, "metadata": { "tags": [ "parameters" @@ -60,6 +106,7 @@ "source": [ "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", "MOVIELENS_DATA_SIZE = '100k'\n", + "rating_threshold = 4 #Minimum rating of a movie to be considered positive\n", "# Ripple parameters\n", "n_epoch = 10 #the number of epochs\n", "batch_size = 1024 #batch size\n", @@ -69,67 +116,223 @@ "l2_weight = 1e-7 #weight of the l2 regularization term\n", "lr = 0.02 #learning rate\n", "n_memory = 32 #size of ripple set for each hop\n", - "item_update_mode = 'plus_transform' #how to update item at the end of each hop\n", + "item_update_mode = 'plus_transform' #how to update item at the end of each hop. \n", + " #possible options are replace, plus, plus_transform or replace transform\n", "using_all_hops = True #whether using outputs of all hops or just the last hop when making prediction\n", "#Evaluation parameters\n", - "k = 5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction" + "TOP_K = 10" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "> RippleNet: Propagating User Preferences on the Knowledge Graph for Recommender Systems\n", - "> Hongwei Wang, Fuzheng Zhang, Jialin Wang, Miao Zhao, Wenjie Li, Xing Xie, Minyi Guo\n", - "> The 27th ACM International Conference on Information and Knowledge Management (CIKM 2018)\n", - "\n", - "Online code of RippleNet: https://github.com/hwwang55/RippleNet\n", - "\n", - "To address the sparsity and cold start problem of collaborative filtering, researchers usually make use of side information, such as social networks or item attributes, to improve recommendation performance. This paper considers the knowledge graph as the source of side information. To address the limitations of existing embedding-based and path-based methods for knowledge-graph-aware recommendation, we propose RippleNet, an end-to-end framework that naturally incorporates the knowledge graph into recommender systems. Similar to actual ripples propagating on the water, RippleNet stimulates the propagation of user preferences over the set of knowledge entities by automatically and iteratively extending a user’s potential interests along links in the knowledge graph. The multiple \"ripples\" activated by a user’s historically clicked items are thus superposed to form the preference distribution of the user with respect to a candidate item, which could be used for predicting the final clicking probability. 
Through extensive experiments on real-world datasets, we demonstrate that RippleNet achieves substantial gains in a variety of scenarios, including movie, book and news recommendation, over several state-of-the-art baselines.\n", - "\n", - "![alt text](https://github.com/hwwang55/RippleNet/raw/master/framework.jpg)\n" + "## Read original data and transform entity ids to numerical" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Read original data and transform entity ids to numerical" + "RippleNet is built on:\n", + "- Ratings from users on Movies\n", + "- Knowledge Graph (KG) linking Movies to their connected entities in Wikidata. See [this notebook](https://github.com/microsoft/recommenders/blob/master/notebooks/01_prepare_data/wikidata_knowledge_graph.ipynb)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:04<00:00, 1.05kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:05<00:00, 812KB/s] \n" ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " UserId ItemId Rating Timestamp Title Genres Year\n", + "0 196 242 3.0 881250949 Kolya (1996) Comedy 1996\n", + "1 63 242 3.0 875747190 Kolya (1996) Comedy 1996\n", + "2 226 242 5.0 883888671 Kolya (1996) Comedy 1996" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "kg_original = pd.read_csv(\"https://recodatasets.blob.core.windows.net/wikidata/movielens_{}_wikidata.csv\".format(MOVIELENS_DATA_SIZE))\n", "ratings_original = movielens.load_pandas_df(MOVIELENS_DATA_SIZE,\n", " ('UserId', 'ItemId', 'Rating', 'Timestamp'),\n", " title_col='Title',\n", " genres_col='Genres',\n", " year_col='Year')\n", - "rating_threshold = 4" + "ratings_original.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " original_entity linked_entities name_linked_entities movielens_title \\\n", + "0 Q1141186 Q130232 drama film Kolya (1996) \n", + "1 Q1141186 Q157443 comedy film Kolya (1996) \n", + "2 Q1141186 Q10819887 Andrei Chalimon Kolya (1996) \n", + "\n", + " movielens_id \n", + "0 242 \n", + "1 242 \n", + "2 242 " + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kg_original = pd.read_csv(\"https://recodatasets.blob.core.windows.net/wikidata/movielens_{}_wikidata.csv\".format(MOVIELENS_DATA_SIZE))\n", + "kg_original.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To be able to link the Ratings and KG ids we create two dictionaries match the KG original IDs to homogeneous numerical IDs. This will be done in two steps:\n", + "1. Transforming both Rating ID and KG ID to numerical\n", + "2. Matching the IDs using a dictionary" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -141,34 +344,222 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 48, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " unified_id entity\n", + "0 0 Q106871\n", + "1 1 Q1167668\n", + "2 2 Q466186" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "var_id = \"movielens_id\"\n", + "# Create Dictionary that matches KG Wikidata ID to internal numerical KG ID\n", "entities_id = pd.DataFrame({\"entity\":list(set(kg_original.original_entity)) + list(set(kg_original.linked_entities))}).reset_index()\n", "entities_id = entities_id.rename(columns = {\"index\": \"unified_id\"})\n", - "\n", - "item_to_entity = kg_original[[var_id, \"original_entity\"]].drop_duplicates().reset_index().drop(columns = \"index\")\n", - "item_to_entity = transform_id(item_to_entity, entities_id, \"original_entity\")" + "entities_id.head(3)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 49, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " original_entity_id relation linked_entities_id\n", + "0 1336 1 16764\n", + "1 5908 1 16764\n", + "2 376 1 16764" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "# Tranforming KG IDs to internal numerical KG IDs created above \n", "kg = kg_original[[\"original_entity\", \"linked_entities\"]].drop_duplicates()\n", "kg = transform_id(kg, entities_id, \"original_entity\", \"original_entity_id\")\n", "kg = transform_id(kg, entities_id, \"linked_entities\", \"linked_entities_id\")\n", "kg[\"relation\"] = 1\n", - "kg_wikidata = kg[[\"original_entity_id\",\"relation\", \"linked_entities_id\"]]" + "kg_wikidata = kg[[\"original_entity_id\",\"relation\", \"linked_entities_id\"]]\n", + "kg_wikidata.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " movielens_id unified_id\n", + "0 242 1336\n", + "1 242 5908\n", + "2 302 376" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create Dictionary matching Movielens ID to internal numerical KG ID created above\n", + "var_id = \"movielens_id\"\n", + "item_to_entity = kg_original[[var_id, \"original_entity\"]].drop_duplicates().reset_index().drop(columns = \"index\")\n", + "item_to_entity = transform_id(item_to_entity, entities_id, \"original_entity\")\n", + "item_to_entity.head(3)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -183,18 +574,26 @@ "## Preprocess module from RippleNet" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " The dictionaries created above will be used on the Ratings and KG dataframes and unify their IDs. Also the Ratings will be converted from a numerical rating (1-5) to a binary rating (0-1) using the rating_threshold" + ] + }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ + "# Use dictionary Movielens ID - numerical KG ID to extract two dictionaries to be used on Ratings and KG\n", "item_index_old2new, entity_id2index = read_item_index_to_entity_id_file(item_to_entity)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -214,7 +613,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 54, "metadata": { "scrolled": true }, @@ -237,44 +636,81 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Split Data" + "## Split Data and Build RippleSet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data is divided into train, test and evaluation" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ - "train_data, test_eval_data = python_stratified_split(ratings_final, ratio=0.6, col_user='user_index', col_item='item', seed=12)\n", - "test_data, eval_data = python_stratified_split(test_eval_data, ratio=0.5, col_user='user_index', col_item='item', seed=12)" + "train_data, test_data, eval_data = python_stratified_split(ratings_final, ratio=[0.6, 0.2, 0.2], col_user='user_index', col_item='item', seed=12)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The original KG dataframe is transformed into a dictionary, and the number of entities and retaltions extracted as parameters" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 56, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reading KG file ...\n" + ] + } + ], "source": [ - "user_history_dict = train_data.loc[train_data.rating == 1].groupby('user_index')['item'].apply(list).to_dict()" + "n_entity, n_relation, kg = load_kg(kg_final)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The rippleset dictionary is built on the positive ratings (relevant entities) of the training data, and using the KG to build set of knowledge triples per user positive rating, from 0 until n_hop.\n", + "\n", + "**Relevant entity**: Given interaction matrix Y and knowledge graph G, the set of k-hop relevant entities for user u is defined as\n", + "\n", + "$$E^{k}_{u} = \\{t\\ |\\ (h,r,t) ∈ G\\ and\\ h ∈ E^{k−1}_{u}\\}, k=1,2,...,H$$\n", + "\n", + "Where $E_{u} = 
V_{u} = \\{v|yuv =1\\}$ is the set of user’s clicked items in the past, which can be seen as the seed set of user $u$ in KG\n", + "\n", + "**RippleSet**: The k-hop rippleset of user $u$ is defined as the set of knowledge triples starting from $E_{k−1}$:\n", + "\n", + "$$S^{k}_{u} = \\{(h,r,t)\\ |\\ (h,r,t) ∈ G\\ and\\ h ∈ E^{k−1}_{u}\\}, k = 1,2,...,H$$" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "reading KG file ...\n", "constructing ripple set ...\n" ] } ], "source": [ - "n_entity, n_relation, kg = load_kg(kg_final)\n", + "user_history_dict = train_data.loc[train_data.rating == 1].groupby('user_index')['item'].apply(list).to_dict()\n", "ripple_set = get_ripple_set(kg, user_history_dict, n_hop=n_hop, n_memory=n_memory)" ] }, @@ -282,7 +718,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Build model" + "## Build model and predict" ] }, { @@ -320,6 +756,13 @@ "predictions = [1 if i >= 0.5 else 0 for i in scores]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Results and Evaluation" + ] + }, { "cell_type": "code", "execution_count": null, From 100352ff978b1e18f2a860ed49a8a166dc2f11e9 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 28 Dec 2019 12:38:07 +0100 Subject: [PATCH 44/75] description for image --- notebooks/02_model/rippleNet_deep_dive.ipynb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 241af100b0..a8e2bdba0b 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -35,7 +35,9 @@ "source": [ "To address the sparsity and cold start problem of collaborative filtering, researchers usually make use of side information, such as social networks or item attributes, to improve recommendation performance. This paper considers the knowledge graph as the source of side information. To address the limitations of existing embedding-based and path-based methods for knowledge-graph-aware recommendation, we propose RippleNet, an end-to-end framework that naturally incorporates the knowledge graph into recommender systems. Similar to actual ripples propagating on the water, RippleNet stimulates the propagation of user preferences over the set of knowledge entities by automatically and iteratively extending a user’s potential interests along links in the knowledge graph. The multiple \"ripples\" activated by a user’s historically clicked items are thus superposed to form the preference distribution of the user with respect to a candidate item, which could be used for predicting the final clicking probability. Through extensive experiments on real-world datasets, we demonstrate that RippleNet achieves substantial gains in a variety of scenarios, including movie, book and news recommendation, over several state-of-the-art baselines.\n", "\n", - "![alt text](https://github.com/hwwang55/RippleNet/raw/master/framework.jpg)\n" + "![alt text](https://github.com/hwwang55/RippleNet/raw/master/framework.jpg)\n", + "\n", + "The overall framework of the RippleNet. It takes one user and one item as input, and outputs the predicted probability that the user will click the item. The KGs in the upper part illustrate the corresponding ripple sets activated by the user’s click history." 
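Condensed to its essentials, the forward pass the figure describes fits in a short NumPy sketch. This is a paraphrase of `_key_addressing` and `predict_scores` from model.py under simplifying assumptions (the candidate item embedding is held fixed across hops, so `item_update_mode` handling is omitted, and `using_all_hops=True` is hard-coded); it is not the shipped TensorFlow graph, and the function name is hypothetical.

import numpy as np

def ripplenet_forward_sketch(v, hops):
    # v: (dim,) embedding of the candidate item.
    # hops: list of (heads, relations, tails) per hop, shaped
    #   (n_memory, dim), (n_memory, dim, dim), (n_memory, dim),
    #   i.e. one ripple set per hop as produced by get_ripple_set.
    o_list = []
    for h, R, t in hops:
        Rh = np.einsum("mij,mj->mi", R, h)   # rotate each head by its relation
        logits = Rh @ v                      # v^T R h: one score per memory slot
        p = np.exp(logits - logits.max())
        p /= p.sum()                         # softmax attention over the ripple set
        o_list.append(p @ t)                 # hop output o^k: weighted sum of tails
    y = sum(o_list[:-1], o_list[-1])         # combine every hop (using_all_hops=True)
    return 1.0 / (1.0 + np.exp(-(v @ y)))    # sigmoid(v^T y): click probability

With the notebook's defaults (`n_hop=2`, `n_memory=32`), `hops` holds two ripple sets of 32 knowledge triples per user, which is exactly the per-user structure `get_ripple_set` builds.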
] }, { From adf198064fd77608d01f655985f8bcb9973a525e Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 11 Jan 2020 19:23:33 +0100 Subject: [PATCH 45/75] adapted to logger --- reco_utils/recommender/ripplenet/data_loader.py | 7 +++++-- reco_utils/recommender/ripplenet/preprocess.py | 16 ++++++++++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/reco_utils/recommender/ripplenet/data_loader.py b/reco_utils/recommender/ripplenet/data_loader.py index a8dd98234f..1cdd94ceb9 100644 --- a/reco_utils/recommender/ripplenet/data_loader.py +++ b/reco_utils/recommender/ripplenet/data_loader.py @@ -4,7 +4,10 @@ import collections import os import numpy as np +import logging +logging.basicConfig(level=logging.INFO) +log = logging.getLogger(__name__) def load_kg(kg_final): """Standarize indexes for items and entities @@ -18,7 +21,7 @@ def load_kg(kg_final): n_relation (int): number of relations in KG kg (dictionary): KG in dictionary shape """ - print('reading KG file ...') + log.info('reading KG file ...') n_entity = len(set(kg_final.iloc[:, 0]) | set(kg_final.iloc[:, 2])) n_relation = len(set(kg_final.iloc[:, 1])) @@ -43,7 +46,7 @@ def get_ripple_set(kg, user_history_dict, n_hop=2, n_memory=36): Returns: ripple_set (dictionary): set of knowledge triples per user positive rating, from 0 until n_hop """ - print('constructing ripple set ...') + log.info('constructing ripple set ...') # user -> [(hop_0_heads, hop_0_relations, hop_0_tails), (hop_1_heads, hop_1_relations, hop_1_tails), ...] ripple_set = collections.defaultdict(list) diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py index 83fd537bf3..990c89140f 100644 --- a/reco_utils/recommender/ripplenet/preprocess.py +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -4,6 +4,10 @@ import argparse import numpy as np import pandas as pd +import logging + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger(__name__) def read_item_index_to_entity_id_file(item_to_entity): """Standarize indexes for items and entities @@ -62,7 +66,7 @@ def convert_rating(ratings, item_index_old2new, threshold, seed): user_neg_ratings[user_index_old] = set() user_neg_ratings[user_index_old].add(item_index) - print('converting rating file ...') + log.info('converting rating file ...') writer = [] user_cnt = 0 user_index_old2new = dict() @@ -87,8 +91,8 @@ def convert_rating(ratings, item_index_old2new, threshold, seed): "rating": 0, "original_rating": 0}) ratings_final = pd.DataFrame(writer) - print('number of users: %d' % user_cnt) - print('number of items: %d' % len(item_set)) + log.info('number of users: %d' % user_cnt) + log.info('number of items: %d' % len(item_set)) return ratings_final @@ -102,7 +106,7 @@ def convert_kg(kg, entity_id2index): kg_final (pd.DataFrame): knowledge graph converted with columns head, relation and tail, with internal entity IDs """ - print('converting kg file ...') + log.info('converting kg file ...') entity_cnt = len(entity_id2index) relation_cnt = 0 relation_id2index = dict() @@ -134,6 +138,6 @@ def convert_kg(kg, entity_id2index): "tail": tail}) kg_final = pd.DataFrame(writer) - print('number of entities (containing items): %d' % entity_cnt) - print('number of relations: %d' % relation_cnt) + log.info('number of entities (containing items): %d' % entity_cnt) + log.info('number of relations: %d' % relation_cnt) return kg_final From ea4560be70b4185965f922a284b4410b11ce84a0 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 11 Jan 2020 19:24:20 
+0100 Subject: [PATCH 46/75] new param optimizer_method, removed tf.Session --- notebooks/02_model/rippleNet_deep_dive.ipynb | 156 +++++++++++-------- 1 file changed, 90 insertions(+), 66 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index a8e2bdba0b..cea17de2a2 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 2, "metadata": { "tags": [ "parameters" @@ -121,6 +121,8 @@ "item_update_mode = 'plus_transform' #how to update item at the end of each hop. \n", " #possible options are replace, plus, plus_transform or replace transform\n", "using_all_hops = True #whether using outputs of all hops or just the last hop when making prediction\n", + "optimizer_method = \"adam\" #optimizer method from adam, adadelta, adagrad, ftrl (FtrlOptimizer),\n", + " #gd (GradientDescentOptimizer), rmsprop (RMSPropOptimizer)\n", "#Evaluation parameters\n", "TOP_K = 10" ] @@ -143,14 +145,14 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:05<00:00, 812KB/s] \n" + "100%|██████████| 4.81k/4.81k [00:05<00:00, 867KB/s] \n" ] }, { @@ -225,7 +227,7 @@ "2 226 242 5.0 883888671 Kolya (1996) Comedy 1996" ] }, - "execution_count": 45, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -241,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -313,7 +315,7 @@ "2 242 " ] }, - "execution_count": 46, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -334,7 +336,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -346,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -378,17 +380,17 @@ " \n", " 0\n", " 0\n", - " Q106871\n", + " Q163038\n", " \n", " \n", " 1\n", " 1\n", - " Q1167668\n", + " Q7605252\n", " \n", " \n", " 2\n", " 2\n", - " Q466186\n", + " Q1413403\n", " \n", " \n", "\n", @@ -396,12 +398,12 @@ ], "text/plain": [ " unified_id entity\n", - "0 0 Q106871\n", - "1 1 Q1167668\n", - "2 2 Q466186" + "0 0 Q163038\n", + "1 1 Q7605252\n", + "2 2 Q1413403" ] }, - "execution_count": 48, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -415,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -447,21 +449,21 @@ " \n", " \n", " 0\n", - " 1336\n", + " 698\n", " 1\n", - " 16764\n", + " 15010\n", " \n", " \n", " 1\n", - " 5908\n", + " 19885\n", " 1\n", - " 16764\n", + " 15010\n", " \n", " \n", " 2\n", - " 376\n", + " 447\n", " 1\n", - " 16764\n", + " 15010\n", " \n", " \n", "\n", @@ -469,12 +471,12 @@ ], "text/plain": [ " original_entity_id relation linked_entities_id\n", - "0 1336 1 16764\n", - "1 5908 1 16764\n", - "2 376 1 16764" + "0 698 1 15010\n", + "1 19885 1 15010\n", + "2 447 1 15010" ] }, - "execution_count": 49, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -491,7 +493,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 8, 
"metadata": {}, "outputs": [ { @@ -523,17 +525,17 @@ " \n", " 0\n", " 242\n", - " 1336\n", + " 698\n", " \n", " \n", " 1\n", " 242\n", - " 5908\n", + " 19885\n", " \n", " \n", " 2\n", " 302\n", - " 376\n", + " 447\n", " \n", " \n", "\n", @@ -541,12 +543,12 @@ ], "text/plain": [ " movielens_id unified_id\n", - "0 242 1336\n", - "1 242 5908\n", - "2 302 376" + "0 242 698\n", + "1 242 19885\n", + "2 302 447" ] }, - "execution_count": 50, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -561,7 +563,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -585,7 +587,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -595,16 +597,16 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 11, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "converting rating file ...\n", - "number of users: 942\n", - "number of items: 1677\n" + "INFO:reco_utils.recommender.ripplenet.preprocess:converting rating file ...\n", + "INFO:reco_utils.recommender.ripplenet.preprocess:number of users: 942\n", + "INFO:reco_utils.recommender.ripplenet.preprocess:number of items: 1677\n" ] } ], @@ -615,18 +617,18 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 12, "metadata": { "scrolled": true }, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "converting kg file ...\n", - "number of entities (containing items): 22994\n", - "number of relations: 1\n" + "INFO:reco_utils.recommender.ripplenet.preprocess:converting kg file ...\n", + "INFO:reco_utils.recommender.ripplenet.preprocess:number of entities (containing items): 22994\n", + "INFO:reco_utils.recommender.ripplenet.preprocess:number of relations: 1\n" ] } ], @@ -650,7 +652,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -666,14 +668,14 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 14, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "reading KG file ...\n" + "INFO:reco_utils.recommender.ripplenet.data_loader:reading KG file ...\n" ] } ], @@ -700,14 +702,14 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 15, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "constructing ripple set ...\n" + "INFO:reco_utils.recommender.ripplenet.data_loader:constructing ripple set ...\n" ] } ], @@ -725,13 +727,35 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "show_loss = False" ] }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:numexpr.utils:NumExpr defaulting to 4 threads.\n" + ] + } + ], + "source": [ + "ripple = RippleNet(dim=dim,n_hop=n_hop,\n", + " kge_weight=kge_weight, l2_weight=l2_weight, lr=lr,\n", + " n_memory=n_memory,\n", + " item_update_mode=item_update_mode, using_all_hops=using_all_hops,\n", + " n_entity=n_entity,n_relation=n_relation,\n", + " optimizer_method=optimizer_method)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -742,18 +766,18 @@ " kge_weight=kge_weight, l2_weight=l2_weight, lr=lr,\n", " 
n_memory=n_memory,\n", " item_update_mode=item_update_mode, using_all_hops=using_all_hops,\n", - " n_entity=n_entity,n_relation=n_relation)\n", + " n_entity=n_entity,n_relation=n_relation,\n", + " optimizer_method=optimizer_method)\n", + "\n", + "ripple.fit(sess=sess, \n", + " n_epoch=n_epoch, batch_size=batch_size,n_hop=n_hop,\n", + " train_data=train_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(), \n", + " ripple_set=ripple_set, show_loss=show_loss)\n", "\n", - "with tf.Session() as sess:\n", - " ripple.fit(sess=sess, \n", - " n_epoch=n_epoch, batch_size=batch_size,n_hop=n_hop,\n", - " train_data=train_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(), \n", - " ripple_set=ripple_set, show_loss=show_loss)\n", - " \n", - " labels, scores = ripple.predict(sess=sess, \n", - " batch_size=batch_size, n_hop=n_hop, \n", - " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(),\n", - " ripple_set=ripple_set)\n", + "labels, scores = ripple.predict(sess=sess, \n", + " batch_size=batch_size, n_hop=n_hop, \n", + " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(),\n", + " ripple_set=ripple_set)\n", "\n", "predictions = [1 if i >= 0.5 else 0 for i in scores]" ] From 5d861b022d11df42503951568b545c49af35cf62 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 11 Jan 2020 19:26:44 +0100 Subject: [PATCH 47/75] logger, new optimizers, added seeds and session, reformated predict --- reco_utils/recommender/ripplenet/model.py | 66 +++++++++++++---------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 7298b7cfad..6e9749facd 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -3,22 +3,21 @@ import tensorflow as tf import numpy as np +import logging from sklearn.metrics import roc_auc_score +logging.basicConfig(level=logging.INFO) +log = logging.getLogger(__name__) + class RippleNet(object): def __init__(self, dim, n_hop, kge_weight, l2_weight, lr, - n_memory, item_update_mode, using_all_hops, n_entity, n_relation): - self._parse_args(dim, n_hop, kge_weight, l2_weight, lr, - n_memory, item_update_mode, using_all_hops, n_entity, n_relation) - self._build_inputs() - self._build_embeddings() - self._build_model() - self._build_loss() - self._build_train() + n_memory, item_update_mode, using_all_hops, n_entity, n_relation, + optimizer_method="adam", + seed=None): + self.seed = seed + tf.set_random_seed(seed) + np.random.seed(seed) - def _parse_args(self, dim, n_hop, kge_weight, l2_weight, lr, - n_memory, item_update_mode, using_all_hops, - n_entity, n_relation): self.n_entity = n_entity self.n_relation = n_relation self.dim = dim @@ -29,6 +28,19 @@ def _parse_args(self, dim, n_hop, kge_weight, l2_weight, lr, self.n_memory = n_memory self.item_update_mode = item_update_mode self.using_all_hops = using_all_hops + self.optimizer_method = optimizer_method + + self._build_inputs() + self._build_embeddings() + self._build_model() + self._build_loss() + self._build_optimizer() + + self.init_op = tf.global_variables_initializer() + # set GPU use with demand growth + gpu_options = tf.GPUOptions(allow_growth=True) + self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) + self.sess.run(self.init_op) def _build_inputs(self): self.items = tf.placeholder(dtype=tf.int32, shape=[None], name="items") @@ -152,15 +164,17 @@ def _build_loss(self): self.loss = self.base_loss + self.kge_loss + self.l2_loss - def 
_build_train(self): - self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.loss) - ''' - optimizer = tf.train.AdamOptimizer(self.lr) - gradients, variables = zip(*optimizer.compute_gradients(self.loss)) - gradients = [None if gradient is None else tf.clip_by_norm(gradient, clip_norm=5) - for gradient in gradients] - self.optimizer = optimizer.apply_gradients(zip(gradients, variables)) - ''' + def _build_optimizer(self): + + optimizers = { + "adam": tf.train.AdamOptimizer(self.lr).minimize(self.loss), + "adadelta": tf.train.AdadeltaOptimizer(self.lr).minimize(self.loss), + "adagrad": tf.train.AdagradOptimizer(self.lr).minimize(self.loss), + "ftrl": tf.train.FtrlOptimizer(self.lr).minimize(self.loss), + "gd": tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss), + "rmsprop": tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + } + self.optimizer = optimizers[self.optimizer_method] def train(self, sess, feed_dict): return sess.run([self.optimizer, self.loss], feed_dict) @@ -215,27 +229,25 @@ def fit(self, start, start + batch_size)) start += batch_size if show_loss: - print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) + log.info('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) train_auc, train_acc = self.print_metrics_evaluation(sess, n_hop, train_data, ripple_set, batch_size) - print('epoch %d train auc: %.4f acc: %.4f' + log.info('epoch %d train auc: %.4f acc: %.4f' % (step, train_auc, train_acc)) def predict(self, sess, batch_size, n_hop, data, ripple_set): start = 0 - labels_list = [] - scores_list = [] + labels = [0] * data.shape[0] + scores = [0] * data.shape[0] while start < data.shape[0]: - labels, scores = self.return_scores(sess, + labels[start:start + batch_size], scores[start:start + batch_size] = self.return_scores(sess, self.get_feed_dict(n_hop, data, ripple_set, start, start + batch_size)) - labels_list.append(labels) - scores_list.append(scores) start += batch_size return list(np.concatenate(labels_list)), list(np.concatenate(scores_list)) From 59fcd8be4104b8b7cec2c95ea28505ead640abe1 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 11 Jan 2020 19:26:53 +0100 Subject: [PATCH 48/75] added seed --- notebooks/02_model/rippleNet_deep_dive.ipynb | 25 ++------------------ 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index cea17de2a2..697f046339 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -734,28 +734,6 @@ "show_loss = False" ] }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:numexpr.utils:NumExpr defaulting to 4 threads.\n" - ] - } - ], - "source": [ - "ripple = RippleNet(dim=dim,n_hop=n_hop,\n", - " kge_weight=kge_weight, l2_weight=l2_weight, lr=lr,\n", - " n_memory=n_memory,\n", - " item_update_mode=item_update_mode, using_all_hops=using_all_hops,\n", - " n_entity=n_entity,n_relation=n_relation,\n", - " optimizer_method=optimizer_method)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -767,7 +745,8 @@ " n_memory=n_memory,\n", " item_update_mode=item_update_mode, using_all_hops=using_all_hops,\n", " n_entity=n_entity,n_relation=n_relation,\n", - " optimizer_method=optimizer_method)\n", + " optimizer_method=optimizer_method,\n", + " seed=12)\n", "\n", "ripple.fit(sess=sess, \n", " n_epoch=n_epoch, batch_size=batch_size,n_hop=n_hop,\n", 
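[Editor's note] The `_build_optimizer` dictionary introduced in PATCH 47 above calls `minimize()` for all six optimizers up front, so every branch adds its ops and slot variables to the graph even though only one is ever run; PATCH 51 below removes the dictionary after "the code failed". A minimal sketch of a lazier variant — not part of this patch series, assuming the TF 1.x `tf.train` optimizers — maps the method names to optimizer classes and instantiates only the selected one:

```python
import tensorflow as tf

# Map optimizer_method names (the same names the patch uses) to classes rather
# than to already-built minimize ops, so nothing is added to the graph until chosen.
_OPTIMIZERS = {
    "adam": tf.train.AdamOptimizer,
    "adadelta": tf.train.AdadeltaOptimizer,
    "adagrad": tf.train.AdagradOptimizer,
    "ftrl": tf.train.FtrlOptimizer,
    "gd": tf.train.GradientDescentOptimizer,
    "rmsprop": tf.train.RMSPropOptimizer,
}

def build_minimize_op(optimizer_method, lr, loss):
    # Fail loudly on unknown names instead of silently leaving the optimizer unset
    # (the elif chain in PATCH 51 has no else branch until PATCH 57 adds one).
    if optimizer_method not in _OPTIMIZERS:
        raise ValueError("Unknown optimizer method: " + optimizer_method)
    return _OPTIMIZERS[optimizer_method](lr).minimize(loss)
```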
From f59c2deb0dd37d0c5d081aa365e2796394b8460e Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 11 Jan 2020 19:32:46 +0100 Subject: [PATCH 49/75] removed session from params in functions --- reco_utils/recommender/ripplenet/model.py | 33 +++++++++++------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 6e9749facd..29c816cc62 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -176,15 +176,15 @@ def _build_optimizer(self): } self.optimizer = optimizers[self.optimizer_method] - def train(self, sess, feed_dict): - return sess.run([self.optimizer, self.loss], feed_dict) + def train(self, feed_dict): + return self.sess.run([self.optimizer, self.loss], feed_dict) - def return_scores(self, sess, feed_dict): - labels, scores = sess.run([self.labels, self.scores_normalized], feed_dict) + def return_scores(self, feed_dict): + labels, scores = self.sess.run([self.labels, self.scores_normalized], feed_dict) return labels, scores - def eval(self, sess, feed_dict): - labels, scores = sess.run([self.labels, self.scores_normalized], feed_dict) + def eval(self, feed_dict): + labels, scores = self.sess.run([self.labels, self.scores_normalized], feed_dict) auc = roc_auc_score(y_true=labels, y_score=scores) predictions = [1 if i >= 0.5 else 0 for i in scores] acc = np.mean(np.equal(predictions, labels)) @@ -200,54 +200,51 @@ def get_feed_dict(self, n_hop, data, ripple_set, start, end): feed_dict[self.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] return feed_dict - def print_metrics_evaluation(self, sess, n_hop, data, ripple_set, batch_size): + def print_metrics_evaluation(self, n_hop, data, ripple_set, batch_size): start = 0 auc_list = [] acc_list = [] while start < data.shape[0]: - auc, acc = self.eval(sess, self.get_feed_dict(n_hop, - data, ripple_set, - start, start + batch_size)) + auc, acc = self.eval(self.get_feed_dict(n_hop, + data, ripple_set, + start, start + batch_size)) auc_list.append(auc) acc_list.append(acc) start += batch_size return float(np.mean(auc_list)), float(np.mean(acc_list)) def fit(self, - sess, n_epoch, batch_size, n_hop, train_data, ripple_set, show_loss): - sess.run(tf.global_variables_initializer()) for step in range(n_epoch): # training np.random.shuffle(train_data) start = 0 while start < train_data.shape[0]: - _, loss = self.train(sess, - self.get_feed_dict(n_hop, + _, loss = self.train(self.get_feed_dict(n_hop, train_data, ripple_set, start, start + batch_size)) start += batch_size if show_loss: log.info('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) - train_auc, train_acc = self.print_metrics_evaluation(sess, n_hop, + train_auc, train_acc = self.print_metrics_evaluation(n_hop, train_data, ripple_set, batch_size) log.info('epoch %d train auc: %.4f acc: %.4f' % (step, train_auc, train_acc)) - def predict(self, sess, + def predict(self, batch_size, n_hop, data, ripple_set): start = 0 labels = [0] * data.shape[0] scores = [0] * data.shape[0] while start < data.shape[0]: - labels[start:start + batch_size], scores[start:start + batch_size] = self.return_scores(sess, + labels[start:start + batch_size], scores[start:start + batch_size] = self.return_scores( self.get_feed_dict(n_hop, data, ripple_set, start, start + batch_size)) start += batch_size - return list(np.concatenate(labels_list)), list(np.concatenate(scores_list)) + return labels, scores From 
3a680794850f6dce4c6f57f09ebbd1309c7c26a2 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 11 Jan 2020 19:33:52 +0100 Subject: [PATCH 50/75] session now is and internal parameter --- notebooks/02_model/rippleNet_deep_dive.ipynb | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 697f046339..d55ad4299b 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -748,15 +748,13 @@ " optimizer_method=optimizer_method,\n", " seed=12)\n", "\n", - "ripple.fit(sess=sess, \n", - " n_epoch=n_epoch, batch_size=batch_size,n_hop=n_hop,\n", - " train_data=train_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(), \n", - " ripple_set=ripple_set, show_loss=show_loss)\n", + "ripple.fit(n_epoch=n_epoch, batch_size=batch_size,n_hop=n_hop,\n", + " train_data=train_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(), \n", + " ripple_set=ripple_set, show_loss=show_loss)\n", "\n", - "labels, scores = ripple.predict(sess=sess, \n", - " batch_size=batch_size, n_hop=n_hop, \n", - " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(),\n", - " ripple_set=ripple_set)\n", + "labels, scores = ripple.predict(batch_size=batch_size, n_hop=n_hop, \n", + " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(),\n", + " ripple_set=ripple_set)\n", "\n", "predictions = [1 if i >= 0.5 else 0 for i in scores]" ] From f1e590dbbdafe1c6bdc2193888dc2180d9346954 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 12 Jan 2020 12:42:42 +0100 Subject: [PATCH 51/75] removed dictionary structure from optimizers, the code failed --- reco_utils/recommender/ripplenet/model.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 29c816cc62..9f91e7becc 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -37,6 +37,7 @@ def __init__(self, dim, n_hop, kge_weight, l2_weight, lr, self._build_optimizer() self.init_op = tf.global_variables_initializer() + # set GPU use with demand growth gpu_options = tf.GPUOptions(allow_growth=True) self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) @@ -166,15 +167,18 @@ def _build_loss(self): def _build_optimizer(self): - optimizers = { - "adam": tf.train.AdamOptimizer(self.lr).minimize(self.loss), - "adadelta": tf.train.AdadeltaOptimizer(self.lr).minimize(self.loss), - "adagrad": tf.train.AdagradOptimizer(self.lr).minimize(self.loss), - "ftrl": tf.train.FtrlOptimizer(self.lr).minimize(self.loss), - "gd": tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss), - "rmsprop": tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) - } - self.optimizer = optimizers[self.optimizer_method] + if self.optimizer_method == 'adam': + self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.loss) + elif self.optimizer_method == 'adadelta': + self.optimizer = tf.train.AdadeltaOptimizer(self.lr).minimize(self.loss) + elif self.optimizer_method == "adagrad": + self.optimizer = tf.train.AdagradOptimizer(self.lr).minimize(self.loss) + elif self.optimizer_method == "ftrl": + self.optimizer = tf.train.FtrlOptimizer(self.lr).minimize(self.loss) + elif self.optimizer_method == "gd": + self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss) + elif self.optimizer_method == "rmsprop": + 
self.optimizer = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) def train(self, feed_dict): return self.sess.run([self.optimizer, self.loss], feed_dict) From 3b0d7a78d0746e079e8686b05a969af98ff10288 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 12 Jan 2020 12:42:52 +0100 Subject: [PATCH 52/75] added TOP_K in metrics --- notebooks/02_model/rippleNet_deep_dive.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index d55ad4299b..3f2729a8c2 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -827,7 +827,7 @@ " col_rating=\"original_rating\",\n", " col_prediction=\"scores\",\n", " relevancy_method=\"top_k\",\n", - " k=k)" + " k=TOP_K)" ] }, { @@ -851,7 +851,7 @@ " col_rating=\"original_rating\",\n", " col_prediction=\"scores\",\n", " relevancy_method=\"top_k\",\n", - " k=k)" + " k=TOP_K)" ] }, { From 14f1468b5d478d2ff97e7c6f2a89485beff43743 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 19 Jan 2020 12:13:51 +0100 Subject: [PATCH 53/75] added docstrings, made rest of functions interal, made n_hop and ripple_set internal params --- notebooks/02_model/rippleNet_deep_dive.ipynb | 100 ++++++++++++++++++- reco_utils/recommender/ripplenet/model.py | 93 ++++++++++++----- 2 files changed, 163 insertions(+), 30 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 3f2729a8c2..d5287bc1c2 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -659,6 +659,96 @@ "train_data, test_data, eval_data = python_stratified_split(ratings_final, ratio=[0.6, 0.2, 0.2], col_user='user_index', col_item='item', seed=12)" ] }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_indexitemratingoriginal_rating
1290328100.0
2310140700.0
52046114.0
2290327300.0
2500200700.0
\n", + "
" + ], + "text/plain": [ + " user_index item rating original_rating\n", + "129 0 3281 0 0.0\n", + "231 0 1407 0 0.0\n", + "52 0 461 1 4.0\n", + "229 0 3273 0 0.0\n", + "250 0 2007 0 0.0" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_data.head()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -748,13 +838,13 @@ " optimizer_method=optimizer_method,\n", " seed=12)\n", "\n", - "ripple.fit(n_epoch=n_epoch, batch_size=batch_size,n_hop=n_hop,\n", + "ripple.fit(n_epoch=n_epoch, batch_size=batch_size,\n", " train_data=train_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(), \n", - " ripple_set=ripple_set, show_loss=show_loss)\n", + " ripple_set=ripple_set,\n", + " show_loss=show_loss)\n", "\n", - "labels, scores = ripple.predict(batch_size=batch_size, n_hop=n_hop, \n", - " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(),\n", - " ripple_set=ripple_set)\n", + "labels, scores = ripple.predict(batch_size=batch_size, \n", + " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy())\n", "\n", "predictions = [1 if i >= 0.5 else 0 for i in scores]" ] diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 9f91e7becc..65b2e91cc0 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -10,10 +10,37 @@ log = logging.getLogger(__name__) class RippleNet(object): + """RippleNet Implementation. RippleNet is an end-to-end framework that naturally + incorporates the knowledge graphs into recommender systems. + Similar to actual ripples propagating on the water, RippleNet stimulates the propagation + of user preferences over the set of knowledge entities by automatically and iteratively + extending a user’s potential interests along links in the knowledge graph. + """ + def __init__(self, dim, n_hop, kge_weight, l2_weight, lr, n_memory, item_update_mode, using_all_hops, n_entity, n_relation, optimizer_method="adam", seed=None): + + """Initialize model parameters + + Args: + dim (int): dimension of entity and relation embeddings + n_hop (int): maximum hops to create ripples using the KG + kge_weight (float): weight of the KGE term + l2_weight (float): weight of the l2 regularization term + lr (float): learning rate + n_memory (int): size of ripple set for each hop + item_update_mode (string): how to update item at the end of each hop. 
+ possible options are replace, plus, plus_transform or replace transform + using_all_hops (bool): whether to use outputs of all hops or just the + last hop when making prediction + n_entity (int): number of entitites in the KG + n_relation (int): number of types of relations in the KG + optimizer_method (string): optimizer method from adam, adadelta, adagrad, ftrl (FtrlOptimizer), + #gd (GradientDescentOptimizer), rmsprop (RMSPropOptimizer) + seed (int): initial seed value + """ self.seed = seed tf.set_random_seed(seed) np.random.seed(seed) @@ -89,7 +116,7 @@ def _build_model(self): o_list = self._key_addressing() - self.scores = tf.squeeze(self.predict_scores(self.item_embeddings, o_list)) + self.scores = tf.squeeze(self._predict_scores(self.item_embeddings, o_list)) self.scores_normalized = tf.sigmoid(self.scores) def _key_addressing(self): @@ -116,11 +143,12 @@ def _key_addressing(self): # [batch_size, dim] o = tf.reduce_sum(self.t_emb_list[hop] * probs_expanded, axis=1) - self.item_embeddings = self.update_item_embedding(self.item_embeddings, o) + self.item_embeddings = self._update_item_embedding(self.item_embeddings, o) o_list.append(o) return o_list - def update_item_embedding(self, item_embeddings, o): + def _update_item_embedding(self, item_embeddings, o): + if self.item_update_mode == "replace": item_embeddings = o elif self.item_update_mode == "plus": @@ -133,13 +161,12 @@ def update_item_embedding(self, item_embeddings, o): raise Exception("Unknown item updating mode: " + self.item_update_mode) return item_embeddings - def predict_scores(self, item_embeddings, o_list): + def _predict_scores(self, item_embeddings, o_list): y = o_list[-1] if self.using_all_hops: for i in range(self.n_hop - 1): y += o_list[i] - # [batch_size] scores = tf.reduce_sum(item_embeddings * y, axis=1) return scores @@ -180,37 +207,36 @@ def _build_optimizer(self): elif self.optimizer_method == "rmsprop": self.optimizer = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) - def train(self, feed_dict): + def _train(self, feed_dict): return self.sess.run([self.optimizer, self.loss], feed_dict) - def return_scores(self, feed_dict): + def _return_scores(self, feed_dict): labels, scores = self.sess.run([self.labels, self.scores_normalized], feed_dict) return labels, scores - def eval(self, feed_dict): + def _eval(self, feed_dict): labels, scores = self.sess.run([self.labels, self.scores_normalized], feed_dict) auc = roc_auc_score(y_true=labels, y_score=scores) predictions = [1 if i >= 0.5 else 0 for i in scores] acc = np.mean(np.equal(predictions, labels)) return auc, acc - def get_feed_dict(self, n_hop, data, ripple_set, start, end): + def _get_feed_dict(self, data, start, end): feed_dict = dict() feed_dict[self.items] = data[start:end, 1] feed_dict[self.labels] = data[start:end, 2] - for i in range(n_hop): - feed_dict[self.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] - feed_dict[self.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] - feed_dict[self.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] + for i in range(self.n_hop): + feed_dict[self.memories_h[i]] = [self.ripple_set[user][i][0] for user in data[start:end, 0]] + feed_dict[self.memories_r[i]] = [self.ripple_set[user][i][1] for user in data[start:end, 0]] + feed_dict[self.memories_t[i]] = [self.ripple_set[user][i][2] for user in data[start:end, 0]] return feed_dict - def print_metrics_evaluation(self, n_hop, data, ripple_set, batch_size): + def 
_print_metrics_evaluation(self, n_hop, data, batch_size): start = 0 auc_list = [] acc_list = [] while start < data.shape[0]: - auc, acc = self.eval(self.get_feed_dict(n_hop, - data, ripple_set, + auc, acc = self._eval(self._get_feed_dict(data, start, start + batch_size)) auc_list.append(auc) acc_list.append(acc) @@ -218,36 +244,53 @@ def print_metrics_evaluation(self, n_hop, data, ripple_set, batch_size): return float(np.mean(auc_list)), float(np.mean(acc_list)) def fit(self, - n_epoch, batch_size, n_hop, + n_epoch, batch_size, train_data, ripple_set, show_loss): + """Main fit method for RippleNet. + + Args: + n_epoch (int): the number of epochs + batch_size (int): batch size + train_data (pd.DataFrame): User id, item and rating dataframe + ripple_set (dictionary): set of knowledge triples per user positive rating, from 0 until n_hop + show_loss (bool): whether to show loss update + """ + self.ripple_set = ripple_set for step in range(n_epoch): # training np.random.shuffle(train_data) start = 0 while start < train_data.shape[0]: - _, loss = self.train(self.get_feed_dict(n_hop, - train_data, ripple_set, + _, loss = self._train(self._get_feed_dict( + train_data, start, start + batch_size)) start += batch_size if show_loss: log.info('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) - train_auc, train_acc = self.print_metrics_evaluation(n_hop, - train_data, ripple_set, + train_auc, train_acc = self._print_metrics_evaluation(train_data, batch_size) log.info('epoch %d train auc: %.4f acc: %.4f' % (step, train_auc, train_acc)) def predict(self, - batch_size, n_hop, data, ripple_set): + batch_size, data): + """Main predict method for RippleNet. + + Args: + batch_size (int): batch size + data (pd.DataFrame): User id, item and rating dataframe + + Returns: + (pd.DataFrame, pd.DataFrame): real labels of the predicted items, predicted scores of the predicted items + """ start = 0 labels = [0] * data.shape[0] scores = [0] * data.shape[0] while start < data.shape[0]: - labels[start:start + batch_size], scores[start:start + batch_size] = self.return_scores( - self.get_feed_dict(n_hop, - data, ripple_set, + labels[start:start + batch_size], scores[start:start + batch_size] = self._return_scores( + self._get_feed_dict(data, start, start + batch_size)) start += batch_size From 577ceca13547822f1f7020b5f0054744071a88d8 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 19 Jan 2020 12:32:02 +0100 Subject: [PATCH 54/75] formatted code using black --- .../recommender/ripplenet/data_loader.py | 9 +- reco_utils/recommender/ripplenet/model.py | 191 ++++++++++++------ .../recommender/ripplenet/preprocess.py | 49 +++-- 3 files changed, 168 insertions(+), 81 deletions(-) diff --git a/reco_utils/recommender/ripplenet/data_loader.py b/reco_utils/recommender/ripplenet/data_loader.py index 1cdd94ceb9..d8cddc91b6 100644 --- a/reco_utils/recommender/ripplenet/data_loader.py +++ b/reco_utils/recommender/ripplenet/data_loader.py @@ -9,6 +9,7 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger(__name__) + def load_kg(kg_final): """Standarize indexes for items and entities @@ -21,7 +22,7 @@ def load_kg(kg_final): n_relation (int): number of relations in KG kg (dictionary): KG in dictionary shape """ - log.info('reading KG file ...') + log.info("reading KG file ...") n_entity = len(set(kg_final.iloc[:, 0]) | set(kg_final.iloc[:, 2])) n_relation = len(set(kg_final.iloc[:, 1])) @@ -46,7 +47,7 @@ def get_ripple_set(kg, user_history_dict, n_hop=2, n_memory=36): Returns: ripple_set (dictionary): set of 
knowledge triples per user positive rating, from 0 until n_hop """ - log.info('constructing ripple set ...') + log.info("constructing ripple set ...") # user -> [(hop_0_heads, hop_0_relations, hop_0_tails), (hop_1_heads, hop_1_relations, hop_1_tails), ...] ripple_set = collections.defaultdict(list) @@ -76,7 +77,9 @@ def get_ripple_set(kg, user_history_dict, n_hop=2, n_memory=36): else: # sample a fixed-size 1-hop memory for each user replace = len(memories_h) < n_memory - indices = np.random.choice(len(memories_h), size=n_memory, replace=replace) + indices = np.random.choice( + len(memories_h), size=n_memory, replace=replace + ) memories_h = [memories_h[i] for i in indices] memories_r = [memories_r[i] for i in indices] memories_t = [memories_t[i] for i in indices] diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 65b2e91cc0..06172fe16c 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -9,6 +9,7 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger(__name__) + class RippleNet(object): """RippleNet Implementation. RippleNet is an end-to-end framework that naturally incorporates the knowledge graphs into recommender systems. @@ -17,11 +18,22 @@ class RippleNet(object): extending a user’s potential interests along links in the knowledge graph. """ - def __init__(self, dim, n_hop, kge_weight, l2_weight, lr, - n_memory, item_update_mode, using_all_hops, n_entity, n_relation, - optimizer_method="adam", - seed=None): - + def __init__( + self, + dim, + n_hop, + kge_weight, + l2_weight, + lr, + n_memory, + item_update_mode, + using_all_hops, + n_entity, + n_relation, + optimizer_method="adam", + seed=None, + ): + """Initialize model parameters Args: @@ -79,40 +91,73 @@ def _build_inputs(self): for hop in range(self.n_hop): self.memories_h.append( - tf.placeholder(dtype=tf.int32, shape=[None, self.n_memory], name="memories_h_" + str(hop))) + tf.placeholder( + dtype=tf.int32, + shape=[None, self.n_memory], + name="memories_h_" + str(hop), + ) + ) self.memories_r.append( - tf.placeholder(dtype=tf.int32, shape=[None, self.n_memory], name="memories_r_" + str(hop))) + tf.placeholder( + dtype=tf.int32, + shape=[None, self.n_memory], + name="memories_r_" + str(hop), + ) + ) self.memories_t.append( - tf.placeholder(dtype=tf.int32, shape=[None, self.n_memory], name="memories_t_" + str(hop))) + tf.placeholder( + dtype=tf.int32, + shape=[None, self.n_memory], + name="memories_t_" + str(hop), + ) + ) def _build_embeddings(self): - self.entity_emb_matrix = tf.get_variable(name="entity_emb_matrix", dtype=tf.float64, - shape=[self.n_entity, self.dim], - initializer=tf.contrib.layers.xavier_initializer()) - self.relation_emb_matrix = tf.get_variable(name="relation_emb_matrix", dtype=tf.float64, - shape=[self.n_relation, self.dim, self.dim], - initializer=tf.contrib.layers.xavier_initializer()) + self.entity_emb_matrix = tf.get_variable( + name="entity_emb_matrix", + dtype=tf.float64, + shape=[self.n_entity, self.dim], + initializer=tf.contrib.layers.xavier_initializer(), + ) + self.relation_emb_matrix = tf.get_variable( + name="relation_emb_matrix", + dtype=tf.float64, + shape=[self.n_relation, self.dim, self.dim], + initializer=tf.contrib.layers.xavier_initializer(), + ) def _build_model(self): # transformation matrix for updating item embeddings at the end of each hop - self.transform_matrix = tf.get_variable(name="transform_matrix", shape=[self.dim, self.dim], dtype=tf.float64, - 
initializer=tf.contrib.layers.xavier_initializer()) + self.transform_matrix = tf.get_variable( + name="transform_matrix", + shape=[self.dim, self.dim], + dtype=tf.float64, + initializer=tf.contrib.layers.xavier_initializer(), + ) # [batch size, dim] - self.item_embeddings = tf.nn.embedding_lookup(self.entity_emb_matrix, self.items) + self.item_embeddings = tf.nn.embedding_lookup( + self.entity_emb_matrix, self.items + ) self.h_emb_list = [] self.r_emb_list = [] self.t_emb_list = [] for i in range(self.n_hop): # [batch size, n_memory, dim] - self.h_emb_list.append(tf.nn.embedding_lookup(self.entity_emb_matrix, self.memories_h[i])) + self.h_emb_list.append( + tf.nn.embedding_lookup(self.entity_emb_matrix, self.memories_h[i]) + ) # [batch size, n_memory, dim, dim] - self.r_emb_list.append(tf.nn.embedding_lookup(self.relation_emb_matrix, self.memories_r[i])) + self.r_emb_list.append( + tf.nn.embedding_lookup(self.relation_emb_matrix, self.memories_r[i]) + ) # [batch size, n_memory, dim] - self.t_emb_list.append(tf.nn.embedding_lookup(self.entity_emb_matrix, self.memories_t[i])) + self.t_emb_list.append( + tf.nn.embedding_lookup(self.entity_emb_matrix, self.memories_t[i]) + ) o_list = self._key_addressing() @@ -148,7 +193,7 @@ def _key_addressing(self): return o_list def _update_item_embedding(self, item_embeddings, o): - + if self.item_update_mode == "replace": item_embeddings = o elif self.item_update_mode == "plus": @@ -171,22 +216,37 @@ def _predict_scores(self, item_embeddings, o_list): return scores def _build_loss(self): - self.base_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=self.scores)) + self.base_loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits( + labels=self.labels, logits=self.scores + ) + ) self.kge_loss = 0 for hop in range(self.n_hop): h_expanded = tf.expand_dims(self.h_emb_list[hop], axis=2) t_expanded = tf.expand_dims(self.t_emb_list[hop], axis=3) - hRt = tf.squeeze(tf.matmul(tf.matmul(h_expanded, self.r_emb_list[hop]), t_expanded)) + hRt = tf.squeeze( + tf.matmul(tf.matmul(h_expanded, self.r_emb_list[hop]), t_expanded) + ) self.kge_loss += tf.reduce_mean(tf.sigmoid(hRt)) self.kge_loss = -self.kge_weight * self.kge_loss self.l2_loss = 0 for hop in range(self.n_hop): - self.l2_loss += tf.reduce_mean(tf.reduce_sum(self.h_emb_list[hop] * self.h_emb_list[hop])) - self.l2_loss += tf.reduce_mean(tf.reduce_sum(self.t_emb_list[hop] * self.t_emb_list[hop])) - self.l2_loss += tf.reduce_mean(tf.reduce_sum(self.r_emb_list[hop] * self.r_emb_list[hop])) - if self.item_update_mode == "replace nonlinear" or self.item_update_mode == "plus nonlinear": + self.l2_loss += tf.reduce_mean( + tf.reduce_sum(self.h_emb_list[hop] * self.h_emb_list[hop]) + ) + self.l2_loss += tf.reduce_mean( + tf.reduce_sum(self.t_emb_list[hop] * self.t_emb_list[hop]) + ) + self.l2_loss += tf.reduce_mean( + tf.reduce_sum(self.r_emb_list[hop] * self.r_emb_list[hop]) + ) + if ( + self.item_update_mode == "replace nonlinear" + or self.item_update_mode == "plus nonlinear" + ): self.l2_loss += tf.nn.l2_loss(self.transform_matrix) self.l2_loss = self.l2_weight * self.l2_loss @@ -194,22 +254,24 @@ def _build_loss(self): def _build_optimizer(self): - if self.optimizer_method == 'adam': + if self.optimizer_method == "adam": self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.loss) - elif self.optimizer_method == 'adadelta': + elif self.optimizer_method == "adadelta": self.optimizer = tf.train.AdadeltaOptimizer(self.lr).minimize(self.loss) - elif 
self.optimizer_method == "adagrad": + elif self.optimizer_method == "adagrad": self.optimizer = tf.train.AdagradOptimizer(self.lr).minimize(self.loss) - elif self.optimizer_method == "ftrl": + elif self.optimizer_method == "ftrl": self.optimizer = tf.train.FtrlOptimizer(self.lr).minimize(self.loss) - elif self.optimizer_method == "gd": - self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss) - elif self.optimizer_method == "rmsprop": + elif self.optimizer_method == "gd": + self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize( + self.loss + ) + elif self.optimizer_method == "rmsprop": self.optimizer = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) def _train(self, feed_dict): return self.sess.run([self.optimizer, self.loss], feed_dict) - + def _return_scores(self, feed_dict): labels, scores = self.sess.run([self.labels, self.scores_normalized], feed_dict) return labels, scores @@ -220,32 +282,37 @@ def _eval(self, feed_dict): predictions = [1 if i >= 0.5 else 0 for i in scores] acc = np.mean(np.equal(predictions, labels)) return auc, acc - + def _get_feed_dict(self, data, start, end): feed_dict = dict() feed_dict[self.items] = data[start:end, 1] feed_dict[self.labels] = data[start:end, 2] for i in range(self.n_hop): - feed_dict[self.memories_h[i]] = [self.ripple_set[user][i][0] for user in data[start:end, 0]] - feed_dict[self.memories_r[i]] = [self.ripple_set[user][i][1] for user in data[start:end, 0]] - feed_dict[self.memories_t[i]] = [self.ripple_set[user][i][2] for user in data[start:end, 0]] + feed_dict[self.memories_h[i]] = [ + self.ripple_set[user][i][0] for user in data[start:end, 0] + ] + feed_dict[self.memories_r[i]] = [ + self.ripple_set[user][i][1] for user in data[start:end, 0] + ] + feed_dict[self.memories_t[i]] = [ + self.ripple_set[user][i][2] for user in data[start:end, 0] + ] return feed_dict - def _print_metrics_evaluation(self, n_hop, data, batch_size): + def _print_metrics_evaluation(self, data, batch_size): start = 0 auc_list = [] acc_list = [] while start < data.shape[0]: - auc, acc = self._eval(self._get_feed_dict(data, - start, start + batch_size)) + auc, acc = self._eval( + self._get_feed_dict(data=data, start=start, end=start + batch_size) + ) auc_list.append(auc) acc_list.append(acc) start += batch_size return float(np.mean(auc_list)), float(np.mean(acc_list)) - def fit(self, - n_epoch, batch_size, - train_data, ripple_set, show_loss): + def fit(self, n_epoch, batch_size, train_data, ripple_set, show_loss): """Main fit method for RippleNet. Args: @@ -261,21 +328,24 @@ def fit(self, np.random.shuffle(train_data) start = 0 while start < train_data.shape[0]: - _, loss = self._train(self._get_feed_dict( - train_data, - start, start + batch_size)) + _, loss = self._train( + self._get_feed_dict( + data=train_data, start=start, end=start + batch_size + ) + ) start += batch_size if show_loss: - log.info('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) + log.info("%.1f%% %.4f" % (start / train_data.shape[0] * 100, loss)) - train_auc, train_acc = self._print_metrics_evaluation(train_data, - batch_size) + train_auc, train_acc = self._print_metrics_evaluation( + train_data, batch_size + ) - log.info('epoch %d train auc: %.4f acc: %.4f' - % (step, train_auc, train_acc)) + log.info( + "epoch %d train auc: %.4f acc: %.4f" % (step, train_auc, train_acc) + ) - def predict(self, - batch_size, data): + def predict(self, batch_size, data): """Main predict method for RippleNet. 
Args: @@ -289,9 +359,12 @@ def predict(self, labels = [0] * data.shape[0] scores = [0] * data.shape[0] while start < data.shape[0]: - labels[start:start + batch_size], scores[start:start + batch_size] = self._return_scores( - self._get_feed_dict(data, - start, start + batch_size)) + ( + labels[start : start + batch_size], + scores[start : start + batch_size], + ) = self._return_scores( + self._get_feed_dict(data, start, start + batch_size) + ) start += batch_size - + return labels, scores diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py index 990c89140f..7af0a92520 100644 --- a/reco_utils/recommender/ripplenet/preprocess.py +++ b/reco_utils/recommender/ripplenet/preprocess.py @@ -9,6 +9,7 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger(__name__) + def read_item_index_to_entity_id_file(item_to_entity): """Standarize indexes for items and entities @@ -50,7 +51,9 @@ def convert_rating(ratings, item_index_old2new, threshold, seed): for index, row in ratings.iterrows(): item_index_old = str(int(row[1])) - if item_index_old not in item_index_old2new: # the item is not in the final item set + if ( + item_index_old not in item_index_old2new + ): # the item is not in the final item set continue item_index = item_index_old2new[item_index_old] @@ -66,7 +69,7 @@ def convert_rating(ratings, item_index_old2new, threshold, seed): user_neg_ratings[user_index_old] = set() user_neg_ratings[user_index_old].add(item_index) - log.info('converting rating file ...') + log.info("converting rating file ...") writer = [] user_cnt = 0 user_index_old2new = dict() @@ -76,23 +79,33 @@ def convert_rating(ratings, item_index_old2new, threshold, seed): user_cnt += 1 user_index = user_index_old2new[user_index_old] for item, original_rating in pos_item_set: - writer.append({"user_index": user_index, - "item": item, - "rating": 1, - "original_rating": original_rating}) + writer.append( + { + "user_index": user_index, + "item": item, + "rating": 1, + "original_rating": original_rating, + } + ) pos_item_set = set(i[0] for i in pos_item_set) unwatched_set = item_set - pos_item_set if user_index_old in user_neg_ratings: unwatched_set -= user_neg_ratings[user_index_old] np.random.seed(seed) - for item in np.random.choice(list(unwatched_set), size=len(pos_item_set), replace=False): - writer.append({"user_index": user_index, - "item": item, - "rating": 0, - "original_rating": 0}) + for item in np.random.choice( + list(unwatched_set), size=len(pos_item_set), replace=False + ): + writer.append( + { + "user_index": user_index, + "item": item, + "rating": 0, + "original_rating": 0, + } + ) ratings_final = pd.DataFrame(writer) - log.info('number of users: %d' % user_cnt) - log.info('number of items: %d' % len(item_set)) + log.info("number of users: %d" % user_cnt) + log.info("number of items: %d" % len(item_set)) return ratings_final @@ -106,7 +119,7 @@ def convert_kg(kg, entity_id2index): kg_final (pd.DataFrame): knowledge graph converted with columns head, relation and tail, with internal entity IDs """ - log.info('converting kg file ...') + log.info("converting kg file ...") entity_cnt = len(entity_id2index) relation_cnt = 0 relation_id2index = dict() @@ -133,11 +146,9 @@ def convert_kg(kg, entity_id2index): relation_cnt += 1 relation = relation_id2index[relation_old] - writer.append({"head": head, - "relation": relation, - "tail": tail}) + writer.append({"head": head, "relation": relation, "tail": tail}) kg_final = pd.DataFrame(writer) - log.info('number 
of entities (containing items): %d' % entity_cnt) - log.info('number of relations: %d' % relation_cnt) + log.info("number of entities (containing items): %d" % entity_cnt) + log.info("number of relations: %d" % relation_cnt) return kg_final From 79833d6f5add3245278d8a9574099cac497d0b8a Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 19 Jan 2020 12:35:32 +0100 Subject: [PATCH 55/75] call functions with explicit args --- reco_utils/recommender/ripplenet/model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 06172fe16c..15ac53f47c 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -338,7 +338,7 @@ def fit(self, n_epoch, batch_size, train_data, ripple_set, show_loss): log.info("%.1f%% %.4f" % (start / train_data.shape[0] * 100, loss)) train_auc, train_acc = self._print_metrics_evaluation( - train_data, batch_size + data=train_data, batch_size=batch_size ) log.info( @@ -363,7 +363,9 @@ def predict(self, batch_size, data): labels[start : start + batch_size], scores[start : start + batch_size], ) = self._return_scores( - self._get_feed_dict(data, start, start + batch_size) + feed_dict=self._get_feed_dict( + data=data, start=start, end=start + batch_size + ) ) start += batch_size From 589b79c2d9b00fb57ea3d91b89489f118c3f2a82 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 19 Jan 2020 12:45:16 +0100 Subject: [PATCH 56/75] added clarification to re-create Ripple and fixed TOP_K print evaluation --- notebooks/02_model/rippleNet_deep_dive.ipynb | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index d5287bc1c2..4bbed639c5 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -849,6 +849,15 @@ "predictions = [1 if i >= 0.5 else 0 for i in scores]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In case you need to re-create the RippleNet again, simply run:\n", + "```python\n", + "tf.reset_default_graph()```" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -926,7 +935,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"The precision_k_score score at k = {}, is {}\".format(k, precision_k_score))" + "print(\"The precision_k_score score at k = {}, is {}\".format(TOP_K, precision_k_score))" ] }, { @@ -950,7 +959,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"The recall_k_score score at k = {}, is {}\".format(k, recall_k_score))" + "print(\"The recall_k_score score at k = {}, is {}\".format(TOP_K, recall_k_score))" ] } ], From b272cdcca28f3da0cd35d986f0e6dc7a835dcf13 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 19 Jan 2020 12:52:01 +0100 Subject: [PATCH 57/75] add exception to unknown optimizer_method --- reco_utils/recommender/ripplenet/model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 15ac53f47c..e4cdb46ecc 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -268,6 +268,8 @@ def _build_optimizer(self): ) elif self.optimizer_method == "rmsprop": self.optimizer = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + else: + raise Exception("Unkown optimizer method: " + self.optimizer_method) def _train(self, feed_dict): return 
self.sess.run([self.optimizer, self.loss], feed_dict) From b8c335c7008484bcb308616ec2be7182f8fad9ce Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 19 Jan 2020 12:58:42 +0100 Subject: [PATCH 58/75] clarified movielens sizes --- notebooks/02_model/rippleNet_deep_dive.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 4bbed639c5..2de0bad233 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -106,7 +106,7 @@ }, "outputs": [], "source": [ - "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", + "# Select MovieLens data size: 100k, 1M, 10M\n", "MOVIELENS_DATA_SIZE = '100k'\n", "rating_threshold = 4 #Minimum rating of a movie to be considered positive\n", "# Ripple parameters\n", From 69f4fb32f78d49fa4fdebd9d53496780bcdb0afb Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 21 Jan 2020 14:18:35 +0000 Subject: [PATCH 59/75] run with V100 but got error fyi @almudenasanz --- notebooks/02_model/rippleNet_deep_dive.ipynb | 253 +++++++++++-------- 1 file changed, 145 insertions(+), 108 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 2de0bad233..d11cd430cc 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -60,13 +60,32 @@ "execution_count": 1, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", + "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", + "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", + "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", + "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", + "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" + 
] + }, { "name": "stdout", "output_type": "stream", "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n", - "[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n", - "Pandas version: 0.25.1\n" + "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", + "[GCC 7.3.0]\n", + "Pandas version: 0.25.3\n", + "Tensorflow version: 1.12.0\n" ] } ], @@ -77,23 +96,21 @@ "import numpy as np\n", "import tensorflow as tf\n", "import os\n", - "import argparse \n", - "from reco_utils.evaluation.python_evaluation import auc, precision_at_k, recall_at_k\n", + "import papermill as pm\n", + "\n", + "from reco_utils.common.timer import Timer\n", "from reco_utils.dataset import movielens\n", "from reco_utils.dataset.python_splitters import python_stratified_split\n", - "\n", "from reco_utils.recommender.ripplenet.preprocess import (read_item_index_to_entity_id_file, \n", " convert_rating, \n", " convert_kg)\n", - "\n", - "from reco_utils.recommender.ripplenet.data_loader import (\n", - " load_kg, \n", - " get_ripple_set)\n", - "\n", + "from reco_utils.recommender.ripplenet.data_loader import load_kg, get_ripple_set\n", "from reco_utils.recommender.ripplenet.model import RippleNet\n", + "from reco_utils.evaluation.python_evaluation import auc, precision_at_k, recall_at_k\n", "\n", "print(\"System version: {}\".format(sys.version))\n", - "print(\"Pandas version: {}\".format(pd.__version__))" + "print(\"Pandas version: {}\".format(pd.__version__))\n", + "print(\"Tensorflow version: {}\".format(tf.__version__))" ] }, { @@ -109,6 +126,7 @@ "# Select MovieLens data size: 100k, 1M, 10M\n", "MOVIELENS_DATA_SIZE = '100k'\n", "rating_threshold = 4 #Minimum rating of a movie to be considered positive\n", + "\n", "# Ripple parameters\n", "n_epoch = 10 #the number of epochs\n", "batch_size = 1024 #batch size\n", @@ -123,8 +141,11 @@ "using_all_hops = True #whether using outputs of all hops or just the last hop when making prediction\n", "optimizer_method = \"adam\" #optimizer method from adam, adadelta, adagrad, ftrl (FtrlOptimizer),\n", " #gd (GradientDescentOptimizer), rmsprop (RMSPropOptimizer)\n", + "show_loss = False #whether or not to show the loss\n", + "seed = 12\n", + "\n", "#Evaluation parameters\n", - "TOP_K = 10" + "TOP_K = 10\n" ] }, { @@ -152,7 +173,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:05<00:00, 867KB/s] \n" + "100%|██████████| 4.81k/4.81k [00:00<00:00, 16.2kKB/s]\n" ] }, { @@ -187,7 +208,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 196\n", " 242\n", " 3.0\n", @@ -197,7 +218,7 @@ " 1996\n", " \n", " \n", - " 1\n", + " 1\n", " 63\n", " 242\n", " 3.0\n", @@ -207,7 +228,7 @@ " 1996\n", " \n", " \n", - " 2\n", + " 2\n", " 226\n", " 242\n", " 5.0\n", @@ -276,7 +297,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " Q1141186\n", " Q130232\n", " drama film\n", @@ -284,7 +305,7 @@ " 242\n", " \n", " \n", - " 1\n", + " 1\n", " Q1141186\n", " Q157443\n", " comedy film\n", @@ -292,7 +313,7 @@ " 242\n", " \n", " \n", - " 2\n", + " 2\n", " Q1141186\n", " Q10819887\n", " Andrei Chalimon\n", @@ -378,19 +399,19 @@ " \n", " \n", " \n", + " 0\n", " 0\n", - " 0\n", - " Q163038\n", + " Q509628\n", " \n", " \n", + " 1\n", " 1\n", - " 1\n", - " Q7605252\n", + " Q4984790\n", " \n", " \n", + " 2\n", " 2\n", - " 2\n", - " Q1413403\n", + " Q2463968\n", " \n", " \n", "\n", @@ -398,9 +419,9 @@ ], "text/plain": [ " unified_id entity\n", - "0 0 Q163038\n", - "1 1 Q7605252\n", - "2 2 Q1413403" + "0 0 Q509628\n", + "1 1 
Q4984790\n", + "2 2 Q2463968" ] }, "execution_count": 6, @@ -448,22 +469,22 @@ " \n", " \n", " \n", - " 0\n", - " 698\n", + " 0\n", + " 1072\n", " 1\n", - " 15010\n", + " 6449\n", " \n", " \n", + " 1\n", + " 10417\n", " 1\n", - " 19885\n", - " 1\n", - " 15010\n", + " 6449\n", " \n", " \n", - " 2\n", - " 447\n", + " 2\n", + " 1378\n", " 1\n", - " 15010\n", + " 6449\n", " \n", " \n", "\n", @@ -471,9 +492,9 @@ ], "text/plain": [ " original_entity_id relation linked_entities_id\n", - "0 698 1 15010\n", - "1 19885 1 15010\n", - "2 447 1 15010" + "0 1072 1 6449\n", + "1 10417 1 6449\n", + "2 1378 1 6449" ] }, "execution_count": 7, @@ -523,19 +544,19 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 242\n", - " 698\n", + " 1072\n", " \n", " \n", - " 1\n", + " 1\n", " 242\n", - " 19885\n", + " 10417\n", " \n", " \n", - " 2\n", + " 2\n", " 302\n", - " 447\n", + " 1378\n", " \n", " \n", "\n", @@ -543,9 +564,9 @@ ], "text/plain": [ " movielens_id unified_id\n", - "0 242 698\n", - "1 242 19885\n", - "2 302 447" + "0 242 1072\n", + "1 242 10417\n", + "2 302 1378" ] }, "execution_count": 8, @@ -661,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -693,35 +714,35 @@ " \n", " \n", " \n", - " 129\n", + " 129\n", " 0\n", " 3281\n", " 0\n", " 0.0\n", " \n", " \n", - " 231\n", + " 231\n", " 0\n", " 1407\n", " 0\n", " 0.0\n", " \n", " \n", - " 52\n", + " 52\n", " 0\n", " 461\n", " 1\n", " 4.0\n", " \n", " \n", - " 229\n", + " 229\n", " 0\n", " 3273\n", " 0\n", " 0.0\n", " \n", " \n", - " 250\n", + " 250\n", " 0\n", " 2007\n", " 0\n", @@ -740,7 +761,7 @@ "250 0 2007 0 0.0" ] }, - "execution_count": 35, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -758,7 +779,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -792,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -817,11 +838,56 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "show_loss = False" + "ripple = RippleNet(dim=dim,n_hop=n_hop,\n", + " kge_weight=kge_weight, l2_weight=l2_weight, lr=lr,\n", + " n_memory=n_memory,\n", + " item_update_mode=item_update_mode, using_all_hops=using_all_hops,\n", + " n_entity=n_entity,n_relation=n_relation,\n", + " optimizer_method=optimizer_method,\n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "ename": "InvalidArgumentError", + "evalue": "indices[6,12] = 22991 is not in [0, 22908)\n\t [[node embedding_lookup_6 (defined at ../../reco_utils/recommender/ripplenet/model.py:159) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_DOUBLE, _class=[\"loc:@Adam/Assign_1\"], _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](entity_emb_matrix/read, _arg_memories_t_1_0_7, embedding_lookup/axis)]]\n\nCaused by op 'embedding_lookup_6', defined at:\n File \"/data/anaconda/envs/reco_base/lib/python3.6/runpy.py\", line 193, in _run_module_as_main\n \"__main__\", mod_spec)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/runpy.py\", line 85, in _run_code\n exec(code, run_globals)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel_launcher.py\", line 16, in \n app.launch_new_instance()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/traitlets/config/application.py\", line 664, in launch_instance\n 
app.start()\n  File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/kernelapp.py\", line 563, in start\n    self.io_loop.start()\n  [... roughly 25 tornado, ipykernel and IPython dispatch frames elided; they only route the cell execution to the user code ...]\n  File \"../../reco_utils/recommender/ripplenet/model.py\", line 74, in __init__\n    self._build_model()\n  File \"../../reco_utils/recommender/ripplenet/model.py\", line 159, in _build_model\n    tf.nn.embedding_lookup(self.entity_emb_matrix, self.memories_t[i])\n  [... TensorFlow op-construction frames elided ...]\n\nInvalidArgumentError (see above for traceback): indices[6,12] = 22991 is not in [0, 22908)\n\t [[node embedding_lookup_6 (defined at ../../reco_utils/recommender/ripplenet/model.py:159) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_DOUBLE, _class=[\"loc:@Adam/Assign_1\"], _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](entity_emb_matrix/read, _arg_memories_t_1_0_7, embedding_lookup/axis)]]\n",
    "output_type": "error",
    "traceback": [
     "[... ANSI-formatted copy of the same InvalidArgumentError elided; it surfaces through RippleNet.fit -> RippleNet._train -> tf.Session.run. In short: the ripple set feeds entity index 22991 into entity_emb_matrix, which was created with n_entity = 22908, so the KG id space is larger than the embedding table -- likely the mismatch that the 'code running now' commit below resolves ...]"
    ]
   }
  ],
   "source": [
    "with Timer() as train_time:\n",
    "    ripple.fit(n_epoch=n_epoch, batch_size=batch_size,\n",
    "              train_data=train_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(), \n",
    "              ripple_set=ripple_set,\n",
    "              show_loss=show_loss)\n",
    "\n",
    "print(\"Took {} seconds for training.\".format(train_time.interval))"
   ]
  },
  {
@@ -830,23 +896,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ripple = RippleNet(dim=dim,n_hop=n_hop,\n",
-    "                   kge_weight=kge_weight, l2_weight=l2_weight, lr=lr,\n",
-    "                   n_memory=n_memory,\n",
-    "                   item_update_mode=item_update_mode, using_all_hops=using_all_hops,\n",
-    "                   n_entity=n_entity,n_relation=n_relation,\n",
-    "                   optimizer_method=optimizer_method,\n",
-    "                   seed=12)\n",
-    "\n",
-    "ripple.fit(n_epoch=n_epoch, batch_size=batch_size,\n",
-    "           train_data=train_data[[\"user_index\", \"item\", 
\"rating\"]].to_numpy(), \n", - " ripple_set=ripple_set,\n", - " show_loss=show_loss)\n", - "\n", - "labels, scores = ripple.predict(batch_size=batch_size, \n", - " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy())\n", - "\n", - "predictions = [1 if i >= 0.5 else 0 for i in scores]" + "with Timer() as test_time:\n", + " labels, scores = ripple.predict(batch_size=batch_size, \n", + " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy())\n", + " predictions = [1 if i >= 0.5 else 0 for i in scores]\n", + " \n", + "print(\"Took {} seconds for prediction.\".format(test_time.interval))" ] }, { @@ -884,15 +939,7 @@ " col_user=\"user_index\",\n", " col_item=\"item\",\n", " col_rating=\"rating\",\n", - " col_prediction=\"scores\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " col_prediction=\"scores\")\n", "print(\"The auc score is {}\".format(auc_score))" ] }, @@ -902,15 +949,7 @@ "metadata": {}, "outputs": [], "source": [ - "acc_score = np.mean(np.equal(predictions, labels))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "acc_score = np.mean(np.equal(predictions, labels))\n", "print(\"The acc score is {}\".format(acc_score))" ] }, @@ -926,15 +965,7 @@ " col_rating=\"original_rating\",\n", " col_prediction=\"scores\",\n", " relevancy_method=\"top_k\",\n", - " k=TOP_K)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " k=TOP_K)\n", "print(\"The precision_k_score score at k = {}, is {}\".format(TOP_K, precision_k_score))" ] }, @@ -950,7 +981,8 @@ " col_rating=\"original_rating\",\n", " col_prediction=\"scores\",\n", " relevancy_method=\"top_k\",\n", - " k=TOP_K)" + " k=TOP_K)\n", + "print(\"The recall_k_score score at k = {}, is {}\".format(TOP_K, recall_k_score))" ] }, { @@ -959,14 +991,19 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"The recall_k_score score at k = {}, is {}\".format(TOP_K, recall_k_score))" + "# Record results with papermill for tests - ignore this cell\n", + "pm.record(\"auc\", auc_score)\n", + "pm.record(\"precision\", precision_k_score)\n", + "pm.record(\"recall\", recall_k_score)\n", + "pm.record(\"train_time\", train_time.interval)\n", + "pm.record(\"test_time\", test_time.interval)" ] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python (reco)", + "display_name": "Python (reco_base)", "language": "python", "name": "reco_base" }, From b4c6f106e06deba9b0d194777a6de26eb66362c6 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 22 Jan 2020 09:19:36 +0000 Subject: [PATCH 60/75] code running now --- notebooks/02_model/rippleNet_deep_dive.ipynb | 384 ++++++++++--------- 1 file changed, 210 insertions(+), 174 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index d11cd430cc..0889845953 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -57,27 +57,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / 
'(1,)type'.\n", - " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", - "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", - "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", - "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", - "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", - "/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -115,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": { "tags": [ "parameters" @@ -124,7 +106,7 @@ "outputs": [], "source": [ "# Select MovieLens data size: 100k, 1M, 10M\n", - "MOVIELENS_DATA_SIZE = '100k'\n", + "MOVIELENS_DATA_SIZE = '1M'\n", "rating_threshold = 4 #Minimum rating of a movie to be considered positive\n", "\n", "# Ripple parameters\n", @@ -166,14 +148,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:00<00:00, 16.2kKB/s]\n" + "100%|██████████| 5.78k/5.78k [00:00<00:00, 17.9kKB/s]\n" ] }, { @@ -209,46 +191,51 @@ " \n", " \n", " 0\n", - " 196\n", - " 242\n", - " 3.0\n", - " 881250949\n", - " Kolya (1996)\n", - " Comedy\n", - " 1996\n", + " 1\n", + " 1193\n", + " 5.0\n", + " 978300760\n", + " One Flew Over the Cuckoo's Nest (1975)\n", + " Drama\n", + " 1975\n", " \n", " \n", " 1\n", - " 63\n", - " 242\n", - " 3.0\n", - " 875747190\n", - " Kolya (1996)\n", - " Comedy\n", - " 1996\n", + " 2\n", + " 1193\n", + " 5.0\n", + " 978298413\n", + " One Flew Over the Cuckoo's Nest (1975)\n", + " Drama\n", + " 1975\n", " \n", " \n", " 2\n", - " 226\n", - " 242\n", - " 5.0\n", - " 883888671\n", - " Kolya (1996)\n", - " Comedy\n", - " 1996\n", + " 12\n", + " 1193\n", + " 4.0\n", + " 978220179\n", + " One Flew Over the Cuckoo's Nest (1975)\n", + " Drama\n", + " 1975\n", " \n", " \n", "\n", "" ], "text/plain": [ - " UserId ItemId Rating Timestamp Title Genres Year\n", - "0 196 242 3.0 881250949 Kolya (1996) Comedy 1996\n", - "1 63 242 3.0 875747190 Kolya (1996) Comedy 1996\n", - "2 226 242 5.0 883888671 Kolya (1996) Comedy 1996" + " UserId ItemId Rating 
Timestamp Title \\\n", + "0 1 1193 5.0 978300760 One Flew Over the Cuckoo's Nest (1975) \n", + "1 2 1193 5.0 978298413 One Flew Over the Cuckoo's Nest (1975) \n", + "2 12 1193 4.0 978220179 One Flew Over the Cuckoo's Nest (1975) \n", + "\n", + " Genres Year \n", + "0 Drama 1975 \n", + "1 Drama 1975 \n", + "2 Drama 1975 " ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -264,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -298,45 +285,45 @@ " \n", " \n", " 0\n", - " Q1141186\n", - " Q130232\n", - " drama film\n", - " Kolya (1996)\n", - " 242\n", + " Q857313\n", + " Q7005314\n", + " New American Library\n", + " One Flew Over the Cuckoo's Nest (1975)\n", + " 1193\n", " \n", " \n", " 1\n", - " Q1141186\n", - " Q157443\n", - " comedy film\n", - " Kolya (1996)\n", - " 242\n", + " Q857313\n", + " Q921536\n", + " Viking Press\n", + " One Flew Over the Cuckoo's Nest (1975)\n", + " 1193\n", " \n", " \n", " 2\n", - " Q1141186\n", - " Q10819887\n", - " Andrei Chalimon\n", - " Kolya (1996)\n", - " 242\n", + " Q857313\n", + " Q113013\n", + " postmodern literature\n", + " One Flew Over the Cuckoo's Nest (1975)\n", + " 1193\n", " \n", " \n", "\n", "" ], "text/plain": [ - " original_entity linked_entities name_linked_entities movielens_title \\\n", - "0 Q1141186 Q130232 drama film Kolya (1996) \n", - "1 Q1141186 Q157443 comedy film Kolya (1996) \n", - "2 Q1141186 Q10819887 Andrei Chalimon Kolya (1996) \n", + " original_entity linked_entities name_linked_entities \\\n", + "0 Q857313 Q7005314 New American Library \n", + "1 Q857313 Q921536 Viking Press \n", + "2 Q857313 Q113013 postmodern literature \n", "\n", - " movielens_id \n", - "0 242 \n", - "1 242 \n", - "2 242 " + " movielens_title movielens_id \n", + "0 One Flew Over the Cuckoo's Nest (1975) 1193 \n", + "1 One Flew Over the Cuckoo's Nest (1975) 1193 \n", + "2 One Flew Over the Cuckoo's Nest (1975) 1193 " ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -357,7 +344,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -369,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -401,17 +388,17 @@ " \n", " 0\n", " 0\n", - " Q509628\n", + " Q1503215\n", " \n", " \n", " 1\n", " 1\n", - " Q4984790\n", + " Q271189\n", " \n", " \n", " 2\n", " 2\n", - " Q2463968\n", + " Q832444\n", " \n", " \n", "\n", @@ -419,12 +406,12 @@ ], "text/plain": [ " unified_id entity\n", - "0 0 Q509628\n", - "1 1 Q4984790\n", - "2 2 Q2463968" + "0 0 Q1503215\n", + "1 1 Q271189\n", + "2 2 Q832444" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -438,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -470,21 +457,21 @@ " \n", " \n", " 0\n", - " 1072\n", + " 3357\n", " 1\n", - " 6449\n", + " 22016\n", " \n", " \n", " 1\n", - " 10417\n", + " 26376\n", " 1\n", - " 6449\n", + " 22016\n", " \n", " \n", " 2\n", - " 1378\n", + " 3357\n", " 1\n", - " 6449\n", + " 12264\n", " \n", " \n", "\n", @@ -492,12 +479,12 @@ ], "text/plain": [ " original_entity_id relation linked_entities_id\n", - "0 1072 1 6449\n", - "1 10417 1 6449\n", - "2 1378 1 6449" + "0 3357 1 22016\n", + "1 26376 1 22016\n", + "2 3357 1 12264" ] }, - "execution_count": 7, + 
"execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -514,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -545,18 +532,18 @@ " \n", " \n", " 0\n", - " 242\n", - " 1072\n", + " 1193\n", + " 3357\n", " \n", " \n", " 1\n", - " 242\n", - " 10417\n", + " 1193\n", + " 26376\n", " \n", " \n", " 2\n", - " 302\n", - " 1378\n", + " 661\n", + " 493\n", " \n", " \n", "\n", @@ -564,12 +551,12 @@ ], "text/plain": [ " movielens_id unified_id\n", - "0 242 1072\n", - "1 242 10417\n", - "2 302 1378" + "0 1193 3357\n", + "1 1193 26376\n", + "2 661 493" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -584,7 +571,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -608,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -618,7 +605,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -626,8 +613,8 @@ "output_type": "stream", "text": [ "INFO:reco_utils.recommender.ripplenet.preprocess:converting rating file ...\n", - "INFO:reco_utils.recommender.ripplenet.preprocess:number of users: 942\n", - "INFO:reco_utils.recommender.ripplenet.preprocess:number of items: 1677\n" + "INFO:reco_utils.recommender.ripplenet.preprocess:number of users: 6038\n", + "INFO:reco_utils.recommender.ripplenet.preprocess:number of items: 3689\n" ] } ], @@ -638,7 +625,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": { "scrolled": true }, @@ -648,7 +635,7 @@ "output_type": "stream", "text": [ "INFO:reco_utils.recommender.ripplenet.preprocess:converting kg file ...\n", - "INFO:reco_utils.recommender.ripplenet.preprocess:number of entities (containing items): 22994\n", + "INFO:reco_utils.recommender.ripplenet.preprocess:number of entities (containing items): 39915\n", "INFO:reco_utils.recommender.ripplenet.preprocess:number of relations: 1\n" ] } @@ -673,7 +660,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -682,7 +669,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -714,37 +701,37 @@ " \n", " \n", " \n", - " 129\n", + " 145\n", " 0\n", - " 3281\n", - " 0\n", - " 0.0\n", + " 341\n", + " 1\n", + " 5.0\n", " \n", " \n", - " 231\n", + " 64\n", " 0\n", - " 1407\n", - " 0\n", - " 0.0\n", + " 3591\n", + " 1\n", + " 4.0\n", " \n", " \n", - " 52\n", + " 287\n", " 0\n", - " 461\n", - " 1\n", - " 4.0\n", + " 5725\n", + " 0\n", + " 0.0\n", " \n", " \n", - " 229\n", + " 276\n", " 0\n", - " 3273\n", + " 1403\n", " 0\n", " 0.0\n", " \n", " \n", - " 250\n", + " 223\n", " 0\n", - " 2007\n", + " 1581\n", " 0\n", " 0.0\n", " \n", @@ -754,14 +741,14 @@ ], "text/plain": [ " user_index item rating original_rating\n", - "129 0 3281 0 0.0\n", - "231 0 1407 0 0.0\n", - "52 0 461 1 4.0\n", - "229 0 3273 0 0.0\n", - "250 0 2007 0 0.0" + "145 0 341 1 5.0\n", + "64 0 3591 1 4.0\n", + "287 0 5725 0 0.0\n", + "276 0 1403 0 0.0\n", + "223 0 1581 0 0.0" ] }, - "execution_count": 14, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -779,7 +766,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -813,7 +800,7 @@ }, { "cell_type": 
"code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -838,9 +825,18 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:numexpr.utils:Note: NumExpr detected 24 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n", + "INFO:numexpr.utils:NumExpr defaulting to 8 threads.\n" + ] + } + ], "source": [ "ripple = RippleNet(dim=dim,n_hop=n_hop,\n", " kge_weight=kge_weight, l2_weight=l2_weight, lr=lr,\n", @@ -853,30 +849,30 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": {}, "outputs": [ { - "ename": "InvalidArgumentError", - "evalue": "indices[6,12] = 22991 is not in [0, 22908)\n\t [[node embedding_lookup_6 (defined at ../../reco_utils/recommender/ripplenet/model.py:159) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_DOUBLE, _class=[\"loc:@Adam/Assign_1\"], _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](entity_emb_matrix/read, _arg_memories_t_1_0_7, embedding_lookup/axis)]]\n\nCaused by op 'embedding_lookup_6', defined at:\n File \"/data/anaconda/envs/reco_base/lib/python3.6/runpy.py\", line 193, in _run_module_as_main\n \"__main__\", mod_spec)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/runpy.py\", line 85, in _run_code\n exec(code, run_globals)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel_launcher.py\", line 16, in \n app.launch_new_instance()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/traitlets/config/application.py\", line 664, in launch_instance\n app.start()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/kernelapp.py\", line 563, in start\n self.io_loop.start()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/platform/asyncio.py\", line 148, in start\n self.asyncio_loop.run_forever()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/asyncio/base_events.py\", line 438, in run_forever\n self._run_once()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/asyncio/base_events.py\", line 1451, in _run_once\n handle._run()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/asyncio/events.py\", line 145, in _run\n self._callback(*self._args)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/ioloop.py\", line 690, in \n lambda f: self._run_callback(functools.partial(callback, future))\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/ioloop.py\", line 743, in _run_callback\n ret = callback()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 787, in inner\n self.run()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 748, in run\n yielded = self.gen.send(value)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/kernelbase.py\", line 377, in dispatch_queue\n yield self.process_one()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 225, in wrapper\n runner = Runner(result, future, yielded)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 714, in __init__\n self.run()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 748, in run\n yielded = self.gen.send(value)\n File 
\"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/kernelbase.py\", line 361, in process_one\n yield gen.maybe_future(dispatch(*args))\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 209, in wrapper\n yielded = next(result)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/kernelbase.py\", line 268, in dispatch_shell\n yield gen.maybe_future(handler(stream, idents, msg))\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 209, in wrapper\n yielded = next(result)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/kernelbase.py\", line 541, in execute_request\n user_expressions, allow_stdin,\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 209, in wrapper\n yielded = next(result)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/ipkernel.py\", line 300, in do_execute\n res = shell.run_cell(code, store_history=store_history, silent=silent)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/zmqshell.py\", line 536, in run_cell\n return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2848, in run_cell\n raw_cell, store_history, silent, shell_futures)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2874, in _run_cell\n return runner(coro)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/async_helpers.py\", line 68, in _pseudo_sync_runner\n coro.send(None)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3051, in run_cell_async\n interactivity=interactivity, compiler=compiler, result=result)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3242, in run_ast_nodes\n if (await self.run_code(code, result, async_=asy)):\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3319, in run_code\n exec(code_obj, self.user_global_ns, self.user_ns)\n File \"\", line 7, in \n seed=seed)\n File \"../../reco_utils/recommender/ripplenet/model.py\", line 74, in __init__\n self._build_model()\n File \"../../reco_utils/recommender/ripplenet/model.py\", line 159, in _build_model\n tf.nn.embedding_lookup(self.entity_emb_matrix, self.memories_t[i])\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/ops/embedding_ops.py\", line 313, in embedding_lookup\n transform_fn=None)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/ops/embedding_ops.py\", line 133, in _embedding_lookup_and_transform\n result = _clip(array_ops.gather(params[0], ids, name=name),\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py\", line 2675, in gather\n return gen_array_ops.gather_v2(params, indices, axis, name=name)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py\", line 3332, in gather_v2\n \"GatherV2\", params=params, indices=indices, axis=axis, name=name)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py\", line 787, in _apply_op_helper\n op_def=op_def)\n File 
\"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py\", line 488, in new_func\n return func(*args, **kwargs)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/ops.py\", line 3274, in create_op\n op_def=op_def)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/ops.py\", line 1770, in __init__\n self._traceback = tf_stack.extract_stack()\n\nInvalidArgumentError (see above for traceback): indices[6,12] = 22991 is not in [0, 22908)\n\t [[node embedding_lookup_6 (defined at ../../reco_utils/recommender/ripplenet/model.py:159) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_DOUBLE, _class=[\"loc:@Adam/Assign_1\"], _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](entity_emb_matrix/read, _arg_memories_t_1_0_7, embedding_lookup/axis)]]\n", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mInvalidArgumentError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_call\u001b[0;34m(self, fn, *args)\u001b[0m\n\u001b[1;32m 1333\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1334\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1335\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOpError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run_fn\u001b[0;34m(feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[1;32m 1318\u001b[0m return self._call_tf_sessionrun(\n\u001b[0;32m-> 1319\u001b[0;31m options, feed_dict, fetch_list, target_list, run_metadata)\n\u001b[0m\u001b[1;32m 1320\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_call_tf_sessionrun\u001b[0;34m(self, options, feed_dict, fetch_list, target_list, run_metadata)\u001b[0m\n\u001b[1;32m 1406\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_list\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1407\u001b[0;31m run_metadata)\n\u001b[0m\u001b[1;32m 1408\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mInvalidArgumentError\u001b[0m: indices[6,12] = 22991 is not in [0, 22908)\n\t [[{{node embedding_lookup_6}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_DOUBLE, _class=[\"loc:@Adam/Assign_1\"], _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](entity_emb_matrix/read, _arg_memories_t_1_0_7, embedding_lookup/axis)]]", - "\nDuring handling of the above exception, another exception occurred:\n", - 
"\u001b[0;31mInvalidArgumentError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtrain_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"user_index\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"item\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rating\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_numpy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mripple_set\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mripple_set\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m show_loss=show_loss)\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Took {} seconds for training.\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_time\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minterval\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/home/hoaphumanoid/notebooks/repos/recommenders/reco_utils/recommender/ripplenet/model.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, n_epoch, batch_size, train_data, ripple_set, show_loss)\u001b[0m\n\u001b[1;32m 333\u001b[0m _, loss = self._train(\n\u001b[1;32m 334\u001b[0m self._get_feed_dict(\n\u001b[0;32m--> 335\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstart\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 336\u001b[0m )\n\u001b[1;32m 337\u001b[0m )\n", - "\u001b[0;32m/data/home/hoaphumanoid/notebooks/repos/recommenders/reco_utils/recommender/ripplenet/model.py\u001b[0m in \u001b[0;36m_train\u001b[0;34m(self, feed_dict)\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_train\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 275\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptimizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 276\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 277\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_return_scores\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 927\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 928\u001b[0m result = self._run(None, fetches, feed_dict, options_ptr,\n\u001b[0;32m--> 929\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 930\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 931\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run\u001b[0;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfinal_fetches\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mfinal_targets\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mhandle\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mfeed_dict_tensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1151\u001b[0m results = self._do_run(handle, final_targets, final_fetches,\n\u001b[0;32m-> 1152\u001b[0;31m feed_dict_tensor, options, run_metadata)\n\u001b[0m\u001b[1;32m 1153\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1154\u001b[0m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_run\u001b[0;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1326\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhandle\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1327\u001b[0m return self._do_call(_run_fn, feeds, fetches, targets, options,\n\u001b[0;32m-> 1328\u001b[0;31m run_metadata)\n\u001b[0m\u001b[1;32m 1329\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1330\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_do_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_prun_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeeds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_call\u001b[0;34m(self, fn, *args)\u001b[0m\n\u001b[1;32m 1346\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1347\u001b[0m \u001b[0mmessage\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0merror_interpolation\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minterpolate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_graph\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1348\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnode_def\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1349\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1350\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_extend_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mInvalidArgumentError\u001b[0m: indices[6,12] = 22991 is not in [0, 22908)\n\t [[node embedding_lookup_6 (defined at ../../reco_utils/recommender/ripplenet/model.py:159) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_DOUBLE, _class=[\"loc:@Adam/Assign_1\"], _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](entity_emb_matrix/read, _arg_memories_t_1_0_7, embedding_lookup/axis)]]\n\nCaused by op 'embedding_lookup_6', defined at:\n File \"/data/anaconda/envs/reco_base/lib/python3.6/runpy.py\", line 193, in _run_module_as_main\n \"__main__\", mod_spec)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/runpy.py\", line 85, in _run_code\n exec(code, run_globals)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel_launcher.py\", line 16, in \n app.launch_new_instance()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/traitlets/config/application.py\", line 664, in launch_instance\n app.start()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/kernelapp.py\", line 563, in start\n self.io_loop.start()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/platform/asyncio.py\", line 148, in start\n self.asyncio_loop.run_forever()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/asyncio/base_events.py\", line 438, in run_forever\n self._run_once()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/asyncio/base_events.py\", line 1451, in _run_once\n handle._run()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/asyncio/events.py\", line 145, in _run\n self._callback(*self._args)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/ioloop.py\", line 690, in \n lambda f: self._run_callback(functools.partial(callback, future))\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/ioloop.py\", line 743, in _run_callback\n ret = callback()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 787, in inner\n self.run()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 748, in run\n yielded = self.gen.send(value)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/kernelbase.py\", line 377, in dispatch_queue\n yield self.process_one()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 225, in wrapper\n runner = Runner(result, future, yielded)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 714, in __init__\n 
self.run()\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 748, in run\n yielded = self.gen.send(value)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/kernelbase.py\", line 361, in process_one\n yield gen.maybe_future(dispatch(*args))\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 209, in wrapper\n yielded = next(result)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/kernelbase.py\", line 268, in dispatch_shell\n yield gen.maybe_future(handler(stream, idents, msg))\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 209, in wrapper\n yielded = next(result)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/kernelbase.py\", line 541, in execute_request\n user_expressions, allow_stdin,\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tornado/gen.py\", line 209, in wrapper\n yielded = next(result)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/ipkernel.py\", line 300, in do_execute\n res = shell.run_cell(code, store_history=store_history, silent=silent)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/ipykernel/zmqshell.py\", line 536, in run_cell\n return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2848, in run_cell\n raw_cell, store_history, silent, shell_futures)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2874, in _run_cell\n return runner(coro)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/async_helpers.py\", line 68, in _pseudo_sync_runner\n coro.send(None)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3051, in run_cell_async\n interactivity=interactivity, compiler=compiler, result=result)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3242, in run_ast_nodes\n if (await self.run_code(code, result, async_=asy)):\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3319, in run_code\n exec(code_obj, self.user_global_ns, self.user_ns)\n File \"\", line 7, in \n seed=seed)\n File \"../../reco_utils/recommender/ripplenet/model.py\", line 74, in __init__\n self._build_model()\n File \"../../reco_utils/recommender/ripplenet/model.py\", line 159, in _build_model\n tf.nn.embedding_lookup(self.entity_emb_matrix, self.memories_t[i])\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/ops/embedding_ops.py\", line 313, in embedding_lookup\n transform_fn=None)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/ops/embedding_ops.py\", line 133, in _embedding_lookup_and_transform\n result = _clip(array_ops.gather(params[0], ids, name=name),\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py\", line 2675, in gather\n return gen_array_ops.gather_v2(params, indices, axis, name=name)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py\", line 3332, in gather_v2\n \"GatherV2\", params=params, indices=indices, axis=axis, name=name)\n File 
\"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py\", line 787, in _apply_op_helper\n op_def=op_def)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py\", line 488, in new_func\n return func(*args, **kwargs)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/ops.py\", line 3274, in create_op\n op_def=op_def)\n File \"/data/anaconda/envs/reco_base/lib/python3.6/site-packages/tensorflow/python/framework/ops.py\", line 1770, in __init__\n self._traceback = tf_stack.extract_stack()\n\nInvalidArgumentError (see above for traceback): indices[6,12] = 22991 is not in [0, 22908)\n\t [[node embedding_lookup_6 (defined at ../../reco_utils/recommender/ripplenet/model.py:159) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_DOUBLE, _class=[\"loc:@Adam/Assign_1\"], _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](entity_emb_matrix/read, _arg_memories_t_1_0_7, embedding_lookup/axis)]]\n" + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:reco_utils.recommender.ripplenet.model:epoch 0 train auc: 0.9063 acc: 0.8241\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 1 train auc: 0.9303 acc: 0.8534\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 2 train auc: 0.9386 acc: 0.8638\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 3 train auc: 0.9456 acc: 0.8737\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 4 train auc: 0.9499 acc: 0.8788\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 5 train auc: 0.9522 acc: 0.8822\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 6 train auc: 0.9547 acc: 0.8860\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 7 train auc: 0.9578 acc: 0.8900\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 8 train auc: 0.9607 acc: 0.8955\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 9 train auc: 0.9624 acc: 0.8980\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 650.0238825449924 seconds for training.\n" ] } ], @@ -892,9 +888,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 5.9928844239912 seconds for prediction.\n" + ] + } + ], "source": [ "with Timer() as test_time:\n", " labels, scores = ripple.predict(batch_size=batch_size, \n", @@ -922,7 +926,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -931,9 +935,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The auc score is 0.9229666674514663\n" + ] + } + ], "source": [ "auc_score = auc(test_data, test_data, \n", " col_user=\"user_index\",\n", @@ -945,9 +957,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The acc score is 0.8491929906968855\n" + ] + } + ], "source": [ "acc_score = np.mean(np.equal(predictions, labels))\n", "print(\"The acc score is {}\".format(acc_score))" @@ -955,9 +975,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "The precision_k_score score at k = 10, is 0.9307883405101026\n" + ] + } + ], "source": [ "precision_k_score = precision_at_k(test_data, test_data, \n", " col_user=\"user_index\",\n", @@ -971,9 +999,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The recall_k_score score at k = 10, is 0.5144293430105117\n" + ] + } + ], "source": [ "recall_k_score = recall_at_k(test_data, test_data, \n", " col_user=\"user_index\",\n", @@ -1003,9 +1039,9 @@ "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python (reco_base)", + "display_name": "Python (reco_gpu)", "language": "python", - "name": "reco_base" + "name": "reco_gpu" }, "language_info": { "codemirror_mode": { From 6367beded7ef397b0fc5aeae8cf7b40b596abd83 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 22 Jan 2020 09:54:46 +0000 Subject: [PATCH 61/75] some formatting --- notebooks/02_model/rippleNet_deep_dive.ipynb | 62 ++++++++++++++------ 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 0889845953..f07b9b4b4c 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -11,15 +11,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this example, we will walk through each step of the RippleNet algorithm.\n", - "RippleNet is an end-to-end framework that naturally incorporates the knowledge graphs into recommender systems.\n", - "To make the results of the paper reproducible we have used MovieLens as our dataset and Wikidata as our Knowledge Graph.\n", - "\n", - "> RippleNet: Propagating User Preferences on the Knowledge Graph for Recommender Systems\n", - "> Hongwei Wang, Fuzheng Zhang, Jialin Wang, Miao Zhao, Wenjie Li, Xing Xie, Minyi Guo\n", - "> The 27th ACM International Conference on Information and Knowledge Management (CIKM 2018)\n", - "\n", - "Online code of RippleNet: https://github.com/hwwang55/RippleNet" + "In this example, we will walk through each step of the [RippleNet](https://arxiv.org/pdf/1803.03467.pdf) algorithm.\n", + "RippleNet is an end-to-end framework that naturally incorporates knowledge graphs into recommender systems.\n", + "To make the results of the paper reproducible we have used MovieLens as our dataset and Wikidata as our Knowledge Graph.\n" ] }, { @@ -143,7 +137,7 @@ "source": [ "RippleNet is built on:\n", "- Ratings from users on Movies\n", - "- Knowledge Graph (KG) linking Movies to their connected entities in Wikidata. See [this notebook](https://github.com/microsoft/recommenders/blob/master/notebooks/01_prepare_data/wikidata_knowledge_graph.ipynb)" + "- Knowledge Graph (KG) linking Movies to their connected entities in Wikidata. See [this notebook](../01_prepare_data/wikidata_knowledge_graph.ipynb) to understand better how the knowledge graph was created." 
] }, { @@ -761,12 +755,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The original KG dataframe is transformed into a dictionary, and the number of entities and retaltions extracted as parameters" + "The original KG dataframe is transformed into a dictionary, and the number of entities and relations extracted as parameters" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -775,17 +769,27 @@ "text": [ "INFO:reco_utils.recommender.ripplenet.data_loader:reading KG file ...\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of entities: 39799\n", + "Number of relations: 1\n" + ] } ], "source": [ - "n_entity, n_relation, kg = load_kg(kg_final)" + "n_entity, n_relation, kg = load_kg(kg_final)\n", + "print(\"Number of entities:\", n_entity)\n", + "print(\"Number of relations:\", n_relation)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The rippleset dictionary is built on the positive ratings (relevant entities) of the training data, and using the KG to build set of knowledge triples per user positive rating, from 0 until n_hop.\n", + "The rippleset dictionary is built on the positive ratings (relevant entities) of the training data, and using the KG to build set of knowledge triples per user positive rating, from 0 until `n_hop`.\n", "\n", "**Relevant entity**: Given interaction matrix Y and knowledge graph G, the set of k-hop relevant entities for user u is defined as\n", "\n", @@ -838,11 +842,16 @@ } ], "source": [ - "ripple = RippleNet(dim=dim,n_hop=n_hop,\n", - " kge_weight=kge_weight, l2_weight=l2_weight, lr=lr,\n", + "ripple = RippleNet(dim=dim,\n", + " n_hop=n_hop,\n", + " kge_weight=kge_weight, \n", + " l2_weight=l2_weight, \n", + " lr=lr,\n", " n_memory=n_memory,\n", - " item_update_mode=item_update_mode, using_all_hops=using_all_hops,\n", - " n_entity=n_entity,n_relation=n_relation,\n", + " item_update_mode=item_update_mode, \n", + " using_all_hops=using_all_hops,\n", + " n_entity=n_entity,\n", + " n_relation=n_relation,\n", " optimizer_method=optimizer_method,\n", " seed=seed)" ] @@ -1034,6 +1043,23 @@ "pm.record(\"train_time\", train_time.interval)\n", "pm.record(\"test_time\", test_time.interval)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References\n", + "\n", + "1. Hongwei Wang, Fuzheng Zhang, Jialin Wang, Miao Zhao, Wenjie Li, Xing Xie, Minyi Guo, \"RippleNet: Propagating User Preferences on the Knowledge Graph for Recommender Systems\", *The 27th ACM International Conference on Information and Knowledge Management (CIKM 2018)*, 2018. https://arxiv.org/pdf/1803.03467.pdf\n", + "1. 
The original implementation of RippleNet: https://github.com/hwwang55/RippleNet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 3d92ee5f72f576b18cf00138267d5371079ded9a Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 22 Jan 2020 10:17:39 +0000 Subject: [PATCH 62/75] minor edits --- notebooks/02_model/rippleNet_deep_dive.ipynb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index f07b9b4b4c..777a47f335 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -966,20 +966,20 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The acc score is 0.8491929906968855\n" + "The accuracy is 0.8491929906968855\n" ] } ], "source": [ - "acc_score = np.mean(np.equal(predictions, labels))\n", - "print(\"The acc score is {}\".format(acc_score))" + "acc_score = np.mean(np.equal(predictions, labels)) # same result as in sklearn.metrics.accuracy_score \n", + "print(\"The accuracy is {}\".format(acc_score))" ] }, { @@ -1038,6 +1038,7 @@ "source": [ "# Record results with papermill for tests - ignore this cell\n", "pm.record(\"auc\", auc_score)\n", + "pm.record(\"accuracy\", acc_score)\n", "pm.record(\"precision\", precision_k_score)\n", "pm.record(\"recall\", recall_k_score)\n", "pm.record(\"train_time\", train_time.interval)\n", From a6ab41afe6ca85cedc83916af0c42fefd173db22 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 22 Jan 2020 11:01:33 +0000 Subject: [PATCH 63/75] fixed absolute link to code --- notebooks/02_model/rippleNet_deep_dive.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index 777a47f335..e6d3aa8aa2 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -39,7 +39,7 @@ "metadata": {}, "source": [ "## Implementation\n", - "Details of the python implementation can be found [here](https://github.com/microsoft/recommenders/tree/rippleNet/reco_utils/recommender/ripplenet). The implementation is based on the original code of RippleNet: https://github.com/hwwang55/RippleNet" + "Details of the python implementation can be found [here](../../reco_utils/recommender/ripplenet). 
The implementation is based on the original code of RippleNet: https://github.com/hwwang55/RippleNet" ] }, { From 6d70cf709c3420ac181150b359bba573e8324c76 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 8 Feb 2020 11:48:09 +0100 Subject: [PATCH 64/75] new function recommend_k_items --- reco_utils/recommender/ripplenet/model.py | 54 ++++++++++++++++++++--- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index e4cdb46ecc..87cbcc83c3 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -325,22 +325,23 @@ def fit(self, n_epoch, batch_size, train_data, ripple_set, show_loss): show_loss (bool): whether to show loss update """ self.ripple_set = ripple_set + self.train_data = train_data for step in range(n_epoch): # training - np.random.shuffle(train_data) + np.random.shuffle(self.train_data) start = 0 - while start < train_data.shape[0]: + while start < self.train_data.shape[0]: _, loss = self._train( self._get_feed_dict( - data=train_data, start=start, end=start + batch_size + data=self.train_data, start=start, end=start + batch_size ) ) start += batch_size if show_loss: - log.info("%.1f%% %.4f" % (start / train_data.shape[0] * 100, loss)) + log.info("%.1f%% %.4f" % (start / self.train_data.shape[0] * 100, loss)) train_auc, train_acc = self._print_metrics_evaluation( - data=train_data, batch_size=batch_size + data=self.train_data, batch_size=batch_size ) log.info( @@ -355,7 +356,7 @@ def predict(self, batch_size, data): data (pd.DataFrame): User id, item and rating dataframe Returns: - (pd.DataFrame, pd.DataFrame): real labels of the predicted items, predicted scores of the predicted items + (list, list): real labels of the predicted items, predicted scores of the predicted items """ start = 0 labels = [0] * data.shape[0] @@ -372,3 +373,44 @@ def predict(self, batch_size, data): start += batch_size return labels, scores + + def recommend_k_items(self, batch_size, data, top_k=10, remove_seen=True): + """Recomment top K items method for RippleNet. 
+
+        Args:
+            batch_size (int): batch size
+            data (pd.DataFrame): User id, item and rating dataframe
+            top_k (int): number of items to recommend
+            remove_seen (bool): if the items seen by a user in train should be removed from the test set
+
+        Returns:
+            (pd.DataFrame): top K items by score per user
+        """
+        if remove_seen == True:
+            log.info("Removing seen items")
+            seen_items = data.merge(self.train_data.iloc[:,0:2], on = list(data.columns[0:2]), indicator=True, how = 'left')
+            data = seen_items[seen_items['_merge']=='left_only'].drop(columns = ['_merge'])
+        start = 0
+        labels = [0] * data.shape[0]
+        scores = [0] * data.shape[0]
+        while start < data.shape[0]:
+            (
+                labels[start : start + batch_size],
+                scores[start : start + batch_size],
+            ) = self._return_scores(
+                feed_dict=self._get_feed_dict(
+                    data=data, start=start, end=start + batch_size
+                )
+            )
+            start += batch_size
+
+        data['scores'] = scores
+        top_k_items = (
+            data.groupby(data.columns[0], as_index=False)
+            .apply(lambda x: x.nlargest(top_k, 'scores'))
+            .reset_index(drop=True)
+        )
+        # Add ranks
+        top_k_items["rank"] = top_k_items.groupby(data.columns[0], sort=False).cumcount() + 1
+
+        return top_k_items
\ No newline at end of file

From 03258ebf913881c244a321c8899363e824ce347b Mon Sep 17 00:00:00 2001
From: almudenasanz
Date: Sat, 8 Feb 2020 11:48:30 +0100
Subject: [PATCH 65/75] new param to remove or keep negative ratings

---
 reco_utils/recommender/ripplenet/preprocess.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/reco_utils/recommender/ripplenet/preprocess.py b/reco_utils/recommender/ripplenet/preprocess.py
index 7af0a92520..450738e2ac 100644
--- a/reco_utils/recommender/ripplenet/preprocess.py
+++ b/reco_utils/recommender/ripplenet/preprocess.py
@@ -32,7 +32,7 @@ def read_item_index_to_entity_id_file(item_to_entity):
     return item_index_old2new, entity_id2index
 
 
-def convert_rating(ratings, item_index_old2new, threshold, seed):
+def convert_rating(ratings, item_index_old2new, threshold, remove_negative_ratings=True, seed=14):
     """Apply item standardization to ratings dataset. 
Use rating threshold to determine positive ratings
@@ -40,6 +40,8 @@ def convert_rating(ratings, item_index_old2new, threshold, seed):
         ratings (pd.DataFrame): ratings with columns ["UserId", "ItemId", "Rating"]
         item_index_old2new (dictionary): dictionary, conversion from original item ID to internal item ID
         threshold (int): minimum value for the rating to be considered positive
+        remove_negative_ratings (bool): if the train/test set should exclude items below the threshold,
+            as the original paper proposes
 
     Returns:
         ratings_final (pd.DataFrame): ratings converted with columns userID,
@@ -67,7 +69,7 @@ def convert_rating(ratings, item_index_old2new, threshold, seed):
         else:
             if user_index_old not in user_neg_ratings:
                 user_neg_ratings[user_index_old] = set()
-            user_neg_ratings[user_index_old].add(item_index)
+            user_neg_ratings[user_index_old].add((item_index, rating))
 
     log.info("converting rating file ...")
     writer = []
@@ -90,17 +92,25 @@ def convert_rating(ratings, item_index_old2new, threshold, seed):
         pos_item_set = set(i[0] for i in pos_item_set)
         unwatched_set = item_set - pos_item_set
         if user_index_old in user_neg_ratings:
-            unwatched_set -= user_neg_ratings[user_index_old]
+            negative_set = dict(list(user_neg_ratings[user_index_old]))
+            if remove_negative_ratings == True:
+                unwatched_set -= set(negative_set.keys())
+        else:
+            negative_set = {}
         np.random.seed(seed)
         for item in np.random.choice(
             list(unwatched_set), size=len(pos_item_set), replace=False
         ):
+            if item in negative_set:
+                original_rating = negative_set[item]
+            else:
+                original_rating = 0
             writer.append(
                 {
                     "user_index": user_index,
                     "item": item,
                     "rating": 0,
-                    "original_rating": 0,
+                    "original_rating": original_rating,
                 }
             )
     ratings_final = pd.DataFrame(writer)

From 893c5ab87670f8541d92d009e9af43ee1d39ec12 Mon Sep 17 00:00:00 2001
From: almudenasanz
Date: Sat, 8 Feb 2020 11:49:24 +0100
Subject: [PATCH 66/75] new params remove negative ratings and new function recommend_k_items

---
 notebooks/02_model/rippleNet_deep_dive.ipynb | 259 ++++++++++---------
 1 file changed, 131 insertions(+), 128 deletions(-)

diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb
index e6d3aa8aa2..d0c2879312 100644
--- a/notebooks/02_model/rippleNet_deep_dive.ipynb
+++ b/notebooks/02_model/rippleNet_deep_dive.ipynb
@@ -51,16 +51,16 @@
 },
 {
  "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-     "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n",
-     "[GCC 7.3.0]\n",
-     "Pandas version: 0.25.3\n",
+     "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n",
+     "[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n",
+     "Pandas version: 0.25.1\n",
 "Tensorflow version: 1.12.0\n"
 ]
 }
 ],
@@ -91,7 +91,7 @@
 },
 {
 "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
 "metadata": {
 "tags": [
 "parameters"
@@ -100,7 +100,7 @@
 "outputs": [],
 "source": [
 "# Select MovieLens data size: 100k, 1M, 10M\n",
-    "MOVIELENS_DATA_SIZE = '1M'\n",
+    "MOVIELENS_DATA_SIZE = '100k'\n",
 "rating_threshold = 4 #Minimum rating of a movie to be considered positive\n",
@@ -121,7 +121,8 @@
 "seed = 12\n",
 "\n",
 "#Evaluation parameters\n",
-    "TOP_K = 10\n"
+    "TOP_K = 10\n",
+    "remove_seen = True"
 ]
},
@@ -142,14 +143,14 @@
 },
 {
 "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
 "metadata": {},
 "outputs": [
 {
 "name": "stderr",
 "output_type": 
"stream", "text": [ - "100%|██████████| 5.78k/5.78k [00:00<00:00, 17.9kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.17kKB/s]\n" ] }, { @@ -184,52 +185,47 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", + " 196\n", + " 242\n", + " 3.0\n", + " 881250949\n", + " Kolya (1996)\n", + " Comedy\n", + " 1996\n", + " \n", + " \n", " 1\n", - " 1193\n", - " 5.0\n", - " 978300760\n", - " One Flew Over the Cuckoo's Nest (1975)\n", - " Drama\n", - " 1975\n", + " 63\n", + " 242\n", + " 3.0\n", + " 875747190\n", + " Kolya (1996)\n", + " Comedy\n", + " 1996\n", " \n", " \n", - " 1\n", " 2\n", - " 1193\n", + " 226\n", + " 242\n", " 5.0\n", - " 978298413\n", - " One Flew Over the Cuckoo's Nest (1975)\n", - " Drama\n", - " 1975\n", - " \n", - " \n", - " 2\n", - " 12\n", - " 1193\n", - " 4.0\n", - " 978220179\n", - " One Flew Over the Cuckoo's Nest (1975)\n", - " Drama\n", - " 1975\n", + " 883888671\n", + " Kolya (1996)\n", + " Comedy\n", + " 1996\n", " \n", " \n", "\n", "" ], "text/plain": [ - " UserId ItemId Rating Timestamp Title \\\n", - "0 1 1193 5.0 978300760 One Flew Over the Cuckoo's Nest (1975) \n", - "1 2 1193 5.0 978298413 One Flew Over the Cuckoo's Nest (1975) \n", - "2 12 1193 4.0 978220179 One Flew Over the Cuckoo's Nest (1975) \n", - "\n", - " Genres Year \n", - "0 Drama 1975 \n", - "1 Drama 1975 \n", - "2 Drama 1975 " + " UserId ItemId Rating Timestamp Title Genres Year\n", + "0 196 242 3.0 881250949 Kolya (1996) Comedy 1996\n", + "1 63 242 3.0 875747190 Kolya (1996) Comedy 1996\n", + "2 226 242 5.0 883888671 Kolya (1996) Comedy 1996" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -245,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -278,46 +274,46 @@ " \n", " \n", " \n", - " 0\n", - " Q857313\n", - " Q7005314\n", - " New American Library\n", - " One Flew Over the Cuckoo's Nest (1975)\n", - " 1193\n", + " 0\n", + " Q1141186\n", + " Q130232\n", + " drama film\n", + " Kolya (1996)\n", + " 242\n", " \n", " \n", - " 1\n", - " Q857313\n", - " Q921536\n", - " Viking Press\n", - " One Flew Over the Cuckoo's Nest (1975)\n", - " 1193\n", + " 1\n", + " Q1141186\n", + " Q157443\n", + " comedy film\n", + " Kolya (1996)\n", + " 242\n", " \n", " \n", - " 2\n", - " Q857313\n", - " Q113013\n", - " postmodern literature\n", - " One Flew Over the Cuckoo's Nest (1975)\n", - " 1193\n", + " 2\n", + " Q1141186\n", + " Q10819887\n", + " Andrei Chalimon\n", + " Kolya (1996)\n", + " 242\n", " \n", " \n", "\n", "" ], "text/plain": [ - " original_entity linked_entities name_linked_entities \\\n", - "0 Q857313 Q7005314 New American Library \n", - "1 Q857313 Q921536 Viking Press \n", - "2 Q857313 Q113013 postmodern literature \n", + " original_entity linked_entities name_linked_entities movielens_title \\\n", + "0 Q1141186 Q130232 drama film Kolya (1996) \n", + "1 Q1141186 Q157443 comedy film Kolya (1996) \n", + "2 Q1141186 Q10819887 Andrei Chalimon Kolya (1996) \n", "\n", - " movielens_title movielens_id \n", - "0 One Flew Over the Cuckoo's Nest (1975) 1193 \n", - "1 One Flew Over the Cuckoo's Nest (1975) 1193 \n", - "2 One Flew Over the Cuckoo's Nest (1975) 1193 " + " movielens_id \n", + "0 242 \n", + "1 242 \n", + "2 242 " ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -338,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -350,7 +346,7 @@ }, { 
"cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -380,32 +376,32 @@ " \n", " \n", " \n", - " 0\n", " 0\n", - " Q1503215\n", + " 0\n", + " Q204725\n", " \n", " \n", - " 1\n", " 1\n", - " Q271189\n", + " 1\n", + " Q794708\n", " \n", " \n", - " 2\n", " 2\n", - " Q832444\n", + " 2\n", + " Q271732\n", " \n", " \n", "\n", "" ], "text/plain": [ - " unified_id entity\n", - "0 0 Q1503215\n", - "1 1 Q271189\n", - "2 2 Q832444" + " unified_id entity\n", + "0 0 Q204725\n", + "1 1 Q794708\n", + "2 2 Q271732" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -419,7 +415,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -450,22 +446,22 @@ " \n", " \n", " \n", - " 0\n", - " 3357\n", + " 0\n", + " 902\n", " 1\n", - " 22016\n", + " 13289\n", " \n", " \n", - " 1\n", - " 26376\n", " 1\n", - " 22016\n", + " 20606\n", + " 1\n", + " 13289\n", " \n", " \n", - " 2\n", - " 3357\n", + " 2\n", + " 1501\n", " 1\n", - " 12264\n", + " 13289\n", " \n", " \n", "\n", @@ -473,12 +469,12 @@ ], "text/plain": [ " original_entity_id relation linked_entities_id\n", - "0 3357 1 22016\n", - "1 26376 1 22016\n", - "2 3357 1 12264" + "0 902 1 13289\n", + "1 20606 1 13289\n", + "2 1501 1 13289" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -495,7 +491,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -525,19 +521,19 @@ " \n", " \n", " \n", - " 0\n", - " 1193\n", - " 3357\n", + " 0\n", + " 242\n", + " 902\n", " \n", " \n", - " 1\n", - " 1193\n", - " 26376\n", + " 1\n", + " 242\n", + " 20606\n", " \n", " \n", - " 2\n", - " 661\n", - " 493\n", + " 2\n", + " 302\n", + " 1501\n", " \n", " \n", "\n", @@ -545,12 +541,12 @@ ], "text/plain": [ " movielens_id unified_id\n", - "0 1193 3357\n", - "1 1193 26376\n", - "2 661 493" + "0 242 902\n", + "1 242 20606\n", + "2 302 1501" ] }, - "execution_count": 10, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -565,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -589,7 +585,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -607,14 +603,16 @@ "output_type": "stream", "text": [ "INFO:reco_utils.recommender.ripplenet.preprocess:converting rating file ...\n", - "INFO:reco_utils.recommender.ripplenet.preprocess:number of users: 6038\n", - "INFO:reco_utils.recommender.ripplenet.preprocess:number of items: 3689\n" + "INFO:reco_utils.recommender.ripplenet.preprocess:number of users: 942\n", + "INFO:reco_utils.recommender.ripplenet.preprocess:number of items: 1677\n" ] } ], "source": [ "ratings_final = convert_rating(ratings, item_index_old2new = item_index_old2new,\n", - " threshold = rating_threshold, seed = 12)" + " threshold = rating_threshold,\n", + " remove_negative_ratings=False,\n", + " seed = 12)" ] }, { @@ -913,33 +911,38 @@ " labels, scores = ripple.predict(batch_size=batch_size, \n", " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy())\n", " predictions = [1 if i >= 0.5 else 0 for i in scores]\n", - " \n", + "\n", + "test_data['scores'] = scores\n", "print(\"Took {} seconds for prediction.\".format(test_time.interval))" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, 
"metadata": {}, + "outputs": [], "source": [ - "In case you need to re-create the RippleNet again, simply run:\n", - "```python\n", - "tf.reset_default_graph()```" + "with Timer() as topK_time:\n", + " top_k_items = ripple.recommend_k_items(batch_size=batch_size, \n", + " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(),\n", + " top_k=TOP_K, remove_seen=remove_seen)\n", + "print(\"Took {} seconds for top_k_items.\".format(topK_time.interval))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Results and Evaluation" + "In case you need to re-create the RippleNet again, simply run:\n", + "```python\n", + "tf.reset_default_graph()```" ] }, { - "cell_type": "code", - "execution_count": 22, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "test_data['scores'] = scores" + "## Results and Evaluation" ] }, { @@ -1066,9 +1069,9 @@ "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python (reco_gpu)", + "display_name": "Python (reco)", "language": "python", - "name": "reco_gpu" + "name": "reco_base" }, "language_info": { "codemirror_mode": { From a98322e63c18d6c04103566cc9afaa28de0d22d5 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sat, 8 Feb 2020 11:57:54 +0100 Subject: [PATCH 67/75] clarification for param remove_negative_ratings --- notebooks/02_model/rippleNet_deep_dive.ipynb | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index d0c2879312..a8bf22cba3 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -102,6 +102,7 @@ "# Select MovieLens data size: 100k, 1M, 10M\n", "MOVIELENS_DATA_SIZE = '100k'\n", "rating_threshold = 4 #Minimum rating of a movie to be considered positive\n", + "remove_negative_ratings = True #Items rated below the threshold will be removed from train and test \n", "\n", "# Ripple parameters\n", "n_epoch = 10 #the number of epochs\n", @@ -593,6 +594,17 @@ "item_index_old2new, entity_id2index = read_item_index_to_entity_id_file(item_to_entity)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the original paper, items are divided into those rated and above the threshold marked as 1, and those unwatched marked as 0. 
Items with a rating below the threshold are removed from train and test:\n", + "\n", + "> Since MovieLens-1M and Book-Crossing are explicit feedback data, we transform them into implicit feedback where each entry is marked with 1 indicating that the user has rated the item (the threshold of rating is 4 for MovieLens-1M, while no threshold is set for Book-Crossing due to its sparsity), and sample an unwatched set marked as 0 for each user, which is of equal size with the rated ones.\n", + "\n", + "We have added a param with the option to keep or remove the items watched and rated below the threshold marked as 0, *remove_negative_ratings*" + ] + }, { "cell_type": "code", "execution_count": 13, @@ -611,7 +623,7 @@ "source": [ "ratings_final = convert_rating(ratings, item_index_old2new = item_index_old2new,\n", " threshold = rating_threshold,\n", - " remove_negative_ratings=False,\n", + " remove_negative_ratings=remove_negative_ratings,\n", " seed = 12)" ] }, From 641a7c3e3c8b4962272c64f3258533fccf19a6af Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 9 Feb 2020 10:56:27 +0100 Subject: [PATCH 68/75] transform to numpy matrices internally --- notebooks/02_model/rippleNet_deep_dive.ipynb | 8 ++++---- reco_utils/recommender/ripplenet/model.py | 12 +++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index a8bf22cba3..e4dc891b48 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -598,7 +598,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the original paper, items are divided into those rated and above the threshold marked as 1, and those unwatched marked as 0. Items with a rating below the threshold are removed from train and test:\n", + "In the original paper, items are divided into those rated and above the threshold marked as 1, and those unwatched marked as 0. 
Items watched with a rating below the threshold are removed from train and test:\n", "\n", "> Since MovieLens-1M and Book-Crossing are explicit feedback data, we transform them into implicit feedback where each entry is marked with 1 indicating that the user has rated the item (the threshold of rating is 4 for MovieLens-1M, while no threshold is set for Book-Crossing due to its sparsity), and sample an unwatched set marked as 0 for each user, which is of equal size with the rated ones.\n", "\n", @@ -898,7 +898,7 @@ "source": [ "with Timer() as train_time:\n", " ripple.fit(n_epoch=n_epoch, batch_size=batch_size,\n", - " train_data=train_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(), \n", + " train_data=train_data[[\"user_index\", \"item\", \"rating\"]], \n", " ripple_set=ripple_set,\n", " show_loss=show_loss)\n", "\n", @@ -921,7 +921,7 @@ "source": [ "with Timer() as test_time:\n", " labels, scores = ripple.predict(batch_size=batch_size, \n", - " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy())\n", + " data=test_data[[\"user_index\", \"item\", \"rating\"]])\n", " predictions = [1 if i >= 0.5 else 0 for i in scores]\n", "\n", "test_data['scores'] = scores\n", @@ -936,7 +936,7 @@ "source": [ "with Timer() as topK_time:\n", " top_k_items = ripple.recommend_k_items(batch_size=batch_size, \n", - " data=test_data[[\"user_index\", \"item\", \"rating\"]].to_numpy(),\n", + " data=test_data[[\"user_index\", \"item\", \"rating\"]],\n", " top_k=TOP_K, remove_seen=remove_seen)\n", "print(\"Took {} seconds for top_k_items.\".format(topK_time.interval))" ] diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 87cbcc83c3..053596f718 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -358,6 +358,7 @@ def predict(self, batch_size, data): Returns: (list, list): real labels of the predicted items, predicted scores of the predicted items """ + data = data.to_numpy() start = 0 labels = [0] * data.shape[0] scores = [0] * data.shape[0] @@ -375,7 +376,7 @@ def predict(self, batch_size, data): return labels, scores def recommend_k_items(self, batch_size, data, top_k=10, remove_seen=True): - """Recomment top K items method for RippleNet. + """Recommend top K items method for RippleNet. 
Args: batch_size (int): batch size @@ -390,16 +391,17 @@ def recommend_k_items(self, batch_size, data, top_k=10, remove_seen=True): log.info("Removing seen items") seen_items = data.merge(self.train_data.iloc[:,0:2], on = list(data.columns[0:2]), indicator=True, how = 'left') data = seen_items[seen_items['_merge']=='left_only'].drop(columns = ['_merge']) + data_np = data.to_numpy() start = 0 - labels = [0] * data.shape[0] - scores = [0] * data.shape[0] - while start < data.shape[0]: + labels = [0] * data_np.shape[0] + scores = [0] * data_np.shape[0] + while start < data_np.shape[0]: ( labels[start : start + batch_size], scores[start : start + batch_size], ) = self._return_scores( feed_dict=self._get_feed_dict( - data=data, start=start, end=start + batch_size + data=data_np, start=start, end=start + batch_size ) ) start += batch_size From 4004538d32b699ddae78346fc1e867bec3d29509 Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 9 Feb 2020 11:10:46 +0100 Subject: [PATCH 69/75] internal transform in numpy for train --- reco_utils/recommender/ripplenet/model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 053596f718..3f65b6cf8b 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -3,6 +3,7 @@ import tensorflow as tf import numpy as np +import pandas as pd import logging from sklearn.metrics import roc_auc_score @@ -325,7 +326,7 @@ def fit(self, n_epoch, batch_size, train_data, ripple_set, show_loss): show_loss (bool): whether to show loss update """ self.ripple_set = ripple_set - self.train_data = train_data + self.train_data = train_data.to_numpy() for step in range(n_epoch): # training np.random.shuffle(self.train_data) @@ -389,7 +390,8 @@ def recommend_k_items(self, batch_size, data, top_k=10, remove_seen=True): """ if remove_seen == True: log.info("Removing seen items") - seen_items = data.merge(self.train_data.iloc[:,0:2], on = list(data.columns[0:2]), indicator=True, how = 'left') + train_data = pd.DataFrame(train_data) + seen_items = data.merge(train_data.iloc[:,0:2], on = list(data.columns[0:2]), indicator=True, how = 'left') data = seen_items[seen_items['_merge']=='left_only'].drop(columns = ['_merge']) data_np = data.to_numpy() start = 0 From 23e214b2b4f7383048a0eb855915c841968baf3e Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 9 Feb 2020 11:34:54 +0100 Subject: [PATCH 70/75] missing reference to self.train_data --- reco_utils/recommender/ripplenet/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 3f65b6cf8b..944822a2ef 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -390,7 +390,7 @@ def recommend_k_items(self, batch_size, data, top_k=10, remove_seen=True): """ if remove_seen == True: log.info("Removing seen items") - train_data = pd.DataFrame(train_data) + train_data = pd.DataFrame(self.train_data) seen_items = data.merge(train_data.iloc[:,0:2], on = list(data.columns[0:2]), indicator=True, how = 'left') data = seen_items[seen_items['_merge']=='left_only'].drop(columns = ['_merge']) data_np = data.to_numpy() From c94bbbebd1ca760940070b063aef04880191ee0a Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 9 Feb 2020 12:07:14 +0100 Subject: [PATCH 71/75] black formatting --- reco_utils/recommender/ripplenet/model.py | 28 
++++++++++++++++------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 944822a2ef..3b4c87d638 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -339,7 +339,9 @@ def fit(self, n_epoch, batch_size, train_data, ripple_set, show_loss): ) start += batch_size if show_loss: - log.info("%.1f%% %.4f" % (start / self.train_data.shape[0] * 100, loss)) + log.info( + "%.1f%% %.4f" % (start / self.train_data.shape[0] * 100, loss) + ) train_auc, train_acc = self._print_metrics_evaluation( data=self.train_data, batch_size=batch_size @@ -375,7 +377,7 @@ def predict(self, batch_size, data): start += batch_size return labels, scores - + def recommend_k_items(self, batch_size, data, top_k=10, remove_seen=True): """Recommend top K items method for RippleNet. @@ -391,8 +393,16 @@ def recommend_k_items(self, batch_size, data, top_k=10, remove_seen=True): if remove_seen == True: log.info("Removing seen items") train_data = pd.DataFrame(self.train_data) - seen_items = data.merge(train_data.iloc[:,0:2], on = list(data.columns[0:2]), indicator=True, how = 'left') - data = seen_items[seen_items['_merge']=='left_only'].drop(columns = ['_merge']) + seen_items = data.merge( + train_data.iloc[:, 0:2], + right_on=list(data.columns[0:2]), + left_on=list(train_data.columns[0:2]), + indicator=True, + how="left" + ) + data = seen_items[seen_items["_merge"] == "left_only"].drop( + columns=["_merge"] + ) data_np = data.to_numpy() start = 0 labels = [0] * data_np.shape[0] @@ -408,13 +418,15 @@ def recommend_k_items(self, batch_size, data, top_k=10, remove_seen=True): ) start += batch_size - data['scores'] = scores + data["scores"] = scores top_k_items = ( data.groupby(data.columns[0], as_index=False) - .apply(lambda x: x.nlargest(top_k, 'scores')) + .apply(lambda x: x.nlargest(top_k, "scores")) .reset_index(drop=True) ) # Add ranks - top_k_items["rank"] = top_k_items.groupby(data.columns[0], sort=False).cumcount() + 1 + top_k_items["rank"] = ( + top_k_items.groupby(data.columns[0], sort=False).cumcount() + 1 + ) - return top_k_items \ No newline at end of file + return top_k_items From f71754ae71dd5b1f1c0073507e0e85dfab27933a Mon Sep 17 00:00:00 2001 From: almudenasanz Date: Sun, 9 Feb 2020 12:54:53 +0100 Subject: [PATCH 72/75] fixed train_data column naming and added precision_at_k evaluation with top_k_items --- notebooks/02_model/rippleNet_deep_dive.ipynb | 16 +++++++++++++--- reco_utils/recommender/ripplenet/model.py | 6 +++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/rippleNet_deep_dive.ipynb index e4dc891b48..391c72e218 100644 --- a/notebooks/02_model/rippleNet_deep_dive.ipynb +++ b/notebooks/02_model/rippleNet_deep_dive.ipynb @@ -936,7 +936,7 @@ "source": [ "with Timer() as topK_time:\n", " top_k_items = ripple.recommend_k_items(batch_size=batch_size, \n", - " data=test_data[[\"user_index\", \"item\", \"rating\"]],\n", + " data=test_data[[\"user_index\", \"item\", \"rating\", \"original_rating\"]],\n", " top_k=TOP_K, remove_seen=remove_seen)\n", "print(\"Took {} seconds for top_k_items.\".format(topK_time.interval))" ] @@ -997,6 +997,16 @@ "print(\"The accuracy is {}\".format(acc_score))" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop column rank, not necessary for evaluation\n", + "top_k_items = 
top_k_items.drop(columns = \"rank\")" + ] + }, { "cell_type": "code", "execution_count": 25, @@ -1011,7 +1021,7 @@ } ], "source": [ - "precision_k_score = precision_at_k(test_data, test_data, \n", + "precision_k_score = precision_at_k(top_k_items, top_k_items, \n", " col_user=\"user_index\",\n", " col_item=\"item\",\n", " col_rating=\"original_rating\",\n", @@ -1035,7 +1045,7 @@ } ], "source": [ - "recall_k_score = recall_at_k(test_data, test_data, \n", + "recall_k_score = recall_at_k(top_k_items, top_k_items, \n", " col_user=\"user_index\",\n", " col_item=\"item\",\n", " col_rating=\"original_rating\",\n", diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 3b4c87d638..9fa3ad930d 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -392,11 +392,11 @@ def recommend_k_items(self, batch_size, data, top_k=10, remove_seen=True): """ if remove_seen == True: log.info("Removing seen items") - train_data = pd.DataFrame(self.train_data) + train_data = pd.DataFrame(self.train_data).iloc[:, 0:2] + train_data.columns = list(data.columns[0:2]) seen_items = data.merge( train_data.iloc[:, 0:2], - right_on=list(data.columns[0:2]), - left_on=list(train_data.columns[0:2]), + on=list(data.columns[0:2]), indicator=True, how="left" ) From b372db0c8800defb0563364d10054b926a195147 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 12 Feb 2020 11:28:16 +0000 Subject: [PATCH 73/75] edit name to lower case --- .../{rippleNet_deep_dive.ipynb => ripplenet_deep_dive.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename notebooks/02_model/{rippleNet_deep_dive.ipynb => ripplenet_deep_dive.ipynb} (100%) diff --git a/notebooks/02_model/rippleNet_deep_dive.ipynb b/notebooks/02_model/ripplenet_deep_dive.ipynb similarity index 100% rename from notebooks/02_model/rippleNet_deep_dive.ipynb rename to notebooks/02_model/ripplenet_deep_dive.ipynb From 2aff81097333176c9af39fa76a3adaeadac3569f Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 12 Feb 2020 11:43:29 +0000 Subject: [PATCH 74/75] :bug: to_numpy() deprecated in favour of values --- notebooks/02_model/ripplenet_deep_dive.ipynb | 340 +++++-------------- reco_utils/recommender/ripplenet/model.py | 8 +- 2 files changed, 82 insertions(+), 266 deletions(-) diff --git a/notebooks/02_model/ripplenet_deep_dive.ipynb b/notebooks/02_model/ripplenet_deep_dive.ipynb index 391c72e218..d4bf88518a 100644 --- a/notebooks/02_model/ripplenet_deep_dive.ipynb +++ b/notebooks/02_model/ripplenet_deep_dive.ipynb @@ -58,9 +58,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n", - "[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n", - "Pandas version: 0.25.1\n", + "System version: 3.6.7 | packaged by conda-forge | (default, Nov 21 2018, 03:09:43) \n", + "[GCC 7.3.0]\n", + "Pandas version: 0.23.4\n", "Tensorflow version: 1.12.0\n" ] } @@ -151,7 +151,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.17kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:01<00:00, 4.52kKB/s]\n" ] }, { @@ -186,7 +186,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 196\n", " 242\n", " 3.0\n", @@ -196,7 +196,7 @@ " 1996\n", " \n", " \n", - " 1\n", + " 1\n", " 63\n", " 242\n", " 3.0\n", @@ -206,7 +206,7 @@ " 1996\n", " \n", " \n", - " 2\n", + " 2\n", " 226\n", " 242\n", " 5.0\n", @@ -242,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 4, + 
"execution_count": null, "metadata": {}, "outputs": [ { @@ -275,7 +275,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " Q1141186\n", " Q130232\n", " drama film\n", @@ -283,7 +283,7 @@ " 242\n", " \n", " \n", - " 1\n", + " 1\n", " Q1141186\n", " Q157443\n", " comedy film\n", @@ -291,7 +291,7 @@ " 242\n", " \n", " \n", - " 2\n", + " 2\n", " Q1141186\n", " Q10819887\n", " Andrei Chalimon\n", @@ -335,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -347,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -377,19 +377,19 @@ " \n", " \n", " \n", + " 0\n", " 0\n", - " 0\n", - " Q204725\n", + " Q607910\n", " \n", " \n", + " 1\n", " 1\n", - " 1\n", - " Q794708\n", + " Q657259\n", " \n", " \n", + " 2\n", " 2\n", - " 2\n", - " Q271732\n", + " Q491185\n", " \n", " \n", "\n", @@ -397,9 +397,9 @@ ], "text/plain": [ " unified_id entity\n", - "0 0 Q204725\n", - "1 1 Q794708\n", - "2 2 Q271732" + "0 0 Q607910\n", + "1 1 Q657259\n", + "2 2 Q491185" ] }, "execution_count": 6, @@ -416,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -447,22 +447,22 @@ " \n", " \n", " \n", - " 0\n", - " 902\n", + " 0\n", + " 1177\n", " 1\n", - " 13289\n", + " 15580\n", " \n", " \n", + " 1\n", + " 16107\n", " 1\n", - " 20606\n", - " 1\n", - " 13289\n", + " 15580\n", " \n", " \n", - " 2\n", - " 1501\n", + " 2\n", + " 1278\n", " 1\n", - " 13289\n", + " 15580\n", " \n", " \n", "\n", @@ -470,9 +470,9 @@ ], "text/plain": [ " original_entity_id relation linked_entities_id\n", - "0 902 1 13289\n", - "1 20606 1 13289\n", - "2 1501 1 13289" + "0 1177 1 15580\n", + "1 16107 1 15580\n", + "2 1278 1 15580" ] }, "execution_count": 7, @@ -492,7 +492,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -522,19 +522,19 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 242\n", - " 902\n", + " 1177\n", " \n", " \n", - " 1\n", + " 1\n", " 242\n", - " 20606\n", + " 16107\n", " \n", " \n", - " 2\n", + " 2\n", " 302\n", - " 1501\n", + " 1278\n", " \n", " \n", "\n", @@ -542,9 +542,9 @@ ], "text/plain": [ " movielens_id unified_id\n", - "0 242 902\n", - "1 242 20606\n", - "2 302 1501" + "0 242 1177\n", + "1 242 16107\n", + "2 302 1278" ] }, "execution_count": 8, @@ -562,7 +562,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -586,7 +586,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -607,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -629,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "scrolled": true }, @@ -639,7 +639,7 @@ "output_type": "stream", "text": [ "INFO:reco_utils.recommender.ripplenet.preprocess:converting kg file ...\n", - "INFO:reco_utils.recommender.ripplenet.preprocess:number of entities (containing items): 39915\n", + "INFO:reco_utils.recommender.ripplenet.preprocess:number of entities (containing items): 22994\n", "INFO:reco_utils.recommender.ripplenet.preprocess:number of relations: 1\n" ] } @@ -664,7 +664,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -673,90 +673,9 @@ }, { 
"cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_indexitemratingoriginal_rating
145034115.0
640359114.0
2870572500.0
2760140300.0
2230158100.0
\n", - "
" - ], - "text/plain": [ - " user_index item rating original_rating\n", - "145 0 341 1 5.0\n", - "64 0 3591 1 4.0\n", - "287 0 5725 0 0.0\n", - "276 0 1403 0 0.0\n", - "223 0 1581 0 0.0" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "train_data.head()" ] @@ -770,25 +689,9 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:reco_utils.recommender.ripplenet.data_loader:reading KG file ...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of entities: 39799\n", - "Number of relations: 1\n" - ] - } - ], + "outputs": [], "source": [ "n_entity, n_relation, kg = load_kg(kg_final)\n", "print(\"Number of entities:\", n_entity)\n", @@ -814,17 +717,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:reco_utils.recommender.ripplenet.data_loader:constructing ripple set ...\n" - ] - } - ], + "outputs": [], "source": [ "user_history_dict = train_data.loc[train_data.rating == 1].groupby('user_index')['item'].apply(list).to_dict()\n", "ripple_set = get_ripple_set(kg, user_history_dict, n_hop=n_hop, n_memory=n_memory)" @@ -839,18 +734,9 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:numexpr.utils:Note: NumExpr detected 24 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n", - "INFO:numexpr.utils:NumExpr defaulting to 8 threads.\n" - ] - } - ], + "outputs": [], "source": [ "ripple = RippleNet(dim=dim,\n", " n_hop=n_hop,\n", @@ -868,33 +754,9 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:reco_utils.recommender.ripplenet.model:epoch 0 train auc: 0.9063 acc: 0.8241\n", - "INFO:reco_utils.recommender.ripplenet.model:epoch 1 train auc: 0.9303 acc: 0.8534\n", - "INFO:reco_utils.recommender.ripplenet.model:epoch 2 train auc: 0.9386 acc: 0.8638\n", - "INFO:reco_utils.recommender.ripplenet.model:epoch 3 train auc: 0.9456 acc: 0.8737\n", - "INFO:reco_utils.recommender.ripplenet.model:epoch 4 train auc: 0.9499 acc: 0.8788\n", - "INFO:reco_utils.recommender.ripplenet.model:epoch 5 train auc: 0.9522 acc: 0.8822\n", - "INFO:reco_utils.recommender.ripplenet.model:epoch 6 train auc: 0.9547 acc: 0.8860\n", - "INFO:reco_utils.recommender.ripplenet.model:epoch 7 train auc: 0.9578 acc: 0.8900\n", - "INFO:reco_utils.recommender.ripplenet.model:epoch 8 train auc: 0.9607 acc: 0.8955\n", - "INFO:reco_utils.recommender.ripplenet.model:epoch 9 train auc: 0.9624 acc: 0.8980\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Took 650.0238825449924 seconds for training.\n" - ] - } - ], + "outputs": [], "source": [ "with Timer() as train_time:\n", " ripple.fit(n_epoch=n_epoch, batch_size=batch_size,\n", @@ -907,17 +769,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Took 5.9928844239912 seconds for prediction.\n" - ] - } - ], + "outputs": [], "source": [ "with Timer() as test_time:\n", " labels, scores = 
ripple.predict(batch_size=batch_size, \n", @@ -934,11 +788,11 @@ "metadata": {}, "outputs": [], "source": [ - "with Timer() as topK_time:\n", + "with Timer() as topk_time:\n", " top_k_items = ripple.recommend_k_items(batch_size=batch_size, \n", " data=test_data[[\"user_index\", \"item\", \"rating\", \"original_rating\"]],\n", " top_k=TOP_K, remove_seen=remove_seen)\n", - "print(\"Took {} seconds for top_k_items.\".format(topK_time.interval))" + "print(\"Took {} seconds for top_k_items.\".format(topk_time.interval))" ] }, { @@ -959,17 +813,9 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The auc score is 0.9229666674514663\n" - ] - } - ], + "outputs": [], "source": [ "auc_score = auc(test_data, test_data, \n", " col_user=\"user_index\",\n", @@ -981,17 +827,9 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The accuracy is 0.8491929906968855\n" - ] - } - ], + "outputs": [], "source": [ "acc_score = np.mean(np.equal(predictions, labels)) # same result as in sklearn.metrics.accuracy_score \n", "print(\"The accuracy is {}\".format(acc_score))" @@ -1009,17 +847,9 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The precision_k_score score at k = 10, is 0.9307883405101026\n" - ] - } - ], + "outputs": [], "source": [ "precision_k_score = precision_at_k(top_k_items, top_k_items, \n", " col_user=\"user_index\",\n", @@ -1033,17 +863,9 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The recall_k_score score at k = 10, is 0.5144293430105117\n" - ] - } - ], + "outputs": [], "source": [ "recall_k_score = recall_at_k(top_k_items, top_k_items, \n", " col_user=\"user_index\",\n", @@ -1067,7 +889,8 @@ "pm.record(\"precision\", precision_k_score)\n", "pm.record(\"recall\", recall_k_score)\n", "pm.record(\"train_time\", train_time.interval)\n", - "pm.record(\"test_time\", test_time.interval)" + "pm.record(\"test_time\", test_time.interval)\n", + "pm.record(\"topk_time\", topk_time.interval)\n" ] }, { @@ -1079,21 +902,14 @@ "1. Hongwei Wang, Fuzheng Zhang, Jialin Wang, Miao Zhao, Wenjie Li, Xing Xie, Minyi Guo, \"RippleNet: Propagating User Preferences on the Knowledge Graph for Recommender Systems\", *The 27th ACM International Conference on Information and Knowledge Management (CIKM 2018)*, 2018. https://arxiv.org/pdf/1803.03467.pdf\n", "1. 
The original implementation of RippleNet: https://github.com/hwwang55/RippleNet" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python (reco)", + "display_name": "Python (reco_gpu)", "language": "python", - "name": "reco_base" + "name": "reco_gpu" }, "language_info": { "codemirror_mode": { @@ -1105,7 +921,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.7" } }, "nbformat": 4, diff --git a/reco_utils/recommender/ripplenet/model.py b/reco_utils/recommender/ripplenet/model.py index 9fa3ad930d..38920edbb0 100644 --- a/reco_utils/recommender/ripplenet/model.py +++ b/reco_utils/recommender/ripplenet/model.py @@ -326,7 +326,7 @@ def fit(self, n_epoch, batch_size, train_data, ripple_set, show_loss): show_loss (bool): whether to show loss update """ self.ripple_set = ripple_set - self.train_data = train_data.to_numpy() + self.train_data = train_data.values for step in range(n_epoch): # training np.random.shuffle(self.train_data) @@ -361,7 +361,7 @@ def predict(self, batch_size, data): Returns: (list, list): real labels of the predicted items, predicted scores of the predicted items """ - data = data.to_numpy() + data = data.values start = 0 labels = [0] * data.shape[0] scores = [0] * data.shape[0] @@ -398,12 +398,12 @@ def recommend_k_items(self, batch_size, data, top_k=10, remove_seen=True): train_data.iloc[:, 0:2], on=list(data.columns[0:2]), indicator=True, - how="left" + how="left", ) data = seen_items[seen_items["_merge"] == "left_only"].drop( columns=["_merge"] ) - data_np = data.to_numpy() + data_np = data.values start = 0 labels = [0] * data_np.shape[0] scores = [0] * data_np.shape[0] From 8aa4f553c8f59363650f95e622e40a43e041b01b Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 12 Feb 2020 11:45:51 +0000 Subject: [PATCH 75/75] model run with movielens 100k --- notebooks/02_model/ripplenet_deep_dive.ipynb | 316 +++++++++++++++++-- 1 file changed, 282 insertions(+), 34 deletions(-) diff --git a/notebooks/02_model/ripplenet_deep_dive.ipynb b/notebooks/02_model/ripplenet_deep_dive.ipynb index d4bf88518a..e497067307 100644 --- a/notebooks/02_model/ripplenet_deep_dive.ipynb +++ b/notebooks/02_model/ripplenet_deep_dive.ipynb @@ -242,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -335,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -347,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -416,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -492,7 +492,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -562,7 +562,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -586,7 +586,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -607,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -629,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { 
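
A note on the `.values` change in this patch: `DataFrame.to_numpy()` was only introduced in pandas 0.24, so it is missing from the pandas 0.23.4 environment shown in these outputs, while `.values` works across versions. An illustrative version-agnostic guard (toy dataframe, not part of the patch):

```python
import pandas as pd

df = pd.DataFrame({"user_index": [0, 1], "item": [341, 461], "rating": [1, 1]})

# Prefer the newer accessor when it exists; fall back to .values on pandas < 0.24.
arr = df.to_numpy() if hasattr(df, "to_numpy") else df.values
print(arr.shape)  # (2, 3)
```
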
"scrolled": true }, @@ -664,7 +664,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -673,9 +673,90 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
itemoriginal_ratingratinguser_index
12932810.000
23114070.000
524614.010
22932730.000
25020070.000
\n", + "
" + ], + "text/plain": [ + " item original_rating rating user_index\n", + "129 3281 0.0 0 0\n", + "231 1407 0.0 0 0\n", + "52 461 4.0 1 0\n", + "229 3273 0.0 0 0\n", + "250 2007 0.0 0 0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "train_data.head()" ] @@ -689,9 +770,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:reco_utils.recommender.ripplenet.data_loader:reading KG file ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of entities: 22908\n", + "Number of relations: 1\n" + ] + } + ], "source": [ "n_entity, n_relation, kg = load_kg(kg_final)\n", "print(\"Number of entities:\", n_entity)\n", @@ -717,9 +814,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:reco_utils.recommender.ripplenet.data_loader:constructing ripple set ...\n" + ] + } + ], "source": [ "user_history_dict = train_data.loc[train_data.rating == 1].groupby('user_index')['item'].apply(list).to_dict()\n", "ripple_set = get_ripple_set(kg, user_history_dict, n_hop=n_hop, n_memory=n_memory)" @@ -734,7 +839,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -754,9 +859,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:reco_utils.recommender.ripplenet.model:epoch 0 train auc: 0.9051 acc: 0.8202\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 1 train auc: 0.9162 acc: 0.8308\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 2 train auc: 0.9326 acc: 0.8527\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 3 train auc: 0.9407 acc: 0.8631\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 4 train auc: 0.9515 acc: 0.8775\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 5 train auc: 0.9615 acc: 0.8932\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 6 train auc: 0.9690 acc: 0.9076\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 7 train auc: 0.9747 acc: 0.9173\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 8 train auc: 0.9789 acc: 0.9248\n", + "INFO:reco_utils.recommender.ripplenet.model:epoch 9 train auc: 0.9818 acc: 0.9316\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 72.98155543790199 seconds for training.\n" + ] + } + ], "source": [ "with Timer() as train_time:\n", " ripple.fit(n_epoch=n_epoch, batch_size=batch_size,\n", @@ -769,9 +898,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 0.7585273641161621 seconds for prediction.\n" + ] + } + ], "source": [ "with Timer() as test_time:\n", " labels, scores = ripple.predict(batch_size=batch_size, \n", @@ -784,9 +921,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:reco_utils.recommender.ripplenet.model:Removing seen items\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Took 2.4120034659281373 seconds for top_k_items.\n" + ] + } + ], "source": [ "with Timer() as topk_time:\n", " top_k_items = ripple.recommend_k_items(batch_size=batch_size, \n", @@ -813,9 +965,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The auc score is 0.9012968931693994\n" + ] + } + ], "source": [ "auc_score = auc(test_data, test_data, \n", " col_user=\"user_index\",\n", @@ -827,9 +987,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The accuracy is 0.8271610513955379\n" + ] + } + ], "source": [ "acc_score = np.mean(np.equal(predictions, labels)) # same result as in sklearn.metrics.accuracy_score \n", "print(\"The accuracy is {}\".format(acc_score))" @@ -837,7 +1005,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -847,9 +1015,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The precision_k_score score at k = 10, is 0.8679405520169851\n" + ] + } + ], "source": [ "precision_k_score = precision_at_k(top_k_items, top_k_items, \n", " col_user=\"user_index\",\n", @@ -863,9 +1039,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The recall_k_score score at k = 10, is 1.0\n" + ] + } + ], "source": [ "recall_k_score = recall_at_k(top_k_items, top_k_items, \n", " col_user=\"user_index\",\n", @@ -879,9 +1063,73 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/papermill.record+json": { + "auc": 0.9012968931693994 + } + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/papermill.record+json": { + "accuracy": 0.8271610513955379 + } + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/papermill.record+json": { + "precision": 0.8679405520169851 + } + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/papermill.record+json": { + "recall": 1 + } + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/papermill.record+json": { + "train_time": 72.98155543790199 + } + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/papermill.record+json": { + "test_time": 0.7585273641161621 + } + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/papermill.record+json": { + "topk_time": 2.4120034659281373 + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Record results with papermill for tests - ignore this cell\n", "pm.record(\"auc\", auc_score)\n",