# RippleNet

In [4]:
import sys
sys.path.append("../../")
import pandas as pd
import numpy as np
import os
import argparse 
from reco_utils.dataset import movielens

from reco_utils.recommender.ripplenet.preprocess import (read_item_index_to_entity_id_file, 
                                         convert_rating, 
                                         convert_kg)

from reco_utils.recommender.ripplenet.data_loader import (
                                         dataset_split,
                                         load_kg, 
                                         get_ripple_set)

from reco_utils.recommender.ripplenet.train import (train)


# Record the interpreter and pandas versions so the run can be reproduced.
version_report = [
    "System version: {}".format(sys.version),
    "Pandas version: {}".format(pd.__version__),
]
print("\n".join(version_report))

System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Pandas version: 0.25.1


In [5]:
# Select MovieLens data size: 100k, 1m, 10m, or 20m.
# The same value is used both to download the ratings and to build the URL of
# the matching Wikidata knowledge-graph CSV in the data-loading cell.
MOVIELENS_DATA_SIZE = '100k'

## Read original data and convert entity IDs to numeric indices

In [6]:
# Knowledge-graph CSV hosted alongside the MovieLens data; the file name is
# derived from the selected dataset size.
wikidata_url = "https://recodatasets.blob.core.windows.net/wikidata/movielens_{}_wikidata.csv".format(MOVIELENS_DATA_SIZE)
kg_original = pd.read_csv(wikidata_url)

# MovieLens ratings, loaded together with the title, genres and year columns.
rating_header = ('UserId', 'ItemId', 'Rating', 'Timestamp')
ratings_original = movielens.load_pandas_df(
    MOVIELENS_DATA_SIZE,
    rating_header,
    title_col='Title',
    genres_col='Genres',
    year_col='Year',
)

# Threshold passed to convert_rating further down the notebook.
rating_threshold = 4

100%|██████████| 4.81k/4.81k [00:01<00:00, 3.19kKB/s]


In [7]:
def transform_id(df, entities_id, col_transform, col_name = "unified_id"):
    """Replace an entity column of ``df`` with its numeric id.

    Args:
        df (pd.DataFrame): Frame containing the column ``col_transform``.
        entities_id (pd.DataFrame): Lookup table with the columns ``entity``
            and ``unified_id``.
        col_transform (str): Column of ``df`` holding the entity values to map.
        col_name (str): Name given to the resulting id column.

    Returns:
        pd.DataFrame: ``df`` joined with the lookup table, where
        ``col_transform`` and ``entity`` are removed and the id column is
        named ``col_name``. The join is an inner join, so rows whose entity is
        missing from ``entities_id`` are not returned.
    """
    joined = pd.merge(df, entities_id, left_on=col_transform, right_on="entity")
    with_id = joined.rename(columns={"unified_id": col_name})
    helper_columns = [col_transform, "entity"]
    return with_id.drop(columns=helper_columns)

In [8]:
var_id = "movielens_id"

# Assign one numeric id to every distinct entity found on either side of a
# knowledge-graph edge. The two columns are concatenated and de-duplicated
# because transform_id inner-joins on `entity`: an entity listed twice would
# receive two ids and every row referring to it would be duplicated.
# Order of first appearance is kept, so the ids are the same on every run.
all_entities = pd.concat(
    [kg_original["original_entity"], kg_original["linked_entities"]],
    ignore_index=True,
).drop_duplicates()
entities_id = pd.DataFrame({"entity": all_entities.to_numpy()}).reset_index()
entities_id = entities_id.rename(columns = {"index": "unified_id"})

# One row per (MovieLens item, Wikidata entity) pair, with the entity replaced
# by its unified numeric id.
item_to_entity = kg_original[[var_id, "original_entity"]].drop_duplicates().reset_index(drop=True)
item_to_entity = transform_id(item_to_entity, entities_id, "original_entity")

In [9]:
# Build the edge list: unique (original entity, linked entity) pairs with both
# ends mapped to unified ids and a single constant relation type.
kg = (
    kg_original[["original_entity", "linked_entities"]]
    .drop_duplicates()
    .pipe(transform_id, entities_id, "original_entity", "original_entity_id")
    .pipe(transform_id, entities_id, "linked_entities", "linked_entities_id")
    .assign(relation=1)
)

# Column order handed to convert_kg: head id, relation, tail id.
triple_columns = ["original_entity_id", "relation", "linked_entities_id"]
kg_wikidata = kg[triple_columns]

In [10]:
# Keep only the four core rating columns and order the rows by item id
# (the second entry of vars_movielens).
vars_movielens = ["UserId", "ItemId", "Rating", "Timestamp"]
ratings = ratings_original.loc[:, vars_movielens].sort_values(by=vars_movielens[1])

## Preprocess module from RippleNet

In [11]:
# Build the two lookup tables consumed by the preprocessing calls that follow:
# item_index_old2new is passed to convert_rating, entity_id2index to convert_kg.
item_index_old2new, entity_id2index = read_item_index_to_entity_id_file(item_to_entity)

In [12]:
# Convert the raw ratings with the item-index mapping and the rating threshold
# defined in the data-loading cell.
ratings_final = convert_rating(
    ratings,
    item_index_old2new=item_index_old2new,
    threshold=rating_threshold,
)

converting rating file ...
number of users: 943
number of items: 1677


In [13]:
# Convert the Wikidata edge list using the entity-id-to-index mapping.
kg_final = convert_kg(
    kg_wikidata,
    entity_id2index=entity_id2index,
)

converting kg file ...
number of entities (containing items): 22994
number of relations: 1


## Load data

In [14]:
def _str2bool(value):
    """Convert a command-line token to a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy, so the flag could never be
    switched off from the command line.

    Args:
        value (str or bool): Token supplied for the option.

    Returns:
        bool: The parsed value.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean
            spelling; argparse reports this as a usage error.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(value))


# Hyper-parameters are collected in an argparse namespace, which is the object
# handed to get_ripple_set and train later in the notebook.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='movielens100k', help='which dataset to use')
parser.add_argument('--dim', type=int, default=16, help='dimension of entity and relation embeddings')
parser.add_argument('--n_hop', type=int, default=2, help='maximum hops')
parser.add_argument('--kge_weight', type=float, default=0.01, help='weight of the KGE term')
parser.add_argument('--l2_weight', type=float, default=1e-7, help='weight of the l2 regularization term')
parser.add_argument('--lr', type=float, default=0.02, help='learning rate')
parser.add_argument('--batch_size', type=int, default=1024, help='batch size')
parser.add_argument('--n_epoch', type=int, default=10, help='the number of epochs')
parser.add_argument('--n_memory', type=int, default=32, help='size of ripple set for each hop')
parser.add_argument('--item_update_mode', type=str, default='plus_transform', help='how to update item at the end of each hop')
parser.add_argument('--using_all_hops', type=_str2bool, default=True, help='whether using outputs of all hops or just the last hop when making prediction')

# Parse an explicit argument list instead of sys.argv, which inside a notebook
# holds the kernel's own command line.
args = parser.parse_args("--dataset movielens100k".split())

In [15]:
# Split the preprocessed ratings into train / eval / test parts; the call also
# returns a per-user history dict that is needed for the ripple sets.
train_data, eval_data, test_data, user_history_dict = dataset_split(ratings_final)
# NOTE: `kg` is rebound here. Earlier in the notebook it was the pandas edge
# list; from this line on it holds whatever load_kg returns for the graph.
n_entity, n_relation, kg = load_kg(kg_final)
# Ripple sets are built from the parsed arguments, the loaded graph and the
# user histories. Presumably args.n_hop / args.n_memory control their size -
# TODO confirm in get_ripple_set.
ripple_set = get_ripple_set(args, kg, user_history_dict)

splitting dataset ...
reading KG file ...
constructing knowledge graph ...
constructing ripple set ...


In [16]:
# Bundle everything passed to train() into a single tuple; the three data
# splits are converted to NumPy arrays first.
data_info = (
    train_data.to_numpy(),
    eval_data.to_numpy(),
    test_data.to_numpy(),
    n_entity,
    n_relation,
    ripple_set,
)

# Flag forwarded to train() as its third argument.
show_loss = False

In [None]:
# Run RippleNet training with the parsed hyper-parameters, the packed data
# tuple and the show_loss flag defined in the previous cell.
train(args, data_info, show_loss)