In [None]:
# default_exp datasets.movielens

# MovieLens Dataset
> Implementation of MovieLens datasets.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
from typing import Any, Iterable, List, Optional, Tuple, Union, Callable
import os
import json

import pandas as pd
import numpy as np

import torch

from recohut.utils.common_utils import *
from recohut.datasets.bases.common import Dataset
from recohut.datasets.bases.interactions import InteractionsDataset, InteractionsDataModule
from recohut.datasets.bases.sequential import SequentialDataset, SequentialDataModule
from recohut.utils.splitting import stratified_split_v2

## ML1m Rating Dataset

In [None]:
#export
class ML1mDataset(InteractionsDataset):
    """MovieLens-1M ratings as an interactions dataset.

    Only ``ratings.dat`` is kept from the downloaded archive.
    """
    url = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"

    @property
    def raw_file_names(self):
        """Name of the single raw file this dataset needs."""
        return 'ratings.dat'

    def download(self):
        """Fetch the zip, keep ``ratings.dat`` in ``raw_dir`` and clean up the rest."""
        import shutil

        archive = download_url(self.url, self.raw_dir)
        extract_zip(archive, self.raw_dir)
        unpacked_dir = os.path.join(self.raw_dir, 'ml-1m')
        shutil.move(os.path.join(unpacked_dir, self.raw_file_names), self.raw_dir)
        shutil.rmtree(unpacked_dir)
        os.unlink(archive)

    def load_ratings_df(self):
        """Read the raw ratings and return one row per (uid, sid) pair."""
        ratings = pd.read_csv(
            self.raw_paths[0],
            sep='::',
            header=None,
            engine='python',
        )
        ratings.columns = ['uid', 'sid', 'rating', 'timestamp']
        # For repeated user-item pairs only the last row (in file order) survives.
        return ratings.drop_duplicates(subset=['uid', 'sid'], keep='last')

In [None]:
#export
class ML1mDataModule(InteractionsDataModule):
    """Data module whose dataset class is ``ML1mDataset``."""
    dataset_cls = ML1mDataset

In [None]:
# Example configuration for the ML1m interactions data module. Every attribute
# is forwarded as a keyword argument through ``args.__dict__`` in the next cell.
# NOTE(review): the meaning of each option is defined by InteractionsDataModule /
# InteractionsDataset, which are not shown here — confirm against those classes.
class Args:
    def __init__(self):
        self.data_dir = '/content/data'
        self.min_rating = 4
        self.num_negative_samples = 99
        self.min_uc = 5
        self.min_sc = 5
        self.val_p = 0.2
        self.test_p = 0.2
        self.seed = 42
        self.split_type = 'stratified'

args = Args()

In [None]:
# Build the data module from the example config and run its preparation step.
ds = ML1mDataModule(**args.__dict__)
ds.prepare_data()

Processing...


Turning into implicit ratings
Filtering triplets
Densifying index


Done!


In [None]:
#export
class ML1mDataset_v2(SequentialDataset):
    """MovieLens-1M ratings as a sequential dataset.

    Unlike ``ML1mDataset``, duplicate (uid, sid) rows are not dropped when the
    ratings are loaded.
    """
    url = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"

    @property
    def raw_file_names(self):
        """Name of the single raw file this dataset needs."""
        return 'ratings.dat'

    def download(self):
        """Fetch the zip, keep ``ratings.dat`` in ``raw_dir`` and clean up the rest."""
        import shutil

        archive = download_url(self.url, self.raw_dir)
        extract_zip(archive, self.raw_dir)
        unpacked_dir = os.path.join(self.raw_dir, 'ml-1m')
        shutil.move(os.path.join(unpacked_dir, self.raw_file_names), self.raw_dir)
        shutil.rmtree(unpacked_dir)
        os.unlink(archive)

    def load_ratings_df(self):
        """Read the raw ``::``-separated ratings file into a four-column frame."""
        ratings = pd.read_csv(
            self.raw_paths[0],
            sep='::',
            header=None,
            engine='python',
        )
        ratings.columns = ['uid', 'sid', 'rating', 'timestamp']
        return ratings

In [None]:
#export
class ML1mDataModule_v2(SequentialDataModule):
    """Data module whose dataset class is ``ML1mDataset_v2``."""
    dataset_cls = ML1mDataset_v2

In [None]:
# Example configuration for the sequential ML1m data module; the whole
# attribute dict is forwarded as keyword arguments in the next cell.
# NOTE(review): several entries (channels, dropout, learning_rate, log_dir, ...)
# look like model/trainer settings rather than data options — presumably the
# data module ignores what it does not need; confirm against SequentialDataModule.
class Args:
    def __init__(self):
        self.pad = 0
        self.mask = 1
        self.cap = 0
        self.seed = 42
        self.vocab_size = 10000
        self.channels = 128
        self.dropout = 0.4
        self.learning_rate = 1e-4
        self.history_size = 30
        self.data_dir = '/content/data'
        self.log_dir = '/content/recommender_logs'
        self.model_dir = '/content/recommender_models'
        self.batch_size = 32
        self.shuffle = True
        self.max_epochs = 2
        self.val_epoch = 1
        self.gpus = None
        self.monitor = 'valid_loss'
        self.mode = 'min'

args = Args()

In [None]:
# ``args.__dict__`` already carries ``data_dir``, so it is the only thing that
# needs to be forwarded (the former extra ``data_sir=`` keyword was a typo).
ds = ML1mDataModule_v2(**args.__dict__)
ds.prepare_data()

In [None]:
#export
class ML1mDataset_v3(SequentialDataset):
    """MovieLens-1M sequence dataset with encoded user features.

    ``process`` turns every user's time-ordered ratings into windows (through
    ``self.create_sequences``), joins the encoded user columns and writes
    ``train.csv``, ``valid.csv`` and ``test.csv``. An instance serves the rows
    of the split selected by ``data_type`` from ``ratings_frame``.

    NOTE(review): ``create_sequences``, ``history_size`` and ``step_size`` are
    not defined in this class — presumably provided by ``SequentialDataset``.
    """
    url = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"

    def __init__(self, data_dir, data_type='train', *args, **kwargs):
        """Initialise the base dataset, then load the CSV of the requested split.

        ``ratings_frame`` is only assigned for ``'train'``, ``'valid'`` and
        ``'test'``; any other ``data_type`` leaves it unset.
        """
        super().__init__(data_dir, data_type, *args, **kwargs)
        if data_type == 'train':
            self.ratings_frame = pd.read_csv(self.processed_paths[0], delimiter=",")
        elif data_type == 'valid':
            self.ratings_frame = pd.read_csv(self.processed_paths[1], delimiter=",")
        elif data_type == 'test':
            self.ratings_frame = pd.read_csv(self.processed_paths[2], delimiter=",")

    @property
    def raw_file_names(self):
        """Raw files kept from the archive: ratings, movies and users."""
        return ['ratings.dat', 'movies.dat', 'users.dat']

    @property
    def processed_file_names(self):
        """Processed split files, in the order train, valid, test."""
        return ['train.csv', 'valid.csv', 'test.csv']

    def download(self):
        """Fetch the zip, move the needed raw files into ``raw_dir`` and clean up."""
        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        from shutil import move, rmtree
        for raw_file_name in self.raw_file_names:
            move(os.path.join(self.raw_dir, 'ml-1m', raw_file_name), self.raw_dir)
        rmtree(os.path.join(self.raw_dir, 'ml-1m'))
        os.unlink(path)

    def load_ratings_df(self):
        """Read ``ratings.dat`` into a frame with uid, sid, rating, timestamp."""
        df = pd.read_csv(self.raw_paths[0], sep='::', header=None, engine='python')
        df.columns = ['uid', 'sid', 'rating', 'timestamp']
        return df

    def load_movies_df(self):
        """Read ``movies.dat`` into a frame with sid, title, genres."""
        df = pd.read_csv(self.raw_paths[1], sep='::', header=None, engine='python')
        df.columns = ["sid", "title", "genres"]
        return df

    def load_users_df(self):
        """Read ``users.dat`` into a frame with uid, sex, age_group, occupation, zip_code."""
        df = pd.read_csv(self.raw_paths[2], sep='::', header=None, engine='python')
        df.columns = ["uid", "sex", "age_group", "occupation", "zip_code"]
        return df

    def process(self):
        """Build per-user rating sequences and write the split CSV files.

        The train/held-out split is drawn with ``np.random.rand`` and is not
        seeded inside this method. The held-out rows are written to both
        ``valid.csv`` and ``test.csv``.
        """
        ## movies
        # NOTE(review): the movie features built here are not written to any
        # output file by this method.
        movies = self.load_movies_df()
        # The four characters before the final one of the title are taken as the year.
        movies["year"] = movies["title"].apply(lambda x: x[-5:-1])
        movies.year = pd.Categorical(movies.year)
        movies["year"] = movies.year.cat.codes
        movies["sid"] = movies["sid"].astype(str)

        genres = ["Action","Adventure","Animation","Children's","Comedy","Crime",
                "Documentary","Drama","Fantasy","Film-Noir","Horror","Musical",
                "Mystery","Romance","Sci-Fi","Thriller","War","Western"]

        # One 0/1 indicator column per genre.
        for genre in genres:
            movies[genre] = movies["genres"].apply(
                lambda values: int(genre in values.split("|"))
            )

        ## users
        # Replace every categorical user attribute by its integer category code.
        users = self.load_users_df()
        users.sex = pd.Categorical(users.sex)
        users["sex"] = users.sex.cat.codes
        users.age_group = pd.Categorical(users.age_group)
        users["age_group"] = users.age_group.cat.codes
        users.occupation = pd.Categorical(users.occupation)
        users["occupation"] = users.occupation.cat.codes
        users.zip_code = pd.Categorical(users.zip_code)
        users["zip_code"] = users.zip_code.cat.codes
        users["uid"] = users["uid"].astype(str)

        # ratings
        ratings = self.load_ratings_df()
        ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
        ratings["sid"] = ratings["sid"].astype(str)
        ratings["uid"] = ratings["uid"].astype(str)

        # Transform the movie ratings data into sequences.
        # First, sort the ratings by timestamp and group the sid and rating
        # values by uid. The resulting frame has one record per uid with
        # ordered lists (sorted by rating datetime): the movies they have
        # rated and their ratings of these movies.

        ratings_group = ratings.sort_values(by=["timestamp"]).groupby("uid")

        ratings_data = pd.DataFrame(
            data={
                "uid": list(ratings_group.groups.keys()),
                "sids": list(ratings_group.sid.apply(list)),
                "ratings": list(ratings_group.rating.apply(list)),
                "timestamps": list(ratings_group.timestamp.apply(list)),
            }
        )

        # Now split the sid list of every user into sequences, and do the same
        # for the ratings. ``self.history_size`` and ``self.step_size`` are
        # passed straight to ``self.create_sequences``.
        ratings_data.sids = ratings_data.sids.apply(
            lambda ids: self.create_sequences(ids, self.history_size, self.step_size)
        )
        ratings_data.ratings = ratings_data.ratings.apply(
            lambda ids: self.create_sequences(ids, self.history_size, self.step_size)
        )
        del ratings_data["timestamps"]

        # After that, put each sequence in its own record and join the user
        # features onto the ratings data.
        ratings_data_movies = ratings_data[["uid", "sids"]].explode(
            "sids", ignore_index=True
        )
        ratings_data_rating = ratings_data[["ratings"]].explode("ratings", ignore_index=True)
        ratings_data_transformed = pd.concat([ratings_data_movies, ratings_data_rating], axis=1)
        ratings_data_transformed = ratings_data_transformed.join(
            users.set_index("uid"), on="uid"
        )
        # Sequences are stored as comma-separated strings in the CSV files.
        ratings_data_transformed.sids = ratings_data_transformed.sids.apply(
            lambda x: ",".join(x)
        )
        ratings_data_transformed.ratings = ratings_data_transformed.ratings.apply(
            lambda x: ",".join([str(v) for v in x])
        )
        del ratings_data_transformed["zip_code"]
        ratings_data_transformed.rename(
            columns={"sids": "sequence_sids", "ratings": "sequence_ratings"},
            inplace=True,
        )
        # Finally, split the rows at random: roughly 85% for training and the
        # remaining 15% held out, then store them as CSV files.
        random_selection = np.random.rand(len(ratings_data_transformed.index)) <= 0.85
        train_data = ratings_data_transformed[random_selection]
        test_data = ratings_data_transformed[~random_selection]

        # save (valid.csv and test.csv receive the same held-out rows)
        train_data.to_csv(self.processed_paths[0], index=False, sep=",")
        test_data.to_csv(self.processed_paths[1], index=False, sep=",")
        test_data.to_csv(self.processed_paths[2], index=False, sep=",")

    def __len__(self):
        """Number of sequence rows in the loaded split."""
        return len(self.ratings_frame)

    @staticmethod
    def _parse_int_sequence(text):
        """Parse a comma-separated string such as ``"3186,1721"`` into a list of ints."""
        return [int(token) for token in str(text).split(",")]

    def __getitem__(self, idx):
        """Return one training example for row ``idx`` of ``ratings_frame``.

        The last element of each stored sequence is the target; the preceding
        elements form the history.

        :return: tuple ``(user_id, movie_history, target_movie_id,
            movie_history_ratings, target_movie_rating, sex, age_group,
            occupation)`` where the two histories are ``torch.LongTensor``.
        """
        data = self.ratings_frame.iloc[idx]
        user_id = data.uid
        # Parse the stored strings explicitly instead of evaluating file
        # contents as Python code; this also handles one-element sequences.
        movie_history = self._parse_int_sequence(data.sequence_sids)
        movie_history_ratings = self._parse_int_sequence(data.sequence_ratings)
        target_movie_id = movie_history[-1]
        target_movie_rating = movie_history_ratings[-1]
        movie_history = torch.LongTensor(movie_history[:-1])
        movie_history_ratings = torch.LongTensor(movie_history_ratings[:-1])
        sex, age_group, occupation = data.sex, data.age_group, data.occupation
        output = (user_id, movie_history, target_movie_id, movie_history_ratings,
                  target_movie_rating, sex, age_group, occupation)
        return output

In [None]:
#export
class ML1mDataModule_v3(SequentialDataModule):
    """Data module whose dataset class is ``ML1mDataset_v3``."""
    dataset_cls = ML1mDataset_v3

Example

In [None]:
# Example configuration for ML1mDataset_v3 / ML1mDataModule_v3; the attribute
# dict is forwarded as keyword arguments in the cells below.
# ``history_size`` and ``step_size`` match the attribute names that
# ML1mDataset_v3.process passes to ``create_sequences``.
# NOTE(review): how the remaining options are consumed depends on the base
# classes, which are not shown here — confirm.
class Args:
    def __init__(self):
        self.history_size = 8
        self.step_size = 1
        self.data_dir = '/content/data'
        self.log_dir = '/content/recommender_logs'
        self.model_dir = '/content/recommender_models'
        self.batch_size = 32
        self.shuffle = True
        self.max_epochs = 2
        self.val_epoch = 1
        self.gpus = None
        self.monitor = 'valid_loss'
        self.mode = 'min'
        self.window_size = 20

args = Args()

In [None]:
# Instantiate the dataset directly (data_type defaults to 'train') and inspect
# the first rows and the schema of the loaded split.
ds = ML1mDataset_v3(**args.__dict__)
display(ds.ratings_frame.head())
ds.ratings_frame.info()

Unnamed: 0,uid,sequence_sids,sequence_ratings,sex,age_group,occupation
0,1,31861721127010222340183634081207,44553544,0,0,10
1,1,17211270102223401836340812072804,45535445,0,0,10
2,1,1270102223401836340812072804260,55354454,0,0,10
3,1,234018363408120728042607201193,35445435,0,0,10
4,1,18363408120728042607201193919,54454354,0,0,10


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 819470 entries, 0 to 819469
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   uid               819470 non-null  int64 
 1   sequence_sids     819470 non-null  object
 2   sequence_ratings  819470 non-null  object
 3   sex               819470 non-null  int64 
 4   age_group         819470 non-null  int64 
 5   occupation        819470 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 37.5+ MB


In [None]:
# Same configuration, this time through the data module.
ds = ML1mDataModule_v3(**args.__dict__)
ds.prepare_data()

ds.setup()

# Print a single training batch as a smoke test, then stop.
for batch in ds.train_dataloader():
    print(batch)
    break

  "DataModule property `train_transforms` was deprecated in v1.5 and will be removed in v1.7."


[tensor([ 214,  802, 2063, 2743, 2106,  675,  388, 4956,   73, 2063, 5585, 4115,
         889, 3044, 5449, 3346, 5253, 1980, 1833, 1714, 2860, 5323, 1764, 1631,
        1790,  482, 5852,  536, 2241, 3620, 2409, 3626]), tensor([[2027,  673,  518,  413, 1894, 1918, 2335],
        [3219,  165, 3697, 2617,  288,  707,   18],
        [3072, 2926, 1307, 3424, 1257, 3210, 1965],
        [3502, 1037,  547,  330, 1391, 2694, 2722],
        [  39, 1057, 3477, 3409,  453, 2005, 3481],
        [2710, 2694, 3298, 2770, 3005, 2355, 2433],
        [ 931, 3386, 1097,  161, 1270, 1225, 1957],
        [2336, 2313, 2291, 1196, 3125, 1183,  443],
        [  70, 3484, 3821, 2399,  240, 3864, 3578],
        [1677,  315,  459, 3064, 3774, 3593, 3751],
        [3783, 3408, 3897, 3911, 3893, 3753, 3649],
        [2105,  170,  533, 1562, 2616,  485,  750],
        [2692,   25, 1247,  265, 1393, 1277, 2020],
        [ 858, 1198,  318,   36, 3421, 1225,  903],
        [1722,  736, 3638, 2617, 2376, 1552,  786],
 

In [None]:
#export
class ML1mDataset_v4(InteractionsDataset):
    """MovieLens-1M ratings split into train / test sessions stored as JSON.

    ``process`` writes ``data.json`` with three lists of per-user record
    lists: ``x_train``, ``x_test`` and ``y_test`` (the latest test rating of
    each user).
    """
    url = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"

    @property
    def raw_file_names(self):
        """Name of the single raw file this dataset needs."""
        return 'ratings.dat'

    @property
    def processed_file_names(self):
        """Name of the processed JSON file."""
        return 'data.json'

    def download(self):
        """Fetch the zip, keep ``ratings.dat`` in ``raw_dir`` and clean up the rest."""
        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        from shutil import move, rmtree
        move(os.path.join(self.raw_dir, 'ml-1m', self.raw_file_names), self.raw_dir)
        rmtree(os.path.join(self.raw_dir, 'ml-1m'))
        os.unlink(path)

    def load_ratings_df(self):
        """Read the raw ratings and return one row per (uid, sid) pair."""
        df = pd.read_csv(self.raw_paths[0], sep='::', header=None, engine='python')
        df.columns = ['uid', 'sid', 'rating', 'timestamp']
        # For repeated user-item pairs only the last row (in file order) survives.
        df.drop_duplicates(subset=['uid', 'sid'], keep='last', inplace=True)
        return df

    def process(self):
        """Split the ratings, carve out the test targets and dump everything to JSON."""
        df = self.load_ratings_df()
        train, test = stratified_split_v2(df, ratio=0.7, col_user='uid', col_item='sid')

        # Keep only test users that have at least ``min_session_length`` rows,
        # so every kept user has an input part and a target.
        min_session_length = 2
        session_len = test.groupby('uid').size()
        long_enough_users = session_len[session_len >= min_session_length].index
        test = test[test['uid'].isin(long_enough_users)]

        # The row with the largest timestamp per user becomes the target
        # (y_test); the user's other test rows form the input (x_test).
        last_items = test.sort_values(by=['uid', 'timestamp']).groupby('uid').timestamp.idxmax()
        y_test = test.loc[last_items]
        x_test = test[~test.index.isin(y_test.index)]

        data = dict()

        def formatRecords(g):
            # Rows are zipped positionally with the keys; zip stops at the
            # shortest input, so any column after the third is dropped.
            keys = ['uid','sid','rating']
            result = []
            for item in g.values.tolist():
                item = dict(zip(keys, item))
                result.append(item)
            return result

        data['x_train'] = list(train.groupby('uid').apply(formatRecords).to_dict().values())
        data['x_test'] = list(x_test.groupby('uid').apply(formatRecords).to_dict().values())
        data['y_test'] = list(y_test.groupby('uid').apply(formatRecords).to_dict().values())

        with open(self.processed_paths[0], "w") as f:
            json.dump(data, f)

    def load(self):
        """Return the dictionary stored in the processed JSON file."""
        with open(self.processed_paths[0]) as f:
            data = json.load(f)
        return data

In [None]:
!tree -h --du -C "{args.data_dir}"

[01;34m/content/data[00m
├── [ 11M]  [01;34mprocessed[00m
│   ├── [2.3M]  data_test_neg.pt
│   ├── [ 95K]  data_test_pos.pt
│   ├── [6.5M]  data_train.pt
│   ├── [2.3M]  data_valid_neg.pt
│   └── [ 95K]  data_valid_pos.pt
└── [ 23M]  [01;34mraw[00m
    └── [ 23M]  ratings.dat

  35M used in 2 directories, 6 files


## ML100k Dataset

In [None]:
#export
class ML100kDataset(Dataset):
    """MovieLens-100k raw files.

    Only the download step is implemented; ``processed_file_names`` and
    ``process`` raise ``NotImplementedError``.
    """
    url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'

    def __init__(self, root):
        super().__init__(root)

    @property
    def raw_file_names(self) -> List[str]:
        """Names of the files expected in ``raw_dir``."""
        return ['u1.base', 'u1.test', 'u4.test', 'allbut.pl', 'u.item',
                'ua.test', 'u.occupation', 'u3.test', 'u5.base', 'ub.test',
                'u2.test', 'u3.base', 'u.genre', 'u.data', 'u4.base',
                'u5.test', 'u.info', 'README', 'ub.base', 'mku.sh', 'u2.base',
                'u.user', 'ua.base']

    @property
    def processed_file_names(self) -> str:
        raise NotImplementedError

    def download(self):
        """Fetch the zip and move every extracted file directly into ``raw_dir``."""
        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        from shutil import move, rmtree
        extracted_dir = os.path.join(self.raw_dir, 'ml-100k')
        for file_name in os.listdir(extracted_dir):
            move(os.path.join(extracted_dir, file_name), self.raw_dir)
        rmtree(extracted_dir)
        os.unlink(path)

    def process(self):
        raise NotImplementedError

## MovieLens Dataset Transformation
> Implementation of transformation functions specific to movielens datasets.

In [None]:
#export
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from collections import defaultdict

In [None]:
#export
def sparseFeature(feat, feat_num, embed_dim=4):
    """
    Describe a sparse feature as a plain dictionary.
    :param feat: feature name
    :param feat_num: number of distinct values the feature can take
    :param embed_dim: embedding dimension
    :return: dict with the keys 'feat', 'feat_num' and 'embed_dim'
    """
    return dict(feat=feat, feat_num=feat_num, embed_dim=embed_dim)

In [None]:
#export
def create_ml_1m_dataset(file, trans_score=2, embed_dim=8, test_neg_num=100):
    """
    Build (user, positive item, negative item) splits from a MovieLens-1M ratings file.

    For every user the interactions are ordered by timestamp; the last one goes
    to the test split, the second to last to the validation split and the
    remaining ones (except the very first) to the training split.

    :param file: A string. dataset path.
    :param trans_score: A scalar. Only rows with a rating >= trans_score are kept.
    :param embed_dim: A scalar. latent factor.
    :param test_neg_num: A scalar. Controls the number of test negative samples;
        each test row receives test_neg_num + 1 negatives.
    :return: feat_col, train, val, test. feat_col is a list with the user and
        item sparse-feature dicts. train and val are lists of three arrays
        [user_id, pos_id, neg_id] whose rows are shuffled together. test has
        the same layout, is not shuffled, and its neg_id array holds one row
        of negatives per user.
    """
    print('==========Data Preprocess Start=============')
    data_df = pd.read_csv(file, sep="::", engine='python',
                          names=['user_id', 'item_id', 'label', 'Timestamp'])
    # filtering: drop items with fewer than 5 ratings
    data_df['item_count'] = data_df.groupby('item_id')['item_id'].transform('count')
    data_df = data_df[data_df.item_count >= 5]
    # trans score
    data_df = data_df[data_df.label >= trans_score]
    # sort
    data_df = data_df.sort_values(by=['user_id', 'Timestamp'])
    # split dataset and negative sampling
    print('============Negative Sampling===============')
    train_data, val_data, test_data = defaultdict(list), defaultdict(list), defaultdict(list)
    item_id_max = data_df['item_id'].max()
    for user_id, df in tqdm(data_df[['user_id', 'item_id']].groupby('user_id')):
        pos_list = df['item_id'].tolist()
        # Built once per user so the rejection loop does O(1) lookups.
        pos_set = set(pos_list)

        def gen_neg():
            # Start from a known positive so the loop draws at least once.
            neg = pos_list[0]
            while neg in pos_set:
                neg = random.randint(1, item_id_max)
            return neg

        neg_list = [gen_neg() for i in range(len(pos_list) + test_neg_num)]
        for i in range(1, len(pos_list)):
            if i == len(pos_list) - 1:
                test_data['user_id'].append(user_id)
                test_data['pos_id'].append(pos_list[i])
                test_data['neg_id'].append(neg_list[i:])
            elif i == len(pos_list) - 2:
                val_data['user_id'].append(user_id)
                val_data['pos_id'].append(pos_list[i])
                val_data['neg_id'].append(neg_list[i])
            else:
                train_data['user_id'].append(user_id)
                train_data['pos_id'].append(pos_list[i])
                train_data['neg_id'].append(neg_list[i])
    # feature columns
    user_num, item_num = data_df['user_id'].max() + 1, data_df['item_id'].max() + 1
    feat_col = [sparseFeature('user_id', user_num, embed_dim),
                sparseFeature('item_id', item_num, embed_dim)]

    def to_arrays(split, shuffle):
        # Convert one split to its three arrays; when shuffling, the same
        # permutation is applied to all of them so rows stay aligned.
        # (random.shuffle cannot be applied to the dict itself.)
        arrays = [np.array(split['user_id']), np.array(split['pos_id']),
                  np.array(split['neg_id'])]
        if shuffle:
            order = list(range(len(arrays[0])))
            random.shuffle(order)
            order = np.array(order, dtype=np.intp)
            arrays = [arr[order] for arr in arrays]
        return arrays

    # shuffle train and validation rows; the test split keeps its user order
    train = to_arrays(train_data, shuffle=True)
    val = to_arrays(val_data, shuffle=True)
    test = to_arrays(test_data, shuffle=False)
    print('============Data Preprocess End=============')
    return feat_col, train, val, test

In [None]:
#export
def create_implicit_ml_1m_dataset(file, trans_score=2, embed_dim=8, maxlen=40, test_neg_num=100):
    """
    Build labelled (user, history, target item) samples from a MovieLens-1M ratings file.

    For every user the interactions are ordered by timestamp. Each position
    after the first yields a positive sample (label 1) whose history is the
    preceding items, plus sampled negatives (label 0) with the same history.
    The last position goes to the test split, the second to last to the
    validation split and the others to the training split.

    :param file: A string. dataset path.
    :param trans_score: A scalar. Only rows with a rating >= trans_score are kept.
    :param embed_dim: A scalar. latent factor.
    :param maxlen: A scalar. Length the history sequences are padded to.
    :param test_neg_num: A scalar. Controls the number of test negative samples;
        each user's test positive is followed by test_neg_num + 1 negatives.
    :return: feature_columns, (train_X, train_y), (val_X, val_y), (test_X, test_y).
        Each X is a list [user_id array, padded history array, target_item array].
        train_y and val_y are arrays, test_y is a list.
    """
    print('==========Data Preprocess Start=============')
    data_df = pd.read_csv(file, sep="::", engine='python',
                          names=['user_id', 'item_id', 'label', 'Timestamp'])
    # implicit dataset
    data_df = data_df[data_df.label >= trans_score]

    # sort
    data_df = data_df.sort_values(by=['user_id', 'Timestamp'])

    train_data, val_data, test_data = [], [], []

    item_id_max = data_df['item_id'].max()
    for user_id, df in tqdm(data_df[['user_id', 'item_id']].groupby('user_id')):
        pos_list = df['item_id'].tolist()
        # Built once per user so the rejection loop does O(1) lookups.
        pos_set = set(pos_list)

        def gen_neg():
            # Start from a known positive so the loop draws at least once.
            neg = pos_list[0]
            while neg in pos_set:
                neg = random.randint(1, item_id_max)
            return neg

        neg_list = [gen_neg() for i in range(len(pos_list) + test_neg_num)]
        for i in range(1, len(pos_list)):
            hist_i = pos_list[:i]
            if i == len(pos_list) - 1:
                test_data.append([user_id, hist_i, pos_list[i], 1])
                for neg in neg_list[i:]:
                    test_data.append([user_id, hist_i, neg, 0])
            elif i == len(pos_list) - 2:
                val_data.append([user_id, hist_i, pos_list[i], 1])
                val_data.append([user_id, hist_i, neg_list[i], 0])
            else:
                train_data.append([user_id, hist_i, pos_list[i], 1])
                train_data.append([user_id, hist_i, neg_list[i], 0])
    # item feature columns
    user_num, item_num = data_df['user_id'].max() + 1, data_df['item_id'].max() + 1
    feature_columns = [sparseFeature('user_id', user_num, embed_dim),
                       sparseFeature('item_id', item_num, embed_dim)]

    # shuffle (the test rows keep their order: each positive followed by its negatives)
    random.shuffle(train_data)
    random.shuffle(val_data)
    # random.shuffle(test_data)

    # create dataframe
    train = pd.DataFrame(train_data, columns=['user_id', 'hist', 'target_item', 'label'])
    val = pd.DataFrame(val_data, columns=['user_id', 'hist', 'target_item', 'label'])
    test = pd.DataFrame(test_data, columns=['user_id', 'hist', 'target_item', 'label'])

    print('==================Padding===================')
    # Imported here so TensorFlow is only required when this function is called.
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    train_X = [train['user_id'].values, pad_sequences(train['hist'], maxlen=maxlen), train['target_item'].values]
    train_y = train['label'].values
    val_X = [val['user_id'].values, pad_sequences(val['hist'], maxlen=maxlen), val['target_item'].values]
    val_y = val['label'].values
    test_X = [test['user_id'].values, pad_sequences(test['hist'], maxlen=maxlen), test['target_item'].values]
    test_y = test['label'].values.tolist()
    print('============Data Preprocess End=============')
    return feature_columns, (train_X, train_y), (val_X, val_y), (test_X, test_y)

In [None]:
!wget -q --show-progress https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [None]:
# Parameters for the two dataset-creation calls below; the path points at the
# archive extracted by the previous cell.
file = 'ml-1m/ratings.dat'
test_neg_num = 100
embed_dim = 64
trans_score = 1
maxlen = 200

In [None]:
# Build the (user, positive, negative) array splits.
feature_columns, train, val, test = create_ml_1m_dataset(file, trans_score, embed_dim, test_neg_num)



100%|██████████| 6040/6040 [00:44<00:00, 134.33it/s]




In [None]:
feature_columns

[{'embed_dim': 64, 'feat': 'user_id', 'feat_num': 6041},
 {'embed_dim': 64, 'feat': 'item_id', 'feat_num': 3953}]

In [None]:
train

[array([   1,    1,    1, ..., 6040, 6040, 6040]),
 array([1270, 1721, 1022, ..., 2917, 1921, 1784]),
 array([2152, 1229, 3617, ..., 2960, 3686, 1569])]

In [None]:
val

[array([   1,    2,    3, ..., 6038, 6039, 6040]),
 array([1907, 1544, 3868, ..., 2700, 1204,  161]),
 array([3132, 1812,  271, ..., 1697, 2718, 3572])]

In [None]:
test

[array([   1,    2,    3, ..., 6038, 6039, 6040]),
 array([  48, 1917, 2081, ..., 1183, 1254, 1221]),
 array([[ 426, 1915, 2201, ..., 1687, 2916, 1266],
        [3294, 2362,  167, ..., 1322, 2715, 3013],
        [2973, 3000, 1832, ...,  514, 2845, 1901],
        ...,
        [1258,  335, 3638, ..., 3582, 2221,  763],
        [1767, 2924,  691, ..., 1624, 2493,  371],
        [1106, 3048, 1940, ..., 3520, 2102, 2275]])]

In [None]:
# Rebinds feature_columns/train/val/test: each split is now an (X, y) pair
# instead of a list of three arrays.
feature_columns, train, val, test = create_implicit_ml_1m_dataset(file, trans_score, embed_dim, maxlen)



100%|██████████| 6040/6040 [00:35<00:00, 170.13it/s]




In [None]:
feature_columns

[{'embed_dim': 64, 'feat': 'user_id', 'feat_num': 6041},
 {'embed_dim': 64, 'feat': 'item_id', 'feat_num': 3953}]

In [None]:
train

([array([5534, 3031, 1764, ..., 3159, 2137, 3129]),
  array([[   0,    0,    0, ...,  349, 1356, 1580],
         [   0,    0,    0, ..., 3255, 2108,  507],
         [2322, 3316,    9, ..., 2502, 1476, 2759],
         ...,
         [   0,    0,    0, ..., 1258, 1240, 1270],
         [   0,    0,    0, ..., 2038, 1831,   24],
         [2379, 3846, 3041, ..., 2391,  866, 3476]], dtype=int32),
  array([1372, 2309, 3052, ..., 1285, 2668, 2143])],
 array([1, 0, 1, ..., 1, 1, 0]))

In [None]:
val

([array([3468, 1903, 1902, ..., 5215, 2977, 3597]),
  array([[   0,    0,    0, ..., 3114,  593, 2345],
         [   0,    0,    0, ..., 1201, 3671, 3681],
         [1754,   44,  247, ..., 1092, 3005, 2605],
         ...,
         [   0,    0,    0, ..., 1252,  720,  745],
         [   0,    0,    0, ..., 2581, 2724, 2763],
         [   0,    0,    0, ..., 2096, 2137, 1032]], dtype=int32),
  array([3951, 1202,  832, ..., 3177,  476, 1029])],
 array([0, 0, 1, ..., 0, 0, 1]))

In [None]:
#hide
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2022-01-10 10:04:23

recohut: 0.0.10

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

IPython: 5.5.0
pandas : 1.1.5

