In [None]:
# default_exp transforms.datasets.movielens

# MovieLens Dataset Transformation
> Implementation of transformation functions specific to MovieLens datasets.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from collections import defaultdict

In [None]:
#export
def sparseFeature(feat, feat_num, embed_dim=4):
    """
    Describe one sparse feature as a plain dictionary.
    :param feat: feature name
    :param feat_num: the total number of sparse features that do not repeat
    :param embed_dim: embedding dimension
    :return: dict with the keys 'feat', 'feat_num' and 'embed_dim'
    """
    spec = dict(feat=feat, feat_num=feat_num, embed_dim=embed_dim)
    return spec

In [None]:
#export
def create_ml_1m_dataset(file, trans_score=2, embed_dim=8, test_neg_num=100):
    """
    Build train/val/test triples (user, positive item, negative item) from a
    MovieLens-1M style ratings file.

    Items with fewer than 5 ratings and ratings below ``trans_score`` are dropped.
    Each user's remaining items are ordered by timestamp; the first item is never
    used as a target. The last item goes to test, the second-to-last to val and
    everything in between to train. Negatives are item ids drawn from
    ``[1, max item id]`` that are not among the user's positives (ids removed by
    the filters above can still be drawn).

    :param file: A string. dataset path ('::'-separated user_id, item_id, label, Timestamp).
    :param trans_score: A scalar. Ratings below it are discarded.
    :param embed_dim: A scalar. latent factor.
    :param test_neg_num: A scalar. Controls the number of test negative samples;
        each test row carries ``test_neg_num + 1`` negatives.
    :return: feat_col, train, val, test. ``feat_col`` is a list holding the user
        and item sparse-feature dicts; ``train``/``val``/``test`` are lists of three
        numpy arrays ``[user_id, pos_id, neg_id]``. Train and val rows are
        shuffled (rows stay aligned across the three arrays); test rows are not.
    """
    print('==========Data Preprocess Start=============')
    data_df = pd.read_csv(file, sep="::", engine='python',
                          names=['user_id', 'item_id', 'label', 'Timestamp'])
    # filtering: keep only items that were rated at least 5 times
    data_df['item_count'] = data_df.groupby('item_id')['item_id'].transform('count')
    data_df = data_df[data_df.item_count >= 5]
    # trans score
    data_df = data_df[data_df.label >= trans_score]
    # sort
    data_df = data_df.sort_values(by=['user_id', 'Timestamp'])
    # split dataset and negative sampling
    print('============Negative Sampling===============')
    train_data, val_data, test_data = defaultdict(list), defaultdict(list), defaultdict(list)
    item_id_max = data_df['item_id'].max()
    for user_id, df in tqdm(data_df[['user_id', 'item_id']].groupby('user_id')):
        pos_list = df['item_id'].tolist()
        # Built once per user; rebuilding it on every membership test was wasted work.
        pos_set = set(pos_list)

        def gen_neg():
            # Rejection sampling: start from a known positive so the loop runs at least once.
            neg = pos_list[0]
            while neg in pos_set:
                neg = random.randint(1, item_id_max)
            return neg

        neg_list = [gen_neg() for _ in range(len(pos_list) + test_neg_num)]
        last = len(pos_list) - 1
        for i in range(1, len(pos_list)):
            if i == last:
                # The test row takes every negative that is left: test_neg_num + 1 of them.
                split, neg = test_data, neg_list[i:]
            elif i == last - 1:
                split, neg = val_data, neg_list[i]
            else:
                split, neg = train_data, neg_list[i]
            split['user_id'].append(user_id)
            split['pos_id'].append(pos_list[i])
            split['neg_id'].append(neg)
    # feature columns
    user_num, item_num = data_df['user_id'].max() + 1, data_df['item_id'].max() + 1
    feat_col = [sparseFeature('user_id', user_num, embed_dim),
                sparseFeature('item_id', item_num, embed_dim)]

    def as_arrays(split, shuffle):
        # Convert one split to [user_id, pos_id, neg_id] arrays, optionally shuffling rows.
        columns = [np.array(split[key]) for key in ('user_id', 'pos_id', 'neg_id')]
        if shuffle:
            # random.shuffle() on the dict itself does not shuffle anything (and, with
            # a defaultdict, silently inserts integer keys). Shuffle one row
            # permutation instead and apply it to every column so rows stay aligned.
            order = list(range(len(columns[0])))
            random.shuffle(order)
            order = np.array(order, dtype=np.intp)
            columns = [col[order] for col in columns]
        return columns

    # shuffle (train and val only)
    train = as_arrays(train_data, shuffle=True)
    val = as_arrays(val_data, shuffle=True)
    test = as_arrays(test_data, shuffle=False)
    print('============Data Preprocess End=============')
    return feat_col, train, val, test

In [None]:
#export
def create_implicit_ml_1m_dataset(file, trans_score=2, embed_dim=8, maxlen=40, test_neg_num=100):
    """
    Build labelled (user, history, target item) samples from a MovieLens-1M
    style ratings file.

    Ratings below ``trans_score`` are dropped and each user's items are ordered by
    timestamp. For every position after the first, the history is the list of
    earlier items. The last position goes to test, the second-to-last to val and
    the rest to train. Every train/val positive (label 1) is paired with one
    sampled negative (label 0); the test positive is paired with
    ``test_neg_num + 1`` negatives.

    :param file: A string. dataset path ('::'-separated user_id, item_id, label, Timestamp).
    :param trans_score: A scalar. Ratings below it are discarded.
    :param embed_dim: A scalar. latent factor.
    :param maxlen: A scalar. Length the histories are padded to by ``pad_sequences``.
    :param test_neg_num: A scalar. Controls the number of test negative samples
        (previously fixed at 100).
    :return: feature_columns, (train_X, train_y), (val_X, val_y), (test_X, test_y).
        Each ``*_X`` is ``[user_id array, padded history matrix, target_item array]``;
        ``train_y``/``val_y`` are numpy arrays and ``test_y`` is a Python list.
        Train and val rows are shuffled, test rows are not.
    """
    print('==========Data Preprocess Start=============')
    data_df = pd.read_csv(file, sep="::", engine='python',
                          names=['user_id', 'item_id', 'label', 'Timestamp'])
    # implicit dataset
    data_df = data_df[data_df.label >= trans_score]

    # sort
    data_df = data_df.sort_values(by=['user_id', 'Timestamp'])

    train_data, val_data, test_data = [], [], []

    item_id_max = data_df['item_id'].max()
    for user_id, df in tqdm(data_df[['user_id', 'item_id']].groupby('user_id')):
        pos_list = df['item_id'].tolist()
        # Set lookup instead of scanning the list on every rejection-sampling step.
        pos_set = set(pos_list)

        def gen_neg():
            # Rejection sampling: start from a known positive so the loop runs at least once.
            neg = pos_list[0]
            while neg in pos_set:
                neg = random.randint(1, item_id_max)
            return neg

        neg_list = [gen_neg() for _ in range(len(pos_list) + test_neg_num)]
        last = len(pos_list) - 1
        for i in range(1, len(pos_list)):
            hist_i = pos_list[:i]
            if i == last:
                test_data.append([user_id, hist_i, pos_list[i], 1])
                # All remaining negatives go to the test row: test_neg_num + 1 of them.
                for neg in neg_list[i:]:
                    test_data.append([user_id, hist_i, neg, 0])
            elif i == last - 1:
                val_data.append([user_id, hist_i, pos_list[i], 1])
                val_data.append([user_id, hist_i, neg_list[i], 0])
            else:
                train_data.append([user_id, hist_i, pos_list[i], 1])
                train_data.append([user_id, hist_i, neg_list[i], 0])
    # item feature columns
    user_num, item_num = data_df['user_id'].max() + 1, data_df['item_id'].max() + 1
    feature_columns = [sparseFeature('user_id', user_num, embed_dim),
                       sparseFeature('item_id', item_num, embed_dim)]

    # shuffle (test keeps its positive-then-negatives order per user)
    random.shuffle(train_data)
    random.shuffle(val_data)

    # create dataframe
    columns = ['user_id', 'hist', 'target_item', 'label']
    train = pd.DataFrame(train_data, columns=columns)
    val = pd.DataFrame(val_data, columns=columns)
    test = pd.DataFrame(test_data, columns=columns)

    print('==================Padding===================')
    # Imported here rather than at module level — presumably so TensorFlow is only
    # required when this function is actually used.
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    def model_inputs(frame):
        # [user ids, histories padded to maxlen, target items]
        return [frame['user_id'].values,
                pad_sequences(frame['hist'], maxlen=maxlen),
                frame['target_item'].values]

    train_X, train_y = model_inputs(train), train['label'].values
    val_X, val_y = model_inputs(val), val['label'].values
    test_X, test_y = model_inputs(test), test['label'].values.tolist()
    print('============Data Preprocess End=============')
    return feature_columns, (train_X, train_y), (val_X, val_y), (test_X, test_y)

In [None]:
# Fetch the MovieLens-1M archive and unpack it into ./ml-1m/.
# -nc skips the download when ml-1m.zip is already present, and -o overwrites
# previously extracted files without prompting, so the cell can be re-run.
!wget -q --show-progress -nc https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip -o ml-1m.zip

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [None]:
# Path to the ratings file extracted from the ml-1m archive.
file = 'ml-1m/ratings.dat'

# Arguments passed to the dataset builders in the cells below.
test_neg_num = 100   # extra negatives sampled per user for the test split
embed_dim = 64       # embedding size stored in the feature-column dicts
trans_score = 1      # ratings >= this value are kept
maxlen = 200         # length the interaction histories are padded to

# Negative sampling and shuffling use the `random` module; seed it so the
# examples below give the same result on every Restart & Run All.
seed = 42
random.seed(seed)

In [None]:
# Pairwise variant: each split is [user_id, pos_id, neg_id] numpy arrays.
feature_columns, train, val, test = create_ml_1m_dataset(
    file, trans_score=trans_score, embed_dim=embed_dim, test_neg_num=test_neg_num)



100%|██████████| 6040/6040 [00:44<00:00, 134.33it/s]




In [None]:
feature_columns

[{'embed_dim': 64, 'feat': 'user_id', 'feat_num': 6041},
 {'embed_dim': 64, 'feat': 'item_id', 'feat_num': 3953}]

In [None]:
train

[array([   1,    1,    1, ..., 6040, 6040, 6040]),
 array([1270, 1721, 1022, ..., 2917, 1921, 1784]),
 array([2152, 1229, 3617, ..., 2960, 3686, 1569])]

In [None]:
val

[array([   1,    2,    3, ..., 6038, 6039, 6040]),
 array([1907, 1544, 3868, ..., 2700, 1204,  161]),
 array([3132, 1812,  271, ..., 1697, 2718, 3572])]

In [None]:
test

[array([   1,    2,    3, ..., 6038, 6039, 6040]),
 array([  48, 1917, 2081, ..., 1183, 1254, 1221]),
 array([[ 426, 1915, 2201, ..., 1687, 2916, 1266],
        [3294, 2362,  167, ..., 1322, 2715, 3013],
        [2973, 3000, 1832, ...,  514, 2845, 1901],
        ...,
        [1258,  335, 3638, ..., 3582, 2221,  763],
        [1767, 2924,  691, ..., 1624, 2493,  371],
        [1106, 3048, 1940, ..., 3520, 2102, 2275]])]

In [None]:
# Labelled variant with padded histories: each split is an (X, y) pair.
# Note: this rebinds feature_columns/train/val/test from the previous example.
feature_columns, train, val, test = create_implicit_ml_1m_dataset(
    file, trans_score=trans_score, embed_dim=embed_dim, maxlen=maxlen)



100%|██████████| 6040/6040 [00:35<00:00, 170.13it/s]




In [None]:
feature_columns

[{'embed_dim': 64, 'feat': 'user_id', 'feat_num': 6041},
 {'embed_dim': 64, 'feat': 'item_id', 'feat_num': 3953}]

In [None]:
train

([array([5534, 3031, 1764, ..., 3159, 2137, 3129]),
  array([[   0,    0,    0, ...,  349, 1356, 1580],
         [   0,    0,    0, ..., 3255, 2108,  507],
         [2322, 3316,    9, ..., 2502, 1476, 2759],
         ...,
         [   0,    0,    0, ..., 1258, 1240, 1270],
         [   0,    0,    0, ..., 2038, 1831,   24],
         [2379, 3846, 3041, ..., 2391,  866, 3476]], dtype=int32),
  array([1372, 2309, 3052, ..., 1285, 2668, 2143])],
 array([1, 0, 1, ..., 1, 1, 0]))

In [None]:
val

([array([3468, 1903, 1902, ..., 5215, 2977, 3597]),
  array([[   0,    0,    0, ..., 3114,  593, 2345],
         [   0,    0,    0, ..., 1201, 3671, 3681],
         [1754,   44,  247, ..., 1092, 3005, 2605],
         ...,
         [   0,    0,    0, ..., 1252,  720,  745],
         [   0,    0,    0, ..., 2581, 2724, 2763],
         [   0,    0,    0, ..., 2096, 2137, 1032]], dtype=int32),
  array([3951, 1202,  832, ..., 3177,  476, 1029])],
 array([0, 0, 1, ..., 0, 0, 1]))

In [None]:
#hide
# Record author, last-updated date/time, machine details and the versions of
# the imported packages so the run environment is documented with the notebook.
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-12-20 12:31:12

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

numpy     : 1.19.5
tensorflow: 2.7.0
pandas    : 1.1.5
IPython   : 5.5.0

