In [None]:
# default_exp utils.grouping

# Grouping
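> Utilities for turning interaction logs into per-user item sequences and for generating user groups (with train/validation/test splits and negative samples) for group recommenders.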

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
import os
import shutil
import numpy as np
import pandas as pd
import scipy.sparse as sp
from collections import Counter
import csv
from pathlib import Path
import pickle

## User-items sequencing

In [None]:
#export
def create_user_sequences(df: pd.DataFrame,
                          user_col: str = 'USERID',
                          item_col: str = 'ITEMID',
                          ts_col: str = 'TIMESTAMP',
                          save_path: str = './data.txt') -> None:
    """
    Convert interaction dataframe into user-wise sequences

    The interactions are sorted by user id and timestamp, user and item ids
    are label-encoded to consecutive integers starting at 0, and one line per
    user is written to ``save_path`` in the form
    ``<user> <item_1> <item_2> ...`` (space separated, chronological order).
    The two encoding dictionaries (raw id -> encoded id) are pickled next to
    ``save_path`` as ``user_map.pkl`` and ``item_map.pkl``.

    Encoded ids are assigned in order of first appearance in the sorted
    interactions, so the mapping is reproducible across runs (iterating a
    ``set`` of string ids is not, because of hash randomization).

    The input dataframe is not modified.

    Args:
        df (DataFrame): interactions dataframe
        user_col (str): name of user id column
        item_col (str): name of item id column
        ts_col (str): name of timestamp column
        save_path (str): file path for saving processed data
    """
    # sort by user id and timestamp; sort_values returns a new frame, so the
    # caller's dataframe is left untouched
    ordered = df.sort_values(by=[user_col, ts_col])

    # label encode user id and item id; dict.fromkeys de-duplicates while
    # keeping first-appearance order, which makes the encoding deterministic
    umap = {u: i for i, u in enumerate(dict.fromkeys(ordered[user_col]))}
    smap = {s: i for i, s in enumerate(dict.fromkeys(ordered[item_col]))}
    encoded = pd.DataFrame({
        user_col: ordered[user_col].map(umap).values,
        item_col: ordered[item_col].map(smap).values,
    })

    # also save the dicts
    out_dir = Path(save_path).parent
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / 'user_map.pkl', 'wb') as f:
        pickle.dump(umap, f)
    with open(out_dir / 'item_map.pkl', 'wb') as f:
        pickle.dump(smap, f)

    # collect each user's item ids into a list (row order, i.e. chronological,
    # is preserved within every group)
    sequences = encoded.groupby(user_col)[item_col].apply(list)

    # store as a space-separated txt file; newline='' is what the csv module
    # requires so that it controls the line endings itself on every platform
    with open(save_path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=' ')
        for userid, items in sequences.items():
            writer.writerow([userid] + items)

Example

In [None]:
#hide
%cd /content
!wget -q http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -qq ml-100k.zip

In [None]:
import pandas as pd

# u.data is tab-separated and has no header row, so the column names are supplied here
ML100K_COLUMNS = ['USERID', 'ITEMID', 'RATING', 'TIMESTAMP']
df = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=ML100K_COLUMNS,
)
df.head()

Unnamed: 0,USERID,ITEMID,RATING,TIMESTAMP
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [None]:
from recohut.utils.splitting import chrono_split

# Split the interactions chronologically in an 80:20 ratio.
df_train, df_test = chrono_split(df, ratio=0.8)

# Validate the split on a single user: show the full history first,
# then the train part and the test part.
userid = 4
query = "USERID==@userid"

for frame in (df, df_train, df_test):
    display(frame.query(query))

Unnamed: 0,USERID,ITEMID,RATING,TIMESTAMP
1250,4,264,3,892004275
1329,4,303,5,892002352
2204,4,361,5,892002353
2526,4,357,4,892003525
3277,4,260,4,892004275
5960,4,356,3,892003459
12151,4,294,5,892004409
13893,4,288,4,892001445
16305,4,50,5,892003526
18930,4,354,5,892002353


Unnamed: 0,USERID,ITEMID,RATING,TIMESTAMP
24743,4,258,5,892001374
13893,4,288,4,892001445
20383,4,300,5,892001445
24519,4,328,3,892001537
20082,4,271,4,892001690
68273,4,359,5,892002352
71055,4,362,5,892002352
1329,4,303,5,892002352
51203,4,327,5,892002352
35313,4,329,5,892002352


Unnamed: 0,USERID,ITEMID,RATING,TIMESTAMP
76722,4,358,2,892004275
3277,4,260,4,892004275
1250,4,264,3,892004275
12151,4,294,5,892004409
48826,4,11,4,892004520


In [None]:
create_user_sequences(df_train, save_path='./train.txt')
!head train.txt

0 167 171 164 155 165 195 186 13 249 126 180 116 108 0 245 256 247 49 248 252 261 92 223 123 18 122 136 145 6 234 14 244 259 23 263 125 236 12 24 120 250 235 239 117 129 64 189 46 30 27 113 38 51 237 198 182 10 68 160 94 59 82 178 21 97 63 134 162 25 201 88 7 213 181 47 98 159 174 191 179 127 142 184 67 54 203 55 95 80 78 150 211 22 69 83 93 196 190 183 133 206 144 187 185 96 84 35 143 158 16 173 251 104 147 107 146 219 105 242 121 106 103 246 119 44 267 266 258 260 262 9 149 233 91 70 41 175 90 192 216 176 215 193 72 58 132 40 194 217 169 212 156 222 26 226 79 230 66 118 199 3 214 163 1 205 76 52 135 45 39 152 268 253 114 172 210 228 154 202 61 89 218 166 229 34 161 60 264 111 56 48 29 232 130 151 81 140 71 32 157 197 224 112 20 148 87 100 109 102 238 33 28 42 131 209 204 115 124
1 285 257 304 306 287 311 300 305 291 302 268 298 314 295 0 18 296 292 274 256 294 276 286 254 297 289 279 273 275 272 290 277 293 24 278 13 110 9 281 12 236 283 99 126 312 284 301 282 250 310
2 301 332 343

In [None]:
!wget -q https://github.com/RecoHut-Datasets/amazon_beauty/raw/v1/amazon-ratings.zip
!unzip -qq amazon-ratings.zip
!head ratings_Beauty.csv

UserId,ProductId,Rating,Timestamp
A39HTATAQ9V7YF,0205616461,5.0,1369699200
A3JM6GV9MNOF9X,0558925278,3.0,1355443200
A1Z513UWSAAO0F,0558925278,5.0,1404691200
A1WMRR494NWEWV,0733001998,4.0,1382572800
A3IAAVS479H7M7,0737104473,1.0,1274227200
AKJHHD5VEH7VG,0762451459,5.0,1404518400
A1BG8QW55XHN6U,1304139212,5.0,1371945600
A22VW0P4VZHDE3,1304139220,5.0,1373068800
A3V3RE4132GKRO,130414089X,5.0,1401840000


In [None]:
import pandas as pd

# header=0 together with names= replaces the file's own header row
# (UserId, ProductId, Rating, Timestamp) with the column names used in this notebook
df = pd.read_csv(
    'ratings_Beauty.csv',
    sep=',',
    header=0,
    names=['USERID', 'ITEMID', 'RATING', 'TIMESTAMP'],
)
df.head()

Unnamed: 0,USERID,ITEMID,RATING,TIMESTAMP
0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,A3IAAVS479H7M7,737104473,1.0,1274227200


In [None]:
create_user_sequences(df, save_path='./beautydata.txt')
!head beautydata.txt

0 50409
1 187288 166945
2 108830 18755
3 219697
4 164112 91390
5 199346
6 135197
7 67762 177122
8 52050
9 169601


## GroupGenerator
Generate user groups for training group recommenders.

In [None]:
#export
class GroupGenerator(object):
    """
    Group Data Generator

    Samples random user groups from a user-item rating table, derives binary
    group-item and user-item ratings, splits them chronologically into
    train / validation / test parts, draws negative samples for the
    validation and test parts and writes everything to ``output_path`` as
    text files. All of this work happens in the constructor.

    Args:
        user_ids: integer user ids that group members are sampled from;
            ``max(user_ids) + 1`` is the number of rows of the rating matrix.
        item_ids: integer item ids; ``max(item_ids) + 1`` is the number of
            columns of the rating matrix.
        ratings: DataFrame whose first four columns are, by position,
            user id, item id, rating and timestamp.
        output_path: directory the ``.dat`` files are written to (created
            if missing).
        rating_threshold: ratings ``>=`` this value are labelled 1, others 0.
        num_groups: number of groups to generate.
        group_sizes: candidate group sizes; one is drawn at random per group.
        min_num_ratings: a sampled group is kept only if it has at least
            this many group ratings.
        train_ratio: fraction of the (time-ordered) group ratings used for
            training.
        val_ratio: fraction of the (time-ordered) group ratings used for
            validation; the remainder is the test part.
        negative_sample_size: number of negative items drawn per validation /
            test rating.
        verbose: print summary statistics when True.
    """
    def __init__(self,
                 user_ids,
                 item_ids,
                 ratings,
                 output_path,
                 rating_threshold,
                 num_groups,
                 group_sizes,
                 min_num_ratings,
                 train_ratio,
                 val_ratio,
                 negative_sample_size,
                 verbose=True,
                 ):
        self.users = user_ids
        self.items = item_ids
        self.ratings = ratings
        self.rating_threshold = rating_threshold
        self.negative_sample_size = negative_sample_size
        rating_mat, timestamp_mat = self.load_ratings_file()

        groups, group_ratings, groups_rated_items_dict, groups_rated_items_set = \
            self.generate_group_ratings(self.users, rating_mat, timestamp_mat,
                                        num_groups=num_groups,
                                        group_sizes=group_sizes,
                                        min_num_ratings=min_num_ratings)
        members, group_ratings_train, group_ratings_val, group_ratings_test, \
            group_negative_items_val, group_negative_items_test, \
            user_ratings_train, user_ratings_val, user_ratings_test, \
            user_negative_items_val, user_negative_items_test = \
            self.split_ratings(group_ratings, rating_mat, timestamp_mat,
                               groups, groups_rated_items_dict, groups_rated_items_set,
                               train_ratio=train_ratio, val_ratio=val_ratio)

        os.makedirs(output_path, exist_ok=True)
        self.save_groups(os.path.join(output_path, 'groupMember.dat'), groups)

        # file name -> rating tuples
        rating_files = [
            ('groupRatingTrain.dat', group_ratings_train),
            ('groupRatingVal.dat', group_ratings_val),
            ('groupRatingTest.dat', group_ratings_test),
            ('userRatingTrain.dat', user_ratings_train),
            ('userRatingVal.dat', user_ratings_val),
            ('userRatingTest.dat', user_ratings_test),
        ]
        for file_name, rating_tuples in rating_files:
            self.save_ratings(rating_tuples, os.path.join(output_path, file_name))

        # file name -> negative samples
        negative_files = [
            ('groupRatingValNegative.dat', group_negative_items_val),
            ('groupRatingTestNegative.dat', group_negative_items_test),
            ('userRatingValNegative.dat', user_negative_items_val),
            ('userRatingTestNegative.dat', user_negative_items_test),
        ]
        for file_name, negative_samples in negative_files:
            self.save_negative_samples(negative_samples, os.path.join(output_path, file_name))

        if verbose:
            num_group_ratings = len(group_ratings)
            num_user_ratings = len(user_ratings_train) + len(user_ratings_val) + len(user_ratings_test)
            num_rated_items = len(groups_rated_items_set)

            print('Save data: ' + output_path)
            print('# Users: ' + str(len(members)))
            print('# Items: ' + str(num_rated_items))
            print('# Groups: ' + str(len(groups)))
            print('# U-I ratings: ' + str(num_user_ratings))
            print('# G-I ratings: ' + str(num_group_ratings))
            print('Avg. # ratings / user: {:.2f}'.format(num_user_ratings / len(members)))
            print('Avg. # ratings / group: {:.2f}'.format(num_group_ratings / len(groups)))
            print('Avg. group size: {:.2f}'.format(np.mean(list(map(len, groups)))))

    def load_ratings_file(self):
        """
        Build sparse (user x item) matrices of ratings and timestamps.

        Returns:
            tuple: ``(rating_mat, timestamp_mat)``, two ``dok_matrix`` objects
            of shape ``(max(users) + 1, max(items) + 1)``. Entries that never
            received a rating stay unset.
        """
        # np.int64 instead of the `np.int` alias, which was removed from NumPy
        rating_mat = sp.dok_matrix((max(self.users) + 1, max(self.items) + 1),
                                   dtype=np.int64)
        timestamp_mat = rating_mat.copy()

        # itertuples keeps each column's own dtype (iterrows would upcast a
        # row with mixed dtypes); only the first four columns are used
        for row in self.ratings.itertuples(index=False, name=None):
            user, item, rating, timestamp = (int(value) for value in row[:4])
            rating_mat[user, item] = rating
            timestamp_mat[user, item] = timestamp

        return rating_mat, timestamp_mat

    def generate_group_ratings(self, users, rating_mat, timestamp_mat,
                               num_groups, group_sizes, min_num_ratings):
        """
        Sample ``num_groups`` distinct groups and derive their binary ratings.

        An item gets group rating 1 when every member rated it at or above the
        threshold, and 0 when every member rated it and at least one of them
        rated it below the threshold. The group timestamp is the latest member
        timestamp for that item.

        Returns:
            tuple: ``(groups, groups_ratings, groups_rated_items_dict,
            groups_rated_items_set)`` where ``groups[i]`` is the sorted member
            tuple of the group with id ``i + 1`` and ``groups_ratings`` is a
            list of ``(group_id, item, label, timestamp)`` tuples.

        Note:
            The loop only ends once enough groups satisfy
            ``min_num_ratings``; it does not give up if that is unattainable.
        """
        np.random.seed(0)
        # Ordered list so that position i always holds the group with id i + 1
        # (a set would lose the id <-> group correspondence when converted to
        # a list); the companion set gives O(1) duplicate checks.
        groups = []
        seen_groups = set()
        groups_ratings = []
        groups_rated_items_dict = {}
        groups_rated_items_set = set()

        while len(groups) < num_groups:
            group_id = len(groups) + 1

            # draw a size, then that many distinct users, until the group is new
            while True:
                group_size = np.random.choice(group_sizes)
                group = tuple(np.sort(
                    np.random.choice(users, group_size, replace=False)))
                if group not in seen_groups:
                    break

            # per item: how many members rated it positively / negatively
            pos_group_rating_counter = Counter()
            neg_group_rating_counter = Counter()
            group_rating_list = []
            group_rated_items = set()

            for member in group:
                _, items = rating_mat[member, :].nonzero()
                pos_items, neg_items = [], []
                for item in items:
                    if rating_mat[member, item] >= self.rating_threshold:
                        pos_items.append(item)
                    else:
                        neg_items.append(item)
                pos_group_rating_counter.update(pos_items)
                neg_group_rating_counter.update(neg_items)

            for item, num_ratings in pos_group_rating_counter.items():
                if num_ratings == len(group):
                    timestamp = max(timestamp_mat[member, item]
                                    for member in group)
                    group_rated_items.add(item)
                    group_rating_list.append((group_id, item, 1, timestamp))

            for item, num_ratings in neg_group_rating_counter.items():
                # every member rated the item and at least one disliked it
                # (a Counter returns 0 for items without positive ratings)
                if num_ratings + pos_group_rating_counter[item] == len(group):
                    timestamp = max(timestamp_mat[member, item]
                                    for member in group)
                    group_rated_items.add(item)
                    group_rating_list.append((group_id, item, 0, timestamp))

            if len(group_rating_list) >= min_num_ratings:
                groups.append(group)
                seen_groups.add(group)
                groups_rated_items_dict[group_id] = group_rated_items
                groups_rated_items_set.update(group_rated_items)
                groups_ratings.extend(group_rating_list)

        return groups, groups_ratings, groups_rated_items_dict, groups_rated_items_set

    def split_ratings(self, group_ratings, rating_mat, timestamp_mat,
                      groups, groups_rated_items_dict, groups_rated_items_set, train_ratio, val_ratio):
        """
        Split group and user ratings chronologically and draw negative samples.

        Group ratings are ordered by timestamp and cut by ``train_ratio`` /
        ``val_ratio``; the last timestamps of the train and validation parts
        are then used as cut-off times for the members' own ratings (restricted
        to items rated by at least one group).

        Returns:
            tuple: ``(members, group_ratings_train, group_ratings_val,
            group_ratings_test, group_negative_items_val,
            group_negative_items_test, user_ratings_train, user_ratings_val,
            user_ratings_test, user_negative_items_val,
            user_negative_items_test)``.

        Raises:
            ValueError: if no group rating falls into the training part.
        """
        num_group_ratings = len(group_ratings)
        num_train = int(num_group_ratings * train_ratio)
        num_test = int(num_group_ratings * (1 - train_ratio - val_ratio))
        # explicit start index: slicing with `-num_test` breaks when
        # num_test == 0 (`[-0:]` is the whole list, `[a:-0]` is empty)
        test_start = num_group_ratings - num_test

        group_ratings = \
            sorted(group_ratings, key=lambda group_rating: group_rating[-1])
        group_ratings_train = group_ratings[:num_train]
        group_ratings_val = group_ratings[num_train:test_start]
        group_ratings_test = group_ratings[test_start:]

        if not group_ratings_train:
            raise ValueError('No group ratings fall into the training split; '
                             'generate more group ratings or increase train_ratio.')

        timestamp_split_train = group_ratings_train[-1][-1]
        # with an empty validation part both cut-offs coincide, so no user
        # rating is assigned to validation either
        timestamp_split_val = group_ratings_val[-1][-1] \
            if group_ratings_val else timestamp_split_train

        user_ratings_train = []
        user_ratings_val = []
        user_ratings_test = []

        members = set()
        users_rated_items_dict = {}

        for group in groups:
            for member in group:
                if member in members:
                    continue
                members.add(member)
                user_rated_items = set()
                _, items = rating_mat[member, :].nonzero()
                for item in items:
                    if item not in groups_rated_items_set:
                        continue
                    user_rated_items.add(item)
                    timestamp = timestamp_mat[member, item]
                    label = 1 if rating_mat[member, item] >= self.rating_threshold else 0
                    rating_tuple = (member, item, label, timestamp)
                    if timestamp <= timestamp_split_train:
                        user_ratings_train.append(rating_tuple)
                    elif timestamp <= timestamp_split_val:
                        user_ratings_val.append(rating_tuple)
                    else:
                        user_ratings_test.append(rating_tuple)

                users_rated_items_dict[member] = user_rated_items

        np.random.seed(0)

        # the order of these four calls determines the random draws
        user_negative_items_val = self.get_negative_samples(
            user_ratings_val, groups_rated_items_set, users_rated_items_dict)
        user_negative_items_test = self.get_negative_samples(
            user_ratings_test, groups_rated_items_set, users_rated_items_dict)
        group_negative_items_val = self.get_negative_samples(
            group_ratings_val, groups_rated_items_set, groups_rated_items_dict)
        group_negative_items_test = self.get_negative_samples(
            group_ratings_test, groups_rated_items_set, groups_rated_items_dict)

        return members, group_ratings_train, group_ratings_val, group_ratings_test, \
            group_negative_items_val, group_negative_items_test, \
            user_ratings_train, user_ratings_val, user_ratings_test, \
            user_negative_items_val, user_negative_items_test

    def get_negative_samples(self, ratings, groups_rated_items_set, rated_items_dict):
        """
        Draw ``negative_sample_size`` unrated items for every rating tuple.

        Candidates are the items in ``groups_rated_items_set`` that the
        user/group (first field of the rating tuple) has not rated. Sampling is
        with replacement only when there are fewer candidates than requested.

        Returns:
            list: ``(id, item, negative_items)`` tuples, one per input rating.

        Note:
            An id that has rated every item in the pool leaves no candidates;
            ``np.random.choice`` is expected to raise in that case.
        """
        negative_items_list = []
        for sample_id, item, _, _ in ratings:
            missed_items = groups_rated_items_set - rated_items_dict[sample_id]
            negative_items = \
                np.random.choice(list(missed_items), self.negative_sample_size,
                                 replace=(len(missed_items) < self.negative_sample_size))
            negative_items_list.append((sample_id, item, negative_items))
        return negative_items_list

    def save_groups(self, groups_path, groups):
        """Write one ``<group_id> <member>,<member>,...`` line per group (ids start at 1)."""
        with open(groups_path, 'w') as file:
            for i, group in enumerate(groups):
                file.write(str(i + 1) + ' '
                           + ','.join(map(str, list(group))) + '\n')

    def save_ratings(self, ratings, ratings_path):
        """Write one space-separated line per rating tuple."""
        with open(ratings_path, 'w') as file:
            for rating in ratings:
                file.write(' '.join(map(str, list(rating))) + '\n')

    def save_negative_samples(self, negative_items, negative_items_path):
        """Write one ``(<id>,<item>) <neg_1> <neg_2> ...`` line per sample."""
        with open(negative_items_path, 'w') as file:
            # distinct loop names: the original re-bound the `negative_items`
            # parameter inside its own loop
            for sample_id, item, sampled_items in negative_items:
                file.write('({},{}) '.format(sample_id, item)
                           + ' '.join(map(str, list(sampled_items))) + '\n')

Example

In [None]:
!wget -q --show-progress https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [None]:
# ratings.dat has no header row and separates fields with '::';
# the python parser engine is selected explicitly for this multi-character separator
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    engine='python',
    header=None,
)
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [None]:
# Column-wise maxima; columns 0 and 1 (read as user id and item id by
# GroupGenerator) determine how large the id ranges passed to it must be
ratings.max()

0          6040
1          3952
2             5
3    1046454590
dtype: int64

In [None]:
# Derive the id ranges from the data (largest id + 1) instead of hard-coding
# them, so the cell keeps working if a different ratings file is loaded.
num_users = int(ratings[0].max()) + 1
num_items = int(ratings[1].max()) + 1

group_generator = GroupGenerator(
                    user_ids=np.arange(num_users),
                    item_ids=np.arange(num_items),
                    ratings=ratings,
                    output_path='./data/silver',
                    rating_threshold=4,       # ratings >= 4 count as positive
                    num_groups=10,
                    group_sizes=[2, 3],       # each group has 2 or 3 members
                    min_num_ratings=20,       # keep groups with at least 20 group ratings
                    train_ratio=0.7,
                    val_ratio=0.1,            # the remaining 0.2 is the test part
                    negative_sample_size=10)

Save data: ./data/silver
# Users: 23
# Items: 360
# Groups: 10
# U-I ratings: 2866
# G-I ratings: 503
Avg. # ratings / user: 124.61
Avg. # ratings / group: 50.30
Avg. group size: 2.30


In [None]:
path = './data/silver'

# Collect (file name, size in bytes) for every regular file directly inside `path`
size_of_file = []
for name in os.listdir(path):
    full_path = os.path.join(path, name)
    if os.path.isfile(full_path):
        size_of_file.append((name, os.stat(full_path).st_size))

# Report each size in KiB with one decimal place
for name, num_bytes in size_of_file:
    print("{:.1f}K: {}".format(round(num_bytes / 1024, 3), name))

53.0K: userRatingTrain.dat
0.1K: groupMember.dat
5.4K: groupRatingTestNegative.dat
5.0K: userRatingValNegative.dat
1.8K: userRatingVal.dat
15.2K: userRatingTestNegative.dat
5.7K: userRatingTest.dat
6.4K: groupRatingTrain.dat
1.8K: groupRatingTest.dat
0.9K: groupRatingVal.dat
2.8K: groupRatingValNegative.dat


In [None]:
#hide
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-12-19 08:58:22

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pandas : 1.1.5
IPython: 5.5.0
numpy  : 1.19.5
scipy  : 1.4.1

