In [99]:
# importing required pakcages
import pandas as pd
from tqdm import tqdm

In [150]:
ds_name = 'CiteULike'
ds_rate = 10
ds_sep = '\t'

In [151]:
dataset = pd.read_csv(f"datasets/{ds_name}/ratings_data.txt", sep=ds_sep, names=['uid', 'iid', 'rating'])

In [152]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1,70,1.0
1,1,495,1.0
2,1,1631,1.0
3,1,2317,1.0
4,1,2526,1.0


In [153]:
# what columns or attributes the dataset is included?
dataset.columns

Index(['uid', 'iid', 'rating'], dtype='object')

In [154]:
# statistics on explicit dataset
print("Original dataset statistics: ")
print(f"> No. of users: {len(dataset['uid'].unique())}")
print(f"> No. of Items: {len(dataset['iid'].unique())}")
print(f"> No. of Interaction: {dataset.shape[0]}")

Original dataset statistics: 
> No. of users: 5551
> No. of Items: 16980
> No. of Interaction: 210537


In [155]:
# checking the number of times in which a rating happened - (0: implicit, 1-10: explicit)
dataset['rating'].value_counts()

1.0    210537
Name: rating, dtype: int64

In [156]:
# This method return a dataframe in which the specified values are removed from a specific column. We can use it to generate the implicit/explicit dataset.
# To do this, we can remove the 0 or 1-10 values from the `Book-Rating` column.
def filter_rows_by_values(df, col, values):
    return df[~df[col].isin(values)]

In [157]:
# To remove the users with fewer than 5 interaction we first count the number of interactino per user and add a new column (`Count`) in the dataframe.
# This column shows the number of interaction per user in the dataset
users_counts = dataset['uid'].value_counts()
users_counts = users_counts.to_dict() #converts to dictionary
dataset['count'] = dataset['uid'].map(users_counts)

### Iteratively remove users and items with fewer than `ds_rate` interactions

In [158]:
user_interaction, item_interaction = 1, 1

while user_interaction != 0 or item_interaction != 0:
    print(f"The current number of user and item with < {ds_rate} interactions: ")
    # user side fewer than ds_rate cheking
    uid_value_counts = dataset['uid'].value_counts()
    user_interaction = uid_value_counts[uid_value_counts < ds_rate].count()
    print(f"No. of users < {ds_rate} ineractions: {user_interaction}")

    users_counts = dataset['uid'].value_counts()
    users_counts = users_counts.to_dict() #converts to dictionary
    dataset['count'] = dataset['uid'].map(users_counts)

    dataset = filter_rows_by_values(dataset, "count", list(range(ds_rate)))

    # item side fewer than ds_rate cheking
    bid_value_counts = dataset['iid'].value_counts()
    item_interaction = bid_value_counts[bid_value_counts < ds_rate].count()
    print(f"No. of items < {ds_rate} ineractions: {item_interaction}")

    items_counts = dataset['iid'].value_counts()
    items_counts = items_counts.to_dict() #converts to dictionary
    dataset['count'] = dataset['iid'].map(items_counts)

    dataset = filter_rows_by_values(dataset, "count", list(range(ds_rate)))

The current number of user and item with < 10 interactions: 
No. of users < 10 ineractions: 0
No. of items < 10 ineractions: 9086
The current number of user and item with < 10 interactions: 
No. of users < 10 ineractions: 1103
No. of items < 10 ineractions: 736
The current number of user and item with < 10 interactions: 
No. of users < 10 ineractions: 224
No. of items < 10 ineractions: 178
The current number of user and item with < 10 interactions: 
No. of users < 10 ineractions: 67
No. of items < 10 ineractions: 45
The current number of user and item with < 10 interactions: 
No. of users < 10 ineractions: 15
No. of items < 10 ineractions: 16
The current number of user and item with < 10 interactions: 
No. of users < 10 ineractions: 7
No. of items < 10 ineractions: 11
The current number of user and item with < 10 interactions: 
No. of users < 10 ineractions: 2
No. of items < 10 ineractions: 2
The current number of user and item with < 10 interactions: 
No. of users < 10 ineractions: 1


In [159]:
# statistics on 5 rate explicit dataset (after pre-processing)
print(f"No. of users: {len(dataset['uid'].unique())}")
print(f"No. of Items: {len(dataset['iid'].unique())}")
print(f"No. of Interaction: {dataset.shape[0]}")

No. of users: 4131
No. of Items: 6904
No. of Interaction: 131967


In [160]:
# Before we save the preprocessed explicit dataset (5Rate) we first remove the added column which is `Count`
del dataset['count']
if ds_name == 'AmazonToy':
    del dataset['time']

### User and Item ID Mapping

In [161]:
uid_to_index = dict()
iid_to_index = dict()

last_user_id = 0
last_item_id = 0

for eachline in tqdm(dataset.iterrows()):
  # add a new user id with an index
  if eachline[1][0] not in uid_to_index.keys():
    uid_to_index[eachline[1][0]] = last_user_id
    last_user_id += 1
  # add a new book id with an index
  if eachline[1][1] not in iid_to_index.keys():
    iid_to_index[eachline[1][1]] = last_item_id
    last_item_id += 1

131967it [00:17, 7680.50it/s]


In [162]:
# write the file with new mapped indices into a txt file
mapped_dataset = open(f"datasets/{ds_name}/{ds_name}_data.csv", 'w')
mapped_dataset_txt = open(f"datasets/{ds_name}/{ds_name}_data.txt", 'w')
mapped_dataset.write('uid' + ',' + 'bid' + ',' + 'rating' + '\n')

for eachline in tqdm(dataset.iterrows()):
    mapped_dataset.write(str(uid_to_index[eachline[1][0]]) + "," + str(iid_to_index[eachline[1][1]]) + "," + str(eachline[1][2]) + "\n")
    mapped_dataset_txt.write(str(uid_to_index[eachline[1][0]]) + "," + str(iid_to_index[eachline[1][1]]) + "," + str(eachline[1][2]) + "\n")
mapped_dataset.close()

131967it [00:37, 3509.46it/s]
