Book-Crossing: This dataset was collected by Cai-Nicolas Ziegler in a 4-week crawl (August / September 2004) from the Book-Crossing community with kind permission from Ron Hornbaker, CTO of Humankind Systems. It contains 278,858 users (anonymized but with demographic information) providing 1,149,780 ratings (explicit / implicit) about 271,379 books.

Book-Crossing Dataset:
- Split based on the implicit and explicit feedback
- Remove users with fewer than 5 interaction

In [1]:
# importing required pakcages
import pandas as pd
from tqdm import tqdm

In [29]:
# config
data_config = 'Amazon'
config = {
    'Amazon':
    {
        'ds_name': 'Amazon',
        'ds_fullname': 'rating',
        'ds_fmt': 'txt',
        'ds_acronym': 'AO',
        'ds_rate': 5,
        'ds_toprate': 51,
        'ds_sep': ' ',
        'ds_encode': 'utf-8'
    },
    'Goodreads':
    {
        'ds_name': 'Goodreads',
        'ds_fullname': 'goodreads_interactions_comics_graphic',
        'ds_fmt': 'json',
        'ds_acronym': 'GR',
        'ds_rate': 5,
        'ds_toprate': 50,
        'ds_sep': ',',
        'ds_encode': 'utf-8'
    }
}

In [19]:
# loading the Book-Crossing dataset
if data_config == 'Goodreads':
    # load dataset using json reader
    dataset_fpath = f"dataset/{data_config}/goodreads_interactions_poetry.json"
    dataset = pd.read_json(dataset_fpath, lines=True, orient='columns')
    # remove those row with rating 0 and save the other rows (rating = 1-5)
    dataset = dataset.loc[dataset['rating'] != 0, ['user_id', 'book_id', 'rating']]
elif data_config == 'Bookcrossing':
    dataset = pd.read_csv("dataset/BX-Book-Ratings.csv", sep=";", encoding='ISO-8859–1')
elif data_config == 'Amazon':
    dataset = pd.read_csv(f"dataset/{data_config}/rating.txt", sep=" ", names=['User-ID', 'Book-ID', 'Book-Rating'])
    
# dataset.columns = ['User-ID', 'Book-ID', 'Book-Rating']

In [20]:
dataset.head()

Unnamed: 0,User-ID,Book-ID,Book-Rating
0,200,1,5
1,1234,1,5
2,1353,1,5
3,1640,2,4
4,200,2,5


In [21]:
# what columns or attributes the dataset is included?
dataset.columns

Index(['User-ID', 'Book-ID', 'Book-Rating'], dtype='object')

In [22]:
# statistics on explicit dataset
print("Original dataset statistics: ")
print(f"> No. of users: {len(dataset['User-ID'].unique())}")
print(f"> No. of Books: {len(dataset['Book-ID'].unique())}")
print(f"> No. of Interaction: {dataset.shape[0]}")

Original dataset statistics: 
> No. of users: 3703
> No. of Books: 6523
> No. of Interaction: 53282


In [23]:
# checking the number of times in which a rating happened - (0: implicit, 1-10: explicit)
dataset['Book-Rating'].value_counts()

5    31362
4    13976
3     4823
2     1781
1     1340
Name: Book-Rating, dtype: int64

In [24]:
# This method return a dataframe in which the specified values are removed from a specific column. We can use it to generate the implicit/explicit dataset.
# To do this, we can remove the 0 or 1-10 values from the `Book-Rating` column.
def filter_rows_by_values(df, col, values):
    return df[~df[col].isin(values)]

In [25]:
# To create the explicit dataset, we first remove those rows which includes 0 on Book-Rating columns (implicit ones), then we save the new generated daaset.
if data_config == 'Bookcrossing':
    dataset = filter_rows_by_values(dataset, "Book-Rating", [0])
    dataset.to_csv("BX-Book-Explicit.csv", index=False)

    # Here we can check that our explicit dataset does not include 0 rating.
    dataset['Book-Rating'].value_counts()

    # statistics on explicit dataset
    print("Explicit dataset statistics: ")
    print(f"> No. of users: {len(dataset['User-ID'].unique())}")
    print(f"> No. of Books: {len(dataset['ISBN'].unique())}")
    print(f"> No. of Interaction: {dataset.shape[0]}")

### Remove user with more than TopRate interactions

In [33]:
# To check if there is any user with more than toprate interaction in the preprocessed dataset
# The correct output will be zero
uid_value_counts = dataset['User-ID'].value_counts()
print(f"The numebr of users with more than {config[data_config]['ds_toprate']} interactions: {uid_value_counts[uid_value_counts > config[data_config]['ds_toprate']].count()}")

The numebr of users with more than 51 interactions: 0


In [31]:
# To remove the users with fewer than 5 interaction we first count the number of interactino per user and add a new column (`Count`) in the dataframe.
# This column shows the number of interaction per user in the dataset
users_counts = dataset['User-ID'].value_counts()
users_counts = users_counts.to_dict() #converts to dictionary
dataset['Count'] = dataset['User-ID'].map(users_counts)

In [32]:
dataset = filter_rows_by_values(dataset, "Count", list(range(config[data_config]['ds_toprate'], max(dataset['Count']) + 1)))

In [34]:
# statistics on explicit dataset after removing users with more than 200 int.
print(f"New dataset statistics (users with interactions < {config[data_config]['ds_toprate']}): ")
print(f"> No. of users: {len(dataset['User-ID'].unique())}")
print(f"> No. of Books: {len(dataset['Book-ID'].unique())}")
print(f"> No. of Interaction: {dataset.shape[0]}")

New dataset statistics (users with interactions < 51): 
> No. of users: 3654
> No. of Books: 6407
> No. of Interaction: 49978


### Iteratively remove users and items with fewer than `ds_rate` interactions

In [35]:
list(range(config[data_config]['ds_rate']))

[0, 1, 2, 3, 4]

In [36]:
user_interaction, item_interaction = 1, 1

while user_interaction != 0 or item_interaction != 0:
    print(f"The current number of user and item with < {config[data_config]['ds_rate']} interactions: ")
    # user side fewer than ds_rate cheking
    uid_value_counts = dataset['User-ID'].value_counts()
    user_interaction = uid_value_counts[uid_value_counts < config[data_config]['ds_rate']].count()
    print(f"No. of users < {config[data_config]['ds_rate']} ineractions: {user_interaction}")

    users_counts = dataset['User-ID'].value_counts()
    users_counts = users_counts.to_dict() #converts to dictionary
    dataset['Count'] = dataset['User-ID'].map(users_counts)

    dataset = filter_rows_by_values(dataset, "Count", list(range(config[data_config]['ds_rate'])))

    # item side fewer than ds_rate cheking
    bid_value_counts = dataset['Book-ID'].value_counts()
    item_interaction = bid_value_counts[bid_value_counts < config[data_config]['ds_rate']].count()
    print(f"No. of items < {config[data_config]['ds_rate']} ineractions: {item_interaction}")

    items_counts = dataset['Book-ID'].value_counts()
    items_counts = items_counts.to_dict() #converts to dictionary
    dataset['Count'] = dataset['Book-ID'].map(items_counts)

    dataset = filter_rows_by_values(dataset, "Count", list(range(config[data_config]['ds_rate'])))

The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 166
No. of items < 5 ineractions: 4423
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 802
No. of items < 5 ineractions: 317
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 179
No. of items < 5 ineractions: 104
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 66
No. of items < 5 ineractions: 30
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 25
No. of items < 5 ineractions: 14
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 9
No. of items < 5 ineractions: 6
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 6
No. of items < 5 ineractions: 0
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 0
No. of items < 5 inerac

In [37]:
# statistics on 5 rate explicit dataset (after pre-processing)
print(f"No. of users: {len(dataset['User-ID'].unique())}")
print(f"No. of Books: {len(dataset['Book-ID'].unique())}")
print(f"No. of Interaction: {dataset.shape[0]}")

No. of users: 2356
No. of Books: 1504
No. of Interaction: 33770


In [38]:
# Before we save the preprocessed explicit dataset (5Rate) we first remove the added column which is `Count`
del dataset['Count']

In [None]:
# Here we save our new explicit dataset to do our experiments and analysis on that.
# dataset.to_csv("BX-Book-Explicit-5Rate.csv", index=False)

### User and Item ID Mapping

In [None]:
# dataset = pd.read_csv("BX-Book-Explicit-5Rate.csv", sep=",", encoding='ISO-8859–1')

In [39]:
uid_to_index = dict()
bid_to_index = dict()

last_user_id = 0
last_book_id = 0

for eachline in tqdm(dataset.iterrows()):
  # add a new user id with an index
  if eachline[1][0] not in uid_to_index.keys():
    uid_to_index[eachline[1][0]] = last_user_id
    last_user_id += 1
  # add a new book id with an index
  if eachline[1][1] not in bid_to_index.keys():
    bid_to_index[eachline[1][1]] = last_book_id
    last_book_id += 1

33770it [00:03, 9343.24it/s]


In [40]:
# write the file with new mapped indices into a txt file
mapped_dataset = open(f"dataset/{data_config}/GR-Book-Explicit-10Rate-Map.csv", 'w')
mapped_dataset.write('uid' + ',' + 'bid' + ',' + 'rating' + '\n')

for eachline in tqdm(dataset.iterrows()):
    mapped_dataset.write(str(uid_to_index[eachline[1][0]]) + "," + str(bid_to_index[eachline[1][1]]) + "," + str(eachline[1][2]) + "\n")
mapped_dataset.close()

33770it [00:04, 7805.49it/s]
