Book-Crossing: This dataset was collected by Cai-Nicolas Ziegler in a 4-week crawl (August / September 2004) from the Book-Crossing community with kind permission from Ron Hornbaker, CTO of Humankind Systems. It contains 278,858 users (anonymized but with demographic information) providing 1,149,780 ratings (explicit / implicit) about 271,379 books.

Book-Crossing Dataset:
- Split based on the implicit and explicit feedback
- Remove users with fewer than 5 interaction

In [2]:
# importing required pakcages
import pandas as pd
from tqdm import tqdm

In [2]:
# loading the Book-Crossing dataset
dataset = pd.read_csv("dataset/BX-Book-Ratings.csv", sep=";", encoding='ISO-8859–1')

In [8]:
# statistics on explicit dataset
print("Original dataset statistics: ")
print(f"> No. of users: {len(dataset['User-ID'].unique())}")
print(f"> No. of Books: {len(dataset['ISBN'].unique())}")
print(f"> No. of Interaction: {dataset.shape[0]}")

Original dataset statistics: 
> No. of users: 244309
> No. of Books: 89169
> No. of Interaction: 4514094


In [9]:
# what columns or attributes the dataset is included?
dataset.columns

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [10]:
# checking the number of times in which a rating happened - (0: implicit, 1-10: explicit)
dataset['Book-Rating'].value_counts()

5    1689761
4    1627804
3     937833
2     209939
1      48757
Name: Book-Rating, dtype: int64

In [11]:
# This method return a dataframe in which the specified values are removed from a specific column. We can use it to generate the implicit/explicit dataset.
# To do this, we can remove the 0 or 1-10 values from the `Book-Rating` column.
def filter_rows_by_values(df, col, values):
    return df[~df[col].isin(values)]

### Explicit Dataset

In [13]:
# To create the explicit dataset, we first remove those rows which includes 0 on Book-Rating columns (implicit ones), then we save the new generated daaset.
explicit_dataset = filter_rows_by_values(dataset, "Book-Rating", [0])
explicit_dataset.to_csv("BX-Book-Explicit.csv", index=False)

In [9]:
# Here we can check that our explicit dataset does not include 0 rating.
explicit_dataset['Book-Rating'].value_counts()

8     103736
10     78610
7      76457
9      67541
5      50974
6      36924
4       8904
3       5996
2       2759
1       1770
Name: Book-Rating, dtype: int64

In [10]:
# statistics on explicit dataset
print("Explicit dataset statistics: ")
print(f"> No. of users: {len(explicit_dataset['User-ID'].unique())}")
print(f"> No. of Books: {len(explicit_dataset['ISBN'].unique())}")
print(f"> No. of Interaction: {explicit_dataset.shape[0]}")

Explicit dataset statistics: 
> No. of users: 77805
> No. of Books: 185973
> No. of Interaction: 433671


#### Remove user with more than 200 interactions

In [13]:
# To check if there is any user with fewer than 5 interaction in the preprocessed dataset
# The correct output will be zero
uid_value_counts = explicit_dataset['User-ID'].value_counts()
print(f"The numebr of users with more than 200 interactions: {uid_value_counts[uid_value_counts > 200].count()}")

The numebr of users with more than 200 interactions: 144


In [12]:
# To remove the users with fewer than 5 interaction we first count the number of interactino per user and add a new column (`Count`) in the dataframe.
# This column shows the number of interaction per user in the dataset
users_counts = explicit_dataset['User-ID'].value_counts()
users_counts = users_counts.to_dict() #converts to dictionary
explicit_dataset['Count'] = explicit_dataset['User-ID'].map(users_counts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  explicit_dataset['Count'] = explicit_dataset['User-ID'].map(users_counts)


In [14]:
explicit_dataset = filter_rows_by_values(explicit_dataset, "Count", list(range(200, 8524 + 1)))

In [15]:
# statistics on explicit dataset after removing users with more than 200 int.
print("Explicit dataset statistics (users > 200): ")
print(f"> No. of users: {len(explicit_dataset['User-ID'].unique())}")
print(f"> No. of Books: {len(explicit_dataset['ISBN'].unique())}")
print(f"> No. of Interaction: {explicit_dataset.shape[0]}")

Explicit dataset statistics (users > 200): 
> No. of users: 77660
> No. of Books: 156891
> No. of Interaction: 364245


### Iteratively remove users and items with fewer than 5 interactions

In [12]:
user_5_interaction, item_5_interaction = 1, 1

while user_5_interaction != 0 or item_5_interaction != 0:
    print("The current number of user and item with < 5 interactions: ")
    # user side fewer than 5 cheking
    uid_value_counts = explicit_dataset['User-ID'].value_counts()
    user_5_interaction = uid_value_counts[uid_value_counts < 5].count()
    print(f"No. of users < 5 ineractions: {user_5_interaction}")

    users_counts = explicit_dataset['User-ID'].value_counts()
    users_counts = users_counts.to_dict() #converts to dictionary
    explicit_dataset['Count'] = explicit_dataset['User-ID'].map(users_counts)

    explicit_dataset = filter_rows_by_values(explicit_dataset, "Count", [1, 2, 3, 4])

    # item side fewer than 5 cheking
    bid_value_counts = explicit_dataset['ISBN'].value_counts()
    item_5_interaction = bid_value_counts[bid_value_counts < 5].count()
    print(f"No. of items < 5 ineractions: {item_5_interaction}")

    items_counts = explicit_dataset['ISBN'].value_counts()
    items_counts = items_counts.to_dict() #converts to dictionary
    explicit_dataset['Count'] = explicit_dataset['ISBN'].map(items_counts)

    explicit_dataset = filter_rows_by_values(explicit_dataset, "Count", [1, 2, 3, 4])

The current number of user and item with < 5 interactions: 


NameError: name 'explicit_dataset' is not defined

In [18]:
# statistics on 5 rate explicit dataset (after pre-processing)
print(f"No. of users: {len(explicit_dataset['User-ID'].unique())}")
print(f"No. of Books: {len(explicit_dataset['ISBN'].unique())}")
print(f"No. of Interaction: {explicit_dataset.shape[0]}")

No. of users: 6358
No. of Books: 6921
No. of Interaction: 88552


In [25]:
# Before we save the preprocessed explicit dataset (5Rate) we first remove the added column which is `Count`
del explicit_dataset['Count']

In [26]:
# Here we save our new explicit dataset to do our experiments and analysis on that.
explicit_dataset.to_csv("BX-Book-Explicit-5Rate.csv", index=False)

### User and Item ID Mapping

In [27]:
dataset = pd.read_csv("BX-Book-Explicit-5Rate.csv", sep=",", encoding='ISO-8859–1')

In [28]:
uid_to_index = dict()
bid_to_index = dict()

last_user_id = 0
last_book_id = 0

for eachline in dataset.iterrows():
  # add a new user id with an index
  if eachline[1][0] not in uid_to_index.keys():
    uid_to_index[eachline[1][0]] = last_user_id
    last_user_id += 1
  # add a new book id with an index
  if eachline[1][1] not in bid_to_index.keys():
    bid_to_index[eachline[1][1]] = last_book_id
    last_book_id += 1

In [29]:
mapped_dataset = pd.DataFrame(columns=['uid', 'bid', 'rating'])

for eachline in tqdm(dataset.iterrows()):
  mapped_dataset = mapped_dataset.append({'uid': uid_to_index[eachline[1][0]], 'bid': bid_to_index[eachline[1][1]], 'rating': eachline[1][2]},
  ignore_index=True)

88552it [04:20, 340.11it/s]


In [30]:
mapped_dataset.to_csv("BX-Book-Explicit-5Rate-Map.csv", index=False)