In [1]:
# importing required pakcages
import pandas as pd
from tqdm import tqdm

In [2]:
# config
data_config = 'ClothingFit'
config = {
    'ClothingFit':
    {
        'ds_name': 'ClothingFit',
        'ds_fullname': 'ClothingFit_data_map',
        'ds_catname': 'ClothingFit_genre_map',
        'ds_fmt': 'txt',
        # 'ds_acronym': 'BX',
        'ds_rate': 5,
        'ds_toprate': 200,
        'ds_sep': ',',
        # 'ds_encode': 'ISO-8859–1'
    }
}

In [5]:
# loading the Book-Crossing dataset
dataset = pd.read_csv(f"datasets/{data_config}/raw/{data_config}_data_map.{config[data_config]['ds_fmt']}", sep=f"{config[data_config]['ds_sep']}")
dataset.columns = ['UserID', 'ItemID', 'Rating']

In [6]:
dataset.head()

Unnamed: 0,UserID,ItemID,Rating
0,1,1,10
1,2,2,10
2,3,3,8
3,4,4,10
4,5,5,8


In [7]:
# what columns or attributes the dataset is included?
dataset.columns

Index(['UserID', 'ItemID', 'Rating'], dtype='object')

In [9]:
# statistics on explicit dataset
print("Original dataset statistics: ")
print(f"> No. of users: {len(dataset['UserID'].unique())}")
print(f"> No. of Books: {len(dataset['ItemID'].unique())}")
print(f"> No. of Interaction: {dataset.shape[0]}")

Original dataset statistics: 
> No. of users: 105508
> No. of Books: 5850
> No. of Interaction: 192461


In [10]:
# checking the number of times in which a rating happened - (0: implicit, 1-10: explicit)
dataset['Rating'].value_counts()

10    124536
8      53391
6      10697
4       2791
2       1046
Name: Rating, dtype: int64

In [11]:
# This method return a dataframe in which the specified values are removed from a specific column. We can use it to generate the implicit/explicit dataset.
# To do this, we can remove the 0 or 1-10 values from the `Book-Rating` column.
def filter_rows_by_values(df, col, values):
    return df[~df[col].isin(values)]

In [13]:
# To create the explicit dataset, we first remove those rows which includes 0 on Book-Rating columns (implicit ones), then we save the new generated daaset.
if data_config == 'ClothingFit':
    dataset = filter_rows_by_values(dataset, "Rating", [0])

    print(dataset['Rating'].value_counts())

    # statistics on explicit dataset
    print("Explicit dataset statistics: ")
    print(f"> No. of users: {len(dataset['UserID'].unique())}")
    print(f"> No. of Books: {len(dataset['ItemID'].unique())}")
    print(f"> No. of Interaction: {dataset.shape[0]}")

10    124536
8      53391
6      10697
4       2791
2       1046
Name: Rating, dtype: int64
Explicit dataset statistics: 
> No. of users: 105508
> No. of Books: 5850
> No. of Interaction: 192461


### Remove user with more than TopRate interactions

In [15]:
# To check if there is any user with more than toprate interaction in the preprocessed dataset
# The correct output will be zero
uid_value_counts = dataset['UserID'].value_counts()
print(f"The numebr of users with more than {config[data_config]['ds_toprate']} interactions: {uid_value_counts[uid_value_counts > config[data_config]['ds_toprate']].count()}")

The numebr of users with more than 200 interactions: 3


In [16]:
# To remove the users with fewer than 5 interaction we first count the number of interactino per user and add a new column (`Count`) in the dataframe.
# This column shows the number of interaction per user in the dataset
users_counts = dataset['UserID'].value_counts()
users_counts = users_counts.to_dict() #converts to dictionary
dataset['Count'] = dataset['UserID'].map(users_counts)

In [17]:
dataset = filter_rows_by_values(dataset, "Count", list(range(config[data_config]['ds_toprate'], max(dataset['Count']) + 1)))

In [18]:
# statistics on explicit dataset after removing users with more than 200 int.
print(f"New dataset statistics (users with interactions < {config[data_config]['ds_toprate']}): ")
print(f"> No. of users: {len(dataset['UserID'].unique())}")
print(f"> No. of Books: {len(dataset['ItemID'].unique())}")
print(f"> No. of Interaction: {dataset.shape[0]}")

New dataset statistics (users with interactions < 200): 
> No. of users: 105505
> No. of Books: 5833
> No. of Interaction: 191505


### Iteratively remove users and items with fewer than `ds_rate` interactions

In [19]:
list(range(config[data_config]['ds_rate']))

[0, 1, 2, 3, 4]

In [20]:
user_interaction, item_interaction = 1, 1

while user_interaction != 0 or item_interaction != 0:
    print(f"The current number of user and item with < {config[data_config]['ds_rate']} interactions: ")
    # user side fewer than ds_rate cheking
    uid_value_counts = dataset['UserID'].value_counts()
    user_interaction = uid_value_counts[uid_value_counts < config[data_config]['ds_rate']].count()
    print(f"No. of users < {config[data_config]['ds_rate']} ineractions: {user_interaction}")

    users_counts = dataset['UserID'].value_counts()
    users_counts = users_counts.to_dict() #converts to dictionary
    dataset['Count'] = dataset['UserID'].map(users_counts)

    dataset = filter_rows_by_values(dataset, "Count", list(range(config[data_config]['ds_rate'])))

    # item side fewer than ds_rate cheking
    bid_value_counts = dataset['ItemID'].value_counts()
    item_interaction = bid_value_counts[bid_value_counts < config[data_config]['ds_rate']].count()
    print(f"No. of items < {config[data_config]['ds_rate']} ineractions: {item_interaction}")

    items_counts = dataset['ItemID'].value_counts()
    items_counts = items_counts.to_dict() #converts to dictionary
    dataset['Count'] = dataset['ItemID'].map(items_counts)

    dataset = filter_rows_by_values(dataset, "Count", list(range(config[data_config]['ds_rate'])))

The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 99780


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Count'] = dataset['UserID'].map(users_counts)


No. of items < 5 ineractions: 2112
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 659
No. of items < 5 ineractions: 186
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 118
No. of items < 5 ineractions: 34
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 34
No. of items < 5 ineractions: 16
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 11
No. of items < 5 ineractions: 8
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 7
No. of items < 5 ineractions: 4
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 1
No. of items < 5 ineractions: 1
The current number of user and item with < 5 interactions: 
No. of users < 5 ineractions: 0
No. of items < 5 ineractions: 0


In [21]:
# statistics on 5 rate explicit dataset (after pre-processing)
print(f"No. of users: {len(dataset['UserID'].unique())}")
print(f"No. of Books: {len(dataset['ItemID'].unique())}")
print(f"No. of Interaction: {dataset.shape[0]}")

No. of users: 4893
No. of Books: 3106
No. of Interaction: 41494


In [22]:
# Before we save the preprocessed explicit dataset (5Rate) we first remove the added column which is `Count`
del dataset['Count']

### User and Item ID Mapping

In [23]:
uid_to_index = dict()
iid_to_index = dict()

last_user_id = 0
last_item_id = 0

for eachline in tqdm(dataset.iterrows()):
  # add a new user id with an index
  if eachline[1][0] not in uid_to_index.keys():
    uid_to_index[eachline[1][0]] = last_user_id
    last_user_id += 1
  # add a new book id with an index
  if eachline[1][1] not in iid_to_index.keys():
    iid_to_index[eachline[1][1]] = last_item_id
    last_item_id += 1

41494it [00:04, 9342.08it/s]


In [24]:
# write the file with new mapped indices into a txt file
# mapped_dataset = open(f"dataset/{data_config}/{config[data_config]['ds_acronym']}-Book-Explicit-{config[data_config]['ds_rate']}Rate-Map.csv", 'w')
mapped_dataset = open(f"datasets/{data_config}/{data_config}_data.txt", 'w')

for eachline in tqdm(dataset.iterrows()):
    mapped_dataset.write(str(uid_to_index[eachline[1][0]]) + "," + str(iid_to_index[eachline[1][1]]) + "," + str(eachline[1][2]) + "\n")
mapped_dataset.close()

41494it [00:05, 7862.93it/s]


In [25]:
categories = pd.read_csv(f"datasets/{data_config}/raw/{data_config}_genre_map.{config[data_config]['ds_fmt']}", sep=f"{config[data_config]['ds_sep']}")
categories.columns = ['ItemID', 'Categories']

In [26]:
categories.head()

Unnamed: 0,ItemID,Categories
0,1,gown
1,2,sheath
2,3,dress
3,4,gown
4,5,dress


In [27]:
mapped_genre = open(f"datasets/{data_config}/{data_config}_genre.txt", 'w')

for eachline in tqdm(categories.itertuples(index=True)):
    if eachline.ItemID in iid_to_index.keys():
        mapped_genre.write(str(iid_to_index[eachline.ItemID]) + "," + str(eachline.Categories) + "\n")

192461it [00:00, 349134.86it/s]
