In [1]:
import pandas as pd
import numpy as np
import json
import os
import re
import random
import copy
from transformers import set_seed
import hashlib
import json
import pickle as pkl
import h5py

set_seed(42)

dataset_name = "BookCrossing"
root = f"../data/{dataset_name}"
source_dir = os.path.join(root, "raw_data")
target_dir = os.path.join(root, "proc_data")
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from string import ascii_letters, digits

def character_check(item, special_letters=""):
    for letter in str(item):
        if letter not in ascii_letters + digits + special_letters:
            return 1
    return 0

def isin_selected(item, selected_dict):
    if item in selected_dict:
        return 1
    else:
        return 0

In [3]:
# Read user info

user_fields = ["User ID", "Location", "Age"]
pattern = re.compile(r'NULL|".*?(?<!\\)"', re.S)
with open(os.path.join(source_dir, "BX-Users.csv"), 'r', encoding='cp1252') as f:
    content = pattern.findall(f.read())
    content = [s[1:-1] if s != 'NULL' else None for s in content]
    processed_list = list(np.array(content).reshape((-1, 3)))
    processed_list.pop(0)
    df_users = pd.DataFrame(processed_list, columns=user_fields)

# There are messy info/code (or totally empty) in the `Location` field, we only use the country instead.
# E.g., ['&#37073;&#24030;&#26159;, &#20013;&#22269;&#27827;&#21335;&#30465;&#37073;&#24030;&#24066;, china', 
#        'philippine science high school - cmc, mcc main stadium, sagadan, tubod, lanao del norte, philippines', 
#        '6.a.4.a.6.a`4.a, 6.a.4.a.6.a`4.a.6.a`4.a.6.a.4.a.6.a`4.aoe6.a`4.a -- 6.a.4.a.6.a`4.aoe6.a`4.a ã??, ä¸\xadå?½']
def convert_location_to_country(x):
    x = x.split(', ')[-1].strip().title().replace("!", "").strip()
    if x.lower() in ["usa", "us", "u s", "u s a"]:
        x = "USA"
    if x.lower() in ["uk", "u k"]:
        x = "UK"
    while len(x) > 0 and x[-1] in [",", "."]:
        x = x[:-1]
    while len(x) > 0 and x[0] in [",", "."]:
        x = x[1:]
    if "U.S" in x.upper() and x != "U.S. Virgin Islands":
        x = "USA"
    if x in ["San José", "San Josï¿½"]:
        x = "USA"
    if x in ["España", "Castilla-León", "Espaã±A", "Cataluña", "Mérida", "Álava", "Málaga", "A Coruña", "Barcelonès", "Berguedà",
              "Espaï¿½A", "Castilla-Leï¿½N", "A Coruï¿½A", "Cataluï¿½A", "Barcelonï¿½S", "Ï¿½Lava", "Mï¿½Rida", "Berguedï¿½", "Mï¿½Laga"] or "spain" in x.lower():
        x = "Spain"
    if x in ["L`Italia"]:
        x = "Italy"
    if x in ["Baden-Württemberg", "Bademn Würtemberg", "Baden-Wï¿½Rttemberg", "Bademn Wï¿½Rtemberg"]:
        x = "German"
    if x in ["Cote D`Ivoire", "Côte D", "Cï¿½Te D"]:
        x = "Ivory Coast"
    if x in ["Oberösterreich", "Oberï¿½Sterreich"]:
        x = "Austria"
    if x in ["México", "Mï¿½Xico"]:
        x = "Mexico"
    if x in ["Türkiye", "Içel", "Tï¿½Rkiye"]:
        x = "Turkey"
    if x in ["L`Algérie", "Algérie", "Kärnten", "Kï¿½Rnten", "L`Algï¿½Rie", "Algï¿½Rie"]:
        x = "Algeria"
    if "Brasil" in x:
        x = "Brazil"
    if x in ["Rhône-Alpes", "Rhône Alpes", "Rhï¿½Ne-Alpes", "Rhï¿½Ne Alpes"]:
        x = "France"
    if "Greece" in x:
        x = "Greece"
    if x in ["Santarém", "Santarï¿½M"]:
        x = "Portugal"
    if x in ["Länsi-Suomen Lääni", "Lï¿½Nsi-Suomen Lï¿½Ï¿½Ni"]:
        x = "Finland"
    if x in ["V.Götaland", "Nyhamnsläge", "V.Gï¿½Taland", "Nyhamnslï¿½Ge"]:
        x = "Sweden"
    if x in ["Moçambique", "Moï¿½Ambique"]:
        x = "Mozambique"
    if x in ["Ix Región", "Ix Regiï¿½N"]:
        x = "Chile"
    if x in ["Maï¿½Opolskie", "Ma³Opolskie"]:
        x = "Poland"
    if x in ["Perï¿½", "Perãº"]:
        x = "Peru"
    if x != "China" and ("china" in x.lower() or x == "La Chine Éternelle" or x == "La Chine Ï¿½Ternelle"):
        x = "China"
    if x == "Ï¿½Ï¿½Ï¿½":
        x = "China"
    if (x == "" or \
        x in ["Öð¹Ú", "ºþäï", "We`Re Global", "Ï¿½Ï¿½Ï¿½Ï¿½", "Iï¿½El"] or \
        len(x) == 1 or \
        "N/A" in x or \
        "&#" in x or \
        "?" in x or \
        "@" in x or \
        "*" in x):
        x = "unknown"
    return x
df_users["Location"] = df_users["Location"].apply(convert_location_to_country)
df_users["location_check"] = df_users["Location"].apply(lambda x: character_check(x, special_letters="- .&/()"))

assert len(df_users.loc[df_users["location_check"] == 1, "Location"]) == 0

# Nearly a half of the features in `Age` field are missing.
def convert_age_to_bucket(x):
    if x is None:
        x = "unknown"
    else:
        x = int(x)
        # There are out-of-range ages (e.g., < 5 or > 100).
        if x < 5 or x > 100:
            x = "unknown"
        # Age discretization
        elif x < 18:
            x = "under 18"
        elif 18 <= x < 25:
            x = "18-24"
        elif 25 <= x < 30:
            x = "25-29"
        elif 30 <= x < 35:
            x = "30-34"
        elif 35 <= x < 40:
            x = "35-39"
        elif 40 <= x < 45:
            x = "40-44"
        elif 45 <= x < 50:
            x = "45-49"
        elif 50 <= x < 55:
            x = "50-54"
        elif 55 <= x < 60:
            x = "55-59"
        else:
            x = "60+"
    return x
df_users["Age"] = df_users["Age"].apply(convert_age_to_bucket)

for field in user_fields:
    for s in list(df_users[field]):
        if field == "User ID":
            assert 1 <= int(s) <= 278858
        if field == "Location":
            assert 2 <= len(s) <= 45
        if field == "Age":
            assert s in ["unknown", "under 18" ,"18-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60+"]

df_users = df_users[user_fields]

md5_hash = hashlib.md5(json.dumps(df_users.values.tolist(), sort_keys=True).encode('utf-8')).hexdigest()
print("df_users", md5_hash)
assert md5_hash == "111bda80ee793f1efcaf0f58cb920771"
# df_users.head(20)

df_users 111bda80ee793f1efcaf0f58cb920771


In [26]:
df_users['check'] = df_users['User ID'].apply(lambda x: isin_selected(x, user_selected))
df_users = df_users[df_users['check'] == 1]
df_users = df_users[user_fields]

In [27]:
df_users.describe()

Unnamed: 0,User ID,Location,Age
count,8723,8723,8723
unique,8723,109,11
top,278843,USA,unknown
freq,1,6632,2683


In [5]:
# Read book info

book_fields = ["ISBN", "Book title", "Author", "Publication year", "Publisher"]
pattern = re.compile(r'(?<=");(?=")')
processed_list = []
with open(os.path.join(source_dir, "BX-Books.csv"), 'r', encoding='cp1252') as f:
    for line in f.readlines():
        split_line = pattern.split(line.strip())
        split_line = [item[1:-1].strip('\t') for item in split_line][:-3] # The last three image URLs are not needed.
        processed_list.append(split_line)
    processed_list.pop(0)
    df_books = pd.DataFrame(processed_list, columns=book_fields)

# ISBN should only contain letters and digits.
df_books['ISBN_check'] = df_books['ISBN'].apply(lambda x: character_check(x))
df_books = df_books[df_books['ISBN_check'] == 0]

# There are invalid publication years, i.e., "0"
def convert_publication_year(x):
    x = x if len(x) == 4 else "unknown"
    return x
df_books["Publication year"] = df_books["Publication year"].apply(convert_publication_year)

df_books["Publisher"] = df_books["Publisher"].apply(lambda x: x if x.lower() != "n/a" else "unknown")
df_books["Author"] = df_books["Author"].apply(lambda x: x if x.lower() != "n/a" else "unknown")

for field in book_fields:
    for s in list(df_books[field]):
        if field == "ISBN":
            assert len(s) == 10
        if field == "Book title":
            assert 1 <= len(s) <= 256
        if field == "Author":
            assert 1 <= len(s) <= 143
        if field == "Publication year":
            assert s == "unknown" or len(s) == 4
        if field == "Publisher":
            assert 1 <= len(s) <= 134

df_books = df_books[book_fields]
print(df_books.head())
print('---------------------------------------------------------------')
print(df_books.info())
print('---------------------------------------------------------------')
print(df_books.describe())
print('---------------------------------------------------------------')
# md5_hash = hashlib.md5(json.dumps(df_books.values.tolist(), sort_keys=True).encode('utf-8')).hexdigest()
# print("df_books", md5_hash)
# assert md5_hash == "39d643d7c252ea60633e28cc3328ee82"

         ISBN                                         Book title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

                 Author Publication year                Publisher  
0    Mark P. O. Morford             2002  Oxford University Press  
1  Richard Bruce Wright             2001    HarperFlamingo Canada  
2          Carlo D'Este             1991          HarperPerennial  
3      Gina Bari Kolata             1999     Farrar Straus Giroux  
4       E. J. W. Barber             1999   W. W. Norton & Company  
---------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 271376 entries, 0 to 271378
Data columns (total 5 columns):
 #   Column   

In [6]:
ibsn_selected = json.load(open(os.path.join(target_dir, 'isbn2id.json')))

In [7]:
# filter ISBN with less than 10 records.
df_books['ISBN_check'] = df_books['ISBN'].apply(lambda x: isin_selected(x, ibsn_selected))
df_books = df_books[df_books['ISBN_check'] == 1]
df_books = df_books[book_fields]

In [8]:
df_books.describe()

Unnamed: 0,ISBN,Book title,Author,Publication year,Publisher
count,3547,3547,3547,3547,3547
unique,3547,3164,1199,44,310
top,3803112133,Bridget Jones's Diary,Stephen King,2002,Berkley Publishing Group
freq,1,5,97,399,197


In [30]:
# Encode features

from tqdm import tqdm

def add_to_dict(dict, feature):
    if feature not in dict:
        dict[feature] = len(dict)

feature_dict = {field : {} for field in user_fields + book_fields}
user_dict = {}
book_dict = {}

for idx, row in df_users.iterrows():
    if row["User ID"] not in user_dict:
        user_dict[row["User ID"]] = [row["Location"], row["Age"]]
    for field in user_fields:
        add_to_dict(feature_dict[field], row[field])

for idx, row in df_books.iterrows():
    if row["ISBN"] not in book_dict:
        book_dict[row["ISBN"]] = [row["Book title"], row["Author"], row["Publication year"], row["Publisher"]]
    for field in book_fields:
        add_to_dict(feature_dict[field], row[field])

feature_count = [len(feature_dict[field]) for field in user_fields + book_fields]

for field in user_fields:
    print(field, len(feature_dict[field]))
    assert len(feature_dict[field]) == len(set(list(df_users[field])))

for field in book_fields:
    print(field, len(feature_dict[field]))
    assert len(feature_dict[field]) == len(set(list(df_books[field])))


User ID 8723
Location 109
Age 11
ISBN 3547
Book title 3164
Author 1199
Publication year 44
Publisher 310


In [341]:
import json
json.dump(book_dict, open(os.path.join(target_dir, "book_dict.json"), "w"), indent=4)

In [342]:
len(book_dict)

3547

In [343]:
len(feature_dict["ISBN"])

3547

In [None]:
# cnt = 0
# with open(os.path.join(source_dir, "BX-Book-Ratings.csv"), 'r', encoding='cp1252') as f:
#     for line in f.readlines():
#         split_line = line.strip().split(';')
#         split_line = [item[1:-1] for item in split_line]
#         if cnt < 3:
#             print(line)
#         cnt += 1


"User-ID";"ISBN";"Book-Rating"

"276725";"034545104X";"0"

"276726";"0155061224";"5"



In [None]:
# import pandas as pd

# def filter_10_core(data, user_col, item_col):
#     """
#     Iteratively filters the dataset to ensure every user and item has at least 10 interactions.
    
#     :param data: The raw dataset as a Pandas DataFrame.
#     :param user_col: Column name for users.
#     :param item_col: Column name for items.
#     :return: Filtered DataFrame where each user and item has at least 10 interactions.
#     """
#     while True:

#         # Filter users with at least 10 history interactions but no more than 200
#         user_counts = data[user_col].value_counts()
#         valid_users = user_counts[(user_counts > 10)&(user_counts <= 200)].index
#         data = data[data[user_col].isin(valid_users)]

#         # Filter items with at least 10 interactions
#         item_counts = data[item_col].value_counts()
#         valid_items = item_counts[item_counts >= 10].index
#         data = data[data[item_col].isin(valid_items)]
        
#         # Check if the dataset is stable (no more filtering needed)
#         if len(valid_users) == len(user_counts) and len(valid_items) == len(item_counts):
#             break

#     return data


# # Example usage:
# # Assuming you have a dataset `df` with columns 'user_id' and 'item_id'
# processed_list1 = []
# with open(os.path.join(source_dir, "BX-Book-Ratings.csv"), 'r', encoding='cp1252') as f:
#     for line in f.readlines():
#         split_line = line.strip().split(';')
#         split_line = [item[1:-1] for item in split_line]
#         processed_list1.append(split_line)
#     column_list = processed_list1[0]
#     processed_list1.pop(0)
# df1 = pd.DataFrame(processed_list1, columns=['User ID', 'ISBN', "Rating"])
# filtered_df = filter_10_core(df1, user_col='User ID', item_col='ISBN')

# # Save the filtered dataset
# filtered_df.to_csv(os.path.join(source_dir, "bookcrossing_10_core.csv"), index=False)

In [None]:
# filtered_df.describe()

Unnamed: 0,User ID,ISBN,Rating
count,89744,89744,89744
unique,3575,3665,11
top,43619,971880107,0
freq,146,531,51654


In [None]:
# user_counts = filtered_df['User ID'].value_counts()
# multi_occurrences = user_counts[user_counts > 10].index
# len(multi_occurrences)

# book_counts = filtered_df['ISBN'].value_counts()
# multi_occurrences = book_counts[book_counts >= 10].index
# len(multi_occurrences)

3575

In [11]:
# Read ratings

processed_list = []
with open(os.path.join(source_dir, "BX-Book-Ratings.csv"), 'r', encoding='cp1252') as f:
    for line in f.readlines():
        split_line = line.strip().split(';')
        split_line = [item[1:-1] for item in split_line]
        processed_list.append(split_line)
    column_list = processed_list[0]
    processed_list.pop(0)

user_hist, hist_rating, labels = {}, {}, {}
for user, isbn, rating in processed_list:
    if user in feature_dict["User ID"] and isbn in ibsn_selected:
        if user not in user_hist:
            user_hist[user] = []
            hist_rating[user] = []
            labels[user] = []
        user_hist[user].append(isbn)
        hist_rating[user].append(int(rating))
        labels[user].append(int(int(rating) >= 5))

In [None]:
# processed_list = []
# filtered_df = pd.read_csv(os.path.join(source_dir, "bookcrossing_10_core.csv"))
# columns = filtered_df.columns
# for idx, line in filtered_df.iterrows():
#     split_line = [str(line[col]) for col in columns]
#     processed_list.append(split_line)
    
# item_cnt = {}
# for user, isbn, rating in processed_list:
#     if isbn not in item_cnt:
#         item_cnt[isbn] = 0
#     item_cnt[isbn] += 1

In [None]:
# user_hist, hist_rating, labels = {}, {}, {}
# for user, isbn, rating in processed_list:
#     if user in feature_dict["User ID"] and isbn in feature_dict["ISBN"]:
#         if user not in user_hist:
#             user_hist[user] = []
#             hist_rating[user] = []
#             labels[user] = []
#         user_hist[user].append(isbn)
#         hist_rating[user].append(int(rating))
#         labels[user].append(int(int(rating) >= 5))

In [12]:
len(user_hist)

46447

In [10]:
len(feature_dict["ISBN"])

3547

In [18]:
# filter users who rated no more than 5 books
user_del = []
for user, hist in user_hist.items():
    if len(hist) < 5:
        user_del.append(user)

print("Number of users deleted:", len(user_del))

for user in user_del:
    del user_hist[user]
    del hist_rating[user]
    del labels[user]


Number of users deleted: 37724


In [19]:
len(user_hist)

8723

In [20]:
data_list = []
book_list = set()
for user in user_hist.keys():
    zipped_data = list(zip(user_hist[user], hist_rating[user], labels[user]))
    set_seed(42)
    random.shuffle(zipped_data)
    user_hist[user], hist_rating[user], labels[user] = map(list, zip(*zipped_data))
    isbn = user_hist[user][-1]
    data_sample = copy.deepcopy([user] + user_dict[user] + [isbn] + book_dict[isbn] +
                                    [user_hist[user][:-1]] + [hist_rating[user][:-1]] + [labels[user][-1]] + [hist_rating[user][-1]])
    data_list.append(data_sample)
    book_list.update(user_hist[user][:])

print(len(data_list))
print(len(book_list))

8723
3547


In [21]:
ibsn_selected = {}
for ibsn in book_list:
    ibsn_selected[ibsn] = 1

user_selected = {}
for user in user_hist:
    user_selected[user] = 1

In [22]:
values = []
for user in labels.keys():
    values.append(labels[user][-1])
print(max(values),sum(values)/len(values))

1 0.46222629829187206


In [23]:
len(user_selected)

8723

In [24]:
cnt = 0
count = {}
for user, hist in user_hist.items():
    cnt += len(hist) - 1
    if len(hist)-1 not in count:
        count[len(hist)-1] = 0
    count[len(hist)-1] += 1
print("avg", cnt/11124)

print("*"*50)
print("Hist lens / Number of users")
for cnt in sorted(count.keys()):
    print(cnt, count[cnt])

avg 19.68824163969795
**************************************************
Hist lens / Number of users
4 1391
5 983
6 739
7 574
8 406
9 273
10 365
11 292
12 243
13 214
14 176
15 176
16 160
17 133
18 126
19 98
20 106
21 89
22 80
23 88
24 70
25 77
26 60
27 57
28 58
29 44
30 71
31 43
32 49
33 34
34 34
35 34
36 31
37 38
38 31
39 54
40 23
41 26
42 15
43 24
44 29
45 25
46 28
47 31
48 26
49 23
50 17
51 17
52 23
53 18
54 15
55 20
56 15
57 12
58 15
59 17
60 18
61 14
62 14
63 12
64 11
65 10
66 16
67 17
68 9
69 15
70 12
71 14
72 14
73 15
74 9
75 14
76 11
77 7
78 6
79 7
80 10
81 7
82 4
83 10
84 6
85 7
86 7
87 7
88 9
89 8
90 6
91 9
92 9
93 6
94 6
95 8
96 5
97 4
98 5
99 9
100 4
101 2
102 6
103 8
104 5
105 3
106 10
107 8
108 7
109 5
110 1
111 4
112 4
113 3
114 4
115 3
116 5
117 4
118 4
119 3
120 4
121 4
122 4
123 3
124 4
125 7
126 4
127 2
129 5
130 2
131 5
132 4
133 4
134 6
135 2
136 3
137 7
139 4
140 2
141 3
142 4
143 2
144 2
145 4
147 1
148 2
149 4
150 1
151 1
152 4
154 4
155 4
156 3
157 2
158 2
159 

In [25]:
set_seed(42)
random.shuffle(data_list)
df_data = pd.DataFrame(data_list, columns=user_fields + book_fields + ["user_hist", "hist_rating" , "labels", "rating"])
print(f"Total number of samples: {len(df_data)}")

df_data.head(20)

Total number of samples: 8723


Unnamed: 0,User ID,Location,Age,ISBN,Book title,Author,Publication year,Publisher,user_hist,hist_rating,labels,rating
0,242790,USA,unknown,0312976275,Hot Six : A Stephanie Plum Novel (A Stephanie ...,Janet Evanovich,2001,St. Martin's Paperbacks,"[0449003787, 0312980140, 0425155404, 0449003795]","[6, 10, 0, 9]",0,0
1,205059,India,60+,0099244926,The Street Lawyer,John Grisham,1999,Random House Uk Ltd,"[0449227421, 0440211727, 0440236673, 0449227545]","[0, 0, 4, 0]",0,0
2,232708,USA,under 18,0345402871,Airframe,Michael Crichton,1997,Ballantine Books,"[0425179559, 0425132935, 0425141551, 0425182886]","[10, 9, 10, 8]",1,10
3,202783,USA,unknown,0345413350,"The Golden Compass (His Dark Materials, Book 1)",PHILIP PULLMAN,1997,Del Rey,"[0380789019, 0345413369, 0345413377, 0439049962]","[9, 5, 0, 5]",0,0
4,116972,USA,40-44,0446611778,Last Man Standing,David Baldacci,2002,Warner Vision,"[0440241073, 0345426274, 0380789035, 044660466...","[7, 0, 5, 8, 0]",0,0
5,67174,USA,18-24,014100018X,Chocolat,Joanne Harris,2000,Penguin Books,"[0553562614, 0449212602, 0553213695, 0886773520]","[9, 0, 6, 7]",1,9
6,265751,USA,unknown,0802130208,A Confederacy of Dunces (Evergreen Book),John Kennedy Toole,1987,Grove Press,"[068484477X, 0451166892, 044022103X, 068486574...","[10, 10, 9, 8, 7, 9, 10, 9, 8, 8]",1,7
7,28735,Germany,55-59,0312971230,Ice Station,Matthew J. Reilly,2000,St. Martin's Press,"[0440993717, 0671041789, 0971880107, 158322489...","[8, 0, 6, 8, 9, 0, 0]",0,0
8,72352,Spain,45-49,0440110653,"CRY IN THE NIGHT, A",MARY HIGGINS CLARK,1983,Dell,"[067101417X, 0440237556, 0446364703, 0671797050]","[0, 0, 6, 0]",1,7
9,245864,USA,45-49,0553569058,The Robber Bride,Margaret Atwood,1995,Bantam,"[0446606383, 0440235596, 0446329894, 042515863...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, ...",0,0


In [28]:
# Save train/test in parquet format

df_train = df_data[:int(0.9 * len(df_data))].reset_index(drop=True)
df_test = df_data[int(0.9 * len(df_data)):].reset_index(drop=True)

print(f"Train num: {len(df_train)}")
print(f"Test num: {len(df_test)}")

df_train.to_parquet(
    os.path.join(target_dir, "train.parquet.gz"), 
    compression="gzip", 
)
df_test.to_parquet(
    os.path.join(target_dir, "test.parquet.gz"), 
    compression="gzip", 
)

Train num: 7850
Test num: 873


In [29]:
# Re-read for sanity check

train_dataset = pd.read_parquet(os.path.join(target_dir, "train.parquet.gz"))
test_dataset = pd.read_parquet(os.path.join(target_dir, "test.parquet.gz"))

for (i1, a1), (i2, a2) in zip(df_train.iterrows(), train_dataset.iterrows()):
    for field in user_fields + book_fields + ["labels"]:
        assert not isinstance(a1[field], str) or "\t" not in a1[field]
        assert a1[field] == a2[field], (field, a1[field], a2[field])
for (i1, a1), (i2, a2) in zip(df_test.iterrows(), test_dataset.iterrows()):
    for field in user_fields + book_fields + ["labels"]:
        assert not isinstance(a1[field], str) or "\t" not in a1[field]
        assert a1[field] == a2[field], (field, a1[field], a2[field])

In [31]:
# Save the meta data for CTR

field_names = user_fields + book_fields

feature_count = [len(feature_dict[field]) for field in field_names]

feature_offset = [0]
for c in feature_count[:-1]:
    feature_offset.append(feature_offset[-1] + c)

for field in field_names:
    print(field, len(feature_dict[field]))

meta_data = {
    'field_names': field_names,
    'feature_count': feature_count,
    'feature_dict': feature_dict,
    'feature_offset': feature_offset,
    'num_ratings': 11
}

json.dump(meta_data, open(os.path.join(target_dir, 'ctr-meta.json'), 'w'))

User ID 8723
Location 109
Age 11
ISBN 3547
Book title 3164
Author 1199
Publication year 44
Publisher 310


In [32]:
book_dict = json.load(open(os.path.join(target_dir, 'book_dict.json')))
meta_data = json.load(open(os.path.join(target_dir, 'ctr-meta.json')))
isbn2id = meta_data['feature_dict']['ISBN']
id2book = {book_id: [isbn] + book_dict[isbn] for isbn, book_id in isbn2id.items()}
json.dump(id2book, open(os.path.join(target_dir, 'id2book_1.json'), "w"), indent=4)
json.dump(isbn2id, open(os.path.join(target_dir, 'isbn2id_1.json'), "w"), indent=4)

In [33]:
# Convert df_data to CTR data via feature_dict

ctr_X, ctr_Y = [], []
for idx, row in df_data.iterrows():
    ctr_X.append([feature_dict[field][row[field]] for field in field_names])
    ctr_Y.append(int(row["labels"]))


ctr_X = np.array(ctr_X)
ctr_Y = np.array(ctr_Y)
print("ctr_X", ctr_X.shape)
print("ctr_Y", ctr_Y.shape)
feature_count_np = np.array(feature_count).reshape(1, -1)
assert (ctr_X - feature_count_np <= 0).sum() == ctr_X.shape[0] * ctr_X.shape[1]
assert (ctr_Y == 0).sum() + (ctr_Y == 1).sum() == ctr_Y.shape[0]

ctr_X (8723, 8)
ctr_Y (8723,)


In [34]:
history_column = {}

# history_column["ID"] = df_data['user_hist'].tolist()
history_column["ID"] = [[isbn2id[x] for x in hist] for hist in df_data['user_hist'].tolist()]
history_column["rating"] = df_data['hist_rating'].tolist()
history_column["hist length"] = [len(x) for x in history_column["rating"]]

train_num = int(0.9 * len(ctr_X))

user_seq = {
    "history ID": {
        "train": history_column["ID"][:train_num],
        "test": history_column["ID"][train_num:],
    },
    "history rating": {
        "train": history_column["rating"][:train_num],
        "test": history_column["rating"][train_num:],
    },
    "history length": {
        "train": history_column["hist length"][:train_num],
        "test": history_column["hist length"][train_num:],
    },
}

json.dump(user_seq, open(os.path.join(target_dir, "user_seq.json"), "w"), ensure_ascii=False)



In [35]:
len(user_seq["history ID"]["test"])

873

In [36]:
import torch
from torch.nn.utils.rnn import pad_sequence

user_seq_trunc = {
    "history ID": {}, 
    "history rating": {}, 
    "history mask": {}, 
}

for hist_name in user_seq:
    for split in user_seq[hist_name]:
        if hist_name != "history length":
            user_seq_trunc[hist_name][split] = pad_sequence(
                [torch.tensor(x[-60:]) for x in user_seq[hist_name][split]], 
                batch_first=True, 
            )
        else:
            user_seq_trunc["history mask"][split] = pad_sequence(
                [torch.ones(min(x, 60)) for x in user_seq[hist_name][split]], 
                batch_first=True, 
            )

md5_user_seq_trunc = {}
for hist_name in user_seq_trunc:
    md5_user_seq_trunc[hist_name] = {}
    for split in user_seq_trunc[hist_name]:
        md5_user_seq_trunc[hist_name][split] = user_seq_trunc[hist_name][split].tolist()
        print(hist_name, split, user_seq_trunc[hist_name][split].shape)

history ID train torch.Size([7850, 60])
history ID test torch.Size([873, 60])
history rating train torch.Size([7850, 60])
history rating test torch.Size([873, 60])
history mask train torch.Size([7850, 60])
history mask test torch.Size([873, 60])


In [37]:
# Save CTR data

with h5py.File(os.path.join(target_dir, f'ctr.h5'), 'w') as hf:
    hf.create_dataset('train data', data=ctr_X[:int(0.9 * len(ctr_X)), :])
    hf.create_dataset('test data', data=ctr_X[int(0.9 * len(ctr_X)):, :])
    hf.create_dataset('train label', data=ctr_Y[:int(0.9 * len(ctr_X))])
    hf.create_dataset('test label', data=ctr_Y[int(0.9 * len(ctr_X)):])
    for hist_name in user_seq_trunc:
        for split in user_seq_trunc[hist_name]:
            hf.create_dataset(f"{split} {hist_name}", data=user_seq_trunc[hist_name][split])


with h5py.File(os.path.join(target_dir, f'ctr.h5'), 'r') as hf:
    assert (ctr_X - np.concatenate([hf['train data'][:], hf['test data'][:]], axis=0)).sum() == 0
    assert (ctr_Y - np.concatenate([hf['train label'][:], hf['test label'][:]], axis=0)).sum() == 0
    for hist_name in user_seq_trunc:
        for split in user_seq_trunc[hist_name]:
            assert (user_seq_trunc[hist_name][split] - hf[f"{split} {hist_name}"][:]).sum() == 0    

    x = hf['train data'][:]
    assert (x - ctr_X[:int(0.9 * len(ctr_X)), :]).sum() == 0
    print(f'train data: {x.shape}')
    
    x = hf['test data'][:]
    assert (x - ctr_X[int(0.9 * len(ctr_X)):, :]).sum() == 0
    print(f'test data: {x.shape}')
    x = hf['train label'][:]
    assert (x - ctr_Y[:int(0.9 * len(ctr_X))]).sum() == 0
    print(f'train label: {x.shape}')
    x = hf['test label'][:]
    assert (x - ctr_Y[int(0.9 * len(ctr_X)):]).sum() == 0
    print(f'test label: {x.shape}')



train data: (7850, 8)
test data: (873, 8)
train label: (7850,)
test label: (873,)


In [38]:
# Final check: ensure each row from tsv and ctr is matched

train_dataset = pd.read_parquet(os.path.join(target_dir, 'train.parquet.gz'))
test_dataset = pd.read_parquet(os.path.join(target_dir, 'test.parquet.gz')).reset_index(drop=True)


with h5py.File(os.path.join(target_dir, f'ctr.h5'), 'r') as hf:
    train_x = hf['train data'][:]
    train_y = hf['train label'][:]
    test_x = hf['test data'][:]
    test_y = hf['test label'][:]

for idx, row in train_dataset.iterrows():
    for fi, field in enumerate(field_names):
        assert feature_dict[field][row[field]] == train_x[idx, fi]
    assert int(row["labels"]) == train_y[idx]

for idx, row in test_dataset.iterrows():
    for fi, field in enumerate(field_names):
        assert feature_dict[field][row[field]] == test_x[idx, fi]
    assert int(row["labels"]) == test_y[idx]

print("Pass final check.")

Pass final check.


In [39]:
## split for training recommend models
rating_data = []
rating_fields = ["User ID", "ISBN", "rating"]
for user, isbn, rating in processed_list:
    if user in feature_dict["User ID"] and isbn in feature_dict["ISBN"]:
        rating_data.append([user, isbn, rating])

df_ratings = pd.DataFrame(rating_data, columns=rating_fields)
print("Data set shape:", df_ratings.shape)

# Sort by user_id and timestamp
df_ratings = df_ratings.sort_values(by=['User ID'])

Data set shape: (227735, 3)


In [40]:
df_ratings.describe()

Unnamed: 0,User ID,ISBN,rating
count,227735,227735,227735
unique,8723,3547,11
top,11676,971880107,0
freq,2117,1129,147856


In [41]:
book_counts = df_ratings['ISBN'].value_counts()
multi_occurrences = book_counts[book_counts >= 5].index
len(multi_occurrences)

3547

In [42]:
# Group by each user and filter for users with more than 5 ratings
user_groups = df_ratings.groupby('User ID').filter(lambda x: len(x) > 5)

# Extract the last record for each user in the test set and the rest as training
tail_set = user_groups.groupby('User ID').tail(1)
test_set = tail_set[tail_set['ISBN'].isin(multi_occurrences)]
train_set = df_ratings.drop(test_set.index)

# Display the shapes to confirm the split
print("Training set shape:", train_set.shape)
print("Test set shape:", test_set.shape)

Training set shape: (220403, 3)
Test set shape: (7332, 3)


In [43]:
# Save the data as .txt files
df_train_rec = train_set.sort_values(by=["User ID", "ISBN"], inplace=False, kind="stable")
df_test_rec = test_set.sort_values(by=["User ID", "ISBN"], inplace=False, kind="stable")
df_train_rec.to_csv(os.path.join(target_dir, "train.txt"), sep=' ', index=False, header=None)
df_test_rec.to_csv(os.path.join(target_dir, "test.txt"), sep=' ', index=False, header=None)