In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("data/BX-Book-Ratings.csv", delimiter=";", encoding="iso-8859-1")
df.columns = ["userID", "isbn", "rating"]
df.head()

Unnamed: 0,userID,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [2]:
df2 = pd.read_csv("data/BX-Books.csv", delimiter=";", encoding="iso-8859-1", error_bad_lines=False)
df2.columns = ["isbn", "title", "author", "pubyear", "publisher", "img_s", "img_m", "img_l"]

df = df.merge(df2, on="isbn")
df = df.dropna()
df.head()

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,userID,isbn,rating,title,author,pubyear,publisher,img_s,img_m,img_l
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [3]:
from sklearn.preprocessing import LabelEncoder

book_encoder = LabelEncoder()
df["bookID"] = book_encoder.fit_transform(df["isbn"])

book_lookup = dict(zip(df["isbn"], df["title"]))

df3 = df[["userID", "bookID", "rating"]]
df3.head()

Unnamed: 0,userID,bookID,rating
0,276725,45921,0
1,2313,45921,5
2,6543,45921,0
3,8680,45921,5
4,10314,45921,9


In [4]:
df3.shape

(1031129, 3)

In [5]:
df3["bookID"].nunique()

270145

In [6]:
df3["userID"].nunique()

92106

In [7]:
# remove implicit feedback data

df3 = df3.query("rating > 0")
df3["bookID"].nunique()

149832

In [8]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 383838 entries, 1 to 1031135
Data columns (total 3 columns):
userID    383838 non-null int64
bookID    383838 non-null int64
rating    383838 non-null int64
dtypes: int64(3)
memory usage: 11.7 MB


In [9]:
book_frequency = df3.groupby("bookID").count()[["rating"]]
book_frequency.head()

Unnamed: 0_level_0,rating
bookID,Unnamed: 1_level_1
0,1
2,1
4,1
6,1
10,2


In [10]:
top_k_books = book_frequency["rating"].nlargest(2000).index.values

for book in top_k_books[:10]:
    print(book, book_lookup[book_encoder.inverse_transform(book)], book_frequency.loc[book]['rating'])

38570 The Lovely Bones: A Novel 707
215950 Wild Animus 581
70798 The Da Vinci Code 487
32370 The Red Tent (Bestselling Backlist) 383
7344 Divine Secrets of the Ya-Ya Sisterhood: A Novel 320
125012 Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)) 313
21342 The Secret Life of Bees 307
93847 Where the Heart Is (Oprah's Book Club (Paperback)) 295
87397 A Painted House 281
103997 Girl with a Pearl Earring 278


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [11]:
top_books_df = df.loc[df["bookID"].isin(top_k_books)]
top_books_df.head()

Unnamed: 0,userID,isbn,rating,title,author,pubyear,publisher,img_s,img_m,img_l,bookID
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,45921
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,45921
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,45921
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,45921
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,45921


In [12]:
top_books_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220789 entries, 0 to 691238
Data columns (total 11 columns):
userID       220789 non-null int64
isbn         220789 non-null object
rating       220789 non-null int64
title        220789 non-null object
author       220789 non-null object
pubyear      220789 non-null object
publisher    220789 non-null object
img_s        220789 non-null object
img_m        220789 non-null object
img_l        220789 non-null object
bookID       220789 non-null int64
dtypes: int64(3), object(8)
memory usage: 20.2+ MB


In [13]:
ratings_df = pd.pivot_table(top_books_df, index="userID", columns="bookID", values="rating", fill_value=0)

ratings_df.head()

bookID,803,1104,1289,1462,1759,2086,3154,3219,3230,3263,...,253683,254484,255557,257623,259656,260359,260549,268008,268128,268365
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
from scipy.sparse import coo_matrix, csr_matrix

max_uid = np.max(top_books_df['userID'])
max_bid = np.max(top_books_df['bookID'])

sparse_ratings = csr_matrix((top_books_df['rating'], (top_books_df['userID'], top_books_df['bookID'])), shape=(max_uid + 1, max_bid + 1), dtype=np.float32)
# sparse_ratings = csr_matrix(ratings_df, dtype=np.float32)

# del df, book_frequency, top_k_books, top_books_df, ratings_df

train_sparse = sparse_ratings.copy()

test_sparse = sparse_ratings.copy()
test_sparse[test_sparse != 0] = 1

sparse_ratings.shape

(278855, 268366)

In [15]:
import random

random.seed(17)

nonzero_inds = train_sparse.nonzero()
nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))

num_samples = int(np.ceil(0.1 * len(nonzero_pairs)))
samples = random.sample(nonzero_pairs, num_samples)

user_inds = [index[0] for index in samples]
artist_inds = [index[1] for index in samples]

train_sparse[user_inds, artist_inds] = 0
train_sparse.eliminate_zeros()

altered_users = np.sort(list(set(user_inds)))

altered_samples = {}
for user, artist in samples:
    if user in altered_samples:
        altered_samples[user].append(artist)
    else:
        altered_samples[user] = [artist]

In [16]:
from lightfm import LightFM

train_coo = train_sparse.tocoo()
test_coo = test_sparse.tocoo()

model = LightFM(loss="warp")
model.fit(train_coo, epochs=10)

<lightfm.lightfm.LightFM at 0x7fd9a3a28fd0>

In [17]:
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

train_precision = precision_at_k(model, train_coo, k=3).mean()
test_precision = precision_at_k(model, test_coo, k=3).mean()

print("Precision:", train_precision, test_precision)

train_recall = recall_at_k(model, train_coo, k=3).mean()
test_recall = recall_at_k(model, test_coo, k=3).mean()

print("Recall:", train_recall, test_recall)

train_auc = auc_score(model, train_coo).mean()
test_auc = auc_score(model, test_coo, train_interactions=train_coo).mean()

print("AUC Score:", train_auc, test_auc)

Precision: 0.02152845 0.037018873
Recall: 0.033213920666095645 0.04500346853447644
AUC Score: 0.99877286 0.9982601


In [18]:
# test iterative training performance
def index_marks(nrows, chunk_size):
    return range(1 * chunk_size, (nrows // chunk_size + 1) * chunk_size, chunk_size)

def split(dfm, chunk_size):
    indices = index_marks(dfm.shape[0], chunk_size)
    return np.split(dfm, indices)

model = LightFM(loss="warp")

for split_df in split(top_books_df[['userID', 'bookID', 'rating']], 1000):
    sm = coo_matrix((split_df['rating'], (split_df['userID'], split_df['bookID'])), shape=(max_uid + 1, max_bid + 1))
    model.fit_partial(sm, epochs=5)

train_auc = auc_score(model, train_coo).mean()
test_auc = auc_score(model, test_coo, train_interactions=train_coo).mean()

print("AUC Score:", train_auc, test_auc)

AUC Score: 0.99751186 0.9968525


In [19]:
df = df[["bookID", "isbn", "title", "author", "pubyear", "publisher", "img_s", "img_m", "img_l"]].drop_duplicates()
df.to_csv("./data/processed/books.csv", index_label="id")

In [20]:
df3.to_csv("./data/processed/ratings.csv", index_label="id")

In [21]:
ratings_df.to_csv("./data/processed/pivot_ratings.csv", index_label="user_id")