In [22]:
import numpy as np
import pandas as pd

df = pd.read_csv("data/BX-Book-Ratings.csv", delimiter=";", encoding="iso-8859-1")
df.columns = ["userID", "isbn", "rating"]
df.head()

Unnamed: 0,userID,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [23]:
df2 = pd.read_csv("data/BX-Books.csv", delimiter=";", encoding="iso-8859-1", error_bad_lines=False)
df2.columns = ["isbn", "title", "author", "pubyear", "publisher", "img_s", "img_m", "img_l"]

df = df.merge(df2, on="isbn")
df.head()

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,userID,isbn,rating,title,author,pubyear,publisher,img_s,img_m,img_l
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [24]:
from sklearn.preprocessing import LabelEncoder

book_encoder = LabelEncoder()
df["bookID"] = book_encoder.fit_transform(df["isbn"])

book_lookup = dict(zip(df["isbn"], df["title"]))

df = df[["userID", "bookID", "rating"]]
df.head()

Unnamed: 0,userID,bookID,rating
0,276725,45921,0
1,2313,45921,5
2,6543,45921,0
3,8680,45921,5
4,10314,45921,9


In [25]:
df.shape

(1031136, 3)

In [26]:
df["bookID"].nunique()

270151

In [27]:
df["userID"].nunique()

92106

In [31]:
# remove explicit feedback data

interested = (df["rating"] == 0) | (df["rating"] >= 5)
df["interest"] = 0
df["interest"].loc[:, interested] = 1
df.head(10)

IndexingError: (slice(None, None, None), 0          True
1          True
2          True
3          True
4          True
5          True
6          True
7          True
8          True
9          True
10         True
11         True
12         True
13         True
14         True
15         True
16         True
17         True
18         True
19         True
20         True
21         True
22         True
23         True
24         True
25         True
26         True
27         True
28         True
29         True
           ... 
1031106    True
1031107    True
1031108    True
1031109    True
1031110    True
1031111    True
1031112    True
1031113    True
1031114    True
1031115    True
1031116    True
1031117    True
1031118    True
1031119    True
1031120    True
1031121    True
1031122    True
1031123    True
1031124    True
1031125    True
1031126    True
1031127    True
1031128    True
1031129    True
1031130    True
1031131    True
1031132    True
1031133    True
1031134    True
1031135    True
Name: rating, Length: 1031136, dtype: bool)

In [None]:
df["bookID"].nunique()

In [None]:
df.info()

In [None]:
book_frequency = df.groupby("bookID").count()[["rating"]]
book_frequency.head()

In [None]:
top_10k_books = book_frequency["rating"].nlargest(10000).index.values

for book in top_10k_books[:5]:
    print(book_lookup[book_encoder.inverse_transform(book)])

In [None]:
top_books_df = df.loc[df["bookID"].isin(top_10k_books)]
top_books_df.head()

In [None]:
top_books_df.info()

In [None]:
ratings_df = pd.pivot_table(top_books_df, index="userID", columns="bookID", values="rating", fill_value=0)

ratings_df.head()

In [None]:
from scipy.sparse import coo_matrix, csr_matrix

sparse_ratings = csr_matrix(ratings_df)

train_sparse = sparse_ratings.copy()

test_sparse = sparse_ratings.copy()
test_sparse[test_sparse != 0] = 1

sparse_ratings.shape

In [None]:
import random

random.seed(17)

nonzero_inds = train_sparse.nonzero()
nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))

num_samples = int(np.ceil(0.1 * len(nonzero_pairs)))
samples = random.sample(nonzero_pairs, num_samples)

user_inds = [index[0] for index in samples]
artist_inds = [index[1] for index in samples]

train_sparse[user_inds, artist_inds] = 0
train_sparse.eliminate_zeros()

altered_users = np.sort(list(set(user_inds)))

altered_samples = {}
for user, artist in samples:
    if user in altered_samples:
        altered_samples[user].append(artist)
    else:
        altered_samples[user] = [artist]

In [None]:
from lightfm import LightFM

train_coo = train_sparse.tocoo()
test_coo = test_sparse.tocoo()

model = LightFM(loss="warp")
model.fit(train_coo, epochs=10)

In [None]:
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

train_precision = precision_at_k(model, train_coo, k=10).mean()
test_precision = precision_at_k(model, test_coo, k=10).mean()

print(train_precision, test_precision)

train_recall = recall_at_k(model, train_coo, k=10).mean()
test_recall = recall_at_k(model, test_coo, k=10).mean()

print(train_recall, test_recall)

train_auc = auc_score(model, train_coo).mean()
test_auc = auc_score(model, test_coo).mean()

print(train_auc, test_auc)