In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("data/BX-Book-Ratings.csv", delimiter=";", encoding="iso-8859-1")
df.columns = ["userID", "isbn", "rating"]
df.head()

Unnamed: 0,userID,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [2]:
df2 = pd.read_csv("data/BX-Books.csv", delimiter=";", encoding="iso-8859-1", error_bad_lines=False)
df2.columns = ["isbn", "title", "author", "pubyear", "publisher", "img_s", "img_m", "img_l"]

df = df.merge(df2, on="isbn")
df.head()

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,userID,isbn,rating,title,author,pubyear,publisher,img_s,img_m,img_l
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [3]:
from sklearn.preprocessing import LabelEncoder

book_encoder = LabelEncoder()
df["bookID"] = book_encoder.fit_transform(df["isbn"])

book_lookup = dict(zip(df["isbn"], df["title"]))

df = df[["userID", "bookID", "rating"]]
df.head()

Unnamed: 0,userID,bookID,rating
0,276725,45921,0
1,2313,45921,5
2,6543,45921,0
3,8680,45921,5
4,10314,45921,9


In [4]:
df.shape

(1031136, 3)

In [5]:
df["bookID"].nunique()

270151

In [6]:
df["userID"].nunique()

92106

In [7]:
# remove explicit feedback data

interested = (df["rating"] == 0) | (df["rating"] >= 5)
df["interest"] = 0
df["interest"][interested] = 1
df.head(10)

Unnamed: 0,userID,bookID,rating,interest
0,276725,45921,0,1
1,2313,45921,5,1
2,6543,45921,0,1
3,8680,45921,5,1
4,10314,45921,9,1
5,23768,45921,0,1
6,28266,45921,0,1
7,28523,45921,0,1
8,39002,45921,0,1
9,50403,45921,9,1


In [8]:
df["bookID"].nunique()

270151

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031136 entries, 0 to 1031135
Data columns (total 4 columns):
userID      1031136 non-null int64
bookID      1031136 non-null int64
rating      1031136 non-null int64
interest    1031136 non-null int64
dtypes: int64(4)
memory usage: 39.3 MB


In [10]:
book_frequency = df.groupby("bookID").count()[["rating"]]
book_frequency.head()

Unnamed: 0_level_0,rating
bookID,Unnamed: 1_level_1
0,1
1,2
2,1
3,1
4,1


In [11]:
top_10k_books = book_frequency["rating"].nlargest(10000).index.values

for book in top_10k_books[:5]:
    print(book_lookup[book_encoder.inverse_transform(book)])

Wild Animus
The Lovely Bones: A Novel
The Da Vinci Code
Divine Secrets of the Ya-Ya Sisterhood: A Novel
The Red Tent (Bestselling Backlist)


In [12]:
top_books_df = df.loc[df["bookID"].isin(top_10k_books)]
top_books_df.head()

Unnamed: 0,userID,bookID,rating,interest
0,276725,45921,0,1
1,2313,45921,5,1
2,6543,45921,0,1
3,8680,45921,5,1
4,10314,45921,9,1


In [13]:
top_books_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 432147 entries, 0 to 882115
Data columns (total 4 columns):
userID      432147 non-null int64
bookID      432147 non-null int64
rating      432147 non-null int64
interest    432147 non-null int64
dtypes: int64(4)
memory usage: 16.5 MB


In [14]:
ratings_df = pd.pivot_table(top_books_df, index="userID", columns="bookID", values="rating", fill_value=0)

ratings_df.head()

MemoryError: 

In [None]:
from scipy.sparse import coo_matrix, csr_matrix

sparse_ratings = csr_matrix(ratings_df)

train_sparse = sparse_ratings.copy()

test_sparse = sparse_ratings.copy()
test_sparse[test_sparse != 0] = 1

sparse_ratings.shape

In [None]:
import random

random.seed(17)

nonzero_inds = train_sparse.nonzero()
nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))

num_samples = int(np.ceil(0.1 * len(nonzero_pairs)))
samples = random.sample(nonzero_pairs, num_samples)

user_inds = [index[0] for index in samples]
artist_inds = [index[1] for index in samples]

train_sparse[user_inds, artist_inds] = 0
train_sparse.eliminate_zeros()

altered_users = np.sort(list(set(user_inds)))

altered_samples = {}
for user, artist in samples:
    if user in altered_samples:
        altered_samples[user].append(artist)
    else:
        altered_samples[user] = [artist]

In [None]:
from lightfm import LightFM

train_coo = train_sparse.tocoo()
test_coo = test_sparse.tocoo()

model = LightFM(loss="warp")
model.fit(train_coo, epochs=10)

In [None]:
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

train_precision = precision_at_k(model, train_coo, k=10).mean()
test_precision = precision_at_k(model, test_coo, k=10).mean()

print(train_precision, test_precision)

train_recall = recall_at_k(model, train_coo, k=10).mean()
test_recall = recall_at_k(model, test_coo, k=10).mean()

print(train_recall, test_recall)

train_auc = auc_score(model, train_coo).mean()
test_auc = auc_score(model, test_coo).mean()

print(train_auc, test_auc)