In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS
import string
import re

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# change defaults
sns.set_context('notebook')
sns.set_style('darkgrid')
sns.set_palette('rainbow')
import csv

In [2]:
books = pd.read_csv('BX-Books.csv', sep=';', error_bad_lines=False, encoding="latin-1", engine = 'python')
users = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")


Skipping line 815: ';' expected after '"'
Skipping line 1679: ';' expected after '"'
Skipping line 2531: ';' expected after '"'
Skipping line 2640: ';' expected after '"'
Skipping line 3236: ';' expected after '"'
Skipping line 3256: ';' expected after '"'
Skipping line 4906: ';' expected after '"'
Skipping line 6452: ';' expected after '"'
Skipping line 8349: ';' expected after '"'
Skipping line 9669: ';' expected after '"'
Skipping line 9723: ';' expected after '"'
Skipping line 10010: ';' expected after '"'
Skipping line 10192: ';' expected after '"'
Skipping line 10754: ';' expected after '"'
Skipping line 10824: ';' expected after '"'
Skipping line 10862: ';' expected after '"'
Skipping line 11442: ';' expected after '"'
Skipping line 11584: ';' expected after '"'
Skipping line 11859: ';' expected after '"'
Skipping line 12206: ';' expected after '"'
Skipping line 12229: ';' expected after '"'
Skipping line 12719: ';' expected after '"'
Skipping line 12885: ';' expected after '"'


In [3]:
books.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1, inplace=True)
users.drop(['Location'], axis=1, inplace=True)

In [4]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [5]:
# Remove duplicate records
books.drop_duplicates(subset=['ISBN'], keep=False, inplace=True)
users.drop_duplicates(subset=['User-ID'], keep=False, inplace=True)

In [6]:
books.dropna(inplace=True)
users.dropna(inplace=True)
ratings.dropna(inplace=True)

In [7]:
# Convert data types
users['User-ID'] = users['User-ID'].astype(int)
books['Year-Of-Publication'] = pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
books['Year-Of-Publication'].fillna(books['Year-Of-Publication'].median(), inplace=True)
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(int)
ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)

In [8]:
user_ratings_count = ratings['User-ID'].value_counts()
book_ratings_count = ratings['ISBN'].value_counts()
users = users[users['User-ID'].isin(user_ratings_count[user_ratings_count >= 20].index)]
books = books[books['ISBN'].isin(book_ratings_count[book_ratings_count >= 20].index)]
ratings = ratings[ratings['User-ID'].isin(users['User-ID'])]
ratings = ratings[ratings['ISBN'].isin(books['ISBN'])]

In [9]:

ratings = ratings[ratings['Book-Rating'] != 0]

In [10]:
book_avg_rating = ratings.groupby('ISBN')['Book-Rating'].mean().reset_index()
book_avg_rating.columns = ['ISBN', 'book_avg_rating']
user_rating_count = ratings.groupby('User-ID')['Book-Rating'].count().reset_index()
user_rating_count.columns = ['User-ID', 'user_rating_count']

In [11]:
ratings = pd.merge(ratings, book_avg_rating, on='ISBN')
ratings = pd.merge(ratings, user_rating_count, on='User-ID')

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [13]:
df = pd.merge(books, ratings, on='ISBN')

PointWise Ranking

In [14]:
counts = df['Book-Rating'].value_counts()
df = df[df['Book-Rating'].isin(counts[counts >= 10].index)]

In [15]:
train, test = train_test_split(df, test_size=0.2)

In [16]:
y_train = train['Book-Rating'] > 5

In [17]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['Book-Title'])

In [18]:
model = LogisticRegression()
model.fit(X_train, y_train)


In [19]:
X_test = vectorizer.transform(test['Book-Title'])
y_test = test['Book-Rating'] > 5
y_pred = model.predict(X_test)

In [20]:
# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8816130225675176


In [21]:
X_all = vectorizer.transform(df['Book-Title'])
probs = model.predict_proba(X_all)[:, 1]

In [22]:
df['Recommendation Probability'] = probs
df = df.sort_values(by='Recommendation Probability', ascending=False)

In [23]:
n = 10
top_n = df.head(n)

In [38]:
top_n.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,book_avg_rating,user_rating_count,Recommendation Probability
33785,1400031346,The No. 1 Ladies' Detective Agency,Alexander McCall Smith,2002,Anchor (UK),23768,8,8.153846,56,0.976186
33800,1400031346,The No. 1 Ladies' Detective Agency,Alexander McCall Smith,2002,Anchor (UK),182506,10,8.153846,8,0.976186
33793,1400031346,The No. 1 Ladies' Detective Agency,Alexander McCall Smith,2002,Anchor (UK),64436,7,8.153846,33,0.976186
33795,1400031346,The No. 1 Ladies' Detective Agency,Alexander McCall Smith,2002,Anchor (UK),136326,8,8.153846,27,0.976186
33796,1400031346,The No. 1 Ladies' Detective Agency,Alexander McCall Smith,2002,Anchor (UK),25409,10,8.153846,92,0.976186


Pairwise Ranking 

In [24]:
!pip install lightfm
from lightfm import LightFM

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=867301 sha256=112b1308c726f897567f8107ef3eeac08d9b302b57df9f8d96fceeba1554e442
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [25]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [26]:
df = pd.merge(ratings, books, on='ISBN')


In [27]:
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,book_avg_rating,user_rating_count,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276762,380711524,5,8.125,1,See Jane Run,Joy Fielding,1992,Avon
1,28204,380711524,9,8.125,31,See Jane Run,Joy Fielding,1992,Avon
2,83287,380711524,10,8.125,51,See Jane Run,Joy Fielding,1992,Avon
3,132375,380711524,10,8.125,36,See Jane Run,Joy Fielding,1992,Avon
4,143294,380711524,8,8.125,9,See Jane Run,Joy Fielding,1992,Avon


In [36]:
util_matrix = df.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating').fillna(0)
