In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import random
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('./Books_Data_Clean.csv')

In [3]:
df.columns

Index(['Publishing Year', 'Book Name', 'Author', 'language_code',
       'Author_Rating', 'Book_average_rating', 'Book_ratings_count', 'genre',
       'gross sales', 'publisher revenue', 'sale price', 'sales rank',
       'Publisher ', 'units sold'],
      dtype='object')

In [4]:
df = df.drop_duplicates(subset='Book Name', keep='first')
df.head()

Unnamed: 0,Publishing Year,Book Name,Author,language_code,Author_Rating,Book_average_rating,Book_ratings_count,genre,gross sales,publisher revenue,sale price,sales rank,Publisher,units sold
0,1975.0,Beowulf,"Unknown, Seamus Heaney",en-US,Novice,3.42,155903,genre fiction,34160.0,20496.0,4.88,1,HarperCollins Publishers,7000
1,1987.0,Batman: Year One,"Frank Miller, David Mazzucchelli, Richmond Lew...",eng,Intermediate,4.23,145267,genre fiction,12437.5,7462.5,1.99,2,HarperCollins Publishers,6250
2,2015.0,Go Set a Watchman,Harper Lee,eng,Novice,3.31,138669,genre fiction,47795.0,28677.0,8.69,3,"Amazon Digital Services, Inc.",5500
3,2008.0,When You Are Engulfed in Flames,David Sedaris,en-US,Intermediate,4.04,150898,fiction,41250.0,24750.0,7.5,3,Hachette Book Group,5500
4,2011.0,Daughter of Smoke & Bone,Laini Taylor,eng,Intermediate,4.04,198283,genre fiction,37952.5,22771.5,7.99,4,Penguin Group (USA) LLC,4750


In [5]:
df=df.drop(['Publishing Year','Author_Rating','sale price','gross sales','publisher revenue','Publisher '], axis=1)
df['language_code'] = df['language_code'].replace(['en-GB','en-US','en-CA'], 'eng')
set(df.get('language_code'))

{'ara', 'eng', 'fre', nan, 'nl', 'spa'}

In [6]:
# Drop all rows where any column has a NaN value
df_cleaned = df.dropna()
# If you want to reset the index after dropping rows
df_cleaned = df_cleaned.reset_index(drop=True)
df=df_cleaned
set(df.get('language_code'))

{'ara', 'eng', 'fre', 'nl', 'spa'}

In [7]:
book_id_to_name = dict(zip(df['Book Name'].astype('category').cat.codes, df['Book Name']))

df['Book Name'] = df['Book Name'].astype('category').cat.codes
df['Author'] = df['Author'].astype('category').cat.codes
df['language_code'] = df['language_code'].astype('category').cat.codes
df['genre'] = df['genre'].astype('category').cat.codes

In [8]:
df.head()

Unnamed: 0,Book Name,Author,language_code,Book_average_rating,Book_ratings_count,genre,sales rank,units sold
0,96,661,1,3.42,155903,2,1,7000
1,84,204,1,4.23,145267,2,2,6250
2,276,235,1,3.31,138669,2,3,5500
3,948,152,1,4.04,150898,1,3,5500
4,164,407,1,4.04,198283,2,4,4750


In [9]:
print('Total data size',len(df))

Total data size 998


In [10]:
X_train, X_test = train_test_split(df,test_size=0.2,random_state=42)

In [11]:
X_train.shape, X_test.shape

((798, 8), (200, 8))

In [12]:
class BookDataset(Dataset):
    def __init__(self, df):
        self.book_name = torch.tensor(df['Book Name'].values, dtype=torch.long)
        self.author = torch.tensor(df['Author'].values, dtype=torch.long)
        self.lang = torch.tensor(df['language_code'].values, dtype=torch.long)
        self.ratings_count = torch.tensor(df['Book_ratings_count'].values, dtype=torch.long)
        self.avg_rating = torch.tensor(df['Book_average_rating'].values, dtype=torch.float32)
        self.genre = torch.tensor(df['genre'].values, dtype=torch.long)
        self.sales_rank = torch.tensor(df['sales rank'].values, dtype=torch.long)
        self.units_sold = torch.tensor(df['units sold'].values, dtype=torch.long)
    def __len__(self):
        return len(self.book_name)
    def __getitem__(self, idx):
        return (
            self.book_name[idx], 
            self.author[idx], 
            self.lang[idx], 
            self.ratings_count[idx], 
            self.avg_rating[idx], 
            self.genre[idx], 
            self.sales_rank[idx],
            self.units_sold[idx]
        )

train_dataset = BookDataset(X_train)
test_dataset = BookDataset(X_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
print (len(train_dataset),len(test_dataset),len(train_dataset)+len(test_dataset))

798 200 998


In [13]:
class BookRecommendationModel(nn.Module):
    def __init__(self, num_books, num_authors, num_languages, num_genres):
        super(BookRecommendationModel, self).__init__()
        
        self.book_embedding = nn.Embedding(num_books, 50)
        self.author_embedding = nn.Embedding(num_authors, 10)
        self.lang_embedding = nn.Embedding(num_languages, 5)
        self.genre_embedding = nn.Embedding(num_genres, 10)
        
        self.fc1 = nn.Linear(50 + 10 + 5 + 10 + 1 + 1 + 1 + 1, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
    
    def forward(self, book, author, lang, ratings_count, avg_rating, genre, sales_rank, units_sold):
        book_emb = self.book_embedding(book)
        author_emb = self.author_embedding(author)
        lang_emb = self.lang_embedding(lang)
        #print(genre)

        genre_emb = self.genre_embedding(genre)
        
        x = torch.cat([book_emb, author_emb, lang_emb, genre_emb, ratings_count.unsqueeze(1).float(), avg_rating.unsqueeze(1).float(), sales_rank.unsqueeze(1).float(), units_sold.unsqueeze(1).float()],dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        embeddings = x
        x = self.fc3(x)
        return x, embeddings

In [14]:
'''
in this order
            self.book_name[idx], 
            self.author[idx], 
            self.lang[idx], 
            self.ratings_count[idx], 
            self.avg_rating[idx], 
            self.genre[idx], 
            self.sales_rank[idx],
            self.units_sold[idx]
'''

num_books = df['Book Name'].nunique()
num_authors = df['Author'].nunique()
num_languages = df['language_code'].nunique()
num_genres = df['genre'].nunique()

model = BookRecommendationModel(num_books, num_authors, num_languages, num_genres)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
costs = []
def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for book, author, lang, ratings_count, avg_rating, genre, sales_rank, units_sold in train_loader:
            optimizer.zero_grad()
            outputs, _ = model(book, author, lang, ratings_count, avg_rating, genre, sales_rank, units_sold)
            loss = criterion(outputs.squeeze(), avg_rating.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}')
        
train_model(model, train_loader, criterion, optimizer, num_epochs=50)

Epoch 1, Loss: 472205.212890625
Epoch 2, Loss: 56907.460274916426
Epoch 3, Loss: 21004.143334021934
Epoch 4, Loss: 5428.708470271184
Epoch 5, Loss: 1298.5066416813777
Epoch 6, Loss: 367.51610213059644
Epoch 7, Loss: 119.68659841097318
Epoch 8, Loss: 52.37412012540377
Epoch 9, Loss: 28.633227568406326
Epoch 10, Loss: 23.594075276301457
Epoch 11, Loss: 19.708483549264763
Epoch 12, Loss: 18.261761371905987
Epoch 13, Loss: 16.684018208430363
Epoch 14, Loss: 15.496722514812763
Epoch 15, Loss: 14.162885592533993
Epoch 16, Loss: 13.413637821490948
Epoch 17, Loss: 12.285075627840483
Epoch 18, Loss: 11.168742106511043
Epoch 19, Loss: 10.451807755690355
Epoch 20, Loss: 9.935318543360783
Epoch 21, Loss: 9.247640426342304
Epoch 22, Loss: 8.564735632676344
Epoch 23, Loss: 8.02027915074275
Epoch 24, Loss: 7.508404181553767
Epoch 25, Loss: 7.137778942401592
Epoch 26, Loss: 6.8513395969684305
Epoch 27, Loss: 6.7887276869553785
Epoch 28, Loss: 6.453699882213886
Epoch 29, Loss: 6.55537854708158
Epoch 30

In [15]:
def evaluate_model(model, test_loader):
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        for book, author, lang, ratings_count, avg_rating, genre, sales_rank, units_sold in test_loader:
            outputs,embeddings  = model(book, author, lang, ratings_count, avg_rating, genre, sales_rank, units_sold)
            loss = criterion(outputs.squeeze(), avg_rating.float())
            test_loss += loss.item()
    #print("Predicted score:", outputs)
    #print("Embeddings:", embeddings)    
    print(f'Test Loss: {test_loss / len(test_loader)}')
evaluate_model(model, test_loader)

Test Loss: 1.974026471376419


In [16]:
model.eval()
all_embeddings = []
with torch.no_grad():
    for batch in test_loader:
        book, author, lang, ratings_count, avg_rating, genre, sales_rank, units_sold = batch
        _, embeddings = model(book, author, lang, ratings_count, avg_rating, genre, sales_rank, units_sold)
        all_embeddings.append(embeddings)

all_embeddings = torch.cat(all_embeddings, dim=0)

num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(all_embeddings)
clusters = kmeans.labels_
def recommend_books(book_id, clusters, n_recommendations=5):
    book_cluster = clusters[book_id]
    cluster_books = [i for i, cluster in enumerate(clusters) if cluster == book_cluster]
    #cluster_books.remove(book_id)
    return cluster_books[:n_recommendations]


book_id = random.randint(0, len(test_dataset) - 1)
selected_book = test_dataset[book_id]

# Replace with actual book ID
recommended_books = recommend_books(book_id, clusters)

print("Recomming Books for:", book_id_to_name[selected_book[0].item()])
print("Recommended books:")
for book in recommended_books:
    print(book_id_to_name[test_dataset[book][0].item()])


Recomming Books for: Bridget Jones: The Edge of Reason
Recommended books:
A Memory of Light
The Story of Edgar Sawtelle
The Age of Miracles
All Creatures Great And Small
Bridget Jones: The Edge of Reason
