## Book recommendation system

#### Importing required libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import nltk
import ssl
import streamlit as st
import json
import random
import gzip
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

ssl._create_default_https_context = ssl._create_unverified_context

#### Load and preprocess data

In [5]:
# Function for taking needed data from the json file
def parse_fields(line):
    data = json.loads(line)
    return {
        "book_id": data["book_id"], 
        "title": data["title_without_series"], 
        "ratings": data["ratings_count"], 
        "url": data["url"], 
        "cover_image": data["image_url"]
    }

# Taking data that has ratings more than 5
books_titles = []
with gzip.open("C:\\Users\\sruth\\Desktop\\Work\\Git_projects\\Book recommendation\\Data\\goodreads_books.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_fields(line)
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            continue
        if ratings > 5:
            books_titles.append(fields)
            
# Preprocessing with pandas

titles = pd.DataFrame.from_dict(books_titles)
titles["ratings"] = pd.to_numeric(titles["ratings"])
titles["new_title"] = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
titles["new_title"] = titles["new_title"].str.lower()
titles["new_title"] = titles["new_title"].str.replace("\s+", " ", regex=True)
titles = titles[titles["new_title"].str.len() > 0]
titles.to_json("books_titles.json")
titles.info()            

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1773389 entries, 0 to 1782578
Data columns (total 6 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   book_id      object
 1   title        object
 2   ratings      int64 
 3   url          object
 4   cover_image  object
 5   new_title    object
dtypes: int64(1), object(5)
memory usage: 159.2+ MB


In [6]:
# Model traning with TfidfVectorizer 

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(titles["new_title"])

In [7]:
# Turn a search query into a vector and match it up with the matrix for comparison

def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

def search(query,vectorizer):
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -10)[-10:] # To find indices of top 10 values.
    results = titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)

    return results.head(1)#.style.format({'url': make_clickable, 'cover_image': show_image})

In [8]:
# Taking book ids from book_id_map file

csv_book_mapping = {}

with open("C:\\Users\\sruth\\Desktop\\Work\\Git_projects\\Book recommendation\\Data\\book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id
        
books_titles = pd.read_json("C:\\Users\\sruth\\books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

In [10]:
# Finding users who liked our books and find top recs by similar readers
liked_books = (search("rebecca", vectorizer))["book_id"].tolist()
overlap_users = set()
with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in overlap_users:
            continue

        try:
            rating = int(rating)
        except ValueError:
            continue
        
        book_id = csv_book_mapping[csv_id]
        
        if book_id in liked_books and rating >= 4:
                overlap_users.add(user_id)

In [11]:
# Find top recs by similar readers

recs = []

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in overlap_users:
            book_id = csv_book_mapping[csv_id]
            recs.append([user_id, book_id, rating])

recs = pd.DataFrame(recs, columns=["user_id", "book_id", "rating"])
recs["book_id"] = recs["book_id"].astype(str)

top_recs = recs["book_id"].value_counts().head(10)
top_recs = top_recs.index.values
top_recs
books_titles[books_titles["book_id"].isin(top_recs)]

all_recs = recs["book_id"].value_counts()
all_recs = all_recs.to_frame().reset_index()
all_recs.columns = ["book_id", "book_count"]

In [12]:
# Popular recs
all_recs = all_recs.merge(books_titles, how="inner", on="book_id")
all_recs.sort_values(by=['book_count'])
all_recs

popular_recs = all_recs.head(10)

popular_recs[~popular_recs["book_id"].isin(liked_books)].head(10).style.format({'url': make_clickable, 'cover_image': show_image})         

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,new_title
0,136251,5,"Harry Potter and the Deathly Hallows (Harry Potter, #7)",1784684,Goodreads,,harry potter and the deathly hallows harry potter 7
1,5,5,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",1876252,Goodreads,,harry potter and the prisoner of azkaban harry potter 3
2,2,5,"Harry Potter and the Order of the Phoenix (Harry Potter, #5)",1766895,Goodreads,,harry potter and the order of the phoenix harry potter 5
4,6,5,"Harry Potter and the Goblet of Fire (Harry Potter, #4)",1792561,Goodreads,,harry potter and the goblet of fire harry potter 4
5,7624,4,Lord of the Flies,1638289,Goodreads,,lord of the flies
6,15881,4,"Harry Potter and the Chamber of Secrets (Harry Potter, #2)",1821802,Goodreads,,harry potter and the chamber of secrets harry potter 2
7,5890,4,The Woman in White,94556,Goodreads,,the woman in white
8,256683,4,"City of Bones (The Mortal Instruments, #1)",1181693,Goodreads,,city of bones the mortal instruments 1
9,100915,4,"The Lion, the Witch, and the Wardrobe (Chronicles of Narnia, #1)",1575387,Goodreads,,the lion the witch and the wardrobe chronicles of narnia 1


In [None]:
popular_recs