# Books Recommendation System

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder,StandardScaler
import warnings
warnings.filterwarnings('ignore')
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re
from scipy.sparse import csr_matrix, hstack
from scipy.sparse import vstack
import faiss
import implicit
from implicit.als import AlternatingLeastSquares
from scipy.sparse import save_npz, load_npz

#**Data Cleaning**


In [2]:
#importing the files
Books=pd.read_excel('Books.xlsx')
Ratings=pd.read_excel('Ratings.xlsx')
Users=pd.read_excel('Users.xlsx')

In [3]:
Books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [4]:
Ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [5]:
Users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


###Duplicates

In [None]:
#This function helps to remove the duplicates and show you the duplicates
def duplicates_remover(data,column_name1,keeps,prints_duplicates):
  data_copy=data.copy()
  if prints_duplicates == True:
    if keeps == False:
      duplicates=data_copy[data_copy.duplicated(subset=[column_name1],keep=keeps)]
      if duplicates.empty:
        print('No duplicates were found')
      else:
        duplicate_data=pd.DataFrame(duplicates)
        print(duplicate_data.head())
        print(duplicate_data.shape)
    else:
      print('please set keeps to False then olny you can able to see the duplicates')
  elif prints_duplicates == False:
    if keeps in 'first':
      data_copy=data_copy.drop_duplicates(subset=[column_name1],keep=keeps)
    elif keeps in 'last':
      data_copy=data_copy.drop_duplicates(subset=[column_name1],keep=keeps)
    return data_copy

In [7]:
duplicates_remover(Books,'ISBN',False,True)

             ISBN                                         Book-Title  \
111653  486404242  War in Kind: And Other Poems (Dover Thrift Edi...   
111808  486404242  War in Kind: And Other Poems (Dover Thrift Edi...   

          Book-Author Year-Of-Publication           Publisher  \
111653  Stephen Crane                1998  Dover Publications   
111808  Stephen Crane                1998  Dover Publications   

                                              Image-URL-S  \
111653  http://images.amazon.com/images/P/0486404242.0...   
111808  http://images.amazon.com/images/P/0486404242.0...   

                                              Image-URL-M  \
111653  http://images.amazon.com/images/P/0486404242.0...   
111808  http://images.amazon.com/images/P/0486404242.0...   

                                              Image-URL-L  
111653  http://images.amazon.com/images/P/0486404242.0...  
111808  http://images.amazon.com/images/P/0486404242.0...  
(2, 8)


In [8]:
# Function call will gives the duplicates from the data set
duplicates_remover(Ratings,'ISBN',False,True)

   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726   155061224            5
2   276727   446520802            0
5   276733  2080674722            0
6   276736  3257224281            8
(953283, 3)




1. We are not going to remove the duplicates from Ratings dataset.
2. There is a chance of a single user may read different books and rated them due to that we are not going to remove them  






In [9]:
# Function call will gives the duplicates from the data set
duplicates_remover(Users,'User-ID',False,True)

No duplicates were found


In [10]:
#calling the function with parameters
#Function call will remove the duplicates from the data and return the data without duplicates
M_Books=duplicates_remover(Books,'ISBN','first',False)

###Missing Values

In [11]:
# This functions helps to fill the missing values using Random Forest Regressor
def filling_missing_values(data,column_name,replacement):
  data_copy=data.copy()
  data_copy1=data.copy()
  le=LabelEncoder()
  if column_name in data_copy.select_dtypes(include=['object']).columns:
    data_copy[column_name]=pd.to_numeric(data_copy[column_name],errors='coerce')
    data_copy1[column_name]=pd.to_numeric(data_copy1[column_name],errors='coerce')
  for col in data_copy.select_dtypes(include=['object']).columns:
    if col in ['Image-URL-S','Image-URL-M','Image-URL-L']:
      data_copy.drop(col,axis=1,inplace=True)
    else:
      data_copy[col]=data_copy[col].astype(str)
      data_copy[col]=le.fit_transform(data_copy[col])
  if column_name in data_copy.select_dtypes(include=['int64','float64','int32','float32']).columns:
    if replacement in 'zero':
      known_data=data_copy[data_copy[column_name]!=0]
      unknown_data=data_copy[data_copy[column_name]==0]
      if known_data.shape[0]>0 and unknown_data.shape[0]>0:
         model = RandomForestRegressor()
         known_data=known_data.dropna(subset=[column_name])
         model.fit(known_data.drop([column_name],axis=1), known_data[column_name])
         data_copy1.loc[data_copy1[column_name]== 0 ,column_name]=model.predict(unknown_data.drop([column_name],axis=1))
      else: print('check your dataset')
      return data_copy1
    elif replacement in 'NaN':
      known_data=data_copy[data_copy[column_name].notnull()]
      unknown_data=data_copy[data_copy[column_name].isnull()]
      if known_data.shape[0]>0 and unknown_data.shape[0]>0:
         model = RandomForestRegressor()
         known_data=known_data.dropna(subset=[column_name])
         model.fit(known_data.drop([column_name],axis=1),known_data[column_name])
         data_copy1.loc[data_copy1[column_name].isnull(),column_name] = model.predict(unknown_data.drop([column_name],axis=1))
      else: print('check your dataset')
      return data_copy1
    else: print("please enter either zero or NaN")
  else: print('please enter the numerical column')

In [12]:
# calling the function and stores the data in a new variable
Books_modified=filling_missing_values(M_Books,'Year-Of-Publication','zero')
# function call for Users data set
Users_modified=filling_missing_values(Users,'Age','NaN')

In [13]:
Books_modified.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271359 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ISBN                 271359 non-null  object 
 1   Book-Title           271359 non-null  object 
 2   Book-Author          271358 non-null  object 
 3   Year-Of-Publication  271356 non-null  float64
 4   Publisher            271357 non-null  object 
 5   Image-URL-S          271359 non-null  object 
 6   Image-URL-M          271359 non-null  object 
 7   Image-URL-L          271356 non-null  object 
dtypes: float64(1), object(7)
memory usage: 18.6+ MB



*   If we observe closely the Book-Author column and also the Publisher column still two values are missing and Year-of-Publication also having the same issue.


###Data Modification

In [14]:
# Books data set modification

In [15]:
# data modification
year_Books=Books_modified['Year-Of-Publication'].isnull()
incorrect_data=Books_modified[year_Books].copy()
print('show the data of Year-Of-Publication feature where there are nan')
incorrect_data.head()

show the data of Year-Of-Publication feature where there are nan


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",2003,,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,
221678,789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,


In [16]:
indices=incorrect_data.index
mislead_data=Books.loc[indices]
print('This show the original data where there might missing entery may happend')
mislead_data.head()

This show the original data where there might missing entery may happend


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",2003,Gallimard,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,
221678,789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,


In [17]:
#assigning the null values data set to a variables
Author_indices=Books_modified['Book-Author'].isnull()
incorrect_data1=Books_modified[Author_indices].copy()
Publisher_indices=Books_modified['Publisher'].isnull()
incorrect_data2=Books_modified[Publisher_indices].copy()

In [18]:
incorrect_data1_indices=incorrect_data1.index
print('This is the data where the Book-Author is missing')
incorrect_data1

This is the data where the Book-Author is missing


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995.0,Edinburgh Financial Publishing,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...


In [19]:
incorrect_data2_indices=incorrect_data2.index
print('This is the data where the Publisher is missing')
incorrect_data2

This is the data where the Publisher is missing


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002.0,,http://images.amazon.com/images/P/193169656X.0...,http://images.amazon.com/images/P/193169656X.0...,http://images.amazon.com/images/P/193169656X.0...
129037,1931696993,Finders Keepers,Linnea Sinclair,2001.0,,http://images.amazon.com/images/P/1931696993.0...,http://images.amazon.com/images/P/1931696993.0...,http://images.amazon.com/images/P/1931696993.0...


In [20]:
# There are some values in the Publisher column which are in integer format
integer_values_index= Books_modified[Books_modified['Publisher'].apply(lambda x: isinstance(x, int))].index.tolist()
integer_data=Books_modified.loc[integer_values_index]
integer_data

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
87898,967389305,Restoring Intimacy: The Patient's Guide to Mai...,Drew Pinsky,1999.0,3,http://images.amazon.com/images/P/0967389305.0...,http://images.amazon.com/images/P/0967389305.0...,http://images.amazon.com/images/P/0967389305.0...
183717,2264034173,Un troublant retour,Patricia Wentworth,2002.0,37547,http://images.amazon.com/images/P/2264034173.0...,http://images.amazon.com/images/P/2264034173.0...,http://images.amazon.com/images/P/2264034173.0...
258200,2264033932,David Bowie,Jร?ยฉrร?ยดme Soligny,2002.0,37547,http://images.amazon.com/images/P/2264033932.0...,http://images.amazon.com/images/P/2264033932.0...,http://images.amazon.com/images/P/2264033932.0...


In [21]:
invalid_year=Books_modified['Year-Of-Publication']<1800
invalid_years=Books_modified[invalid_year].copy()
invalid_years

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
227531,9643112136,Dalan-i bihisht (Dastan-i Irani),Nazi Safavi,1378.0,Intisharat-i Quqnus,http://images.amazon.com/images/P/9643112136.0...,http://images.amazon.com/images/P/9643112136.0...,http://images.amazon.com/images/P/9643112136.0...
253750,964442011X,Tasht-i khun,Ismaยฐil Fasih,1376.0,Nashr-i Alburz,http://images.amazon.com/images/P/964442011X.0...,http://images.amazon.com/images/P/964442011X.0...,http://images.amazon.com/images/P/964442011X.0...


In [22]:
invalid_years1=Books_modified[Books_modified['Year-Of-Publication']>2024]
invalid_years11=Books_modified.loc[invalid_years1.index]
invalid_years11

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
37487,671746103,MY TEACHER FRIED MY BRAINS (RACK SIZE) (MY TEA...,Coville,2030.0,Aladdin,http://images.amazon.com/images/P/0671746103.0...,http://images.amazon.com/images/P/0671746103.0...,http://images.amazon.com/images/P/0671746103.0...
55676,671791990,MY TEACHER FLUNKED THE PLANET (RACK SIZE) (MY ...,Bruce Coville,2030.0,Aladdin,http://images.amazon.com/images/P/0671791990.0...,http://images.amazon.com/images/P/0671791990.0...,http://images.amazon.com/images/P/0671791990.0...
78168,870449842,Crossing America,National Geographic Society,2030.0,National Geographic,http://images.amazon.com/images/P/0870449842.0...,http://images.amazon.com/images/P/0870449842.0...,http://images.amazon.com/images/P/0870449842.0...
80264,140301690,Alice's Adventures in Wonderland and Through t...,Lewis Carroll,2050.0,Puffin Books,http://images.amazon.com/images/P/0140301690.0...,http://images.amazon.com/images/P/0140301690.0...,http://images.amazon.com/images/P/0140301690.0...
97826,140201092,Outline of European Architecture (Pelican S.),Nikolaus Pevsner,2050.0,Penguin USA,http://images.amazon.com/images/P/0140201092.0...,http://images.amazon.com/images/P/0140201092.0...,http://images.amazon.com/images/P/0140201092.0...
116053,394701658,Three Plays of Eugene Oneill,Eugene O'Neill,2038.0,Vintage Books USA,http://images.amazon.com/images/P/0394701658.0...,http://images.amazon.com/images/P/0394701658.0...,http://images.amazon.com/images/P/0394701658.0...
118294,3442436893,Das groร?ย?e Bร?ยถse- Mร?ยคdchen- Lesebuch.,Kathy Lette,2026.0,Goldmann,http://images.amazon.com/images/P/3442436893.0...,http://images.amazon.com/images/P/3442436893.0...,http://images.amazon.com/images/P/3442436893.0...
192993,870446924,"Field Guide to the Birds of North America, 3rd...",National Geographic Society,2030.0,National Geographic,http://images.amazon.com/images/P/0870446924.0...,http://images.amazon.com/images/P/0870446924.0...,http://images.amazon.com/images/P/0870446924.0...
228173,671266500,FOREST PEOPLE (Touchstone Books (Hardcover)),Colin M. Turnbull,2030.0,Simon &amp; Schuster,http://images.amazon.com/images/P/0671266500.0...,http://images.amazon.com/images/P/0671266500.0...,http://images.amazon.com/images/P/0671266500.0...
240169,684718022,In Our Time: Stories (Scribner Classic),Ernest Hemingway,2030.0,Collier Books,http://images.amazon.com/images/P/0684718022.0...,http://images.amazon.com/images/P/0684718022.0...,http://images.amazon.com/images/P/0684718022.0...



*   After comparing with the original data, we can see that the data entry is just misled.
*   See in the place of Book-Author they just enterd the Year of publication in the place of they entered the publisher similarly other columns too.
*   There are alot of nan values and in publisher column also the data is miss entered
*   So lets just enter the data manually.


In [23]:
# Manually entering the correct data to the miss leaded rows
Books_modified.loc[[209538,220731,221678],['Book-Author','Year-Of-Publication','Publisher','Image-URL-S','Image-URL-M','Image-URL-L']]=\
 [['Michael Teitelbaum',2000,'DK Publishing Inc','http://images.amazon.com/images/P/078946697X.01.THUMBZZZ.jpg','http://images.amazon.com/images/P/078946697X.01.MZZZZZZZ.jpg','http://images.amazon.com/images/P/078946697X.01.LZZZZZZZ.jpg'],
  ['Jean-Marie Gustave Le Clézio',2002,'Gallimard	','http://images.amazon.com/images/P/2070426769.01.THUMBZZZ.jpg','http://images.amazon.com/images/P/2070426769.01.MZZZZZZZ.jpg','http://images.amazon.com/images/P/2070426769.01.LZZZZZZZ.jpg'],
  ['James Buckley Jr',2000,'DK Publishing Inc','http://images.amazon.com/images/P/0789466953.01.THUMBZZZ.jpg','http://images.amazon.com/images/P/0789466953.01.MZZZZZZZ.jpg','http://images.amazon.com/images/P/0789466953.01.LZZZZZZZ.jpg']]
# Manually entering the correct data to the author where there are nan
Books_modified.loc[[118033,187689],'Book-Author']=[['Dorling Kindersley'],['Sophie Pyott (Editor)']]
# Manually entering the correct data to the Publisher where there are nan
Books_modified.loc[[128890,129037],'Publisher']=[['NovelBooks, Inc'],['NovelBooks, Inc']]
# entering the data using ffill method to the Publisher where there are integer values
Books_modified.loc[integer_values_index,'Publisher']=pd.NA
Books_modified['Publisher'].fillna(method='ffill',inplace=True)
# invalid years are replaced with the real years
Books_modified.loc[[227531,253750,260974,255409,240169,228173,192993,118294,116053,97826,80264,78168,55676,37487],'Year-Of-Publication']=[1999,1997,1991,1937,1925,1961,1999,2006,1952,1942,1977,1994,1992,1991]

In [24]:
Books_modified['Year-Of-Publication']=Books_modified['Year-Of-Publication'].astype(int)

In [25]:
Books_modified.loc[indices]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",Michael Teitelbaum,2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",Jean-Marie Gustave Le Clézio,2002,Gallimard\t,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...
221678,789466953,"DK Readers: Creating the X-Men, How Comic Book...",James Buckley Jr,2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...


In [26]:
Books_modified.loc[incorrect_data1_indices]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,Sophie Pyott (Editor),1995,Edinburgh Financial Publishing,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...


In [27]:
Books_modified.loc[incorrect_data2_indices]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002,"NovelBooks, Inc",http://images.amazon.com/images/P/193169656X.0...,http://images.amazon.com/images/P/193169656X.0...,http://images.amazon.com/images/P/193169656X.0...
129037,1931696993,Finders Keepers,Linnea Sinclair,2001,"NovelBooks, Inc",http://images.amazon.com/images/P/1931696993.0...,http://images.amazon.com/images/P/1931696993.0...,http://images.amazon.com/images/P/1931696993.0...


In [28]:
Books_modified.loc[integer_values_index]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
87898,967389305,Restoring Intimacy: The Patient's Guide to Mai...,Drew Pinsky,1999,MacAdam/Cage Publishing,http://images.amazon.com/images/P/0967389305.0...,http://images.amazon.com/images/P/0967389305.0...,http://images.amazon.com/images/P/0967389305.0...
183717,2264034173,Un troublant retour,Patricia Wentworth,2002,Scholastic,http://images.amazon.com/images/P/2264034173.0...,http://images.amazon.com/images/P/2264034173.0...,http://images.amazon.com/images/P/2264034173.0...
258200,2264033932,David Bowie,Jร?ยฉrร?ยดme Soligny,2002,Silhouette,http://images.amazon.com/images/P/2264033932.0...,http://images.amazon.com/images/P/2264033932.0...,http://images.amazon.com/images/P/2264033932.0...


In [29]:
Books_modified.loc[invalid_years.index]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
227531,9643112136,Dalan-i bihisht (Dastan-i Irani),Nazi Safavi,1999,Intisharat-i Quqnus,http://images.amazon.com/images/P/9643112136.0...,http://images.amazon.com/images/P/9643112136.0...,http://images.amazon.com/images/P/9643112136.0...
253750,964442011X,Tasht-i khun,Ismaยฐil Fasih,1997,Nashr-i Alburz,http://images.amazon.com/images/P/964442011X.0...,http://images.amazon.com/images/P/964442011X.0...,http://images.amazon.com/images/P/964442011X.0...


In [30]:
Books_modified.loc[invalid_years11.index]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
37487,671746103,MY TEACHER FRIED MY BRAINS (RACK SIZE) (MY TEA...,Coville,1991,Aladdin,http://images.amazon.com/images/P/0671746103.0...,http://images.amazon.com/images/P/0671746103.0...,http://images.amazon.com/images/P/0671746103.0...
55676,671791990,MY TEACHER FLUNKED THE PLANET (RACK SIZE) (MY ...,Bruce Coville,1992,Aladdin,http://images.amazon.com/images/P/0671791990.0...,http://images.amazon.com/images/P/0671791990.0...,http://images.amazon.com/images/P/0671791990.0...
78168,870449842,Crossing America,National Geographic Society,1994,National Geographic,http://images.amazon.com/images/P/0870449842.0...,http://images.amazon.com/images/P/0870449842.0...,http://images.amazon.com/images/P/0870449842.0...
80264,140301690,Alice's Adventures in Wonderland and Through t...,Lewis Carroll,1977,Puffin Books,http://images.amazon.com/images/P/0140301690.0...,http://images.amazon.com/images/P/0140301690.0...,http://images.amazon.com/images/P/0140301690.0...
97826,140201092,Outline of European Architecture (Pelican S.),Nikolaus Pevsner,1942,Penguin USA,http://images.amazon.com/images/P/0140201092.0...,http://images.amazon.com/images/P/0140201092.0...,http://images.amazon.com/images/P/0140201092.0...
116053,394701658,Three Plays of Eugene Oneill,Eugene O'Neill,1952,Vintage Books USA,http://images.amazon.com/images/P/0394701658.0...,http://images.amazon.com/images/P/0394701658.0...,http://images.amazon.com/images/P/0394701658.0...
118294,3442436893,Das groร?ย?e Bร?ยถse- Mร?ยคdchen- Lesebuch.,Kathy Lette,2006,Goldmann,http://images.amazon.com/images/P/3442436893.0...,http://images.amazon.com/images/P/3442436893.0...,http://images.amazon.com/images/P/3442436893.0...
192993,870446924,"Field Guide to the Birds of North America, 3rd...",National Geographic Society,1999,National Geographic,http://images.amazon.com/images/P/0870446924.0...,http://images.amazon.com/images/P/0870446924.0...,http://images.amazon.com/images/P/0870446924.0...
228173,671266500,FOREST PEOPLE (Touchstone Books (Hardcover)),Colin M. Turnbull,1961,Simon &amp; Schuster,http://images.amazon.com/images/P/0671266500.0...,http://images.amazon.com/images/P/0671266500.0...,http://images.amazon.com/images/P/0671266500.0...
240169,684718022,In Our Time: Stories (Scribner Classic),Ernest Hemingway,1925,Collier Books,http://images.amazon.com/images/P/0684718022.0...,http://images.amazon.com/images/P/0684718022.0...,http://images.amazon.com/images/P/0684718022.0...


In [31]:
# so now i am just creating a copy of the modified dataset
Books_modified_copy=Books_modified.copy()

In [32]:
# here i am importing the extra stop words and also i want to remove the data like 3rd 4th etc
import nltk
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
extra_stopwords = set(word.lower() for word in stopwords.words('english'))
spacy_stopwords = set(nlp.Defaults.stop_words)
all_stopwords = extra_stopwords.union(spacy_stopwords)
pattern = re.compile(r'^\d+(st|nd|rd|th)?$', re.IGNORECASE)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\soni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
# This function removes the stop words
def stop_words_fun(texts,batch_size=5000):
  if isinstance(texts, str):
    texts = [texts]
  cleaned_texts = []
  for doc in nlp.pipe([str(text) for text in texts], batch_size=batch_size):
      cleaned_text = [
          token.text for token in doc
          if token.text.lower() not in all_stopwords
          and not token.is_punct
          and not token.is_digit
          and not pattern.match(token.text.lower())
        ]
      cleaned_texts.append(" ".join(cleaned_text))
  return cleaned_texts if len(cleaned_texts) > 1 else cleaned_texts[0]

In [34]:
print('Before removing the stop words:\n',Books_modified['Book-Title'].iloc[3])
print('After removing the stop words:\n',stop_words_fun(Books_modified['Book-Title'].iloc[3]))

Before removing the stop words:
 Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It
After removing the stop words:
 Flu Story Great Influenza Pandemic Search Virus Caused




*   imported the stop words and also there are alot of patterns like this 4th so i dont want them also. i created a function  that will remove all the stop words from the data set


In [35]:
# make sure that the data is in string format
Books_modified_copy['Book-Title'] = Books_modified_copy['Book-Title'].astype(str)
Books_modified_copy['Book-Author'] = Books_modified_copy['Book-Author'].astype(str)
Books_modified_copy['Publisher'] = Books_modified_copy['Publisher'].astype(str)

In [36]:
# calling the function to remove the stop words and storing the data in the same column
Books_modified_copy['Book-Title']=stop_words_fun(Books_modified_copy['Book-Title'])
Books_modified_copy['Book-Author']=stop_words_fun(Books_modified_copy['Book-Author'])
Books_modified_copy['Publisher']=stop_words_fun(Books_modified_copy['Publisher'])

In [37]:
# Lets create a function which categorize the Year of Publication feature to a different categorizes
def categeries_year(year):
    if year < 1900:
        return 'eighteen'
    elif year >=1900 and year <=1925:
        return 'nineteenone'
    elif year >1925 and year <=1950:
        return 'nineteentwo'
    elif year >1950 and year <=1975:
        return 'nineteenthree'
    elif year >1975 and year <2000:
        return 'nineteenfour'
    else:
        return 'twentiesone'



*   Created a function that will return the year data into the certain names so that we can use this data in content based filter.

*   For every 25 year i categeried them i will helps later.


In [38]:
Books_modified_copy['Year-Of-Publication-categorizes']=Books_modified_copy['Year-Of-Publication'].apply(categeries_year)

In [39]:
# This data set contains the stop words so that we can retrive this data set when we are recommending the books based on the isbn code
Books_modified.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [40]:
#User dataset modification

In [41]:
Users_modified.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       278858 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [42]:
Users_modified.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",33.61
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",22.85
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",37.17


In [43]:
# this will split the location into city,state and country
location=[loc.split(',')[:3] for loc in Users_modified['Location'].tolist()]
location_data=pd.DataFrame(location,columns=['City','State_or_Region','Country'])
Users_modified1=pd.concat([Users_modified,location_data],axis=1)
Users_modified1.drop('Location',axis=1,inplace=True)
Users_modified1=Users_modified1.reindex(columns=['User-ID','City','State_or_Region','Country','Age'])

In [44]:
# this will replace the n/a,NA and space with NA
Users_modified1['State_or_Region'].replace(['n/a','NA',' '],pd.NA,inplace=True)
Users_modified1['Country'].replace(['n/a','NA',' '],pd.NA,inplace=True)
Users_modified1['City'].replace(['n/a','NA',' '],pd.NA,inplace=True)
print(Users_modified1.isna().sum())

User-ID               0
City                439
State_or_Region    3620
Country              16
Age                   0
dtype: int64


In [45]:
# this city feature should not contain the null values so i am filling the null values with ffill method
Users_modified1['City'].fillna(method='ffill', inplace=True)

In [46]:
# converting to string type
Users_modified1['State_or_Region'] = Users_modified1['State_or_Region'].astype(str)
Users_modified1['Country']= Users_modified1['Country'].astype(str)

In [47]:
# using groupby and mapping method to fill the missing values
city_to_state_or_region=Users_modified1.groupby('City')['State_or_Region'].agg(lambda x:x.mode()[0])
city_to_country=Users_modified1.groupby('City')['Country'].agg(lambda x:x.mode()[0])
Users_modified1['State_or_Region'].fillna(Users_modified1['City'].map(city_to_state_or_region),inplace=True)
Users_modified1['Country'].fillna(Users_modified1['City'].map(city_to_country),inplace=True)

# replacing the <NA> values with pd.NA
Users_modified1['Country'].replace('<NA>',pd.NA,inplace=True)
Users_modified1['State_or_Region'].replace('<NA>',pd.NA,inplace=True)

# using ffill method to fill the missing values if still there are any
Users_modified1['State_or_Region'].fillna(method='ffill', inplace=True)
Users_modified1['Country'].fillna(method='ffill', inplace=True)

In [48]:
# Converting the Age column to int datatype
Users_modified1['Age']=Users_modified1['Age'].astype(int)

In [49]:
# this function will categorize the age into different categories
def categeries_ages(age):
    if age < 20:
        return 'Teenager'
    elif age >=20 and age <=36:
        return 'Young'
    elif age >36 and age <=55:
        return 'Middle'
    else:
        return 'senior'

In [50]:
# calling the function and storing the data in a new column
Users_modified1['Age_category']=Users_modified1['Age'].apply(categeries_ages)

In [51]:
Users_modified1.head()

Unnamed: 0,User-ID,City,State_or_Region,Country,Age,Age_category
0,1,nyc,new york,usa,33,Young
1,2,stockton,california,usa,18,Teenager
2,3,moscow,yukon territory,russia,22,Young
3,4,porto,v.n.gaia,portugal,17,Teenager
4,5,farnborough,hants,united kingdom,37,Middle




*   For user data i just supperated the location into three different parts it may helps in content based filters.
*   I just wrote a function that categeries the ages in to four stages.



In [52]:
#Rating data set modification

In [53]:
# lets just check the duplicates with two columns
duplicates=Ratings[Ratings.duplicated(subset=['User-ID','ISBN'],keep=False)]
print('Before droping the duplicates:',Ratings.shape)
print('duplicates shape:',duplicates.shape)
# drop the duplicates
Ratings=Ratings.drop_duplicates(subset=['User-ID','ISBN'],keep='first')
print('After droping the duplicates:',Ratings.shape)

Before droping the duplicates: (1149780, 3)
duplicates shape: (79, 3)
After droping the duplicates: (1149735, 3)




*   After few analysis what i found that there are some user that are repeated i mean duplicates its like the user and isbn combinations were repeated few time. so i am removing the only users and at the same time isbn also.



In [54]:
Reindex_Ratings=Ratings.reset_index()

In [55]:
Reindex_Ratings.drop(columns='index')

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6
...,...,...,...
1149730,276704,1563526298,9
1149731,276706,679447156,0
1149732,276709,515107662,10
1149733,276721,590442449,10


In [56]:
# Using joins we will merge the all three dataset
# This data set contains all the columns without null values so we this data set more than the other dataset
Inner_join=pd.merge(Ratings,Books_modified_copy,on='ISBN',how='inner')
Inner_join_data=pd.merge(Inner_join,Users_modified1,on='User-ID',how='inner')


In [57]:
# Drop the unnecessary columns
Required_data=Inner_join_data.drop(['Image-URL-S','Image-URL-M','Image-URL-L'],axis=1)
# reindex the columns
Required_data=Required_data.reindex(columns=['ISBN','Book-Title','Book-Author','Year-Of-Publication','Publisher','Year-Of-Publication-categorizes','Book-Rating','User-ID','City','State_or_Region','Country','Age','Age_category'])

#**Exploratory Data Analysis**

In [60]:
# This function helps to visualize the data and understanding the data more
def data_visualization(data,X_column_name,Y_column_name,Z_column_name,graph_is_about):
  if graph_is_about == 'ratings':
    X=data[X_column_name].unique()
    Y=data[X_column_name].value_counts()
    title='Number of user and their ratings on books'
    y_axis_title='Num_of_User'
    x_axis_title='Ratings'
    fig=go.Figure()
    fig.add_trace(go.Bar(x=X,y=Y.values,marker_color='skyblue'))
    fig.update_layout(title_text=title, xaxis_title=x_axis_title,yaxis_title=y_axis_title)
    fig.show()
  elif graph_is_about == 'Hyper_active_users':
    Top_ratings= data[data[X_column_name].isin([6,7,8,9,10])]
    count=Top_ratings[Y_column_name].value_counts()
    indices=count[count>=1000]
    user_details=indices.index.tolist()
    active_users = Top_ratings[Top_ratings[Y_column_name].isin(user_details)]
    fig=px.scatter_3d(active_users,x='Year-Of-Publication', y=Y_column_name, z=X_column_name, color= 'Age', title= 'Hyper active users', opacity=0.8)
    fig.show()


In [None]:
# if the renering didnt supported then used this code to see in default browser 
'''
import plotly.io as pio
pio.renderers.default = 'browser'  
'''

In [63]:
data_visualization(Required_data,'Book-Rating','User-ID','ISBN','ratings')

In [64]:
data_visualization(Required_data,'Book-Rating','User-ID','c','Hyper_active_users')

**Analysis**

*   After understanding the bar graph we can conclude that most of the users didn't gave the ratings to books.
*   The data is total skewed towards left side which means its positively skewed.

*   Which is a bad sign from business point of view.


#**Content Based Filter**

In [14]:
import pickle

# Save mapping to a pickle file
def save_mapping_to_pickle(mapping, filename):
    with open(filename, 'wb') as f:
        pickle.dump(mapping, f)

# for loading the pickel file
def load_user_mapping(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)


In [None]:
# sample of a single string how does Tfidfvectorizer work
T_vectorizer = TfidfVectorizer(
    max_features=3000,
    smooth_idf=True,
    sublinear_tf=True,
)
word_count = T_vectorizer.fit_transform([
    "god kitchen wife",
    "kitchen god table",
    "wife god love"
])
print(word_count)
sample = pd.DataFrame.sparse.from_spmatrix(word_count, columns=T_vectorizer.get_feature_names_out())
sample

  (0, 4)	0.6198053799406072
  (0, 1)	0.6198053799406072
  (0, 0)	0.48133416873660545
  (1, 3)	0.7203334490549893
  (1, 1)	0.5478321549274363
  (1, 0)	0.4254405389711991
  (2, 2)	0.7203334490549893
  (2, 4)	0.5478321549274363
  (2, 0)	0.4254405389711991


Unnamed: 0,god,kitchen,love,table,wife
0,0.481334,0.619805,0.0,0.0,0.619805
1,0.425441,0.547832,0.0,0.720333,0.0
2,0.425441,0.0,0.720333,0.0,0.547832




*   This is the example of how tfidf vectorizer works.




In [32]:
# predifining the each tfidf vectorizer so that that we dont mix the words
author_vectorizer = TfidfVectorizer(max_features=600, smooth_idf=True, sublinear_tf=True,ngram_range=(1, 2), min_df=2, max_df=0.9)
publisher_vectorizer = TfidfVectorizer(max_features=394, smooth_idf=True, sublinear_tf=True,ngram_range=(1, 2), min_df=2, max_df=0.9)
year_Of_publication_categorizes_vectorizer = TfidfVectorizer(max_features=10, smooth_idf=True, sublinear_tf=True, ngram_range=(1, 2), min_df=2, max_df=0.9)

#fit the data with supperate tfidf vectorizer
author_matrix=author_vectorizer.fit_transform(Books_modified_copy['Book-Author'])
publisher_matrix=publisher_vectorizer.fit_transform(Books_modified_copy['Publisher'])
year_of_publication_matrix=year_Of_publication_categorizes_vectorizer.fit_transform(Books_modified_copy['Year-Of-Publication-categorizes'])

# so we are stacking all the matrix in a single var
Content_based_filter_sparse=hstack([author_matrix,publisher_matrix,year_of_publication_matrix])

In [None]:
# saving the dataset
''' 
save_npz("D:\\work_space\\projects\\recommendation_system\\streamlit_app\\Content_based_filter_sparse",Content_based_filter_sparse)

'''

' \nsave_npz("D:\\work_space\\projects\\recommendation_system\\streamlit_app\\Content_based_filter_sparse",Content_based_filter_sparse)\n\n'

In [34]:
Content_based_filter_sparse=load_npz("D:\\work_space\\projects\\recommendation_system\\streamlit_app\\Content_based_filter_sparse.npz")

In [None]:
#content based recommendation system
def index_IVF_PQ(tfidf_data):
    # Dimension of the data set
    d=tfidf_data.shape[1]
    # Number of sub-vectors
    # NOTE: d % m must always come zero
    m=20
    # Nmumber of clusters
    clusters=2048
    #bits per sub-vector
    n_bits=8
    # Its a technique used to find the nearest cluster using euclidean distance between the query vector and all the vectors in the cluster
    # It also stores the all the clusters centroid to quickly access the nearest cluster
    quantizer=faiss.IndexFlatL2(d)
    # Its a vector search index that organizes and compresses data to perform fast approximate nearest neighbor (ANN) searches.
    index=faiss.IndexIVFPQ(quantizer,d,clusters,m,n_bits)
    # Collecting the sample data to train the index
    # If you get memoryError then us this two lines and comment the present train_data line
    #sample_data=min(100000,tfidf_data.shape[0])
    #train_data=tfidf_data[:sample_data].toarray().astype('float32')
    # Training data
    train_data=tfidf_data.toarray().astype('float32')
    # Training the index
    index.train(train_data)
    # Now we added the oringinal data to the index
    batch_size=1000
    for i in range(0,tfidf_data.shape[0],batch_size):
        # adding the data by chuncks to the index
        # we adding the rows data to the index the data is in pd dataframe format so we need to covert it to numpy
        single_batch=(tfidf_data[ i : i+batch_size].toarray().astype('float32'))
        index.add(single_batch)
    print(index.is_trained)
    return index



*   This fucntion will train the index ivf pq with the non sparse data.




In [52]:
# This function helps to search the index and return the similar books
def search_index(user_query,num_of_top_books,index,nprobe_value,books_info_data):
    query_vector=user_query.toarray().astype('float32')
    index.nprobe=nprobe_value
    # Searching for the similar vector
    Distances, Indices =index.search(query_vector,num_of_top_books)
    print('Retrived rows :',Indices)
    # Extracting the original books names and etc from the original data using the index
    recommended_books=[books_info_data.iloc[idx] for idx in Indices.flatten() if 0<= idx <len(books_info_data)]
    Recommended_books=pd.DataFrame(recommended_books)
    print('Distances between the points::',Distances)
    return Recommended_books





*   This function will search the nearst point to our query vector.




In [53]:
# This function will take the users requirements
def user_specification():
    # Asking the user to enter the specifications
    book_author=input('Enter the Book Author( or press enter to skip):').strip()
    publisher=input('Enter the Publisher( or press enter to skip):').strip()
    year_of_publication=input('Enter the year_of_publication ( or press enter to skip):').strip()

    if year_of_publication:
        try:
            year_of_publication=int(year_of_publication)
            #function call
            year_of_publication1=categeries_year(year_of_publication)
        except ValueError:
            print('Invalid input: please enter the year of publication in numbers')
            year_of_publication=None
    else:
        # If user doesn't want to give the year
        year_of_publication=None
    if book_author:
        author_matrix=author_vectorizer.transform([book_author])
        print('Selected Author:',book_author)
    else:
        author_matrix=csr_matrix((1,author_vectorizer.get_feature_names_out().shape[0]))
        print('Selected Author: None')
    if publisher:
        publisher_matrix=publisher_vectorizer.transform([publisher])
        print('Selected Publisher:',publisher)
    else:
        publisher_matrix=csr_matrix((1,publisher_vectorizer.get_feature_names_out().shape[0]))
        print('Selected Publisher: None')
    if year_of_publication:
        year_matrix=year_Of_publication_categorizes_vectorizer.transform([year_of_publication1])
        print('Selected Year of Publication:',year_of_publication)
    else:
        year_matrix=csr_matrix((1,year_Of_publication_categorizes_vectorizer.get_feature_names_out().shape[0]))
        print('Selected Year of Publication: None')
    user_query_matrix=hstack([author_matrix,publisher_matrix,year_matrix])
    # Creating the dict
    return user_query_matrix



*   This function will take the user specification



In [54]:
# This function call will train the index
Index_data=index_IVF_PQ(Content_based_filter_sparse)

True


In [55]:
# This function call will collect the data from user
query=user_specification()

Selected Author: None
Selected Publisher: Sun Books
Selected Year of Publication: None


In [56]:
print("Query vector shape:", query.shape)
print("FAISS index dimension:", Index_data.d)

Query vector shape: (1, 1000)
FAISS index dimension: 1000


In [57]:
# This function suggests the books
search_index(query,5,Index_data,30,Books_modified)

Retrived rows : [[ 164  306  847  924 1049]]
Distances between the points:: [[1.0000908 1.0000908 1.0000908 1.0000908 1.0000908]]


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
164,039575514X,My Antonia,Willa Cather,1995,Mariner Books,http://images.amazon.com/images/P/039575514X.0...,http://images.amazon.com/images/P/039575514X.0...,http://images.amazon.com/images/P/039575514X.0...
306,1573225487,The Romance Reader,Pearl Abraham,1996,Riverhead Books,http://images.amazon.com/images/P/1573225487.0...,http://images.amazon.com/images/P/1573225487.0...,http://images.amazon.com/images/P/1573225487.0...
847,1563411148,The Second Coming of Curly Red,Jody Seay,1999,Firebrand Books,http://images.amazon.com/images/P/1563411148.0...,http://images.amazon.com/images/P/1563411148.0...,http://images.amazon.com/images/P/1563411148.0...
924,753804700,Reader,Bernhard Schlink,1999,Phoenix Books,http://images.amazon.com/images/P/0753804700.0...,http://images.amazon.com/images/P/0753804700.0...,http://images.amazon.com/images/P/0753804700.0...
1049,805062971,Fight Club,Chuck Palahniuk,1999,Owl Books,http://images.amazon.com/images/P/0805062971.0...,http://images.amazon.com/images/P/0805062971.0...,http://images.amazon.com/images/P/0805062971.0...




*   This are the recommended books based on the user specification that we passed.




#**Collaberative Filter**

In [None]:
 # Collaberative Filter

In [None]:
# This function will convert the raw data to a pivot formate and then the pivot data will be converted to non sparse data
def pivot_matrix_converter(data,chunk_size,user_rows,isbn_col,values_col):
    # chunks size
    Chunk_size=chunk_size
    chunk_pivot_list=[]
    unique_data=data['ISBN'].unique()
    item_id_mapping={isbn:idx for idx , isbn in enumerate(unique_data)}
    user_id_mapping={}
    row_offset=0
    # selects the data on the size of chunk and incremments the i value each time it iterates
    for i in range(0, len(data),Chunk_size):
        #taking the data from dataset
        chunk_data=data.iloc[i:i+Chunk_size]
        #converting the data to pivot formate
        chunk_matrix=chunk_data.pivot(index=user_rows, columns=isbn_col, values=values_col)
        # we are doing this because of when we stack the sparse data then it will raise an like columns not matched, because each iteration i am passing new chuncks, so the isbn code may changes and it wont be same as previous chunk
        # so i am reindex with unique vaues of isbn so next chunk data ISBN code will be existed in the chunk_matrix. next time the data will directly enters to that particular code.
        chunk_matrix=chunk_matrix.reindex(columns=unique_data,fill_value=0)
        # Update user ID mapping
        for idx, user_id in enumerate(chunk_matrix.index):
            if user_id not in user_id_mapping:
                user_id_mapping[user_id] = row_offset + idx
        # converting pivot formate to sparse matrix
        chunk_sparse=csr_matrix(chunk_matrix.fillna(0).values)
        # appending the each chunk sparse data to chunk_pivot_list
        chunk_pivot_list.append(chunk_sparse)
    # vartically stacking the chunk_pivot_list
    final_sparse_data=vstack(chunk_pivot_list)
    return final_sparse_data,user_id_mapping,item_id_mapping




*   This function will converts that dataset into pivot table and also to reduce the memory space we are passing the data in to chunks formate.


*   The pivote table will again converts non sparse dataset. In non sparse there will be no zero values this prevents the memory error

In [None]:
# before running the content based filter make sure you ran this this
#Content_based_filter_sparse=load_npz("D:\\work_space\\projects\\recommendation_system\\streamlit_app\\\\Content_based_filter_sparse.npz")

In [None]:
Ratings_sparse_data,user_id_mapping,item_id_mapping= pivot_matrix_converter(Reindex_Ratings,10000,'User-ID','ISBN','Book-Rating')

In [None]:
#saving userid and itemid index
'''   

make sure you saved this pkl file in the same directory where you run the streamlit app. this is the path where i saved the file  
D:\\work_space\\projects\\recommendation_system\\streamlit_app

you must run this code before running the streamlit app otherwise it will raise an error like "user_id_mapping not found" or "item_id_mapping not found"

'''
'''

# same goes for this files also 
save_mapping_to_pickle(user_id_mapping, "user_id_mapping.pkl")
save_mapping_to_pickle(item_id_mapping,"item_id_mapping.pkl")
save_mapping_to_pickle(Books_modified_copy,"books_no_stopwords_modified.pkl")
save_mapping_to_pickle(Books_modified,"books_modified_original.pkl")
save_mapping_to_pickle(Users_modified1,"user_modified_details.pkl")
save_mapping_to_pickle(Reindex_Ratings,"rating_modified.pkl")
save_npz("D:\\work_space\\projects\\recommendation_system\\streamlit_app\\ratings_sparse_matrix",Ratings_sparse_data)
'''

In [None]:
# Load the matrix later
Ratings_sparse_data=load_npz("D:\\work_space\\projects\\recommendation_system\\streamlit_app\\ratings_sparse_matrix.npz")

In [None]:
user_id_mapping=load_user_mapping("D:\\work_space\\projects\\recommendation_system\\streamlit_app\\user_id_mapping.pkl")
item_id_mapping=load_user_mapping("D:\\work_space\\projects\\recommendation_system\\streamlit_app\\item_id_mapping.pkl")
Reindex_Ratings=load_user_mapping("D:\\work_space\\projects\\recommendation_system\\streamlit_app\\rating_modified.pkl")
Books_modified=load_user_mapping("D:\\work_space\\projects\\recommendation_system\\streamlit_app\\books_modified_original.pkl")

In [None]:
# This function is to train the model with hyper parameters
def model_train(non_sparse_matrix,factors,regularization,iteration):
    model=AlternatingLeastSquares(factors=factors,regularization=regularization,iterations=iteration)
    model.fit(non_sparse_matrix.T)
    return model



*   This helps to train the model with certain hyper parameters



In [None]:
def map_user_id_to_index(user_id, user_id_mapping):
    """Map a raw user ID to the matrix index."""
    return user_id_mapping.get(user_id, None)



*   Here we are mapping the raw user id from the user id mapping which contains non sparse data rows and this will take the index form userid mapping which is matched with the raw user id.



In [None]:
def calculate_biases(actual_ratings,user_index):
    # Global biases
    global_bias=actual_ratings.sum() / actual_ratings.nnz # Number of non zero elements

    # User biases
    user_sum=actual_ratings[user_index, :].sum()
    user_nnz_count=(actual_ratings[user_index, :] != 0).sum()  # It create a boolen array and .sum() will count the true thats how we are going to know that how item does user rated
    user_bias=(user_sum / user_nnz_count) - global_bias if user_nnz_count> 0 else 0

    # Item biases
    item_sum = actual_ratings[user_index, :].toarray().flatten()
    item_nnz_counts = (actual_ratings[user_index, :] != 0).toarray().flatten()
    item_bias = np.zeros(actual_ratings.shape[1])
    item_bias[item_nnz_counts > 0] = (item_sum[item_nnz_counts > 0] / item_nnz_counts[item_nnz_counts > 0]) - global_bias
    return global_bias ,user_bias, item_bias





*   To get more accuracy we calculating the biases and adds them with the predicted values.



In [None]:
def predict_ratings(user_factor,item_factors,global_bias,user_bias,item_bias):

    prediction_matrix = user_factor @ item_factors.T
    if user_bias==0 and np.all(item_bias == 0):
        return global_bias+prediction_matrix
    else:
        user_bias = np.array(user_bias).reshape(-1)
        item_bias = np.array(item_bias).flatten()
        # Align shapes by trimming item_bias if needed
        item_bias = item_bias[:prediction_matrix.shape[0]]
        return global_bias+user_bias+item_bias.flatten()+prediction_matrix



*   This function will predict the rating of certain user.



In [None]:
def top_recommended_isbn(predictions_ratings,item_id_mapping,n=20):
    top_indices=np.argsort(predictions_ratings)[::-1][:n]
    top_recommends=predictions_ratings[top_indices]
    isbn_code = [(list(item_id_mapping.keys())[i], score) for i, score in zip(top_indices, top_recommends)]
    return isbn_code




*   This function will return the isbn code that has more rating.



In [None]:
def recommend_books(isbn_code,original_data,top_n):
    # creating a dataframe
    isbn=pd.DataFrame(isbn_code,columns=['ISBN','Ratings'])
    # This will checks does we have the books details in our original dataset
    valid_books=isbn[isbn['ISBN'].isin(original_data['ISBN'].unique())]
    # using left join we are going to retrive the data from the orinal dataset
    top_recommends=pd.merge(valid_books,original_data,on='ISBN',how='left')
    return top_recommends.head(top_n)

In [None]:
model=model_train(Ratings_sparse_data,50,0.01,20)

  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
raw_user_id=276727
no_of_top_ratings=5
user_index=map_user_id_to_index(raw_user_id,user_id_mapping)

In [None]:
if user_index is not None and user_index < Ratings_sparse_data.shape[0]:
    global_bias,user_bias,item_bias=calculate_biases(Ratings_sparse_data,user_index)
    user_factor=model.user_factors[user_index]
    item_factors=model.item_factors
    predictions_ratings=predict_ratings(user_factor,item_factors,global_bias,user_bias,item_bias)
    top_isbn=top_recommended_isbn(predictions_ratings,item_id_mapping)
    recommended_book=recommend_books(top_isbn,Books_modified,no_of_top_ratings)
else:
    print(f"we dont have {raw_user_id} this user details but still we can recommend them based on the content based recommendation system")



In [None]:
recommended_book

Unnamed: 0,ISBN,Ratings,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,725102586,7.601066,From Curtin to Kerr,Fred Daly,1977,Sun Books,http://images.amazon.com/images/P/0725102586.0...,http://images.amazon.com/images/P/0725102586.0...,http://images.amazon.com/images/P/0725102586.0...
1,843949082,7.601066,To Meet Again,Elaine Barbieri,2001,Leisure Books,http://images.amazon.com/images/P/0843949082.0...,http://images.amazon.com/images/P/0843949082.0...,http://images.amazon.com/images/P/0843949082.0...
2,1551665212,7.601066,Wanting You,Nan Ryan,1999,Mira,http://images.amazon.com/images/P/1551665212.0...,http://images.amazon.com/images/P/1551665212.0...,http://images.amazon.com/images/P/1551665212.0...
3,037312256X,7.601066,The Mistress Scandal (Passion) (Harlequin Pre...,Kim Lawrence,2002,Harlequin,http://images.amazon.com/images/P/037312256X.0...,http://images.amazon.com/images/P/037312256X.0...,http://images.amazon.com/images/P/037312256X.0...
4,449205975,7.601066,Jian,ERIC VAN LUSTBADER,1986,Fawcett,http://images.amazon.com/images/P/0449205975.0...,http://images.amazon.com/images/P/0449205975.0...,http://images.amazon.com/images/P/0449205975.0...




*   This are recommended books for the particular user id.



#**Popularity Based Recommendations**

In [None]:
#popularity based recommendations


In [None]:
class PopularityBasedRecommender:
    def __init__(self):
        self.item_scores = None
        self.ratings_data = None
    # this function will calculate the avg rating and also no of user read certain book i mean count
    def fit(self,ratings_data, item_col, rating_col):
        self.ratings_data=ratings_data
        item_popularity = ratings_data.groupby(item_col).agg(popularity_score=(item_col,'size'),
                                                             average_rating=(rating_col,'mean')).reset_index()

        sorted_item_popularity=item_popularity.sort_values(by=['popularity_score','average_rating'],ascending=[False,False])

        self.item_scores = sorted_item_popularity
    # this will recommend the top 20 books that are more popular
    def recommend_popular_items(self,top_n=20):
        if self.item_scores is None:
            raise ValueError("Model has not been trained. Call 'fit' with rating_data.")
        return self.item_scores.head(top_n)
    def checks_isbn_isin(self,original_books,top_num,item_col='ISBN'):
        valid_books=self.item_scores[self.item_scores[item_col].isin(original_books[item_col].unique())]
        popular_books=pd.merge(valid_books,original_books,on='ISBN',how='left')
        return popular_books.head(top_num)

In [None]:
if __name__ == "__main__":
    recommender=PopularityBasedRecommender()
    recommender.fit(ratings_data=Reindex_Ratings,item_col='ISBN',rating_col='Book-Rating')
    recommender.recommend_popular_items()
    popular_books=recommender.checks_isbn_isin(Books_modified,10)

In [None]:
popular_books.head(10)

Unnamed: 0,ISBN,popularity_score,average_rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,971880107,2503,1.019177,Wild Animus,Rich Shapero,2004,Too Far,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...
1,316666343,1295,4.468726,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown",http://images.amazon.com/images/P/0316666343.0...,http://images.amazon.com/images/P/0316666343.0...,http://images.amazon.com/images/P/0316666343.0...
2,385504209,884,4.656109,The Da Vinci Code,Dan Brown,2003,Doubleday,http://images.amazon.com/images/P/0385504209.0...,http://images.amazon.com/images/P/0385504209.0...,http://images.amazon.com/images/P/0385504209.0...
3,60928336,732,3.448087,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,1997,Perennial,http://images.amazon.com/images/P/0060928336.0...,http://images.amazon.com/images/P/0060928336.0...,http://images.amazon.com/images/P/0060928336.0...
4,312195516,724,4.339779,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA,http://images.amazon.com/images/P/0312195516.0...,http://images.amazon.com/images/P/0312195516.0...,http://images.amazon.com/images/P/0312195516.0...
5,044023722X,647,3.187017,A Painted House,John Grisham,2001,Dell Publishing Company,http://images.amazon.com/images/P/044023722X.0...,http://images.amazon.com/images/P/044023722X.0...,http://images.amazon.com/images/P/044023722X.0...
6,142001740,615,4.219512,The Secret Life of Bees,Sue Monk Kidd,2003,Penguin Books,http://images.amazon.com/images/P/0142001740.0...,http://images.amazon.com/images/P/0142001740.0...,http://images.amazon.com/images/P/0142001740.0...
7,067976402X,614,3.2557,Snow Falling on Cedars,David Guterson,1995,Vintage Books USA,http://images.amazon.com/images/P/067976402X.0...,http://images.amazon.com/images/P/067976402X.0...,http://images.amazon.com/images/P/067976402X.0...
8,671027360,586,3.71843,Angels &amp; Demons,Dan Brown,2001,Pocket Star,http://images.amazon.com/images/P/0671027360.0...,http://images.amazon.com/images/P/0671027360.0...,http://images.amazon.com/images/P/0671027360.0...
9,446672211,585,4.105983,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,1998,Warner Books,http://images.amazon.com/images/P/0446672211.0...,http://images.amazon.com/images/P/0446672211.0...,http://images.amazon.com/images/P/0446672211.0...




*   This are the top 10 popular books that user gave more rating and also read by a lot users



In [None]:
 # trending books on each year

In [None]:
# Here i am going to take only the data which we have the book details. so by joining the books details with the user we can find the trending books on specific year
#data_for_trending_books=pd.merge(Reindex_Ratings,Books_modified,on='ISBN',how='inner')
# Now we dont need the user id
#data_for_trending_books=data_for_trending_books.drop(columns='User-ID')
# Saving the dataset
#save_mapping_to_pickle(data_for_trending_books,'data_for_trending.pkl')

 #**Trending Books**

In [None]:
# load the dataset
data_for_trending=load_user_mapping('data_for_trending.pkl')

In [None]:
class Trending_books:
    def __init__(self):
        self.dataset = None
        self.treding_items = None
    def fit(self,dataset):
        self.dataset=dataset
    def pick_trending_books(self,year,data_col,item_col,rating_col,top_n):
        if self.dataset is None:
            raise ValueError("Model has not been trained. Call 'fit' with interaction data.")
        year_data=self.dataset[self.dataset[data_col] == year]
        if year_data is None:
            raise ValueError(f"we dont have the books details of {year} year ")
        item_trending=year_data.groupby(item_col).agg(popularity_score=(item_col,'size'),
                                                       average_rating=(rating_col,'mean')).reset_index()
        trending_items=item_trending.sort_values(by=['popularity_score','average_rating'], ascending=[False, False])
        self.trending_items = trending_items.head(top_n)
    def book_recommender(self,original_data):
        book_details=pd.merge(self.trending_items,original_data,on="ISBN",how='left')
        return book_details



In [None]:
if __name__ == "__main__":
    trend_recommender=Trending_books()
    trend_recommender.fit(data_for_trending)
    trend_recommender.pick_trending_books(2002,'Year-Of-Publication','ISBN','Book-Rating',5)
    trending_books=trend_recommender.book_recommender(Books_modified)

In [None]:
trending_books

Unnamed: 0,ISBN,popularity_score,average_rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,316666343,1295,4.468726,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown",http://images.amazon.com/images/P/0316666343.0...,http://images.amazon.com/images/P/0316666343.0...,http://images.amazon.com/images/P/0316666343.0...
1,312278586,474,3.597046,The Nanny Diaries: A Novel,Emma McLaughlin,2002,St. Martin's Press,http://images.amazon.com/images/P/0312278586.0...,http://images.amazon.com/images/P/0312278586.0...,http://images.amazon.com/images/P/0312278586.0...
2,743418174,470,4.040426,Good in Bed,Jennifer Weiner,2002,Washington Square Press,http://images.amazon.com/images/P/0743418174.0...,http://images.amazon.com/images/P/0743418174.0...,http://images.amazon.com/images/P/0743418174.0...
3,440241073,456,3.195175,The Summons,John Grisham,2002,Dell Publishing Company,http://images.amazon.com/images/P/0440241073.0...,http://images.amazon.com/images/P/0440241073.0...,http://images.amazon.com/images/P/0440241073.0...
4,446610038,391,3.498721,1st to Die: A Novel,James Patterson,2002,Warner Vision,http://images.amazon.com/images/P/0446610038.0...,http://images.amazon.com/images/P/0446610038.0...,http://images.amazon.com/images/P/0446610038.0...
