## Loading the Data

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files read below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Directory holding the three input CSVs. Change this single constant to
# run the notebook against a different data location.
DATA_DIR = '/content/drive/MyDrive'

# Raw tables: book metadata, user profiles, and (user, ISBN, rating) rows.
books = pd.read_csv(f'{DATA_DIR}/Books.csv')
users = pd.read_csv(f'{DATA_DIR}/Users.csv')
ratings = pd.read_csv(f'{DATA_DIR}/Ratings.csv')

  books = pd.read_csv('/content/drive/MyDrive/Books.csv')


In [4]:
# Preview the first rows of the book metadata table.
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [5]:
# Preview the first rows of the user table.
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [6]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
# Report (rows, columns) for each of the three raw tables, in load order.
for frame in (books, users, ratings):
    print(frame.shape)

(271360, 8)
(278858, 3)
(1149780, 3)


In [8]:
# Missing values per column in the book metadata.
books.isna().sum()

Unnamed: 0,0
ISBN,0
Book-Title,0
Book-Author,2
Year-Of-Publication,0
Publisher,2
Image-URL-S,0
Image-URL-M,0
Image-URL-L,3


In [9]:
# Missing values per column in the user table.
users.isna().sum()

Unnamed: 0,0
User-ID,0
Location,0
Age,110762


In [10]:
# Missing values per column in the ratings table.
ratings.isna().sum()

Unnamed: 0,0
User-ID,0
ISBN,0
Book-Rating,0


In [None]:
# Number of book rows that exactly repeat an earlier row (all columns equal).
books.duplicated().sum()

0

In [None]:
# Number of rating rows that exactly repeat an earlier row (all columns equal).
ratings.duplicated().sum()

0

In [None]:
# Number of user rows that exactly repeat an earlier row (all columns equal).
users.duplicated().sum()

0

## Popularity-Based Recommender System

In [13]:
# Attach book metadata to every rating row. `merge` defaults to an inner
# join, so ratings whose ISBN does not appear in `books` are dropped.
ratings_with_name = ratings.merge(books, on='ISBN')

# Number of rating rows per title. Selecting the column before counting
# avoids aggregating every other column only to throw the result away.
num_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index()
    .rename(columns={'Book-Rating': 'num_ratings'})
)
num_rating_df

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [41]:
# Show the dtype of the rating column in the merged frame.
print(ratings_with_name['Book-Rating'].dtype)

float64


In [42]:
# Coerce ratings to float (anything non-numeric becomes NaN) and drop the
# rows that could not be parsed. Built as one chain so the frame is never
# modified column-by-column after it has been filtered.
ratings_with_name = (
    ratings_with_name
    .assign(**{
        'Book-Rating': pd.to_numeric(
            ratings_with_name['Book-Rating'], errors='coerce'
        ).astype(float)
    })
    .dropna(subset=['Book-Rating'])
)

# Mean rating per title.
avg_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index()
    .rename(columns={'Book-Rating': 'avg_ratings'})
)

# Display the resulting dataframe
print(avg_rating_df)


                                               Book-Title  avg_ratings
0        A Light in the Storm: The Civil War Diary of ...     2.250000
1                                   Always Have Popsicles     0.000000
2                    Apple Magic (The Collector's series)     0.000000
3        Ask Lily (Young Women of Faith: Lily Series, ...     8.000000
4        Beyond IBM: Leadership Marketing and Finance ...     0.000000
...                                                   ...          ...
241066                                      Ã?Â?lpiraten.     0.000000
241067                     Ã?Â?rger mit Produkt X. Roman.     5.250000
241068                                Ã?Â?sterlich leben.     7.000000
241069                              Ã?Â?stlich der Berge.     2.666667
241070                                  Ã?Â?thique en toc     4.000000

[241071 rows x 2 columns]


In [43]:
# One row per title carrying both its rating count and its mean rating.
popular_df = pd.merge(num_rating_df, avg_rating_df, on='Book-Title')
popular_df

Unnamed: 0,Book-Title,num_ratings,avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


In [44]:
# Keep titles with at least 250 ratings, then take the 50 with the highest
# mean rating.
has_enough_ratings = popular_df['num_ratings'] >= 250
popular_df = (
    popular_df.loc[has_enough_ratings]
    .sort_values('avg_ratings', ascending=False)
    .head(50)
)
popular_df

Unnamed: 0,Book-Title,num_ratings,avg_ratings
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80414,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453
191612,The Hobbit : The Enchanting Prelude to The Lor...,281,5.007117
187377,The Fellowship of the Ring (The Lord of the Ri...,368,4.94837
80445,Harry Potter and the Sorcerer's Stone (Harry P...,575,4.895652
211384,"The Two Towers (The Lord of the Rings, Part 2)",260,4.880769
219741,To Kill a Mockingbird,510,4.7


In [45]:
# Attach author and cover image to each popular title, keeping one edition
# per title.
# Projecting to the three key columns first makes the cell safe to re-run:
# without it, a second run would merge `books` into a frame that already has
# 'Book-Author' / 'Image-URL-M', producing suffixed columns and a KeyError.
popular_df = (
    popular_df[['Book-Title', 'num_ratings', 'avg_ratings']]
    .merge(books, on='Book-Title')
    .drop_duplicates('Book-Title')
    [['Book-Title', 'Book-Author', 'Image-URL-M', 'num_ratings', 'avg_ratings']]
)
popular_df.shape

(50, 5)

In [46]:
# Cover URL of the row labelled 0 (a label lookup on the integer index, not a
# positional one).
popular_df['Image-URL-M'][0]

'http://images.amazon.com/images/P/0439136350.01.MZZZZZZZ.jpg'

## Collaborative Filtering Recommender System

In [47]:
# Boolean Series indexed by User-ID: True for users with more than 200
# rating rows in the merged frame.
x = ratings_with_name.groupby('User-ID')['Book-Rating'].count().gt(200)
# IDs of those heavy raters.
padhe_likhe_users = x.loc[x].index

In [48]:
# Rating rows that belong to the heavy raters selected above.
by_heavy_rater = ratings_with_name['User-ID'].isin(padhe_likhe_users)
filtered_rating = ratings_with_name.loc[by_heavy_rater]

In [49]:
# Boolean Series indexed by Book-Title: True for titles with at least 50
# rating rows. Counted over all users, not only the heavy raters.
y = ratings_with_name.groupby('Book-Title')['Book-Rating'].count().ge(50)
# Titles that clear the threshold.
famous_books = y.loc[y].index

In [51]:
# Heavy raters' rows, restricted to the frequently rated titles.
is_famous = filtered_rating['Book-Title'].isin(famous_books)
final_rating = filtered_rating.loc[is_famous]
final_rating.shape

(112147, 10)

In [52]:
# drop_duplicates returns a new frame; without the assignment the call had
# no effect on final_rating.
final_rating = final_rating.drop_duplicates()
final_rating.shape

(112147, 10)

In [53]:
# Title-by-user rating matrix: one row per book, one column per user.
# Multiple ratings for the same (title, user) pair are averaged, which is
# pivot_table's default aggregation.
pt = pd.pivot_table(
    final_rating,
    values='Book-Rating',
    index='Book-Title',
    columns='User-ID',
)

In [54]:
# A missing (title, user) cell means that user has no rating for that title;
# store it as 0.
pt = pt.fillna(0)

In [55]:
# Inspect the filled title-by-user matrix.
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"\O\"" Is for Outlaw""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0
"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every pair of rows of `pt` (one row per title),
# giving a square matrix whose row/column order follows pt.index.
similarity_scores = cosine_similarity(pt)
similarity_scores.shape

(2444, 2444)

In [35]:
def recommend(book_name, top_n=5):
    """Return the books most similar to ``book_name``.

    Reads the module-level ``pt`` (title-by-user matrix), ``similarity_scores``
    (square matrix aligned with ``pt.index``) and ``books`` (metadata table).

    Parameters
    ----------
    book_name : str
        Title to look up; must match an entry of ``pt.index`` exactly.
    top_n : int, default 5
        Maximum number of recommendations to return (non-negative).

    Returns
    -------
    list of list
        One ``[title, author, image_url]`` list per recommendation, ordered
        from most to least similar. An inner list is empty when the title has
        no row in ``books``.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``pt.index``.

    Notes
    -----
    Each recommended title is also printed, one per line.
    """
    # Find the row of the requested book, failing with a clear message
    # instead of a bare "index 0 is out of bounds".
    matches = np.where(pt.index == book_name)[0]
    if matches.size == 0:
        raise IndexError(f"Book {book_name!r} is not in the similarity matrix")
    index = int(matches[0])

    # Rank every *other* book by similarity. The queried book is excluded by
    # position rather than by assuming it sorts first: on ties the stable
    # sort may place another book ahead of it.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_scores[index])
        if position != index
    ]
    similar_items = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    data = []
    for position, _score in similar_items:
        title = pt.index[position]
        print(title)
        # One metadata row per title; computed once and reused for all fields.
        book_row = books[books['Book-Title'] == title].drop_duplicates('Book-Title')
        item = []
        item.extend(book_row['Book-Title'])
        item.extend(book_row['Book-Author'])
        item.extend(book_row['Image-URL-M'])
        data.append(item)
    return data

In [65]:
# Example query: books most similar to 'The Hobbit'.
recommend('The Hobbit')

Memoirs of a Geisha Uk
Foundation (Foundation Novels (Paperback))
Girl With a Pearl Earring
Hawaii
The Return of the King (The Lord of the Rings, Part 3)


[['Memoirs of a Geisha Uk',
  'Arthur Golden',
  'http://images.amazon.com/images/P/0099771519.01.MZZZZZZZ.jpg'],
 ['Foundation (Foundation Novels (Paperback))',
  'Isaac Asimov',
  'http://images.amazon.com/images/P/0345308999.01.MZZZZZZZ.jpg'],
 ['Girl With a Pearl Earring',
  'Tracy Chevalier',
  'http://images.amazon.com/images/P/052594527X.01.MZZZZZZZ.jpg'],
 ['Hawaii',
  'James A. Michener',
  'http://images.amazon.com/images/P/0449213358.01.MZZZZZZZ.jpg'],
 ['The Return of the King (The Lord of the Rings, Part 3)',
  'J.R.R. TOLKIEN',
  'http://images.amazon.com/images/P/0345339738.01.MZZZZZZZ.jpg']]

In [None]:
import pickle
# Persist the popularity table. The context manager closes (and flushes) the
# file; a bare open() passed to pickle.dump leaves the handle open.
with open('popular.pkl', 'wb') as popular_file:
    pickle.dump(popular_df, popular_file)

In [None]:
# Persist the collaborative-filtering artifacts. Each file is written inside
# a context manager so its handle is closed (and flushed) before the next.
for artifact, filename in (
    (pt, 'pt.pkl'),
    (books, 'books.pkl'),
    (similarity_scores, 'similarity_scores.pkl'),
):
    with open(filename, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)