<a href="https://colab.research.google.com/github/sof1a03/KDE-group6/blob/main/ratings_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ratings Data

* Contains the book rating information. Ratings (Book-Rating) are either explicit, expressed on a scale from 1-10 (higher values denoting higher appreciation), or implicit, expressed by 0.

In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
rating_df = pd.read_csv('Ratings.csv')

In [3]:
#displaying the first 5 rows
rating_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [5]:
# checking null values
rating_df.isna().sum()

Unnamed: 0,0
User-ID,0
ISBN,0
Book-Rating,0


In [6]:
# checking for unique user ids and isbn values
print('Number of unique user ids is {} and ISBN no. is {}'.format(rating_df['User-ID'].nunique(), rating_df['ISBN'].nunique()))

Number of unique user ids is 105283 and ISBN no. is 340556


This means that many users are buying multiple books. Also some books are really famous and hence are bought by multiple users.*



In [7]:
# making all the ISBN no. uppercase
rating_df['ISBN'].apply(lambda x: x.upper())

Unnamed: 0,ISBN
0,034545104X
1,0155061224
2,0446520802
3,052165615X
4,0521795028
...,...
1149775,1563526298
1149776,0679447156
1149777,0515107662
1149778,0590442449


In [8]:
# checking for duplicates
rating_df[rating_df.duplicated()].sum()

Unnamed: 0,0
User-ID,0
ISBN,0
Book-Rating,0


In [9]:
books_df=pd.read_csv('books_cleaned.csv', low_memory=False)
# lets see if all the books in rating_df are also in books_df
rating_df_new = rating_df[rating_df['ISBN'].isin(books_df['ISBN'])]
print('Shape of rating_df: {} and rating_df_new: {}'.format(rating_df.shape, rating_df_new.shape))

Shape of rating_df: (1149780, 3) and rating_df_new: (828658, 3)


In [10]:
# book ratings
rating_df_new['Book-Rating'].value_counts().reset_index()

Unnamed: 0,Book-Rating,count
0,0,521340
1,8,73135
2,10,56982
3,7,53312
4,9,48227
5,5,36789
6,6,25479
7,4,6149
8,3,4100
9,2,1926


In [11]:
# most popular books
rating_df_new.groupby('ISBN')['Book-Rating'].count().reset_index().sort_values(by='Book-Rating', ascending=False)[:10]

Unnamed: 0,ISBN,Book-Rating
180565,971880107,2502
32143,316666343,1295
59290,385504209,883
5903,60928336,732
26924,312195516,723
17539,142001740,615
109744,671027360,586
77573,446672211,585
32070,316601950,568
52399,375727345,552


In [12]:
explicit_rating = rating_df_new[rating_df_new['Book-Rating'] != 0]
implicit_rating = rating_df_new[rating_df_new['Book-Rating'] == 0]
print('Shape of explicit rating: {} and implicit rating: {}'.format(explicit_rating.shape, implicit_rating.shape))

Shape of explicit rating: (307318, 3) and implicit rating: (521340, 3)


In [13]:
# most purchased books including the implicitely rated books
rating_df_new.groupby('ISBN')['User-ID'].count().reset_index().sort_values(by='User-ID', ascending=False)[:10]['ISBN'].values

array(['0971880107', '0316666343', '0385504209', '0060928336',
       '0312195516', '0142001740', '0671027360', '0446672211',
       '0316601950', '0375727345'], dtype=object)

In [14]:
# getting the book names corresponding to these ISBNs
isbn_nums = ['0971880107', '0316666343', '0385504209', '0060928336',
       '0312195516', '044023722X', '0142001740', '067976402X',
       '0671027360', '0446672211']
books_df[books_df['ISBN'].isin(isbn_nums)]

Unnamed: 0,ISBN,Book_Title,Book_Author,Year_Of_Publication,Publisher
26,971880107,wild animus,rich shapero,2004.0,too far
118,671027360,angels &amp; demons,dan brown,2001.0,pocket star
354,142001740,the secret life of bees,sue monk kidd,2003.0,penguin books
405,316666343,the lovely bones: a novel,alice sebold,2002.0,"little, brown"
517,312195516,the red tent (bestselling backlist),anita diamant,1998.0,picador usa
696,446672211,where the heart is (oprah's book club (paperba...,billie letts,1998.0,warner books
738,385504209,the da vinci code,dan brown,2003.0,doubleday
1086,60928336,divine secrets of the ya-ya sisterhood: a novel,rebecca wells,1997.0,perennial


In [15]:
# most popular explicitely rated books
explicit_rating.groupby('ISBN')['Book-Rating'].count().reset_index().sort_values(by='Book-Rating', ascending=False)[:10]

Unnamed: 0,ISBN,Book-Rating
18187,316666343,707
97285,971880107,581
32102,385504209,487
14895,312195516,383
3297,60928336,320
10046,142001740,307
42226,446672211,295
47024,452282152,278
18139,316601950,272
59665,671027360,269


In [16]:
# getting the book names corresponding to these ISBNs
isbn_nums = ['0316666343', '0971880107', '0385504209', '0312195516', '0060928336']
books_df[books_df['ISBN'].isin(isbn_nums)]

Unnamed: 0,ISBN,Book_Title,Book_Author,Year_Of_Publication,Publisher
26,971880107,wild animus,rich shapero,2004.0,too far
405,316666343,the lovely bones: a novel,alice sebold,2002.0,"little, brown"
517,312195516,the red tent (bestselling backlist),anita diamant,1998.0,picador usa
738,385504209,the da vinci code,dan brown,2003.0,doubleday
1086,60928336,divine secrets of the ya-ya sisterhood: a novel,rebecca wells,1997.0,perennial


In [17]:
rating_df.to_csv('ratings_cleaned.csv', index=False)