# Preprocessing of Data

#### Importing important libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from rake_nltk import Rake
import re

#### Reading the initial raw csv file using pandas

In [5]:
# Load the raw dataset; the path is relative to the notebook's working directory.
books = pd.read_csv('books2.csv')

#### Getting the head, shape and columns of the books data

In [6]:
books.head()

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780000000000.0,YoungAdult Fiction Dystopia Fantasy ScienceFic...,KatnissEverdeen PeetaMellark Cato(HungerGames)...,...,,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,District12 Panem Capitol Panem Panem(UnitedSta...,https://i.gr-assets.com/images/S/compressed.ph...,2993816,30516,5.09
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780000000000.0,Fantasy YoungAdult Fiction Magic Childrens Adv...,SiriusBlack DracoMalfoy RonWeasley PetuniaDurs...,...,06/21/03,['Bram Stoker Award for Works for Young Reader...,2507623,"['1593642', '637516', '222366', '39573', '14526']",98.0,HogwartsSchoolofWitchcraftandWizardry(UnitedKi...,https://i.gr-assets.com/images/S/compressed.ph...,2632233,26923,7.38
2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,10000000000000.0,Classics Fiction HistoricalFiction School Lite...,ScoutFinch AtticusFinch JemFinch ArthurRadley ...,...,07-11-1960,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,"['2363896', '1333153', '573280', '149952', '80...",95.0,Maycomb Alabama(UnitedStates),https://i.gr-assets.com/images/S/compressed.ph...,2269402,23328,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",4.26,Alternate cover edition of ISBN 9780679783268S...,English,10000000000000.0,Classics Fiction Romance HistoricalFiction Lit...,Mr.Bennet Mrs.Bennet JaneBennet ElizabethBenne...,...,01/28/13,[],2998241,"['1617567', '816659', '373311', '113934', '767...",94.0,UnitedKingdom Derbyshire England(UnitedKingdom...,https://i.gr-assets.com/images/S/compressed.ph...,1983116,20452,
4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,About three things I was absolutely positive.\...,English,9780000000000.0,YoungAdult Fantasy Romance Vampires Fiction Pa...,EdwardCullen JacobBlack Laurent Renee BellaSwa...,...,10-05-2005,"['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,"['1751460', '1113682', '1008686', '542017', '5...",78.0,Forks Washington(UnitedStates) Phoenix Arizona...,https://i.gr-assets.com/images/S/compressed.ph...,1459448,14874,2.1


In [7]:
books.shape

(52478, 25)

In [6]:
books.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price'],
      dtype='object')

#### Dropping the columns we do not require for further operations

In [8]:
# Discard the identifier, rating, edition/date and pricing columns; only the
# descriptive text fields and the cover image URL remain afterwards.
books = books.drop(
    columns=[
        'bookId', 'rating', 'language', 'isbn',
        'bookFormat', 'edition', 'pages',
        'publishDate', 'firstPublishDate', 'awards',
        'numRatings', 'ratingsByStars', 'likedPercent',
        'bbeScore', 'bbeVotes', 'price',
    ]
)

In [9]:
books.columns

Index(['title', 'series', 'author', 'description', 'genres', 'characters',
       'publisher', 'setting', 'coverImg'],
      dtype='object')

#### Renaming the field `title` to `original_title`

In [10]:
# Keep the untouched title under a distinct name; a cleaned 'title' column is derived from it later.
books = books.rename(columns={'title': 'original_title'})

In [12]:
books.head(1)

Unnamed: 0,original_title,series,author,description,genres,characters,publisher,setting,coverImg
0,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,YoungAdult Fiction Dystopia Fantasy ScienceFic...,KatnissEverdeen PeetaMellark Cato(HungerGames)...,Scholastic Press,District12 Panem Capitol Panem Panem(UnitedSta...,https://i.gr-assets.com/images/S/compressed.ph...


#### Getting the info of all the columns

In [14]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52478 entries, 0 to 52477
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   original_title  52478 non-null  object
 1   series          23470 non-null  object
 2   author          52478 non-null  object
 3   description     51140 non-null  object
 4   genres          47855 non-null  object
 5   characters      13766 non-null  object
 6   publisher       48782 non-null  object
 7   setting         11578 non-null  object
 8   coverImg        51873 non-null  object
dtypes: object(9)
memory usage: 3.6+ MB


In [15]:
# Replace missing values in the free-text columns with empty strings so the
# string operations further down never see NaN ('series' and 'coverImg' are
# deliberately left as they are).
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form
# warns on pandas 2.x and has no effect once Copy-on-Write is enabled.
text_columns = ['description', 'publisher', 'genres', 'setting', 'characters']
books[text_columns] = books[text_columns].fillna('')

In [16]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52478 entries, 0 to 52477
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   original_title  52478 non-null  object
 1   series          23470 non-null  object
 2   author          52478 non-null  object
 3   description     52478 non-null  object
 4   genres          52478 non-null  object
 5   characters      52478 non-null  object
 6   publisher       52478 non-null  object
 7   setting         52478 non-null  object
 8   coverImg        51873 non-null  object
dtypes: object(9)
memory usage: 3.6+ MB


In [17]:
books.head(1)

Unnamed: 0,original_title,series,author,description,genres,characters,publisher,setting,coverImg
0,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,YoungAdult Fiction Dystopia Fantasy ScienceFic...,KatnissEverdeen PeetaMellark Cato(HungerGames)...,Scholastic Press,District12 Panem Capitol Panem Panem(UnitedSta...,https://i.gr-assets.com/images/S/compressed.ph...


#### Converting data into lower case

In [21]:
def remove_spaces(row):
    """Return a list with every element of ``row`` stripped of spaces and lower-cased.

    ``row`` is iterated element by element, so each element must provide
    ``replace`` and ``lower`` (e.g. ``str``). Passing a single string iterates
    over its characters.
    """
    return [item.replace(' ', '').lower() for item in row]

In [22]:
#books['genres'] = books['genres'].apply(remove_spaces).str.join(' ')

In [23]:
#books['characters'] = books['characters'].apply(remove_spaces).str.join(' ')

In [24]:
#books['setting'] = books['setting'].apply(remove_spaces).str.join(' ')

In [25]:
#books['description'] = books['description'].apply(remove_spaces).str.join(' ')

In [27]:
books.head(1)

Unnamed: 0,original_title,series,author,description,genres,characters,publisher,setting,coverImg
0,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,YoungAdult Fiction Dystopia Fantasy ScienceFic...,KatnissEverdeen PeetaMellark Cato(HungerGames)...,Scholastic Press,District12 Panem Capitol Panem Panem(UnitedSta...,https://i.gr-assets.com/images/S/compressed.ph...


In [28]:
# Lower-case character names (missing values were already replaced by '' above).
books['characters'] = books['characters'].str.lower()

In [29]:
# Lower-case titles; per books.info() this column has no missing values.
books['original_title'] = books['original_title'].str.lower()

In [30]:
# Lower-case author strings before keyword extraction is applied to them.
books['author'] = books['author'].str.lower()

In [31]:
# Lower-case the space-separated genre tags.
books['genres'] = books['genres'].str.lower()

In [32]:
# 'series' was not filled above and still holds NaN for many rows;
# .str.lower() leaves those entries as NaN.
books['series'] = books['series'].str.lower()

In [33]:
# Lower-case the description text.
books['description'] = books['description'].str.lower()

In [34]:
# Lower-case setting names (missing values were already replaced by '' above).
books['setting'] = books['setting'].str.lower()

In [36]:
books.head(1)

Unnamed: 0,original_title,series,author,description,genres,characters,publisher,setting,coverImg
0,the hunger games,the hunger games #1,suzanne collins,winning means fame and fortune.losing means ce...,youngadult fiction dystopia fantasy sciencefic...,katnisseverdeen peetamellark cato(hungergames)...,Scholastic Press,district12 panem capitol panem panem(unitedsta...,https://i.gr-assets.com/images/S/compressed.ph...


#### To extract the important keywords from the dataset columns, I have used the RAKE (Rapid Automatic Keyword Extraction) algorithm, provided by the `rake_nltk` library


In [37]:
from rake_nltk import Rake
import nltk
# Fetch NLTK's 'punkt' tokenizer data; when it is already installed the call only logs "already up-to-date".
nltk.download('punkt')
# One Rake instance with default settings, shared by extract_important_words below.
r=Rake()
def extract_important_words(row):
    """Return the RAKE key phrases of ``row`` as one space-separated string.

    The text is fed to the module-level ``Rake`` instance ``r``; the phrases it
    then reports through ``get_ranked_phrases()`` are joined with single spaces.
    """
    r.extract_keywords_from_text(row)
    return ' '.join(r.get_ranked_phrases())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sinja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
# NOTE(review): RAKE splits and re-orders the name tokens (row 1 ends up as
# "mary grandpré rowling k j illustrator"), so 'author' is no longer a readable
# name after this step - confirm that is acceptable for display purposes.
books['author'] = books['author'].apply(extract_important_words)

In [39]:
# Reduce each description to its RAKE key phrases, joined by spaces.
books['description'] = books['description'].apply(extract_important_words)

In [40]:
# Run the same keyword extraction over the genre tags.
books['genres'] = books['genres'].apply(extract_important_words)

In [41]:
# 'publisher' was not lower-cased in the cells above; the extracted phrases
# come back lower-case (compare the head() outputs before and after this step).
books['publisher'] = books['publisher'].apply(extract_important_words)

#### Further dropping the columns we won't require

In [42]:
# 'series' and 'characters' are not part of the combined feature text, so remove them.
books = books.drop(columns=['series', 'characters'])

In [35]:
books.head(1)

Unnamed: 0,original_title,author,description,genres,publisher,setting,coverImg
0,the hunger games,suzanne collins,old katniss everdeen regards losing means cert...,youngadult fiction dystopia fantasy sciencefic...,scholastic press,district12 panem capitol panem panem(unitedsta...,"""https://i.gr-assets.com/images/S/compressed.p..."


#### Creating a new column named `combined_features`, which combines the specified columns

In [43]:
# Concatenate author, description, genres, publisher and setting (in that order)
# into one space-separated text field per book.
books['combined_features'] = books['author'].str.cat(
    books[['description', 'genres', 'publisher', 'setting']],
    sep=' ',
)

In [44]:
books.head(4)

Unnamed: 0,original_title,author,description,genres,publisher,setting,coverImg,combined_features
0,the hunger games,suzanne collins,old katniss everdeen regards losing means cert...,youngadult fiction dystopia fantasy sciencefic...,scholastic press,district12 panem capitol panem panem(unitedsta...,https://i.gr-assets.com/images/S/compressed.ph...,suzanne collins old katniss everdeen regards l...
1,harry potter and the order of the phoenix,mary grandpré rowling k j illustrator,personality like poisoned honey ordinary wizar...,fantasy youngadult fiction magic childrens adv...,scholastic inc,hogwartsschoolofwitchcraftandwizardry(unitedki...,https://i.gr-assets.com/images/S/compressed.ph...,mary grandpré rowling k j illustrator personal...
2,to kill a mockingbird,harper lee,young alabama woman claims universal appeal ha...,classics fiction historicalfiction school lite...,harper perennial modern classics,maycomb alabama(unitedstates),https://i.gr-assets.com/images/S/compressed.ph...,harper lee young alabama woman claims universa...
3,pride and prejudice,jane austen anna quindlen introduction,radiant wit sparkles alternate cover edition j...,classics fiction romance historicalfiction lit...,modern library,unitedkingdom derbyshire england(unitedkingdom...,https://i.gr-assets.com/images/S/compressed.ph...,jane austen anna quindlen introduction radiant...


In [45]:
books.columns

Index(['original_title', 'author', 'description', 'genres', 'publisher',
       'setting', 'coverImg', 'combined_features'],
      dtype='object')

In [46]:
# These columns now live inside 'combined_features', so the individual copies can go.
books = books.drop(columns=['description', 'genres', 'publisher', 'setting'])

In [47]:
books.head(10)

Unnamed: 0,original_title,author,coverImg,combined_features
0,the hunger games,suzanne collins,https://i.gr-assets.com/images/S/compressed.ph...,suzanne collins old katniss everdeen regards l...
1,harry potter and the order of the phoenix,mary grandpré rowling k j illustrator,https://i.gr-assets.com/images/S/compressed.ph...,mary grandpré rowling k j illustrator personal...
2,to kill a mockingbird,harper lee,https://i.gr-assets.com/images/S/compressed.ph...,harper lee young alabama woman claims universa...
3,pride and prejudice,jane austen anna quindlen introduction,https://i.gr-assets.com/images/S/compressed.ph...,jane austen anna quindlen introduction radiant...
4,twilight,stephenie meyer,https://i.gr-assets.com/images/S/compressed.ph...,stephenie meyer three things extraordinarily s...
5,the book thief,markus zusak goodreads author,https://i.gr-assets.com/images/S/compressed.ph...,markus zusak goodreads author winning author m...
6,animal farm,"preface ), c russell baker george orwell woodh...",https://i.gr-assets.com/images/S/compressed.ph...,"preface ), c russell baker george orwell woodh..."
7,the chronicles of narnia,pauline baynes lewis illustrator c,https://i.gr-assets.com/images/S/compressed.ph...,pauline baynes lewis illustrator c seven books...
8,j.r.r. tolkien 4-book boxed set: the hobbit an...,tolkien r r j,https://i.gr-assets.com/images/S/compressed.ph...,tolkien r r j mysterious stranger called strid...
9,gone with the wind,margaret mitchell,https://i.gr-assets.com/images/S/compressed.ph...,margaret mitchell must use every means georgia...


#### Removing all non-alphanumeric characters from the original title and storing the result in a new column `title`

In [48]:
def clear(data):
    """Strip every character that is not a lower-case ASCII letter or a digit.

    Parameters
    ----------
    data : str
        Text to normalise. It is expected to be lower-cased already, because
        upper-case letters are removed rather than converted.

    Returns
    -------
    list of str
        A one-element list holding the cleaned string; callers unwrap it
        with ``.str.join('')``.
    """
    # The digit range must be 0-9: the earlier 1-9 range silently deleted
    # every '0', e.g. "2001: a space odyssey" became "21aspaceodyssey".
    return [re.sub("[^a-z0-9]", "", data)]

In [49]:
# clear() returns a one-element list per title, so .str.join('') turns each cell back into a plain string.
books['title'] = books['original_title'].apply(clear).str.join('')

In [50]:
#books['combined_features'] = books['combined_features'].apply(clear).str.join('')

In [51]:
books.head()

Unnamed: 0,original_title,author,coverImg,combined_features,title
0,the hunger games,suzanne collins,https://i.gr-assets.com/images/S/compressed.ph...,suzanne collins old katniss everdeen regards l...,thehungergames
1,harry potter and the order of the phoenix,mary grandpré rowling k j illustrator,https://i.gr-assets.com/images/S/compressed.ph...,mary grandpré rowling k j illustrator personal...,harrypotterandtheorderofthephoenix
2,to kill a mockingbird,harper lee,https://i.gr-assets.com/images/S/compressed.ph...,harper lee young alabama woman claims universa...,tokillamockingbird
3,pride and prejudice,jane austen anna quindlen introduction,https://i.gr-assets.com/images/S/compressed.ph...,jane austen anna quindlen introduction radiant...,prideandprejudice
4,twilight,stephenie meyer,https://i.gr-assets.com/images/S/compressed.ph...,stephenie meyer three things extraordinarily s...,twilight


#### Converting the processed data into a CSV file which we will use in our recommendation system

In [45]:
# Write the processed frame to disk; index=False leaves the row index out of the file.
books.to_csv('final_data.csv',index=False)