In [29]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [11]:
# Download NLTK data files 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chrysentiaclarissa/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chrysentiaclarissa/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chrysentiaclarissa/nltk_data...


True

In [13]:
# Read the CSV files
df1 = pd.read_csv('books_with_descriptions_chunk_0.csv')
df2 = pd.read_csv('books_with_descriptions_chunk_1.csv')
df3 = pd.read_csv('books_with_descriptions_chunk_2.csv')
df4 = pd.read_csv('books_with_descriptions_chunk_3.csv')

# Concatenate the DataFrames by rows
combined_data = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [14]:
combined_data.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,description
0,195153448,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,Provides an introduction to classical myths pl...
1,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct..."
2,60973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,"Here, for the first time in paperback, is an o..."
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,"""Scientists have recently discovered shards of..."
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999.0,W. W. Norton &amp; Company,A look at the incredibly well-preserved ancien...


In [15]:
combined_data.shape

(30000, 6)

In [16]:
# Save the combined DataFrame to a new CSV file
combined_data.to_csv('books_with_descriptions.csv', index=False)

In [17]:
# Fetch preprocessed cleaned data with descriptions
dir_path = '/Users/chrysentiaclarissa/Downloads/NLP Book Recommendation System/Book-Recommendation-System-main/books_with_descriptions.csv'
data = pd.read_csv(dir_path)

## Remove unwanted rows

Filter out rows with 'Description not found'.

In [18]:
filtered_data = data[data['description'] != 'Description not found']
filtered_data.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,description
0,195153448,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,Provides an introduction to classical myths pl...
1,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,"In a small town in Canada, Clara Callan reluct..."
2,60973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,"Here, for the first time in paperback, is an o..."
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,"""Scientists have recently discovered shards of..."
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999.0,W. W. Norton &amp; Company,A look at the incredibly well-preserved ancien...


In [19]:
filtered_data.shape

(13327, 6)

## Text cleaning

Clean the 'Description' text by: (1) Convert text to lowercase; (2) Remove punctuation; (3) Remove stopwords; (4) Tokenize the text; and (5) 
Perform stemming or lemmatization.

In [20]:
# Initialize the stopwords set, lemmatizer, and punctuation set
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punctuation = set(string.punctuation)

In [30]:
# Function to clean the text
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation using regex
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize the words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join the tokens back into a single string
    cleaned_text = ' '.join(tokens)
    return cleaned_text

In [31]:
filtered_data['description'] = filtered_data['description'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['description'] = filtered_data['description'].apply(clean_text)


In [33]:
pd.set_option('display.max_colwidth', 1)

In [32]:
filtered_data.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,description
0,195153448,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,provides introduction classical myth placing addressed topic within historical context discussion archaeological evidence support mythical event theme portrayed literature art music film
1,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,small town canada clara callan reluctantly take leave sister nora bound new york time growing threat fascism europe constant worry people escape reality radio movie meanwhile two sister vastly different personality yet inextricably linked shared past try find place within complex web social expectation young woman 1930s nora embarks glamorous career radiosoap opera star clara strong independentminded woman struggle observe traditional boundary small tightknit community without relinquishing dream love freedom adventure however thing nt simple appear nora letter eventually reveal life big city le exotic seems tranquil solitude clara life shattered series unforeseeable event twist fate require clara courage strength finally put seemingly unbreakable bond sister test
2,60973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,first time paperback outstanding military history offer dramatic new perspective allied campaign began invasion dday beach normandy nationa advertising military history
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata,1999.0,Farrar Straus Giroux,scientist recently discovered shard flu virus human remains frozen arctic tundra scrap tissue preserved government warehouse flu gina kolata reporter new york time unravels mystery lethal virus high drama great adventure story alaska norway street hong kong corridor white house kolata track race recover live pathogen probe fear impelled government policy delf history flu previous epidemic profile expert hot trail amateur woefully misguided detail science latest understanding mortal disease book jackettitle summary field provided blackwell north america inc right reserved
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999.0,W. W. Norton &amp; Company,look incredibly wellpreserved ancient mummy found western china describes clothing appearance attempt reconstruct culture speculates caucasian could found way foot himalayan mountain


## Drop books with the same descriptions

In [35]:
# Remove rows with duplicate description
filtered_data = filtered_data.drop_duplicates(subset=['description'])

In [36]:
filtered_data.shape

(13094, 6)

In [37]:
filtered_data.to_csv('preprocessed_books_descriptions.csv', index=False)