### Task 1: Load and Read Text Files

In [1]:
import os

# Define the folder path
folder_path = "potter"  # If it's in the current working directory

# Get all text files in the folder
file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(".txt")]

# Read all books into a list
books = []
for file_path in file_paths:
    with open(file_path, "r", encoding="utf-8") as file:
        books.append(file.read())

# Display a snippet of each book to verify loading
for i, book in enumerate(books):
    print(f"Book {i+1} preview:\n", book[:500], "\n" + "-"*50)


Book 1 preview:
 HARRY 

POTTER 




I 




DUDLEY DEMENTED 

The hottest day of the summer so far was drawing to 
a close and a drowsy silence lay over the large, square 
houses of Privet Drive. Cars that were usually 
gleaming stood dusty in their drives and lawns that 
were once emerald green lay parched and yellowing; 
the use of hosepipes had been banned due to 
drought. Deprived of their usual car-washing and 
lawn-mowing pursuits, the inhabitants of Privet Drive 
had retreated into the shade of their cool 
--------------------------------------------------
Book 2 preview:
 / 




OWL POST 

Harry Potter was a highly unusual boy in many ways. 
For one thing, he hated the summer holidays more 
than any other time of year. For another, he really 
wanted to do his homework but was forced to do it in 
secret, in the dead of night. And he also happened to 
be a wizard. 

It was nearly midnight, and he was lying on his 
stomach in bed, the blankets drawn right over his 
head like a ten

### Task 2: Extract Book Titles Using Regular Expressions

In [2]:
import re

# Function to extract book title
def extract_title(text):
    match = re.search(r"Page \| \d+ (.*?) - J\.K\. Rowling", text)
    return match.group(1) if match else "Unknown Title"

# Extract titles
book_titles = [extract_title(book) for book in books]

# Display extracted titles
for i, title in enumerate(book_titles):
    print(f"Book {i+1}: {title}")


Book 1: Harry Potter and the Order of the Phoenix
Book 2: Harry Potter and the Prisoner of Azkaban
Book 3: Harry Potter and the Chamber of Secrets
Book 4: Harry Potter and the Goblet of Fire
Book 5: Harry Potter and the Deathly Hallows
Book 6: Harry Potter and the Half Blood Prince
Book 7: Harry Potter and the Philosophers Stone


### Task 3: Preprocess Text

In [6]:
import re
from tqdm import tqdm

# Precompile regex patterns for better performance
page_indicator_pattern = re.compile(r"Page \| \d+ .*? - J\.K\. Rowling")
chapter_trim_pattern = re.compile(r".*?CHAPTER", re.DOTALL)
chapter_header_pattern = re.compile(r"\n[A-Z\s]+\n")

# Function to clean text
def clean_text(text):
    # Remove page indicators
    text = page_indicator_pattern.sub("", text)

    # Trim content before first chapter
    text = chapter_trim_pattern.sub("CHAPTER", text)

    # Remove chapter headers (all caps)
    text = chapter_header_pattern.sub("\n", text)

    # Replace line breaks with spaces
    text = text.replace("\n", " ")

    return text.strip()

# Apply cleaning with tqdm progress bar
cleaned_books = []
for book in tqdm(books, desc="Cleaning Books", unit="book"):
    cleaned_books.append(clean_text(book))

# Display snippet of cleaned text
for i, book in enumerate(cleaned_books):
    print(f"Book {i+1} cleaned preview:\n", book[:500], "\n" + "-"*50)


Cleaning Books:   0%|          | 0/7 [00:57<?, ?book/s]


KeyboardInterrupt: 

In [7]:
import cudf  # RAPIDS GPU DataFrame
import cupy as cp
import re
from tqdm import tqdm

# Convert list to cuDF DataFrame
df_gpu = cudf.DataFrame({"book_text": books})

# Precompile regex patterns for GPU efficiency
page_indicator_pattern = re.compile(r"Page \| \d+ .*? - J\.K\. Rowling")
chapter_trim_pattern = re.compile(r".*?CHAPTER", re.DOTALL)
chapter_header_pattern = re.compile(r"\n[A-Z\s]+\n")

# Define GPU-accelerated cleaning function
def clean_text_gpu(text):
    text = page_indicator_pattern.sub("", text)  # Remove page indicators
    text = chapter_trim_pattern.sub("CHAPTER", text)  # Trim before first chapter
    text = chapter_header_pattern.sub("\n", text)  # Remove all-caps chapter headers
    text = text.replace("\n", " ")  # Replace newlines with spaces
    return text.strip()

# Apply cleaning function on GPU
df_gpu["cleaned_text"] = df_gpu["book_text"].applymap(clean_text_gpu)

# Convert back to Pandas for compatibility (if needed)
cleaned_books = df_gpu["cleaned_text"].to_pandas().tolist()

# Display progress and verify
for i, book in enumerate(cleaned_books[:3]):  # Show only first 3 books
    print(f"Book {i+1} cleaned preview:\n", book[:500], "\n" + "-"*50)


ModuleNotFoundError: No module named 'cudf'

### Task 4: Text Preprocessing

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

nltk.download("stopwords")
nltk.download("wordnet")

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Function to preprocess text
def preprocess_text(text):
    # Remove punctuation and numbers
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)

    # Tokenization and lemmatization
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return words

# Apply preprocessing
preprocessed_books = [preprocess_text(book) for book in cleaned_books]

# Display snippet of processed text
for i, book in enumerate(preprocessed_books):
    print(f"Book {i+1} processed preview:\n", book[:20], "\n" + "-"*50)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rajubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rajubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


NameError: name 'cleaned_books' is not defined

### Task 5: Calculate TF-IDF and Find Important Words

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert list of lists into list of strings
book_strings = [" ".join(book) for book in preprocessed_books]

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(book_strings)

# Get feature names
feature_names = vectorizer.get_feature_names_out()

# Extract top words per book
def top_tfidf_words(tfidf_matrix, feature_names, book_index, top_n=10):
    row = tfidf_matrix[book_index].toarray()[0]
    top_indices = row.argsort()[-top_n:][::-1]
    return [(feature_names[i], row[i]) for i in top_indices]

# Display top words for each book
for i, title in enumerate(book_titles):
    print(f"\nTop words for '{title}':")
    print(top_tfidf_words(tfidf_matrix, feature_names, i))


NameError: name 'preprocessed_books' is not defined