### 🚀 Task 1: Load All Books

In [12]:
import os
import re
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
import nltk

# Fetch the NLTK resources this notebook relies on (tokenizer models and
# stop-word lists).
nltk.download("punkt")
nltk.download("stopwords")

# Folder containing the book text files
folder_path = "ASoIaF"

# Collect every .txt file in the folder; sorting gives a stable load order
book_files = sorted(
    os.path.join(folder_path, file_name)
    for file_name in os.listdir(folder_path)
    if file_name.endswith(".txt")
)

# Maps book name (file name without its extension) -> full text of the book
books = {}

for file_path in book_files:
    # splitext removes only the trailing extension; str.replace(".txt", "")
    # would also delete ".txt" appearing in the middle of a file name.
    book_name = os.path.splitext(os.path.basename(file_path))[0]
    # errors="ignore" drops undecodable bytes instead of raising
    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        books[book_name] = file.read()

print("✅ All books have been loaded successfully!")

✅ All books have been loaded successfully!


[nltk_data] Downloading package punkt to /home/rajubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rajubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 🚀 TASK 2: CLEAN TEXT (Remove Metadata, Extra Spaces, and Special Characters)

In [13]:
def clean_text(text, metadata_chars=1000):
    """Strip leading metadata, punctuation and redundant whitespace.

    Args:
        text: Raw text of a book.
        metadata_chars: Number of leading characters to discard. The default
            of 1000 assumes the title and other front matter fit in that
            span; pass 0 to keep the whole text.

    Returns:
        The remaining text containing only word characters, apostrophes and
        single spaces, with no leading or trailing whitespace.
    """
    text = text[metadata_chars:]

    # Remove punctuation *before* collapsing whitespace: deleting a
    # free-standing symbol (e.g. the dash in "a - b") leaves two adjacent
    # spaces, which the normalization below then folds into one.
    text = re.sub(r"[^\w\s']", "", text)  # Keep words and apostrophes only
    text = re.sub(r"\s+", " ", text)  # Normalize every whitespace run

    return text.strip()

# Run the cleaner over every loaded book, keeping the same book-name keys
books_cleaned = {
    title: clean_text(raw_text)
    for title, raw_text in books.items()
}
print("✅ Text cleaning completed!")

✅ Text cleaning completed!


### 🚀 TASK 3: SPLIT TEXT INTO CHAPTERS

In [14]:
def split_into_chapters(text):
    """Split text into (heading, body) pairs at chapter headings.

    A heading is "CHAPTER" or "Chapter" followed by whitespace and digits.
    Anything before the first heading is discarded.

    Args:
        text: The text to split.

    Returns:
        A list of (heading, body) tuples in order of appearance, each body
        stripped of surrounding whitespace. Empty when no heading is found.
    """
    # The capturing group makes re.split keep the headings, so the result
    # alternates: [preamble, heading, body, heading, body, ...]
    pieces = re.split(r"(CHAPTER\s+\d+|Chapter\s+\d+)", text)

    headings = pieces[1::2]
    bodies = pieces[2::2]
    return [(heading, body.strip()) for heading, body in zip(headings, bodies)]

# Break every cleaned book into its list of (heading, body) chapter pairs
books_chapters = {
    title: split_into_chapters(cleaned)
    for title, cleaned in books_cleaned.items()
}
print("✅ Chapter splitting completed!")

✅ Chapter splitting completed!


### 🚀 TASK 4: WORD FREQUENCY ANALYSIS

In [None]:
import spacy
from collections import Counter
import pandas as pd

# Instantiate spaCy's small English pipeline; it is used here for tokenization
nlp = spacy.load("en_core_web_sm")

# Let a single call accept texts of up to 2,000,000 characters
nlp.max_length = 2_000_000

# Function to tokenize text in chunks to avoid max length errors
def word_frequency(text, top_n=20, chunk_size=500000):
    """Return the most frequent alphabetic words in *text*.

    The text is lowercased and tokenized with the module-level spaCy
    pipeline ``nlp`` in pieces of at most ``chunk_size`` characters, so a
    long book never exceeds ``nlp.max_length``.

    Args:
        text: The text to analyse.
        top_n: How many of the most common words to return.
        chunk_size: Upper bound on the characters tokenized at once.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    word_counts = Counter()
    start, total = 0, len(text)
    while start < total:
        end = min(start + chunk_size, total)
        if end < total and not text[end].isspace():
            # The cut would land inside a word and turn it into two bogus
            # tokens; move it back to just after the previous whitespace.
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            if cut > start:
                end = cut
            # Otherwise the window holds one unbroken run longer than
            # chunk_size: keep the hard cut so the size bound still holds.

        # Only token text and is_alpha are needed, both of which come from
        # the tokenizer, so skip the tagger/parser/NER stages via make_doc.
        doc = nlp.make_doc(text[start:end].lower())
        # Count incrementally instead of materialising every token first
        word_counts.update(token.text for token in doc if token.is_alpha)
        start = end

    return word_counts.most_common(top_n)

# Compute word frequencies for each book
word_frequencies = {book: word_frequency(text) for book, text in books_cleaned.items()}

# One column per book, one row per word. A word that is not among a given
# book's top words has no entry there, so it shows up as NaN before the fill.
df_word_freq = pd.DataFrame({book: dict(word_freq) for book, word_freq in word_frequencies.items()})
df_word_freq.fillna(0, inplace=True)  # Fill NaN with 0

# ace_tools may not be installed in every environment; when the import
# fails, fall back to printing the table instead of aborting the cell.
try:
    import ace_tools as tools
except ImportError:
    tools = None

if tools is not None:
    tools.display_dataframe_to_user(name="Word Frequency Analysis", dataframe=df_word_freq)
else:
    print(df_word_freq)

print("✅ Word frequency analysis completed using SpaCy with chunk processing!")


### 🚀 TASK 5: CHARACTER MENTION ANALYSIS

In [None]:
import re
import pandas as pd

# Names whose mentions are counted in each book
characters = [
    "Jon",
    "Daenerys",
    "Tyrion",
    "Arya",
    "Sansa",
    "Bran",
    "Cersei",
    "Jaime",
    "Stannis",
    "Davos",
]

# Function to count character mentions in chunks
def count_character_mentions(text, character_list, chunk_size=500000):
    """Count whole-word, case-insensitive mentions of each name in *text*.

    Args:
        text: The text to search.
        character_list: Names to look for. Each name is matched literally
            (regex metacharacters are escaped) and counted once even if it
            appears several times in the list.
        chunk_size: Approximate number of characters scanned at a time. A
            chunk is extended to the end of the word it would otherwise cut,
            so it may run slightly past this size.

    Returns:
        A dict mapping every name to its number of mentions (0 if absent).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.

    Note:
        Chunks end at whitespace, so a name that itself contains whitespace
        can still be missed when it straddles two chunks.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Compile each pattern once rather than once per chunk
    patterns = {
        char: re.compile(r"\b" + re.escape(char) + r"\b", re.IGNORECASE)
        for char in character_list
    }
    mentions = {char: 0 for char in character_list}  # Initialize counts

    start, total = 0, len(text)
    while start < total:
        end = min(start + chunk_size, total)
        # Never cut inside a word: a split name would be missed, and the
        # artificial edge would let "Jon" match the front of "Jonathan".
        while end < total and not text[end].isspace():
            end += 1

        chunk = text[start:end]
        for char, pattern in patterns.items():
            mentions[char] += len(pattern.findall(chunk))
        start = end

    return mentions

# Count character mentions for each book
character_mentions = {book: count_character_mentions(text, characters) for book, text in books_cleaned.items()}

# One column per book, one row per character name
df_character_mentions = pd.DataFrame(character_mentions)

# ace_tools may not be installed in every environment; when the import
# fails, fall back to printing the table instead of aborting the cell.
try:
    import ace_tools as tools
except ImportError:
    tools = None

if tools is not None:
    tools.display_dataframe_to_user(name="Character Mentions", dataframe=df_character_mentions)
else:
    print(df_character_mentions)

print("✅ Character mention analysis completed!")


### 🚀 TASK 6: COMMON BIGRAMS AND TRIGRAMS

In [None]:
import spacy
from collections import Counter
from nltk.util import ngrams
import pandas as pd
from nltk.corpus import stopwords

# English stop words from NLTK, held in a set for fast membership tests
stop_words = set(stopwords.words("english"))

# spaCy's small English pipeline, used here for tokenization
nlp = spacy.load("en_core_web_sm")

# Function to compute n-grams in chunks
def common_ngrams(text, n=2, top_n=15, chunk_size=500000):
    """Return the most frequent word n-grams in *text*.

    The text is lowercased and tokenized with the module-level spaCy
    pipeline ``nlp`` in pieces of at most ``chunk_size`` characters.
    Non-alphabetic tokens and entries of the module-level ``stop_words``
    set are removed before n-grams are formed, so an n-gram joins the
    nearest remaining words rather than strictly adjacent ones.

    Args:
        text: The text to analyse.
        n: Number of words per n-gram (2 for bigrams, 3 for trigrams).
        top_n: How many of the most common n-grams to return.
        chunk_size: Upper bound on the characters tokenized at once.

    Returns:
        A list of ``(ngram_tuple, count)`` pairs, most frequent first.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    ngram_counter = Counter()
    carry = []  # last n-1 kept words of the previous chunk
    start, total = 0, len(text)
    while start < total:
        end = min(start + chunk_size, total)
        if end < total and not text[end].isspace():
            # Do not cut inside a word; move the cut back to just after the
            # previous whitespace when there is one inside the window.
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            if cut > start:
                end = cut

        # Only token text and is_alpha are needed, so run the tokenizer alone
        doc = nlp.make_doc(text[start:end].lower())
        words = carry + [
            token.text
            for token in doc
            if token.is_alpha and token.text not in stop_words
        ]
        ngram_counter.update(ngrams(words, n))

        # Prefixing the next chunk with these words lets n-grams that span
        # the boundary be counted. Fewer than n words are carried, so no
        # n-gram is ever counted twice.
        carry = words[-(n - 1):] if n > 1 else []
        start = end

    return ngram_counter.most_common(top_n)

# Compute bigrams and trigrams for each book.
# Both dicts need their own comprehension over books_cleaned: `book` and
# `text` do not exist outside a comprehension, so building the trigram dict
# without the `for` clause raises NameError.
bigrams = {book: common_ngrams(text, n=2) for book, text in books_cleaned.items()}
trigrams = {book: common_ngrams(text, n=3) for book, text in books_cleaned.items()}

# Convert bigrams to DataFrame (rows: space-joined n-gram, columns: book)
df_bigrams = pd.DataFrame(
    {book: {" ".join(gram): count for gram, count in top} for book, top in bigrams.items()}
)

# Convert trigrams to DataFrame (same layout)
df_trigrams = pd.DataFrame(
    {book: {" ".join(gram): count for gram, count in top} for book, top in trigrams.items()}
)

# ace_tools may not be installed in every environment; when the import
# fails, fall back to printing the tables instead of aborting the cell.
try:
    import ace_tools as tools
except ImportError:
    tools = None

if tools is not None:
    tools.display_dataframe_to_user(name="Common Bigrams", dataframe=df_bigrams)
    tools.display_dataframe_to_user(name="Common Trigrams", dataframe=df_trigrams)
else:
    print(df_bigrams)
    print(df_trigrams)

print("✅ N-gram analysis completed with SpaCy and chunk processing!")
