In [1]:
import pandas as pd
import os
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import textstat 
from collections import Counter # To easily count POS tags
from pathlib import Path
import random

In [5]:
def clean_text_for_readability(text):
    """
    Cleans text by stripping PG headers/footers and standardizing whitespace,
    retaining punctuation and capitalization for readability scores.
    """
    # Remove Project Gutenberg headers/footers
    text = re.sub(r'\*\*\* START OF (THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'\*\*\* END OF (THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'[\s]*\*+\s*END OF THE PROJECT GUTENBERG EBOOK.*?$', '', text, flags=re.DOTALL | re.IGNORECASE) # Alternative end pattern
    text = re.sub(r'[\s]*Transcriber\'s Note.*$', '', text, flags=re.DOTALL | re.IGNORECASE) # Common transcriber's notes
    text = re.sub(r'[\s]*\[Illustration:.*?\][\s]*', '', text, flags=re.DOTALL | re.IGNORECASE) # Remove image descriptions

    # Remove extra newlines and leading/trailing whitespace
    text = re.sub(r'\n\s*\n', '\n\n', text) # Reduce multiple newlines
    text = text.strip()
    return text

In [7]:
def get_tokens_and_sentences(text):
    """
    Tokenizes text into sentences and words, preserving punctuation.
    Returns raw words (with punctuation), cleaned words (alphabetic only, lowercased),
    and sentences.
    """
    sentences = sent_tokenize(text)
    
    # Raw words (for punctuation count, POS tagging, etc.)
    raw_words = word_tokenize(text)
    
    # Cleaned words (alphabetic, lowercased, for lexical diversity, word length)
    # This list is used for metrics that expect 'words' and not punctuation/numbers.
    cleaned_words_alpha_only = [word.lower() for word in raw_words if word.isalpha()]
    
    return raw_words, cleaned_words_alpha_only, sentences


In [9]:
def classify_pos_tags(raw_words_list, total_words_alpha_only):
    """
    Performs POS tagging on raw word list and calculates percentages for major categories.
    `total_words_alpha_only` is used as the denominator for percentages.
    """
    if not raw_words_list or total_words_alpha_only == 0:
        return {}

    # POS tag all tokens (including punctuation, will filter tags later)
    tagged_words = nltk.pos_tag(raw_words_list)
    
    pos_counts = Counter()
    for word, tag in tagged_words:
        # Simplified mapping for common POS categories
        if tag.startswith('NN'): # Noun (singular, plural, proper)
            pos_counts['Nouns'] += 1
        elif tag.startswith('VB'): # Verb (various forms)
            pos_counts['Verbs'] += 1
        elif tag.startswith('JJ'): # Adjective (comparative, superlative)
            pos_counts['Adjectives'] += 1
        elif tag.startswith('RB'): # Adverb (comparative, superlative)
            pos_counts['Adverbs'] += 1
        elif tag.startswith('PRP'): # Pronoun
            pos_counts['Pronouns'] += 1
        elif tag.startswith('IN'): # Preposition/subordinating conjunction
            pos_counts['Prepositions'] += 1
        elif tag.startswith('DT'): # Determiner
            pos_counts['Determiners'] += 1
        elif tag.startswith('CC'): # Coordinating Conjunction
            pos_counts['Conjunctions'] += 1
        # Add more categories if desired, or refine existing ones

    # Convert counts to percentages based on total *alpha only* words
    # This ensures that we're talking about the distribution of *words* not punctuation tags
    pos_percentages = {
        f"Perc_{pos}": (count / total_words_alpha_only) * 100
        for pos, count in pos_counts.items()
    }
    
    return pos_percentages


In [11]:
# --- Paths ---
pg_catalog_path = 'gutenberg/pg_catalog.csv'
extracted_books_dir = Path('gutenberg/text/cache/epub/')
output_base_dir = Path('gutenberg/') 
output_filename = 'author_styles_analysis.csv'
output_csv_path = output_base_dir / output_filename

In [13]:
# --- Load and filter metadata ---
print("Loading Project Gutenberg metadata...")
metadata_df = pd.read_csv(pg_catalog_path)
print(f"Original metadata loaded: {len(metadata_df)} entries.")

# Initial language and type filtering
english_books_df = metadata_df[
    (metadata_df['Language'].str.contains('en', na=False)) &
    (metadata_df['Type'].str.contains('text', case=False, na=False))
].copy()
print(f"Filtered for English texts: {len(english_books_df)} entries.")

# --- Define Filters for Unwanted Bookshelves/Subjects ---
# These keywords will be used to exclude books from analysis
exclude_subjects_keywords = [
    'dictionary', 'encyclopedia', 'grammar', 'reference', 'hymns', 'poetry',
    'verse', 'plays', 'juvenile', 'children', 'anthologies',
    'collections', 'letters', 'speeches', 'manuals', 'guides', 'cookbook',
    'periodicals', 'newspapers', 'bibles', 'translation', 'translated',
    'directories', 'catalogs', 'bibliographies', 'biographies', 'memoirs'
]

exclude_bookshelves_keywords = [
    'children\'s literature', 'reference', 'poetry', 'anthologies',
    'cookbook',
    'chambers\'s encyclopaedia', 'project gutenberg',
    'language education', 'encyclopedia','dictionary'
]


Loading Project Gutenberg metadata...
Original metadata loaded: 76320 entries.
Filtered for English texts: 59700 entries.


In [15]:
# Create regex patterns for filtering
# Note: Use word boundaries \b to avoid partial matches (e.g., 'art' in 'party')
# But for broad exclusion, sometimes contains is fine. Let's stick with contains for now
# based on your previous example, but be aware of its broadness.
subject_pattern = '|'.join(re.escape(k) for k in exclude_subjects_keywords)
bookshelf_pattern = '|'.join(re.escape(k) for k in exclude_bookshelves_keywords)

# Apply filters
print("Applying subject and bookshelf filters...")
filtered_books_df = english_books_df[
    ~english_books_df['Subjects'].fillna('').str.contains(subject_pattern, case=False, na=False) &
    ~english_books_df['Bookshelves'].fillna('').str.contains(bookshelf_pattern, case=False, na=False)
].copy()

print(f"After subject/bookshelf filtering: {len(filtered_books_df)} entries.")


Applying subject and bookshelf filters...
After subject/bookshelf filtering: 42452 entries.


In [17]:
# --- Author Handling ---
print("Processing authors...")
filtered_books_df = filtered_books_df.dropna(subset=['Authors']).copy()

# Filter for single authors. Assumes authors are separated by ';'. Adjust if needed.
filtered_books_df['num_authors'] = filtered_books_df['Authors'].apply(lambda x: len(str(x).split(';')) if isinstance(x, str) else 1)
single_author_books_df = filtered_books_df[filtered_books_df['num_authors'] == 1].copy()

print(f"After filtering for single authors: {len(single_author_books_df)} entries.")

# Define keywords that indicate the 'Author' is a translator OR that indicate an Anonoymous Author.
translator_keywords = ['translator', 'trans. by', 'transl', 'anon', 'anonymous', 'unknown', 'various'] 
translator_pattern = '|'.join(re.escape(k) for k in translator_keywords)

books_without_translators_df = single_author_books_df[
    ~single_author_books_df['Authors'].str.contains(translator_pattern, case=False, na=False)
].copy()

print(f"After excluding translators: {len(books_without_translators_df)} entries.")


Processing authors...
After filtering for single authors: 30735 entries.
After excluding translators: 30007 entries.


In [19]:
# Count books per author
author_book_counts = books_without_translators_df['Authors'].value_counts()

# Filter authors with a minimum number of books (e.g., 3 or more books published in the catalog)
min_books_per_author = 1 #1 for all authors with books
prolific_authors = author_book_counts[author_book_counts >= min_books_per_author].index.tolist()

print(f"Found {len(prolific_authors)} prolific authors (at least {min_books_per_author} books).")

#{ # Randomly select a few authors for testing (e.g., 5 authors)
# num_test_authors = 5

# selected_authors_for_test = random.sample(prolific_authors, min(num_test_authors, len(prolific_authors)))

# print(f"Randomly selected authors for testing: {selected_authors_for_test}")} If you uncomment this change authors to process as well.
# authors_to_process_df = books_without_translators_df[books_without_translators_df['Authors'].isin(selected_authors_for_test)].copy()


# Create a DataFrame for selected authors' books
selected_authors_for_test = prolific_authors
authors_to_process_df = books_without_translators_df[books_without_translators_df['Authors'].isin(selected_authors_for_test)].copy()
print(f"Total books for selected authors: {len(authors_to_process_df)}")



Found 13585 prolific authors (at least 1 books).
Total books for selected authors: 30007


In [None]:
# --- Main Processing Loop ---

# Flag to check if it's the first author being written, to include header
is_first_author = True

for author in selected_authors_for_test:
    print(f"\nProcessing author: {author}")
    author_books = authors_to_process_df[authors_to_process_df['Authors'] == author]
    
    # Combine all text for this author
    author_full_text_raw = ""
    num_books_processed = 0
    
    for index, row in author_books.iterrows():
        book_id = row['Text#']
        book_filename = 'pg' + str(book_id) + '.txt' 
        book_path = extracted_books_dir / str(book_id) / book_filename
        print(f"Type of extracted_books_dir: {type(extracted_books_dir)}, to {book_path}, as {type(book_path)}")
        
        if book_path.exists():
            try:
                with open(book_path, 'r', encoding='utf-8') as f:
                    author_full_text_raw += f.read() + "\n\n"
                num_books_processed += 1
            except Exception as e: # Keep this to catch encoding issues, permissions etc.
                print(f"  Error reading {book_path}: {e}")
        else:
            print(f"  Book file not found: {book_path}")

    if not author_full_text_raw or num_books_processed == 0:
        print(f"  No valid text found for {author}. Skipping.")
        continue

    # Clean text for readability scores (keeping punctuation, etc.)
    cleaned_text_for_readability = clean_text_for_readability(author_full_text_raw)
    
    # Tokenize and get different word lists
    raw_tokens, cleaned_words_alpha_only, sentences = get_tokens_and_sentences(cleaned_text_for_readability)
    
    # --- Calculate Metrics ---
    
    total_characters = len(re.sub(r'\s', '', cleaned_text_for_readability)) # Exclude whitespace
    total_words_alpha_only = len(cleaned_words_alpha_only) # Renamed for clarity
    unique_words = len(set(cleaned_words_alpha_only))
    total_sentences = len(sentences)
    
    if total_words_alpha_only == 0 or total_sentences == 0:
        print(f"  Insufficient text for {author} after cleaning/tokenization. Skipping.")
        continue

    # Lexical Diversity (TTR)
    type_token_ratio = (unique_words / total_words_alpha_only) * 100

    # Word-Level Characteristics
    avg_word_length = sum(len(word) for word in cleaned_words_alpha_only) / total_words_alpha_only
    
    # Word length distribution (simple bins)
    word_lengths = [len(word) for word in cleaned_words_alpha_only]
    short_words_count = sum(1 for l in word_lengths if l <= 4)
    medium_words_count = sum(1 for l in word_lengths if 5 <= l <= 8)
    long_words_count = sum(1 for l in word_lengths if l >= 9)
    
    perc_short_words = (short_words_count / total_words_alpha_only) * 100
    perc_medium_words = (medium_words_count / total_words_alpha_only) * 100
    perc_long_words = (long_words_count / total_words_alpha_only) * 100

    # Sentence-Level Characteristics
    avg_sentence_length_words = total_words_alpha_only / total_sentences
    avg_sentence_length_chars = total_characters / total_sentences

    # Punctuation Usage
    # Count specific punctuation from raw_tokens
    punctuation_counts = {
        'num_commas': raw_tokens.count(','),
        'num_periods': raw_tokens.count('.'),
        'num_qmarks': raw_tokens.count('?'),
        'num_exmarks': raw_tokens.count('!'),
        'num_semicolons': raw_tokens.count(';'),
        'num_colons': raw_tokens.count(':'),
        'num_dashes': raw_tokens.count('-') + raw_tokens.count('--') + raw_tokens.count('—'), # Basic dash count
        'num_quotes': raw_tokens.count('"') + raw_tokens.count("''") + raw_tokens.count("``") # Simple quote count
    }

    # Convert to per 1000 words (using total_words_alpha_only as denominator for consistency with other percentages)
    punctuation_per_1000_words = {
        k.replace('num_', ''): (v / total_words_alpha_only) * 1000
        for k, v in punctuation_counts.items()
    }
    
    # POS Distribution
    pos_percentages = classify_pos_tags(raw_tokens, total_words_alpha_only)

    # Readability Scores (using textstat on the cleaned_text_for_readability)
    try:
        flesch_reading_ease = textstat.flesch_reading_ease(cleaned_text_for_readability)
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(cleaned_text_for_readability)
        gunning_fog_index = textstat.gunning_fog(cleaned_text_for_readability)
        smog_index = textstat.smog_index(cleaned_text_for_readability)
        ari_index = textstat.automated_readability_index(cleaned_text_for_readability)
    except Exception as e:
        print(f"  Error calculating readability for {author}: {e}. Setting to NaN.")
        flesch_reading_ease = flesch_kincaid_grade = gunning_fog_index = smog_index = ari_index = float('nan')


    # Store results
    author_data_row = {
        'Author': author,
        'Num_Books': num_books_processed,
        'Total_Characters': total_characters,
        'Total_Words_Alpha_Only': total_words_alpha_only, # Renamed for clarity
        'Unique_Words': unique_words,
        'Total_Sentences': total_sentences,
        'Type_Token_Ratio': type_token_ratio,
        'Avg_Word_Length': avg_word_length,
        'Perc_Short_Words': perc_short_words,
        'Perc_Medium_Words': perc_medium_words,
        'Perc_Long_Words': perc_long_words,
        'Avg_Sentence_Length_Words': avg_sentence_length_words,
        'Avg_Sentence_Length_Chars': avg_sentence_length_chars,
        'Flesch_Reading_Ease': flesch_reading_ease,
        'Flesch_Kincaid_Grade': flesch_kincaid_grade,
        'Gunning_Fog_Index': gunning_fog_index,
        'SMOG_Index': smog_index,
        'ARI_Index': ari_index
    }
    
    # Add punctuation metrics
    author_data_row.update(punctuation_per_1000_words)

    # Add POS percentages
    author_data_row.update(pos_percentages)
    
    # Convert the single author's data to a DataFrame
    single_author_df = pd.DataFrame([author_data_row]) # Note the list around author_data_row

    # Append to CSV:
    # If it's the first author, write with header.
    # Otherwise, append without header.
    if is_first_author:
        single_author_df.to_csv(output_csv_path, mode='w', header=True, index=False)
        is_first_author = False # Set flag to False after first write
        print(f"  Saved header and data for {author} to {output_csv_path}")
    else:
        single_author_df.to_csv(output_csv_path, mode='a', header=False, index=False)
        print(f"  Appended data for {author} to {output_csv_path}")

# Final message after the loop completes
print(f"\nProcessing complete. All processed author data saved to {output_csv_path}")


# # Add results to DataFrame and save as csv
# if author_style_data:
#     author_styles_df = pd.DataFrame(author_style_data)
#     output_base_dir = Path('gutenberg/') 
#     output_filename = 'author_styles_analysis.csv'
#     output_csv_path = output_base_dir / output_filename
#     author_styles_df.to_csv(output_csv_path, index=False)
#     print(f"\nAuthor style analysis saved to {output_csv_path}")
#     print(author_styles_df.head())
# else:
#     print("\nNo author data was processed. Check filters and file paths.")



Processing author: Lytton, Edward Bulwer Lytton, Baron, 1803-1873
Type of extracted_books_dir: <class 'pathlib.WindowsPath'>, to E:\Siwe\Projects\gutenberg\text\cache\epub\1396\pg1396.txt, as <class 'pathlib.WindowsPath'>
Type of extracted_books_dir: <class 'pathlib.WindowsPath'>, to E:\Siwe\Projects\gutenberg\text\cache\epub\1565\pg1565.txt, as <class 'pathlib.WindowsPath'>
Type of extracted_books_dir: <class 'pathlib.WindowsPath'>, to E:\Siwe\Projects\gutenberg\text\cache\epub\1951\pg1951.txt, as <class 'pathlib.WindowsPath'>
Type of extracted_books_dir: <class 'pathlib.WindowsPath'>, to E:\Siwe\Projects\gutenberg\text\cache\epub\2461\pg2461.txt, as <class 'pathlib.WindowsPath'>
Type of extracted_books_dir: <class 'pathlib.WindowsPath'>, to E:\Siwe\Projects\gutenberg\text\cache\epub\2664\pg2664.txt, as <class 'pathlib.WindowsPath'>
Type of extracted_books_dir: <class 'pathlib.WindowsPath'>, to E:\Siwe\Projects\gutenberg\text\cache\epub\6151\pg6151.txt, as <class 'pathlib.WindowsPath