# In [1]:
import PyPDF2
import re
import spacy
import pandas as pd

# Load the spaCy 'en_core_web_sm' English pipeline once at module level;
# count_nouns_verbs_adjectives_per_page() reuses it to part-of-speech tag
# the text of every page.
nlp = spacy.load('en_core_web_sm')

def count_nouns_verbs_adjectives_per_page(pdf_path, start_page=1, end_page=None):
    """Count nouns, verbs and adjectives on each page of a PDF.

    Args:
        pdf_path: Path to the PDF file to read.
        start_page: 1-based number of the first page to process. Values
            below 1 are clamped to 1 so they cannot turn into negative
            indices that wrap around to the end of the document.
        end_page: 1-based number of the last page to process (inclusive).
            ``None`` means the last page of the document; values past the
            end of the document are clamped to the last page.

    Returns:
        A dict mapping each processed 1-based page number to the number of
        tokens spaCy tagged as ``NOUN``, ``VERB`` or ``ADJ`` on that page.
        The dict is empty when the requested range contains no pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        total_pages = len(reader.pages)

        # Clamp the requested range to the pages that actually exist.
        # Without this, start_page=0 would index pages[-1] (the last page)
        # and report it as "page 0", and an end_page beyond the document
        # would raise IndexError part-way through the run.
        first_index = max(start_page, 1) - 1
        if end_page is None:
            stop_index = total_pages
        else:
            stop_index = min(end_page, total_pages)

        word_count_per_page = {}
        for i in range(first_index, stop_index):
            # Treat a missing extraction result as an empty page rather
            # than letting re.findall fail on a non-string.
            text = reader.pages[i].extract_text() or ""

            # Keep only word-like runs, then let spaCy tag them.
            words = re.findall(r'\b\w+\b', text)
            doc = nlp(" ".join(words))

            # Keys are 1-based page numbers, hence i + 1.
            word_count_per_page[i + 1] = sum(
                1 for token in doc if token.pos_ in ('NOUN', 'VERB', 'ADJ')
            )

        return word_count_per_page

def save_to_excel(word_count_per_page, output_path='word_count.xlsx'):
    """Write a page-number -> count mapping to an Excel file.

    Args:
        word_count_per_page: Dict whose keys become the 'Page #' column and
            whose values become the 'Word Count' column.
        output_path: Destination path of the Excel file.

    Prints a confirmation message once the file has been written.
    """
    # One (page, count) row per dictionary entry, in the dict's own order.
    rows = list(word_count_per_page.items())
    table = pd.DataFrame(rows, columns=['Page #', 'Word Count'])

    # index=False keeps the DataFrame's row index out of the sheet.
    table.to_excel(output_path, index=False)
    print(f"Data successfully saved to {output_path}")

# Example usage
pdf_file_path = 'scorpiarising.pdf'
# Page numbers are 1-based (the function subtracts 1 to index the PDF), so the
# first page is 1. Passing 0 would start at index -1, i.e. the last page, and
# record it under page number 0. end_page is inclusive.
word_count = count_nouns_verbs_adjectives_per_page(pdf_file_path, start_page=1, end_page=410)

# Save the data to an Excel file
save_to_excel(word_count, output_path='word_count_output.xlsx')
