In [42]:
# Fetch the NLTK data packages this notebook relies on: the "stopwords" corpus
# (read when STOP_WORDS is built) and the "punkt" tokenizer models (presumably
# for the word_tokenize / sent_tokenize helpers imported further down).
# nltk.download() skips packages that are already up to date, so re-running
# this cell is cheap.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ssegg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ssegg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [43]:
import fitz #PyMuPDF
import os

In [44]:
def extract_text_from_pdf(pdf_path, output_dir="extracted_texts"):
    """Extract the text of one PDF and save it as a UTF-8 ``.txt`` file.

    The text of every page is concatenated, with a newline appended after each
    page, and written to ``<output_dir>/<pdf basename>.txt``. Failures while
    opening, reading or writing are reported with ``print`` rather than raised,
    so a batch run can continue with the next file.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the text file; created if missing.

    Returns:
        The path of the written text file, or ``None`` if extraction failed.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Resolve the destination before the try block so the error message in the
    # except handler can always reference it, even if opening the PDF fails.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = os.path.join(output_dir, stem + ".txt")

    try:
        document = fitz.open(pdf_path)
        try:
            page_texts = [
                document.load_page(page_num).get_text()
                for page_num in range(document.page_count)
            ]
        finally:
            # Release the file handle even when a page fails to load.
            document.close()

        # A newline after each page keeps text from different pages separated.
        text = "".join(page_text + "\n" for page_text in page_texts)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"Successfully extracted text from {pdf_path} to {output_path}")
        return output_path

    except Exception as e:
        # Deliberately broad: one unreadable PDF should not abort a batch run.
        print(f"Error extracting text from {pdf_path} to {output_path}: {e}")
        return None


In [45]:
def process_all_pdfs_in_directory(pdf_dir, output_dir="extracted_texts"):
    """Run ``extract_text_from_pdf`` on every ``.pdf`` entry of a directory.

    Only the top level of ``pdf_dir`` is listed; entries are matched by a
    case-insensitive ``.pdf`` suffix.

    Args:
        pdf_dir: Directory whose PDF files should be converted.
        output_dir: Directory handed to ``extract_text_from_pdf`` for the results.
    """
    for entry in os.listdir(pdf_dir):
        # Skip anything that does not look like a PDF by name.
        if not entry.lower().endswith(".pdf"):
            continue
        extract_text_from_pdf(os.path.join(pdf_dir, entry), output_dir)

In [46]:
# Folder containing the source PDFs and folder that receives one .txt file per PDF.
# Both are relative paths, so they resolve against the notebook's working directory.
pdf_input_directory = "pdf_papers"
text_output_directory = "extracted_texts"

# Convert every PDF found in the input folder; progress is printed per file.
process_all_pdfs_in_directory(pdf_input_directory, text_output_directory)

Successfully extracted text from pdf_papers\vo2_1.pdf to extracted_texts\vo2_1.txt
Successfully extracted text from pdf_papers\vo2_2.pdf to extracted_texts\vo2_2.txt
Successfully extracted text from pdf_papers\vo2_3.pdf to extracted_texts\vo2_3.txt


## Preprocessing of the extracted text files

In [47]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [48]:
# Set of English stopwords from the NLTK "stopwords" corpus, held as a set
# for fast membership tests.
# NOTE(review): not referenced by the cells visible here -- presumably used by
# a later tokenization / filtering step; confirm or remove.
STOP_WORDS = set(stopwords.words('english'))

# Matches a line that consists only of a trailing-section heading, optionally
# preceded by a section number ("6. references") and, for appendices, followed
# by a single label ("appendix a"). It is applied to text that has already been
# lower-cased and stripped of most punctuation, hence the lower-case keywords.
_TRAILING_SECTION_HEADING = re.compile(
    r'^[ \t]*(?:\d+[. \t]+)?'
    r'(?:references|bibliography|acknowledge?ments|appendix(?:[ \t]+[a-z0-9])?)'
    r'[ \t]*\.?[ \t\r]*$',
    flags=re.MULTILINE,
)


def clean_and_preprocess(text):
    """Normalise raw text extracted from a paper for later tokenization.

    Steps, in order:
      1. Remove URLs and e-mail addresses.
      2. Lower-case the text.
      3. Drop every character except ``a-z``, ``0-9``, whitespace and
         ``. ! ? -``.
      4. Drop blank lines and short digit-only lines (e.g. page numbers).
      5. Re-attach sentence terminators that ended up preceded by a space.
      6. Cut the text at the first references / bibliography /
         acknowledgements / appendix heading.

    Args:
        text: Raw text of one document.

    Returns:
        The cleaned, lower-cased text up to (not including) the first
        trailing-section heading.
    """
    # remove URLs and emails
    text = re.sub(r'http\S+|www\S+', '', text)
    # An address is any whitespace-free token containing "@".
    text = re.sub(r'\S+@\S+\s?', '', text)

    # convert to lower case
    text = text.lower()

    # remove punctuation except for some essential ones for sentence structure like periods
    # keeps letters, numbers, hyphens and common sentence terminators (. ! ?)
    text = re.sub(r'[^a-z0-9\s\.\!\?\-]', '', text)

    # drop blank lines and digit-only lines of at most 10 characters
    # (typically page numbers); every other line is kept unchanged
    kept_lines = []
    for line in text.split('\n'):
        stripped = line.strip()
        if stripped and (len(stripped) > 10 or not stripped.isdigit()):
            kept_lines.append(line)
    text = '\n'.join(kept_lines)

    # re-join periods (etc) to sentences after cleanup if they were separated
    # better tokenization later
    text = text.replace(' .', '.').replace(' ?', '?').replace(' !', '!')

    # Cut everything from the first trailing-section heading onwards. The
    # heading must stand alone on its line, so in-text mentions such as
    # "and references therein" no longer truncate the document body.
    text = _TRAILING_SECTION_HEADING.split(text, maxsplit=1)[0]
    return text

In [49]:
def preprocess_all_text_files(input_dir, output_dir="preprocessed_texts"):
    """Clean every ``.txt`` file of a directory with ``clean_and_preprocess``.

    Each matching file in ``input_dir`` is read as UTF-8, cleaned, and written
    under the same file name into ``output_dir``. Per-file failures are printed
    and do not stop the remaining files from being processed.

    Args:
        input_dir: Directory containing the extracted text files.
        output_dir: Directory for the cleaned files; created if it does not exist.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Select candidates by a case-insensitive ".txt" suffix.
    text_names = [
        name for name in os.listdir(input_dir) if name.lower().endswith(".txt")
    ]

    for name in text_names:
        source_path = os.path.join(input_dir, name)
        target_path = os.path.join(output_dir, name)

        try:
            with open(source_path, "r", encoding="utf-8") as reader:
                contents = reader.read()

            cleaned = clean_and_preprocess(contents)

            with open(target_path, "w", encoding="utf-8") as writer:
                writer.write(cleaned)
            print(f"Successfully preprocessed '{source_path}' to '{target_path}'")

        except Exception as err:
            print(f"Error preprocessing '{source_path}' to '{target_path}': {err}")

In [50]:
# Input folder with the raw extracted .txt files and output folder for the
# cleaned versions (relative to the notebook's working directory).
# NOTE(review): "dictionary" in these names presumably means "directory".
extracted_text_dictionary = "extracted_texts"
preprocessed_text_dictionary = "preprocessed_texts"

# Clean every .txt file in the input folder; progress is printed per file.
preprocess_all_text_files(extracted_text_dictionary, preprocessed_text_dictionary)

Successfully preprocessed 'extracted_texts\vo2_1.txt' to 'preprocessed_texts\vo2_1.txt'
Successfully preprocessed 'extracted_texts\vo2_2.txt' to 'preprocessed_texts\vo2_2.txt'
Successfully preprocessed 'extracted_texts\vo2_3.txt' to 'preprocessed_texts\vo2_3.txt'
