#### Section 1. import the necessary libraries and download tools

In [1]:
# This section is for importing the necessary libraries
import os
import PyPDF2
import textwrap
import tiktoken
import nltk
import json
from tqdm import tqdm
# Download the NLTK data required for sentence tokenization
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/sining/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Section 2. define the functions to process original pdf
* function 1. convert pdf to text
* function 2. split the text string to batches (and save as json files)

In [8]:
def pdf_to_text(path_to_file, file_name):
    """
    this function is designed to convert pdf to text
    :param path_to_file: the path to the pdf file
    :param file_name: the name of the pdf file
    :return: the text of the pdf
    """
    # open the pdf file
    pdf_file = open(path_to_file + file_name, 'rb')

    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Get the number of pages
    num_pages = len(pdf_reader.pages)

    # Extract text from each page
    text = ''
    for page_num in range(num_pages):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()
    
    # make sure the text start with letter, delete /n or empty space at the beginning
    while text[0].isalpha() == False:
        text = text[1:]

    # Close the PDF file
    pdf_file.close()
    return text

In [9]:
def split_text_into_batches(text, max_tokens=50, model='gpt2'):
    """
    this function is designed to split the text into batches
    :param text: the text to be split
    :param max_tokens: the maximum number of tokens per batch
    :return: a list of batches
    """
    # Load the encoding for the language model
    encoding = tiktoken.get_encoding(model)

    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(text)
    
    # Initialize an empty list to store the batches
    batches = []
    
    # Initialize the current batch with an empty list
    current_batch = []
    current_batch_tokens = []
    
    for sentence in sentences:
        # Encode the sentence into tokens
        sentence_tokens = encoding.encode(sentence)
        
        # If adding the sentence to the current batch would make it too long,
        # add the current batch to the list of batches and start a new batch
        if len(current_batch_tokens) + len(sentence_tokens) > max_tokens:
            batches.append(encoding.decode(current_batch_tokens))
            current_batch = [sentence]
            current_batch_tokens = sentence_tokens
        else:
            # Otherwise, add the sentence to the current batch
            current_batch.append(sentence)
            current_batch_tokens.extend(sentence_tokens)
    
    # Add the last batch to the list of batches
    if current_batch_tokens:
        batches.append(encoding.decode(current_batch_tokens))
    
    return batches

#### Section 3. execute the functions to process the pdf files

In [11]:
if __name__ == '__main__':
    # specify the local path to the pdf files
    folder_path = '/Users/sining/Library/CloudStorage/GoogleDrive-sxw924@case.edu/My Drive/Research/pedagogical research/GenerativeAI for teaching and learning/GenAI in Essay Writing/NudgeEssayResearch/clean_file/test/'

    # get a list of the pdf files in the directory
    pdf_files = os.listdir(folder_path)


    # process the pdf files automatically. If the file not found, catch the exception, skip the file and print the error message
    error_file = []
    for file in tqdm(pdf_files):
        print(f'Processing {file}')
        try:
            text = pdf_to_text(folder_path, file)
            batches = split_text_into_batches(text)
            batches = [str(batch) for batch in batches]      
            # remove the .pdf from the file name
            file_name = file.replace('.pdf', '')
            # export the batches to a JSON file, use the file name as the key, and the batches as the value
            json.dump(batches, open('rawtext/' + file_name + '.json', 'w'))
            print(f'Successfully Processed {file}')
        except Exception as e:
            print(f'Error processing {file}: {e}')
            error_file.append(file) 
    
    print("All DONE!====================================")


  0%|          | 0/5 [00:00<?, ?it/s]

Processing essay4.pdf


 40%|████      | 2/5 [00:00<00:01,  2.47it/s]

Successfully Processed essay4.pdf
Processing essay5.pdf
Successfully Processed essay5.pdf
Processing essay2.pdf


 60%|██████    | 3/5 [00:01<00:00,  2.92it/s]

Successfully Processed essay2.pdf
Processing essay3.pdf


100%|██████████| 5/5 [00:01<00:00,  3.07it/s]

Successfully Processed essay3.pdf
Processing essay1.pdf
Successfully Processed essay1.pdf



