#### Section 1. import the necessary libraries and download tools

In [1]:
# This section is for importing the necessary libraries
import os
import PyPDF2
import textwrap
import tiktoken
import nltk
import json
from tqdm import tqdm
# Download the NLTK data required for sentence tokenization
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/sining/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Section 2. define the functions to process original pdf
* function 1. convert pdf to text
* function 2. split the text string to batches (and save as json files)

In [2]:
def pdf_to_text(path_to_file, file_name):
    """
    this function is designed to convert pdf to text
    :param path_to_file: the path to the pdf file
    :param file_name: the name of the pdf file
    :return: the text of the pdf
    """
    # open the pdf file
    pdf_file = open(path_to_file + file_name, 'rb')

    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Get the number of pages
    num_pages = len(pdf_reader.pages)

    # Extract text from each page
    text = ''
    for page_num in range(num_pages):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()
    
    # make sure the text start with letter, delete /n or empty space at the beginning
    while text[0].isalpha() == False:
        text = text[1:]

    # Close the PDF file
    pdf_file.close()
    return text

In [3]:
def split_text_into_batches(text, max_tokens=50, model='gpt2'):
    """
    this function is designed to split the text into batches
    :param text: the text to be split
    :param max_tokens: the maximum number of tokens per batch
    :return: a list of batches
    """
    # Load the encoding for the language model
    encoding = tiktoken.get_encoding(model)

    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(text)
    
    # Initialize an empty list to store the batches
    batches = []
    
    # Initialize the current batch with an empty list
    current_batch = []
    current_batch_tokens = []
    
    for sentence in sentences:
        # Encode the sentence into tokens
        sentence_tokens = encoding.encode(sentence)
        
        # If adding the sentence to the current batch would make it too long,
        # add the current batch to the list of batches and start a new batch
        if len(current_batch_tokens) + len(sentence_tokens) > max_tokens:
            batches.append(encoding.decode(current_batch_tokens))
            current_batch = [sentence]
            current_batch_tokens = sentence_tokens
        else:
            # Otherwise, add the sentence to the current batch
            current_batch.append(sentence)
            current_batch_tokens.extend(sentence_tokens)
    
    # Add the last batch to the list of batches
    if current_batch_tokens:
        batches.append(encoding.decode(current_batch_tokens))
    
    return batches

#### Section 3. execute the functions to process the pdf files

In [6]:
if __name__ == '__main__':
    # specify the local path to the pdf files
    folder_path = '/Users/sining/Library/CloudStorage/GoogleDrive-sxw924@case.edu/My Drive/Research/pedagogical research/GenerativeAI for teaching and learning/GenAI in Essay Writing/NudgeEssayResearch/clean_file/S23/'

    # get a list of the pdf files in the directory
    pdf_files = os.listdir(folder_path)


    # process the pdf files automatically. If the file not found, catch the exception, skip the file and print the error message
    error_file = []
    for file in tqdm(pdf_files):
        print(f'Processing {file}')
        try:
            text = pdf_to_text(folder_path, file)
            batches = split_text_into_batches(text)
            batches = [str(batch) for batch in batches]      
            # remove the .pdf from the file name
            file_name = file.replace('.pdf', '')
            # export the batches to a JSON file, use the file name as the key, and the batches as the value
            json.dump(batches, open('rawtext/' + file_name + '.json', 'w'))
            print(f'Successfully Processed {file}')
        except Exception as e:
            print(f'Error processing {file}: {e}')
            error_file.append(file) 
    
    print("All DONE!====================================")


  0%|          | 0/50 [00:00<?, ?it/s]

Processing essay95.pdf


  2%|▏         | 1/50 [00:02<01:59,  2.44s/it]

Successfully Processed essay95.pdf
Processing essay81.pdf


  4%|▍         | 2/50 [00:04<01:44,  2.17s/it]

Successfully Processed essay81.pdf
Processing essay56.pdf


  6%|▌         | 3/50 [00:06<01:43,  2.21s/it]

Successfully Processed essay56.pdf
Processing essay57.pdf


  8%|▊         | 4/50 [00:08<01:31,  1.99s/it]

Successfully Processed essay57.pdf
Processing essay80.pdf


 10%|█         | 5/50 [00:09<01:23,  1.86s/it]

Successfully Processed essay80.pdf
Processing essay94.pdf


 12%|█▏        | 6/50 [00:11<01:21,  1.86s/it]

Successfully Processed essay94.pdf
Processing essay82.pdf


 14%|█▍        | 7/50 [00:12<01:09,  1.62s/it]

Successfully Processed essay82.pdf
Processing essay96.pdf


 16%|█▌        | 8/50 [00:14<01:13,  1.75s/it]

Successfully Processed essay96.pdf
Processing essay69.pdf


 18%|█▊        | 9/50 [00:16<01:03,  1.55s/it]

Successfully Processed essay69.pdf
Processing essay55.pdf


 20%|██        | 10/50 [00:18<01:10,  1.75s/it]

Successfully Processed essay55.pdf
Processing essay54.pdf


 22%|██▏       | 11/50 [00:19<01:05,  1.67s/it]

Successfully Processed essay54.pdf
Processing essay68.pdf


 24%|██▍       | 12/50 [00:21<01:04,  1.70s/it]

Successfully Processed essay68.pdf
Processing essay97.pdf


 26%|██▌       | 13/50 [00:22<00:55,  1.50s/it]

Successfully Processed essay97.pdf
Processing essay83.pdf


 28%|██▊       | 14/50 [00:24<01:00,  1.67s/it]

Successfully Processed essay83.pdf
Processing essay87.pdf


 30%|███       | 15/50 [00:26<01:01,  1.74s/it]

Successfully Processed essay87.pdf
Processing essay93.pdf


 32%|███▏      | 16/50 [00:28<01:01,  1.81s/it]

Successfully Processed essay93.pdf
Processing essay78.pdf


 34%|███▍      | 17/50 [00:30<01:06,  2.00s/it]

Successfully Processed essay78.pdf
Processing essay79.pdf


 36%|███▌      | 18/50 [00:33<01:13,  2.29s/it]

Successfully Processed essay79.pdf
Processing essay92.pdf


 38%|███▊      | 19/50 [00:36<01:15,  2.43s/it]

Successfully Processed essay92.pdf
Processing essay86.pdf


 40%|████      | 20/50 [00:38<01:07,  2.24s/it]

Successfully Processed essay86.pdf
Processing essay90.pdf


 42%|████▏     | 21/50 [00:40<01:01,  2.12s/it]

Successfully Processed essay90.pdf
Processing essay84.pdf


 44%|████▍     | 22/50 [00:42<00:57,  2.04s/it]

Successfully Processed essay84.pdf
Processing essay53.pdf


 46%|████▌     | 23/50 [00:43<00:52,  1.95s/it]

Successfully Processed essay53.pdf
Processing essay52.pdf


 48%|████▊     | 24/50 [00:45<00:49,  1.90s/it]

Successfully Processed essay52.pdf
Processing essay85.pdf


 50%|█████     | 25/50 [00:47<00:47,  1.89s/it]

Successfully Processed essay85.pdf
Processing essay91.pdf


 52%|█████▏    | 26/50 [00:48<00:40,  1.67s/it]

Successfully Processed essay91.pdf
Processing essay88.pdf


 54%|█████▍    | 27/50 [00:51<00:43,  1.91s/it]

Successfully Processed essay88.pdf
Processing essay77.pdf


 56%|█████▌    | 28/50 [00:53<00:44,  2.00s/it]

Successfully Processed essay77.pdf
Processing essay63.pdf


 58%|█████▊    | 29/50 [00:55<00:42,  2.03s/it]

Successfully Processed essay63.pdf
Processing essay102.pdf


 60%|██████    | 30/50 [00:57<00:38,  1.93s/it]

Successfully Processed essay102.pdf
Processing essay62.pdf


 62%|██████▏   | 31/50 [00:59<00:35,  1.89s/it]

Successfully Processed essay62.pdf
Processing essay76.pdf


 64%|██████▍   | 32/50 [01:00<00:34,  1.89s/it]

Successfully Processed essay76.pdf
Processing essay89.pdf


 66%|██████▌   | 33/50 [01:02<00:33,  1.95s/it]

Successfully Processed essay89.pdf
Processing essay74.pdf


 68%|██████▊   | 34/50 [01:04<00:29,  1.82s/it]

Successfully Processed essay74.pdf
Processing essay100.pdf


 70%|███████   | 35/50 [01:06<00:27,  1.83s/it]

Successfully Processed essay100.pdf
Processing essay101.pdf


 72%|███████▏  | 36/50 [01:07<00:22,  1.61s/it]

Successfully Processed essay101.pdf
Processing essay75.pdf


 74%|███████▍  | 37/50 [01:08<00:19,  1.49s/it]

Successfully Processed essay75.pdf
Processing essay61.pdf


 76%|███████▌  | 38/50 [01:10<00:18,  1.56s/it]

Successfully Processed essay61.pdf
Processing essay65.pdf


 78%|███████▊  | 39/50 [01:12<00:17,  1.61s/it]

Successfully Processed essay65.pdf
Processing essay71.pdf


 80%|████████  | 40/50 [01:13<00:15,  1.50s/it]

Successfully Processed essay71.pdf
Processing essay59.pdf


 82%|████████▏ | 41/50 [01:14<00:12,  1.43s/it]

Successfully Processed essay59.pdf
Processing essay58.pdf


 84%|████████▍ | 42/50 [01:17<00:14,  1.84s/it]

Successfully Processed essay58.pdf
Processing essay70.pdf


 86%|████████▌ | 43/50 [01:18<00:12,  1.73s/it]

Successfully Processed essay70.pdf
Processing essay64.pdf


 88%|████████▊ | 44/50 [01:20<00:09,  1.66s/it]

Successfully Processed essay64.pdf
Processing essay99.pdf


 90%|█████████ | 45/50 [01:22<00:09,  1.91s/it]

Successfully Processed essay99.pdf
Processing essay72.pdf


 92%|█████████▏| 46/50 [01:24<00:07,  1.87s/it]

Successfully Processed essay72.pdf
Processing essay66.pdf


 94%|█████████▍| 47/50 [01:26<00:05,  1.89s/it]

Successfully Processed essay66.pdf
Processing essay67.pdf


 96%|█████████▌| 48/50 [01:28<00:03,  1.84s/it]

Successfully Processed essay67.pdf
Processing essay73.pdf


 98%|█████████▊| 49/50 [01:30<00:01,  1.88s/it]

Successfully Processed essay73.pdf
Processing essay98.pdf


100%|██████████| 50/50 [01:31<00:00,  1.84s/it]

Successfully Processed essay98.pdf



