In [3]:
!pip install -qU langchain tiktoken matplotlib seaborn tqdm PyPDF2 PyMuPDF

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
from google.colab import files

uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(uploaded[fn])))


Saving SoftwareEngineering.pdf to SoftwareEngineering.pdf
User uploaded file "SoftwareEngineering.pdf" with length 14269200 bytes


Now we extract the text

In [5]:

import json
import PyPDF2
from io import BytesIO

# Function to extract text from pdf
def extract_text_from_pdf(pdf_content):
    pdf_file = BytesIO(pdf_content)
    reader = PyPDF2.PdfReader(pdf_file)
    text_data = {}
    for page_num in range(len(reader.pages)):
        page = reader.pages[page_num]
        text_data[page_num + 1] = page.extract_text()
    return text_data

# Extract text
pdf_content = uploaded[fn]
text_data = extract_text_from_pdf(pdf_content)

In [34]:
# Convert to JSONL format and store in 'docs' list for in-memory usage
docs = []
chapter = 0
for page_num, content in text_data.items():
    if f'''Chapter {chapter+1}
''' in content:
      chapter+=1

    entry = {
        "id": str(page_num),
        "source": "Page " + str(page_num),
        "page_content": content
    }
    if (chapter > 0):
      entry['chapter'] = str(chapter)
    else:
      entry['chapter'] = "N/A"

    docs.append(entry)

# Save to a JSONL file
with open("extracted_data.jsonl", "w") as out_file:
    for entry in docs:
        out_file.write(json.dumps(entry) + '\n')

print("Data has been extracted and saved to extracted_data.jsonl")


Data has been extracted and saved to extracted_data.jsonl


Intialize our token model that will be useful for tokenizing the text.  Feel free to play around with the encoding model but for this example we will use GPT-3.5-turbo

In [35]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

tiktoken.encoding_for_model('gpt-3.5-turbo')

token_counts = [tiktoken_len(doc['page_content']) for doc in docs]

# Chunking the Text

Feel free to change the chunk size and overlap size for different results.

In [36]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=20,  # number of tokens overlap between chunks
    length_function=tiktoken_len,
    separators=['\n\n', '\n', ' ', '']
)

In [37]:
from tqdm.auto import tqdm
import hashlib

# Assuming you've already defined the text_splitter with the split_text function

m = hashlib.md5()  # this will convert source into a unique ID (which, in this case, is a page number)

documents = []

for doc in tqdm(docs):
    source = doc['source']  # get source (page number) from 'source' key
    m.update(source.encode('utf-8'))
    uid = m.hexdigest()[:12]
    chunks = text_splitter.split_text(doc['page_content'])  # get page content from 'page_content' key
    for i, chunk in enumerate(chunks):
        documents.append({
            'id': f'{uid}-{i}',
            'text': chunk,
            'source': source,
            'chapter': doc['chapter']
        })

print(f"Total number of documents: {len(documents)}")


  0%|          | 0/627 [00:00<?, ?it/s]

Total number of documents: 1603


In [38]:
import json

# Convert and save documents to a .jsonl file
with open('documents.jsonl', 'w') as file:
    for doc in documents:
        file.write(json.dumps(doc) + '\n')

# For downloading the .jsonl file from Google Colab to your local machine
from google.colab import files

files.download('documents.jsonl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Testing trying to extract Chapter from the text.

Restart your Runtime using the 'Runtime' tab on the top right and run the code below to see how my current code (ChatGPT generated) extracts each chapter.  This mode is not incorporating NER (Named Entity Recognizer) which can be read about and researched here: Research (https://nlp.stanford.edu/software/CRF-NER.shtml) and this example repo which uses a .txt file (https://github.com/wpm/stanford-ner-standoff/tree/master)

In [None]:
from google.colab import files

uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(uploaded[fn])))


In [None]:
import json
import fitz  # PyMuPDF
import hashlib
from tqdm.auto import tqdm
import re
import tiktoken

In [None]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text_data = {}
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text_data[page_num + 1] = page.get_text()
    return text_data


In [None]:
import re

def assign_chapter_to_pages(text_data):
    """Assign a chapter to each page based on detected chapter headings."""
    chapter_pattern = re.compile(r"CHAPTER (\d+):")
    current_chapter = None

    # To store the start and end page of each chapter
    chapter_page_ranges = {}

    # A flag to determine if the first chapter has been encountered
    first_chapter_encountered = False

    # Detect chapters and note start page
    for page_num, page_content in text_data.items():
        chapter_match = chapter_pattern.search(page_content)
        if chapter_match:
            first_chapter_encountered = True

            # If there's a current chapter, its end is the previous page
            if current_chapter:
                chapter_page_ranges[current_chapter]["end_page"] = page_num - 1
            current_chapter = int(chapter_match.group(1))
            chapter_page_ranges[current_chapter] = {"start_page": page_num, "end_page": None}

        # Skip pages until the first chapter is encountered
        if not first_chapter_encountered:
            continue

    # The end page of the last chapter is the last page of the book
    if current_chapter:
        chapter_page_ranges[current_chapter]["end_page"] = page_num

    # Assign a chapter to each page
    page_chapters = {}
    for page_num in text_data:
        for chapter_num, page_range in chapter_page_ranges.items():
            if page_range["start_page"] <= page_num <= page_range["end_page"]:
                page_chapters[page_num] = chapter_num
                break

    return page_chapters

# Ensure text_data is defined before calling the function
page_chapters = assign_chapter_to_pages(text_data)


In [None]:
tokenizer = tiktoken.get_encoding('cl100k_base')

# Create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(text)
    return len(tokens)

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=['\n\n', '\n', ' ', '']
)


In [None]:
# Extract text
pdf_path = list(uploaded.keys())[0]  # Assuming you've already uploaded the PDF as 'uploaded'
text_data = extract_text_from_pdf(pdf_path)


In [None]:
documents = []
m = hashlib.md5()

for page_num, page_content in text_data.items():
    # Lookup the current chapter using page_chapters dictionary
    current_chapter = page_chapters.get(page_num, None)

    chunks = text_splitter.split_text(page_content)
    for i, chunk in enumerate(chunks):
        m.update(str(page_num).encode('utf-8'))
        uid = m.hexdigest()[:12]
        documents.append({
            'id': f'{uid}-{i}',
            'text': chunk,
            'source': f"Page {page_num}",
            'chapter_title': f"Chapter {current_chapter}" if current_chapter else "Unknown Chapter"
        })

# Displaying the 20th document for verification
print(documents[19])

{'id': 'ea28d914bcc6-2', 'text': 'ISBN 9781119702665 (epub)\nSubjects: LCSH: Data transmission systems. | Computer networks. | Office practice–Automation.\nClassification: LCC TK5105 .F577 2021 (print) | LCC TK5105 (ebook) | DDC 004.6–dc23\nLC record available at https://lccn.loc.gov/2020028461\nLC ebook record available at https://lccn.loc.gov/2020028462\nThe inside back cover will contain printing identification and country of origin if omitted from this page. In addition, if the ISBN on the back\ncover differs from the ISBN on this page, the one on the back cover is correct.', 'source': 'Page 14', 'chapter_title': 'Chapter 11'}


In [None]:
# Save to a JSONL file
with open("processed_data.jsonl", "w") as out_file:
    for entry in documents:
        out_file.write(json.dumps(entry) + '\n')


In [None]:
from google.colab import files

files.download("processed_data.jsonl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>