In [1]:
import os

In [2]:
import requests

In [3]:
import pymupdf

In [4]:
from tqdm.auto import tqdm

In [5]:
import pandas as pd
import spacy

In [6]:
def text_formatter(text):
    cleaned_text = text.replace("\n","").strip()
    return cleaned_text

In [7]:
def open_and_read_pdf(pdf_path):
    doc =pymupdf.open(pdf_path)
    pages_and_text = []
    for page_number,page in tqdm(enumerate(doc),total=doc.page_count):
        text = page.get_text() #get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_text.append({"page_number":page_number -1,
                               "page_char_count":len(text),
                               "page_word_count":len(text.split(" ")),
                               "page_sentence_count_raw":len(text.split(".")),
                               "page_token_count":len(text)/4,
                               "text":text})
        
    return pages_and_text

In [9]:
pdf_path = "US_Declaration.pdf"

In [11]:
pages_and_texts = open_and_read_pdf(pdf_path = pdf_path)

  0%|          | 0/5 [00:00<?, ?it/s]

In [12]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 2,
  'page_char_count': 1131,
  'page_word_count': 212,
  'page_sentence_count_raw': 6,
  'page_token_count': 282.75,
  'text': 'between them and the State of Great Britain, is and ought to be totally dissolved; and that as Freeand Independent States, they have full Power to levy War, conclude Peace, contract Alliances,establish Commerce, and to do all other Acts and Things which Independent States may of rightdo. And for the support of this Declaration, with a firm reliance on the protection of divineProvidence, we mutually pledge to each other our Lives, our Fortunes and our sacred Honor.[The 56 signatures on the Declaration were arranged in six columns:] [Column 1] Georgia:   Button Gwinnett   Lyman Hall   George Walton [Column 2] North Carolina:   William Hooper   Joseph Hewes   John Penn South Carolina:   Edward Rutledge   Thomas Heyward, Jr.  Thomas Lynch, Jr.  Arthur Middleton [Column 3] Massachusetts:   John Hancock Maryland:   Samuel Chase   William Paca   Tho

In [13]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-1,2878,448,14,719.5,"Declaration of IndependenceIN CONGRESS, July 4..."
1,0,1980,300,9,495.0,He has dissolved Representative Houses repeate...
2,1,2724,420,15,681.0,to render it at once an example and fit instru...
3,2,1131,212,6,282.75,"between them and the State of Great Britain, i..."
4,3,623,134,1,155.75,George Clymer James Smith George Taylor ...


In [14]:
# Get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,5.0,5.0,5.0,5.0,5.0
mean,1.0,1867.2,302.8,9.0,466.8
std,1.58,982.16,133.76,5.79,245.54
min,-1.0,623.0,134.0,1.0,155.75
25%,0.0,1131.0,212.0,6.0,282.75
50%,1.0,1980.0,300.0,9.0,495.0
75%,2.0,2724.0,420.0,14.0,681.0
max,3.0,2878.0,448.0,15.0,719.5


In [16]:

nlp = spacy.load('en_core_web_sm')

for item in tqdm(pages_and_texts):
    item["sentences"]=list(nlp(item["text"]).sents)
    
    #Make sure all sentences are strings
    item["sentences"]=[str(sentence) for sentence in item["sentences"]]

    #count the sentences
    item["page_sentence_count_spacy"]=len(item["sentences"])

  0%|          | 0/5 [00:00<?, ?it/s]

In [17]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,1.0,1867.2,302.8,9.0,466.8,9.8
std,1.58,982.16,133.76,5.79,245.54,5.07
min,-1.0,623.0,134.0,1.0,155.75,3.0
25%,0.0,1131.0,212.0,6.0,282.75,6.0
50%,1.0,1980.0,300.0,9.0,495.0,12.0
75%,2.0,2724.0,420.0,14.0,681.0,13.0
max,3.0,2878.0,448.0,15.0,719.5,15.0


In [18]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10
# Create a function that recursively split a list into desired sizes
def split_list(input_list,slice_size):
    return [input_list[i:i+slice_size] for i in range(0,len(input_list),slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentences"] = item["text"].split(".")
    item["sentences"] = [sentence.strip() + '.' for sentence in item["sentences"] if sentence.strip()]
    item["sentence_chunks"] = split_list(input_list=item["sentences"], slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/5 [00:00<?, ?it/s]

In [19]:
#create a dataframe to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,1.0,1867.2,302.8,9.0,466.8,9.8,1.4
std,1.58,982.16,133.76,5.79,245.54,5.07,0.55
min,-1.0,623.0,134.0,1.0,155.75,3.0,1.0
25%,0.0,1131.0,212.0,6.0,282.75,6.0,1.0
50%,1.0,1980.0,300.0,9.0,495.0,12.0,1.0
75%,2.0,2724.0,420.0,14.0,681.0,13.0,2.0
max,3.0,2878.0,448.0,15.0,719.5,15.0,2.0


In [21]:
#splitting each chunk into its own size

import re

pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/5 [00:00<?, ?it/s]

7

In [22]:
#Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,7.0,7.0,7.0,7.0
mean,0.71,1328.14,210.43,332.04
std,1.5,654.54,100.47,163.63
min,-1.0,596.0,98.0,149.0
25%,-0.5,820.5,129.5,205.12
50%,1.0,1105.0,185.0,276.25
75%,1.5,1864.5,290.0,466.12
max,3.0,2226.0,351.0,556.5


In [31]:
from happytransformer import HappyTextToText, TTSettings


In [24]:
# Initialize the grammar correction model and tokenizer
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
args = TTSettings(num_beams=5, min_length=1)

07/03/2024 17:12:18 - INFO - happytransformer.happy_transformer -   Using device: cpu


In [25]:
def check_grammar(sentence):
    result = happy_tt.generate_text("gec: " + sentence, args=args)
    return result.text


In [32]:
def find_errors(original_text, corrected_text):
    original_tokens = original_text.split()
    corrected_tokens = corrected_text.split()
    errors = []

    for i in range(max(len(original_tokens), len(corrected_tokens))):
        if i < len(original_tokens) and (i >= len(corrected_tokens) or original_tokens[i] != corrected_tokens[i]):
            errors.append(f"- {original_tokens[i]}")
        if i < len(corrected_tokens) and (i >= len(original_tokens) or original_tokens[i] != corrected_tokens[i]):
            errors.append(f"+ {corrected_tokens[i]}")

    return errors

In [35]:
# Detect grammetical errors
errors=[]
for item in tqdm(pages_and_texts):
    for sentence in item["sentences"]:
        corrected_sentence = check_grammar(sentence)
        sentence_errors = find_errors(sentence,corrected_sentence)
        if sentence_errors:
            errors.append({
                "page_number": item["page_number"],
                "original_sentence": sentence.strip(),
                "corrected_sentence": corrected_sentence.strip(),
                "errors":sentence_errors
            })


  0%|          | 0/5 [00:00<?, ?it/s]

In [36]:
# Print errors
print("Errors detected:")
for error in errors:
    print(f"Page {error['page_number']}:")
    print(f"Original: {error['original_sentence']}")
    print(f"Corrected: {error['corrected_sentence']}")
    print("Differences:")
    for err in error['errors']:
        print(err)
    print()

Errors detected:
Page -1:
Original: Declaration of IndependenceIN CONGRESS, July 4, 1776.
Corrected: Declaration of Independence, July 4, 1776.
Differences:
- IndependenceIN
+ Independence,
- CONGRESS,
+ July
- July
+ 4,
- 4,
+ 1776.
- 1776.

Page -1:
Original: The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve thepolitical bands which have connected them with another, and to assume among the powers of theearth, the separate and equal station to which the Laws of Nature and of Nature's God entitlethem, a decent respect to the opinions of mankind requires that they should declare the causeswhich impel them to the separation.
Corrected: The unanimous Declaration of the thirteen United States of America, When in the course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the powers of the earth 