In [1]:
import os
import pymupdf
from tqdm.auto import tqdm
import pandas as pd
import spacy
from happytransformer import HappyTextToText, TTSettings
import re




In [2]:
#Define the grammercheck class
class GrammarChecker:
     def __init__(self):
         self.grammar_check = HappyTextToText("T5","vennify/t5-base-grammar-correction")
         self.beam_settings = TTSettings(num_beams=5, min_length=1, max_length=20)

     def correct_grammar(self, text):
         # Generate corrected text from the grammar model
         matches = self.grammar_check.generate_text("gec: " + text, args=self.beam_settings)

         # Extract original and corrected text
         original_text = text
         corrected_text = matches.text if matches and matches.text else text

         return original_text, corrected_text

     def highlight_errors(self, original_text, corrected_text):
         original_tokens = original_text.split()
         corrected_tokens = corrected_text.split()
         highlighted_text = original_text

         # Find the incorrect words and highlight them
         for i in range(max(len(original_tokens),len(corrected_tokens))):
             if i < len(original_tokens) and (i >= len(corrected_tokens) or original_tokens[i] != corrected_tokens[i]):
                 highlighted_text = highlighted_text.replace(original_tokens[i], f"**{original_tokens[i]}**", 1)

         return highlighted_text

In [24]:
#Initialize grammer checker
grammer_checker = GrammarChecker()

07/04/2024 10:55:46 - INFO - happytransformer.happy_transformer -   Using device: cpu


In [9]:
def text_formatter(text):
    cleaned_text = text.replace("\n", "").strip()
    return cleaned_text

In [10]:
def open_and_read_pdf(pdf_path):
    doc =pymupdf.open(pdf_path)
    pages_and_text = []
    for page_number,page in tqdm(enumerate(doc),total=doc.page_count):
        text = page.get_text() #get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_text.append({"page_number":page_number ,
                               "page_char_count":len(text),
                               "page_word_count":len(text.split(" ")),
                               "page_sentence_count_raw":len(text.split(".")),
                               "page_token_count":len(text)/4,
                               "text":text})
        
    return pages_and_text

In [11]:
pdf_path = "US_Declaration.pdf"
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

  0%|          | 0/5 [00:00<?, ?it/s]

In [13]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 4,
  'page_char_count': 623,
  'page_word_count': 134,
  'page_sentence_count_raw': 1,
  'page_token_count': 155.75,
  'text': 'George Clymer   James Smith   George Taylor   James Wilson   George Ross Delaware:   Caesar Rodney   George Read   Thomas McKean [Column 5] New York:   William Floyd   Philip Livingston   Francis Lewis   Lewis Morris New Jersey:   Richard Stockton   John Witherspoon   Francis Hopkinson   John Hart   Abraham Clark [Column 6] New Hampshire:   Josiah Bartlett   William Whipple Massachusetts:   Samuel Adams   John Adams   Robert Treat Paine   Elbridge Gerry Rhode Island:   Stephen Hopkins   William Ellery Connecticut:   Roger Sherman   Samuel Huntington   William Williams   Oliver Wolcott New Hampshire: Matthew Thornton'},
 {'page_number': 3,
  'page_char_count': 1131,
  'page_word_count': 212,
  'page_sentence_count_raw': 6,
  'page_token_count': 282.75,
  'text': 'between them and the State of Great Britain, is and ought to be totally dissolved;

In [14]:
df = pd.DataFrame(pages_and_texts)
print(df.head())

   page_number  page_char_count  page_word_count  page_sentence_count_raw  \
0            0             2878              448                       14   
1            1             1980              300                        9   
2            2             2724              420                       15   
3            3             1131              212                        6   
4            4              623              134                        1   

   page_token_count                                               text  
0            719.50  Declaration of IndependenceIN CONGRESS, July 4...  
1            495.00  He has dissolved Representative Houses repeate...  
2            681.00  to render it at once an example and fit instru...  
3            282.75  between them and the State of Great Britain, i...  
4            155.75  George Clymer   James Smith   George Taylor   ...  


In [15]:
# Get stats
print(df.describe().round(2))

       page_number  page_char_count  page_word_count  page_sentence_count_raw  \
count         5.00             5.00             5.00                     5.00   
mean          2.00          1867.20           302.80                     9.00   
std           1.58           982.16           133.76                     5.79   
min           0.00           623.00           134.00                     1.00   
25%           1.00          1131.00           212.00                     6.00   
50%           2.00          1980.00           300.00                     9.00   
75%           3.00          2724.00           420.00                    14.00   
max           4.00          2878.00           448.00                    15.00   

       page_token_count  
count              5.00  
mean             466.80  
std              245.54  
min              155.75  
25%              282.75  
50%              495.00  
75%              681.00  
max              719.50  


In [16]:
nlp = spacy.load('en_core_web_sm')


In [17]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/5 [00:00<?, ?it/s]

In [18]:
df = pd.DataFrame(pages_and_texts)
print(df.describe().round(2))

       page_number  page_char_count  page_word_count  page_sentence_count_raw  \
count         5.00             5.00             5.00                     5.00   
mean          2.00          1867.20           302.80                     9.00   
std           1.58           982.16           133.76                     5.79   
min           0.00           623.00           134.00                     1.00   
25%           1.00          1131.00           212.00                     6.00   
50%           2.00          1980.00           300.00                     9.00   
75%           3.00          2724.00           420.00                    14.00   
max           4.00          2878.00           448.00                    15.00   

       page_token_count  page_sentence_count_spacy  
count              5.00                       5.00  
mean             466.80                       9.80  
std              245.54                       5.07  
min              155.75                       3.00  
25%  

In [19]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10
# Create a function that recursively splits a list into desired sizes
def split_list(input_list, slice_size):
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentences"] = item["text"].split(".")
    item["sentences"] = [sentence.strip() + '.' for sentence in item["sentences"] if sentence.strip()]
    item["sentence_chunks"] = split_list(input_list=item["sentences"], slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])


  0%|          | 0/5 [00:00<?, ?it/s]

In [20]:
# Create a dataframe to get stats
df = pd.DataFrame(pages_and_texts)
print(df.describe().round(2))

       page_number  page_char_count  page_word_count  page_sentence_count_raw  \
count         5.00             5.00             5.00                     5.00   
mean          2.00          1867.20           302.80                     9.00   
std           1.58           982.16           133.76                     5.79   
min           0.00           623.00           134.00                     1.00   
25%           1.00          1131.00           212.00                     6.00   
50%           2.00          1980.00           300.00                     9.00   
75%           3.00          2724.00           420.00                    14.00   
max           4.00          2878.00           448.00                    15.00   

       page_token_count  page_sentence_count_spacy  num_chunks  
count              5.00                       5.00        5.00  
mean             466.80                       9.80        1.40  
std              245.54                       5.07        0.55  
min       

In [21]:
#splitting each chunk into its own size
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
print(len(pages_and_chunks))

  0%|          | 0/5 [00:00<?, ?it/s]

7


In [22]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
print(df.describe().round(2))

       page_number  chunk_char_count  chunk_word_count  chunk_token_count
count         7.00              7.00              7.00               7.00
mean          1.71           1328.14            210.43             332.04
std           1.50            654.54            100.47             163.63
min           0.00            596.00             98.00             149.00
25%           0.50            820.50            129.50             205.12
50%           2.00           1105.00            185.00             276.25
75%           2.50           1864.50            290.00             466.12
max           4.00           2226.00            351.00             556.50


In [27]:
# Detect grammatical errors
errors = []
for item in tqdm(pages_and_texts):
    for sentence in item["sentences"]:
        original_sentence, corrected_sentence = grammer_checker.correct_grammar(sentence)
        highlighted_sentence = grammer_checker.highlight_errors(original_sentence, corrected_sentence)
        if original_sentence != corrected_sentence:
            errors.append({
                "page_number": item["page_number"],
                "original_sentence": original_sentence.strip(),
                "corrected_sentence": corrected_sentence.strip(),
                "highlighted_sentence": highlighted_sentence.strip()
            })

  0%|          | 0/5 [00:00<?, ?it/s]

In [28]:
# Display highlighted sentences with errors
for error in errors:
    print(f"Page {error['page_number']}: {error['highlighted_sentence']}")

Page 0: Declaration of **IndependenceIN** **CONGRESS,** **July** **4,** **1776.**
Page 0: The un**a**nimous Declaration ********of******** ************the************ thirteen **un**it**ed** States of America, When in the **Course** of human events, it **becomes** **necessary** **for** **one** **people** **********to********** **dissolve** **thepolitical** **b******and******s** ****which**** **have** **connected** ****them**** **with** **another,** and to **assume** **among** the **powers** of **theearth,** the **separate** and **equal** **station** to which the **Laws** of **Nature** and of **Nature's** **God** **entitlethem,** a **decent** **respect** to the **opinions** of **mankind** **requires** **that** **they** **should** **declare** the **causeswhich** **impel** them to the **separation.**
Page 0: We hold ****the**se** truths to be self-evident, ****that**** all men ****are**** created **equal,** that **they** are **endowed** **bytheir** **Creator** **with** **certain** **unali