In [None]:
!pip install pdfminer.six

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pdfminer.six
  Downloading pdfminer.six-20220524-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 5.1 MB/s 
Collecting cryptography>=36.0.0
  Downloading cryptography-37.0.2-cp36-abi3-manylinux_2_24_x86_64.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 37.0 MB/s 
Installing collected packages: cryptography, pdfminer.six
Successfully installed cryptography-37.0.2 pdfminer.six-20220524


## pdf2TextParser - class that reads a PDF file, converts it to text, and splits the text into sentences

In [None]:
import io,re,string
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
import pandas as pd
import nltk
# Tokenizer models needed by nltk.sent_tokenize. Newer NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' one, so request both.
# nltk.download reports an unknown package id and returns False instead of raising,
# which keeps this loop safe on older NLTK versions that only know 'punkt'.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)

class pdf2TextParser:
    """Read a PDF file with pdfminer, extract its text and split it into sentences.

    The misspelled method name ``extract_cleaned_sentense`` is kept as-is so
    existing callers keep working.
    """

    def __init__(self, pdf_path):
        """Remember which file to parse.

        Args:
            pdf_path: Path of the PDF file. The file is only opened when one of
                the extract methods is called.
        """
        self.pdf_path = pdf_path

    def extract_text_from_pdf(self):
        """Extract the text of the whole document into a single string.

        Returns:
            The text of all pages concatenated, or ``None`` when nothing was
            extracted.
        """
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(self.pdf_path, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        finally:
            # Release the converter and buffer even when a page fails to parse.
            converter.close()
            fake_file_handle.close()

        return text if text else None

    def extract_text_by_page(self):
        """Yield the extracted text of the document one page at a time.

        Yields:
            One string per page, in the order ``PDFPage.get_pages`` returns them.
        """
        # One resource manager serves every page, as in extract_text_from_pdf.
        resource_manager = PDFResourceManager()
        with open(self.pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=False):
                fake_file_handle = io.StringIO()
                converter = TextConverter(resource_manager, fake_file_handle)
                try:
                    page_interpreter = PDFPageInterpreter(resource_manager, converter)
                    page_interpreter.process_page(page)
                    yield fake_file_handle.getvalue()
                finally:
                    # Runs when the consumer resumes, abandons the generator, or a
                    # page raises, so the per-page handles are always closed.
                    converter.close()
                    fake_file_handle.close()

    def extract_cleaned_sentense(self, text):
        """Clean raw page text and split it into lower-cased sentences.

        Steps: drop characters outside ``string.printable``, turn tab runs into
        a space, merge physical lines into fragments, strip URLs / leading
        numbers / repeated periods / stray whitespace from each fragment, then
        split every fragment with ``nltk.sent_tokenize``.

        Args:
            text: Raw text, e.g. one page as produced by ``extract_text_by_page``.

        Returns:
            A list of stripped, lower-cased sentences. Sentences of five
            characters or fewer, and sentences containing "table of contents",
            "appendix" or "glossary", are left out.
        """
        printable = set(string.printable)
        text = "".join(ch for ch in text if ch in printable)

        text = re.sub(r"\t+", r" ", text)

        # Merge physical lines into logical fragments.
        fragments = []
        prev = ""
        for line in re.split(r"\n+", text):
            if line.isupper():
                # An all-caps line is treated as a section header and skipped.
                # Flush the text collected so far first: resetting `prev` without
                # appending it used to drop the paragraph preceding every header.
                fragments.append(prev)
                prev = "."
            elif line and (line.startswith(" ") or line[0].islower()
                           or not prev.endswith(".")):
                # Continuation of the current fragment: join onto one line.
                prev = f"{prev} {line}"
            else:
                fragments.append(prev)
                prev = line
        fragments.append(prev)

        url_str = (r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\."
                   r"([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*")

        # Clean the fragments and split them into sentences.
        sentences = []
        for line in fragments:
            line = re.sub(url_str, r" ", line)  # URLs
            line = re.sub(r"^\s?\d+(.*)$", r"\1", line)  # leading numbers (headers)
            line = re.sub(r"\.+", ".", line)  # multiple periods

            line = line.strip()
            line = re.sub(r"\s+", " ", line)  # collapse whitespace runs
            line = re.sub(r"\s?([,:;\.])", r"\1", line)  # no space before punctuation
            line = re.sub(r"\s?-\s?", "-", line)  # no spaces around hyphens

            for sentence in nltk.sent_tokenize(line):
                s = str(sentence).strip().lower()
                if "table of contents" not in s and "appendix" not in s and "glossary" not in s and len(s) > 5:
                    sentences.append(s)
        return sentences

    def extract_text(self):
        """Parse the whole PDF into a DataFrame with one row per sentence.

        Returns:
            A DataFrame with the columns ``filename`` (always ``self.pdf_path``)
            and ``sentences``, indexed 0..n-1 in page order.
        """
        # Collect everything first and build the frame once: DataFrame.append was
        # removed in pandas 2.0 and copied the whole frame on every page.
        sentences = []
        for page in self.extract_text_by_page():
            sentences.extend(self.extract_cleaned_sentense(page))

        final_df = pd.DataFrame(sentences, columns=['sentences'])
        final_df.insert(loc=0,
                        column='filename',
                        value=self.pdf_path)
        return final_df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Download Data

In [None]:
!unzip /content/drive/MyDrive/500-reports-dataset.zip 
%cd 500-reports-dataset

## Parse the PDF files and save the text to a DataFrame as sentences

In [None]:
import os

# Parse every PDF in the current directory into one DataFrame of sentences.
# Sorting makes the row order reproducible (os.listdir returns entries in arbitrary order).
files = sorted(f for f in os.listdir('.') if os.path.isfile(f))

# Collect the per-file frames and concatenate once: DataFrame.append was removed in
# pandas 2.0 and re-copied the accumulated frame on every iteration.
pdf_frames = [pdf2TextParser(f).extract_text() for f in files if f.endswith('.pdf')]

# pd.concat raises on an empty list, so fall back to an empty frame when no PDF is found.
sustain_df = pd.concat(pdf_frames, ignore_index=True) if pdf_frames else pd.DataFrame()

In [None]:
# Sanity check: show the last 20 parsed sentence rows.
sustain_df.tail(20)

Unnamed: 0,filename,sentences
61624,2b0daf6b-32c7-49db-b0ba-7149cc4e5d6a.pdf,"with the first store open-ings in february, th..."
61625,2b0daf6b-32c7-49db-b0ba-7149cc4e5d6a.pdf,"the peak, with calls more than 300 percent up ..."
61626,2b0daf6b-32c7-49db-b0ba-7149cc4e5d6a.pdf,"the longer the lockdown meas-ures lasted, the ..."
61627,2b0daf6b-32c7-49db-b0ba-7149cc4e5d6a.pdf,they often expected this: if the government pr...
61628,2b0daf6b-32c7-49db-b0ba-7149cc4e5d6a.pdf,if a further easing of measures was announced ...
61629,2b0daf6b-32c7-49db-b0ba-7149cc4e5d6a.pdf,but it soon became clear that the rules varied...
61630,2b0daf6b-32c7-49db-b0ba-7149cc4e5d6a.pdf,the team opts in favor of clear communications...
61631,2b0daf6b-32c7-49db-b0ba-7149cc4e5d6a.pdf,"it adds well-meaning advice that, to be on the..."
61632,2b0daf6b-32c7-49db-b0ba-7149cc4e5d6a.pdf,that goes down well with many users.
61633,2b0daf6b-32c7-49db-b0ba-7149cc4e5d6a.pdf,"at the same time, we also noticed that people ..."
