# Extract Text

In [1]:
import sys, pathlib, fitz
fname = "cc0003.pdf"
with fitz.open(fname) as doc:  # open document
    text = chr(12).join([block[4] for page in doc for block in page.get_text("blocks")])
# write as a binary file to support non-ASCII characters
pathlib.Path(fname + ".txt").write_bytes(text.encode())

91740

# Extract images

In [2]:
import fitz
import PIL.Image
import io

pdf = fitz.open("cc0003.pdf")
counter = 1
# for i in range(len(pdf)): # for each page
for i in range(5):
    page = pdf[i]
    images = page.get_images()
    for image in images:
        base_img = pdf.extract_image(image[0])
        print(base_img) # contains a dict about the metadata. The actual data is in the "image" key
        image_data = base_img["image"]
        img = PIL.Image.open(io.BytesIO(image_data))
        extension = base_img["ext"]
        img.save(open(f"image{counter}.{extension}", "wb"))
        counter += 1

In [1]:
import datetime
import os
import requests
from dotenv import load_dotenv

load_dotenv("../.env")

config = {
    'client_id': os.environ.get('CLIENT_ID'),
    'client_secret': os.environ.get('CLIENT_SECRET'),
    'authority': os.environ.get('AUTHORITY'),
    'scope': [os.environ.get('SCOPE')],
    'site_id': os.environ.get('SITE_ID'),
}

headers = {
        'Authorization': f'Bearer {os.environ.get("ACCESS_TOKEN")}',
        'Content-Type': 'application/json'
    }



drive_url = f"https://graph.microsoft.com/v1.0/drives/{os.environ.get('DRIVE_ID')}"

In [2]:
import docx
import sys, pathlib, fitz
import io

# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
# To extract the images from the PDFs
from PIL import Image
# To perform OCR to extract text from images 
import pytesseract 
# To remove the additional created files
import os

for .pdf files

In [3]:
class AzureSyncError(Exception):

    def __init__(self, message):
        self.message = message
        super().__init__(self.message)

def read_pdf(_bytes):

    doc = fitz.open(stream=_bytes, filetype="pdf")
    text = chr(12).join([block[4] for page in doc for block in page.get_text("blocks")])
    doc.close()
    text += chr(12)
    return text


In [4]:
def text_extraction(element):
    # Extracting the text from the in-line text element
    line_text = element.get_text()
    
    # Find the formats of the text
    # Initialize the list with all the formats that appeared in the line of text
    top_words_dict = {}
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            # Iterating through each character in the line of text
            for character in text_line:
                if isinstance(character, LTChar):
                    # Append the font size of the character
                    if character.size not in top_words_dict:
                        top_words_dict[character.size] = [character]
                    else:
                        # line_formats.append(character.fontname) # TODO maybe bold?
                        top_words_dict[character.size].append(character)
    
    # Return a tuple with the text in each line along with its format
    return (line_text, top_words_dict)

Extract from scanned pdf

In [5]:
# Create a function to crop the image elements from PDFs
def crop_image_to_text(element, page):
    # Get the coordinates to crop the image from the PDF
    [x0, y0, x1, y1] = [element.x0, element.y0, element.x1, element.y1]

    # Define the rectangle to crop
    clip_rect = fitz.Rect(x0, y0, x1, y1)

    # Crop the page to the size of the image
    pix = page.get_pixmap(clip=clip_rect)

    # Convert the pixmap to an image
    img_data = pix.tobytes("png")  # Convert the image to PNG bytes
    img = Image.open(io.BytesIO(img_data))
    text = pytesseract.image_to_string(img)
    return text

Extract table from page

In [6]:
def extract_table(pdf_path, page_num, table_num):
    # Open the pdf file
    pdf = pdfplumber.open(pdf_path)
    # Find the examined page
    table_page = pdf.pages[page_num]
    # Extract the appropriate table
    table = table_page.extract_tables()[table_num]
    return table

# Convert table into the appropriate format
def table_converter(table):
    table_string = ''
    # Iterate through each row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Remove the line breaker from the wrapped texts
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Convert the table into a string 
        table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
    # Removing the last line break
    table_string = table_string[:-1]
    return table_string

In [7]:
# code is taken from https://towardsdatascience.com/extracting-text-from-pdf-files-with-python-a-comprehensive-guide-9fc4003d517 by George Stavrakis


# Find the PDF path
pdf_path = 'cc0003.pdf'

# create a PDF file object
pdfFileObj = open(pdf_path, 'rb')
# create a PDF reader object

# Create the dictionary to extract text from each image
text_per_page = {}

doc = fitz.open(pdf_path)

# We extract the pages from the PDF
for pagenum, page in enumerate(extract_pages(pdf_path)):
    
    # Initialize the variables needed for the text extraction from the page
    fitz_page = doc.load_page(pagenum)
    # page_text = []
    top_words_dict = {}
    # text_from_images = []
    # text_from_tables = []
    page_content = []
    # Initialize the number of the examined tables
    table_num = 0
    first_element= True
    table_extraction_flag= False
    # Open the pdf file
    pdf = pdfplumber.open(pdf_path)
    # Find the examined page
    page_tables = pdf.pages[pagenum]
    # Find the number of tables on the page
    tables = page_tables.find_tables()


    # Find all the elements
    page_elements = [(element.y1, element) for element in page._objs]
    # Sort all the elements as they appear in the page 
    page_elements.sort(key=lambda a: a[0], reverse=True)

    # Find the elements that composed a page
    for i, component in enumerate(page_elements):
        # Extract the element of the page layout
        element = component[1]
        
        # Check if the element is a text element
        if isinstance(element, LTTextContainer):
            # Check if the text appeared in a table
            if table_extraction_flag == False:
                # Use the function to extract the text and format for each text element
                (line_text, new_top_words) = text_extraction(element)
                # Append the text of each line to the page text
                # page_text.append(line_text)
                # Append the format for each line containing text
                for length in new_top_words:
                    if length in top_words_dict:
                        top_words_dict[length].extend(new_top_words[length])
                    else:
                        top_words_dict[length] = new_top_words[length]
                page_content.append(line_text)
            else:
                # Omit the text that appeared in a table
                pass

        # Check the elements for images
        if isinstance(element, LTFigure):
            # Crop the image from the PDF
            image_text = crop_image_to_text(element, fitz_page)
            # text_from_images.append(image_text)
            page_content.append(image_text)

        # Check the elements for tables
        if isinstance(element, LTRect):
            # If the first rectangular element
            if first_element == True and (table_num + 1) <= len(tables):
                # Find the bounding box of the table
                lower_side = page.bbox[3] - tables[table_num].bbox[3]
                upper_side = element.y1 
                # Extract the information from the table
                table = extract_table(pdf_path, pagenum, table_num)
                # Convert the table information in structured string format
                table_string = table_converter(table)
                # Append the table string into a list
                # text_from_tables.append(table_string)
                page_content.append(table_string)
                # Set the flag as True to avoid the content again
                table_extraction_flag = True
                # Make it another element
                first_element = False

            # Check if we already extracted the tables from the page
            if element.y0 >= lower_side and element.y1 <= upper_side:
                pass
            elif not isinstance(page_elements[i+1][1], LTRect):
                table_extraction_flag = False
                first_element = True
                table_num += 1


    # Create the key of the dictionary
    dctkey = 'Page_' + str(pagenum)
    # Add the list of list as the value of the page key
    text_per_page[dctkey] = page_content

# Closing the pdf file object
doc.close()

# Display the content of the page
result = ''.join(text_per_page['Page_6'])
print(result)

|Advantages|Disadvantages|
|Avoid discrimination|Not an international law|
|Improve the quality of human beings|Lacks the power to legally force any directive|
|Set the standard for individuals to be treated equally||
|Procedure and framework for governments to protect and promote human rights|None| 

and Disadvantages of UDHR
‘Advantages

‘Avoid discrimination
Improve the quality of human beings
Set the standard for individuals to be treated equally

Procedure and framework for governments to protect
and promote human rights

Countries
Apply some from of Human Rights legislations

 

Disadvantages

Not an international law

Lacks the power to legally force any
Universal Declaration of Human Rights (UDHR)
Universal Declaration of Human Rights (UDHR)

“Adnan tongs ar bom ac eon gnty an rs They are enone mh eason
{rs corscaren ana eho omath ow sneer a sat ot rod”

ag against craton

‘Ont an poten rg, rg rh, ow, pero secu. nd
putea paneraton

Econom pc ana cain rs an aoe anced ng so
‘cor

For .docx and .txt files

In [11]:
def read_word(content):
    file_stream = io.BytesIO(content)
    doc = docx.Document(file_stream)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + chr(12)
    text += chr(12)
    return text

def read_txt(content):
    text = content.decode('utf-16')
    text += chr(12)
    return text

def read_document(stream, filename, f_out):
    if filename.endswith(".pdf"):
        text = read_pdf(stream)
    elif filename.endswith(".docx"):
        text = read_word(stream)
    elif filename.endswith(".txt"):
        text = read_txt(stream)
    with pathlib.Path(f_out).open('ab') as file:
        file.write(text.encode())

In [12]:
def loop_through_files(url, filename=None):

    response = requests.get(url=url, headers=headers)
    # response.raise_for_status()
    if not 200 <= response.status_code < 300:
        raise AzureSyncError(f"Something went wrong. {response.json()}")

    if filename:
        read_document(response.content, filename, "output.txt")
        print("added")
        return
    
    
    for value in response.json()['value']:
        new_filename = None
        relevant = 1
        if value['name'].endswith(".pdf") or value['name'].endswith(".docx") or value['name'].endswith(".txt"):
            new_filename = value['name']
            new_url = value['@microsoft.graph.downloadUrl']
        elif value.get('folder'):
            new_url = url[:-len(':/children')] + '/' + value['name'] + ':/children'
        else:
            relevant = 0
        
        if relevant:
            loop_through_files(new_url, new_filename)

In [15]:
temp_url = os.environ.get('TEMP_FOLDER_URL')

loop_through_files(temp_url)

added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
added
