In [1]:
import datetime
import os
import requests
from dotenv import load_dotenv
import docx
import sys, pathlib, fitz
import io
import math

# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
# To extract the images from the PDFs
from PIL import Image
# To perform OCR to extract text from images 
import pytesseract 
import pandas as pd

load_dotenv("../.env")

config = {
    'client_id': os.environ.get('CLIENT_ID'),
    'client_secret': os.environ.get('CLIENT_SECRET'),
    'authority': os.environ.get('AUTHORITY'),
    'scope': [os.environ.get('SCOPE')],
    'site_id': os.environ.get('SITE_ID'),
}

headers = {
        'Authorization': f'Bearer {os.environ.get("ACCESS_TOKEN")}',
        'Content-Type': 'application/json'
    }



drive_url = f"https://graph.microsoft.com/v1.0/drives/{os.environ.get('DRIVE_ID')}"

In [2]:
def extract_table(pdf_path, page_num, table_num):
    # Open the pdf file
    pdf = pdfplumber.open(pdf_path)
    # Find the examined page
    table_page = pdf.pages[page_num]
    # Extract the appropriate table
    table = table_page.extract_tables()[table_num]
    return table

# Convert table into the appropriate format
def table_converter(table):
    table_string = ''
    # Iterate through each row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Remove the line breaker from the wrapped texts
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Convert the table into a string 
        table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
    # Removing the last line break
    table_string = table_string[:-1]
    return table_string

def crop_image_to_text(element, page):
    # Get the coordinates to crop the image from the PDF
    [x0, y0, x1, y1] = [element.x0, element.y0, element.x1, element.y1]

    # Define the rectangle to crop
    clip_rect = fitz.Rect(x0, y0, x1, y1)

    # Crop the page to the size of the image
    pix = page.get_pixmap(clip=clip_rect)

    # Convert the pixmap to an image
    img_data = pix.tobytes("png")  # Convert the image to PNG bytes
    img = Image.open(io.BytesIO(img_data))
    text = pytesseract.image_to_string(img)
    return text

def is_bold(font_name):
    return "Bold" in font_name or "Bd" in font_name

def process_text_element(element):

    line_text = ""
    current_word = ""
    sum_words_font_size = 0
    word_count = 0
    current_word_bold = False

    # if isinstance(element, LTTextContainer):
    #     # Iterating through each character in the line of text
    for text_line in element:
        if isinstance(text_line, LTTextContainer):  # Check if it's a text line
            for character in text_line:
                if isinstance(character, LTChar):
                    current_word += character.get_text()
            
                    # Check for bold
                    if is_bold(character.fontname):
                        current_word_bold = True

                    # Store font size
                    current_font_size = round(character.size)

                if character.get_text().isspace():
                    # Space encountered, word ends
                    if current_word:
                        word_count += 1
                        sum_words_font_size += current_font_size
                        yield [current_word, current_font_size, current_word_bold]
                    
                    # Reset for next word
                    current_word = ""
                    current_font_size = []
                    current_word_bold = False

    return [sum_words_font_size, word_count]

def read_pdf(stream):

    # Create the dictionary to extract text from each image
    # text_per_page = {}

    doc = fitz.open(stream=stream, filetype="pdf")

    # Open the pdf file
    pdf_stream = io.BytesIO(stream)
    pdf = pdfplumber.open(pdf_stream)

    doc_content = []

    words_info = []
    sum_words_font_size = 0
    word_count = 0
    
    # We extract the pages from the PDF
    for pagenum, page in enumerate(extract_pages(pdf_stream)):
        
        # Initialize the variables needed for the text extraction from the page
        fitz_page = doc.load_page(pagenum)

        table_num = 0
        first_element= True
        table_extraction_flag= False
        
        # Find the examined page
        page_tables = pdf.pages[pagenum]
        # Find the number of tables on the page
        tables = page_tables.find_tables()


        # Find all the elements
        page_elements = [(element.y1, element) for element in page._objs]
        # Sort all the elements as they appear in the page 
        page_elements.sort(key=lambda a: a[0], reverse=True)

        lower_side = upper_side = None

        # Find the elements that composed a page
        for i, component in enumerate(page_elements):
            # Extract the element of the page layout
            element = component[1]
            
            # Check if the element is a text element

            if isinstance(element, LTTextContainer):
                # Check if the text appeared in a table
                if table_extraction_flag == False:
                    # Use the function to extract the text and format for each text element
                    generator = process_text_element(element)

                    try:
                        while True:
                            # Process each yielded value
                            word_info = next(generator)
                            # Do something with word_info
                            doc_content.append(word_info[0])
                            words_info.append(word_info)
                    except StopIteration as e:
                        # Catch the return value from the generator
                        sum_words_font_size += e.value[0]
                        word_count += e.value[1]

                else:
                    # Omit the text that appeared in a table
                    pass

            # Check the elements for images
            if isinstance(element, LTFigure):
                # Crop the image from the PDF
                image_text = crop_image_to_text(element, fitz_page)
                # text_from_images.append(image_text)
                doc_content.append(image_text)

            # Check the elements for tables
            if isinstance(element, LTRect):
                # If the first rectangular element
                if first_element == True and (table_num + 1) <= len(tables):
                    # Find the bounding box of the table
                    lower_side = page.bbox[3] - tables[table_num].bbox[3]
                    upper_side = element.y1 
                    # Extract the information from the table
                    table = extract_table(pdf_stream, pagenum, table_num)
                    # Convert the table information in structured string format
                    table_string = table_converter(table)
                    # Append the table string into a list
                    # text_from_tables.append(table_string)
                    doc_content.append(table_string)
                    # Set the flag as True to avoid the content again
                    table_extraction_flag = True
                    # Make it another element
                    first_element = False

                # Check if we already extracted the tables from the page
                if lower_side and upper_side and element.y0 >= lower_side and element.y1 <= upper_side:
                    pass
                elif i + 1 < len(page_elements) and not isinstance(page_elements[i+1][1], LTRect):
                    table_extraction_flag = False
                    first_element = True
                    table_num += 1

    # Closing the pdf file object
    doc.close()

    text = ''.join(doc_content)

    avg_font_size = sum_words_font_size / word_count

    print(words_info)

    words_df = pd.DataFrame(data = words_info, columns = ["word", "font_size", "bold"])

    # return words_df
    return [text, words_df]

In [3]:
def loop_through_files(url, fileobj=None):


    response = requests.get(url=url, headers=headers)
    # response.raise_for_status()
    if not 200 <= response.status_code < 300:
        print("something went wrong when getting files")
        return

    if fileobj:
        filename = fileobj["name"]
        data = read_pdf(response.content)
        text = data[0]
        print(text)
        words_df = data[1]
        print(filename)
        return words_df
    
    for value in response.json()['value']:

        new_fileobj = None
        if value['name'].endswith(".pdf"):
            new_fileobj = value
            new_url = value['@microsoft.graph.downloadUrl']
            return loop_through_files(new_url, new_fileobj)
        elif value.get('folder'):
            new_url = url[:-len(':/children')] + '/' + value['name'] + ':/children'
            loop_through_files(new_url, new_fileobj)
            return loop_through_files(new_url, new_fileobj)
        else:
            continue

In [4]:
temp_url = os.environ.get('TEMP_FOLDER_URL')

words_df = loop_through_files(temp_url)

[['REF: ', 11, False], ['2023/PRI/01/01 ', 11, False], [' ', 11, False], ['6 ', 11, False], ['January ', 11, False], ['2023 ', 11, False], ['Dear ', 11, False], ['Parents/Guardians, ', 11, False], ['Welcome ', 11, False], ['to ', 11, False], ['the ', 11, False], ['start ', 11, False], ['of ', 11, False], ['another ', 11, False], ['new ', 11, False], ['school ', 11, False], ['year! ', 11, False], [' ', 11, False], ['2023 ', 11, False], ['looks ', 11, False], ['to ', 11, False], ['be ', 11, False], ['an ', 11, False], ['exciting ', 11, False], ['one ', 11, False], ['as ', 11, False], ['we ', 11, False], ['resume ', 11, False], ['many ', 11, False], ['programmes ', 11, False], ['and ', 11, False], ['activities ', 11, False], ['that ', 11, False], ['had ', 11, False], ['to ', 11, False], ['be ', 11, False], ['halted ', 11, False], ['or ', 11, False], ['modified ', 11, False], ['due ', 11, False], ['to ', 11, False], ['COVID. ', 11, False], ['There ', 11, False], ['are ', 11, False], ['a ',

In [5]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)
# words_df
words_df.head(10)

Unnamed: 0,word,font_size,bold
0,REF:,11,False
1,2023/PRI/01/01,11,False
2,,11,False
3,6,11,False
4,January,11,False
5,2023,11,False
6,Dear,11,False
7,"Parents/Guardians,",11,False
8,Welcome,11,False
9,to,11,False


In [6]:
import numpy as np
words_df = words_df.replace(' ', np.nan).dropna().reset_index(drop=True)
words_df
words_df.head(10)

Unnamed: 0,word,font_size,bold
0,REF:,11,False
1,2023/PRI/01/01,11,False
2,6,11,False
3,January,11,False
4,2023,11,False
5,Dear,11,False
6,"Parents/Guardians,",11,False
7,Welcome,11,False
8,to,11,False
9,the,11,False


In [7]:
words_df['font_size'] = words_df.apply(lambda x: x['font_size'] * 2 if x['bold'] else x['font_size'], axis=1)
words_df = words_df.reset_index().rename(columns={'index': 'seq_no'})
words_df.head(100)

Unnamed: 0,seq_no,word,font_size,bold
0,0,REF:,11,False
1,1,2023/PRI/01/01,11,False
2,2,6,11,False
3,3,January,11,False
4,4,2023,11,False
5,5,Dear,11,False
6,6,"Parents/Guardians,",11,False
7,7,Welcome,11,False
8,8,to,11,False
9,9,the,11,False


In [8]:
mode_font_size = words_df['font_size'].mode()[0]
mode_font_size

11

In [9]:
words_df['font_size'].nsmallest(5)

362    8
363    8
366    8
367    8
368    8
Name: font_size, dtype: int64

In [10]:
words_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1507 entries, 0 to 1506
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   seq_no     1507 non-null   int64 
 1   word       1507 non-null   object
 2   font_size  1507 non-null   int64 
 3   bold       1507 non-null   bool  
dtypes: bool(1), int64(2), object(1)
memory usage: 36.9+ KB


In [11]:
# Group by font size and count the occurrences
font_size_group = words_df.groupby('font_size').size().reset_index(name='counts')

# Calculate the proportion of each font size
total_words = font_size_group['counts'].sum()
font_size_group['proportion'] = (font_size_group['counts'] / total_words) * 100

# Sort by counts to get the less common font sizes
font_size_group_sorted = font_size_group.sort_values(by='font_size', ascending=False)

font_size_group_sorted['cumulative_proportion'] = font_size_group_sorted['proportion'].cumsum()

font_size_group_sorted

Unnamed: 0,font_size,counts,proportion,cumulative_proportion
4,22,136,9.024552,9.024552
3,16,12,0.796284,9.820836
2,11,1239,82.216324,92.03716
1,10,12,0.796284,92.833444
0,8,108,7.166556,100.0


In [12]:
# Define a threshold for the maximum proportion you consider significant
threshold = 20  # This means we consider font sizes that constitute less than 5% of the document

# Filter the DataFrame to get font sizes whose cumulative proportion is under the threshold
significant_font_sizes = font_size_group_sorted[font_size_group_sorted['cumulative_proportion'] <= threshold]

significant_font_sizes

Unnamed: 0,font_size,counts,proportion,cumulative_proportion
4,22,136,9.024552,9.024552
3,16,12,0.796284,9.820836


In [13]:
keywords_df = pd.DataFrame()

keywords = False

if not significant_font_sizes.empty:
    keywords = True
    # Extract the list of significant font sizes
    significant_sizes = significant_font_sizes['font_size'].tolist()

    # Filter the original DataFrame to only include rows with significant font sizes
    keywords_df = words_df[words_df['font_size'].isin(significant_sizes)]

keywords_df

Unnamed: 0,seq_no,word,font_size,bold
149,149,FOR,22,True
150,150,YOUR,22,True
151,151,ACTION,22,True
153,153,Sale,22,True
154,154,of,22,True
155,155,Uniform,22,True
156,156,in,22,True
157,157,School,22,True
177,177,Date,22,True
178,178,Time,22,True


In [14]:
# Initialize variables
# grouped_phrases = []
keywords = []
current_phrase = ''
previous_seq_no = -1

# Iterate through the DataFrame rows
for index, row in keywords_df.iterrows():
    keywords.append(row['word'].strip())
#     # Check if the current seq_no is consecutive to the previous one
#     if row['seq_no'] == previous_seq_no + 1:
#         # If consecutive, add the word to the current phrase
#         current_phrase += ('' + row['word'])
#     else:
#         # If not consecutive, and the current phrase is not empty,
#         # add the current phrase to the list of grouped phrases
#         if current_phrase:
#             grouped_phrases.append(current_phrase.strip())
#         # Start a new phrase with the current word
#         current_phrase = row['word']
#     # Update the previous seq_no to the current one
#     previous_seq_no = row['seq_no']

# # Add the last phrase to the list if not empty
# if current_phrase:
#     grouped_phrases.append(current_phrase.strip())

# # Resulting array of grouped phrases
# grouped_phrases_array = np.array(grouped_phrases)
# print(grouped_phrases_array)

final_keywords = ' '.join(keywords)
final_keywords

"FOR YOUR ACTION Sale of Uniform in School Date Time Venue Note Monday, 9 January 2023 Temperature-Taking Exercise Wednesday, 11 January 2023, MUST 2023 Book List Friday, 13 January. 恩园学校 127443. School Fees Schedule 2023 between 13 and 17 March 2023. NOT UPCOMING SCHOOL PROGRAMME / ACTIVITIES 20 January, Friday – Chinese New Year Celebration 1 February, Wednesday – Prefects' Investiture 3 February, Friday – Parent-Teacher Conference 1 (HBL for all students) IMPORTANT SCHOOL INFORMATION Student Care Centre @ Grace Orchard School 恩园学校 127443. Communication with Teachers The School will close at 6.30 p.m. 恩园学校 127443. School Safety and Security A. No Parking in the School Carpark Health and Fitness Programme 恩园学校 127443. Student Group Personal Accident Insurance Policy Medical expenses limit Accident death & permanent disablement benefits 20% co-payment requirement for parents 恩园学校 127443. 2023 TERM 1 SCHOOL CALENDAR DATE NO School for ALL students 恩园学校 127443."

In [16]:
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp(str(final_keywords))

filtered_tokens = set(token.text.lower() for token in doc if not token.is_stop and 
                                                              not token.like_num and 
                                                              token.is_alpha and 
                                                              token.ent_type_ not in ['DATE', 'TIME'])


filtered_text = " ".join(filtered_tokens)

doc = nlp(filtered_text)

for token in doc:
    print(f'{token.text:{10}} {token.pos_:{10}} {token.ent_type_:{10}}')

grace      NOUN                 
date       NOUN                 
co         NOUN                 
expenses   NOUN                 
student    NOUN                 
payment    NOUN                 
celebration NOUN                 
schedule   NOUN                 
february   PROPN      DATE      
time       NOUN                 
conference NOUN                 
information NOUN                 
exercise   NOUN                 
upcoming   ADJ                  
orchard    NOUN                 
health     NOUN                 
fitness    NOUN                 
venue      NOUN                 
parking    NOUN                 
benefits   NOUN                 
permanent  ADJ                  
medical    ADJ                  
teachers   NOUN                 
uniform    PROPN                
book       PROPN                
centre     PROPN                
group      NOUN                 
temperature NOUN                 
death      NOUN                 
security   NOUN                 
acciden