In [171]:
# 01_read_pdf.ipynb
# Reads every PDF in the GUUE directory, extracts the text of the bid opening section (the lines between the configured start and end markers) and saves one row per PDF to a CSV file.

In [172]:
# Force a reload of external modules on every cell execution
%reload_ext autoreload
%autoreload 2

In [173]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd
import PyPDF2

In [174]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import list_files_by_type

In [175]:
### GLOBALS ###
# Settings are read once from "config.yml" through the project's config_reader helper
# (the second argument is presumably the section/key to load -- TODO confirm in config_reader).
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
# Directory that is scanned for the input PDF files
guue_dir = str(yaml_config["GUUE_DIR"])
# Directory where the output CSV file is written
data_dir = str(yaml_config["DATA_DIR"])
# Substring identifying the line that precedes the bid opening text (that line itself is not extracted)
bid_opening_marker_start =  str(yaml_config["BID_OPENING_MARKER_START"])
# Substring identifying the line that follows the bid opening text (that line itself is not extracted)
bid_opening_marker_end =  str(yaml_config["BID_OPENING_MARKER_END"])
# File name of the output CSV (joined to data_dir when saving)
bid_file_text = str(yaml_config["FILE_BID_TEXT"])
# Field separator used when writing the CSV
csv_sep = str(yaml_config["CSV_SEP"])
# PDF lines containing this string are discarded while extracting the text
ted_url = str(yaml_config["TED_URL"])

In [176]:
### FUNCTIONS ###
def extract_text_by_line(pdf_path: str) -> list:
    """
    Extracts the body text of a PDF file as a list of lines, without page headers, footers and TED URL lines.

    The first three lines of every page are treated as the page header: they are recorded
    (together with a few strings derived from them) and never returned. Any later line, on
    any page, that is exactly equal to a recorded header string is considered a footer and
    is dropped too. Lines containing the global 'ted_url' are also dropped.

    Args:
        pdf_path (str): The path to the PDF file to be processed.

    Returns:
        list of str: The retained lines in reading order. If the file cannot be opened or an
        error occurs while reading it, an error message is printed and the lines collected
        so far (possibly none) are returned.
    """
    full_text = [] # Text of all retained lines in the PDF
    header_text = set() # Header strings seen so far (to be found and avoided as footer)
    try:
        # Open the PDF file in binary read mode
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                if not page_text:
                    print(f"Text could not be extracted from page {page_num + 1}")
                    continue

                parts_header = [] # Space-separated parts of the first header line of this page
                for line_count, line in enumerate(page_text.split('\n'), start=1):
                    if line_count <= 3:
                        # Header line: remember it and never add it to the output
                        header_text.add(line)
                        if line_count == 1:
                            # Special text in line 1 (header: GU/S S148 -> footer: 03/08/2022 S148)
                            parts_header = line.split(" ")
                            header_text.update(parts_header)
                        elif line_count == 2 and len(parts_header) > 1:
                            # The footer can repeat line 2 followed by the second part of line 1.
                            # The length check avoids an IndexError (which used to abort the whole
                            # file) when line 1 contains no space.
                            header_text.add(f"{line} {parts_header[1]}")
                        continue

                    if ted_url in line:
                        continue
                    if line not in header_text: # not in header_text -> avoid the footer
                        full_text.append(line)
    except FileNotFoundError:
        print(f"ERROR! File '{pdf_path}'. Error: not found.")
    except Exception as e:
        print(f"ERROR! File '{pdf_path}'. Error: {e}")
    # Returning here (and not inside a 'finally' clause) lets KeyboardInterrupt and
    # SystemExit propagate instead of being silently discarded.
    return full_text

In [177]:
def extract_elements_between_markers(string_list:list, start_marker:str, end_marker:str) -> list:
    """
    Extracts the elements located between two marker elements of a list of strings.

    The extraction starts right after the first element that contains start_marker and stops
    just before the first element, following it, that contains end_marker. Neither marker
    element is included. The end marker is searched only after the start marker, so an
    occurrence of end_marker earlier in the list does not prevent the extraction.

    Args:
        string_list (list of str): The list of strings to be processed.
        start_marker (str): The substring that marks the start of the extraction.
        end_marker (str): The substring that marks the end of the extraction.

    Returns:
        list of str: The elements between the two markers, or an empty list if the start
        marker is not found or no end marker follows it.
    """
    # Find the index of the first element containing the start marker
    start_index = next((index for index, item in enumerate(string_list) if start_marker in item), None)
    if start_index is None:
        return []

    # Find the first element containing the end marker, looking only after the start marker
    first_inner = start_index + 1
    end_index = next(
        (index for index in range(first_inner, len(string_list)) if end_marker in string_list[index]),
        None,
    )
    if end_index is None:
        return []

    return string_list[first_inner:end_index]


In [178]:
### MAIN ###
# Start banner: an empty line, the title, an empty line
print("", "*** PROGRAM START ***", "", sep="\n")

# Remember when the run started (seconds precision) to report the elapsed time at the end
start_time = datetime.now().replace(microsecond=0)
print("Start process:", start_time, end="\n\n")


*** PROGRAM START ***

Start process: 2024-05-08 15:26:57



In [179]:
# Collect the PDF files found in the input directory
print(">> Listing PDF files")
print("Directory:", guue_dir)

list_pdf_files = list_files_by_type(guue_dir, "pdf")
list_pdf_files_len = len(list_pdf_files) # reused by the progress counter while parsing

# print("Files:", list_pdf_files) # debug
print("Files found:", list_pdf_files_len, end="\n\n")

>> Listing PDF files
Directory: guue
Files found: 1063



In [180]:
# Parse the files
print(">> Parsing PDF files")

list_bid_opening_dic = [] # One record per PDF: file name and bid opening text (None when not found)

for i, pdf_file in enumerate(list_pdf_files, start=1):
    print(f"[{i} / {list_pdf_files_len}]")
    print("Reading (path):", str(pdf_file))

    # All the text lines of the PDF, already cleaned from headers and footers
    text_list = extract_text_by_line(pdf_file)
    # print(text_list) # debug

    # Output record for the current PDF; the text stays None unless a bid opening section is found
    record = {"file_name": pdf_file.name, "text": None}

    print("Lines found in the PDF:", len(text_list))

    if text_list:
        # Keep only the lines between the bid opening markers
        bid_lines = extract_elements_between_markers(text_list, bid_opening_marker_start, bid_opening_marker_end)
        # print(bid_lines) # debug
        print("Bid opening strings found between markers:", len(bid_lines))

        if bid_lines:
            # The section is stored as a single string, with '|' between the original lines
            record["text"] = "|".join(bid_lines)

    list_bid_opening_dic.append(record)
    print()

>> Parsing PDF files
[1 / 1063]
Reading (path): guue/2016-OJS003-002872-it.pdf
Lines found in the PDF: 186
Bid opening strings found: 6

[2 / 1063]
Reading (path): guue/2016-OJS004-004078-it.pdf
Lines found in the PDF: 122
Bid opening strings found: 0

[3 / 1063]
Reading (path): guue/2016-OJS008-009964-it.pdf
Lines found in the PDF: 155
Bid opening strings found: 6

[4 / 1063]
Reading (path): guue/2016-OJS011-015326-it.pdf
Lines found in the PDF: 179
Bid opening strings found: 5

[5 / 1063]
Reading (path): guue/2016-OJS012-017147-it.pdf
Lines found in the PDF: 161
Bid opening strings found: 3

[6 / 1063]
Reading (path): guue/2016-OJS022-035271-it.pdf
Lines found in the PDF: 152
Bid opening strings found: 6

[7 / 1063]
Reading (path): guue/2016-OJS022-035325-it.pdf
Lines found in the PDF: 530
Bid opening strings found: 6

[8 / 1063]
Reading (path): guue/2016-OJS023-036422-it.pdf
Lines found in the PDF: 378
Bid opening strings found: 3

[9 / 1063]
Reading (path): guue/2016-OJS025-040488-

In [181]:
# Create a file with the texts extracted from the PDFs
print(">> Saving bid opening texts")
# The columns are named explicitly so that the dataframe has them even when no PDF was
# parsed: without them an empty list of records produces a dataframe with no columns and
# the sort below fails with a KeyError.
df_bid = pd.DataFrame.from_records(list_bid_opening_dic, columns=["file_name", "text"])
df_bid = df_bid.sort_values(by='file_name')
# print(df_bid.head()) # debug
path_out = Path(data_dir) / bid_file_text
print("Path:", path_out)
# Every field is quoted because the extracted text may contain the separator character
df_bid.to_csv(path_out, sep=csv_sep, index=False, quoting=csv.QUOTE_ALL)

>> Saving bid opening texts
Path: data/bid_opening_text.csv


In [182]:
# Program end: report the finish time and the elapsed time since start_time
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print()
print("End process:", end_time)
print("Time to finish:", delta_time, end="\n\n")

# End banner: an empty line, the title, an empty line
print("", "*** PROGRAM END ***", "", sep="\n")


End process: 2024-05-08 15:27:34
Time to finish: 0:00:37


*** PROGRAM END ***

