In [59]:
# 01_read_pdf.ipynb
# Reads the GUUE PDF files of one language, extracts the text found between the bid-opening markers and saves it to a CSV (the input of the later step that asks the LLM to identify the date).

In [58]:
# Force external modules to be reloaded at every cell execution
%reload_ext autoreload
%autoreload 2

In [56]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd
import PyPDF2

In [62]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import list_files_by_type, list_subdirectories

In [57]:
### GLOBALS ###
# Settings read from config.yml
yaml_config = config_reader.config_read_yaml("config.yml", "config")

# Directories and file names
download_dir = str(yaml_config["DOWNLOAD_DIR"])
guue_dir = str(yaml_config["GUUE_DIR"])
data_dir = str(yaml_config["DATA_DIR"])
bid_file_text = str(yaml_config["FILE_BID_TEXT"])  # output
log_pdf = str(yaml_config["LOG_PDF_EXTRACTION"])
log_pdf_header = str(yaml_config["LOG_PDF_EXTRACTION_HEADER"])

# Text handling
csv_sep = str(yaml_config["CSV_SEP"])
ted_url = str(yaml_config["TED_URL"])

# Years to process (both bounds included)
year_start = int(yaml_config["YEAR_START"])
year_end = int(yaml_config["YEAR_END"])
years_list = [*range(year_start, year_end + 1)]

# <-- INPUT: set 1 for the desired language; set only one language at a time
dic_lan = {
    "DE": 0,
    "ES": 0,
    "FR": 0,
    "IT": 1,
    "PT": 0,
}

# <-- INPUT: for each language, set the start and end text markers
dic_markers = {
    "DE": [],
    "ES": [],
    "FR": [],
    "IT": ["Modalità di apertura delle offerte", "Sezione VI: Altre informazioni"],
    "PT": [],
}

## FUNCTIONS

In [60]:
### FUNCTIONS ###
def extract_text_by_line(pdf_path: str) -> list:
    """
    Extracts the body text of a PDF file as a list of lines.

    The first three lines of every page are treated as the page header and are
    never returned. They are remembered, together with the space-separated parts
    of line 1 and the string "<line 2> <second part of line 1>", so that identical
    lines met later in a page body (the footer) are dropped too. Lines containing
    the module-level `ted_url` are also dropped.

    Args:
        pdf_path (str): The path (string or path-like) to the PDF file to be processed.

    Returns:
        list: The kept lines, in reading order. The list is empty (or partial) when
        the file cannot be opened or read: in that case the error is printed, not raised.
    """
    full_text = []  # Body lines of the whole PDF
    header_text = set()  # Header strings seen so far (used to recognise the footer)
    try:
        # Open the PDF file in binary read mode
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                if not page_text:
                    print(f"Text could not be extracted from page {page_num + 1}")
                    continue

                parts_header = []
                for line_count, line in enumerate(page_text.split('\n'), start=1):
                    if line_count <= 3:
                        header_text.add(line)  # Trace the header text
                        if line_count == 1:
                            # Special text in line 1 (header: GU/S S148 -> footer: 03/08/2022 S148)
                            parts_header = line.split(" ")
                            header_text.update(parts_header)
                        elif line_count == 2 and len(parts_header) > 1:
                            # Footer form built from line 2 and the second part of line 1;
                            # skipped when line 1 is a single word (nothing to combine with)
                            header_text.add(f"{line} {parts_header[1]}")
                        continue

                    # Body line: drop TED links and anything matching a header string (footer)
                    if ted_url in line or line in header_text:
                        continue
                    full_text.append(line)
    except FileNotFoundError:
        print(f"ERROR! File '{pdf_path}'. Error: not found.")
    except Exception as e:
        print(f"ERROR! File '{pdf_path}'. Error: {e}")
    # Returned outside of any `finally`, so KeyboardInterrupt and other
    # non-Exception errors propagate instead of being silently swallowed
    return full_text

In [61]:
def extract_elements_between_markers(string_list:list, start_marker:str, end_marker:str) -> list: 
    """
    Returns the elements lying strictly between two marker elements of a list of strings.

    The start element is the first one containing start_marker, the end element is the
    first one containing end_marker; neither of them is included in the result.

    Args:
        string_list (list of str): The list of strings to be processed.
        start_marker (str): The substring that marks the start of the extraction.
        end_marker (str): The substring that marks the end of the extraction.

    Returns:
        list of str: The elements between the two markers, or an empty list when a marker
        is missing or the start marker does not come before the end marker.
    """
    def _first_position_of(marker):
        # Position of the first element containing the marker, None when absent
        for position, text in enumerate(string_list):
            if marker in text:
                return position
        return None

    first = _first_position_of(start_marker)
    last = _first_position_of(end_marker)

    # Guard clauses: a missing marker or a wrong order gives nothing to extract
    if first is None or last is None:
        return []
    if first >= last:
        return []
    return string_list[first + 1:last]

In [63]:
def filename_to_caseid(filename:str) -> str:
    """ 
    Builds the case ID of a file from its name, joining the YEAR and the file ID.

    The name is split on dashes: the first piece is taken as the year and the third
    piece, stripped of its leading zeros, as the file ID
    (e.g. '2016-OJS001-00000108-it-ts.pdf' -> '2016108').

    Args:
        filename (str): The string containing the file name.
    
    Returns:
        str: YEARID
    """
    pieces = filename.split('-')
    year_part, id_part = pieces[0], pieces[2]
    return f"{year_part}{id_part.lstrip('0')}"

## MAIN

In [64]:
### MAIN ###
print("", "*** PROGRAM START ***", "", sep="\n")

# Start timestamp truncated to whole seconds; the elapsed time is computed from it at the end
start_time = datetime.now().replace(microsecond=0)
print("Start process:", start_time)
print()


*** PROGRAM START ***

Start process: 2024-09-02 11:28:30



In [65]:
# print(yaml_config) # debug

In [66]:
print(">> Settings")
path_guee = Path(download_dir) / guue_dir
list_dirs = list_subdirectories(path_guee)
list_dirs_len = len(list_dirs)
print(f"Directories with GUUE found ({list_dirs_len}): {list_dirs}")
print("Years list")
print(years_list)

# The language to process is the one flagged with 1 in dic_lan
key_with_value_1 = [key for key, value in dic_lan.items() if value == 1]
if not key_with_value_1:
    # Fail here with a clear message instead of a KeyError on dic_markers[None] below
    raise ValueError("No language selected: set one language to 1 in dic_lan")
if len(key_with_value_1) > 1:
    print(f"WARNING! More than one language selected {key_with_value_1}: only the first one is used")
lang_code = key_with_value_1[0]
print("Desired language code for markers:", lang_code)

# The selected language needs both a start and an end text marker
if len(dic_markers.get(lang_code, [])) < 2:
    raise ValueError(f"Missing start/end markers for language '{lang_code}' in dic_markers")
bid_opening_marker_start = dic_markers[lang_code][0]
bid_opening_marker_end = dic_markers[lang_code][1]
print("Markers (start):", bid_opening_marker_start)
print("Markers (end):", bid_opening_marker_end)

>> Settings
Directories with GUUE found (5): ['DE', 'ES', 'FR', 'IT', 'PT']
Years list
[2016, 2017, 2018, 2019, 2020, 2021, 2022]
Desired language code for markers: IT
Markers (start): Modalità di apertura delle offerte
Markers (end): Sezione VI: Altre informazioni


In [None]:
print(">> Preparing the output timing log")
path_log = Path(data_dir) / log_pdf
if not path_log.exists():
    # Make sure the data directory exists, otherwise creating the log fails on a fresh setup
    path_log.parent.mkdir(parents=True, exist_ok=True)
    # If the file does not exist, create it with the specified header
    with path_log.open(mode='w') as fp:
        fp.write(f"{log_pdf_header}\n")
    print(f"File created with header: {path_log}")
else:
    print(f"The file already exists: {path_log}")

>> Preparing the output timing log
The file already exists: data/ted_log_pdf_extraction.csv


In [67]:
# Collect the PDF files of the selected language, one year directory at a time
print(">> Listing PDF files")
path_guee_lang = Path(download_dir).joinpath(guue_dir, lang_code)
print("Directory (lang):", path_guee_lang)

list_pdf_files = []
for year_value in years_list:
    path_guee_lang_year = path_guee_lang.joinpath(str(year_value))
    list_pdf_files_y = list_files_by_type(path_guee_lang_year, "pdf")
    list_pdf_files_y_len = len(list_pdf_files_y)
    print(f"Files found for year {year_value}: {list_pdf_files_y_len}")
    list_pdf_files += list_pdf_files_y

list_pdf_files_len = len(list_pdf_files)
print("Total files found:", list_pdf_files_len)
# print("Files:", list_pdf_files) # debug

>> Listing PDF files
Directory (lang): /Volumes/SAMSUNG-PHD/PhD/Corsi da seguire/Knowledge management and information extraction from structured and unstructured data for process mining (techniques, algorithms, and tools)/Python - GUUE in CSV per LOG/guue/IT
Files found for year 2016: 3052
Files found for year 2017: 3581
Files found for year 2018: 3671
Files found for year 2019: 3822
Files found for year 2020: 3777
Files found for year 2021: 4557
Files found for year 2022: 4783
Total files found: 27243


In [68]:
# Parse the files
print(">> Parsing PDF files")

list_bid_opening_dic = [] # One record per PDF: file name, case ID and bid opening text (None when not found)

i = 0 # Progress counter; stays 0 when there is no file to parse
for i, pdf_file in enumerate(list_pdf_files, start=1):
    print(f"[{i} / {list_pdf_files_len}]")
    print("Reading (path):", pdf_file)

    # All the body lines of the PDF
    text_list = extract_text_by_line(pdf_file)
    text_list_len = len(text_list)
    # print(text_list) # debug

    # Identify the PDF
    file_name = pdf_file.name # only the file name, as str
    case_id = filename_to_caseid(file_name)
    print(f"Case ID derived from file name '{file_name}': {case_id}")
    print("Lines found in the PDF:", text_list_len)

    # Record for this PDF; the text stays None unless the bid opening section is found
    dic_bid_opening = {"file_name":file_name, "case_id":case_id, "text": None}

    if text_list_len > 0:
        # Keep only the lines of the bid opening section
        marker_list = extract_elements_between_markers(text_list, bid_opening_marker_start, bid_opening_marker_end)
        marker_list_len = len(marker_list)
        # print(marker_list) # debug
        print("Bid opening strings found between markers:", marker_list_len)

        if marker_list_len > 0:
            # The section lines are stored as a single pipe-separated string
            marker_text = "|".join(marker_list)
            # print(marker_text) # debug
            dic_bid_opening["text"] = marker_text

    list_bid_opening_dic.append(dic_bid_opening)
    print()

>> Parsing PDF files
[1 / 27243]
Reading (path): /Volumes/SAMSUNG-PHD/PhD/Corsi da seguire/Knowledge management and information extraction from structured and unstructured data for process mining (techniques, algorithms, and tools)/Python - GUUE in CSV per LOG/guue/IT/2016/2016-OJS001-00000108-it-ts.pdf
Case ID derived from file name '2016-OJS001-00000108-it-ts.pdf': 2016108
Lines found in the PDF: 235
Bid opening strings found between markers: 7

[2 / 27243]
Reading (path): /Volumes/SAMSUNG-PHD/PhD/Corsi da seguire/Knowledge management and information extraction from structured and unstructured data for process mining (techniques, algorithms, and tools)/Python - GUUE in CSV per LOG/guue/IT/2016/2016-OJS001-00000236-it-ts.pdf
Case ID derived from file name '2016-OJS001-00000236-it-ts.pdf': 2016236
Lines found in the PDF: 629
Bid opening strings found between markers: 7

[3 / 27243]
Reading (path): /Volumes/SAMSUNG-PHD/PhD/Corsi da seguire/Knowledge management and information extraction

In [None]:
# Create a file with the texts extracted from the PDFs
print(">> Saving bid opening texts")
# Explicit columns keep the dataframe well-formed (and sortable by 'file_name')
# even when no PDF was parsed and the list of records is empty
df_bid = pd.DataFrame.from_records(list_bid_opening_dic, columns=["file_name", "case_id", "text"])
df_bid = df_bid.sort_values(by='file_name')
# print(df_bid.head()) # debug
path_out = Path(data_dir) / bid_file_text
print("Path:", path_out)
# Every field is quoted, so separators inside the extracted text cannot break the CSV
df_bid.to_csv(path_out, sep=csv_sep, index=False, quoting=csv.QUOTE_ALL)

>> Saving bid opening texts
Path: data/bid_opening_text.csv


In [None]:
# program end: elapsed time measured in whole seconds
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

# Append one row to the timing log: language;start;end;elapsed
print(">> Saving timing to log")
csv_str = f"{lang_code};{start_time};{end_time};{delta_time}\n"
with path_log.open("a") as fp:
    fp.write(csv_str)

print()
print("End process:", end_time)
print("Time to finish:", delta_time)
print("", "", "*** PROGRAM END ***", "", sep="\n")


End process: 2024-05-08 15:27:34
Time to finish: 0:00:37


*** PROGRAM END ***

