In [1]:
# 02_read_date.ipynb
# Reads the PDFs and extracts the portion of text contained in the section 'Modalità di apertura delle offerte'; the texts are saved in a CSV file.      

In [2]:
# Force to reload extrernal modules every new cell execution
%reload_ext autoreload
%autoreload 2

In [3]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd
import os 
from openai import OpenAI
from dotenv import load_dotenv # Needed to load the contents of the .env file


In [4]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import read_csv_data_to_df, convert_dmy_to_ymd, left_join_df, calculate_accuracy

In [5]:
### GLOBALS ###
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
data_dir = str(yaml_config["DATA_DIR"])
bid_file_text = str(yaml_config["FILE_BID_TEXT"])
bid_file_text_date = str(yaml_config["FILE_BID_TEXT_DATE"])
bid_file_text_date_label = str(yaml_config["FILE_BID_TEXT_DATE_LABEL"])
openai_model_name = str(yaml_config["OPENAI_MODEL_NAME"])
csv_sep = str(yaml_config["CSV_SEP"])
sample_size = int(yaml_config["BID_SAMPLE_SIZE"]) 

# OpenAI
load_dotenv() # Load environment variables from the .env file
openai_api_key = os.getenv("OPENAI_API_KEY")
suffix = "OAI" # CSV suffix containing LLM results

In [6]:
### FUNCTIONS ###

In [7]:
def llm_test(model_name:str) -> None:
    """
    Test the connection to the LLM.
    
    Args:
        model_name (str): Name of the model.
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    try:
        stream = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": "Say this is a test connection [OK]"}],
        stream=True,
        )
        for chunk in stream:
                print(chunk.choices[0].delta.content or "", end="")
    except Exception as e:
        print(f"ERROR! An unexpected error occurred in LLM test connection: {e}")

In [8]:
def llm_find_date(text: str, model_name:str) -> str:
    """
    Extracts and formats a date from a given text string using ChatGPT. The date is returned in the format dd/mm/yyyy.

    Args:
        text (str): The text from which the date needs to be extracted. It should contain a date in any recognisable format.
        model_name (str): Name of the model.

    Returns:
        str: A string representing the date in dd/mm/yyyy format. If an error occurs, it returns the error message.
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    try:
        chat_completion = client.chat.completions.create(
            messages=[
                        {"role": "system", "content": "You are a helpful assistant in finding dates in Italian texts"},
                        {"role": "user", "content": f"Please extract and format only the date in this Italian text as yyyy-mm-dd, without adding anything other than the date. Dates can be written as dd.mm.yyyy or dd/mm/yyyy and days and months less than 10 may not have the leading 0 (e.g.: 5.2.2016). Write -1 if date not found: {text}"}
                    ],
            model=model_name,
        )
        # print(type(chat_completion)) # debug
        # ChatCompletion(id='chatcmpl-9MrwkHcMyDloWUBVqAcWaSv9vr8B2', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='25/02/2016', role='assistant', function_call=None, tool_calls=None))], created=1715237614, model='gpt-4-0613', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=6, prompt_tokens=132, total_tokens=138))
        # The chat_completion vaiable contains a response object ChatCompletion that is is a list of data
        # The textual response is in: choices[0] -> message -> content
        chat_response = chat_completion.choices[0].message.content.strip()
        print("LLM output:", chat_response) # debug
        return chat_response
        # for more than one response: generated_texts = [choice.message["content"].strip() for choice in chat_completion["choices"]]
    except Exception as e:
        print(f"ERROR! An unexpected error occurred in LLM find date: {e}")
        return None

In [9]:
def process_row(row: pd.Series, model_name:str) -> str:
    """
    Process a single row of the DataFrame. If the 'text' field is not None, call llm_find_date() with the 'text'.

    Args:
        row (pd.Series): A pandas Series object representing a single row.
        model_name (str): Name of the model.
    Returns:
        str: The date string returned by llm_find_date if 'text' is not None; otherwise, None.
    """
    if pd.notnull(row['text']):
        print("LLM input:", row['text'])
        return llm_find_date(row['text'], model_name)
    return None  # Return None if 'text' is None

In [10]:
### MAIN ###
print()
print("*** PROGRAM START ***")
print()

start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time))
print()


*** PROGRAM START ***

Start process: 2024-05-16 11:07:58



In [11]:
# Reading CSV file text and dates to be extracted by LLM
print(">> Reading CSV file")
path_bid_text = Path(data_dir) / bid_file_text
print("File:", str(path_bid_text))
dic_type = {"file_name":object, "case_id":object, "text":object}
df_bid_text = read_csv_data_to_df(path_bid_text, dic_type, csv_sep)
df_bid_text_len = len(df_bid_text)
print("Rows in dataframe:", df_bid_text_len)
print("Columns in dataframe:", df_bid_text.columns)
print()

>> Reading CSV file
File: data/bid_opening_text.csv
Reading CSV with input col_type...
Rows in dataframe: 1063
Columns in dataframe: Index(['file_name', 'case_id', 'text'], dtype='object')



In [12]:
print(">> Reading the annotated dataset (with dates as labels)")
path_csv_label = Path(data_dir) / bid_file_text_date_label
print("Path:", str(path_csv_label))
dic_type = {"file_name":object, "text":object, "label":object}
df_label = read_csv_data_to_df(path_csv_label, dic_type, csv_sep)
# Convert labels
df_label['label_ymd'] = df_label['label'].apply(convert_dmy_to_ymd)
# Replace empty cells (not found) with -1
df_label.loc[df_label['label_ymd'].isna(), 'label_ymd'] = "-1"
print("Rows in dataframe with labels:", df_bid_text_len)
print("Columns in dataframe with labels:", df_bid_text.columns)

>> Reading the annotated dataset (with dates as labels)
Path: data/bid_opening_text_date_label.csv
Reading CSV with input col_type...
Rows in dataframe with labels: 1063
Columns in dataframe with labels: Index(['file_name', 'case_id', 'text'], dtype='object')


In [13]:
# Testing the connection to LLM
print(">> Testing LLM connection")
# print("Open API key:", openai_api_key) # debug
print("Model name:", openai_model_name)
llm_test(openai_model_name)

>> Testing LLM connection
Model name: gpt-3.5-turbo
This is a test connection [OK]

In [14]:
# Parse the files
print(">> Reading CSV text and querying LLM")

# If sample_size is greater than 0 it extracts a sample of rows from the dataset
if sample_size > 0:
    print(f"Using a sample of {sample_size}")
    df_bid_text = df_bid_text.sample(n=sample_size)

# Applies LLM on the entire dataframe 
df_bid_text['date'] = df_bid_text.apply(lambda row: process_row(row, openai_model_name), axis=1)
print()

>> Reading CSV text and querying LLM
LLM input: Data: 12.4.2016 - 10:30|Luogo:|Servizio Patrimonio e Provveditorato — Piazza della Libertà 1 — Arezzo.|Persone ammesse ad assistere all'apertura delle offerte: sì|Informazioni complementari sulle persone ammesse e la procedura di apertura: I rappresentanti delle imprese |concorrenti.
LLM output: 2016-04-12
LLM input: Data: 19/09/2018|Ora locale: 09:30|Luogo:|C/o Centro Polivalente Il Melograno – Via Brera 31 – Cornaredo|Informazioni relative alle persone ammesse e alla procedura di apertura:|Sono ammesse ad assistere all’apertura delle offerte i rappresentanti legali degli operatori economici |partecipanti alla gara o suoi incaricati muniti di idonea delega.
LLM output: 2018-09-19
LLM input: Data: 08/06/2018|Ora locale: 09:30|Luogo:|Sala della Minor Cella, Comune di Modena, via Scudari 20, Modena.|Informazioni relative alle persone ammesse e alla procedura di apertura:|Legali rappresentanti delle ditte partecipanti o loro delegati.
LLM ou

In [15]:
print(">> New data obtained from LLM")
# df_bid_text['date'].fillna("-1", inplace=True)
# Replace empty cells (not found) with -1
df_bid_text.loc[df_bid_text['date'].isna(), 'date'] = "-1"
df_bid_text_len = len(df_bid_text)
print("Rows in dataframe:", df_bid_text_len)
print("Columns in dataframe:", df_bid_text.columns)
print(df_bid_text.head())
print()

>> New data obtained from LLM
Rows in dataframe: 10
Columns in dataframe: Index(['file_name', 'case_id', 'text', 'date'], dtype='object')
                     file_name     case_id  \
27   2016-OJS059-099856-it.pdf   201699856   
644  2018-OJS135-308355-it.pdf  2018308355   
577  2018-OJS083-187499-it.pdf  2018187499   
918  2019-OJS212-520327-it.pdf  2019520327   
709  2018-OJS199-453265-it.pdf  2018453265   

                                                  text        date  
27   Data: 12.4.2016 - 10:30|Luogo:|Servizio Patrim...  2016-04-12  
644  Data: 19/09/2018|Ora locale: 09:30|Luogo:|C/o ...  2018-09-19  
577  Data: 08/06/2018|Ora locale: 09:30|Luogo:|Sala...          -1  
918  Data: 03/12/2019|Ora locale: 09:30|Luogo:|Comu...          -1  
709  Data: 22.11.2018 - 09:30|Luogo:|Palazzo Munici...  2018-11-22  



In [16]:
# Create a file with the texts extracted from the PDFs
print(">> Saving bid opening texts and dates")
print("Dataframe with dates shape:", df_bid_text.shape)  # should be same shape as df_label
file_name = Path(bid_file_text_date).stem # get the general filename without extension
file_name_csv = f"{file_name}_{suffix}.csv"
path_out = Path(data_dir) / file_name_csv
print("Path:", path_out)
df_bid_text.to_csv(path_out, sep=csv_sep, index=False, quoting=csv.QUOTE_ALL)
print()

>> Saving bid opening texts and dates
Dataframe with dates shape: (10, 4)
Path: data/bid_opening_text_date_OAI.csv


In [17]:
# Checking accuracy against the annotated dataset
print(">> Checking accuracy against the annotated dataset")

# Only extracts dates found (!= -1)
df_bid_text = df_bid_text[df_bid_text['date'] != "-1"]
print("Dates from LLM length:", len(df_bid_text)) # should be same length as label_dates_list

# Only extracts labels (!= -1)
df_label = df_label[df_label['label_ymd'] != "-1"]
print("Dates from LABELS length:", len(df_label))  # should be same length as llm_dates_list

print()

# Merges the dataset from which the dates were extracted with the one containing the labels
key_col = "file_name" # Key column on which to perform the join
col_del = ["text"] # List of columns to be removed from the join
merged_df = left_join_df(df_bid_text, df_label, key_col, col_del)

print("Dataframe joint for accuracy")
print("Dataframe length:", len(merged_df))
print(merged_df.head())
print(merged_df.columns)
print()


>> Checking accuracy against the annotated dataset
Dates from LLM length: 7
Dates from LABELS length: 993

Dataframe joint for accuracy
Dataframe length: 7
                   file_name     case_id        date       label   label_ymd
0  2016-OJS059-099856-it.pdf   201699856  2016-04-12  12/04/2016  2016-04-12
1  2018-OJS135-308355-it.pdf  2018308355  2018-09-19  19/09/2018  2018-09-19
2  2018-OJS199-453265-it.pdf  2018453265  2018-11-22  22/11/2018  2018-11-22
3  2017-OJS147-304513-it.pdf  2017304513  2017-09-18  18/09/2017  2017-09-18
4  2019-OJS091-219253-it.pdf  2019219253  2019-05-29  29/05/2019  2019-05-29



In [18]:
# Computing precision
accuracy = calculate_accuracy(merged_df, "date", "label_ymd")
print(f"Accuracy: {accuracy:.2f} over a sample of {len(merged_df)} rows")
print()

Column type: object,object
Accuracy: 1.00 over a sample of 7 rows



In [19]:
# program end
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print()
print("End process:", end_time)
print("Time to finish:", delta_time)
print()

print()
print("*** PROGRAM END ***")
print()


End process: 2024-05-16 11:08:05
Time to finish: 0:00:07


*** PROGRAM END ***

