In [None]:
# 02_read_date_OPENAI.ipynb
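# Reads from a CSV file the texts previously extracted from the PDFs (section 'Modalità di apertura delle offerte'), asks an OpenAI model to extract the date contained in each text, saves texts and dates in a CSV file and measures the accuracy against the annotated dataset.
# Uses the OpenAI API: https://openai.com/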

In [None]:
# Force reloading of external modules at every cell execution
%reload_ext autoreload
%autoreload 2

In [None]:
### IMPORT ###
import csv
import os
from datetime import datetime
from pathlib import Path
from typing import Optional

import pandas as pd
from dotenv import load_dotenv # Needed to load the contents of the .env file
from openai import OpenAI

In [None]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import read_csv_data_to_df, convert_dmy_to_ymd, left_join_df, calculate_accuracy

In [None]:
### GLOBALS ###
# All the settings are read from the "config" section of config.yml
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
data_dir = str(yaml_config["DATA_DIR"]) # directory containing the input and output files
csv_sep = str(yaml_config["CSV_SEP"]) # field separator used to read and write the CSV files
sample_size = int(yaml_config["TEST_SAMPLE"]) # number of rows to sample; a value <= 0 means the entire dataset

# INPUT
dic_lan = {"DE":0, "ES":0, "FR":0, "IT":1, "PT":0} # <-- INPUT: set 1 for the desired language; set only one language at time
bid_file_text = str(yaml_config["FILE_BID_TEXT"]) # the input file with text from which extract dates
bid_file_text_date_label = str(yaml_config["FILE_BID_TEXT_DATE_LABEL"]) # the label file based on FILE_BID_TEXT

# OUTPUT
bid_file_text_date = str(yaml_config["FILE_BID_TEXT_DATE"]) # the output files with dates
log_llm = str(yaml_config["LOG_PDF_LLM"]) # name of the log file
log_llm_header = str(yaml_config["LOG_PDF_LLM_HEADER"]) # header line written when the log file is created

# OpenAI
load_dotenv() # Load environment variables from the .env file
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    # Report the missing key here, where it is easy to diagnose, rather than at the first LLM call
    print("WARNING! OPENAI_API_KEY is not set: check the .env file or the environment variables")
suffix = "OAI" # CSV suffix containing LLM results
openai_model_name = str(yaml_config["OPENAI_MODEL_NAME"])

## FUNCTIONS

In [None]:
def llm_test(model_name:str) -> None:
    """
    Test the connection to the LLM by streaming a short completion and printing it.

    Errors raised while requesting or reading the completion are caught and printed; they are not propagated.

    Parameters:
        model_name (str): Name of the model.
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    try:
        stream = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": "Say this is a test connection [OK]"}],
            stream=True,
        )
        for chunk in stream:
            # A streamed chunk may carry no choices (e.g. a usage-only chunk): skip it instead of failing on choices[0]
            if not chunk.choices:
                continue
            # The content of a chunk can be None (e.g. the closing chunk): print nothing in that case
            print(chunk.choices[0].delta.content or "", end="")
        print() # terminate the streamed line
    except Exception as e:
        print(f"ERROR! An unexpected error occurred in LLM test connection: {e}")

In [None]:
def llm_find_date(text: str, model_name:str) -> Optional[str]:
    """
    Asks the LLM to extract the date contained in an Italian text and to format it as yyyy-mm-dd.

    The answer of the model is returned as it is (only stripped of surrounding whitespace): the prompt asks
    for a yyyy-mm-dd date, or -1 when no date is found, but the format of the answer is not validated here.

    Parameters:
        text (str): The text from which the date needs to be extracted.
        model_name (str): Name of the model.

    Returns:
        Optional[str]: The answer of the model (expected: a yyyy-mm-dd date or "-1").
            "-1" is returned without querying the model when the text is None or blank.
            None is returned when the request fails or the model returns no content.
    """
    # A blank text cannot contain a date: answer "not found" without spending an API call
    if text is None or not str(text).strip():
        return "-1"

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    try:
        chat_completion = client.chat.completions.create(
            messages=[
                        {"role": "system", "content": "You are a helpful assistant in finding dates in Italian texts"},
                        {"role": "user", "content": f"Please extract and format only the date in this Italian text as yyyy-mm-dd, without adding anything other than the date. Dates can be written as dd.mm.yyyy or dd/mm/yyyy and days and months less than 10 may not have the leading 0 (e.g.: 5.2.2016). Write -1 if date not found: {text}"}
                    ],
            model=model_name,
        )
        # The textual response is in: choices[0] -> message -> content
        content = chat_completion.choices[0].message.content
        if content is None:
            # The model can answer without textual content: report it explicitly
            print("ERROR! The LLM returned an empty response in LLM find date")
            return None
        chat_response = content.strip()
        print("LLM output:", chat_response) # debug
        print("-"*3)
        return chat_response
    except Exception as e:
        # Best effort: a failed request must not stop the processing of the other rows
        print(f"ERROR! An unexpected error occurred in LLM find date: {e}")
        return None

In [None]:
def process_row(row: pd.Series, model_name:str) -> str:
    """
    Extract the date of a single DataFrame row by passing its 'text' field to llm_find_date().

    Parameters:
        row (pd.Series): A pandas Series object representing a single row; it must have a 'text' field.
        model_name (str): Name of the model.
    Returns:
        str: The value returned by llm_find_date() for the 'text' of the row; None when 'text' is null.
    """
    text = row['text']
    # Nothing to ask to the LLM when the text is missing
    if pd.isnull(text):
        return None
    print("LLM input:", text)
    return llm_find_date(text, model_name)

## MAIN

In [None]:
### MAIN ###
print("\n*** PROGRAM START ***\n")

# Start time of the run, truncated to whole seconds
start_time = datetime.now().replace(microsecond=0)
print(f"Start process: {start_time}\n")

In [17]:
print(">> Settings")
# Languages flagged with 1 in dic_lan: exactly one is expected
key_with_value_1 = [key for key, value in dic_lan.items() if value == 1]
if not key_with_value_1:
    # Without a language the "LANG" placeholder of the file names cannot be replaced
    raise ValueError("No language selected: set to 1 exactly one language in dic_lan")
if len(key_with_value_1) > 1:
    print(f"WARNING! More than one language selected {key_with_value_1}: only the first one is used")
lang_code = key_with_value_1[0]
print("Desired language code for LLM:", lang_code)

# The file names read from the configuration contain the "LANG" placeholder, replaced here with the language code

# input
bid_file_text_lang = bid_file_text.replace("LANG", lang_code)
path_bid_text_lang = Path(data_dir) / bid_file_text_lang
print("File input:", path_bid_text_lang)

bid_file_text_date_label_lang = bid_file_text_date_label.replace("LANG", lang_code)
path_bid_text_lang_label = Path(data_dir) / bid_file_text_date_label_lang
print("File input with labels:", path_bid_text_lang_label)

# output
bid_file_text_date_lang = bid_file_text_date.replace("LANG", lang_code)
path_bid_text_lang_llm = Path(data_dir) / bid_file_text_date_lang
print("File output:", path_bid_text_lang_llm)

>> Settings
Desired language code for LLM: IT
File input: data/bid_opening_text_IT.csv
File input with labels: data/bid_opening_text_IT_date_label.csv
File output: data/bid_opening_text_IT_date_llm.csv


In [18]:
print(">> Preparing the output timing log")
path_log = Path(data_dir) / log_llm
if path_log.exists():
    print(f"The file already exists: {path_log}")
else:
    # The log file is missing: create it containing only the header line
    path_log.write_text(f"{log_llm_header}\n")
    print(f"File created with header: {path_log}")

>> Preparing the output timing log
File created with header: data/ted_log_pdf_llm.csv


In [None]:
# Reading CSV file text and dates to be extracted by LLM
print(">> Reading CSV input file")
# Use the language-specific path built in the settings cell: the file name read from the configuration
# (bid_file_text) may still contain the "LANG" placeholder
path_bid_text = path_bid_text_lang
print("File:", str(path_bid_text))
# Every column is read as text (object)
dic_type = {"file_name":object, "case_id":object, "text":object, "text_date":object}
df_bid_text = read_csv_data_to_df(path_bid_text, dic_type, csv_sep)
df_bid_text_len = len(df_bid_text)
print("Rows in dataframe:", df_bid_text_len)
print("Columns in dataframe:", df_bid_text.columns)
print()

In [None]:
# Reading CSV file text and annotated as labels
print(">> Reading the annotated dataset (with dates as labels)")
# Use the language-specific path built in the settings cell: the file name read from the configuration
# (bid_file_text_date_label) may still contain the "LANG" placeholder
path_csv_label = path_bid_text_lang_label
print("Path:", str(path_csv_label))
# Every column is read as text (object)
dic_type = {"file_name":object, "text":object, "label":object}
df_label = read_csv_data_to_df(path_csv_label, dic_type, csv_sep)
# Convert labels (presumably from dd/mm/yyyy to yyyy-mm-dd, the format asked to the LLM; see utilities.convert_dmy_to_ymd)
df_label['label_ymd'] = df_label['label'].apply(convert_dmy_to_ymd)
# Replace empty cells (not found) with -1
df_label.loc[df_label['label_ymd'].isna(), 'label_ymd'] = "-1"
# Report the size and the columns of the labels dataframe (not of the input dataframe)
print("Rows in dataframe with labels:", len(df_label))
print("Columns in dataframe with labels:", df_label.columns)

In [None]:
# Testing the connection to LLM
print(">> Testing LLM connection")
print(f"Model name: {openai_model_name}")
llm_test(model_name=openai_model_name)

In [None]:
# Parse the files
print(">> Reading CSV text and querying LLM")

# A positive sample_size restricts the run to a random sample of rows (capped to the dataframe length);
# otherwise the entire dataframe is used as it is
if sample_size <= 0:
    print("Using the entire dataframe")
else:
    sample_size = min(sample_size, df_bid_text_len)
    print(f"Using a sample of size {sample_size}")
    df_bid_text = df_bid_text.sample(n=sample_size)
print()

# Queries the LLM row by row and stores the answers in the 'date' column
print("Query at LLM started")
print()
df_bid_text['date'] = df_bid_text.apply(process_row, axis=1, args=(openai_model_name,))
print()
print("Query at LLM concluded")
print()

In [None]:
print(">> New data obtained from LLM")
# Rows for which no date was obtained (null) are marked as not found (-1)
missing_date = df_bid_text['date'].isna()
df_bid_text.loc[missing_date, 'date'] = "-1"
df_bid_text_len = len(df_bid_text)
print(f"Rows in dataframe: {df_bid_text_len}")
print("Columns in dataframe:", df_bid_text.columns)
print(df_bid_text.head(), end="\n\n")

In [None]:
# Save the texts together with the dates obtained from the LLM
print(">> Saving bid opening texts and dates")
print("Dataframe with dates shape:", df_bid_text.shape)
# Build the output name from the language-specific file name (settings cell), so that the language code
# ends up in the name of the saved file; the suffix identifies the LLM that produced the dates
file_name = Path(bid_file_text_date_lang).stem # get the general filename without extension
file_name_csv = f"{file_name}_{suffix}.csv"
path_out = Path(data_dir) / file_name_csv
print("Path:", path_out)
df_bid_text.to_csv(path_out, sep=csv_sep, index=False, quoting=csv.QUOTE_ALL)
print()

In [None]:
# Checking accuracy against the annotated dataset
print(">> Checking accuracy against the annotated dataset")

# Keep only the rows for which the LLM found a date (!= -1)
llm_found = df_bid_text['date'].ne("-1")
df_bid_text = df_bid_text[llm_found]
print(f"Dates from LLM length: {len(df_bid_text)}")

# Keep only the labels that contain a date (!= -1)
label_found = df_label['label_ymd'].ne("-1")
df_label = df_label[label_found]
print(f"Dates from LABELS length: {len(df_label)}\n")

# Merges the dataset from which the dates were extracted with the one containing the labels
# NOTE(review): presumably a left join that keeps the rows of df_bid_text; confirm in utilities.left_join_df
key_col = "file_name" # Key column on which to perform the join
col_del = ["text"] # List of columns to be removed from the join
merged_df = left_join_df(df_bid_text, df_label, key_col, col_del)

print("Dataframe joint for accuracy")
print(f"Dataframe length: {len(merged_df)}")
print(merged_df.head())
print(merged_df.columns)
print()


In [None]:
# Computing the accuracy of the LLM dates ('date') against the labels ('label_ymd')
accuracy = calculate_accuracy(merged_df, "date", "label_ymd")
print("Accuracy: {:.2f} over a sample of {} rows".format(accuracy, len(merged_df)))
print()

In [None]:
# program end
# End time truncated to whole seconds; the difference with start_time is the duration of the run
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print(f"\nEnd process: {end_time}")
print(f"Time to finish: {delta_time}\n")
print("\n*** PROGRAM END ***\n")