In [72]:
# 02_read_date_HUGCHAT.ipynb
# Reads the PDFs and extracts the portion of text contained in the section 'Modalità di apertura delle offerte'; the texts are saved in a CSV file.  
# Use https://huggingface.co/chat/    

In [73]:
# Force to reload extrernal modules every new cell execution
%reload_ext autoreload
%autoreload 2

In [137]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd
import os 
from dotenv import load_dotenv # Needed to load the contents of the .env file
from hugchat import hugchat
from hugchat.login import Login
import requests
from time import sleep as t_sleep

In [88]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import read_csv_data_to_df, convert_dmy_to_ymd

In [138]:
### GLOBALS ###
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
data_dir = str(yaml_config["DATA_DIR"])
bid_file_text = str(yaml_config["FILE_BID_TEXT"])
bid_file_text_date = str(yaml_config["FILE_BID_TEXT_DATE"])
bid_file_text_date_label = str(yaml_config["FILE_BID_TEXT_DATE_LABEL"])
csv_sep = str(yaml_config["CSV_SEP"])

# HugChat
load_dotenv() # Load environment variables from the .env file
hc_username = os.getenv("HC_EMAIL")
hc_password = os.getenv("HC_PASS")
time_sleep = int(yaml_config["TIME_SLEEP"]) #  To avoid too many requests in a short time

In [30]:
### FUNCTIONS ###

In [111]:
def llm_test(username:str, passwd:str) -> None:
    """
    Test the connection to the LLM.
    
    Args:
        username (str): login e-mail.
        passwd (str): login password.
    """
    try:
        # Hugging Face Login
        session = Login(username, passwd)
        if session.login():
            print("OK! Successful login to LLM")
        else:
            print("ERROR! Login failed: invalid session.")
    except Exception as e:
        print(f"ERROR! An unexpected error occurred in LLM test connection: {e}")

In [125]:
# Function for generating LLM response
def llm_find_date(text:str, cookies:requests.cookies.RequestsCookieJar):
    """
    Extracts and formats a date from a given text string using HuggingChat. The date is returned in the format dd/mm/yyyy.

    Args:
        text (str): The text from which the date needs to be extracted. It should contain a date in any recognisable format.
        cookies (requests.cookies): Cookie to avoid login.

    Returns:
        str: A string representing the date in dd/mm/yyyy format. If an error occurs, it returns the error message.
    """
    # Create ChatBot
    try:
        prompt_input = f"Please extract and format only the date in this Italian text as yyyy-mm-dd, without adding anything other than the date. Dates can be written as dd.mm.yyyy or dd/mm/yyyy and days and months less than 10 may not have the leading 0 (e.g.: 5.2.2016). Write -1 if date not found. Text: {text}"
        chatbot = hugchat.ChatBot(cookies=cookies.get_dict())
        response = chatbot.chat(prompt_input)
        print("LLM output:", response)
        # print(type(response))
        return response    
    except Exception as e:
        print(f"ERROR! An unexpected error occurred during the LLM prompt use: {e}")
        return None

In [141]:
def process_row(row: pd.Series, cookies:requests.cookies.RequestsCookieJar) -> str:
    """
    Process a single row of the DataFrame. If the 'text' field is not None, call llm_find_date() with the 'text'.

    Args:
        row (pd.Series): A pandas Series object representing a single row.
        cookies (requests.cookies): Cookie to avoid login.
    Returns:
        str: The date string returned by llm_find_date if 'text' is not None; otherwise, None.
    """
    if pd.notnull(row['text']):
        print("LLM input:", row['text'])
        t_sleep(time_sleep) # Avoid too many requests
        return llm_find_date(row['text'], cookies)
    return None  # Return None if 'text' is None

In [114]:
### MAIN ###
print()
print("*** PROGRAM START ***")
print()

start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time))
print()


*** PROGRAM START ***

Start process: 2024-05-15 11:26:47



In [94]:
# Reading CSV file text and dates to be extracted by LLM
print(">> Reading CSV file")
path_bid_text = Path(data_dir) / bid_file_text
print("File:", str(path_bid_text))
dic_type = {"file_name":object, "case_id":object, "text":object}
df_bid_text = read_csv_data_to_df(path_bid_text, dic_type, csv_sep)
df_bid_text_len = len(df_bid_text)
print("Rows in dataframe:", df_bid_text_len)
print("Columns in dataframe:", df_bid_text.columns)
print()

>> Reading CSV file
File: data/bid_opening_text.csv
Reading CSV with input col_type...
Rows in dataframe: 1063
Columns in dataframe: Index(['file_name', 'case_id', 'text'], dtype='object')



In [95]:
print(">> Reading the annotated dataset (with dates as labels)")
path_csv_label = Path(data_dir) / bid_file_text_date_label
print("Path:", str(path_csv_label))
dic_type = {"file_name":object, "text":object, "label":object}
df_label = read_csv_data_to_df(path_csv_label, dic_type, csv_sep)
print("Dataframe with labels shape:", df_label.shape) # should be same shape as df_bid_text
# Convert labels
df_label['label_ymd'] = df_label['label'].apply(convert_dmy_to_ymd)
# Replace empty cells (not found) with -1
df_label.loc[df_label['label_ymd'].isna(), 'label_ymd'] = "-1"

>> Reading the annotated dataset (with dates as labels)
Path: data/bid_opening_text_date_label.csv
Reading CSV with input col_type...
Dataframe with labels shape: (1063, 3)


In [117]:
# Testing the connection to LLM
print(">> Testing LLM connection")
# print("Open API key:", openai_api_key) # debug
llm_test(hc_username, hc_password)

>> Testing LLM connection
OK! Successful login to LLM


In [142]:
# Parse the files
print(">> Reading CSV text and querying LLM")
# Hugging Face Login
sign = Login(hc_username, hc_password)
cookies = sign.login()
# df_bid_text['date'] = df_bid_text.apply(process_row, axis=1)
df_bid_text['date'] = df_bid_text.apply(lambda row: process_row(row, cookies), axis=1)
print()

>> Reading CSV text and querying LLM
LLM input: Data: 25.2.2016 - 9:00|Luogo:|Provincia di Varese — Piazza Libertà 1 — Varese.|Persone ammesse ad assistere all'apertura delle offerte: sì|Informazioni complementari sulle persone ammesse e la procedura di apertura: Un rappresentante per ogni |impresa offerente munito di relativa procura.
LLM output: 2016-02-25
LLM input: Data: 25.2.2016 - 9:30|Luogo:|Comune di Piacenza — Piazza Cavalli 2.|Persone ammesse ad assistere all'apertura delle offerte: sì|Informazioni complementari sulle persone ammesse e la procedura di apertura: Legali rappresentanti delle ditte |concorrenti o persone delegate.
LLM output: 2016-02-25
LLM input: Data: 11.3.2016 - 9:00|Luogo:|Settore Gare e Appalti, Via Marchetti 3 — Brescia.|Persone ammesse ad assistere all'apertura delle offerte: sì|Informazioni complementari sulle persone ammesse e la procedura di apertura: Seduta pubblica.
LLM output: 2016-03-11
LLM input: Data: 4.3.2016 - 10:00|Persone ammesse ad assistere 

In [39]:
print(">> New data obtained from LLM")
# df_bid_text['date'].fillna("-1", inplace=True)
# Replace empty cells (not found) with -1
df_bid_text.loc[df_bid_text['date'].isna(), 'date'] = "-1"
print(df_bid_text.columns)
print(df_bid_text.head())
print()

>> New data obtained from LLM
Index(['file_name', 'case_id', 'text', 'date'], dtype='object')
                   file_name    case_id  \
0  2016-OJS003-002872-it.pdf   20162872   
1  2016-OJS004-004078-it.pdf   20164078   
2  2016-OJS008-009964-it.pdf   20169964   
3  2016-OJS011-015326-it.pdf  201615326   
4  2016-OJS012-017147-it.pdf  201617147   

                                                text        date  
0  Data: 25.2.2016 - 9:00|Luogo:|Provincia di Var...          -1  
1                                                NaN          -1  
2  Data: 25.2.2016 - 9:30|Luogo:|Comune di Piacen...  2016-02-25  
3  Data: 11.3.2016 - 9:00|Luogo:|Settore Gare e A...  2016-03-11  
4  Data: 4.3.2016 - 10:00|Persone ammesse ad assi...  2016-03-04  



In [40]:
# Create a file with the texts extracted from the PDFs
print(">> Saving bid opening texts and dates")
print("Dataframe with dates shape:", df_bid_text.shape)  # should be same shape as df_label
path_out = Path(data_dir) / bid_file_text_date
print("Path:", path_out)
df_bid_text.to_csv(path_out, sep=csv_sep, index=False, quoting=csv.QUOTE_ALL)

>> Saving bid opening texts and dates
Dataframe with dates shape: (1063, 4)
Path: data/bid_opening_text_date.csv


In [156]:
# Checking accuracy against the annotated dataset
print(">> Checking accuracy against the annotated dataset")
path_csv_label = Path(data_dir) / bid_file_text_date_label
print("Path:", str(path_csv_label))
dic_type = {"file_name":object, "text":object, "label":object}
df_label = read_csv_data_to_df(path_csv_label, dic_type, csv_sep)
print("Dataframe with labels shape:", df_label.shape) # should be same shape as df_bid_text
# Convert labels
df_label['label_ymd'] = df_label['label'].apply(convert_dmy_to_ymd)
# Replace empty cells (not found) with -1
df_label.loc[df_label['label_ymd'].isna(), 'label_ymd'] = "-1"

# Gets dates in list format to compare them (removing not found -> -1)
df_bid_text = df_bid_text[df_bid_text['date'] != "-1"]
llm_dates_list = df_bid_text['date'].tolist()
print("Dates from LLM list length:", len(llm_dates_list)) # should be same length as label_dates_list

df_label = df_label[df_label['label_ymd'] != "-1"]
label_dates_list = df_label['label_ymd'].tolist()
print("Dates from LABELS list length:", len(label_dates_list))  # should be same length as llm_dates_list

>> Checking accuracy against the annotated dataset
Path: data/bid_opening_text_date_label.csv
Dataframe with labels shape: (1063, 3)
Dates from LLM list length: 993
Dates from LABELS list length: 993


In [159]:
# Computing precision
correct_dates = [date in label_dates_list for date in llm_dates_list]
accuracy = sum(correct_dates) / len(correct_dates) if correct_dates else 0
print(f"Precision: {accuracy * 100:.2f}%")

Precision: 99.50%


In [158]:
# program end
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print()
print("End process:", end_time)
print("Time to finish:", delta_time)
print()

print()
print("*** PROGRAM END ***")
print()


End process: 2024-05-09 12:24:05
Time to finish: 0:15:29


*** PROGRAM END ***

