In [1]:
# 02_read_date_HUGCHAT.ipynb
# Reads the PDFs and extracts the portion of text contained in the section 'Modalità di apertura delle offerte'; the texts are saved in a CSV file.  
# Use https://huggingface.co/chat/    

In [2]:
# Force to reload extrernal modules every new cell execution
%reload_ext autoreload
%autoreload 2

In [3]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd
import os 
from dotenv import load_dotenv # Needed to load the contents of the .env file
from hugchat import hugchat
from hugchat.login import Login
import requests
from time import sleep as t_sleep

In [4]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import read_csv_data_to_df, convert_dmy_to_ymd, left_join_df, calculate_accuracy

In [5]:
### GLOBALS ###
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
data_dir = str(yaml_config["DATA_DIR"])
bid_file_text = str(yaml_config["FILE_BID_TEXT"])
bid_file_text_date = str(yaml_config["FILE_BID_TEXT_DATE"])
bid_file_text_date_label = str(yaml_config["FILE_BID_TEXT_DATE_LABEL"])
csv_sep = str(yaml_config["CSV_SEP"])

# HugChat
load_dotenv() # Load environment variables from the .env file
hc_username = os.getenv("HC_EMAIL")
hc_password = os.getenv("HC_PASS")
time_sleep = int(yaml_config["TIME_SLEEP"]) # To avoid too many requests in a short time

In [6]:
### FUNCTIONS ###

In [7]:
def llm_test(username:str, passwd:str) -> None:
    """
    Test the connection to the LLM.
    
    Args:
        username (str): login e-mail.
        passwd (str): login password.
    """
    try:
        # Hugging Face Login
        session = Login(username, passwd)
        if session.login():
            print("OK! Successful login to LLM")
        else:
            print("ERROR! Login failed: invalid session.")
    except Exception as e:
        print(f"ERROR! An unexpected error occurred in LLM test connection: {e}")

In [8]:
# Function for generating LLM response
def llm_find_date(text:str, cookies:requests.cookies.RequestsCookieJar):
    """
    Extracts and formats a date from a given text string using HuggingChat. The date is returned in the format dd/mm/yyyy.

    Args:
        text (str): The text from which the date needs to be extracted. It should contain a date in any recognisable format.
        cookies (requests.cookies): Cookie to avoid login.

    Returns:
        str: A string representing the date in dd/mm/yyyy format. If an error occurs, it returns the error message.
    """
    # Create ChatBot
    try:
        prompt_input = f"Please extract and format only the date in this Italian text as yyyy-mm-dd, without adding anything other than the date. Dates can be written as dd.mm.yyyy or dd/mm/yyyy and days and months less than 10 may not have the leading 0 (e.g.: 5.2.2016). Write -1 if date not found. Text: {text}"
        chatbot = hugchat.ChatBot(cookies=cookies.get_dict())
        response = chatbot.chat(prompt_input)
        print("LLM output:", response)
        # print(type(response)) # debug
        return response    
    except Exception as e:
        print(f"ERROR! An unexpected error occurred during the LLM prompt use: {e}")
        return None

In [9]:
def process_row(row: pd.Series, cookies:requests.cookies.RequestsCookieJar) -> str:
    """
    Process a single row of the DataFrame. If the 'text' field is not None, call llm_find_date() with the 'text'.

    Args:
        row (pd.Series): A pandas Series object representing a single row.
        cookies (requests.cookies): Cookie to avoid login.
    Returns:
        str: The date string returned by llm_find_date if 'text' is not None; otherwise, None.
    """
    if pd.notnull(row['text']):
        print("LLM input:", row['text'])
        t_sleep(time_sleep) # Avoid too many requests
        return llm_find_date(row['text'], cookies)
    return None  # Return None if 'text' is None

In [10]:
### MAIN ###
print()
print("*** PROGRAM START ***")
print()

start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time))
print()


*** PROGRAM START ***

Start process: 2024-05-16 09:38:35



In [11]:
# Reading CSV file text and dates to be extracted by LLM
print(">> Reading CSV file")
path_bid_text = Path(data_dir) / bid_file_text
print("File:", str(path_bid_text))
dic_type = {"file_name":object, "case_id":object, "text":object}
df_bid_text = read_csv_data_to_df(path_bid_text, dic_type, csv_sep)
df_bid_text_len = len(df_bid_text)
print("Rows in dataframe:", df_bid_text_len)
print("Columns in dataframe:", df_bid_text.columns)
print()

>> Reading CSV file
File: data/bid_opening_text.csv
Reading CSV with input col_type...
Rows in dataframe: 1063
Columns in dataframe: Index(['file_name', 'case_id', 'text'], dtype='object')



In [12]:
print(">> Reading the annotated dataset (with dates as labels)")
path_csv_label = Path(data_dir) / bid_file_text_date_label
print("Path:", str(path_csv_label))
dic_type = {"file_name":object, "text":object, "label":object}
df_label = read_csv_data_to_df(path_csv_label, dic_type, csv_sep)
print("Dataframe with labels shape:", df_label.shape) # should be same shape as df_bid_text
# Convert labels
df_label['label_ymd'] = df_label['label'].apply(convert_dmy_to_ymd)
# Replace empty cells (not found) with -1
df_label.loc[df_label['label_ymd'].isna(), 'label_ymd'] = "-1"

>> Reading the annotated dataset (with dates as labels)
Path: data/bid_opening_text_date_label.csv
Reading CSV with input col_type...
Dataframe with labels shape: (1063, 3)


In [13]:
# Testing the connection to LLM
print(">> Testing LLM connection")
# print("Open API key:", openai_api_key) # debug
llm_test(hc_username, hc_password)

>> Testing LLM connection
OK! Successful login to LLM


In [14]:
# Parse the files
print(">> Reading CSV text and querying LLM")
# If you only want a sample of the rows, uncomment the following line to extract a random sample of 10 rows
df_bid_text = df_bid_text.sample(n=10)

# Hugging Face Login
sign = Login(hc_username, hc_password)
cookies = sign.login()

# Applies LLM on the dataframe 
df_bid_text['date'] = df_bid_text.apply(lambda row: process_row(row, cookies), axis=1)
print()

>> Reading CSV text and querying LLM
LLM input: Data: 25/10/2017|Ora locale: 09:00|Luogo:|Rignano Flaminio — Piazza IV Novembre 1 — sede comunale.|Informazioni relative alle persone ammesse e alla procedura di apertura:|Come da normativa e dagli atti di gara.
LLM output: 2017-10-25
LLM input: Data: 28.11.2018 - 10:00|Luogo:|Tarcento, Piazza Roma 7|Persone ammesse ad assistere all'apertura delle offerte: sì|Informazioni complementari sulle persone ammesse e la procedura di apertura: La procedura è di evidenza |pubblica.
LLM output: 2018-11-28
LLM input: Data: 22/08/2018|Ora locale: 08:30|Luogo:|Presso la Sede della C.U.C. dell’Unione dei Comuni del Frignano|Via Giardini 15 (sala al 3o piano)|Pavullo nel Frignano (MO)
LLM output: 2018-08-22
LLM input: Luogo:|Le offerte dovranno pervenire in modalità telematica e l'apertura avverrà sulla piattaforma telematica Sintel |Regione Lombardia
LLM output: -1
LLM input: Data: 05/12/2017|Ora locale: 10:30|Luogo:|Gambassi Terme Via Garibaldi 7.|Info

In [15]:
print(">> New data obtained from LLM")
# df_bid_text['date'].fillna("-1", inplace=True)
# Replace empty cells (not found) with -1
df_bid_text.loc[df_bid_text['date'].isna(), 'date'] = "-1"
print(df_bid_text.columns)
print(df_bid_text.head())
print()

>> New data obtained from LLM
Index(['file_name', 'case_id', 'text', 'date'], dtype='object')
                     file_name     case_id  \
468  2017-OJS180-368850-it.pdf  2017368850   
702  2018-OJS194-440333-it.pdf  2018440333   
647  2018-OJS136-311074-it.pdf  2018311074   
716  2018-OJS202-460935-it.pdf  2018460935   
489  2017-OJS208-430527-it.pdf  2017430527   

                                                  text        date  
468  Data: 25/10/2017|Ora locale: 09:00|Luogo:|Rign...  2017-10-25  
702  Data: 28.11.2018 - 10:00|Luogo:|Tarcento, Piaz...  2018-11-28  
647  Data: 22/08/2018|Ora locale: 08:30|Luogo:|Pres...  2018-08-22  
716  Luogo:|Le offerte dovranno pervenire in modali...          -1  
489  Data: 05/12/2017|Ora locale: 10:30|Luogo:|Gamb...  2017-12-05  



In [16]:
# Create a file with the texts extracted from the PDFs
print(">> Saving bid opening texts and dates")
print("Dataframe with dates shape:", df_bid_text.shape)  # should be same shape as df_label
file_name = Path(bid_file_text_date).stem # get the general filename without extension
file_name_csv = f"{file_name}_OAI.csv"
path_out = Path(data_dir) / file_name_csv
print("Path:", path_out)
df_bid_text.to_csv(path_out, sep=csv_sep, index=False, quoting=csv.QUOTE_ALL)

>> Saving bid opening texts and dates
Dataframe with dates shape: (10, 4)
Path: data/bid_opening_text_date_OAI.csv


In [22]:
# Checking accuracy against the annotated dataset
print(">> Checking accuracy against the annotated dataset")

# Only extracts dates found (!= -1)
df_bid_text = df_bid_text[df_bid_text['date'] != "-1"]
print("Dates from LLM length:", len(df_bid_text)) # should be same length as label_dates_list

# Only extracts labels (!= -1)
df_label = df_label[df_label['label_ymd'] != "-1"]
print("Dates from LABELS length:", len(df_label))  # should be same length as llm_dates_list

print()

# Merges the dataset from which the dates were extracted with the one containing the labels
key_col = "file_name" # Key column on which to perform the join
col_del = ["text"] # List of columns to be removed from the join
merged_df = left_join_df(df_bid_text, df_label, key_col, col_del)

print("Dataframe for accuracy")
print("Dataframe length:", len(merged_df))
print(merged_df.head())
print(merged_df.columns)
print()

>> Checking accuracy against the annotated dataset
Dates from LLM length: 9
Dates from LABELS length: 993

Dataframe for accuracy
Dataframe length: 9
                   file_name     case_id        date       label   label_ymd
0  2017-OJS180-368850-it.pdf  2017368850  2017-10-25  25/10/2017  2017-10-25
1  2018-OJS194-440333-it.pdf  2018440333  2018-11-28  28/11/2018  2018-11-28
2  2018-OJS136-311074-it.pdf  2018311074  2018-08-22  22/08/2018  2018-08-22
3  2018-OJS202-460935-it.pdf  2018460935          -1         NaN         NaN
4  2017-OJS208-430527-it.pdf  2017430527  2017-12-05  05/12/2017  2017-12-05
Index(['file_name', 'case_id', 'date', 'label', 'label_ymd'], dtype='object')



In [21]:
# Computing precision
accuracy = calculate_accuracy(merged_df, "date", "label_ymd")
print(f"Accuracy: {accuracy:.2f} over a sample of {len(merged_df)} rows")
print()

Accuracy: 0.00 over a sample of 9 rows



In [19]:
# program end
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print()
print("End process:", end_time)
print("Time to finish:", delta_time)
print()

print()
print("*** PROGRAM END ***")
print()


End process: 2024-05-16 09:39:35
Time to finish: 0:01:00


*** PROGRAM END ***

