In [141]:
# 02_read_date_OPENAI.ipynb
# Reads the PDFs and extracts the portion of text contained in the section 'Modalità di apertura delle offerte'; the texts are saved in a CSV file.
# Use https://openai.com/

In [142]:
# Force to reload extrernal modules every new cell execution
%reload_ext autoreload
%autoreload 2

In [143]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd

In [144]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import read_csv_data_to_df, convert_dmy_to_ymd, left_join_df

In [145]:
### GLOBALS ###
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
data_dir = str(yaml_config["DATA_DIR"])
csv_sep = str(yaml_config["CSV_SEP"])
event_log_dir = str(yaml_config["EVENT_LOG_DIR"])
event_log_file = str(yaml_config["FILE_EVENT_LOG"]) # main event log file (with LANG to choose)
event_log_text_date = str(yaml_config["FILE_BID_TEXT_DATE"])  # text dates (with LANG to choose)
year_start = int(yaml_config["YEAR_START"])
year_end = int(yaml_config["YEAR_END"])

# INPUT
dic_lan = {"DE":0, "ES":0, "FR":0, "IT":0, "PT":1} # <-- INPUT: set 1 for the desired language; set only one language at time
model_name = "OAI" # <-- INPUT [OAI, HFC]
dic_log_type = {"case_id":"object", "event":"object", "timestamp":"object", "t_type":"object", "amount":"float", "electronic":"object", "framework_agr":"object", "nuts":"object", "country":"object", "cpv_division":"object", "cpv":"object", "case_len":"int"}
dic_dates_type = {"file_name":object, "case_id":object, "text":object, "text_date":object, "text_date_llm":object}
event_new = str(yaml_config["BID_OPENING_EVENT"])

# OUTPUT
log_suffix = "enr"

## FUNCTIONS

## MAIN

In [146]:
### MAIN ###
print()
print("*** PROGRAM START ***")
print()

start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time))
print()


*** PROGRAM START ***

Start process: 2024-09-05 15:27:22



In [147]:
print(">> Settings")
lang_code = None
key_with_value_1 = [key for key, value in dic_lan.items() if value == 1]
if key_with_value_1: 
    lang_code = key_with_value_1[0]
print("Desired language code for LLM:", lang_code)

# INPUT
# Initial event log
event_log_file_lang = event_log_file.replace("LANG", lang_code).replace("YS", str(year_start)).replace("YE", str(year_end))
path_log_file_lang = Path(event_log_dir) / event_log_file_lang
print("File input:", path_log_file_lang)

# Dates extracted
event_log_text_date_lang = event_log_text_date.replace("LANG", lang_code)
path_log_text_date_lang = Path(data_dir) / f"{Path(event_log_text_date_lang).stem}_{model_name}.csv"
print("File input with new dates:", path_log_text_date_lang)

# OUTPUT
# Event log enriched
event_log_file_lang_enr = f"{Path(event_log_file_lang).stem}_{log_suffix}.csv"
path_log_file_lang_enr = Path(data_dir) / event_log_file_lang_enr
print("File output:", path_log_file_lang_enr)

>> Settings
Desired language code for LLM: PT
File input: /Volumes/SAMSUNG-PHD/PhD/Corsi da seguire/Knowledge management and information extraction from structured and unstructured data for process mining (techniques, algorithms, and tools)/Python - GUUE in CSV per LOG/data_log/TED_log_2016-2022_PT.csv
File input with new dates: data/bid_opening_text_PT_date_llm_OAI.csv
File output: data/TED_log_2016-2022_PT_enr.csv


In [148]:
# Reading CSV file text and dates to be extracted by LLM
print(">> Reading CSV input file")
print("Path:", str(path_log_file_lang))
df_event_log = read_csv_data_to_df(path_log_file_lang, dic_log_type, csv_sep)
df_event_log_len = len(df_event_log)
print("Rows in dataframe:", df_event_log_len)
print("Columns in dataframe:", df_event_log.columns)
print("Cases in event log:", df_event_log["case_id"].nunique())

>> Reading CSV input file
Path: /Volumes/SAMSUNG-PHD/PhD/Corsi da seguire/Knowledge management and information extraction from structured and unstructured data for process mining (techniques, algorithms, and tools)/Python - GUUE in CSV per LOG/data_log/TED_log_2016-2022_PT.csv
Reading CSV with input col_type: {'case_id': 'object', 'event': 'object', 'timestamp': 'object', 't_type': 'object', 'amount': 'float', 'electronic': 'object', 'framework_agr': 'object', 'nuts': 'object', 'country': 'object', 'cpv_division': 'object', 'cpv': 'object', 'case_len': 'int'}
Rows in dataframe: 7286
Columns in dataframe: Index(['case_id', 'event', 'timestamp', 't_type', 'amount', 'electronic',
       'framework_agr', 'nuts', 'country', 'cpv_division', 'cpv', 'case_len'],
      dtype='object')
Cases in event log: 2333


In [149]:
df_event_log.head(5)

Unnamed: 0,case_id,event,timestamp,t_type,amount,electronic,framework_agr,nuts,country,cpv_division,cpv,case_len
0,2016101188,PUBLICATION,2016-03-18,U,,,N,,PT,9,9310000,3
1,2016101188,PARTICIPATION,2016-05-09,U,,,N,,PT,9,9310000,3
2,2016101188,AWARD,2016-06-24,U,,,N,,PT,9,9310000,3
3,2016101809,PUBLICATION,2016-03-18,S,,,N,,PT,90,90610000,3
4,2016101809,PARTICIPATION,2016-05-09,S,,,N,,PT,90,90610000,3


In [150]:
# Count the number of valid dates (other than 0) 
print(">> Reading CSV text extracted dates file")
print("Path:", str(path_log_file_lang))
df_bid_text = read_csv_data_to_df(path_log_text_date_lang, dic_dates_type, csv_sep)
df_bid_text = df_bid_text[df_bid_text['case_id'].isin(df_event_log['case_id'])]
df_bid_text_len = len(df_bid_text)
df_bid_text_valid = df_bid_text[df_bid_text["text_date"]!="0"]
df_bid_text_valid_len = len(df_bid_text_valid)
print(f"Number of not empty dates ({lang_code}): {df_bid_text_valid_len} / {df_bid_text_len}")
print("Cases in text dates", df_bid_text["case_id"].nunique())
print("Valid cases in text dates", df_bid_text_valid["case_id"].nunique())

>> Reading CSV text extracted dates file
Path: /Volumes/SAMSUNG-PHD/PhD/Corsi da seguire/Knowledge management and information extraction from structured and unstructured data for process mining (techniques, algorithms, and tools)/Python - GUUE in CSV per LOG/data_log/TED_log_2016-2022_PT.csv
Reading CSV with input col_type: {'file_name': <class 'object'>, 'case_id': <class 'object'>, 'text': <class 'object'>, 'text_date': <class 'object'>, 'text_date_llm': <class 'object'>}
Number of not empty dates (PT): 2164 / 2333
Cases in text dates 2333
Valid cases in text dates 2164


In [151]:
df_bid_text_valid.head(5)

Unnamed: 0,file_name,case_id,text,text_date,text_date_llm
10,2016-OJS010-00012575-pt-ts.pdf,201612575,Data: 1.3.2016 - 10:00 | Local: Secretaria Reg...,Data: 1.3.2016 - 10:00,2016-03-01
11,2016-OJS010-00012614-pt-ts.pdf,201612614,Data: 15.3.2016 - 10:30 |,Data: 15.3.2016 - 10:30,2016-03-15
20,2016-OJS018-00027331-pt-ts.pdf,201627331,Data: 16.3.2016 - 10:00 |,Data: 16.3.2016 - 10:00,2016-03-16
26,2016-OJS022-00035201-pt-ts.pdf,201635201,Data: 11.3.2016 - 14:00 | Local: Município de ...,Data: 11.3.2016 - 14:00,2016-03-11
31,2016-OJS026-00042162-pt-ts.pdf,201642162,Data: 23.3.2016 - 9:00 | Local: | http://www.a...,Data: 23.3.2016 - 9:00,2016-03-23


In [152]:
# Enriching the event log
print(">> Enriching the event log")

# Step 1: Remove duplicates from df_bid_text_valid to ensure each case_id is unique
df_bid_text_valid_unique = df_bid_text_valid.drop_duplicates(subset=['case_id'])

# Step 2: Find the unique case_ids that are in both dataframes
common_case_ids = df_event_log['case_id'].unique()
df_common = df_bid_text_valid_unique[df_bid_text_valid_unique['case_id'].isin(common_case_ids)]

# Step 3: Create a list to store new rows
new_rows = []

# Step 4: Iterate over each common case_id to create one new row per case_id
i = 1
for idx, row in df_common.iterrows():
    print(f"Row {i}")
    case_id = row['case_id']
    text_date_llm = row['text_date_llm']
    
    # Find the first occurrence of the current case_id in df_event_log
    existing_values = df_event_log[df_event_log['case_id'] == case_id].iloc[0]
    
    # Create a new row with the existing values and the date from df_bid_text_valid
    new_row = {
        'case_id': case_id,
        'event': 'BID-OPENING',
        'timestamp': text_date_llm,
        't_type': existing_values['t_type'],
        'amount': existing_values['amount'],
        'electronic': existing_values['electronic'],
        'framework_agr': existing_values['framework_agr'],
        'nuts': existing_values['nuts'],
        'country': existing_values['country'],
        'cpv_division': existing_values['cpv_division'],
        'cpv': existing_values['cpv'],
        'case_len': existing_values['case_len']
    }
    
    # Add the new row to the list
    new_rows.append(new_row)
    i+=1

# Step 5: Convert the list of new rows into a DataFrame
new_rows_df = pd.DataFrame(new_rows)

# Step 6: Append the new rows to the df_event_log dataframe
df_event_log = pd.concat([df_event_log, new_rows_df], ignore_index=True)

# Step 7: Drop any duplicates after the new rows have been added
df_event_log.drop_duplicates(subset=['case_id', 'event', 'timestamp'], inplace=True)

# Step 8: Sort the dataframe by 'case_id' and 'timestamp'
df_event_log.sort_values(by=['case_id', 'timestamp'], inplace=True)

# Step 9: Reset the index of the dataframe after sorting
df_event_log.reset_index(drop=True, inplace=True)

# Check
print("Cases in dataframe enriched:", df_event_log["case_id"].nunique())
print("Rows in dataframe enriched:", len(df_event_log))
print("Columns in dataframe enriched:", df_event_log.columns)

>> Enriching the event log
Row 1
Row 2
Row 3
Row 4
Row 5
Row 6
Row 7
Row 8
Row 9
Row 10
Row 11
Row 12
Row 13
Row 14
Row 15
Row 16
Row 17
Row 18
Row 19
Row 20
Row 21
Row 22
Row 23
Row 24
Row 25
Row 26
Row 27
Row 28
Row 29
Row 30
Row 31
Row 32
Row 33
Row 34
Row 35
Row 36
Row 37
Row 38
Row 39
Row 40
Row 41
Row 42
Row 43
Row 44
Row 45
Row 46
Row 47
Row 48
Row 49
Row 50
Row 51
Row 52
Row 53
Row 54
Row 55
Row 56
Row 57
Row 58
Row 59
Row 60
Row 61
Row 62
Row 63
Row 64
Row 65
Row 66
Row 67
Row 68
Row 69
Row 70
Row 71
Row 72
Row 73
Row 74
Row 75
Row 76
Row 77
Row 78
Row 79
Row 80
Row 81
Row 82
Row 83
Row 84
Row 85
Row 86
Row 87
Row 88
Row 89
Row 90
Row 91
Row 92
Row 93
Row 94
Row 95
Row 96
Row 97
Row 98
Row 99
Row 100
Row 101
Row 102
Row 103
Row 104
Row 105
Row 106
Row 107
Row 108
Row 109
Row 110
Row 111
Row 112
Row 113
Row 114
Row 115
Row 116
Row 117
Row 118
Row 119
Row 120
Row 121
Row 122
Row 123
Row 124
Row 125
Row 126
Row 127
Row 128
Row 129
Row 130
Row 131
Row 132
Row 133
Row 134
Row 135
R

In [153]:
df_event_log.head(5)

Unnamed: 0,case_id,event,timestamp,t_type,amount,electronic,framework_agr,nuts,country,cpv_division,cpv,case_len
0,2016101188,PUBLICATION,2016-03-18,U,,,N,,PT,9,9310000,3
1,2016101188,PARTICIPATION,2016-05-09,U,,,N,,PT,9,9310000,3
2,2016101188,AWARD,2016-06-24,U,,,N,,PT,9,9310000,3
3,2016101809,PUBLICATION,2016-03-18,S,,,N,,PT,90,90610000,3
4,2016101809,PARTICIPATION,2016-05-09,S,,,N,,PT,90,90610000,3


In [154]:
# Create a file with the texts extracted from the PDFs
print(">> Saving event log enriched")
print("Path:", path_log_file_lang_enr)
df_event_log.to_csv(path_log_file_lang_enr, sep=csv_sep, index=False, quoting=csv.QUOTE_ALL)
print()

>> Saving event log enriched
Path: data/TED_log_2016-2022_PT_enr.csv



In [155]:
# program end
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print()
print("End process:", end_time)
print("Time to finish:", delta_time)
print()

print()
print("*** PROGRAM END ***")
print()


End process: 2024-09-05 15:27:24
Time to finish: 0:00:02


*** PROGRAM END ***

