In [33]:
# 03_log_creation
# Transforms data into an event log
# Events
# ID_NOTICE_CN -> case-id
# DT_DISPATCH (date) -> PUBLICATION
# DT_APPLICATIONS (date) -> PARTICIPATION
# DT_AWARD (date) -> AWARD
# CONTRACT_START (date) -> CONTRACT-START
# CONTRACT_COMPLETION (date) -> CONTRACT-END

In [34]:
# Force a reload of external modules on every cell execution
%reload_ext autoreload
%autoreload 2

In [35]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd

In [36]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import read_csv_data, fix_date, json_data_to_list_dict, dic_get_years

In [37]:
### GLOBALS ###
# All settings come from config.yml, loaded through config_reader.
yaml_config = config_reader.config_read_yaml("config.yml", "config")

# Directories
data_dir = str(yaml_config["DATA_DIR"])
log_dir = str(yaml_config["LOG_DIR"])

# Input files (the file names are templates containing the YS / YE year placeholders)
ted_cfc_file = str(yaml_config["TED_CFC_FILE"])
ted_can_file = str(yaml_config["TED_CAN_FILE"])
ted_config_file = str(yaml_config["TED_CONFIG_FILE"])  # filter configuration

# Column -> type maps handed to read_csv_data
dic_types_cfc = dict(yaml_config["TED_CFC_TYPES"])
dic_types_can = dict(yaml_config["TED_CAN_TYPES"])

# Columns that make up the event log: event columns first, then trace attributes
list_ted_log_events = list(yaml_config["TED_LOG_EVENTS"])
list_ted_log_attributes = list(yaml_config["TED_LOG_ATTRIBUTES"])

# Output files
ted_join_file = str(yaml_config["TED_JOIN_FILE"])
log_file = str(yaml_config["LOG_FILE"])

In [38]:
### MAIN ###
# Banner, then record the start time (truncated to whole seconds) so the
# final cell can report the elapsed time.
print()
print("*** PROGRAM START ***")
print()

start_time = datetime.now().replace(microsecond=0)
print("Start process: ", start_time)
print()


*** PROGRAM START ***

Start process:  2024-05-28 10:38:07



In [39]:
# Load the filter list from the JSON configuration file
print(">> Filters configuration")
print("Configuration file:", ted_config_file)

list_filters = json_data_to_list_dict(ted_config_file)
print("Configuration list:", list_filters)

# The 'YEAR' entry of the filter list gives the first and last year of the data;
# both values are used below to resolve the YS / YE placeholders in file names.
min_year, max_year = dic_get_years(list_filters, 'YEAR')

>> Filters configuration
Configuration file: ted_config.json
Configuration list: [{'CAE_TYPE': ['3']}, {'ISO_COUNTRY_CODE': ['IT', 'FR', 'ES', 'DE']}, {'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]}]
Minimum value for YEAR: 2016
Maximum value for YEAR: 2022


In [40]:
# Reads the CFC and CAN CSV files
print(">> Reading CFC and CAN files")

# CFC: resolve the year placeholders in the file name, then load
cfc_file_name = ted_cfc_file.replace("YS", str(min_year)).replace("YE", str(max_year))
path_cfc_file = Path(data_dir) / cfc_file_name
print("Reading:", str(path_cfc_file))
df_cfc = read_csv_data(path_cfc_file, dic_types_cfc, ";")
df_cfc_len = len(df_cfc)
print("Dataframe CFC length:", df_cfc_len)

# CAN: same steps with the CAN file name and type map
can_file_name = ted_can_file.replace("YS", str(min_year)).replace("YE", str(max_year))
path_can_file = Path(data_dir) / can_file_name
print("Reading:", str(path_can_file))
df_can = read_csv_data(path_can_file, dic_types_can, ";")
df_can_len = len(df_can)
print("Dataframe CAN length:", df_can_len)

print()

>> Reading CFC and CAN files
Reading: data/TED_CFC_2016-2022.csv
Dataframe CFC length: 811398
Reading: data/TED_CAN_2016-2022.csv
Dataframe CAN length: 730216



In [41]:
# Join the CFC and CAN dataframes
# The two datasets are linked by CFC.FUTURE_CAN_ID = CAN.ID_NOTICE_CAN. Note: the CAN of a CFC may be missing, due to the tender (CFC.ID_NOTICE_CN) not having been awarded.
# The join is 'inner', so CFC rows without a matching CAN are not part of the result.
print(">> Joining CFC and CAN dataframes")

path_out = Path(data_dir) / ted_join_file.replace("YS", str(min_year)).replace("YE", str(max_year))

if path_out.exists():
    print("The CFC and CAN merged file already exists:", path_out)
    print(f"If you wish to recreate it, delete the file '{path_out}'")
    # Load the cached join so that df_join is always defined for the cells below
    # (without this, a fresh kernel with an existing file fails later with NameError).
    # For columns present in both type maps the CFC type wins, matching the columns kept by the join.
    # NOTE(review): assumes read_csv_data accepts the combined CFC/CAN type map for the joined file - confirm.
    df_join = read_csv_data(path_out, {**dic_types_can, **dic_types_cfc}, ";")
    print("Joint dataframe shape:", df_join.shape)
else:
    # Join; columns present in both inputs get the suffixes '_x' (CFC) and '_y' (CAN)
    df_join = pd.merge(left=df_cfc, right=df_can, left_on='FUTURE_CAN_ID', right_on='ID_NOTICE_CAN', how='inner', suffixes=('_x', '_y'))

    # Drop the CAN copy ('_y') of the shared columns
    cols_to_drop = [col for col in df_join.columns if col.endswith('_y')]
    df_join = df_join.drop(columns=cols_to_drop)
    # Restore the original name of the CFC copy ('_x').
    # The suffix is sliced off: str.rstrip('_x') would strip any trailing '_' / 'x'
    # characters and corrupt names such as 'TAX_x' -> 'TA'.
    rename_dict = {col: col[:-len('_x')] for col in df_join.columns if col.endswith('_x')}
    df_join = df_join.rename(columns=rename_dict)

    # Output and Save
    print("Joint dataframe shape:", df_join.shape)
    df_join.to_csv(path_out, sep=";", index=False, quoting=csv.QUOTE_NONNUMERIC)
    print("Data saved to:", str(path_out))
print()

>> Joining CFC and CAN dataframes
Joint dataframe shape: (8550053, 96)
Data saved to: data/TED_CFC_CAN_2016-2022.csv



In [42]:
# Preparing the output
print(">> Preparing the output")
out_dir = Path(log_dir)
print("Event log directory:", str(out_dir))
# parents=True: also create missing parent directories, so a nested LOG_DIR
# (e.g. "output/logs") does not fail with FileNotFoundError.
out_dir.mkdir(parents=True, exist_ok=True)
print()

>> Preparing the output
Event log directory: data_log



In [43]:
# Columns of interest for the event log: the event columns followed by the trace attributes
print(">> Creating the event log")
print("Features used as trace event:", list_ted_log_events)
print("Features used as trace attribute:", list_ted_log_attributes)

list_ted_log_features = [*list_ted_log_events, *list_ted_log_attributes]
print("Features (all):", list_ted_log_features)
print()

>> Creating the event log
Features used as trace event: ['ID_NOTICE_CN', 'DT_DISPATCH', 'CONTRACT_START', 'CONTRACT_COMPLETION', 'DT_APPLICATIONS', 'DT_AWARD']
Features used as trace attribute: ['TYPE_OF_CONTRACT', 'VALUE_EURO', 'B_ELECTRONIC_AUCTION', 'B_FRA_AGREEMENT', 'TAL_LOCATION_NUTS', 'CPV', 'ISO_COUNTRY_CODE']
Features (all): ['ID_NOTICE_CN', 'DT_DISPATCH', 'CONTRACT_START', 'CONTRACT_COMPLETION', 'DT_APPLICATIONS', 'DT_AWARD', 'TYPE_OF_CONTRACT', 'VALUE_EURO', 'B_ELECTRONIC_AUCTION', 'B_FRA_AGREEMENT', 'TAL_LOCATION_NUTS', 'CPV', 'ISO_COUNTRY_CODE']



In [44]:
# Restrict the joined dataframe to the columns listed in list_ted_log_features
print("> Starting dataframe")
df_ted_log = df_join.loc[:, list_ted_log_features]
print("Columns considered:", df_ted_log.columns)

> Starting dataframe
Columns considered: Index(['ID_NOTICE_CN', 'DT_DISPATCH', 'CONTRACT_START', 'CONTRACT_COMPLETION',
       'DT_APPLICATIONS', 'DT_AWARD', 'TYPE_OF_CONTRACT', 'VALUE_EURO',
       'B_ELECTRONIC_AUCTION', 'B_FRA_AGREEMENT', 'TAL_LOCATION_NUTS', 'CPV',
       'ISO_COUNTRY_CODE'],
      dtype='object')


In [45]:
# Show the distinct values of the categorical trace attributes
print("> Checking distinct feature values")
for feature in ("B_ELECTRONIC_AUCTION", "B_FRA_AGREEMENT", "TYPE_OF_CONTRACT"):
    print(f"{feature}:", df_ted_log[feature].unique())
print()

> Checking distinct feature values
B_ELECTRONIC_AUCTION: ['N' nan 'Y']
B_FRA_AGREEMENT: ['N' 'Y']
TYPE_OF_CONTRACT: ['U' 'S' 'W']



In [51]:
print("> Creating the event log")

# Removes rows with ID_NOTICE_CN null (ID_NOTICE_CN is the case-id of the trace)
df = df_ted_log[df_ted_log['ID_NOTICE_CN'].notna()]

# Event log header: columns of the output file, in output order
event_log_columns = ['case_id', 'event', 'timestamp', 't_type', 'amount', 'electronic', 'framework_agr', 'nuts', 'country', 'cpv_division', 'cpv']

# Source column -> event log column, for the values copied unchanged
attribute_rename = {
    'ID_NOTICE_CN': 'case_id',
    'TYPE_OF_CONTRACT': 't_type',
    'B_ELECTRONIC_AUCTION': 'electronic',
    'B_FRA_AGREEMENT': 'framework_agr',
    'TAL_LOCATION_NUTS': 'nuts',
    'ISO_COUNTRY_CODE': 'country',
    'CPV': 'cpv',
}

# Events (event name, column feature); the column feature contains the timestamp,
# so the event has occurred for every row where that column is not empty
list_event_sources = [
    ('PUBLICATION', 'DT_DISPATCH'),
    ('PARTICIPATION', 'DT_APPLICATIONS'),
    ('AWARD', 'DT_AWARD'),
    ('CONTRACT-START', 'CONTRACT_START'),
    ('CONTRACT-END', 'CONTRACT_COMPLETION'),
]

# Build one frame per event and concatenate them once at the end.
# (Appending to an initially empty frame inside the loop triggers the pandas
# FutureWarning about concatenating empty entries and re-copies the log each time.)
list_event_frames = []
for event_name, date_col in list_event_sources:
    temp_df = df[df[date_col].notna()]
    event_frame = (
        temp_df
        .assign(
            event=event_name,
            timestamp=temp_df[date_col].apply(fix_date),
            amount=temp_df['VALUE_EURO'].astype(float),
            cpv_division=temp_df['CPV'].str[:2],  # first two characters of the CPV code
        )
        .rename(columns=attribute_rename)
    )
    # Keep only the event log columns, in the output order
    list_event_frames.append(event_frame[event_log_columns])

event_log = pd.concat(list_event_frames, ignore_index=True)
event_log = event_log.drop_duplicates()

print("> Ordering the event log")
# Order by 'case_id' and 'timestamp'
event_log = event_log.sort_values(by=['case_id', 'timestamp'])

print("> Saving the event log")
# Resolve the year placeholders; the result is kept in log_file because the
# per-country split reuses it. Re-running is harmless once they are replaced.
log_file = log_file.replace("YS", str(min_year)).replace("YE", str(max_year))
path_out = Path(log_dir) / log_file
path_out.parent.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists
print("Saving event log to:", str(path_out))
event_log.to_csv(path_out, index=False, sep=';')

> Creating the event log
> Ordering the event log
> Saving the event log
Saving event log to: data_log/TED_log_2016-2022.csv


In [52]:
print(">> Splitting the event log by country")
# Get the countries; rows without a country are skipped, otherwise an empty '<name>_nan.csv' would be written
list_countries = list(event_log["country"].dropna().unique())
print("Available countries:",list_countries)

# Base name of the per-country files: the event log file name without its extension.
# with_suffix("") removes only the last extension, so dots elsewhere in the name
# (e.g. "log_v1.2.csv") are preserved; split('.')[0] would cut at the first dot.
# The year placeholders are resolved here as well, so this cell does not depend on
# log_file having already been rewritten.
log_file_base = Path(log_file.replace("YS", str(min_year)).replace("YE", str(max_year))).with_suffix("")

for country in list_countries:
    df_country = event_log[event_log["country"]==country]
    log_file_name = f"{log_file_base}_{country}.csv"
    path_out = Path(log_dir) / log_file_name
    print(f"Saving event log of country '{country}' to:", str(path_out))
    df_country.to_csv(path_out, index=False, sep=';')

>> Splitting the event log by country
Available countries: ['ES', 'DE', 'FR', 'IT']
Saving event log of country 'ES' to: data_log/TED_log_2016-2022_ES.csv
Saving event log of country 'DE' to: data_log/TED_log_2016-2022_DE.csv
Saving event log of country 'FR' to: data_log/TED_log_2016-2022_FR.csv
Saving event log of country 'IT' to: data_log/TED_log_2016-2022_IT.csv


In [53]:
# Report the end time and the elapsed time since start_time (whole seconds)
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print()
print("End process:", end_time)
print("Time to finish:", delta_time)
print()

# Closing banner
print()
print("*** PROGRAM END ***")
print()


End process: 2024-05-28 10:49:18
Time to finish: 0:11:11


*** PROGRAM END ***

