In [1]:
# 01_read_opendata.ipynb
# Reads and filters raw Open Data.
# CFC: Call For Competition (step 1/2)  
# CAN: Contract Award Notices (step 2/2)

In [2]:
# Force external modules to be reloaded on every cell execution
%reload_ext autoreload
%autoreload 2

In [2]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd

In [4]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import list_files_by_type, read_csv_data, data_schema

In [5]:
### GLOBALS ###
# Settings are read once through the project's config_reader helper
# (arguments: the YAML file name and, presumably, the section key - confirm
# in config_reader). Every value is coerced to str/list right away.
yaml_config = config_reader.config_read_yaml("config.yml", "config")

# Input directory (raw Open Data files) and output directory
od_dir, data_dir = (str(yaml_config[key]) for key in ("OD_DIR", "DATA_DIR"))

# Output file names for the CFC / CAN data
ted_cfc_file, ted_can_file = (
    str(yaml_config[key]) for key in ("TED_CFC_FILE", "TED_CAN_FILE")
)

# Output file names for the CFC / CAN schemas
ted_cfc_schema_file, ted_can_schema_file = (
    str(yaml_config[key]) for key in ("TED_CFC_SCHEMA_FILE", "TED_CAN_SCHEMA_FILE")
)

# Row filters: a column name plus the list of values to keep in that column
ted_country_codes_feature = str(yaml_config["COUNTRY_CODES_FEATURE"])
ted_country_codes_values = list(yaml_config["COUNTRY_CODES_VALUES"])
ted_cae_codes_feature = str(yaml_config["CA_CODES_FEATURE"])
ted_cae_codes_values = list(yaml_config["CA_CODES_VALUES"])

In [6]:
### MAIN ###
# Banner surrounded by blank lines
print("", "*** PROGRAM START ***", "", sep="\n")

# Start timestamp truncated to whole seconds; the final cell uses it to
# compute the elapsed time.
start_time = datetime.now().replace(microsecond=0)
print(f"Start process: {start_time}")
print()


*** PROGRAM START ***

Start process: 2024-05-06 22:53:19



In [7]:
# Create the list of CSV files found in the Open Data directory
print(">> Listing OD files")
print("Directory:", od_dir)
list_csv_files = list_files_by_type(od_dir, "csv")
list_csv_files_len = len(list_csv_files)  # reused by the parsing cell for the progress counter
print("Files found:", list_csv_files_len)
print()

# Report every filter that the parsing cell applies: rows are kept only when
# they match BOTH the country filter and the contracting-authority filter.
print(">> Filters")
for feature_name, feature_values in (
    (ted_country_codes_feature, ted_country_codes_values),
    (ted_cae_codes_feature, ted_cae_codes_values),
):
    print("Feature name:", feature_name)
    print("Feature values:", feature_values)
print()

>> Reading OD files
Directory: opendata
Files found: 6

>> Filters
Feature name: ISO_COUNTRY_CODE
Feature values: ['ES', 'IT', 'FR']



In [8]:
# Reads every raw CSV file, keeps only the rows matching the configured
# filters and collects the filtered DataFrames per notice type.
# The two notice types are related by CFC.FUTURE_CAN_ID = CAN.ID_NOTICE_CAN,
# but they are NOT joined here: each list is concatenated by the output cell.
print(">> Parsing OD files")

# Columns not to be transformed into numbers (passed to read_csv_data)
dic_types_cfc = {'ID_NOTICE_CN':object, 'FUTURE_CAN_ID':object, 'FUTURE_CAN_ID_ESTIMATED':object, 'CPV':object, 'CAE_TYPE':object}
dic_types_can = {'ID_NOTICE_CAN':object, 'ID_AWARD':object, 'ID_LOT_AWARDED':object, 'CPV':object, 'CAE_TYPE': object}

# Reset on every run so that re-executing the cell does not duplicate data
list_cfc = []
list_can = []


def _read_and_filter(csv_path, dic_types, label):
    """Read one raw file and return only the rows that pass both filters.

    Args:
        csv_path: file to read; its ``name`` attribute is used for logging.
        dic_types: column -> type mapping handed to ``read_csv_data``.
        label: notice type shown in the log line ("CFC" or "CAN").

    Returns:
        The DataFrame restricted to rows whose country column is in
        ``ted_country_codes_values`` and whose contracting-authority column
        is in ``ted_cae_codes_values``.
    """
    print(f"Reading {label} file:", csv_path.name)
    df = read_csv_data(csv_path, dic_types)
    keep_rows = (
        df[ted_country_codes_feature].isin(ted_country_codes_values)
        & df[ted_cae_codes_feature].isin(ted_cae_codes_values)
    )
    df_filtered = df[keep_rows]
    print("Dataframe length (complete):", len(df))
    print("Dataframe length (filtered):", len(df_filtered))
    return df_filtered


for i, csv_file in enumerate(list_csv_files, start=1):
    print(f"[{i} / {list_csv_files_len}]")
    # The notice type is inferred from the file name. The branches are
    # mutually exclusive so that a file is never read and collected twice.
    if "CFC" in csv_file.name:
        list_cfc.append(_read_and_filter(csv_file, dic_types_cfc, "CFC"))
    elif "CAN" in csv_file.name:
        list_can.append(_read_and_filter(csv_file, dic_types_can, "CAN"))
    else:
        # Make ignored files visible instead of skipping them silently
        print("Skipping file (neither CFC nor CAN):", csv_file.name)
print()

>> Parsing OD files
[1 / 6]
Reading CAN file: Export_OpenDataCAN_year2020.csv
Dataframe length (complete): 1070272
Dataframe length (filtered): 71326
[2 / 6]
Reading CAN file: Export_OpenDataCAN_year2021.csv
Dataframe length (complete): 1162663
Dataframe length (filtered): 88128
[3 / 6]
Reading CAN file: Export_OpenDataCAN_year2022.csv
Dataframe length (complete): 1071826
Dataframe length (filtered): 92823
[4 / 6]
Reading CFC file: Export_OpenDataCFC_year2020.csv
Dataframe length (complete): 1250856
Dataframe length (filtered): 104709
[5 / 6]
Reading CFC file: Export_OpenDataCFC_year2021.csv
Dataframe length (complete): 1242338
Dataframe length (filtered): 103501
[6 / 6]
Reading CFC file: Export_OpenDataCFC_year2022.csv
Dataframe length (complete): 1067540
Dataframe length (filtered): 90561



In [9]:
# Output the filtered data: for each notice type (CFC, CAN) the per-file
# DataFrames are concatenated and saved together with their schema.


def _export_dataset(frames, label, target_dir, data_file, schema_file):
    """Concatenate ``frames`` and save the data and its schema in ``target_dir``.

    Args:
        frames: list of filtered DataFrames of one notice type.
        label: notice type shown in the log lines ("CFC" or "CAN").
        target_dir: ``Path`` of the (already existing) output directory.
        data_file: file name for the concatenated data.
        schema_file: file name for the schema produced by ``data_schema``.

    Returns:
        Tuple ``(concatenated DataFrame, schema)``.

    Raises:
        ValueError: if ``frames`` is empty, i.e. no file of this type was parsed.
    """
    print(f"> Creating {label} file")
    if not frames:
        # pd.concat would raise the same exception type, but with a generic
        # "No objects to concatenate" message that hides the real cause.
        raise ValueError(f"No {label} files were parsed: nothing to export")

    # Merges all dataframes in the list and saves to file
    df_all = pd.concat(frames, ignore_index=True)
    print(f"Final {label} length:", len(df_all))
    data_path = target_dir / data_file
    df_all.to_csv(data_path, sep=";", index=False, quoting=csv.QUOTE_NONNUMERIC)
    print("Data saved to:", str(data_path))

    # Get the schema
    df_schema = data_schema(df_all)
    schema_path = target_dir / schema_file
    df_schema.to_csv(schema_path, sep=";", index=False)
    print("Schema saved to:", str(schema_path))
    print()
    return df_all, df_schema


print(">> Preparing output")
out_dir = Path(data_dir)
# parents=True so that a nested DATA_DIR (e.g. "out/data") is created as well
out_dir.mkdir(parents=True, exist_ok=True)
print()

df_cfc, df_cfc_schema = _export_dataset(list_cfc, "CFC", out_dir, ted_cfc_file, ted_cfc_schema_file)
df_cfc_len = len(df_cfc)

df_can, df_can_schema = _export_dataset(list_can, "CAN", out_dir, ted_can_file, ted_can_schema_file)
df_can_len = len(df_can)

>> Preparing output

> Creating CFC file
Final CFC length: 298771
Data saved to: data/TED_CFC_2020-2022.csv
Schema saved to: data/TED_CFC_schema.csv

> Creating CAN file
Final CAN length: 252277
Data saved to: data/TED_CAN_2020-2022.csv
Schema saved to: data/TED_CAN_schema.csv



In [10]:
# End timestamp truncated to whole seconds, like start_time, so that the
# elapsed time is displayed without a fractional part.
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

# Timing summary surrounded by blank lines
print("", f"End process: {end_time}", f"Time to finish: {delta_time}", "", sep="\n")

# Closing banner surrounded by blank lines
print("", "*** PROGRAM END ***", "", sep="\n")


End process: 2024-05-06 22:54:47
Time to finish: 0:01:28


*** PROGRAM END ***

