In [25]:
# 01_read_opendata
# Reads, merges and filters raw Open Data.
# CFC: Call For Competition (step 1/2)  
# CAN: Contract Award Notices (step 2/2)
# 2024-05-13: Added URL extraction to the texts of each CFC

In [26]:
# Force external modules to be reloaded on every cell execution
%reload_ext autoreload
%autoreload 2

In [27]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd

In [28]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import list_files_by_type, read_csv_data, data_schema, json_data_to_list_dict, df_filter, df_uniques, dic_get_years

In [29]:
### GLOBALS ###
# Settings are read once from "config.yml" through the project's config_reader
yaml_config = config_reader.config_read_yaml("config.yml", "config")

# Working directories
od_dir, data_dir, stats_dir = (
    str(yaml_config[key]) for key in ("OD_DIR", "DATA_DIR", "STATS_DIR")
)

# Inputs: the filter configuration file and the per-notice-type dictionaries
# that are later handed to read_csv_data
ted_config_file = str(yaml_config["TED_CONFIG_FILE"])
dic_types_cfc, dic_types_can = (
    dict(yaml_config[key]) for key in ("TED_CFC_TYPES", "TED_CAN_TYPES")
)

# Outputs: names of the files written by this notebook
(
    ted_cfc_schema_file,
    ted_can_schema_file,
    ted_urls_file,
    ted_cfc_file,
    ted_can_file,
) = (
    str(yaml_config[key])
    for key in (
        "TED_CFC_SCHEMA_FILE",
        "TED_CAN_SCHEMA_FILE",
        "TED_URLS_FILE",
        "TED_CFC_FILE",
        "TED_CAN_FILE",
    )
)

In [30]:
### FUNCTIONS ###
def generate_pdf_notice_url(row: pd.Series) -> "str | None":
    """
    Builds the URL of the PDF page of a TED notice from a single DataFrame row.

    The notice id is taken from the 'TED_NOTICE_URL' value: the text following the
    'uri=' marker is split on ':' and its third field is used as the id
    (presumably the URL looks like '...uri=TED:NOTICE:<id>:TEXT:<lang>:HTML' - TODO confirm).

    Parameters:
        row (pd.Series): A row of the DataFrame containing TED_NOTICE_URL and ISO_COUNTRY_CODE.

    Returns:
        str | None: 'https://ted.europa.eu/<iso country code, lowercase>/notice/<id>/pdfs',
        or None when ISO_COUNTRY_CODE is null, when TED_NOTICE_URL is missing / not a string,
        or when TED_NOTICE_URL does not have the expected format.
    """

    # Without a country code the URL cannot be built
    iso_country_code = row['ISO_COUNTRY_CODE']
    if pd.isnull(iso_country_code):
        return None

    # A missing URL shows up as None/NaN (not a string): treat it as "no URL" instead of crashing
    ted_notice_url = row['TED_NOTICE_URL']
    if not isinstance(ted_notice_url, str):
        return None

    # Extract the uri part from the URL; without the 'uri=' marker the format is not the expected one
    url_pieces = ted_notice_url.split('uri=')
    if len(url_pieces) < 2:
        return None

    # Split the uri part to extract the ID: at least four ':'-separated fields are required
    parts = url_pieces[1].split(':')
    if len(parts) <= 3:
        return None

    notice_id = parts[2]  # Third field holds the notice id
    return f"https://ted.europa.eu/{iso_country_code.lower()}/notice/{notice_id}/pdfs"

def add_pdf_notice_url_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes one PDF notice URL per row and stores it in the 'PDF_NOTICE_URL' column.

    The column is written on the DataFrame received as argument (the input is modified
    in place) and that same object is handed back to the caller.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing 'TED_NOTICE_URL' and 'ISO_COUNTRY_CODE' columns.

    Returns:
        pd.DataFrame: The same DataFrame, now including the 'PDF_NOTICE_URL' column.
    """
    # Row-wise evaluation: each row is passed to generate_pdf_notice_url as a Series
    pdf_urls = df.apply(generate_pdf_notice_url, axis=1)
    df['PDF_NOTICE_URL'] = pdf_urls
    return df

In [31]:
### MAIN ###
print("\n*** PROGRAM START ***\n")

# Start timestamp (seconds precision); the final cell uses it to compute the elapsed time
start_time = datetime.now().replace(microsecond=0)
print(f"Start process: {start_time}\n")


*** PROGRAM START ***

Start process: 2024-05-28 10:24:09



In [32]:
# Collect the CSV files found in the Open Data directory
print(">> Listing OD files")
print(f"Directory: {od_dir}")
list_csv_files = list_files_by_type(od_dir, "csv")
list_csv_files_len = len(list_csv_files)
print(f"Files found: {list_csv_files_len}")
print()

>> Listing OD files
Directory: opendata
Files found: 14



In [33]:
# Load the filters (a list of dictionaries) from the JSON configuration file
print(">> Filters configuration")
print(f"Configuration file: {ted_config_file}")
list_filters = json_data_to_list_dict(ted_config_file)
print(f"Configuration list: {list_filters}")
# First and last year of the 'YEAR' filter
min_year, max_year = dic_get_years(list_filters, 'YEAR')

>> Filters configuration
Configuration file: ted_config.json
Configuration list: [{'CAE_TYPE': ['3']}, {'ISO_COUNTRY_CODE': ['IT', 'FR', 'ES', 'DE']}, {'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]}]
Minimum value for YEAR: 2016
Maximum value for YEAR: 2022


In [34]:
# Reads every raw OD file, applies the configured filters and collects the result by notice type.
# NOTE: no CFC/CAN join is performed in this cell (the link noted originally is
# CFC.FUTURE_CAN_ID = CAN.ID_NOTICE_CAN); the dataframes are only read, filtered and stored.
print(">> Parsing OD files")
list_cfc = [] # It will contain all dataframes of type CFC
list_can = [] # It will contain all dataframes of type CAN
# One entry per notice type: (tag looked up in the file name, dictionary passed to read_csv_data, target list)
notice_kinds = [
    ("CFC", dic_types_cfc, list_cfc),
    ("CAN", dic_types_can, list_can),
]
for i, csv_file in enumerate(list_csv_files, start=1):
    print(f"[{i} / {list_csv_files_len}]")
    for notice_tag, dic_types, list_target in notice_kinds:
        if notice_tag not in csv_file.name:
            continue
        print(f"Reading {notice_tag} file:", csv_file.name)
        df = read_csv_data(csv_file, dic_types)
        df_len = len(df)
        print("Dataframe length (complete):", df_len)
        # filters
        df_filtered = df_filter(df, list_filters)
        # The filtered length is computed for every notice type: it used to be missing for CFC,
        # which printed the value of a previous file (or failed with NameError on a fresh kernel)
        df_filtered_len = len(df_filtered)
        print("Dataframe length (filtered):", df_filtered_len)
        list_target.append(df_filtered)
print()

>> Parsing OD files
[1 / 14]
Reading CAN file: Export_OpenDataCAN_year2016.csv
Dataframe length (complete): 556084
Filtering for: {'CAE_TYPE': ['3']} [OK]
Filtering for: {'ISO_COUNTRY_CODE': ['IT', 'FR', 'ES', 'DE']} [OK]
Filtering for: {'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]} [OK]
Dataframe length (filtered): 66737
[2 / 14]
Reading CAN file: Export_OpenDataCAN_year2017.csv
Dataframe length (complete): 702824
Filtering for: {'CAE_TYPE': ['3']} [OK]
Filtering for: {'ISO_COUNTRY_CODE': ['IT', 'FR', 'ES', 'DE']} [OK]
Filtering for: {'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]} [OK]
Dataframe length (filtered): 82858
[3 / 14]
Reading CAN file: Export_OpenDataCAN_year2018.csv
Dataframe length (complete): 804040
Filtering for: {'CAE_TYPE': ['3']} [OK]
Filtering for: {'ISO_COUNTRY_CODE': ['IT', 'FR', 'ES', 'DE']} [OK]
Filtering for: {'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]} [OK]
Dataframe length (filtered): 98625
[4 / 14]
Reading CAN file: Export_OpenDataCAN_year

In [35]:
# Make sure the output directory exists before any file is written
print(">> Preparing output")
out_dir = Path(data_dir)
# parents=True also creates missing intermediate directories (a nested DATA_DIR would
# otherwise fail with FileNotFoundError); exist_ok=True makes re-running the cell harmless
out_dir.mkdir(parents=True, exist_ok=True)
print()

>> Preparing output



In [36]:
print(">> Creating unique CFC file")
# Stack every filtered CFC dataframe into a single one
df_cfc = pd.concat(list_cfc, ignore_index=True)
df_cfc_len = len(df_cfc)
print("Final CFC length:", df_cfc_len)

# Same CSV dialect for data and schema: ';' separated, no index, non-numeric values quoted
csv_options = {"sep": ";", "index": False, "quoting": csv.QUOTE_NONNUMERIC}

# Data file: the "YS" / "YE" placeholders of the configured name become the first / last filtered year
cfc_file_name = ted_cfc_file.replace("YS", str(min_year)).replace("YE", str(max_year))
path_out = Path(data_dir) / cfc_file_name
df_cfc.to_csv(path_out, **csv_options)
print("Data saved to:", str(path_out))

# Schema file: whatever data_schema reports for the merged dataframe
df_cfc_schema = data_schema(df_cfc)
path_out = Path(data_dir) / ted_cfc_schema_file
df_cfc_schema.to_csv(path_out, **csv_options)
print("Schema saved to:", str(path_out))
print()

>> Creating unique CFC file
Final CFC length: 811398
Data saved to: data/TED_CFC_2016-2022.csv
Schema saved to: data/TED_CFC_schema.csv



In [37]:
print(">> Creating unique CAN file")
# Merges all the filtered CAN dataframes in the list and saves the result to file
df_can = pd.concat(list_can, ignore_index=True)
df_can_len = len(df_can)
print("Final CAN length:", df_can_len)
# The "YS" / "YE" placeholders of the configured file name are replaced by the first / last filtered year
path_out = Path(data_dir) / ted_can_file.replace("YS", str(min_year)).replace("YE", str(max_year))
df_can.to_csv(path_out, sep=";", index=False, quoting=csv.QUOTE_NONNUMERIC)
print("Data saved to:", str(path_out))
# Get the schema
df_can_schema = data_schema(df_can)
path_out = Path(data_dir) / ted_can_schema_file
# Quote non-numeric values here too, so the CAN schema uses the same CSV dialect
# as the CFC schema and as both data files
df_can_schema.to_csv(path_out, sep=";", index=False, quoting=csv.QUOTE_NONNUMERIC)
print("Schema saved to:", str(path_out))
print()

>> Creating unique CAN file
Final CAN length: 730216
Data saved to: data/TED_CAN_2016-2022.csv
Schema saved to: data/TED_CAN_schema.csv



In [38]:
# Sanity check of the filters on the merged data
print(">> Checking filtered data")
# For each merged dataframe, report the unique values of the columns used by the filters
for frame_label, frame in (("Dataframe CFC", df_cfc), ("Dataframe CAN", df_can)):
    print(frame_label)
    df_uniques(frame, list_filters)
print()

>> Checking filtered data
Dataframe CFC
Unique values for column 'CAE_TYPE': ['3']
Unique values for column 'ISO_COUNTRY_CODE': ['ES', 'DE', 'IT', 'FR']
Unique values for column 'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]
Dataframe CAN
Unique values for column 'CAE_TYPE': ['3']
Unique values for column 'ISO_COUNTRY_CODE': ['IT', 'FR', 'DE', 'ES']
Unique values for column 'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]



In [39]:
print(">> Creating URLs file")
# One row per distinct (notice id, year, country, URL) combination of the CFC data,
# ordered by those same columns, plus the derived PDF_NOTICE_URL column
cols_select = ["ID_NOTICE_CN", "YEAR", "ISO_COUNTRY_CODE", "TED_NOTICE_URL"]
df_cfc_url = add_pdf_notice_url_column(
    df_cfc[cols_select].drop_duplicates().sort_values(by=cols_select)
)
path_out = Path(data_dir) / ted_urls_file
df_cfc_url.to_csv(path_out, sep=";", index=False)
print("URLs saved to:", str(path_out))
print()

>> Creating URLs file
URLs saved to: data/TED_URLs.csv



In [40]:
# Program end: report the end timestamp and the time elapsed since start_time
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print(f"\nEnd process: {end_time}")
print(f"Time to finish: {delta_time}\n")

print("\n*** PROGRAM END ***\n")


End process: 2024-05-28 10:27:11
Time to finish: 0:03:02


*** PROGRAM END ***

