In [83]:
# 01_read_opendata.ipynb
# Reads, merges and filters raw Open Data.
# CFC: Call For Competition (step 1/2)  
# CAN: Contract Award Notices (step 2/2)
# 2024-05-13: Added URL extraction to the texts of each CFC

In [84]:
# Force to reload extrernal modules every new cell execution
%reload_ext autoreload
%autoreload 2

In [85]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd

In [86]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import list_files_by_type, read_csv_data, data_schema, json_data_to_list_dict, df_filter, df_uniques, df_stats_by_year

In [87]:
### GLOBALS ###
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
od_dir = str(yaml_config["OD_DIR"])
data_dir = str(yaml_config["DATA_DIR"])
stats_dir = str(yaml_config["STATS_DIR"])
cpv_division = str(yaml_config["CPV_DIVISION"]) # input: cpv division for stats
ted_config_file = str(yaml_config["TED_CONFIG_FILE"]) # input: filter configuration
ted_cfc_file = str(yaml_config["TED_CFC_FILE"]) # output
ted_can_file = str(yaml_config["TED_CAN_FILE"]) # output
ted_cfc_schema_file = str(yaml_config["TED_CFC_SCHEMA_FILE"]) # output
ted_can_schema_file = str(yaml_config["TED_CAN_SCHEMA_FILE"]) # output
ted_urls_file = str(yaml_config["TED_URLS_FILE"]) # output

In [88]:
### FUNCTIONS ###
def generate_pdf_notice_url(row: pd.Series) -> str:
    """
    Adds a new column 'PDF_NOTICE_URL' to the DataFrame by generating the URL based on the 'TED_NOTICE_URL' and 'ISO_COUNTRY_CODE' columns.
    
    Parameters:
        row (pd.Series): A row of the DataFrame containing TED_NOTICE_URL and ISO_COUNTRY_CODE.
    
    Returns:
        str: The constructed PDF_NOTICE_URL or None if the TED_NOTICE_URL format is incorrect.
    """

    # Check if ISO_COUNTRY_CODE is not null
    if pd.isnull(row['ISO_COUNTRY_CODE']):
        return None
    
    # Extract the uri part from the URL
    uri_part = row['TED_NOTICE_URL'].split('uri=')[1]
    
    # Split the uri part to extract the ID
    parts = uri_part.split(':')

    if len(parts) > 3:
        notice_id = parts[2]  # Get the desired element
        # Convert ISO country code to lowercase
        iso_country_code = row['ISO_COUNTRY_CODE'].lower()
        # Construct the new URL
        pdf_notice_url = f"https://ted.europa.eu/{iso_country_code}/notice/{notice_id}/pdfs"
        return pdf_notice_url
    
    return None

def add_pdf_notice_url_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a new column 'PDF_NOTICE_URL' to the DataFrame by generating the URL based on the 'TED_NOTICE_URL' and 'ISO_COUNTRY_CODE' columns.
    
    Parameters:
        df (pd.DataFrame): The input DataFrame containing 'TED_NOTICE_URL' and 'ISO_COUNTRY_CODE' columns.
    
    Returns:
        pd.DataFrame: The DataFrame with the added 'PDF_NOTICE_URL' column.
    """
    df['PDF_NOTICE_URL'] = df.apply(generate_pdf_notice_url, axis=1)
    return df

In [89]:
### MAIN ###
print()
print("*** PROGRAM START ***")
print()

start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time))
print()


*** PROGRAM START ***

Start process: 2024-05-27 23:21:59



In [90]:
# Create list of CSV files
print(">> Listing OD files")
print("Directory:", od_dir)
list_csv_files = list_files_by_type(od_dir, "csv") # Gets all CSV type files in od_dir
list_csv_files_len = len(list_csv_files)
print("Files found:", list_csv_files_len)
# print("Files:", list_csv_files) # debug
print()

>> Listing OD files
Directory: opendata
Files found: 14



In [91]:
# Gets filters from JSON configuration
print(">> Filters")
print("Configuration file:", ted_config_file)
list_filters = json_data_to_list_dict(ted_config_file)
print("Configuration list:",list_filters) 
# Find the dictionary with the key 'YEAR'
year_dict = next((item for item in list_filters if 'YEAR' in item), None)
# Extract the minimum and maximum value if the dictionary is found
if year_dict:
    year_list = year_dict['YEAR']
    min_year = min(year_list)
    max_year = max(year_list)
    print("Minimum value for YEAR:", min_year) 
    print("Maximum value for YEAR:", max_year)
else:
    print("Dictionary with key 'YEAR' not found.")

>> Filters
Configuration file: ted_config.json
Configuration list: [{'CAE_TYPE': ['3']}, {'ISO_COUNTRY_CODE': ['IT', 'FR', 'ES', 'DE']}, {'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]}]
Minimum value for YEAR: 2016
Maximum value for YEAR: 2022


In [92]:
# Reads raw data and merge the DataFrames (# CFC.FUTURE_CAN_ID = CNC.ID_NOTICE_CAN) filtering it
print(">> Parsing OD files")
dic_types_cfc = {'ID_NOTICE_CN':object, 'FUTURE_CAN_ID':object, 'FUTURE_CAN_ID_ESTIMATED':object, 'CPV':object, 'CAE_TYPE':object} # Columns not to be transformed into numbers
dic_types_can = {'ID_NOTICE_CAN':object, 'ID_AWARD':object, 'ID_LOT_AWARDED':object, 'CPV':object, 'CAE_TYPE': object} # Columns not to be transformed into numbers
list_cfc = [] # It will contain all dataframes of type CFC
list_can = [] # It will contain all dataframes of type CAN
i = 0
for csv_file in list_csv_files:
    i+=1
    print(f"[{i} / {list_csv_files_len}]")
    if "CFC" in csv_file.name:
        print("Reading CFC file:", csv_file.name)
        df = read_csv_data(csv_file, dic_types_cfc)
        df_len = len(df)
        print("Dataframe length (complete):", df_len)
        # filters
        # df_filtered = df[df[ted_country_codes_feature].isin(ted_country_codes_values) & df[ted_cae_codes_feature].isin(ted_cae_codes_values)]
        df_filtered = df_filter(df, list_filters)
        print("Dataframe length (filtered):", df_filtered_len)
        list_cfc.append(df_filtered)
    if "CAN" in csv_file.name:
        print("Reading CAN file:", csv_file.name)
        df = read_csv_data(csv_file, dic_types_can)
        df_len = len(df)
        print("Dataframe length (complete):", df_len)
        # filters
        # df_filtered = df[df[ted_country_codes_feature].isin(ted_country_codes_values) & df[ted_cae_codes_feature].isin(ted_cae_codes_values)]
        df_filtered = df_filter(df, list_filters)
        df_filtered_len = len(df_filtered)
        print("Dataframe length (filtered):", df_filtered_len)
        list_can.append(df_filtered)
print()

>> Parsing OD files
[1 / 14]
Reading CAN file: Export_OpenDataCAN_year2016.csv
Dataframe length (complete): 556084
Filtering for: {'CAE_TYPE': ['3']} [OK]
Filtering for: {'ISO_COUNTRY_CODE': ['IT', 'FR', 'ES', 'DE']} [OK]
Filtering for: {'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]} [OK]
Dataframe length (filtered): 66737
[2 / 14]
Reading CAN file: Export_OpenDataCAN_year2017.csv
Dataframe length (complete): 702824
Filtering for: {'CAE_TYPE': ['3']} [OK]
Filtering for: {'ISO_COUNTRY_CODE': ['IT', 'FR', 'ES', 'DE']} [OK]
Filtering for: {'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]} [OK]
Dataframe length (filtered): 82858
[3 / 14]
Reading CAN file: Export_OpenDataCAN_year2018.csv
Dataframe length (complete): 804040
Filtering for: {'CAE_TYPE': ['3']} [OK]
Filtering for: {'ISO_COUNTRY_CODE': ['IT', 'FR', 'ES', 'DE']} [OK]
Filtering for: {'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]} [OK]
Dataframe length (filtered): 98625
[4 / 14]
Reading CAN file: Export_OpenDataCAN_year

In [93]:
# Output the data merged and filtered
print(">> Preparing output")
out_dir = Path(data_dir)
out_dir.mkdir(exist_ok=True)
print()

>> Preparing output



In [94]:
print(">> Creating unique CFC file")
# Merges all dataframes in the list and saves to file
df_cfc = pd.concat(list_cfc, ignore_index=True)
df_cfc_len = len(df_cfc)
print("Final CFC length:", df_cfc_len)
path_out = Path(data_dir) / ted_cfc_file.replace("YS", str(min_year)).replace("YE", str(max_year))
df_cfc.to_csv(path_out, sep=";", index=False, quoting=csv.QUOTE_NONNUMERIC)
print("Data saved to:", str(path_out)) 
# Get the  schema
df_cfc_schema = data_schema(df_cfc)
path_out = Path(data_dir) / ted_cfc_schema_file
df_cfc_schema.to_csv(path_out, sep=";", index=False)
print("Schema saved to:", str(path_out)) 
print()

>> Creating unique CFC file
Final CFC length: 811398
Data saved to: data/TED_CFC_2016-2022.csv
Schema saved to: data/TED_CFC_schema.csv



In [95]:
print(">> Creating unique CAN file")
# Merges all dataframes in the list and saves to file
df_can = pd.concat(list_can, ignore_index=True)
df_can_len = len(df_can)
print("Final CAN length:", df_can_len)
path_out = Path(data_dir) / ted_can_file.replace("YS", str(min_year)).replace("YE", str(max_year))
df_can.to_csv(path_out, sep=";", index=False, quoting=csv.QUOTE_NONNUMERIC)
print("Data saved to:", str(path_out))
# Get the  schema
df_can_schema = data_schema(df_can)
path_out = Path(data_dir) / ted_can_schema_file
df_can_schema.to_csv(path_out, sep=";", index=False)
print("Schema saved to:", str(path_out)) 
print()

>> Creating unique CAN file
Final CAN length: 730216
Data saved to: data/TED_CAN_2016-2022.csv
Schema saved to: data/TED_CAN_schema.csv



In [96]:
# Checking filtered data
print(">> Checking filtered data")
# Displaying unique values of the columns involved in the conditions for eache dataframe
print("Dataframe CFC")
df_uniques(df_cfc, list_filters)
print("Dataframe CAN")
df_uniques(df_can, list_filters)
print()

>> Checking filtered data
Dataframe CFC
Unique values for column 'CAE_TYPE': ['3']
Unique values for column 'ISO_COUNTRY_CODE': ['ES', 'DE', 'IT', 'FR']
Unique values for column 'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]
Dataframe CAN
Unique values for column 'CAE_TYPE': ['3']
Unique values for column 'ISO_COUNTRY_CODE': ['IT', 'FR', 'DE', 'ES']
Unique values for column 'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]



In [97]:
print(">> Creating URLs file")
# Extracts the URL of the texts of each CFC sorted by ID_NOTICE_CN and YEAR 
cols_select = ["ID_NOTICE_CN", "YEAR", "ISO_COUNTRY_CODE", "TED_NOTICE_URL"]
df_cfc_url = df_cfc[cols_select]
df_cfc_url = df_cfc_url.drop_duplicates()
df_cfc_url = df_cfc_url.sort_values(by = cols_select)
df_cfc_url = add_pdf_notice_url_column(df_cfc_url)
path_out = Path(data_dir) / ted_urls_file
df_cfc_url.to_csv(path_out, sep = ";", index = False)
print("URLs saved to:", str(path_out)) 
print()

>> Creating URLs file
URLs saved to: data/TED_URLs.csv



In [98]:
# Creation of statistics
print(">> Creating stats file")
list_stats_floats = ["VALUE_EURO"] # min, max, avg
list_stats_int = ["CPV"]
col_cpv_division = f"CPV_division_{cpv_division}"

# For every ISO_COUNTRY_CODE, YEAR, TYPE_OF_CONTRACT get stats by list_stats_floats, list_stats_int, cpv_division
list_stats = []
for country in df_cfc['ISO_COUNTRY_CODE'].unique():
    df_cfc_country = df_cfc[df_cfc['ISO_COUNTRY_CODE'] == country]
    for year in df_cfc_country['YEAR'].unique():
        df_cfc_year = df_cfc_country[df_cfc_country['YEAR'] == year]
        for ted_type in df_cfc_year['TYPE_OF_CONTRACT'].unique():
            df_cfc_type = df_cfc_year[df_cfc_year['TYPE_OF_CONTRACT'] == ted_type]
            print(f"County: {country} | Year: {year} | Type: {ted_type}")
            print("Dataframe size (rows):", len(df_cfc_type))
            
            # create the output stats
            data_pd = {
                    "country": country,
                    "year": year,
                    "notice_num": df_cfc_type["ID_NOTICE_CN"].nunique()
                }
            
            for col in list_stats_floats:
                data_pd[f"{col}_min"] = df_cfc_type[col].min().round(2)
                data_pd[f"{col}_max"] = df_cfc_type[col].max().round(2)
                data_pd[f"{col}_mean"] = df_cfc_type[col].mean().round(2)
                data_pd[f"{col}_median"] = df_cfc_type[col].median()

            for col in list_stats_int:
                data_pd[f"{col}_distinct_count"] =  df_cfc_type[col].nunique()

            # Get CPV division stats
            data_pd[col_cpv_division] = df_cfc_type['CPV'].str.startswith(cpv_division).sum()

            list_stats.append(data_pd)

    print("-"*3)

# Display the final result
df_stats = pd.DataFrame.from_records(list_stats)
df_stats = df_stats.sort_values(by = ["country", "year"])

>> Creating stats file
County: ES | Year: 2016 | Type: U
Dataframe size (rows): 2629
County: ES | Year: 2016 | Type: S
Dataframe size (rows): 3333
County: ES | Year: 2016 | Type: W
Dataframe size (rows): 77
County: ES | Year: 2017 | Type: U
Dataframe size (rows): 5759
County: ES | Year: 2017 | Type: S
Dataframe size (rows): 5797
County: ES | Year: 2017 | Type: W
Dataframe size (rows): 168
County: ES | Year: 2018 | Type: W
Dataframe size (rows): 207
County: ES | Year: 2018 | Type: U
Dataframe size (rows): 7541
County: ES | Year: 2018 | Type: S
Dataframe size (rows): 6438
County: ES | Year: 2019 | Type: W
Dataframe size (rows): 262
County: ES | Year: 2019 | Type: U
Dataframe size (rows): 13667
County: ES | Year: 2019 | Type: S
Dataframe size (rows): 12701
County: ES | Year: 2020 | Type: W
Dataframe size (rows): 390
County: ES | Year: 2020 | Type: U
Dataframe size (rows): 12957
County: ES | Year: 2020 | Type: S
Dataframe size (rows): 35986
County: ES | Year: 2021 | Type: W
Dataframe size 

In [99]:
df_stats

Unnamed: 0,country,year,notice_num,VALUE_EURO_min,VALUE_EURO_max,VALUE_EURO_mean,VALUE_EURO_median,CPV_distinct_count,CPV_division_90
21,DE,2016,4594,0.01,1.265000e+08,2213118.61,7.148100e+05,518,1957
22,DE,2016,6504,0.01,5.380000e+09,6291426.74,4.516000e+05,568,25
23,DE,2016,2511,0.01,4.500000e+07,903839.66,3.830000e+05,529,44
24,DE,2017,7423,0.01,7.200000e+08,4493312.77,4.600000e+05,631,29
25,DE,2017,2939,0.01,2.565600e+08,1416703.01,4.200000e+05,576,88
...,...,...,...,...,...,...,...,...,...
58,IT,2021,3587,20000.00,1.878876e+09,9884377.68,1.275000e+06,464,1310
59,IT,2021,167,117950.29,3.348465e+08,15286927.74,8.667817e+06,61,0
60,IT,2022,330,162460.22,6.368850e+08,52990454.71,1.047055e+07,106,0
61,IT,2022,942,1700.00,8.000000e+09,70537161.18,7.200903e+06,328,0


In [100]:
# Checking stats
print(">> Checking stats")
print("Distinct ID_NOTICE_CN in CFC:", df_cfc["ID_NOTICE_CN"].nunique())
print("Sum of distinct ID_NOTICE_CN in stats:", df_stats["notice_num"].sum())
# df_grouped_cpv_90 = df_stats.groupby('country')[col_cpv_division].sum().reset_index()
df_country_cpv = df_stats.groupby('country').agg({
    "notice_num": 'sum',
    col_cpv_division: 'sum',
}).reset_index()
df_country_cpv = df_country_cpv.sort_values(by = col_cpv_division, ascending=False)

>> Checking stats
Distinct ID_NOTICE_CN in CFC: 324941
Sum of distinct ID_NOTICE_CN in stats: 324941


In [101]:
df_country_cpv

Unnamed: 0,country,notice_num,CPV_division_90
0,DE,150031,24561
2,FR,99915,18523
1,ES,47743,9596
3,IT,27252,6426


In [102]:
print(">> Saving stats")
path_out = Path(stats_dir) / "TED_CFC_2016-2022_stats.csv"
df_stats.to_csv(path_out, sep=";", index=False)
path_out = Path(stats_dir) / "TED_CFC_2016-2022_stats.xlsx"
df_stats.to_excel(path_out, sheet_name="TED_CFC_2016-2022_stats", index=False)
path_out = Path(stats_dir) / f"TED_CFC_2016-2022_cpv_{cpv_division}.csv"
df_country_cpv.to_csv(path_out, sep=";", index=False)

>> Saving stats


In [103]:
# program end
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print()
print("End process:", end_time)
print("Time to finish:", delta_time)
print()

print()
print("*** PROGRAM END ***")
print()


End process: 2024-05-27 23:24:54
Time to finish: 0:02:55


*** PROGRAM END ***

