In [54]:
# 02_stats_opendata
# Create file statistics by country and year. Before executing this script, configure the filters in the JSON defined in ```TED_CONFIG_FILE```.

In [55]:
# Force to reload extrernal modules every new cell execution
%reload_ext autoreload
%autoreload 2

In [56]:
### IMPORT ###
from pathlib import Path
import csv
from datetime import datetime
import pandas as pd

In [57]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import list_files_by_type, read_csv_data, data_schema, json_data_to_list_dict, df_uniques, dic_get_years

In [58]:
### GLOBALS ###
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
data_dir = str(yaml_config["DATA_DIR"])
stats_dir = str(yaml_config["STATS_DIR"])
cpv_division = str(yaml_config["CPV_DIVISION"]) # input: cpv division for stats
ted_config_file = str(yaml_config["TED_CONFIG_FILE"]) # input: filter configuration
ted_cfc_file = str(yaml_config["TED_CFC_FILE"]) # input
ted_can_file = str(yaml_config["TED_CAN_FILE"]) # input
dic_types_cfc = dict(yaml_config["TED_CFC_TYPES"]) # input
dic_types_can = dict(yaml_config["TED_CAN_TYPES"]) # input

In [59]:
### MAIN ###
print()
print("*** PROGRAM START ***")
print()

start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time))
print()


*** PROGRAM START ***

Start process: 2024-05-28 10:29:56



In [60]:
# Gets filters from JSON configuration
print(">> Filters configuration")
print("Configuration file:", ted_config_file)
list_filters = json_data_to_list_dict(ted_config_file)
print("Configuration list:",list_filters) 
# Find the dictionary with the key 'YEAR' and get min/max values
min_year, max_year = dic_get_years(list_filters, 'YEAR')

>> Filters configuration
Configuration file: ted_config.json
Configuration list: [{'CAE_TYPE': ['3']}, {'ISO_COUNTRY_CODE': ['IT', 'FR', 'ES', 'DE']}, {'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]}]
Minimum value for YEAR: 2016
Maximum value for YEAR: 2022


In [61]:
print(">> Reading CFC and CAN file")
path_cfc = Path(data_dir) / ted_cfc_file.replace("YS", str(min_year)).replace("YE", str(max_year))
path_can = Path(data_dir) / ted_can_file.replace("YS", str(min_year)).replace("YE", str(max_year))
print("CFC path:", path_cfc)
print("CAN path:", path_can)
df_cfc = read_csv_data(path_cfc, dic_types_cfc, ";")
df_can = read_csv_data(path_can, dic_types_can, ";")
# print("CFC columns:",df_cfc.columns) # debug
# print("CAN columns:",df_can.columns) # debug
print()

>> Reading CFC and CAN file
CFC path: data/TED_CFC_2016-2022.csv
CAN path: data/TED_CAN_2016-2022.csv



In [62]:
# Checking filtered data
print(">> Checking filtered data")
# Displaying unique values of the columns involved in the conditions for each dataframe
print("Dataframe CFC")
df_uniques(df_cfc, list_filters)
print("Dataframe CAN")
df_uniques(df_can, list_filters)
print()

>> Checking filtered data
Dataframe CFC
Unique values for column 'CAE_TYPE': ['3']
Unique values for column 'ISO_COUNTRY_CODE': ['ES', 'DE', 'IT', 'FR']
Unique values for column 'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]
Dataframe CAN
Unique values for column 'CAE_TYPE': ['3']
Unique values for column 'ISO_COUNTRY_CODE': ['IT', 'FR', 'DE', 'ES']
Unique values for column 'YEAR': [2016, 2017, 2018, 2019, 2020, 2021, 2022]



In [63]:
# Creation of statistics
print(">> Creating stats file")
list_stats_floats = ["VALUE_EURO"] # min, max, avg
list_stats_int = ["CPV"]
col_cpv_division = f"CPV_division_{cpv_division}"

# For every ISO_COUNTRY_CODE, YEAR, TYPE_OF_CONTRACT get stats by list_stats_floats, list_stats_int, cpv_division
list_stats = []
for country in df_cfc['ISO_COUNTRY_CODE'].unique():
    df_cfc_country = df_cfc[df_cfc['ISO_COUNTRY_CODE'] == country]
    for year in df_cfc_country['YEAR'].unique():
        df_cfc_year = df_cfc_country[df_cfc_country['YEAR'] == year]
        for ted_type in df_cfc_year['TYPE_OF_CONTRACT'].unique():
            df_cfc_type = df_cfc_year[df_cfc_year['TYPE_OF_CONTRACT'] == ted_type]
            print(f"County: {country} | Year: {year} | Type: {ted_type}")
            print("Dataframe size (rows):", len(df_cfc_type))
            
            # create the output stats
            data_pd = {
                    "country": country,
                    "year": year,
                    "notice_num": df_cfc_type["ID_NOTICE_CN"].nunique()
                }
            
            for col in list_stats_floats:
                data_pd[f"{col}_min"] = df_cfc_type[col].min().round(2)
                data_pd[f"{col}_max"] = df_cfc_type[col].max().round(2)
                data_pd[f"{col}_mean"] = df_cfc_type[col].mean().round(2)
                data_pd[f"{col}_median"] = df_cfc_type[col].median()

            for col in list_stats_int:
                data_pd[f"{col}_distinct_count"] =  df_cfc_type[col].nunique()

            # Get CPV division stats
            data_pd[col_cpv_division] = df_cfc_type['CPV'].str.startswith(cpv_division).sum()

            list_stats.append(data_pd)

    print("-"*3)

df_stats = pd.DataFrame.from_records(list_stats) # creates a single dataframe
df_stats_agg = df_stats.groupby(['country', 'year']).sum().reset_index() # Aggregate data by country and year
df_stats_agg = df_stats_agg.sort_values(by = ["country", "year"])

>> Creating stats file
County: ES | Year: 2016 | Type: U
Dataframe size (rows): 2629
County: ES | Year: 2016 | Type: S
Dataframe size (rows): 3333
County: ES | Year: 2016 | Type: W
Dataframe size (rows): 77
County: ES | Year: 2017 | Type: U
Dataframe size (rows): 5759
County: ES | Year: 2017 | Type: S
Dataframe size (rows): 5797
County: ES | Year: 2017 | Type: W
Dataframe size (rows): 168
County: ES | Year: 2018 | Type: W
Dataframe size (rows): 207
County: ES | Year: 2018 | Type: U
Dataframe size (rows): 7541
County: ES | Year: 2018 | Type: S
Dataframe size (rows): 6438
County: ES | Year: 2019 | Type: W
Dataframe size (rows): 262
County: ES | Year: 2019 | Type: U
Dataframe size (rows): 13667
County: ES | Year: 2019 | Type: S
Dataframe size (rows): 12701
County: ES | Year: 2020 | Type: W
Dataframe size (rows): 390
County: ES | Year: 2020 | Type: U
Dataframe size (rows): 12957
County: ES | Year: 2020 | Type: S
Dataframe size (rows): 35986
County: ES | Year: 2021 | Type: W
Dataframe size 

In [64]:
df_stats_agg

Unnamed: 0,country,year,notice_num,VALUE_EURO_min,VALUE_EURO_max,VALUE_EURO_mean,VALUE_EURO_median,CPV_distinct_count,CPV_division_90
0,DE,2016,13609,0.03,5551500000.0,9408385.0,1549410.0,1615,2026
1,DE,2017,16136,0.03,1198560000.0,9714189.0,1980000.0,1786,3429
2,DE,2018,18991,0.03,486000000.0,8235383.0,1427765.0,1926,3511
3,DE,2019,22051,0.03,464691600.0,10736920.0,1775831.0,1943,3892
4,DE,2020,24948,0.03,1131003000.0,10414440.0,1544626.0,1707,3546
5,DE,2021,26182,0.03,25071000000.0,18427070.0,1632227.0,1629,3913
6,DE,2022,28114,0.03,3230000000.0,12428240.0,1878682.0,1729,4244
7,ES,2016,4385,65618.82,2316082000.0,38043020.0,13799000.0,1045,770
8,ES,2017,5608,881233.61,827509200.0,32196560.0,11289220.0,1239,852
9,ES,2018,5708,214008.28,3682308000.0,45625880.0,9840360.0,1327,1052


In [65]:
# Checking stats
print(">> Checking stats")
print("Distinct ID_NOTICE_CN in CFC:", df_cfc["ID_NOTICE_CN"].nunique())
print("Sum of distinct ID_NOTICE_CN in stats:", df_stats["notice_num"].sum())
# df_grouped_cpv_90 = df_stats.groupby('country')[col_cpv_division].sum().reset_index()
df_country_cpv = df_stats.groupby('country').agg({
    "notice_num": 'sum',
    col_cpv_division: 'sum',
}).reset_index()
df_country_cpv[f"{col_cpv_division}_ratio"] = (df_country_cpv[col_cpv_division] / df_country_cpv['notice_num']).round(2)
df_country_cpv = df_country_cpv.sort_values(by = col_cpv_division, ascending=False)

>> Checking stats
Distinct ID_NOTICE_CN in CFC: 324941
Sum of distinct ID_NOTICE_CN in stats: 324941


In [66]:
df_country_cpv

Unnamed: 0,country,notice_num,CPV_division_90,CPV_division_90_ratio
0,DE,150031,24561,0.16
2,FR,99915,18523,0.19
1,ES,47743,9596,0.2
3,IT,27252,6426,0.24


In [67]:
print(">> Saving stats")
path_out = Path(stats_dir) / f"TED_CFC_{min_year}-{max_year}_stats.csv"
df_stats_agg.to_csv(path_out, sep=";", index=False)
path_out = Path(stats_dir) / f"TED_CFC_{min_year}-{max_year}_stats.xlsx"
df_stats_agg.to_excel(path_out, sheet_name=f"TED_CFC_{min_year}-{max_year}_stats", index=False)
path_out = Path(stats_dir) / f"TED_CFC_{min_year}-{max_year}_cpv_{cpv_division}.csv"
df_country_cpv.to_csv(path_out, sep=";", index=False)

>> Saving stats


In [68]:
# program end
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print()
print("End process:", end_time)
print("Time to finish:", delta_time)
print()

print()
print("*** PROGRAM END ***")
print()


End process: 2024-05-28 10:30:17
Time to finish: 0:00:21


*** PROGRAM END ***

