In [25]:
# 04_download_guue
# Reads the CSV file defined in TED_URLS_FILE, downloads the PDFs linked in its 'PDF_NOTICE_URL' column, and saves them in GUUE_DIR.

In [26]:
# Force external modules to be reloaded on every cell execution
%reload_ext autoreload
%autoreload 2

In [27]:
### IMPORT ###
from pathlib import Path
from datetime import datetime
import requests

In [28]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import read_csv_data

In [29]:
### GLOBALS ###
# All settings below are read from the object returned by the project's config_reader helper.
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
data_dir = str(yaml_config["DATA_DIR"]) # directory that contains the URLs CSV file
ted_urls_file = str(yaml_config["TED_URLS_FILE"]) # input: name of the URLs CSV file inside data_dir
# NOTE(review): reads the same TED_URLS_FILE key as ted_urls_file and is not used by the visible cells - confirm whether it is still needed
ted_urls_config = str(yaml_config["TED_URLS_FILE"]) # input
ted_lang_codes = list(yaml_config["TED_LANG_DOWNLOAD"]) # input: codes matched against the ISO_COUNTRY_CODE column
# Only the first configured code is used for the output sub-directory and the timing log; raises IndexError if the list is empty
lang_code = ted_lang_codes[0]
ted_cpv_codes = list(yaml_config["TED_CPV_DOWNLOAD"]) # optional: choose a cpv division to PDF download instead of all PDFs
ted_cpv_codes_str = [str(code) for code in ted_cpv_codes] # string version of the above list
guue_dir = str(yaml_config["GUUE_DIR"]) # output: root directory for the downloaded PDFs and the timing log
year_start = int(yaml_config["YEAR_START"])
year_end = int(yaml_config["YEAR_END"])
years_list = list(range(year_start, year_end + 1)) # both year_start and year_end are included
log_download = str(yaml_config["LOG_DOWNLOAD"]) # output: file name of the timing log (created inside guue_dir)
log_download_header = str(yaml_config["LOG_DOWNLOAD_HEADER"]) # output: header line written when the timing log is created

In [30]:
### FUNCTIONS ###
def get_filename_from_cd(content_disposition: str) -> str:
    """
    Extracts the filename from the Content-Disposition header.

    Example: 'attachment; filename="2016-OJS008-00010051-fr-ts.pdf"' gives
    '2016-OJS008-00010051-fr-ts.pdf'.

    Parameters:
        content_disposition (str): The Content-Disposition header from which to extract the filename.

    Returns:
        str: The extracted filename, or None if the header is empty, has no 'filename=' parameter,
        or the parameter value is empty.
    """
    if not content_disposition:
        return None
    # partition() never raises: when the marker is missing, 'marker' is simply empty
    _, marker, remainder = content_disposition.partition('filename=')
    if not marker:
        return None
    value = remainder.strip()
    if not value:
        return None
    quote = value[0]
    if quote in ('"', "'"):
        # Quoted value: take everything up to the matching closing quote, so that
        # parameters following the filename (e.g. '; size=10') are not included
        closing = value.find(quote, 1)
        value = value[1:closing] if closing != -1 else value[1:]
    else:
        # Unquoted value: it ends at the next parameter separator
        value = value.split(';', 1)[0].strip()
    return value or None

In [31]:
def download_pdf(url:str, download_dir:str, lang_name:str, ted_year:str, file_name:str, timeout:float = 60) -> int:
    """
    Downloads a PDF file from the given URL and saves it as
    <download_dir>/<lang_name>/<ted_year>/<name>, where <name> is taken from the
    Content-Disposition response header when it provides one, else from file_name.

    The target directory is not created here: it must already exist, otherwise the
    write fails and the function reports the failure and returns 0.

    Parameters:
        url (str): The URL of the PDF file to be downloaded.
        download_dir (str): The root directory for the downloads.
        lang_name (str): Name of the language sub-directory.
        ted_year (str): Name of the year sub-directory.
        file_name (str): The default name of the PDF file, used when the response does not provide one.
        timeout (float): Seconds to wait for the server before giving up (passed to requests).
    Returns:
        int: 1 if the file was downloaded and written, 0 if it already existed or the download failed.
    """
    try:
        # stream=True fetches only the headers up front, so the body is not transferred when
        # the file is already on disk; the 'with' block releases the connection in every case.
        # The timeout prevents a stalled server from blocking the whole run.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status() # Check that the request was successful
            cd = response.headers.get('Content-Disposition') # attachment; filename="2016-OJS008-00010051-fr-ts.pdf"
            # print("CD:", cd) # debug
            # Use the name from Content-Disposition only when the header really carries one, else keep the default
            if cd and 'filename=' in cd:
                cd_name = get_filename_from_cd(cd)
                # The header is server-controlled: keep only the last path component so that
                # the file cannot be written outside the target directory
                cd_name = Path(cd_name).name if cd_name else ""
                if cd_name and cd_name != "..":
                    file_name = cd_name
            save_path = Path(download_dir) / lang_name / ted_year / file_name
            if save_path.exists():
                print(f"File '{save_path}' already exists. Skipping download.")
                return 0
            # Read the whole body before opening the output file, so that a network
            # error does not leave a truncated PDF that later runs would skip
            content = response.content
        with open(save_path, 'wb') as f:
            f.write(content)
        print(f"Downloaded '{url}' to '{save_path}'")
        return 1
    except Exception as e:
        # Best effort: one failing URL is reported and must not stop the whole batch
        print(f"Failed to download '{url}': {e}")
        return 0

In [32]:
### MAIN ###
print()
print("*** PROGRAM START ***")
print()

# Start timestamp truncated to whole seconds; the final cell subtracts it from the end time to report the run duration
start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time))
print()


*** PROGRAM START ***

Start process: 2024-09-02 10:39:36



In [33]:
# Load the CSV with the notice URLs and print a short summary of its content
print(">> Parsing URLs file")
# Columns not to be transformed into numbers
dic_types_cfc = {
    'ID_NOTICE_CN': object,
    'YEAR': object,
    'CPV_DIVISION': object,
}
path_in = Path(data_dir).joinpath(ted_urls_file)
print("Path:", path_in)
df_ted_url = read_csv_data(path_in, dic_types_cfc, ";")
# Rows without a PDF link cannot be downloaded: drop them
has_pdf_url = df_ted_url['PDF_NOTICE_URL'].notna()
df_ted_url = df_ted_url[has_pdf_url]
df_ted_url_len = len(df_ted_url)
print("Rows in CSV file:", df_ted_url_len)
# print(df_ted_url.head()) # debug
print("Languages:", df_ted_url['ISO_COUNTRY_CODE'].unique())
# Number of rows for each ISO_COUNTRY_CODE value, as a two-column table
country_counts = df_ted_url['ISO_COUNTRY_CODE'].value_counts()
count_by_country_code = country_counts.reset_index()
count_by_country_code.columns = ['ISO_COUNTRY_CODE', 'COUNT']
print(count_by_country_code)
print("Years list")
print(years_list)
print()

>> Parsing URLs file
Path: data/TED_URLs.csv
Rows in CSV file: 329017
Languages: ['PT' 'DE' 'FR' 'ES' 'IT']
  ISO_COUNTRY_CODE   COUNT
0               DE  150031
1               FR   99915
2               ES   47743
3               IT   27252
4               PT    4076
Years list
[2016, 2017, 2018, 2019, 2020, 2021, 2022]



In [34]:
print(">> Preparing the output directories")
# Root output directory
save_dir = Path(guue_dir)
save_dir.mkdir(parents=True, exist_ok=True)
print("Path all languages:", save_dir)
# Sub-directory of the selected language code
save_dir_lg = save_dir / lang_code
save_dir_lg.mkdir(parents=True, exist_ok=True)
print(f"Path specific language ({lang_code}): {save_dir_lg}")
# One sub-directory per year in years_list, below the language directory
for year_value in years_list:
    save_dir_lg_y = save_dir_lg.joinpath(str(year_value))
    save_dir_lg_y.mkdir(parents=True, exist_ok=True)
    print(f"Path specific year ({year_value}): {save_dir_lg_y}")

>> Preparing the output directories
Path all languages: guue
Path specific language (IT): guue/IT
Path specific year (2016): guue/IT/2016
Path specific year (2017): guue/IT/2017
Path specific year (2018): guue/IT/2018
Path specific year (2019): guue/IT/2019
Path specific year (2020): guue/IT/2020
Path specific year (2021): guue/IT/2021
Path specific year (2022): guue/IT/2022


In [35]:
print(">> Preparing the output timing log")
path_log = Path(guue_dir).joinpath(log_download)
if path_log.exists():
    # An existing log is kept as it is; the final cell appends to it
    print(f"The file already exists: {path_log}")
else:
    # First run: create the log file containing only the header line
    with path_log.open(mode='w') as fp:
        fp.write(f"{log_download_header}\n")
    print(f"File created with header: {path_log}")

>> Preparing the output timing log
File created with header: guue/ted_log_pdf_download.csv


In [36]:
# Select the rows whose PDFs (PDF_NOTICE_URL) have to be downloaded
print(">> Downloading PDF files")
print("Country list:", ted_lang_codes)
# Keep only the rows whose ISO_COUNTRY_CODE is one of the requested codes
is_wanted_country = df_ted_url["ISO_COUNTRY_CODE"].isin(ted_lang_codes)
df_ted_url_country = df_ted_url[is_wanted_country]
# Optional: restrict the download to the chosen CPV divisions instead of all PDFs
# print("CPV list:", ted_cpv_codes_str)
# df_ted_url_country = df_ted_url_country[df_ted_url_country["CPV_DIVISION"].isin(ted_cpv_codes_str)]
df_ted_url_country_len = len(df_ted_url_country)
print("Rows for the chosen filters:", df_ted_url_country_len)

>> Downloading PDF files
Country list: ['IT']
Rows for the chosen filters: 27252


In [37]:
# Download every selected PDF, one request per row, and count the files actually written
print("> Download starting...")
i = 0
count_ok = 0
for i, (index, row) in enumerate(df_ted_url_country.iterrows(), start=1):
    # The rows were filtered on ISO_COUNTRY_CODE against the whole ted_lang_codes list, so the
    # target folder must follow the row's own code: using only the first configured code would
    # store the PDFs of every other requested code in the wrong folder
    row_lang = str(row['ISO_COUNTRY_CODE'])
    print(f"[{i} / {df_ted_url_country_len}] language {row_lang}")
    ted_year = str(row['YEAR'])
    pdf_url = row['PDF_NOTICE_URL']
    file_name = f"{row['ID_NOTICE_CN']}.pdf" # Save the PDF with same name of ID_NOTICE_CN if missing in response
    # Make sure the target folder exists (the preparation cell only creates the folders of the
    # first language code and of the configured years)
    (Path(guue_dir) / row_lang / ted_year).mkdir(parents=True, exist_ok=True)
    ok = download_pdf(pdf_url, guue_dir, row_lang, ted_year, file_name)
    count_ok += ok
print("Total files downloaded:", count_ok)
print()

> Download starting...
[1 / 27252] language IT
File 'guue/IT/2016/2016-OJS060-00101003-it-ts.pdf' already exists. Skipping download.
[2 / 27252] language IT
File 'guue/IT/2016/2016-OJS060-00101145-it-ts.pdf' already exists. Skipping download.
[3 / 27252] language IT
File 'guue/IT/2016/2016-OJS060-00101207-it-ts.pdf' already exists. Skipping download.
[4 / 27252] language IT
File 'guue/IT/2016/2016-OJS060-00101591-it-ts.pdf' already exists. Skipping download.
[5 / 27252] language IT
File 'guue/IT/2016/2016-OJS060-00101593-it-ts.pdf' already exists. Skipping download.
[6 / 27252] language IT
File 'guue/IT/2016/2016-OJS060-00101596-it-ts.pdf' already exists. Skipping download.
[7 / 27252] language IT
File 'guue/IT/2016/2016-OJS060-00101598-it-ts.pdf' already exists. Skipping download.
[8 / 27252] language IT
File 'guue/IT/2016/2016-OJS060-00101599-it-ts.pdf' already exists. Skipping download.
[9 / 27252] language IT
File 'guue/IT/2016/2016-OJS060-00101667-it-ts.pdf' already exists. Skippi

In [None]:
# program end: compute the elapsed time (whole seconds) since start_time
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

# Append one semicolon-separated line (language code, start, end, duration) to the timing log
print(">> Saving timing to log")
csv_str = f"{lang_code};{str(start_time)};{str(end_time)};{str(delta_time)}\n"
with path_log.open("a") as fp:
    fp.write(csv_str)

print()
print("End process:", end_time)
print("Time to finish:", delta_time)
print()

print()
print("*** PROGRAM END ***")
print()


End process: 2024-07-16 14:09:24
Time to finish: 3:40:48


*** PROGRAM END ***

