In [12]:
# 04_download_guue
# Starting from the file defined in TED_URLS_FILE, it downloads the linked PDFs in the 'PDF_NOTICE_URL' column of the CSV and saves them in GUUE_DIR. 

In [13]:
# Force external modules to be reloaded on every cell execution
%reload_ext autoreload
%autoreload 2

In [14]:
### IMPORT ###
from pathlib import Path
from datetime import datetime
import pandas as pd
import requests

In [15]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import read_csv_data

In [16]:
### GLOBALS ###
# Load the YAML configuration once; every setting below is read from it.
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug

# --- Inputs ---
data_dir = str(yaml_config["DATA_DIR"])
# Both names hold the value of the same TED_URLS_FILE key.
ted_urls_file = ted_urls_config = str(yaml_config["TED_URLS_FILE"])
ted_lang_codes = list(yaml_config["TED_LANG_DOWNLOAD"])
ted_cpv_codes = list(yaml_config["TED_CPV_DOWNLOAD"])
# String form of the CPV codes, used when filtering the CPV_DIVISION column.
ted_cpv_codes_str = list(map(str, ted_cpv_codes))

# --- Output ---
guue_dir = str(yaml_config["GUUE_DIR"])

In [17]:
### FUNCTIONS ###
def get_filename_from_cd(content_disposition: str) -> str:
    """
    Extracts the filename from a Content-Disposition header value.

    The header is parsed with the standard library's MIME header parser, so
    additional parameters (e.g. ``; size=1024``), quoted values and
    RFC 2231 encoded names (``filename*=``) are handled, and a header that
    carries no filename no longer raises ``IndexError``.

    Parameters:
        content_disposition (str): The Content-Disposition header value,
            e.g. 'attachment; filename="2016-OJS008-00010051-fr-ts.pdf"'.

    Returns:
        str: The extracted filename, or None if the header is empty or does
        not contain a (non-empty) filename.
    """
    # Local import: keeps this function self-contained without touching the import cell.
    from email.message import Message

    if not content_disposition:
        return None

    header = Message()
    header['Content-Disposition'] = content_disposition
    filename = header.get_filename()  # None when there is no filename parameter
    if not filename:
        return None

    # The MIME parser only removes double quotes; single-quoted names were
    # also unwrapped by the previous implementation, so keep doing that.
    if len(filename) >= 2 and filename[0] == "'" and filename[-1] == "'":
        filename = filename[1:-1]
    return filename or None

In [18]:
def download_pdf(url:str, dir_name:str, file_name:str, timeout:float=30.0) -> int:
    """
    Downloads a PDF file from the given URL and saves it to the specified directory.

    The request is made in streaming mode, so only the response headers are
    fetched up front. The body is transferred only if the target file does
    not already exist, which avoids re-downloading files that will be skipped.

    Parameters:
        url (str): The URL of the PDF file to be downloaded.
        dir_name (str): The directory where the PDF file will be saved (must already exist).
        file_name (str): The default name of the PDF file, used when the response
            has no Content-Disposition header or the header yields no usable filename.
        timeout (float): Seconds to wait for the server before giving up, so that
            one stalled connection cannot block the whole run.

    Returns:
        int: 1 if downloaded, else 0 (file already present or any failure, which is printed).
    """
    try:
        # stream=True defers the body download until `.content` is accessed.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status() # Check that the request was successful
            cd = response.headers.get('Content-Disposition') # attachment; filename="2016-OJS008-00010051-fr-ts.pdf"
            # If Content-Disposition provides a filename use it, else keep the default one
            if cd:
                cd_name = get_filename_from_cd(cd)
                # The header is server-controlled: keep only the final path
                # component so the file cannot be written outside dir_name.
                cd_name = Path(cd_name).name if cd_name else ""
                if cd_name and cd_name != "..":
                    file_name = cd_name
            save_path = Path(dir_name) / file_name
            if save_path.exists():
                print(f"File '{save_path}' already exists. Skipping download.")
                return 0
            # Read the whole body before opening the file so that a network
            # error cannot leave a truncated PDF on disk.
            payload = response.content
        with open(save_path, 'wb') as f:
            f.write(payload)
        print(f"Downloaded '{url}' to '{save_path}'")
        return 1
    except Exception as e:
        # Best-effort batch download: report the failure and let the caller continue.
        print(f"Failed to download '{url}': {e}")
        return 0

In [19]:
### MAIN ###
# Start banner, framed by blank lines.
print()
print("*** PROGRAM START ***", end="\n\n")

# Start timestamp truncated to whole seconds; the final cell subtracts it
# from the end timestamp to report the elapsed time.
start_time = datetime.now().replace(microsecond=0)
print("Start process:", start_time, end="\n\n")


*** PROGRAM START ***

Start process: 2024-05-28 11:26:16



In [20]:
# Load the URLs file and keep only the rows that have a PDF link
print(">> Parsing URLs file")
# Columns read as plain objects so that they are not transformed into numbers
dic_types_cfc = dict.fromkeys(('ID_NOTICE_CN', 'YEAR', 'CPV_DIVISION'), object)
path_in = Path(data_dir) / ted_urls_file
print("Path:", path_in)
# Rows without a PDF_NOTICE_URL cannot be downloaded, so they are dropped right away
df_ted_url = read_csv_data(path_in, dic_types_cfc, ";").dropna(subset=['PDF_NOTICE_URL'])
# Note: this count is taken after the rows without a URL have been removed
df_ted_url_len = df_ted_url.shape[0]
print("Rows in CSV file:", df_ted_url_len)
# print(df_ted_url.head()) # debug
print()

>> Parsing URLs file
Path: data/TED_URLs.csv
Rows in CSV file: 324941



In [21]:
# Make sure the download directory exists (missing parents are created too;
# an already existing directory is not an error).
print(">> Preparing the output directory")
save_dir = Path(guue_dir)
save_dir.mkdir(exist_ok=True, parents=True)
print("Path:", save_dir)

>> Preparing the output directory
Path: guue


In [22]:
# Select the rows whose PDFs (PDF_NOTICE_URL) are going to be downloaded
print(">> Downloading PDF files")
print("Country list:", ted_lang_codes)
print("CPV list:", ted_cpv_codes_str)
# A row is kept when its country code is in the configured list ...
country_mask = df_ted_url["ISO_COUNTRY_CODE"].isin(ted_lang_codes)
# ... and its CPV division is one of the configured codes (compared as strings)
cpv_mask = df_ted_url["CPV_DIVISION"].isin(ted_cpv_codes_str)
df_ted_url_country = df_ted_url[country_mask & cpv_mask]
df_ted_url_country_len = df_ted_url_country.shape[0]
print("Rows for the chosen language and CPV division:", df_ted_url_country_len)

>> Downloading PDF files
Country list: ['IT']
CPV list: ['90']
Rows for the chosen language: 3372


In [24]:
# Download every selected notice, printing a running "[current / total]" counter
print("> Download starting...")
count_ok = 0
for i, (index, row) in enumerate(df_ted_url_country.iterrows(), start=1):
    print(f"[{i} / {df_ted_url_country_len}]")
    # Fallback name (ID_NOTICE_CN) used when the response does not provide a filename
    file_name = f"{row['ID_NOTICE_CN']}.pdf"
    # download_pdf returns 1 for a new download and 0 otherwise
    count_ok += download_pdf(row['PDF_NOTICE_URL'], guue_dir, file_name)
print("Total files downloaded:", count_ok)
print()

> Download starting...
[1 / 3372]
File 'guue/2016-OJS060-00101754-it-ts.pdf' already exists. Skipping download.
[2 / 3372]
File 'guue/2016-OJS060-00101773-it-ts.pdf' already exists. Skipping download.
[3 / 3372]
File 'guue/2016-OJS062-00106295-it-ts.pdf' already exists. Skipping download.
[4 / 3372]
File 'guue/2016-OJS062-00107040-it-ts.pdf' already exists. Skipping download.
[5 / 3372]
File 'guue/2016-OJS064-00110525-it-ts.pdf' already exists. Skipping download.
[6 / 3372]
File 'guue/2016-OJS065-00111759-it-ts.pdf' already exists. Skipping download.
[7 / 3372]
File 'guue/2016-OJS065-00112687-it-ts.pdf' already exists. Skipping download.
[8 / 3372]
File 'guue/2016-OJS066-00115336-it-ts.pdf' already exists. Skipping download.
[9 / 3372]
File 'guue/2016-OJS066-00115433-it-ts.pdf' already exists. Skipping download.
[10 / 3372]
File 'guue/2016-OJS067-00117318-it-ts.pdf' already exists. Skipping download.
[11 / 3372]
File 'guue/2016-OJS009-00011809-it-ts.pdf' already exists. Skipping downlo

In [12]:
# program end
# Elapsed time is measured against start_time, which is set in the
# "PROGRAM START" cell; that cell must have been run in the same session.
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print()
print("End process:", end_time)
print("Time to finish:", delta_time, end="\n\n")

# End banner, framed by blank lines.
print()
print("*** PROGRAM END ***", end="\n\n")


End process: 2024-05-28 09:08:35
Time to finish: 9:18:25


*** PROGRAM END ***

