In [2]:
# 03_download_guue
# Reads the CSV file defined in TED_URLS_FILE, downloads the PDFs linked in its 'PDF_NOTICE_URL' column and saves them in GUUE_DIR.

In [3]:
# Force reloading of external modules on every cell execution
%reload_ext autoreload
%autoreload 2

In [4]:
### IMPORT ###
import csv
import re
from datetime import datetime
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse

import pandas as pd
import requests


In [5]:
### LOCAL IMPORT ###
from config import config_reader
from utilities import read_csv_data

In [6]:
### GLOBALS ###
# Settings are loaded from "config.yml" through the project's config_reader helper
# (presumably the second argument selects the section to read -- confirm in config_reader).
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
# Every value is coerced to str; the cells below wrap them in Path() where needed.
data_dir = str(yaml_config["DATA_DIR"]) # directory that holds the input CSV
ted_urls_file = str(yaml_config["TED_URLS_FILE"]) # input: CSV file name, joined to data_dir
guue_dir = str(yaml_config["GUUE_DIR"]) # output: directory where the downloaded PDFs are saved

In [7]:
### FUNCTIONS ###
def get_filename_from_cd(content_disposition: str) -> Optional[str]:
    """
    Extracts the filename from a Content-Disposition header value.

    Example: 'attachment; filename="2016-OJS008-00010051-fr-ts.pdf"'
    yields '2016-OJS008-00010051-fr-ts.pdf'.

    Only the plain ``filename=`` parameter is handled; the extended
    ``filename*=`` form (RFC 5987 encoding) is ignored and backslash escapes
    inside a quoted value are not decoded.

    Parameters:
        content_disposition (str): The Content-Disposition header value (may be None or empty).

    Returns:
        Optional[str]: The filename with surrounding quotes and trailing
        parameters removed, or None if the header is missing, has no
        ``filename=`` parameter, or the parameter is empty.
    """
    if not content_disposition:
        return None
    # Parameter names are case-insensitive. Requiring a delimiter (or the start
    # of the string) in front keeps look-alikes such as "xfilename=" from
    # matching, and "filename*=" never matches because of the '*'.
    match = re.search(r'(?:^|[;,\s])filename\s*=\s*', content_disposition, re.IGNORECASE)
    if match is None:
        return None
    value = content_disposition[match.end():]
    if value[:1] in ('"', "'"):
        # Quoted value: it ends at the matching quote, whatever follows is
        # another parameter. Tolerate a missing closing quote.
        closing = value.find(value[0], 1)
        value = value[1:closing] if closing != -1 else value[1:]
    else:
        # Bare token: it ends at the next parameter separator.
        value = value.split(';', 1)[0]
    value = value.strip()
    return value or None

In [14]:
def download_pdf(url: str, dir_name: str, file_name: str, timeout: float = 60.0) -> None:
    """
    Downloads a PDF file from the given URL and saves it to the specified directory.

    The file is saved under the name announced in the response's
    Content-Disposition header when one is present and usable, otherwise under
    `file_name`. If the target file already exists the response body is not
    downloaded. Errors are reported on stdout and never raised, so that one
    failing URL does not stop a long batch.

    Parameters:
        url (str): The URL of the PDF file to be downloaded.
        dir_name (str): The directory where the PDF file will be saved (must already exist).
        file_name (str): The fallback name of the PDF file, used when the
            Content-Disposition header gives no usable filename.
        timeout (float): Seconds to wait for the server to connect / send data
            before giving up (passed to requests).

    Returns:
        None
    """
    try:
        # stream=True fetches only the headers up front; the body is read
        # later, and only if the file is not already on disk.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status() # Check that the request was successful
            cd = response.headers.get('Content-Disposition') # attachment; filename="2016-OJS008-00010051-fr-ts.pdf"
            header_name = get_filename_from_cd(cd) if cd else None
            if header_name:
                # The header is controlled by the server: keep only the last
                # path component so it cannot point outside dir_name.
                header_name = Path(header_name.replace("\\", "/")).name
            if header_name and header_name not in (".", ".."):
                file_name = header_name
            save_path = Path(dir_name) / file_name
            if save_path.exists():
                print(f"File '{save_path}' already exists. Skipping download.")
                return
            # Write to a temporary sibling and rename at the end, so that an
            # interrupted download never leaves a truncated file that later
            # runs would skip as "already exists".
            tmp_path = save_path.with_name(save_path.name + ".part")
            try:
                with open(tmp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=65536):
                        f.write(chunk)
                tmp_path.replace(save_path)
            finally:
                # Only still present if the write or the rename did not complete
                if tmp_path.exists():
                    tmp_path.unlink()
            print(f"Downloaded '{url}' to '{save_path}'")
    except Exception as e:
        # Deliberately broad: report and carry on with the next URL
        print(f"Failed to download '{url}': {e}")

In [15]:
### MAIN ###
# Opening banner, surrounded by blank lines
print("\n*** PROGRAM START ***\n")

# Start timestamp (whole seconds); the final cell uses it to compute the elapsed time
start_time = datetime.now().replace(microsecond=0)
print(f"Start process: {start_time}\n")


*** PROGRAM START ***

Start process: 2024-05-24 17:27:50



In [10]:
# Load the URLs CSV and keep only the rows that carry a PDF link
print(">> Parsing URLs file")
# These columns must be kept as they are, not converted into numbers
dic_types_cfc = {'ID_NOTICE_CN': object, 'YEAR': object}
path_in = Path(data_dir) / ted_urls_file
print(f"Path: {path_in}")
df_ted_url = read_csv_data(path_in, dic_types_cfc, ";").loc[
    lambda frame: frame['PDF_NOTICE_URL'].notna()
]
df_ted_url_len = len(df_ted_url)
print(f"Rows in CSV file: {df_ted_url_len}")
print()

>> Parsing URLs file
Path: data/TED_URLs.csv
Rows in CSV file: 174910



In [11]:
# Make sure the download directory exists (missing parents are created, an existing one is fine)
print(">> Preparing the output directory")
save_dir = Path(guue_dir)
save_dir.mkdir(exist_ok=True, parents=True)
print(f"Path: {save_dir}")

>> Preparing the output directory
Path: guue


In [13]:
# Fetch every PDF referenced in the PDF_NOTICE_URL column, one row at a time
print(">> Downloading PDF files")
for i, (index, row) in enumerate(df_ted_url.iterrows(), start=1):
    print(f"[{i} / {df_ted_url_len}]")
    # The ID_NOTICE_CN-based name is only the fallback passed to download_pdf,
    # used when the response itself does not provide a file name
    download_pdf(row['PDF_NOTICE_URL'], guue_dir, f"{row['ID_NOTICE_CN']}.pdf")

>> Downloading PDF files
[1 / 174910]
File 'guue/2016-OJS008-00010051-fr-ts.pdf' already exists. Skipping download.
[2 / 174910]
File 'guue/2016-OJS060-00100531-es-ts.pdf' already exists. Skipping download.
[3 / 174910]
File 'guue/2016-OJS008-00010055-fr-ts.pdf' already exists. Skipping download.
[4 / 174910]
File 'guue/2016-OJS008-00010066-fr-ts.pdf' already exists. Skipping download.
[5 / 174910]
File 'guue/2016-OJS060-00100664-fr-ts.pdf' already exists. Skipping download.
[6 / 174910]
File 'guue/2016-OJS060-00100688-fr-ts.pdf' already exists. Skipping download.
[7 / 174910]
File 'guue/2016-OJS008-00010069-fr-ts.pdf' already exists. Skipping download.
[8 / 174910]
File 'guue/2016-OJS060-00100690-fr-ts.pdf' already exists. Skipping download.
[9 / 174910]
File 'guue/2016-OJS060-00100693-fr-ts.pdf' already exists. Skipping download.
[10 / 174910]
File 'guue/2016-OJS060-00100696-es-ts.pdf' already exists. Skipping download.
[11 / 174910]
File 'guue/2016-OJS008-00010072-fr-ts.pdf' already

KeyboardInterrupt: 

In [20]:
# Program end: report the finish time and the elapsed time since start_time
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time

print(f"\nEnd process: {end_time}")
print(f"Time to finish: {delta_time}\n")

# Closing banner, surrounded by blank lines
print("\n*** PROGRAM END ***\n")


End process: 2024-05-24 10:22:11
Time to finish: 0:26:11


*** PROGRAM END ***

