In [None]:
import requests
import re
import time

from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from pathlib import Path
from tqdm.auto import tqdm

In [None]:
# Local directory that receives the downloaded .smi.gz archives.
savepath = Path("./SMI_GZ/")
savepath.mkdir(parents=True, exist_ok=True)

base_url = "https://files.docking.org/zinc22/2d-all/"

# Fetch the top-level index once. requests applies no timeout by default, so an
# explicit one keeps a stalled connection from hanging this cell forever, and
# raise_for_status() prevents an HTTP error page from being parsed into an
# empty (and silently useless) link list.
index_response = requests.get(base_url, timeout=60)
index_response.raise_for_status()

# Keep the href of every anchor whose href starts with "H".
H_links = [
    a["href"]
    for a in BeautifulSoup(index_response.text, "lxml").find_all("a", href=re.compile(r'^H'))
]

def fetch_smi_links(link, timeout=60):
    """Return the URLs of all ``.smi.gz`` files listed on one index sub-page.

    Args:
        link: Href of the sub-page, appended verbatim to ``base_url``. No
            separator is inserted, so it presumably ends with "/" (verify
            against the index page).
        timeout: Seconds handed to ``requests.get``. Without it a stalled
            connection would block the calling worker thread indefinitely.

    Returns:
        list[str]: ``base_url + link + href`` for every anchor whose href ends
        in ``.smi.gz``. An empty list when the page does not answer with
        HTTP 200 (a message is printed in that case).

    Raises:
        requests.RequestException: Network failures and timeouts are not
            caught here and propagate to the caller.
    """
    url = base_url + link
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Do not scrape an error page as if it were a directory listing.
        print(f"Failed to list {url} with error code {response.status_code}.")
        return []
    anchors = BeautifulSoup(response.text, 'lxml').find_all("a", href=re.compile(r'.*\.smi\.gz$'))
    return [url + anchor["href"] for anchor in anchors]

# Query every "H" sub-page concurrently on 10 worker threads, showing progress.
with ThreadPool(10) as pool:
    listing_jobs = pool.imap(fetch_smi_links, H_links)
    results = list(tqdm(listing_jobs, total=len(H_links)))

# Flatten the per-page URL lists into a single list of download targets.
download_urls = [smi_url for page_urls in results for smi_url in page_urls]

In [None]:
def download(url, path, retries=5, retry_delay=2, timeout=60):
    """Download ``url`` to ``path``, retrying on failure.

    Nothing is done when ``path`` already exists. The payload is first written
    to a sibling ``<name>.part`` file and only renamed onto ``path`` after the
    write has finished, so a failed or interrupted write never leaves a
    truncated file that the existence check would later mistake for a
    completed download.

    Args:
        url: Address of the file to fetch.
        path: Destination file (``str`` or ``Path``).
        retries: Maximum number of attempts.
        retry_delay: Seconds to wait after a failed attempt.
        timeout: Seconds handed to ``requests.get``. Without it a stalled
            connection would block the calling thread indefinitely.

    Returns:
        None. Failures are reported with ``print`` and never raised.
    """
    path = Path(path)
    if path.exists():
        return

    # Staging file in the same directory, so the final rename stays on one filesystem.
    partial = path.with_name(path.name + ".part")

    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                print(f"Failed to download with error code {response.status_code}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                continue
            with open(partial, "wb") as file:
                file.write(response.content)
            # Publish the finished file under its real name in a single step.
            partial.replace(path)
            return
        except requests.RequestException as e:
            print(f"Network error: {e}. Retrying in {retry_delay} seconds...")
        except IOError as e:
            print(f"File I/O error: {e}. Retrying in {retry_delay} seconds...")
        except Exception as e:
            print(f"An unexpected error occurred: {e}. Retrying in {retry_delay} seconds...")

        # Only reached after an exception: drop whatever was staged, then back off.
        try:
            if partial.exists():
                partial.unlink()
        except OSError:
            pass
        time.sleep(retry_delay)

    print(f"Failed to download {url} to {path}")
    return

def download_smi_parallel(urls, workers=20):
    """Download every URL in ``urls`` into ``savepath`` using a thread pool.

    Each file is stored under the last "/"-separated component of its URL.
    A progress bar advances once per finished URL, in completion order.

    Args:
        urls: Sized collection of URLs (``len(urls)`` sets the bar total).
        workers: Number of download threads.

    Returns:
        None.
    """
    def download_smi(url):
        download(url, savepath / url.split("/")[-1])

    # Managing the pool with `with` releases its threads when the loop is done;
    # every result has been consumed by then, so no pending task is cut off.
    with tqdm(total=len(urls)) as pbar, ThreadPool(workers) as pool:
        for _ in pool.imap_unordered(download_smi, urls):
            pbar.update()

In [None]:
# Start the bulk download of every collected URL. Targets that already exist in
# `savepath` are skipped by `download`, so re-running this cell only fetches
# files that are not on disk yet.
download_smi_parallel(download_urls)