In [18]:
import os
import gzip
import shutil
import requests
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from pandarallel import pandarallel

# Initialise pandarallel; this has to happen before `parallel_apply` is used further down.
pandarallel.initialize()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [19]:
# Directory listing that holds the PubChem compound SDF archives. File names are
# appended directly to this string when downloading, so keep the trailing slash.
url = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/"

In [20]:
# Local folder that receives the downloaded archives and the decompressed .sdf files.
dir_to_download = Path('../data/pubchem/sdfs')
# Create it (and any missing parents) up front; a no-op when it already exists.
dir_to_download.mkdir(parents=True, exist_ok=True)

In [21]:
# Progress log: one archive name per line, appended after each completed download.
processed_file = Path("../data/pubchem/processed.txt")
# Opening in append mode creates the file when it is missing without truncating an
# existing log, so reading it afterwards cannot fail with FileNotFoundError.
with processed_file.open('a'):
    pass

In [22]:
# Snapshot of the archive names recorded by earlier runs. split() tokenises on any
# whitespace, so blank lines in the log are ignored.
processed_before = set(processed_file.read_text().split())

In [23]:
# Inspect which archives earlier runs already recorded as done.
processed_before

{'Compound_000000001_000500000.sdf.gz',
 'Compound_000500001_001000000.sdf.gz',
 'Compound_001000001_001500000.sdf.gz',
 'Compound_001500001_002000000.sdf.gz',
 'Compound_002000001_002500000.sdf.gz',
 'Compound_002500001_003000000.sdf.gz',
 'Compound_003000001_003500000.sdf.gz',
 'Compound_003500001_004000000.sdf.gz',
 'Compound_004000001_004500000.sdf.gz',
 'Compound_004500001_005000000.sdf.gz',
 'Compound_005000001_005500000.sdf.gz',
 'Compound_005500001_006000000.sdf.gz',
 'Compound_006000001_006500000.sdf.gz',
 'Compound_006500001_007000000.sdf.gz',
 'Compound_007000001_007500000.sdf.gz',
 'Compound_007500001_008000000.sdf.gz',
 'Compound_008000001_008500000.sdf.gz',
 'Compound_008500001_009000000.sdf.gz',
 'Compound_009000001_009500000.sdf.gz',
 'Compound_009500001_010000000.sdf.gz',
 'Compound_010000001_010500000.sdf.gz',
 'Compound_010500001_011000000.sdf.gz',
 'Compound_011000001_011500000.sdf.gz',
 'Compound_011500001_012000000.sdf.gz',
 'Compound_012000001_012500000.sdf.gz',


In [24]:
def get_urls(url, ext):
    """Collect the link targets on an HTML page that end with a given suffix.

    Args:
        url: Address of the page to fetch and scan for ``<a>`` elements.
        ext: Suffix a link target must end with to be kept, e.g. ".sdf.gz".

    Returns:
        pandas.Series holding the matching ``href`` values in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    page = requests.get(url, timeout=60)
    # Fail loudly instead of parsing an error page into an empty result.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, features='lxml')
    hrefs = (link.get("href") for link in soup("a"))
    # Anchors without an href yield None, which has no .endswith(); skip them.
    return pd.Series([href for href in hrefs if href and href.endswith(ext)])

In [25]:
# Collect the hrefs of every link in the listing that ends with ".sdf.gz".
sdfs = get_urls(url, ".sdf.gz")

In [26]:
# Number of archives to keep from the front of the sorted listing (evaluates to 246).
# NOTE(review): 329 is presumably the size of the listing when this was written — confirm.
n_selected = 329 // 4 * 3
file_names = pd.Series(sorted(sdfs)).iloc[:n_selected]

In [27]:
# Preview the selection of archives that will be handed to the downloader.
file_names

0      Compound_000000001_000500000.sdf.gz
1      Compound_000500001_001000000.sdf.gz
2      Compound_001000001_001500000.sdf.gz
3      Compound_001500001_002000000.sdf.gz
4      Compound_002000001_002500000.sdf.gz
                      ...                 
241    Compound_120500001_121000000.sdf.gz
242    Compound_121000001_121500000.sdf.gz
243    Compound_121500001_122000000.sdf.gz
244    Compound_122000001_122500000.sdf.gz
245    Compound_122500001_123000000.sdf.gz
Length: 246, dtype: object

In [28]:
def tag_processed(file_name):
    """Append ``file_name`` to the progress log and announce it on stdout.

    Args:
        file_name: Name of the archive that has been handled; it is converted
            with ``str()`` and written as one line of ``processed_file``.
    """
    entry = f"{file_name!s}\n"
    with open(processed_file, mode='a') as log:
        log.write(entry)
    print('prcessed:', file_name)

In [29]:
def download_zips(file_name):
    """Download one gzipped archive, decompress it and record it as processed.

    Archives already listed in ``processed_before`` are skipped. Otherwise the
    archive is streamed from ``url + file_name`` into ``dir_to_download``,
    decompressed next to it (same name without the trailing ".gz"), the
    compressed copy is deleted, and the name is passed to ``tag_processed``.

    Args:
        file_name: Archive name relative to ``url``; expected to end in ".gz",
            because the last three characters are stripped for the output name.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        OSError: If downloading, decompressing or writing fails.
    """
    if file_name in processed_before:
        print("skipped", file_name)
        return
    print("start", file_name)
    archive_path = dir_to_download / file_name
    sdf_path = dir_to_download / file_name[:-3]
    finished = False
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        with requests.get(url + file_name, stream=True, timeout=60) as r:
            # Refuse to store an HTTP error body as if it were the archive.
            r.raise_for_status()
            with open(archive_path, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
        with gzip.open(archive_path, 'rb') as f_in:
            with open(sdf_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        finished = True
    finally:
        # The compressed copy is never kept, whether the step succeeded or not.
        if archive_path.exists():
            os.remove(archive_path)
        # A truncated .sdf must not survive a failed download or decompression.
        if not finished and sdf_path.exists():
            os.remove(sdf_path)
    tag_processed(file_name)

In [30]:
# Run the download for every selected archive through pandarallel. Archives listed in
# `processed_before` are skipped; the function returns nothing, so the resulting
# Series holds only None values.
file_names.parallel_apply(download_zips)

skippedstartskippedskipped  Compound_062000001_062500000.sdf.gz  Compound_092500001_093000000.sdf.gz
Compound_000000001_000500000.sdf.gz

skippedskipped  Compound_062500001_063000000.sdf.gzCompound_000500001_001000000.sdf.gz

skippedskipped  Compound_063000001_063500000.sdf.gzCompound_001000001_001500000.sdf.gz

Compound_031000001_031500000.sdf.gzskippedskipped
  Compound_063500001_064000000.sdf.gzCompound_001500001_002000000.sdf.gzskipped
 
skippedCompound_031500001_032000000.sdf.gzskipped
  skippedCompound_064000001_064500000.sdf.gzCompound_002000001_002500000.sdf.gz 

Compound_032000001_032500000.sdf.gzskippedskipped
  Compound_002500001_003000000.sdf.gzskippedCompound_064500001_065000000.sdf.gz 
skippedCompound_032500001_033000000.sdf.gz 
Compound_065000001_065500000.sdf.gz
skipped
skipped skipped Compound_033000001_033500000.sdf.gz Compound_003000001_003500000.sdf.gz
Compound_065500001_066000000.sdf.gz
skipped
skipped skipped  Compound_033500001_034000000.sdf.gzCompound_003500001_

prcessed: Compound_084500001_085000000.sdf.gz
start Compound_085000001_085500000.sdf.gz
prcessed: Compound_085000001_085500000.sdf.gz
start Compound_085500001_086000000.sdf.gz
prcessed: Compound_085500001_086000000.sdf.gz
start Compound_086000001_086500000.sdf.gz
prcessed: Compound_095500001_096000000.sdf.gz
start Compound_096000001_096500000.sdf.gz
prcessed: Compound_086000001_086500000.sdf.gz
start Compound_086500001_087000000.sdf.gz
prcessed: Compound_096000001_096500000.sdf.gz
start Compound_096500001_097000000.sdf.gz
prcessed: Compound_096500001_097000000.sdf.gz
start Compound_097000001_097500000.sdf.gz
prcessed: Compound_086500001_087000000.sdf.gz
start Compound_087000001_087500000.sdf.gz
prcessed: Compound_097000001_097500000.sdf.gz
start Compound_097500001_098000000.sdf.gz
prcessed: Compound_097500001_098000000.sdf.gz
start Compound_098000001_098500000.sdf.gz
prcessed: Compound_087000001_087500000.sdf.gz
start Compound_087500001_088000000.sdf.gz
prcessed: Compound_098000001_098

0      None
1      None
2      None
3      None
4      None
       ... 
241    None
242    None
243    None
244    None
245    None
Length: 246, dtype: object

In [16]:
def check_processed(file_name, processed):
    """Tell whether ``file_name`` occurs inside any entry of ``processed``.

    The comparison is a substring test, so "x.sdf" is matched by the entry
    "x.sdf.gz".

    Args:
        file_name: Name to look for.
        processed: Iterable of recorded names.

    Returns:
        True if at least one entry contains ``file_name``, otherwise False.
    """
    return any(file_name in processed_name for processed_name in processed)

In [17]:
def remove_unprocessed():
    """Delete files in the download folder that the progress log does not cover.

    Reads the names recorded in ``processed_file`` and removes every file in
    ``dir_to_download`` for which ``check_processed`` returns False, printing
    each removed path. Subdirectories are left alone.
    """
    with processed_file.open('r') as f:
        processed = f.read().split()
    for path in dir_to_download.iterdir():
        # os.remove() cannot delete a directory (e.g. .ipynb_checkpoints) and
        # would abort the whole cleanup, so directories are skipped.
        if path.is_dir():
            continue
        if check_processed(path.name, processed):
            continue
        path.unlink()
        print('removed:', path)


In [17]:
# Clean the download folder of anything the progress log does not account for.
remove_unprocessed()