# Summary

# Imports

In [1]:
%run _imports.ipynb

Setting the PYTHON_VERSION environment variable.
Setting the SPARK_MASTER environment variable.
Setting the DB_TYPE environment variable.
Setting the DB_PORT environment variable.


2017-12-03 21:12:24.894844


In [2]:
# Name of this notebook; it doubles as the directory that holds the files
# downloaded by the cells below.
NOTEBOOK_NAME = 'download'
os.makedirs(NOTEBOOK_NAME, exist_ok=True)

# Download

## OA package

In [8]:
urllib.request.urlretrieve(
    'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.csv',
    f'{NOTEBOOK_NAME}/oa_file_list.csv'
)

('download/oa_file_list.csv', <email.message.Message at 0x2b3c01d932b0>)

In [9]:
oa_file_list = pd.read_csv(f'{NOTEBOOK_NAME}/oa_file_list.csv')

In [10]:
oa_file_list.head()

Unnamed: 0,File,Article Citation,Accession ID,Last Updated (YYYY-MM-DD HH:MM:SS),PMID,License
0,oa_package/08/e0/PMC13900.tar.gz,Breast Cancer Res. 2001 Nov 2; 3(1):55-60,PMC13900,2017-04-26 12:15:50,11250746.0,NO-CC CODE
1,oa_package/b0/ac/PMC13901.tar.gz,Breast Cancer Res. 2001 Nov 9; 3(1):61-65,PMC13901,2016-01-20 10:58:46,11250747.0,NO-CC CODE
2,oa_package/f7/98/PMC13902.tar.gz,Breast Cancer Res. 2001 Nov 8; 3(1):66-75,PMC13902,2006-02-02 19:37:52,11250748.0,NO-CC CODE
3,oa_package/9c/7f/PMC13911.tar.gz,Breast Cancer Res. 2000 Nov 16; 2(1):59-63,PMC13911,2013-03-17 14:00:52,11056684.0,NO-CC CODE
4,oa_package/c6/fb/PMC13912.tar.gz,Breast Cancer Res. 2000 Dec 6; 2(1):64-72,PMC13912,2013-03-17 14:00:52,11400682.0,NO-CC CODE


In [11]:
# One row per distinct (archive path, accession id) pair, with snake_case
# column names.
oa_package_df = (
    oa_file_list[['File', 'Accession ID']]
    .drop_duplicates()
    .rename(columns={'File': 'file_path', 'Accession ID': 'pmc_id'})
)

In [12]:
oa_package_df.head()

Unnamed: 0,file_path,pmc_id
0,oa_package/08/e0/PMC13900.tar.gz,PMC13900
1,oa_package/b0/ac/PMC13901.tar.gz,PMC13901
2,oa_package/f7/98/PMC13902.tar.gz,PMC13902
3,oa_package/9c/7f/PMC13911.tar.gz,PMC13911
4,oa_package/c6/fb/PMC13912.tar.gz,PMC13912


## Functions

In [3]:
# Root directory under which `download` and `download_file` extract archives.
ARCHIVE_DIR = f"{NOTEBOOK_NAME}/pmc_3"
os.makedirs(ARCHIVE_DIR, exist_ok=True)

In [4]:
# File-name suffixes of the archive members that get extracted; members with
# any other suffix are left inside the archive.
EXTENSIONS_TO_KEEP = ['.nxml', '.xml', '.csv', '.csv.gz', '.tsv', '.tsv.gz', '.xls', '.xlsx']

In [5]:
import datetime
import tarfile
from typing import List
import zipfile

import dateutil.parser


def extract_tar_file(filename, output_dir, extensions=None) -> List[str]:
    """Extract the wanted members of a gzipped tar archive into ``output_dir``.

    A member is extracted only if its name ends with one of ``extensions``,
    it is a regular file or a directory, and its path resolves to a location
    inside ``output_dir``. The last two conditions keep a malicious archive
    (``../`` components, absolute names, links, device nodes) from writing
    outside the target directory.

    Args:
        filename: Path of the ``.tar.gz`` archive to read.
        output_dir: Directory the selected members are extracted into.
        extensions: Iterable of file-name suffixes to keep. When ``None``
            (the default) the module-level ``EXTENSIONS_TO_KEEP`` is used.

    Returns:
        The names (paths inside the archive) of the extracted members.
    """
    suffixes = tuple(EXTENSIONS_TO_KEEP if extensions is None else extensions)
    root = os.path.realpath(output_dir)

    def is_safe(member):
        # Links and special files could be used to redirect later writes.
        if not (member.isfile() or member.isdir()):
            return False
        target = os.path.realpath(os.path.join(root, member.name))
        try:
            return os.path.commonpath([root, target]) == root
        except ValueError:
            # Raised e.g. for paths on different drives: treat as outside.
            return False

    with tarfile.open(filename, 'r:gz') as tar_file:
        members = [
            m
            for m in tar_file.getmembers()
            if m.name.endswith(suffixes) and is_safe(m)
        ]
        tar_file.extractall(path=output_dir, members=members)
    return [m.name for m in members]
    
    
def extract_zip_file(filename, output_dir, extensions=None) -> List[str]:
    """Extract the wanted members of a zip archive into ``output_dir``.

    Only members whose name ends with one of ``extensions`` are extracted.

    Args:
        filename: Path of the ``.zip`` archive to read.
        output_dir: Directory the selected members are extracted into.
        extensions: Iterable of file-name suffixes to keep. When ``None``
            (the default) the module-level ``EXTENSIONS_TO_KEEP`` is used.

    Returns:
        The names (paths inside the archive) of the extracted members.
    """
    suffixes = tuple(EXTENSIONS_TO_KEEP if extensions is None else extensions)
    with zipfile.ZipFile(filename) as zip_file:
        # str.endswith accepts a tuple, so no inner any(...) loop is needed.
        members = [name for name in zip_file.namelist() if name.endswith(suffixes)]
        zip_file.extractall(path=output_dir, members=members)
    return members


def download(pmc_id, source_url, output_dir, overwrite=False):
    """Download one archive, extract its data files and record an info file.

    The archive is fetched into a temporary file, the members selected by
    ``extract_tar_file`` / ``extract_zip_file`` are extracted below
    ``ARCHIVE_DIR/output_dir``, and a JSON summary is written to ``info.json``
    in the article's directory. The presence of that file marks the article
    as done on later calls.

    Args:
        pmc_id: Article identifier; used as the name of the article directory
            when ``output_dir`` does not already end with it.
        source_url: URL of the archive; must end in ``.tar.gz`` or ``.zip``.
        output_dir: Target directory, relative to ``ARCHIVE_DIR``.
        overwrite: When false and the info file already exists, nothing is
            downloaded.

    Returns:
        A dict with ``pmc_id``, ``source_url``, ``output_dir`` and
        ``info_file`` (relative to ``ARCHIVE_DIR``). After a download it also
        holds ``created_on`` and ``extracted_files``; when the download was
        skipped it holds an ``error`` message instead.

    Raises:
        ValueError: If ``source_url`` is neither a ``.tar.gz`` nor a ``.zip``.
        Errors raised by ``urllib.request.urlretrieve`` or by the extraction
        helpers are not caught here.
    """
    info = {
        'pmc_id': pmc_id,
        'source_url': source_url,
        'output_dir': output_dir,
    }

    output_dir = op.join(ARCHIVE_DIR, output_dir)

    # Some sources already use a per-article directory; others need one added.
    if output_dir.strip('/').endswith(pmc_id):
        info_file = op.join(output_dir, 'info.json')
    else:
        info_file = op.join(output_dir, pmc_id, 'info.json')

    info['info_file'] = op.relpath(info_file, ARCHIVE_DIR)

    if not overwrite and op.isfile(info_file):
        info.update({'error': "Info file already exists!"})
        return info

    # Pick the extractor up front so an unsupported URL fails before any
    # bytes are transferred.
    if source_url.endswith('.tar.gz'):
        extract = extract_tar_file
    elif source_url.endswith('.zip'):
        extract = extract_zip_file
    else:
        raise ValueError(f"Unsupported archive {source_url}!")

    with tempfile.NamedTemporaryFile() as tmp_file:
        urllib.request.urlretrieve(source_url, tmp_file.name)
        extracted_files = extract(tmp_file.name, output_dir)

    info.update({
        'created_on': datetime.datetime.now().isoformat(),
        'extracted_files': extracted_files,
    })

    # Write to a sibling file and rename it into place: a partially written
    # info.json would otherwise mark an interrupted download as finished.
    os.makedirs(op.dirname(info_file), exist_ok=True)
    partial_info_file = info_file + '.tmp'
    with open(partial_info_file, 'wt') as fout:
        json.dump(info, fout)
    os.replace(partial_info_file, info_file)

    return info

In [6]:
# Smoke test for the tar.gz path: fetch a single OA package. The final `True`
# is `overwrite`, so an existing info file does not short-circuit the download.
pmc_id = 'PMC1790863'
source_url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/00/00/PMC1790863.tar.gz'
output_dir = 'oa_package/00/00/'

download(pmc_id, source_url, output_dir, True)

{'created_on': '2017-12-03T21:12:28.080391',
 'extracted_files': ['PMC1790863/pone.0000217.nxml'],
 'info_file': 'oa_package/00/00/PMC1790863/info.json',
 'output_dir': 'oa_package/00/00/',
 'pmc_id': 'PMC1790863',
 'source_url': 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/00/00/PMC1790863.tar.gz'}

In [7]:
# Smoke test for the zip path. Here `output_dir` already ends with the article
# id, so `download` puts info.json directly in it. The final `True` is `overwrite`.
pmc_id = 'PMC1125901'
source_url = 'ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/NON-OA/PMC1125900-PMC1129899/PMC1125901.zip'
output_dir = 'suppl/NON-OA/PMC1125900-PMC1129899/PMC1125901'

download(pmc_id, source_url, output_dir, True)

{'created_on': '2017-12-03T21:12:29.382429',
 'extracted_files': [],
 'info_file': 'suppl/NON-OA/PMC1125900-PMC1129899/PMC1125901/info.json',
 'output_dir': 'suppl/NON-OA/PMC1125900-PMC1129899/PMC1125901',
 'pmc_id': 'PMC1125901',
 'source_url': 'ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/NON-OA/PMC1125900-PMC1129899/PMC1125901.zip'}

## Download files

In [8]:
ARCHIVE_DIR

'download/pmc_3'

In [9]:
to_download = pd.read_csv('pmc_statistics/to_download.csv')

In [10]:
to_download.head()

Unnamed: 0,pmc_id,source_url,output_dir
0,PMC13900,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/e0/PMC13900.tar.gz,oa_package/08/e0/
1,PMC13901,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b0/ac/PMC13901.tar.gz,oa_package/b0/ac/
2,PMC13902,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f7/98/PMC13902.tar.gz,oa_package/f7/98/
3,PMC13911,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/9c/7f/PMC13911.tar.gz,oa_package/9c/7f/
4,PMC13912,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/c6/fb/PMC13912.tar.gz,oa_package/c6/fb/


In [11]:
to_download.tail()

Unnamed: 0,pmc_id,source_url,output_dir
2083565,PPR7031,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/OA/preprint/PPR7031.zip,suppl/OA/preprint/PPR7031
2083566,PPR7032,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/OA/preprint/PPR7032.zip,suppl/OA/preprint/PPR7032
2083567,PPR7033,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/OA/preprint/PPR7033.zip,suppl/OA/preprint/PPR7033
2083568,PPR7034,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/OA/preprint/PPR7034.zip,suppl/OA/preprint/PPR7034
2083569,PPR7035,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/OA/preprint/PPR7035.zip,suppl/OA/preprint/PPR7035


In [12]:
# info_list = []

# for row in tqdm.tqdm_notebook(to_download.itertuples(), total=len(to_download)):
#     info = download(row.pmc_id, row.source_url, row.output_dir, overwrite=False)
#     info_list.append(info)

In [13]:
# Number of `to_download` rows handled per batch of the download loop.
chunk_size = 50_000
# Indices of chunks that the download loop should skip.
finished_chunks = []

In [14]:
len(to_download) // chunk_size

41

In [None]:
# Download chunks 20..29 of `to_download`, one thread pool per chunk.
# `finished_futures` collects every future of this run in completion order.
finished_futures = []
for i in range(20, 30):
    if i in finished_chunks:
        continue
    chunk_offset = chunk_size * i
    # Positional slice: avoids re-iterating the frame from row 0 for every chunk.
    chunk_rows = to_download.iloc[chunk_offset:chunk_offset + chunk_size]
    with concurrent.futures.ThreadPoolExecutor(8) as pool:
        futures = [
            pool.submit(download, row.pmc_id, row.source_url, row.output_dir, overwrite=False)
            for row in chunk_rows.itertuples()
        ]
        for future in tqdm.tqdm_notebook(
                concurrent.futures.as_completed(futures),
                total=len(futures),
                desc=f"Chunk {i}"):
            # Store the completed future itself, not the whole `futures` list.
            finished_futures.append(future)
    # Remember the chunk only if every download succeeded, so that re-running
    # this cell retries chunks that had failures and skips the rest.
    if all(future.exception() is None for future in futures):
        finished_chunks.append(i)

A Jupyter Widget

In [22]:
def foo():
    """Yield the integers 1, 2 and 3, in that order."""
    yield from (1, 2, 3)


for x in foo():
    print(x)

1
2
3


In [28]:
# Gather the return values of the download futures, stopping at the first one
# that has not completed yet. Downloads that failed with a URLError are
# reported and skipped.
# NOTE(review): `futures_all` is not defined in any visible cell -- presumably
# the list of all submitted futures from an earlier session; confirm.
results = []

for i, future in enumerate(futures_all):
    if future.done():
        try:
            outcome = future.result()
        except urllib.request.URLError as e:
            print(f"Error for {i}: {e}!")
        else:
            results.append(outcome)
    else:
        print(f"Future {i} is not done!")
        break

Error for 55358: <urlopen error ftp error: error_temp('425 PASV: Address already in use',)>!
Future 60408 is not done!


In [29]:
result_df = pd.DataFrame(results)

In [30]:
result_df.head()

Unnamed: 0,created_on,error,extracted_files,output_dir,pmc_id,source_url
0,,Info file already exists!,,oa_package/08/e0/,PMC13900,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/e0/PMC13900.tar.gz
1,,Info file already exists!,,oa_package/b0/ac/,PMC13901,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b0/ac/PMC13901.tar.gz
2,,Info file already exists!,,oa_package/f7/98/,PMC13902,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f7/98/PMC13902.tar.gz
3,,Info file already exists!,,oa_package/9c/7f/,PMC13911,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/9c/7f/PMC13911.tar.gz
4,,Info file already exists!,,oa_package/c6/fb/,PMC13912,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/c6/fb/PMC13912.tar.gz


In [15]:
def download_file(file_path):
    """Fetch one OA package from the NCBI FTP server and extract its data files.

    The article id is the archive's base name up to the first dot. Files are
    extracted into ``ARCHIVE_DIR/<h[0:2]>/<h[2:4]>``, where ``h`` is the MD5
    hex digest of the article id.

    Args:
        file_path: Path of the ``.tar.gz`` package relative to
            ``ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/``.
    """
    pmc_id = op.basename(file_path).partition('.')[0]
    pmc_id_hash = hashlib.md5(pmc_id.encode('utf-8')).hexdigest()
    output_path = op.join(ARCHIVE_DIR, pmc_id_hash[0:2], pmc_id_hash[2:4])
    source_url = f'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/{file_path}'

    with tempfile.NamedTemporaryFile() as tmp_file:
        urllib.request.urlretrieve(source_url, tmp_file.name)
        # Reuse the shared helper instead of repeating its member filtering.
        extract_tar_file(tmp_file.name, output_path)

In [16]:
download_file(file_path)

NameError: name 'file_path' is not defined

In [None]:
# Run `download_file` for every package path using 64 worker processes.
# NOTE: `pool.map` returns a lazy iterator of results, not futures; an
# exception raised in a worker only surfaces when that iterator is consumed,
# which this cell never does.
with concurrent.futures.ProcessPoolExecutor(64) as pool:
    futures = pool.map(download_file, oa_package_df['file_path'].values)

In [89]:
{output_dir}

{'download_oa_package/oa_package_0/b5/c0/PMC13900'}

In [90]:
!ls {output_dir}

BCR-3-1-055.nxml  PMC13900


In [73]:
md5 = hashlib.md5('PMC2'.encode('utf-8'))
md5.hexdigest()

'ff6cf0a66226e4f916d5991840d1d99f'

In [38]:
    print(tar_file.getmembers())
    print(tar_file.getnames())
    print(tar_file.list())
        

[<TarInfo 'PMC13900' at 0x2ae234d4f1d8>, <TarInfo 'PMC13900/BCR-3-1-055.nxml' at 0x2ae234d4f048>, <TarInfo 'PMC13900/BCR-3-1-055.pdf' at 0x2ae23775a368>]
['PMC13900', 'PMC13900/BCR-3-1-055.nxml', 'PMC13900/BCR-3-1-055.pdf']
?rwxrwxr-x pmc/pmcdev          0 2017-01-01 14:16:21 PMC13900/ 
?rw-rw-r-- pmc/pmcdev      52460 2017-01-01 14:16:21 PMC13900/BCR-3-1-055.nxml 
?rw-rw-r-- pmc/pmcdev     106618 2017-01-01 14:16:21 PMC13900/BCR-3-1-055.pdf 
None


In [39]:
member = tar_file.getmembers()[1]

In [40]:
member.path

'PMC13900/BCR-3-1-055.nxml'

In [41]:
tar_file.extractall(path=f"{NOTEBOOK_NAME}/delete", members=[member])

In [43]:
!ls -al {NOTEBOOK_NAME}/delete/PMC13900

total 9
drwxrwsr-x 2 strokach def-pmkim  4096 Dec  2 23:35 .
drwxrwsr-x 3 strokach def-pmkim  4096 Dec  2 23:35 ..
-rw-rw-r-- 1 strokach def-pmkim 52460 Jan  1  2017 BCR-3-1-055.nxml


In [35]:
member.name

'PMC13900'

In [36]:
member.

SyntaxError: invalid syntax (<ipython-input-36-e5ed5e978f0c>, line 1)