# Summary

# Imports

In [1]:
%run _imports.ipynb

Setting the PYTHON_VERSION environment variable.
Setting the SPARK_MASTER environment variable.
Setting the DB_TYPE environment variable.
Setting the DB_PORT environment variable.


2017-12-03 14:45:53.051862


In [2]:
NOTEBOOK_NAME = 'download'
os.makedirs(NOTEBOOK_NAME, exist_ok=True)

# Download

## OA package

In [3]:
urllib.request.urlretrieve(
    'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.csv',
    f'{NOTEBOOK_NAME}/oa_file_list.csv'
)

('download/oa_file_list.csv', <email.message.Message at 0x2b4d685f4c50>)

In [4]:
oa_file_list = pd.read_csv(f'{NOTEBOOK_NAME}/oa_file_list.csv')

In [5]:
oa_file_list.head()

Unnamed: 0,File,Article Citation,Accession ID,Last Updated (YYYY-MM-DD HH:MM:SS),PMID,License
0,oa_package/08/e0/PMC13900.tar.gz,Breast Cancer Res. 2001 Nov 2; 3(1):55-60,PMC13900,2017-04-26 12:15:50,11250746.0,NO-CC CODE
1,oa_package/b0/ac/PMC13901.tar.gz,Breast Cancer Res. 2001 Nov 9; 3(1):61-65,PMC13901,2016-01-20 10:58:46,11250747.0,NO-CC CODE
2,oa_package/f7/98/PMC13902.tar.gz,Breast Cancer Res. 2001 Nov 8; 3(1):66-75,PMC13902,2006-02-02 19:37:52,11250748.0,NO-CC CODE
3,oa_package/9c/7f/PMC13911.tar.gz,Breast Cancer Res. 2000 Nov 16; 2(1):59-63,PMC13911,2013-03-17 14:00:52,11056684.0,NO-CC CODE
4,oa_package/c6/fb/PMC13912.tar.gz,Breast Cancer Res. 2000 Dec 6; 2(1):64-72,PMC13912,2013-03-17 14:00:52,11400682.0,NO-CC CODE


In [6]:
oa_package_df = oa_file_list[['File', 'Accession ID']].drop_duplicates()
oa_package_df.columns = ['file_path', 'pmc_id']

In [7]:
oa_package_df.head()

Unnamed: 0,file_path,pmc_id
0,oa_package/08/e0/PMC13900.tar.gz,PMC13900
1,oa_package/b0/ac/PMC13901.tar.gz,PMC13901
2,oa_package/f7/98/PMC13902.tar.gz,PMC13902
3,oa_package/9c/7f/PMC13911.tar.gz,PMC13911
4,oa_package/c6/fb/PMC13912.tar.gz,PMC13912


In [28]:
ARCHIVE_DIR = f"{NOTEBOOK_NAME}/pmc_0"
os.makedirs(ARCHIVE_DIR, exist_ok=True)

In [29]:
EXTENSIONS_TO_KEEP = ['.nxml', '.xml', '.csv', '.csv.gz', '.tsv', '.tsv.gz', '.xls', '.xlsx']

In [67]:
import datetime
import tarfile
from typing import List
import zipfile

import dateutil.parser


def extract_tar_file(filename, output_dir) -> List[str]:
    with tarfile.open(filename, 'r:gz') as tar_file:
        members = [
            m
            for m in tar_file.getmembers()
            if any(m.name.endswith(suffix) for suffix in EXTENSIONS_TO_KEEP)
        ]
        tar_file.extractall(path=output_dir, members=members)
    return [m.name for m in members]
    
    
def extract_zip_file(filename, output_dir) -> List[str]:
    with zipfile.ZipFile(filename) as zip_file:
        members = [
            m
            for m in zip_file.namelist()
            if any(m.endswith(suffix) for suffix in EXTENSIONS_TO_KEEP)
        ]
        zip_file.extractall(path=output_dir, members=members)
    return members    


def download(source_url, output_dir, overwrite=False):
    info_file = op.join(output_dir, 'info.json')
    
    if not overwrite and op.isfile(info_file):
        return {'error': "Info file already exists!"}

    with tempfile.NamedTemporaryFile() as tmp_file:
        urllib.request.urlretrieve(source_url, tmp_file.name)
        if source_url.endswith('.tar.gz'):
            extracted_files = extract_tar_file(tmp_file.name, output_dir)
        elif source_url.endswith('.zip'):
            extracted_files = extract_zip_file(tmp_file.name, output_dir)
        else:
            raise Exception(f"Unsupported archive {source_url}!")
        
        info = {
            'source_url': source_url,
            'created_on': datetime.datetime.now().isoformat(),
            'num_extracted_files': len(extracted_files),
        }
        os.makedirs(op.dirname(info_file), exist_ok=True)
        with open(info_file, 'wt') as fout:
            json.dump(info, fout)
    
    return info

In [68]:
source_url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/00/00/PMC1790863.tar.gz'
output_dir = op.join(ARCHIVE_DIR, 'oa_package/00/00/')

download(source_url, output_dir, True)

{'created_on': '2017-12-03T15:42:04.728418',
 'num_extracted_files': 1,
 'source_url': 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/00/00/PMC1790863.tar.gz'}

In [69]:
source_url = 'ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/NON-OA/PMC1125900-PMC1129899/PMC1125901.zip'
output_dir = op.join(ARCHIVE_DIR, 'suppl/NON-OA/PMC1125900-PMC1129899/')

download(source_url, output_dir, True)

{'created_on': '2017-12-03T15:42:06.007084',
 'num_extracted_files': 0,
 'source_url': 'ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/NON-OA/PMC1125900-PMC1129899/PMC1125901.zip'}

In [40]:
!ls {output_dir}

info.json  PMC1790863


In [43]:
!cat {output_dir}/info.json

{"source_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/00/00/PMC1790863.tar.gz", "created_on": "2017-12-03T15:19:07.495682"}

In [15]:
def download_file(file_path):
    pmc_id = op.basename(file_path).partition('.')[0]
    pmc_id_hash = hashlib.md5(pmc_id.encode('utf-8')).hexdigest()
    output_path = op.join(ARCHIVE_DIR, pmc_id_hash[0:2], pmc_id_hash[2:4])

    with tempfile.NamedTemporaryFile() as tmp_file:
        urllib.request.urlretrieve(f'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/{file_path}', tmp_file.name)
        with tarfile.open(tmp_file.name, 'r:gz') as tar_file:
            members = [
                m
                for m in tar_file.getmembers()
                if any(m.name.endswith(suffix) for suffix in EXTENSIONS_TO_KEEP)
            ]
            tar_file.extractall(path=output_path, members=members)

In [16]:
download_file(file_path)

NameError: name 'file_path' is not defined

In [None]:
with concurrent.futures.ProcessPoolExecutor(64) as pool:
    futures = pool.map(download_file, oa_package_df['file_path'].values)

In [89]:
{output_dir}

{'download_oa_package/oa_package_0/b5/c0/PMC13900'}

In [90]:
!ls {output_dir}

BCR-3-1-055.nxml  PMC13900


In [73]:
md5 = hashlib.md5('PMC2'.encode('utf-8'))
md5.hexdigest()

'ff6cf0a66226e4f916d5991840d1d99f'

In [38]:
    print(tar_file.getmembers())
    print(tar_file.getnames())
    print(tar_file.list())
        

[<TarInfo 'PMC13900' at 0x2ae234d4f1d8>, <TarInfo 'PMC13900/BCR-3-1-055.nxml' at 0x2ae234d4f048>, <TarInfo 'PMC13900/BCR-3-1-055.pdf' at 0x2ae23775a368>]
['PMC13900', 'PMC13900/BCR-3-1-055.nxml', 'PMC13900/BCR-3-1-055.pdf']
?rwxrwxr-x pmc/pmcdev          0 2017-01-01 14:16:21 PMC13900/ 
?rw-rw-r-- pmc/pmcdev      52460 2017-01-01 14:16:21 PMC13900/BCR-3-1-055.nxml 
?rw-rw-r-- pmc/pmcdev     106618 2017-01-01 14:16:21 PMC13900/BCR-3-1-055.pdf 
None


In [39]:
member = tar_file.getmembers()[1]

In [40]:
member.path

'PMC13900/BCR-3-1-055.nxml'

In [41]:
tar_file.extractall(path=f"{NOTEBOOK_NAME}/delete", members=[member])

In [43]:
!ls -al {NOTEBOOK_NAME}/delete/PMC13900

total 9
drwxrwsr-x 2 strokach def-pmkim  4096 Dec  2 23:35 .
drwxrwsr-x 3 strokach def-pmkim  4096 Dec  2 23:35 ..
-rw-rw-r-- 1 strokach def-pmkim 52460 Jan  1  2017 BCR-3-1-055.nxml


In [35]:
member.name

'PMC13900'

In [36]:
member.

SyntaxError: invalid syntax (<ipython-input-36-e5ed5e978f0c>, line 1)