# Summary

Compress each downloaded PMC directory into a zip archive and delete the original directory afterwards. (In subsequent runs, compression should happen automatically as part of the download step.)

# Imports

In [1]:
%run _imports.ipynb

Setting the PYTHON_VERSION environment variable.
Setting the SPARK_MASTER environment variable.
Setting the DB_TYPE environment variable.
Setting the DB_PORT environment variable.


2017-12-07 16:28:14.428551


In [2]:
# Working directory for this notebook's outputs; OUTPUT_DIR is created underneath it.
NOTEBOOK_NAME = 'compress'
os.makedirs(NOTEBOOK_NAME, exist_ok=True)

# Functions

In [3]:
def get_pmc_root_dir(pmc_id, output_dir):
    """Return the relative directory that holds the files of `pmc_id`.

    Some `output_dir` values already end with the PMC id (for example
    ``suppl/OA/preprint/PPR7031``) and are returned unchanged. Others stop at
    the parent directory (for example ``oa_package/08/e0/``), in which case
    the PMC id is appended as the last path component.

    Args:
        pmc_id: Identifier of the article, e.g. ``'PMC13900'``.
        output_dir: Relative output directory, with or without a trailing
            slash.

    Returns:
        The path string of the directory specific to `pmc_id`.
    """
    # Trailing / leading slashes are ignored when checking for the id suffix.
    has_id_suffix = output_dir.strip('/').endswith(pmc_id)
    return output_dir if has_id_suffix else op.join(output_dir, pmc_id)

# Compress downloads

In [4]:
# Root of the downloaded directories that compress_pmc_dir reads from (and deletes after archiving).
SOURCE_DIR = Path('download/pmc_3/').absolute()
SOURCE_DIR

PosixPath('/project/6008029/strokach/datapkg/pmc_tables/notebooks/download/pmc_3')

In [5]:
# Root under which compress_pmc_dir writes the zip archives.
OUTPUT_DIR = Path(f'{NOTEBOOK_NAME}/pmc_3/').absolute()
OUTPUT_DIR

PosixPath('/project/6008029/strokach/datapkg/pmc_tables/notebooks/compress/pmc_3')

#### `to_download`

In [6]:
# One row per package: `pmc_id`, its FTP `source_url`, and the relative `output_dir` it was downloaded to.
to_download = pd.read_csv('pmc_statistics/to_download.csv')

In [7]:
to_download.head()

Unnamed: 0,pmc_id,source_url,output_dir
0,PMC13900,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/e0/PMC13900.tar.gz,oa_package/08/e0/
1,PMC13901,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b0/ac/PMC13901.tar.gz,oa_package/b0/ac/
2,PMC13902,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f7/98/PMC13902.tar.gz,oa_package/f7/98/
3,PMC13911,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/9c/7f/PMC13911.tar.gz,oa_package/9c/7f/
4,PMC13912,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/c6/fb/PMC13912.tar.gz,oa_package/c6/fb/


In [8]:
to_download.tail()

Unnamed: 0,pmc_id,source_url,output_dir
2083565,PPR7031,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/OA/preprint/PPR7031.zip,suppl/OA/preprint/PPR7031
2083566,PPR7032,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/OA/preprint/PPR7032.zip,suppl/OA/preprint/PPR7032
2083567,PPR7033,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/OA/preprint/PPR7033.zip,suppl/OA/preprint/PPR7033
2083568,PPR7034,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/OA/preprint/PPR7034.zip,suppl/OA/preprint/PPR7034
2083569,PPR7035,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/OA/preprint/PPR7035.zip,suppl/OA/preprint/PPR7035


In [9]:
import hashlib
import zipfile


def _zip_directory(source_dir, archive_path):
    """Write the contents of `source_dir` into a new zip file at `archive_path`.

    Entry names are stored relative to `source_dir`. All paths are passed
    explicitly and the working directory is never changed, so this helper can
    run in several threads at once (unlike `shutil.make_archive` with
    `root_dir` on Python versions before 3.10.6).
    """
    source_dir = str(source_dir)
    with zipfile.ZipFile(str(archive_path), 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        for dirpath, dirnames, filenames in os.walk(source_dir):
            # Directory entries are stored too, so empty directories survive.
            for name in sorted(dirnames):
                path = os.path.join(dirpath, name)
                zf.write(path, os.path.relpath(path, source_dir))
            for name in filenames:
                path = os.path.join(dirpath, name)
                if os.path.isfile(path):
                    zf.write(path, os.path.relpath(path, source_dir))


def compress_pmc_dir(pmc_id, output_dir):
    """Zip the downloaded directory of `pmc_id` and delete the original.

    The source directory is ``SOURCE_DIR / get_pmc_root_dir(pmc_id, output_dir)``.
    The archive is written to ``OUTPUT_DIR/<d[0:2]>/<d[2:4]>/<pmc_id>.zip``,
    where ``d`` is the hex MD5 digest of `pmc_id`; the two digest-derived
    levels spread the archives over many subdirectories.

    Args:
        pmc_id: Identifier of the article, e.g. ``'PMC13900'``.
        output_dir: Relative directory the article was downloaded to (the
            `output_dir` column of `to_download`).

    Returns:
        None. Nothing is done if the source directory has no ``info.json``
        (presumably the marker of a completed download -- TODO confirm).

    Raises:
        OSError: If writing the archive or removing the source directory
            fails. The source directory is only removed after the archive has
            been written without error.
    """
    pmc_root_dir = get_pmc_root_dir(pmc_id, output_dir)
    logger.debug(pmc_root_dir)
    # Source dir
    source_dir = SOURCE_DIR.joinpath(pmc_root_dir)
    logger.debug(source_dir)
    if not source_dir.joinpath('info.json').is_file():
        logger.debug("Source dir does not contain info.json!")
        return
    # Output dir
    # NOTE: use the `pmc_id` argument, not a global `row`: this function runs
    # in worker threads where no per-call `row` exists.
    dgst = hashlib.md5(pmc_id.encode('utf-8')).hexdigest()
    output_dir = OUTPUT_DIR.joinpath(dgst[0:2]).joinpath(dgst[2:4])
    output_dir.mkdir(parents=True, exist_ok=True)
    logger.debug(output_dir)
    # Output file
    archive = output_dir.joinpath(f'{pmc_id}.zip')
    # Create archive
    _zip_directory(source_dir, archive)
    logger.debug(archive)
    # Remove source directory (only reached when the archive was written).
    shutil.rmtree(source_dir)

In [10]:
# Silence the per-directory logger.debug() messages emitted by compress_pmc_dir.
logger.setLevel(logging.INFO)

In [11]:
# Serial trial run over roughly the first 10,000 rows; kept for reference only.
# count = 0

# for row in tqdm.tqdm_notebook(to_download.itertuples(), total=len(to_download)):
#     compress_pmc_dir(row.pmc_id, row.output_dir)
#     count += 1
#     if count > 10_000:
#         break

In [None]:
# Number of `to_download` rows submitted to one thread pool at a time.
chunk_size = 50_000

In [None]:
# Compress `to_download` chunk by chunk; each chunk gets its own thread pool and
# progress bar. The range starts at chunk 8, i.e. row 8 * chunk_size
# (presumably earlier chunks were handled in a previous session -- TODO confirm).
for i in range(8, 42):
    chunk_offset = chunk_size * i
    # Positional slice; an out-of-range slice is simply empty.
    chunk = to_download.iloc[chunk_offset:chunk_offset + chunk_size]
    with concurrent.futures.ThreadPoolExecutor(64) as pool:
        # Map each future to its PMC id so failures can be reported by name.
        futures = {
            pool.submit(compress_pmc_dir, row.pmc_id, row.output_dir): row.pmc_id
            for row in chunk.itertuples()
        }
        for future in tqdm.tqdm_notebook(
                concurrent.futures.as_completed(futures),
                total=len(futures),
                desc=f"Chunk {i}"):
            # Exceptions raised in workers are stored on the future; report
            # them instead of dropping them, but keep processing the rest.
            error = future.exception()
            if error is not None:
                logger.error("Failed to compress %s: %r", futures[future], error)

A Jupyter Widget




A Jupyter Widget




A Jupyter Widget




A Jupyter Widget




A Jupyter Widget




A Jupyter Widget




A Jupyter Widget