In [13]:
# imports

import zlib
import lzma
import gzip
import bz2
import os
import io
import sys
import time
import csv
import zstandard as zstd
import blosc as bl
import pickle
import re
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

In [14]:
# decorator to auth for google drive -- see: https://developers.google.com/drive/api/v3/quickstart/python
def auth_drive(func):
    def wrapper(*args, **kwargs):
        creds = None
        SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
        if os.path.exists('token.pickle'):
            with open('token.pickle', 'rb') as token:
                creds = pickle.load(token)
        # If there are no (valid) credentials available, let the user log in
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    'credentials.json', SCOPES)
                creds = flow.run_local_server()
            # Save the credentials for the next run
            with open('token.pickle', 'wb') as token:
                pickle.dump(creds, token)

        service =  build('drive', 'v2', credentials=creds)
        kwargs['service'] = service
        return func(*args, **kwargs)
            
    return wrapper

# generator that obtains and collates files
@auth_drive
def gen_links(service):
    results = service.children().list(folderId='1mHg9cmN6GrjEQCK02c_nd-FUHODbZ4Z5').execute()
    items = results.get('items', [])
    for item in items:
        print('generating item link')
        yield item['id']
        
@auth_drive
def gen_files(file_ids, service):
    for file_id in file_ids:
        result = service.files().get(fileId=file_id).execute()
        if re.search('genome\.(gz)$', result['title']):
            request = service.files().get_media(fileId=file_id)
            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while done is False:
                status, done = downloader.next_chunk()
                print("Download %d%%." % int(status.progress() * 100))
                
            # construct object with metadata?
            yield {'id': file_id, 'file': fh}       

In [19]:
def decompress(genomes):
    for genome in genomes:
        yield {'id': genome['id'], 'file': gzip.decompress(genome['file'].getvalue())}

In [16]:
# compression transforms
# binary file --> binary compressed file

def comp_ratio(compressor):
    def wrapper(*args, **kwargs):
        uncompressed_size = sys.getsizeof(args[0])
        compressed = compressor(*args, **kwargs)
        compressed_size = sys.getsizeof(compressed)
        return uncompressed_size / compressed_size
    return wrapper

@comp_ratio
def zlib_compress(f):
    c_z = zlib.compress(f,9)
    return c_z

@comp_ratio
def lzma_compress(f):
    c = lzma.LZMACompressor()
    c_l = c.compress(f)
    c_l += c.flush()
    return c_l

@comp_ratio
def bz2_compress(f):
    c = bz2.BZ2Compressor()
    c_b = c.compress(f)
    c_b += c.flush()
    return c_b

@comp_ratio
def zstd_compress(f):
    c = zstd.ZstdCompressor()
    c_zs = c.compress(f)
    return c_zs

@comp_ratio
def bl_compress(f):
    return bl.compress(f)

compress_transforms = [zlib_compress, lzma_compress, bz2_compress, zstd_compress, bl_compress]

In [21]:
#generator that applies list of transforms to list of files
def apply(genomes, transforms, log='results.csv'):
    with open(log, 'a', newline='') as outfile:
        writer = csv.writer(outfile)
        cols = ['genome'] + [t.__name__ for t in transforms]
        writer.writerow(cols)
        for genome in genomes:
            stats = []
            for transform in transforms:
                print('Applying {0} to {1}',transform.__name__, genome['id'])
                stats.append(transform(genome['file'])) # transform should output some stat for us to log
            writer.writerow([genome['id']] + stats)

In [22]:
apply(decompress(gen_files(gen_links())), compress_transforms)

generating item link
generating item link
Download 54%.
Download 100%.
Applying {0} to {1} wrapper 1OsHimlxsGE1D3M5tNXTAbnOuq9UAmN2k
Applying {0} to {1} wrapper 1OsHimlxsGE1D3M5tNXTAbnOuq9UAmN2k
Applying {0} to {1} wrapper 1OsHimlxsGE1D3M5tNXTAbnOuq9UAmN2k
Applying {0} to {1} wrapper 1OsHimlxsGE1D3M5tNXTAbnOuq9UAmN2k
Applying {0} to {1} wrapper 1OsHimlxsGE1D3M5tNXTAbnOuq9UAmN2k
generating item link
Download 100%.
Applying {0} to {1} wrapper 1s9dNf0i3kbW88HIVaeDppT7JILDgL2wo
Applying {0} to {1} wrapper 1s9dNf0i3kbW88HIVaeDppT7JILDgL2wo
Applying {0} to {1} wrapper 1s9dNf0i3kbW88HIVaeDppT7JILDgL2wo
Applying {0} to {1} wrapper 1s9dNf0i3kbW88HIVaeDppT7JILDgL2wo
Applying {0} to {1} wrapper 1s9dNf0i3kbW88HIVaeDppT7JILDgL2wo
generating item link
Download 100%.
Applying {0} to {1} wrapper 1bj4eybCY3XwujjYIKilegdtbj1q2pJ_C
Applying {0} to {1} wrapper 1bj4eybCY3XwujjYIKilegdtbj1q2pJ_C
Applying {0} to {1} wrapper 1bj4eybCY3XwujjYIKilegdtbj1q2pJ_C
Applying {0} to {1} wrapper 1bj4eybCY3XwujjYIKilegd

ValueError: bytesobj cannot be larger than 2147483631 bytes