In [41]:
# imports

import zlib
import lzma
import gzip
import bz2
import os
import io
import sys
import time
import csv
import zstandard as zstd
import blosc as bl
import pickle
import re
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

In [42]:
# decorator to auth for google drive -- see: https://developers.google.com/drive/api/v3/quickstart/python
def auth_drive(func):
    def wrapper(*args, **kwargs):
        creds = None
        SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
        if os.path.exists('token.pickle'):
            with open('token.pickle', 'rb') as token:
                creds = pickle.load(token)
        # If there are no (valid) credentials available, let the user log in
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    'credentials.json', SCOPES)
                creds = flow.run_local_server()
            # Save the credentials for the next run
            with open('token.pickle', 'wb') as token:
                pickle.dump(creds, token)

        service =  build('drive', 'v2', credentials=creds)
        kwargs['service'] = service
        return func(*args, **kwargs)
            
    return wrapper

# generator that obtains and collates files
@auth_drive
def gen_links(service):
    results = service.children().list(folderId='1mHg9cmN6GrjEQCK02c_nd-FUHODbZ4Z5').execute()
    items = results.get('items', [])
    for item in items:
        print('generating item link')
        yield item['id']
        
@auth_drive
def gen_files(file_ids, service):
    for file_id in file_ids:
        result = service.files().get(fileId=file_id).execute()
        if re.search('genome\.(gz)$', result['title']):
            request = service.files().get_media(fileId=file_id)
            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while done is False:
                status, done = downloader.next_chunk()
                print("Download %d%%." % int(status.progress() * 100))
                
            # construct object with metadata?
            yield {'id': file_id, 'file': fh}       

In [51]:
def decompress(genomes):
    for genome in genomes:
        yield {'id': genome['id'], 'file': io.BytesIO(gzip.decompress(genome['file'].getvalue()))}

In [63]:
# compression transforms
# binary file --> binary compressed file

COMP_BUFFER_SIZE = 2000000000#2147483631
CHUNK_SIZE = 100000000

def comp_ratio(compressor):
    def wrapper(*args, **kwargs):
        uncompressed_size = sys.getsizeof(args[0])
        compressed_size = sum(sys.getsizeof(compressed) for compressed in compressor(*args, **kwargs))
        return uncompressed_size / compressed_size
    return wrapper

@comp_ratio
def zlib_compress(f):
    f.seek(0)
    c = zlib.compressobj(level=9)
    done = False
    while not done:
        chunk = f.read(CHUNK_SIZE)
        done = len(chunk) == 0
        if done:
            yield c.flush()
        else:
            yield c.compress(chunk)
@comp_ratio
def lzma_compress(f):
    f.seek(0)
    c = lzma.LZMACompressor()
    done = False
    while not done:
        chunk = f.read(CHUNK_SIZE)
        done = len(chunk) == 0
        if done:
            yield c.flush()
        else:
            yield c.compress(chunk)

@comp_ratio
def bz2_compress(f):
    f.seek(0)
    c = bz2.BZ2Compressor()
    done = False
    while not done:
        chunk = f.read(CHUNK_SIZE)
        done = len(chunk) == 0
        if done:
            yield c.flush()
        else:
            yield c.compress(chunk)

@comp_ratio
def zstd_compress(f):
    f.seek(0)
    c = zstd.ZstdCompressor()
    done = False
    while not done:
        chunk = f.read(CHUNK_SIZE)
        done = len(chunk) == 0
        if not done:
            yield c.compress(chunk)

@comp_ratio
def bl_compress(f):
    f.seek(0)
    done = False
    while not done:
        chunk = f.read(CHUNK_SIZE)
        done = len(chunk) == 0
        if not done:
            yield bl.compress(chunk)

compress_transforms = [zlib_compress, lzma_compress, bz2_compress, zstd_compress, bl_compress]

In [64]:
#generator that applies list of transforms to list of files
def apply(genomes, transforms, log='results2.csv'):
    with open(log, 'a', newline='') as outfile:
        writer = csv.writer(outfile)
        cols = ['genome'] + [t.__name__ for t in transforms]
        writer.writerow(cols)
        for genome in genomes:
            stats = []
            for transform in transforms:
                print('Applying {0} to {1}',transform.__name__, genome['id'])
                stats.append(transform(genome['file'])) # transform should output some stat for us to log
            print([genome['id'] + stats])
            writer.writerow([genome['id']] + stats)

In [65]:
apply(decompress(gen_files(gen_links())), compress_transforms)

generating item link
generating item link
generating item link
Download 54%.
Download 100%.
Applying {0} to {1} wrapper 1OsHimlxsGE1D3M5tNXTAbnOuq9UAmN2k
Applying {0} to {1} wrapper 1OsHimlxsGE1D3M5tNXTAbnOuq9UAmN2k
Applying {0} to {1} wrapper 1OsHimlxsGE1D3M5tNXTAbnOuq9UAmN2k
Applying {0} to {1} wrapper 1OsHimlxsGE1D3M5tNXTAbnOuq9UAmN2k
Applying {0} to {1} wrapper 1OsHimlxsGE1D3M5tNXTAbnOuq9UAmN2k


TypeError: can only concatenate str (not "list") to str

In [26]:
b = io.BytesIO(b'232342545')

In [33]:
b.read(100)

b'232342545'

In [61]:
def f(x):
    for i in range(10):
        yield i

In [62]:
[i for i in f(10)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]