In [45]:
import datetime
import urllib
prefix = '/Users/dd/Downloads/.metagraph_run37/'

fServer = open(f'{prefix}/server.log')
lines = fServer.readlines()


downloaded_sras = set()
build_sras = set()
clean_sras = set()
transfer_sras = set()
ndownloaded_sras = set()
nbuild_sras = set()
nclean_sras = set()
ntransfer_sras = set()
time_first = 0
time_last = 0
for l in lines:
    if not 'ack/' in l:
        continue
    split_l = l.split(' ')
    date_time = datetime.datetime.strptime(split_l[0] + ' ' + split_l[1], '%Y-%m-%d %H:%M:%S,%f')
    parsed = urllib.parse.parse_qs(split_l[4])
    
    sra_id = parsed['id'][0]
    if time_first == 0:
        time_first = date_time
    time_last = date_time
    if '/ack/download' in l:
        downloaded_sras.add(sra_id)
    elif '/ack/build' in l:
        build_sras.add(sra_id)
    elif '/ack/clean' in l:
        clean_sras.add(sra_id)
    elif '/ack/transfer' in l:
        transfer_sras.add(sra_id)
    elif '/nack/download' in l:
        ndownloaded_sras.add(sra_id)
    elif '/nack/build' in l:
        nbuild_sras.add(sra_id)
    elif '/nack/clean' in l:
        nclean_sras.add(sra_id)
    elif '/nack/transfer' in l:
        ntransfer_sras.add(sra_id)

# remove SRAs that were re-tried and successfully processed after an initial failure
ndownloaded_sras = ndownloaded_sras.difference(downloaded_sras)
nbuild_sras = nbuild_sras.difference(build_sras)
nclean_sras = nclean_sras.difference(clean_sras)
print(f'Downloaded {len(downloaded_sras)}, Built {len(build_sras)}, Cleaned {len(clean_sras)}, Transferred {len(transfer_sras)}')
print(f'Not downloaded {len(ndownloaded_sras)}, Not built {len(nbuild_sras)}, Not cleaned {len(nclean_sras)}, Not transferred {len(ntransfer_sras)}')
    


Downloaded 42865, Built 33785, Cleaned 28937, Transferred 28846
Not downloaded 735, Not built 4294, Not cleaned 94, Not transferred 0


In [50]:
from collections import defaultdict
import urllib.parse

def format_bytes(size):
    # 2**10 = 1024
    power = 2**10
    n = 0
    power_labels = {0 : '', 1: 'K', 2: 'M', 3: 'G', 4: 'T', 5: 'P'}
    while size > power:
        size /= power
        n += 1
    return str(round(size,2)) + power_labels[n]+'B'

download_size_processed = 0
ndownload_size = 0
download_size_downloaded = 0
size_all = 0
download_size_processed_hist = defaultdict(int)
download_size_processeds = []
download_time = 0
sra_to_size = {}
ndownload_size_hist = defaultdict(int)
coverage = []
coverage_size = []
coverage_total_size = 0
seen_acks = set()
seen_nacks = set()
for l in lines:
    if not 'ack/down' in l:
        continue
    parsed = urllib.parse.parse_qs(l.split(' ')[4])
    sra_id = parsed['id'][0]
    size = float(parsed['download_size_mb'][0])
    
    if '/ack/download' in l and sra_id in downloaded_sras:
        if sra_id in seen_acks:
            continue
        
        seen_acks.add(sra_id)
        kmer_coverage = float(parsed['kmer_coverage'][0])
        unique_kmers = float(parsed['kmer_count_unique'][0])
        coverage.append(int(kmer_coverage))
        coverage_size.append(size)
        coverage_total_size += size
        download_size_processed_hist[int(size/100) if int(size/100)<200 else 200] += 1
        download_size_processeds.append(size/1e3)
        sra_to_size[sra_id] = size
        download_size_downloaded += size
        
        if sra_id in transfer_sras:
            download_time += int(l.split('&')[2].split('=')[1])
            download_size_processed += size
    elif '/nack/download' in l and sra_id in ndownloaded_sras:
        if sra_id in seen_nacks:  # download attempt that failed twice (e.g. after a re-process of failed downloads)
            continue
        seen_nacks.add(sra_id)
        ndownload_size_hist[int(size/100) if int(size/100)<200 else 200] += 1
        size_all += size
        ndownload_size += size
size_all += download_size_downloaded
print(f'Total size of all SRAs, including not downloaded: {format_bytes(1e6 * size_all)}')
print(f'Total size of downloaded SRAS: {format_bytes(1e6*download_size_downloaded)}')
print(f'Total download size (of finished SRAs): {format_bytes(download_size_processed)}')
print(f'Total not downloaded size: {format_bytes(ndownload_size)}')
print(f'Total download time (of finished SRAs): {download_time}s')
print(f'Download bandwidth (of finished SRAs): {round(download_size_processed/download_time,2)}MB/s/machine')


Total size of all SRAs, including not downloaded: 81.71TB
Total size of downloaded SRAS: 58.36TB
Total download size (of finished SRAs): 20.54MB
Total not downloaded size: 24.49MB
Total download time (of finished SRAs): 4803366s
Download bandwidth (of finished SRAs): 4.48MB/s/machine


In [51]:
time = 0
size_build = 0
size_build_sra = 0
size_not_built_sra = 0
build_size_hist = defaultdict(int)
build_sizes = []
build_size_to_time = {}
nbuild_size_hist = defaultdict(int)
nbuild_sizes = []
too_large = 0
nacked_builds = set()
acked_builds = set()

for d in lines:
    if not 'ack/build' in d:
        continue
    sz = sra_to_size[sra_id]
    parsed = urllib.parse.parse_qs(d.split(' ')[4])
    sra_id = parsed['id'][0]
    
    if '/ack/build' in d:
        if sra_id in acked_builds:  # account for double acks (e.g. when retrying a failed clean)
            continue
        acked_builds.add(sra_id)
        
        build_size_hist[int(sz/100) if int(sz/100)<200 else 200] += 1
        build_sizes.append(sz/1e3)
        tm = int(parsed['time'][0])
        size_build = float(parsed['size_mb'][0])
        size_build_sra += sz
        build_size_to_time[sz] = tm
        if sra_id in transfer_sras:
            time = time + tm      
    elif '/nack/build' in d:
        if sra_id in nacked_builds or sra_id in build_sras:
            continue
        nacked_builds.add(sra_id)
        nbuild_size_hist[int(sz/100) if int(sz/100)<200 else 200] += 1
        nbuild_sizes.append(sz/1e3)
        if 'required_ram_gb' in parsed:
            too_large += sz
        else:
            size_not_built_sra += sz

print(f'Total build time (of finished SRAs): {time}s\n')
print(f'Total SRA size of built SRAs: {format_bytes(1e6 * size_build_sra)}')
print(f'Total SRA size of failed SRAs: {format_bytes(1e6 * size_not_built_sra)}')
print(f'Total SRA size of too large SRAs: ', format_bytes(1e6 * too_large), '\n')
print(f'Total build size (of finished SRAs): {format_bytes(1e6 * size_build)}')
print(f'Build bandwidth is {download_size_processed/time}MB/s')


Total build time (of finished SRAs): 6425499s

Total SRA size of built SRAs: 41.25TB
Total SRA size of failed SRAs: 133.63GB
Total SRA size of too large SRAs:  11.13TB 

Total build size (of finished SRAs): 2.22GB
Build bandwidth is 3.352411977653426MB/s


In [57]:
time = 0
clean_size = 0
for d in lines:
    if not 'ack/clean' in d:
        continue
    parsed = urllib.parse.parse_qs(d.split(' ')[4])
    sra_id = parsed['id'][0]
    
    if sra_id in transfer_sras:
        time = time + int(parsed['time'][0])
        clean_size = clean_size + float(parsed['size_mb'][0])
        
    
print(f'Total clean time (of finished SRAs): {time}s')
print (f'Clean bandwidth is {download_size_processed} {time} {round(download_size_processed/time,2)}MB/s')
print(f'Total size of cleaned graphs: ', format_bytes(clean_size*1e6))
print(f'Compression factor is {download_size_processed/clean_size}')

Total clean time (of finished SRAs): 15077410s
Clean bandwidth is 21540919.81000011 15077410 1.43MB/s
Total size of cleaned graphs:  723.15GB
Compression factor is 27.741755287315513


In [19]:
# import matplotlib.pyplot as plt

# plt.bar(list(download_size_processed_hist.keys()), download_size_processed_hist.values(), color='g')
# plt.ylabel('count')
# plt.xlabel('size (100s of MB)')
# plt.title('SRA distribution by size (download successful)')
# plt.show()

# plt.bar(list(ndownload_size_hist.keys()), ndownload_size_hist.values(), color='g')
# plt.ylabel('count')
# plt.xlabel('size (100s of MB)')
# plt.title('SRA distribution by size (download failed)')
# plt.show()

# plt.hist(download_size_processeds, bins=range(0,80))

In [20]:
import collections

# plt.figure(1)
# plt.bar(list(build_size_hist.keys()), build_size_hist.values(), color='g')
# plt.ylabel('count')
# plt.xlabel('size (100s of MB)')
# plt.title('SRA distribution by size (build successful)')
# #plt.show()

# plt.figure(2)
# plt.bar(list(nbuild_size_hist.keys()), nbuild_size_hist.values(), color='g')
# plt.ylabel('count')
# plt.xlabel('size (100s of MB)')
# plt.title('SRA distribution by size (build failed)')
# #plt.show()

# plt.figure(3)
# plt.title('SRA histogram')
# plt.hist(build_sizes, bins=[0, 10, 20, 30, 40, 50])
# plt.hist(nbuild_sizes, bins=[0, 10, 20, 30, 40, 50])
# #plt.show()

# plt.figure(4)
# plt.title('Build time by size')
# od = collections.OrderedDict(sorted(build_size_to_time.items()))
# # plt.plot(list(od.keys()), list(od.values()))




In [21]:
# plt.figure(1)
# plt.title('K-mer coverage')
# plt.hist(coverage, bins=range(0,40), weights=coverage_size, density=True)
# plt.show()