In [2]:
import datetime
import urllib
import urllib.parse
# Directory that holds the logs of the run being analysed.
prefix = '/Users/dd/Downloads/.metagraph_run36/'

# Read the whole server log once; `lines` is re-scanned by the later cells.
# The `with` block closes the handle right after reading (a bare open() would
# keep it open for the lifetime of the kernel).
with open(f'{prefix}/server.log') as fServer:
    lines = fServer.readlines()

# Ids of the SRAs acknowledged (ack) or reported as failed (nack) per stage.
downloaded_sras = set()
build_sras = set()
clean_sras = set()
transfer_sras = set()
ndownloaded_sras = set()
nbuild_sras = set()
nclean_sras = set()
ntransfer_sras = set()

# (endpoint substring, target set) pairs, tested in order; the first match wins.
endpoint_to_sras = (
    ('/ack/download', downloaded_sras),
    ('/ack/build', build_sras),
    ('/ack/clean', clean_sras),
    ('/ack/transfer', transfer_sras),
    ('/nack/download', ndownloaded_sras),
    ('/nack/build', nbuild_sras),
    ('/nack/clean', nclean_sras),
    ('/nack/transfer', ntransfer_sras),
)

# Timestamps of the first and last ack/nack entry; 0 means "none seen yet".
time_first = 0
time_last = 0
for l in lines:
    if 'ack/' not in l:  # the substring matches both '/ack/...' and '/nack/...'
        continue
    split_l = l.split(' ')
    # Fields 0 and 1 hold the date and the time (with fractional seconds).
    date_time = datetime.datetime.strptime(split_l[0] + ' ' + split_l[1], '%Y-%m-%d %H:%M:%S,%f')
    # Field 4 carries the URL-encoded request parameters.
    parsed = urllib.parse.parse_qs(split_l[4])

    sra_id = parsed['id'][0]
    if time_first == 0:
        time_first = date_time
    time_last = date_time
    for endpoint, sras in endpoint_to_sras:
        if endpoint in l:
            sras.add(sra_id)
            break

# remove SRAs that were re-tried and successfully processed after an initial failure
ndownloaded_sras = ndownloaded_sras.difference(downloaded_sras)
nbuild_sras = nbuild_sras.difference(build_sras)
nclean_sras = nclean_sras.difference(clean_sras)
# Same rule for the transfer stage, so all four "Not ..." counts are comparable.
ntransfer_sras = ntransfer_sras.difference(transfer_sras)
print(f'Downloaded {len(downloaded_sras)}, Built {len(build_sras)}, Cleaned {len(clean_sras)}, Transferred {len(transfer_sras)}')
print(f'Not downloaded {len(ndownloaded_sras)}, Not built {len(nbuild_sras)}, Not cleaned {len(nclean_sras)}, Not transferred {len(ntransfer_sras)}')
    


Downloaded 1207, Built 952, Cleaned 879, Transferred 879
Not downloaded 227, Not built 188, Not cleaned 0, Not transferred 0


In [4]:
from collections import defaultdict
import urllib.parse

# Download-stage statistics. Sizes come from the `size_mb` request parameter.
download_size_processed = 0    # size of downloads whose SRA was also transferred
ndownload_size = 0             # size of downloads that never succeeded
download_size_downloaded = 0   # size of every successful download
size_all = 0
download_size_processed_hist = defaultdict(int)  # 100 MB bucket -> count (successful)
download_size_processeds = []                    # successful download sizes / 1e3
download_time = 0                                # download time of transferred SRAs
sra_to_size = {}                                 # SRA id -> download size, reused by later cells
ndownload_size_hist = defaultdict(int)           # 100 MB bucket -> count (failed)
coverage = []        # truncated k-mer coverage per downloaded SRA
coverage_size = []   # download size matching each `coverage` entry
coverage_total_size = 0
seen_acks = set()    # ids already counted as downloaded
seen_nacks = set()   # ids already counted as failed
for l in lines:
    if 'ack/down' not in l:  # keeps both '/ack/download' and '/nack/download' entries
        continue
    parsed = urllib.parse.parse_qs(l.split(' ')[4])
    sra_id = parsed['id'][0]
    size = float(parsed['size_mb'][0])
    # Histogram bucket in units of 100 MB; everything from 20000 MB up shares bucket 200.
    size_bucket = min(int(size / 100), 200)

    if '/ack/download' in l and sra_id in downloaded_sras:
        if sra_id in seen_acks:  # count every SRA once, even if it was acked repeatedly
            continue
        seen_acks.add(sra_id)
        kmer_coverage = float(parsed['kmer_coverage'][0])
        unique_kmers = float(parsed['kmer_count_unique'][0])
        coverage.append(int(kmer_coverage))
        coverage_size.append(size)
        coverage_total_size += size
        download_size_processed_hist[size_bucket] += 1
        download_size_processeds.append(size/1e3)
        sra_to_size[sra_id] = size
        download_size_downloaded += size

        if sra_id in transfer_sras:
            # The third '&'-separated field is read as the download time
            # (reported below in seconds) -- TODO confirm against the log format.
            download_time += int(l.split('&')[2].split('=')[1])
            download_size_processed += size
    elif '/nack/download' in l and sra_id in ndownloaded_sras:
        if sra_id in seen_nacks:  # download attempt that failed twice (e.g. after a re-process of failed downloads)
            continue
        seen_nacks.add(sra_id)
        ndownload_size_hist[size_bucket] += 1
        size_all += size
        ndownload_size += size
# NOTE(review): only the *finished* downloads are added here, so SRAs that were
# downloaded but never transferred are missing from the "all SRAs" total --
# confirm whether download_size_downloaded was intended instead.
size_all += download_size_processed
# Without any finished SRA the download time is 0; report NaN instead of
# aborting the cell with ZeroDivisionError.
if download_time:
    download_bandwidth = round(download_size_processed/download_time, 2)
else:
    download_bandwidth = float('nan')
print(f'Total size of all SRAs, including not downloaded: {round(size_all,2)}MB')
print(f'Total size of downloaded SRAS: {round(download_size_downloaded,2)}MB')
print(f'Total download size (of finished SRAs): {round(download_size_processed,2)}MB')
print(f'Total not downloaded size: {round(ndownload_size,2)}MB')
print(f'Total download time (of finished SRAs): {download_time}s')
print(f'Download bandwidth (of finished SRAs): {download_bandwidth}MB/s/machine')


Total size of all SRAs, including not downloaded: 2380175.43MB
Total size of downloaded SRAS: 1778889.35MB
Total download size (of finished SRAs): 1120246.95MB
Total not downloaded size: 1259928.48MB
Total download time (of finished SRAs): 355959s
Download bandwidth (of finished SRAs): 3.15MB/s/machine


In [5]:
# Build-stage statistics.
time = 0            # build time of SRAs that were also transferred
size_download = 0   # download size of those same SRAs
size_build = 0      # sum of the fifth '&'-separated field of every build ack
build_size_hist = defaultdict(int)    # 100 MB bucket -> count (build succeeded)
build_sizes = []                      # download sizes / 1e3 (build succeeded)
build_size_to_time = {}               # download size -> build time (equal sizes overwrite)
nbuild_size_hist = defaultdict(int)   # 100 MB bucket -> count (build failed)
nbuild_sizes = []                     # download sizes / 1e3 (build failed)
too_large = 0       # download size of failed builds that mention 'required_ram_gb'

for d in lines:
    if '/ack/build' in d:
        is_ack = True
    elif '/nack/build' in d:
        is_ack = False
    else:
        continue

    # Resolve the id on EVERY line. The nack branch used to reuse whatever
    # `sra_id` was left over from an earlier line, so failed builds were booked
    # under the wrong SRA's size. parse_qs on field 4 is the same lookup that
    # produced the keys of `sra_to_size` and the members of `transfer_sras`.
    sra_id = urllib.parse.parse_qs(d.split(' ')[4])['id'][0]
    sz = sra_to_size[sra_id]
    # Histogram bucket in units of 100 MB; everything from 20000 MB up shares bucket 200.
    size_bucket = min(int(sz / 100), 200)

    if is_ack:
        fields = d.split('&')
        build_size_hist[size_bucket] += 1
        build_sizes.append(sz/1e3)
        tm = float(fields[2].split('=')[1])
        size_build += float(fields[4].split('=')[1])
        build_size_to_time[sz] = tm
        if sra_id in transfer_sras:
            time = time + tm
            size_download += sz
    else:
        nbuild_size_hist[size_bucket] += 1
        nbuild_sizes.append(sz/1e3)
        if 'required_ram_gb' in d:
            too_large += sz

# Without any finished SRA the build time is 0; report NaN instead of
# aborting the cell with ZeroDivisionError.
build_bandwidth = size_download/time if time else float('nan')
print(f'Total build time (of finished SRAs): {time}s')
print(f'Total build size (of finished SRAs): {round(size_build,2)}MB')
print('Total size of too large SRAs: ', too_large)
print(f'Build bandwidth is {build_bandwidth}MB/s')


Total build time (of finished SRAs): 2116072.0s
Total build size (of finished SRAs): 12245943.34MB
Total size of too large SRAs:  0
Build bandwidth is 0.572730426941994MB/s


In [6]:
# Clean-stage statistics, restricted to SRAs that were also transferred.
time = 0         # summed value of the third '&'-separated field (printed as seconds)
clean_size = 0   # summed value of the fourth '&'-separated field
for d in lines:
    # Only successful acks count: the bare substring 'ack/clean' also matches
    # '/nack/clean' entries, which would add failed attempts to the totals.
    if '/ack/clean' not in d:
        continue
    fields = d.split('&')
    sra_id = fields[1].split('=')[1]
    if sra_id in transfer_sras:
        time = time + float(fields[2].split('=')[1])
        clean_size = clean_size + float(fields[3].split('=')[1])

# With no finished SRA both denominators are 0; report NaN instead of
# aborting the cell with ZeroDivisionError.
clean_bandwidth = round(download_size_processed/time, 2) if time else float('nan')
compression_factor = download_size_processed/clean_size if clean_size else float('nan')

print(f'Total clean time (of finished SRAs): {time}s')
print(f'Clean bandwidth is {download_size_processed} {time} {clean_bandwidth}MB/s')
print(f'Compression factor is {compression_factor}')

Total clean time (of finished SRAs): 6635018.0s
Clean bandwidth is 1120246.9500000007 6635018.0 0.17MB/s
Compression factor is 1.6657670781255156


In [1]:
# Disabled cell: bar charts of the download size histograms (successful and
# failed) plus a histogram of the successful download sizes.
# import matplotlib.pyplot as plt

# plt.bar(list(download_size_processed_hist.keys()), download_size_processed_hist.values(), color='g')
# plt.ylabel('count')
# plt.xlabel('size (100s of MB)')
# plt.title('SRA distribution by size (download successful)')
# plt.show()

# plt.bar(list(ndownload_size_hist.keys()), ndownload_size_hist.values(), color='g')
# plt.ylabel('count')
# plt.xlabel('size (100s of MB)')
# plt.title('SRA distribution by size (download failed)')
# plt.show()

# plt.hist(download_size_processeds, bins=range(0,80))

In [None]:
import collections

# Disabled cell: build-stage plots (size histograms for successful and failed
# builds, and build time ordered by size). Needs matplotlib imported as `plt`.
# plt.figure(1)
# plt.bar(list(build_size_hist.keys()), build_size_hist.values(), color='g')
# plt.ylabel('count')
# plt.xlabel('size (100s of MB)')
# plt.title('SRA distribution by size (build successful)')
# #plt.show()

# plt.figure(2)
# plt.bar(list(nbuild_size_hist.keys()), nbuild_size_hist.values(), color='g')
# plt.ylabel('count')
# plt.xlabel('size (100s of MB)')
# plt.title('SRA distribution by size (build failed)')
# #plt.show()

# plt.figure(3)
# plt.title('SRA histogram')
# plt.hist(build_sizes, bins=[0, 10, 20, 30, 40, 50])
# plt.hist(nbuild_sizes, bins=[0, 10, 20, 30, 40, 50])
# #plt.show()

# plt.figure(4)
# plt.title('Build time by size')
# od = collections.OrderedDict(sorted(build_size_to_time.items()))
# # plt.plot(list(od.keys()), list(od.values()))




In [None]:
# Disabled cell: histogram of k-mer coverage, weighted by download size.
# Needs matplotlib imported as `plt`.
# plt.figure(1)
# plt.title('K-mer coverage')
# plt.hist(coverage, bins=range(0,40), weights=coverage_size, density=True)
# plt.show()