In [11]:
import glob
import json
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
from functools import reduce
from pylab import rcParams
# Notebook-wide default figure size (width, height) for every matplotlib plot.
rcParams['figure.figsize'] = 10,10

In [22]:
# benchmarking throughput

# Product IDs to benchmark: fourteen consecutive IDs sharing one prefix,
# '30I8399019000' through '30I8399019013'.
target_ids = ['30I8399019{:03d}'.format(seq) for seq in range(14)]

# Directory the shell commands `cd` into before running anything; the two
# directories below are joined onto it.
base_dir = '../'
# Holds one sub-directory of *.las input files per product ID.
download_dir = 'tmp/collect-all'
# Where the benchmark run writes its stat-<id>.json result files.
benchmark_dir = 'tmp/benchmark_throughput'

def parse_elapsed(lines):
    """Convert `time`-style report lines into a dict of seconds.

    Each non-blank line must look like ``<key>\\t<minutes>m<seconds>s``
    (for example ``real\\t0m6.353s``).

    Args:
        lines: iterable of strings, one report line each. Blank or
            whitespace-only lines are ignored.

    Returns:
        dict mapping each key (the text before the tab) to the elapsed time
        in seconds as a float, i.e. ``minutes * 60 + seconds``.

    Raises:
        ValueError: if a non-blank line does not match the expected format;
            the message quotes the offending line.
    """
    ret = {}
    for line in lines:
        # The report may carry blank separator lines; skipping them keeps the
        # caller's slicing from having to be exact.
        if not line.strip():
            continue
        try:
            key, value = line.split('\t')
            # Drop the trailing unit so only "<minutes>m<seconds>" is left.
            m, s = value.replace('s', '').split('m')
            ret[key] = float(m) * 60 + float(s)
        except ValueError as err:
            # Replace the opaque unpacking / float() message with one that
            # shows which line could not be parsed.
            raise ValueError(
                'cannot parse elapsed-time line: {!r}'.format(line)
            ) from err
    return ret

def file_stat(files):
    """Count the given files and add up their sizes on disk.

    Args:
        files: iterable of file paths. May be empty.

    Returns:
        dict with ``'file_num'`` (number of paths) and ``'file_size'``
        (total size in bytes as reported by ``os.path.getsize``). An empty
        input yields 0 for both.

    Raises:
        OSError: propagated from ``os.path.getsize`` if a path does not
            exist or is inaccessible.
    """
    # Materialise once so generators work and can be both counted and summed.
    paths = list(files)
    return {
        'file_num': len(paths),
        # sum() starts at 0, so an empty list is fine (reduce() without an
        # initial value raised TypeError here).
        'file_size': sum(os.path.getsize(path) for path in paths),
    }

cmd_rm_stat = 'cd {base_dir} && mkdir -p {benchmark_dir} && rm -f {benchmark_dir}/stat-{id}.json'
cmd_calc_stat = 'cd {base_dir} && time python3 lastomesh.py --local-scheduler --workers 4 DownloadShizuokaPCD --product-id {id} --output-dir {benchmark_dir} --work-dir {download_dir}/{id}'

throughputs = []
for id in target_ids:

    # ファイル数, 容量
    las_files = list(glob.glob(os.path.join(base_dir, download_dir, id, '*.las')))
    fstat = file_stat(las_files)
    !{cmd_rm_stat.format(base_dir=base_dir, benchmark_dir=benchmark_dir, id=id)}

    # 読み込み&特徴量算出 時間計測
    output = !{cmd_calc_stat.format(base_dir=base_dir, download_dir=download_dir, benchmark_dir=benchmark_dir, id=id)}
    elapsed = parse_elapsed(output[-3:])

    # 点の数
    filepath = os.path.join(base_dir, benchmark_dir, 'stat-{}.json'.format(id))
    with open(filepath, 'r') as f:
        pcd_stat = json.load(f)

    # 結果まとめる
    record = dict(fstat, **elapsed, point_num=pcd_stat['shape']['value'][0], id=id)
    print(record)
    throughputs.append(record)

df_throughputs = pd.DataFrame(throughputs)
df_throughputs

{'file_num': 4, 'file_size': 641946890, 'real': 6.353, 'user': 15.615, 'sys': 2.917, 'point_num': 18880735, 'id': '30I8399019000'}
{'file_num': 4, 'file_size': 635859292, 'real': 9.813, 'user': 15.659, 'sys': 3.451, 'point_num': 18701690, 'id': '30I8399019001'}
{'file_num': 4, 'file_size': 641868078, 'real': 9.723, 'user': 15.506, 'sys': 3.543, 'point_num': 18878420, 'id': '30I8399019002'}
{'file_num': 4, 'file_size': 649677538, 'real': 9.866, 'user': 15.898, 'sys': 3.503, 'point_num': 19108110, 'id': '30I8399019003'}
{'file_num': 4, 'file_size': 648882006, 'real': 9.814, 'user': 15.642, 'sys': 3.718, 'point_num': 19084710, 'id': '30I8399019004'}
{'file_num': 4, 'file_size': 637587546, 'real': 9.79, 'user': 15.432, 'sys': 3.415, 'point_num': 18752520, 'id': '30I8399019005'}
{'file_num': 3, 'file_size': 605604597, 'real': 9.172, 'user': 14.535, 'sys': 3.49, 'point_num': 17811860, 'id': '30I8399019006'}
{'file_num': 3, 'file_size': 618411615, 'real': 9.528, 'user': 15.16, 'sys': 3.537, '

Unnamed: 0,file_num,file_size,real,user,sys,point_num,id
0,4,641946890,6.353,15.615,2.917,18880735,30I8399019000
1,4,635859292,9.813,15.659,3.451,18701690,30I8399019001
2,4,641868078,9.723,15.506,3.543,18878420,30I8399019002
3,4,649677538,9.866,15.898,3.503,19108110,30I8399019003
4,4,648882006,9.814,15.642,3.718,19084710,30I8399019004
5,4,637587546,9.79,15.432,3.415,18752520,30I8399019005
6,3,605604597,9.172,14.535,3.49,17811860,30I8399019006
7,3,618411615,9.528,15.16,3.537,18188535,30I8399019007
8,4,629775264,9.763,15.506,3.469,18522750,30I8399019008
9,4,650957842,10.053,15.797,3.648,19145765,30I8399019009


In [29]:
# Totals over every benchmarked product.
total_points = df_throughputs['point_num'].sum()
total_real = df_throughputs['real'].sum()      # seconds, from the `real` timing
total_bytes = df_throughputs['file_size'].sum()

# Total input volume expressed in units of 1024**3 bytes (GiB).
file_size_sum = total_bytes / (1024 ** 3)
# Points processed per second of `real` time.
throughput_point = total_points / total_real
# Seconds of `real` time spent per GiB of input.
elapsed_per_1gb = total_real / file_size_sum
print(throughput_point, elapsed_per_1gb, file_size_sum)

1959808.4257140527 16.114103065974156 8.380174772813916
