# <div align='center'> 自动统计训练过程CPU/GPU内存峰值信息 </div>

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import json, os, time
import hashlib
import pandas as pd
import multiprocessing, threading
from multiprocessing.queues import Empty
from k12libs.utils.nb_easy import k12ai_train_execute
from k12libs.utils.nb_easy import k12ai_get_data
from k12libs.utils.nb_easy import k12ai_print

In [3]:
# Minimum free memory, in MB, that a memory report must show before the
# scheduler launches another training task (compared against the
# sys_cpu_memory_free_MB / sys_gpu_memory_free_MB values of finished tasks).
cpu_reserve_mem = 5000
gpu_reserve_mem = 5000
# CSV file that accumulates one row of memory statistics per finished task.
memstat_file = 'memstat/0316.csv'

In [4]:
# Backbones covered by the sweep.
models = (
    'vgg11', 'vgg16', 'vgg19', 'vgg16_bn', 'vgg19_bn',
    'resnet18', 'resnet50', 'resnet101', 'resnet152',
)
# Datasets covered by the sweep.
# 'dogsVsCats' and 'Boats' are not in the list (presumably left out on
# purpose -- TODO confirm).
datasets = (
    'Fruits360', 'cellular', 'kannada', 'FashionMNIST', 'EMNIST_Digits',
    'cifar10', 'mnist', 'Chars74K', 'cactus', 'EMNIST_MNIST',
    'Dogs', 'EMNIST_Letters', 'Animals', 'EMNIST_Balanced',
)
# Batch sizes and input sizes tried for every model/dataset pair.
batchsizes = (16, 32, 64, 128)
inputsizes = (28, 32, 64, 96, 128, 224)

In [5]:
def gen_key(backbone, dataset, batchsize, inputsize=32):
    """Return a short, deterministic identifier for one training configuration.

    The identifier is the first six hex digits of the MD5 digest of
    ``cls{backbone}{dataset}{batchsize}{inputsize}`` followed by ``-0``.
    """
    # NOTE(review): only 24 bits of the digest are kept, so two different
    # configurations can end up with the same key.
    fingerprint = f'cls{backbone}{dataset}{batchsize}{inputsize}'
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    return digest[:6] + '-0'

def check_exist(backbone, dataset, batchsize, inputsize):
    """Tell whether this configuration already has a row in ``memstat_file``.

    Returns ``False`` when the CSV file does not exist yet.
    """
    if not os.path.exists(memstat_file):
        return False
    recorded_ids = pd.read_csv(memstat_file, index_col='id').index
    return gen_key(backbone, dataset, batchsize, inputsize) in recorded_ids
    
def write_csv(backbone, dataset, batchsize, inputsize, uptime, memstat):
    """Append one task's memory statistics to ``memstat_file``.

    Args:
        backbone: Model name; stored in the ``model`` column.
        dataset: Dataset name.
        batchsize: Batch size of the task.
        inputsize: Input size of the task.
        uptime: Value stored in the ``uptime`` column.
        memstat: Mapping of statistic name to value; every key becomes a
            column of the CSV.

    Returns:
        The DataFrame with the CSV content after the call. When a row with
        the same id is already present, nothing is written and the current
        file content is returned.
    """
    key = gen_key(backbone, dataset, batchsize, inputsize)
    row = {
        'id': key,
        'model': backbone,
        'dataset': dataset,
        'batchsize': batchsize,
        'inputsize': inputsize,
        'uptime': uptime,
        **memstat
    }
    # Keep the column order explicit: id/model/... first, then the statistics.
    new_row = pd.DataFrame([row], columns=list(row))
    if os.path.exists(memstat_file):
        memstat_df = pd.read_csv(memstat_file)
        if (memstat_df['id'] == key).any():
            return memstat_df
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to add rows.
        memstat_df = pd.concat([memstat_df, new_row], ignore_index=True)
    else:
        memstat_df = new_row
        # The first write would otherwise fail if the target directory
        # (e.g. 'memstat/') has not been created yet.
        out_dir = os.path.dirname(memstat_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    memstat_df.to_csv(memstat_file, index=False)
    return memstat_df

# memstat_file = '/tmp/test.csv'
# memstat_df = write_csv('vgg11', 'Animals', 16, 32, 100, memstat={
#     'app_cpu_memory_usage_MB': 1.0,
#     'app_gpu_memory_usage_MB': 1.0,
#     'sys_cpu_memory_free_MB': 1.0,
#     'sys_gpu_memory_free_MB': 1.0,
#     'app_cpu_max_memory_children_MB': 1.0,
#     'app_gpu_max_memory_cached_MB': 1.0,
#     'app_gpu_memory_allocated_MB': 1.0,
#     'app_gpu_memory_cached_MB': 1.0,
# })
# memstat_df

In [6]:
# Carries (cpu_free, gpu_free) tuples from the result-polling thread to the
# scheduling thread; each message makes the scheduler consider launching
# more tasks.
monitor_queue = multiprocessing.Queue()

# (key, task) pairs for tasks that were submitted and not yet reaped.
tasks_running = []
# Task dicts that still have to be submitted.
tasks_waiting = []

def tasks_generator(models, datasets, batchsizes, inputsizes):
    """Build the task list for every combination not yet in ``memstat_file``.

    Args:
        models: Iterable of backbone names.
        datasets: Iterable of dataset names.
        batchsizes: Iterable of batch sizes.
        inputsizes: Iterable of input sizes.

    Returns:
        A list of dicts with the keys ``backbone``, ``dataset``,
        ``batchsize`` and ``inputsize``, ordered model-major, for every
        combination whose key is not already recorded in the CSV.
    """
    # Read the recorded ids once; checking each combination against the file
    # separately would re-parse the CSV thousands of times for a full sweep.
    if os.path.exists(memstat_file):
        recorded = set(pd.read_csv(memstat_file, index_col='id').index)
    else:
        recorded = set()
    tasks = []
    for m in models:
        for d in datasets:
            for b in batchsizes:
                for i in inputsizes:
                    if gen_key(m, d, b, i) in recorded:
                        continue
                    tasks.append({'backbone': m, 'dataset': d, 'batchsize': b, 'inputsize': i})
    return tasks

def waiting2running():
    """Submit the oldest waiting task and record it as running.

    Returns:
        ``True`` if a task was submitted, ``False`` if nothing was waiting.
    """
    global tasks_running, tasks_waiting
    if not tasks_waiting:
        print("no waiting task to run")
        return False
    next_task = tasks_waiting.pop(0)
    response = k12ai_train_execute('k12cv', 'cls', 'base_model', **next_task)
    # The first element of the response is the key used later to poll results.
    tasks_running.append((response[0], next_task))
    report = (len(tasks_waiting), len(tasks_running), next_task)
    print('waiting[%d] running[%d] execute: %s' % report)
    return True
                
def tasks_queue_work():
    """Scheduler loop: launch waiting tasks whenever memory reports allow it.

    Starts up to three tasks immediately, then waits for (cpu_free, gpu_free)
    reports on ``monitor_queue``. A report at or above both reserve thresholds
    launches one task, and a second one when more than 15000 of GPU memory is
    free. Returns as soon as a launch finds no waiting task.
    """
    print('start tasks_queue_work')
    # Prime the pipeline before any memory report is available.
    for _ in range(3):
        waiting2running()
    while True:
        try:
            cpu_free, gpu_free = monitor_queue.get(True, timeout=10)
            enough = cpu_free >= cpu_reserve_mem and gpu_free >= gpu_reserve_mem
            if not enough:
                print(f'[Low Memory] cpu_free: {cpu_free}, gpu_free: {gpu_free}')
                continue
            if not waiting2running():
                return
            # Plenty of GPU memory left: start a second task right away.
            if gpu_free > 15000 and not waiting2running():
                return
        except Empty:
            # No report within the timeout; keep waiting.
            continue

def tasks_result_work():
    """Poll running tasks, record finished ones and requeue failed ones.

    For every (key, task) in ``tasks_running`` the task status is fetched with
    ``k12ai_get_data``. Tasks still starting/running are left alone. A
    finished task has its memory statistics written with ``write_csv`` and its
    free-memory figures sent to ``monitor_queue``. A failed task is appended
    back to ``tasks_waiting``. The function returns after a task finishes
    while both task lists are empty.
    """
    print("start tasks_result_work")
    global tasks_running, tasks_waiting
    while True:
        # Work on a snapshot: entries are removed from tasks_running below,
        # and removing from a list while iterating it skips the next entry.
        snapshot = list(tasks_running)
        if not snapshot:
            # Nothing to poll right now; sleep instead of spinning the CPU.
            time.sleep(5)
            continue
        for key, task in snapshot:
            data = k12ai_get_data(key, 'error', rm=True)
            data = data[0]['value']['data']['expand'] if data else None
            if not data or data['status'] in ('starting', 'running'):
                time.sleep(5)
                continue
            tasks_running.remove((key, task))
            if data['status'] == 'finish':
                print('key:%s, task:%s Finished' % (key, task))
                cpu_free = data['memstat']['sys_cpu_memory_free_MB']
                gpu_free = data['memstat']['sys_gpu_memory_free_MB'][0] # GPU-0
                monitor_queue.put((cpu_free, gpu_free))
                write_csv(**task, uptime=data['uptime'], memstat=data['memstat'])
                if len(tasks_running) == 0 and len(tasks_waiting) == 0:
                    print("no task!")
                    return
            else:
                print('key:%s, task:%s Error[%s]' % (key, task, data['errinfo']['err_text']))
                # NOTE(review): failed tasks are requeued without a retry limit.
                tasks_waiting.append(task)
                if len(tasks_running) == 0:
                    # Nothing is running, so no finish report will arrive;
                    # send a synthetic report at the reserve thresholds so the
                    # scheduler starts a task again.
                    monitor_queue.put((cpu_reserve_mem, gpu_reserve_mem))
                    if tasks_waiting[0]['batchsize'] <= 64:
                        # Small batch at the head of the queue: allow a second launch.
                        monitor_queue.put((cpu_reserve_mem, gpu_reserve_mem))

In [7]:
def test_memstat(models, datasets, batchsizes, inputsizes):
    """Run the memory-statistics sweep for the given combinations.

    Builds the list of not-yet-recorded tasks, then runs the scheduler
    (``tasks_queue_work``) and the result poller (``tasks_result_work``) in
    two threads and waits for both to finish.

    Args:
        models: Iterable of backbone names.
        datasets: Iterable of dataset names.
        batchsizes: Iterable of batch sizes.
        inputsizes: Iterable of input sizes.
    """
    global tasks_waiting
    tasks_waiting = tasks_generator(models, datasets, batchsizes, inputsizes)
    if len(tasks_waiting) == 0:
        # Fixed: the variable name was misspelled, raising NameError here.
        print(f'already record, see {memstat_file}')
        return
    t1 = threading.Thread(target=tasks_queue_work, args=())
    t1.start()
    # Give the scheduler a head start so the first tasks are submitted
    # before polling begins.
    time.sleep(1)
    t2 = threading.Thread(target=tasks_result_work, args=())
    t2.start()
    t1.join()
    t2.join()

## ALL Test

In [None]:
# Full sweep: every model / dataset / batch size / input size combination
# that is not yet recorded in memstat_file.
test_memstat(models, datasets, batchsizes, inputsizes)

start tasks_queue_work
start tasks_result_work


## vgg11 + vgg16 + vgg19

In [None]:
# VGG-only sweep. `bses` was undefined and the required `inputsizes`
# argument was missing, so the call could not run.
test_memstat(['vgg11', 'vgg16', 'vgg19'], datasets, batchsizes, inputsizes)

## resnet18 + resnet50

In [None]:
# resnet18/resnet50 sweep. `bses` was undefined and the required
# `inputsizes` argument was missing, so the call could not run.
test_memstat(['resnet18', 'resnet50'], datasets, batchsizes, inputsizes)

## resnet101 + resnet152

In [None]:
# Disabled sweep for the two largest ResNets:
# test_memstat(['resnet101', 'resnet152'], datasets, bses)
# Instead, a single resnet152 task is submitted directly, without the
# scheduler/poller threads; nothing here records its result in memstat_file.
task = {'backbone': 'resnet152', 'dataset': 'dogsVsCats', 'batchsize': 64}
k12ai_train_execute('k12cv', 'cls', 'base_model', **task)

## vgg16_bn + vgg19_bn

In [None]:
# Batch-norm VGG sweep. `bses` was undefined and the required `inputsizes`
# argument was missing, so the call could not run.
test_memstat(['vgg16_bn', 'vgg19_bn'], datasets, batchsizes, inputsizes)

In [None]:
# Load the recorded statistics indexed by task id and preview the first rows.
memstat_df = pd.read_csv(memstat_file).set_index('id')
memstat_df[:5]