# <div align='center'> 自动统计训练过程CPU/GPU内存峰值信息 </div>

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import json, os, time
import hashlib
import pandas as pd
import multiprocessing, threading
from multiprocessing.queues import Empty
from k12libs.utils.nb_easy import k12ai_train_execute
from k12libs.utils.nb_easy import k12ai_get_data
from k12libs.utils.nb_easy import k12ai_print

In [None]:
# Minimum free CPU / GPU memory, in MB, that a finished run must report
# (sys_cpu_memory_free_MB / sys_gpu_memory_free_MB) before tasks_queue_work
# launches another training task.
cpu_reserve_mem = 5000
gpu_reserve_mem = 5000
# CSV file that check_exist() reads and write_csv() appends one row per run to.
memstat_file = f'memstat/0307.csv'

In [None]:
# Dataset names and batch sizes passed to test_memstat() for the sweeps below.
datasets = ('Animals', 'Boats', 'cactus', 'cifar10', 'dogsVsCats', 'FashionMNIST', 'kannada', 'mnist')
bses = (16, 32, 64, 128)

In [None]:
def gen_key(backbone, dataset, batchsize):
    """Build the short row id for a (backbone, dataset, batchsize) combination.

    The id is the first six hex digits of the MD5 digest of the string
    ``cls<backbone><dataset><batchsize>``, followed by the literal ``-0``.
    """
    payload = 'cls{}{}{}'.format(backbone, dataset, batchsize)
    digest = hashlib.md5(payload.encode()).hexdigest()
    return digest[:6] + '-0'

def check_exist(backbone, dataset, batchsize):
    """Return True when the stats CSV already has a row for this combination.

    Returns False when ``memstat_file`` does not exist yet or does not
    contain the id produced by ``gen_key``.
    """
    if not os.path.exists(memstat_file):
        return False
    recorded_ids = pd.read_csv(memstat_file, index_col='id').index
    return gen_key(backbone, dataset, batchsize) in recorded_ids
    
def write_csv(backbone, dataset, batchsize, uptime, memstat):
    """Append one run's memory statistics to ``memstat_file``.

    Args:
        backbone: Model name, stored in the ``model`` column.
        dataset: Dataset name.
        batchsize: Batch size used for the run.
        uptime: Value stored in the ``uptime`` column.
        memstat: Mapping of statistic name to value; each key becomes a column.

    Returns:
        The DataFrame that mirrors the CSV contents. If a row with the same
        id is already recorded, the file is left untouched and the existing
        contents are returned.
    """
    ID = gen_key(backbone, dataset, batchsize)
    row = {
        'id': ID,
        'model': backbone,
        'dataset': dataset,
        'batchsize': batchsize,
        'uptime': uptime,
        **memstat
    }
    # Dict order gives the column order: id, model, dataset, batchsize,
    # uptime, then the memstat keys.
    row_df = pd.DataFrame([row])
    if os.path.exists(memstat_file):
        memstat_df = pd.read_csv(memstat_file)
        if (memstat_df['id'] == ID).any():
            return memstat_df
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        memstat_df = pd.concat([memstat_df, row_df], ignore_index=True)
    else:
        memstat_df = row_df
        # Create the target directory so a finished run is not lost just
        # because the output folder is missing.
        parent = os.path.dirname(memstat_file)
        if parent:
            os.makedirs(parent, exist_ok=True)
    memstat_df.to_csv(memstat_file, index=False)
    return memstat_df

# memstat_file = '/tmp/test.csv'
# memstat_df = write_csv('vgg11', 'Animals', 16, 100, memstat={
#     'app_cpu_memory_usage_MB': 1.0,
#     'app_gpu_memory_usage_MB': 1.0,
#     'sys_cpu_memory_free_MB': 1.0,
#     'sys_gpu_memory_free_MB': 1.0,
#     'app_cpu_max_memory_children_MB': 1.0,
#     'app_gpu_max_memory_cached_MB': 1.0,
#     'app_gpu_memory_allocated_MB': 1.0,
#     'app_gpu_memory_cached_MB': 1.0,
# })
# memstat_df

In [None]:
# Carries (cpu_free, gpu_free) tuples from tasks_result_work to
# tasks_queue_work; each tuple is a cue to try launching another task.
monitor_queue = multiprocessing.Queue()

# (key, task) pairs for tasks that were submitted and not yet collected.
tasks_running = []
# Task dicts (backbone / dataset / batchsize) still waiting to be submitted.
tasks_waiting = []

def tasks_generator(models, datasets, bses):
    """List every model / dataset / batch-size combination not yet recorded.

    Combinations are produced in nested order (models, then datasets, then
    batch sizes); any combination for which ``check_exist`` returns True is
    left out. Each entry is a dict with ``backbone``, ``dataset`` and
    ``batchsize`` keys.
    """
    return [
        {'backbone': model, 'dataset': dataset, 'batchsize': batchsize}
        for model in models
        for dataset in datasets
        for batchsize in bses
        if not check_exist(model, dataset, batchsize)
    ]

def waiting2running():
    """Submit the oldest waiting task and record it as running.

    Returns:
        False (after printing a notice) when no task is waiting, otherwise
        True once the task has been submitted and added to ``tasks_running``.
    """
    # Both lists are only mutated in place, so no ``global`` statement is needed.
    if not tasks_waiting:
        print("no waiting task to run")
        return False
    next_task = tasks_waiting.pop(0)
    submitted = k12ai_train_execute('k12cv', 'cls', 'base_model', **next_task)
    # The first element of the result is used as the key for polling later.
    tasks_running.append((submitted[0], next_task))
    print('waiting[%d] running[%d] execute: %s' % (len(tasks_waiting), len(tasks_running), next_task))
    return True
                
def tasks_queue_work():
    """Scheduler loop: launch waiting tasks as memory reports arrive.

    Tries to start two tasks immediately, then waits for ``(cpu_free,
    gpu_free)`` tuples on ``monitor_queue``. A report at or above both
    reserve thresholds launches the next waiting task. A low-memory report
    is logged and skipped while other tasks are still running, because one
    of them will produce a fresh report when it ends.

    Returns once nothing is waiting and nothing is running. The previous
    version polled forever if the last report was a low-memory one, and it
    quit while tasks were still running, so a task re-queued after a failure
    was never launched.
    """
    print('start tasks_queue_work')
    waiting2running()
    waiting2running()
    while True:
        try:
            cpu_free, gpu_free = monitor_queue.get(True, timeout=10)
        except Empty:
            # No report within the timeout: stop only when all work is done.
            if not tasks_waiting and not tasks_running:
                return
            continue
        enough_memory = cpu_free >= cpu_reserve_mem and gpu_free >= gpu_reserve_mem
        if not enough_memory:
            print(f'[Low Memory] cpu_free: {cpu_free}, gpu_free: {gpu_free}')
            if tasks_running:
                continue
            # Nothing is running, so no further report will arrive; launch
            # anyway rather than stall with tasks still waiting.
        if not waiting2running() and not tasks_running:
            return

def tasks_result_work():
    """Collector loop: poll running tasks and record or re-queue them.

    For each running task, fetches its status via ``k12ai_get_data``:

    * no data yet, or status ``starting`` / ``running``: wait 5 s and move on;
    * ``finish``: report the free CPU / GPU memory on ``monitor_queue``,
      write the statistics with ``write_csv``, and return when nothing is
      running or waiting any more;
    * anything else: print the error text and put the task back on
      ``tasks_waiting``. If no other task is running, push reserve-sized
      reports onto ``monitor_queue`` so the scheduler launches again (two
      reports when the next waiting task has batch size <= 64).
    """
    print("start tasks_result_work")
    while True:
        if not tasks_running:
            # Nothing to poll right now; sleep instead of busy-spinning.
            time.sleep(1)
            continue
        # Iterate over a snapshot: entries are removed from tasks_running
        # below, and removing from a list while iterating it skips items.
        for key, task in list(tasks_running):
            data = k12ai_get_data(key, 'error', rm=True)
            data = data[0]['value']['data']['expand'] if data else None
            if not data or data['status'] in ('starting', 'running'):
                time.sleep(5)
                continue
            tasks_running.remove((key, task))
            if data['status'] == 'finish':
                print('key:%s, task:%s Finished' % (key, task))
                cpu_free = data['memstat']['sys_cpu_memory_free_MB']
                gpu_free = data['memstat']['sys_gpu_memory_free_MB']
                monitor_queue.put((cpu_free, gpu_free))
                write_csv(**task, uptime=data['uptime'], memstat=data['memstat'])
                if not tasks_running and not tasks_waiting:
                    print("no task!")
                    return
            else:
                print('key:%s, task:%s Error[%s]' % (key, task, data['errinfo']['err_text']))
                tasks_waiting.append(task)
                if not tasks_running:
                    # No running task will report memory, so send a
                    # reserve-sized report to trigger the next launch.
                    monitor_queue.put((cpu_reserve_mem, gpu_reserve_mem))
                    if tasks_waiting[0]['batchsize'] <= 64:
                        monitor_queue.put((cpu_reserve_mem, gpu_reserve_mem))

In [None]:
def test_memstat(models, datasets, bses):
    """Run the memory-statistics sweep for the given combinations.

    Builds the list of combinations that are not yet recorded and, if any
    remain, runs ``tasks_queue_work`` and ``tasks_result_work`` in two
    threads and waits for both to finish.

    Args:
        models: Backbone names to sweep.
        datasets: Dataset names to sweep.
        bses: Batch sizes to sweep.
    """
    global tasks_waiting
    tasks_waiting = tasks_generator(models, datasets, bses)
    if not tasks_waiting:
        # Was `mamstat_file`, which raised NameError on this path.
        print(f'already record, see {memstat_file}')
        return
    scheduler = threading.Thread(target=tasks_queue_work, args=())
    scheduler.start()
    # Give the scheduler a moment to submit the first tasks before polling.
    time.sleep(1)
    collector = threading.Thread(target=tasks_result_work, args=())
    collector.start()
    scheduler.join()
    collector.join()

In [None]:
# test_memstat(['vgg11'], ['Animals'], [32, 64])

## vgg11 + vgg16 + vgg19

In [None]:
# Sweep all datasets and batch sizes for the plain VGG backbones; combinations
# already present in memstat_file are skipped by tasks_generator.
test_memstat(['vgg11', 'vgg16', 'vgg19'], datasets, bses)

## resnet18 + resnet50

In [None]:
# Sweep all datasets and batch sizes for resnet18 and resnet50; combinations
# already present in memstat_file are skipped by tasks_generator.
test_memstat(['resnet18', 'resnet50'], datasets, bses)

## resnet101 + resnet152

In [None]:
# test_memstat(['resnet101', 'resnet152'], datasets, bses)
# Submit one resnet152 run directly instead of the full sweep above. This cell
# does not poll for the result or write it to the CSV.
task = {'backbone': 'resnet152', 'dataset': 'dogsVsCats', 'batchsize': 64}
k12ai_train_execute('k12cv', 'cls', 'base_model', **task)

## vgg16_bn + vgg19_bn

In [None]:
# Sweep all datasets and batch sizes for vgg16_bn and vgg19_bn; combinations
# already present in memstat_file are skipped by tasks_generator.
test_memstat(['vgg16_bn', 'vgg19_bn'], datasets, bses)

In [3]:
# Load the recorded statistics, index them by run id, and show the first five rows.
memstat_df = pd.read_csv(os.path.join('memstat', '0307.csv'))
memstat_df = memstat_df.set_index('id')
memstat_df[:5]

Unnamed: 0_level_0,model,dataset,batchsize,uptime,app_cpu_memory_usage_MB,app_gpu_memory_usage_MB,sys_cpu_memory_free_MB,sys_gpu_memory_free_MB,peak_cpu_self_ru_maxrss,peak_cpu_children_ru_maxrss,peak_gpu_0_memory_cached_MB,peak_gpu_0_memory_allocated_MB,peak_gpu_0_max_memory_cached_MB,peak_gpu_0_max_memory_allocated_MB
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
652833-0,vgg11,Animals,32,18,4765.378,3651.829,39252.395,16357.0,2771.023,1994.355,2840.0,2462.836,4360.0,3651.829
dcd514-0,vgg11,Animals,64,18,4831.48,3705.231,41698.023,19786.0,2794.621,2036.859,2534.0,2461.84,4142.0,3705.231
55d9e8-0,vgg11,Animals,16,23,4760.102,3705.356,36327.082,12230.0,2766.875,1993.227,2746.0,2462.211,4284.0,3705.356
fccbae-0,vgg11,Animals,128,21,4931.282,4472.964,38339.496,15565.0,2812.684,2118.598,2614.0,2464.748,5026.0,4472.964
81d4c2-0,vgg11,Boats,16,26,5230.586,3696.065,41709.566,19558.0,2766.758,2463.828,2762.0,2461.841,4258.0,3696.065
