# Combine Tensorboard Logs

This notebook extracts the logged TensorBoard scalar data from multiple runs of a model configuration, combines them into a single dataframe and writes that dataframe to a CSV file.

* offsets the step / episode counters of each run by the final counts of the preceding runs, so that both counters keep increasing continuously across the files of different runs
* can add a datetime variable with `add_date=True`
* can remove all non-eval rows of datasets to make statistics file more compact with `keep_only_eval_rows=True`



In [1]:
import tensorboard as tb
from tbparse import SummaryReader
import pandas as pd
import os
import time

In [2]:
def get_logs(run_name, base='../tensorboard_log/cs', pivot=True, extra_columns=None):
    """
    Open the tensorboard log of a single run.

    The log location is `base` joined with `run_name`; `pivot` and
    `extra_columns` are passed on unchanged to tbparse's SummaryReader,
    which is returned to the caller.
    """
    log_path = os.path.join(base, run_name)
    reader = SummaryReader(log_path, pivot=pivot, extra_columns=extra_columns)
    return reader

def parse_scalars(reader):
    """
    Parse the scalar dataframe of a SummaryReader-like object.

    * `wall_time` column (if present): a cell holding a list-like value is
      reduced to its first element; a cell that already holds a single
      number is kept unchanged.
    * column names: the substrings 'my-stats/', 'train/' and 'time/' are
      removed and 'eval/' is replaced by 'eval_'.

    The dataframe exposed as `reader.scalars` is not modified; a new
    dataframe is returned.
    """
    scalars = reader.scalars
    if 'wall_time' in scalars.columns:
        # NOTE(review): the original code assumed every cell is a list and
        # failed with a TypeError on plain numbers - handle both shapes.
        first_times = [
            cell[0] if pd.api.types.is_list_like(cell) else cell
            for cell in scalars['wall_time'].tolist()
        ]
        # assign() returns a new frame, so the reader's own data stays untouched
        scalars = scalars.assign(wall_time=first_times)
    renamed = {
        name: name.replace('my-stats/', '').replace('train/', '').replace('time/', '').replace('eval/', 'eval_')
        for name in scalars.columns.tolist()
    }
    return scalars.rename(columns=renamed)

def get_filenames(model, run_cfg, base='../tensorboard_log/cs', res=None):
    """
    Collect the run names in `base` that belong to `model` / `run_cfg`.

    Every entry of `base` that is not a regular file is split at '_' and
    checked with parts_match_filter (model = 1st part, run_cfg = 2nd part,
    optional intraday filter `res` = 4th part). Matching names are returned
    sorted ascending by the integer datestr in their 3rd part; equal
    datestrs are ordered by name.

    Names that do not match the filter, or whose 3rd part is not an
    integer, are left out instead of aborting the whole listing.
    """
    dated_names = []
    for filename in os.listdir(base):
        if os.path.isfile(os.path.join(base, filename)):
            continue
        fn_parts = filename.split('_')
        if not parts_match_filter(fn_parts, model, run_cfg, res):
            continue
        try:
            timestamp = int(fn_parts[2])
        except ValueError:
            # not a valid datestr -> treat as an invalid name and skip it
            continue
        dated_names.append((timestamp, filename))
    return [name for _, name in sorted(dated_names)]

def parts_match_filter(parts, model, run_cfg, res=None):
    """
    Checks whether the '_'-separated parts of a name match the filter.

    Returns True only if there are at least three parts, the 1st part equals
    `model` and the 2nd part equals `run_cfg`. If `res` is given (truthy),
    a 4th part must also exist and equal `res`; names without a 4th part
    do not match instead of raising an IndexError.
    """
    if len(parts) < 3:
        return False
    if parts[0] != model or parts[1] != run_cfg:
        return False
    if res:
        # the resolution lives in the 4th part, which may be missing
        return len(parts) > 3 and parts[3] == res
    return True

def fix_counters(dataframes):
    """
    Adds max. step / episode from previous df to next to have continuous values

    The dataframes are processed in order and modified in place: the running
    offsets are added to the 'step' and 'episode' columns, then both columns
    are converted to the nullable 'Int64' dtype. The offsets for the next
    dataframe are the column maxima of the current one (missing values are
    ignored). A dataframe that is empty, or whose column holds no value at
    all, leaves the respective offset unchanged.

    Returns the same list of dataframes.
    """
    step_offset = 0
    episode_offset = 0
    print(" - fixing counters of individual datasets")
    for d in dataframes:
        d['step'] += step_offset
        d['episode'] += episode_offset
        # max() skips missing values, so a NaN in the last row can no longer
        # turn the counters of all following dataframes into NaN
        last_step = d['step'].max()
        last_episode = d['episode'].max()
        if pd.notna(last_step):
            step_offset = int(last_step)
        if pd.notna(last_episode):
            episode_offset = int(last_episode)
        d['step'] = d['step'].astype('Int64')
        d['episode'] = d['episode'].astype('Int64')
    return dataframes

def load_dataframes(filenames, base):
    """
    reads every run listed in `filenames` (located below `base`) and returns
    the parsed scalar dataframes as a list, in the same order.
    the logs are opened with the additional 'wall_time' column.
    """
    frames = []
    for run_name in filenames:
        print(f" - reading {run_name}")
        summary = get_logs(run_name, base=base, extra_columns={'wall_time'})
        frames.append(parse_scalars(summary))
    return frames

def merge_data(dataframes, add_date_column=False, keep_only_eval_rows=True):
    """
    Concatenates the dataframes into one dataframe with a fresh 0..n-1 index.

    * 'wall_time' is converted to integers
    * add_date_column=True adds a 'date' column derived from 'wall_time'
      (interpreted as seconds)
    * an existing 'fps' column is forward-filled
    * keep_only_eval_rows=True keeps only the rows that have a value in
      'eval_episode_rewards'
    """
    print(" - merging dataframes")
    merged = pd.concat(dataframes, ignore_index=True)
    merged['wall_time'] = merged['wall_time'].astype('int')

    if add_date_column:
        timestamps = pd.to_datetime(merged['wall_time'], unit='s')
        merged['date'] = timestamps.astype('datetime64[s]')

    if "fps" in merged.columns:
        merged['fps'] = merged['fps'].ffill()

    if not keep_only_eval_rows:
        return merged

    has_eval = merged['eval_episode_rewards'].notna()
    return merged[has_eval]

def combine_tensorlogs(model, cfg, base_path='../tensorboard_log/cs', target_path='../logs_parsed',
                       add_date=False, keep_only_eval_rows=True):
    """
    Combines all tensorboard logs of `model` / `cfg` found in `base_path`
    into one dataframe and writes it to `{target_path}/{model}_{cfg}.csv`.

    Steps: collect the matching run names, load their scalar dataframes,
    make the step / episode counters continuous, merge everything
    (`add_date` and `keep_only_eval_rows` are passed on to merge_data).
    `target_path` is created if it does not exist yet.

    Returns the combined dataframe, or None if no matching run was found.
    """
    print(f"Combine Logs for {model}_{cfg}")
    start = time.time()
    filenames = get_filenames(model, cfg, base=base_path)

    if not filenames:
        print("No files found - skipping")
        return None
    print(f"{len(filenames)} files found")

    frames = load_dataframes(filenames, base=base_path)
    frames = fix_counters(frames)
    combined = merge_data(frames, add_date_column=add_date, keep_only_eval_rows=keep_only_eval_rows)

    # make sure the output folder exists, otherwise to_csv fails after all the parsing work
    if target_path:
        os.makedirs(target_path, exist_ok=True)
    combined.to_csv(f"{target_path}/{model}_{cfg}.csv")
    print(f"Combine Logs for {model}_{cfg} done in {(time.time() - start):.1f}s")

    return combined

def combine_tensorlogs_intraday(model, cfg, res, base_path='../tensorboard_log/cs', target_path='../logs_parsed',
                       add_date=False, keep_only_eval_rows=True):
    """
    Intraday variant of the log combination: combines all tensorboard logs of
    `model` / `cfg` with resolution `res` found in `base_path` into one
    dataframe and writes it to `{target_path}/{model}_{cfg}_{res}.csv`.

    Steps: collect the matching run names (filtered by `res` as well), load
    their scalar dataframes, make the step / episode counters continuous,
    merge everything (`add_date` and `keep_only_eval_rows` are passed on to
    merge_data). `target_path` is created if it does not exist yet.

    Returns the combined dataframe, or None if no matching run was found.
    """
    print(f"Combine Logs for {model}_{cfg}_{res}")
    start = time.time()
    filenames = get_filenames(model, cfg, base=base_path, res=res)

    if not filenames:
        print("No files found - skipping")
        return None
    print(f"{len(filenames)} files found")

    frames = load_dataframes(filenames, base=base_path)
    frames = fix_counters(frames)
    combined = merge_data(frames, add_date_column=add_date, keep_only_eval_rows=keep_only_eval_rows)

    # make sure the output folder exists, otherwise to_csv fails after all the parsing work
    if target_path:
        os.makedirs(target_path, exist_ok=True)
    combined.to_csv(f"{target_path}/{model}_{cfg}_{res}.csv")
    # include the resolution so the message matches the opening line and the file name
    print(f"Combine Logs for {model}_{cfg}_{res} done in {(time.time() - start):.1f}s")

    return combined

# A2C Tensorboard Logs

In [None]:
# Combine the daily A2C runs, one CSV per configuration.
cfgs = ['V213', 'V219', 'V221']
for cfg in cfgs:
    df = combine_tensorlogs(
        "A2C",
        cfg,
        base_path='../tensorboard_log/_old_A2C_v2xx',
        add_date=True,
        keep_only_eval_rows=True,
    )

In [None]:
# Combine the intraday A2C runs, one CSV per (configuration, resolution) pair.
cfgs = ['V221']
reslist = ["1H", "6H", "12H"]
for cfg in cfgs:
    for res in reslist:
        df = combine_tensorlogs_intraday(
            "A2C",
            cfg,
            res,
            base_path='../tensorboard_log/intraday_A2C',
            add_date=True,
            keep_only_eval_rows=True,
        )

# PPO Tensorboard Logs

In [None]:
# Combine the daily PPO runs, one CSV per configuration.
cfgs = ['V205', 'V207', 'V208']
for cfg in cfgs:
    df = combine_tensorlogs(
        "PPO",
        cfg,
        base_path='../tensorboard_log/_old_PPO_v2xx',
        add_date=True,
        keep_only_eval_rows=True,
    )

In [None]:
# Combine the intraday PPO runs, one CSV per (configuration, resolution) pair.
cfgs = ['V208']
reslist = ["1H", "6H", "12H"]
for cfg in cfgs:
    for res in reslist:
        df = combine_tensorlogs_intraday(
            "PPO",
            cfg,
            res,
            base_path='../tensorboard_log/intraday_PPO',
            add_date=True,
            keep_only_eval_rows=True,
        )

# TD3 Tensorboard Logs

In [None]:
# Combine the daily TD3 runs, one CSV per configuration.
cfgs = ['V203', 'V206', 'V213', 'V214']
for cfg in cfgs:
    df = combine_tensorlogs(
        "TD3",
        cfg,
        base_path='../tensorboard_log/_old_TD3_v2xx',
        add_date=True,
        keep_only_eval_rows=True,
    )

In [None]:
# Combine the intraday TD3 runs, one CSV per (configuration, resolution) pair.
cfgs = ['V214']
reslist = ["1H", "6H", "12H"]
for cfg in cfgs:
    for res in reslist:
        df = combine_tensorlogs_intraday(
            "TD3",
            cfg,
            res,
            base_path='../tensorboard_log/intraday_TD3',
            add_date=True,
            keep_only_eval_rows=True,
        )