In [None]:
import ast
import pandas as pd

def convert_to_list(s):
    """Parse the text of a Python literal (e.g. ``"[1, 2, 3]"``) into its value.

    Passed as a ``converters=`` callback to ``pd.read_csv`` so that cells
    holding a serialized list are returned as real Python lists.

    Args:
        s: String containing a Python literal.

    Returns:
        The value produced by ``ast.literal_eval(s)``.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when
            ``s`` is not a valid literal.
    """
    parsed = ast.literal_eval(s)
    return parsed

# Load the CDAWG compression results. The two list-valued columns are stored
# as text in the CSV and are parsed back into Python lists while reading.
df = pd.read_csv(
    '../results/output_compress_cdawg.csv',
    converters={
        'memory_usage_ma': convert_to_list,
        'elapsed_time_lz78': convert_to_list,
    },
)

# Per-row trimmed mean of the repeated measurements: one maximum and one
# minimum sample are removed before averaging (hence the ``len - 2``
# denominator), then the result is scaled by 1024 twice.
# NOTE(review): presumably bytes -> MiB, matching the 'memory usage (MiB)'
# axis label used when this column is plotted -- confirm the raw unit.
df['memory_usage_ma_average'] = (
    df['memory_usage_ma'].apply(
        lambda samples: (sum(samples) - max(samples) - min(samples)) / (len(samples) - 2)
    )
    / 1024.0
    / 1024.0
)

# Same trimmed mean for the timing samples, scaled by 1e6.
# NOTE(review): presumably microseconds -> seconds, matching the
# 'elapsed time (s)' axis label used when plotting -- confirm the raw unit.
df['elapsed_time_lz78_average'] = (
    df['elapsed_time_lz78'].apply(
        lambda samples: (sum(samples) - max(samples) - min(samples)) / (len(samples) - 2)
    )
    / 1e6
)

# Last expression of the cell: display the resulting frame.
df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FuncFormatter
import numpy as np

# Dataset names and the line colour used for each one in the plots.
# NOTE(review): `filenames` and `lengths` are not referenced in this cell;
# they are kept because other cells may rely on them.
filenames = ['sources', 'dna', 'english',  'fib']
colors = { 'sources': 'tab:orange', 'dna': 'tab:green', 'english': 'tab:blue', 'fib': 'tab:red'}
lengths = [1 << x for x in range(3, 27)]  # powers of two: 2**3 .. 2**26

# Construction benchmark results; `num_iter_bins` is stored as text and parsed
# back into a Python list while reading.
df_construct = pd.read_csv('../results/output_construct.csv', converters={'num_iter_bins': convert_to_list})

# Per-file CDAWG memory usage, taken from the rows with the largest text
# length, averaged per filename and scaled by 1024 twice (plotted as MiB).
memory_usage_cdawg = df_construct[df_construct.text_length == df_construct.text_length.max()].groupby('filename')['memory_usage_cdawg'].mean() / 1024.0 / 1024.0

df_g = df.groupby('filename')

fig, ax = plt.subplots(figsize=(8, 4))
# Log-log axes. The scale is a property of the axes, not of an individual
# curve, so it is configured once here instead of on every loop iteration.
ax.set_xscale('log', base=10)
ax.set_yscale('log', base=10)
for filename, sub_df in df_g:
    # Dotted horizontal reference line: CDAWG memory usage for this file.
    ax.axhline(y=memory_usage_cdawg[filename], linestyle=':', color=colors[filename])
    # Solid curve: trimmed-mean memory usage against substring length.
    ax.plot(sub_df['substr_length'], sub_df['memory_usage_ma_average'], label=filename, color=colors[filename])
ax.set_xlabel('substring length')
ax.set_ylabel('memory usage (MiB)')
ax.legend()
# Plain string literal: the path contains no placeholders, so no f-string is needed.
fig.savefig('../results/memory_usage_ma.pdf')
fig.show()


In [None]:
# Suffix-tree results; the list-valued timing column is parsed from its
# textual form while reading.
df_st = pd.read_csv(
    '../results/output_compress_suffixtree.csv',
    converters={'elapsed_time_lz78': convert_to_list},
)

# Trimmed mean of the timing samples (one maximum and one minimum removed),
# scaled by 1e6.
df_st['elapsed_time_lz78_average'] = (
    df_st['elapsed_time_lz78'].apply(
        lambda samples: (sum(samples) - max(samples) - min(samples)) / (len(samples) - 2)
    )
    / 1e6
)

# Remove the columns that are not needed after the merge below.
df_st = df_st.drop(columns=['elapsed_time_lz78', 'num_iter', 'text_length', 'memory_usage_st'])

# Join on (filename, substr_length). Column names present in both frames keep
# their name for `df` and receive the '_st' suffix for the suffix-tree frame.
df_m = df.merge(df_st, on=['filename', 'substr_length'], suffixes=('', '_st'))
df_g = df_m.groupby('filename')

# One log-log figure per file comparing the two averaged timings.
for filename, sub_df in df_g:
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.set_xscale('log', base=10)
    ax.set_yscale('log', base=10)

    x_values = sub_df['substr_length']
    for column, label in (
        ('elapsed_time_lz78_average', 'CDAWG'),
        ('elapsed_time_lz78_average_st', 'ST'),
    ):
        ax.plot(x_values, sub_df[column], label=label)

    ax.set_title(filename)
    ax.set_xlabel('substring length')
    ax.set_ylabel('elapsed time (s)')
    ax.legend()
    fig.savefig(f'../results/elapsed_time_{filename}.pdf')
    fig.show()