In [None]:
import ast
import pandas as pd

def convert_to_list(s):
    """Parse ``s`` as a Python literal and return the resulting object.

    Used in this notebook as a ``pd.read_csv`` converter for columns whose
    cells hold a literal written out as text.

    Args:
        s: Text containing a Python literal (e.g. ``'[1, 2, 3]'``).

    Returns:
        The object produced by ``ast.literal_eval(s)``.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when
            ``s`` is not a valid literal.
    """
    parsed = ast.literal_eval(s)
    return parsed

# Load the construction results; the 'num_iter_bins' column is passed through
# convert_to_list while the CSV is being read.
df = pd.read_csv(
    '../results/output_construct.csv',
    converters={'num_iter_bins': convert_to_list},
)

# Per-character ratios. They are derived before the memory columns are
# rescaled below, so they use the values exactly as read from the CSV.
df['bytes_per_character'] = df['memory_usage_cdawg'].div(df['memory_usage_text'])
df['vertices_per_character'] = df['num_vertices'].div(df['text_length'])
df['edges_per_character'] = df['num_edges'].div(df['text_length'])

# Rescale the two memory columns by a factor of 1024 * 1024.
df['memory_usage_cdawg'] = df['memory_usage_cdawg'].div(1024.0 * 1024.0)
df['memory_usage_ma'] = df['memory_usage_ma'].div(1024.0 * 1024.0)

# Last expression of the cell: render the resulting frame.
df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FuncFormatter
import numpy as np


# File names plotted in this notebook and the powers of two 2^3 .. 2^27;
# both lists are also used by the histogram cell further down.
filenames = ['sources', 'dna', 'english', 'fib']
lengths = [1 << x for x in range(3, 28)]


# One figure per metric: the metric against text_length, one line per file.
for label in ['num_vertices', 'num_edges', 'edges_per_character', 'bytes_per_character', 'elapsed_time_lz78']:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.lineplot(data=df, x='text_length', y=label, hue='filename', ax=ax)

    if label.endswith('per_character'):
        # Per-character ratios get a base-2 log x-axis with one tick per
        # distinct text length, rendered as 2^k.
        ax.set_xscale('log', base=2)
        ax.set_xticks(np.sort(df['text_length'].unique()))
        ax.xaxis.set_major_formatter(
            FuncFormatter(lambda x, _: f"$2^{{{int(round(np.log2(x)))}}}$")
        )

    # Pick the unit suffix with a single if/elif chain. The previous version
    # ended with an unchained `if ... else: ax.set_title(label)`, which
    # overwrote any unit-bearing title set earlier for the same label.
    if label.startswith('memory_usage'):
        # NOTE(review): the loading cell divides the memory_usage_cdawg and
        # memory_usage_ma columns by 1024 * 1024, so 'kB' may not be the
        # right unit here -- confirm against the units stored in the CSV.
        unit = ' [kB]'
    elif label.startswith('bytes_per'):
        # Only the bytes_per_character ratio is labelled in bytes; the other
        # *_per_character metric plotted here counts edges.
        unit = ' [bytes]'
    elif label.startswith('elapsed_time_lz78'):
        unit = ' [ms]'
    else:
        unit = ''
    ax.set_title(label + unit)

    ax.legend()


In [None]:
# Suffix-tree results; the 'elapsed_time_lz78' column is passed through
# convert_to_list while the CSV is being read.
df_st = pd.read_csv(
    '../results/output_compress_suffixtree.csv',
    converters={'elapsed_time_lz78': convert_to_list},
)

# Mean 'memory_usage_st' per file name, divided by 1024 twice.
df_st_memory = (
    df_st.groupby('filename')['memory_usage_st']
    .mean()
    .div(1024.0)
    .div(1024.0)
)

# Build a per-file table from the rows with the largest text length (2^27):
# drop the columns that are not needed, index by file name, and attach the
# suffix-tree memory figures (aligned on the file-name index).
df_max = (
    df.loc[df['text_length'] == (1 << 27)]
    .drop(columns=['memory_usage_ma', 'memory_usage_text', 'num_iter_bins'])
    .set_index('filename')
    .assign(memory_usage_st=df_st_memory)
)
df_max

In [None]:
# Index by (filename, text_length) so the histogram stored in
# 'num_iter_bins' can be looked up per file for a given length.
df2 = df.set_index(['filename', 'text_length'])

plt.rcParams["font.size"] = 12
fig, ax = plt.subplots(figsize=(12, 3))

colors = {'sources': 'tab:orange', 'dna': 'tab:green', 'english': 'tab:blue', 'fib': 'tab:red'}

# Bar opacity; lowered by 0.1 after every file so overlapping bars remain visible.
alpha = 0.7

for filename in filenames:
    # Histogram for the largest text length. Named `bins` rather than `bin`
    # so the builtin bin() is not shadowed for the rest of the kernel session.
    bins = df2.loc[(filename, lengths[-1]), 'num_iter_bins']
    n = sum(bins)      # total over all bins, taken before truncation
    bins = bins[:41]   # only the first 41 bins are drawn

    # Plot the distribution of the number of edges on the path representing T[i, n]
    ax.bar(range(len(bins)), bins, alpha=alpha, label=filename, color=colors[filename])
    alpha -= 0.1

    # Axis limits and y ticks are re-applied for every file, so the values
    # from the last file are the ones that remain on the figure.
    # NOTE(review): presumably n is the same for every file -- confirm.
    ax.set_ylim(0, n)
    ax.set_xlim(0, len(bins))
    # Ten evenly spaced y ticks labelled as fractions of n; the max() guard
    # avoids a zero step (ValueError from range) when n < 10.
    y_ticks = range(0, n + 1, max(1, n // 10))
    ax.set_yticks(y_ticks)
    ax.set_yticklabels([f'{x / n:.1f}n' for x in y_ticks])

    # Add the cumulative sum of the plotted bins as a line in the same color
    cumsum = np.cumsum(bins)
    ax.plot(range(len(bins)), cumsum, color=colors[filename], alpha=0.8)

    # Weighted mean bin index over the plotted (truncated) bins.
    # NOTE(review): floor division rounds the mean down to a whole bin index,
    # and the tail beyond bin 40 is excluded -- confirm both are intended.
    average = sum(index * count for index, count in enumerate(bins)) // sum(bins)

    # Plot vertical line at the average
    ax.axvline(x=average, linestyle='--', color=colors[filename], alpha=0.8)

# Labels do not depend on the file, so they are set once after the loop.
ax.set_xlabel('number of edges on the path representing T[i, n]')
ax.set_ylabel('number of paths')
ax.legend()
fig.savefig('../results/num_iter_bins.pdf', bbox_inches='tight')
plt.show()
