In [None]:
import ast
import pandas as pd

def convert_to_list(s):
    """Evaluate the string ``s`` as a Python literal and return the result.

    Parsing is delegated to ``ast.literal_eval``, so only literal syntax
    (lists, numbers, strings, ...) is accepted; anything else raises
    ``ValueError`` or ``SyntaxError``.
    """
    parsed = ast.literal_eval(s)
    return parsed

# Load the construction results; the 'num_iter_bins' column is parsed with convert_to_list.
df = pd.read_csv(
    '../results/output_construct.csv',
    converters={'num_iter_bins': convert_to_list},
)

# Derived ratios. These are computed before the memory columns are rescaled below,
# so they use the values exactly as stored in the CSV.
df['bytes_per_character'] = df['memory_usage_cdawg'].div(df['memory_usage_text'])
df['edges_per_character'] = df['num_edges'].div(df['text_length'])

# Rescale both memory columns by 1024 * 1024
# (presumably bytes -> MiB; confirm against whatever writes the CSV).
for column in ('memory_usage_cdawg', 'memory_usage_ma'):
    df[column] = df[column] / (1024.0 * 1024.0)
df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FuncFormatter
import numpy as np


filenames = ['sources', 'dna', 'english']
lengths = [1 << x for x in range(3, 27)]  # 2^3, 2^4, ..., 2^26


# One line plot per metric: text_length on the x-axis, one line per filename.
for label in ['num_vertices', 'num_edges', 'edges_per_character', 'bytes_per_character', 'elapsed_time_lz78']:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.lineplot(data=df, x='text_length', y=label, hue='filename', ax=ax)

    # Per-character ratios are drawn on a log2 x-axis whose ticks sit at the
    # measured text lengths and are rendered as powers of two.
    if label.endswith('per_character'):
        ax.set_xscale('log', base=2)
        ax.set_xticks(sorted(df['text_length'].unique()))
        ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"$2^{{{int(np.log2(x))}}}$"))

    # Choose the title exactly once. The previous version ended with an
    # unconditional if/else that overwrote every unit suffix set earlier.
    if label.startswith('memory_usage'):
        # NOTE(review): memory_usage_cdawg / memory_usage_ma are divided by
        # 1024 * 1024 when the data is loaded, so '[kB]' may not be the right
        # unit -- confirm before adding a memory_usage_* label to the list above.
        title = label + ' [kB]'
    elif label == 'bytes_per_character':
        # Only this ratio is measured in bytes; edges_per_character is a count ratio.
        title = label + ' [bytes]'
    elif label.startswith('elapsed_time_lz78'):
        title = label + ' [ms]'
    else:
        title = label
    ax.set_title(title)

    ax.legend()


In [None]:
# Keep, for each file name, the row(s) measured at that file's largest text length.
# The length is taken from the data instead of being hard-coded: a fixed 1 << 27
# selects nothing when the measurements stop at a smaller size (the `lengths`
# list used by the plotting cells ends at 1 << 26).
largest_length = df.groupby('filename')['text_length'].transform('max')
df_max = df[df['text_length'] == largest_length].drop(
    columns=['memory_usage_ma', 'memory_usage_text', 'num_iter_bins']
)
df_max

In [None]:
df2 = df.set_index(['filename', 'text_length'])

# One figure per file: a bar chart of the first 101 histogram bins plus the
# cumulative curve on a secondary axis labelled in percent of the full total.
for filename in filenames:
    fig, ax = plt.subplots(figsize=(12, 4))

    # Histogram recorded for the longest text length in `lengths`.
    all_bins = df2.loc[(filename, lengths[-1]), 'num_iter_bins']
    n = sum(all_bins)          # total over the *whole* histogram, not just the shown part
    counts = all_bins[:101]    # only bins 0..100 are drawn
    positions = range(len(counts))
    y_max = max(n, 1)          # avoids a degenerate (0, 0) axis and division by zero

    # Distribution of the number of edges on the path representing T[i, n].
    ax.bar(positions, counts)
    x_ticks = list(range(0, len(counts), 10))
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticks)
    ax.set_ylim(0, y_max)
    ax.set_xlabel('number of edges on the path representing T[i, n]')
    ax.set_ylabel('number of paths')

    # Cumulative count of the shown bins, on a twin axis sharing the same scale.
    ax2 = ax.twinx()
    ax2.plot(positions, np.cumsum(counts), color='red')
    ax2.set_ylim(0, y_max)
    # Eleven evenly spaced ticks give exact 0%, 10%, ..., 100% labels for any n;
    # range(0, n + 1, n // 10) raised for n < 10 and drifted when n % 10 != 0.
    percent_ticks = np.linspace(0, y_max, 11)
    ax2.set_yticks(percent_ticks)
    ax2.set_yticklabels([f'{tick / y_max:.0%}' for tick in percent_ticks])
    ax2.set_ylabel('cumulative percentage')

    ax.set_title(f'distribution of the number of edges on the path representing T[i, n] ({filename})')

    fig.savefig(f'../results/num_iter_bins_{filename}.pdf')
    plt.show()


In [None]:
df2 = df.set_index(['filename', 'text_length'])
fig, ax = plt.subplots(figsize=(12, 4))
ax2 = ax.twinx()

colors = {'sources': 'tab:orange', 'dna': 'tab:green', 'english': 'tab:blue'}

# All files share one figure: bars (absolute counts) on the left axis,
# cumulative curves (fraction of each file's own total) on the right axis.
largest_total = 0
widest = 0
for position, filename in enumerate(filenames):
    # Histogram recorded for the longest text length in `lengths`.
    all_bins = df2.loc[(filename, lengths[-1]), 'num_iter_bins']
    n = sum(all_bins)          # total over the whole histogram
    counts = all_bins[:101]    # only bins 0..100 are drawn
    positions = range(len(counts))
    largest_total = max(largest_total, n)
    widest = max(widest, len(counts))

    # Distribution of the number of edges on the path representing T[i, n].
    # Each successive file is drawn slightly more transparent (0.7, 0.6, 0.5, ...).
    ax.bar(positions, counts, alpha=0.7 - 0.1 * position, label=filename, color=colors[filename])

    # Cumulative curve normalised by this file's own total, so the percentage
    # axis is correct for every file and not only for the last one plotted.
    if n > 0:
        ax2.plot(positions, np.cumsum(counts) / n, color=colors[filename], alpha=0.8)

    # Dashed vertical line at the floored mean bin index of the shown bins.
    shown_total = sum(counts)
    if shown_total > 0:
        average = sum(index * count for index, count in enumerate(counts)) // shown_total
        ax.axvline(x=average, linestyle='--', color=colors[filename], alpha=0.8)

# Axis decoration is independent of the individual file, so it is done once.
x_ticks = list(range(0, widest, 10))
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_ticks)
ax.set_ylim(0, max(largest_total, 1))
ax.set_xlabel('number of edges on the path representing T[i, n]')
ax.set_ylabel('number of paths')

fraction_ticks = np.linspace(0, 1, 11)
ax2.set_ylim(0, 1)
ax2.set_yticks(fraction_ticks)
ax2.set_yticklabels([f'{tick:.0%}' for tick in fraction_ticks])
ax2.set_ylabel('percentage')

ax.legend()
fig.savefig('../results/num_iter_bins.pdf')
plt.show()
