In [None]:
import os
from collections import Counter
from pathlib import Path
from typing import List, Tuple

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import textdistance
from paragraph2actions.action_string_converter import ReadableConverter
from smiles2actions.utils import load_list_from_file, colorblind_color_palette

# Analysis of action lengths

Notebook for:
* Creating plots for distribution of action lengths
* Calculating the single-action accuracy

### Useful functions

In [None]:
# Shared converter used below to parse action strings; actions are
# separated by ' ; ' and no end mark is used.
converter = ReadableConverter(end_mark='', separator=' ; ')

In [None]:
def action_length(actions_str: str) -> int:
    """Count the actions contained in a single action string.

    The string is parsed with the notebook-level ``converter`` and the
    number of resulting actions is returned.
    """
    parsed_actions = converter.string_to_actions(actions_str)
    return len(parsed_actions)

In [None]:
def action_lengths(actions: List[str]) -> List[int]:
    """Get the number of actions for multiple action strings.

    Conversion is best effort: strings for which ``action_length`` raises
    are skipped, so the returned list may be shorter than ``actions``.
    A ``UserWarning`` reports how many strings were skipped, so that
    dropped entries do not go unnoticed.

    Args:
        actions: action strings to measure.

    Returns:
        The number of actions of every string that could be converted, in
        input order.
    """
    import warnings

    lengths = []
    n_skipped = 0
    for actions_str in actions:
        try:
            lengths.append(action_length(actions_str))
        except Exception:
            # Keep going, but remember that something was dropped.
            n_skipped += 1

    if n_skipped:
        n_total = len(lengths) + n_skipped
        warnings.warn(
            f'action_lengths: skipped {n_skipped} of {n_total} action strings '
            'that could not be converted',
            stacklevel=2,
        )
    return lengths

### File location

In [None]:
# Data directory, read from the environment (raises KeyError if the
# variable is not set).
s2a_dir = Path(os.environ['S2A_PAPER_DATA_DIR'])
# String paths to the source and target files inside that directory.
src_file, tgt_file = (
    str(s2a_dir / file_name) for file_name in ('src-test.txt', 'tgt-test.txt')
)

In [None]:
# String paths to the prediction files of the three models.
transformer_file, bart_file, nn_file = (
    str(s2a_dir / file_name)
    for file_name in ('transformer_test.txt', 'bart_test.txt', 'nn_test.txt')
)

### Distribution of action lengths for the different models

In [None]:
# (file path, legend label) pairs; the order here fixes the order of the
# series in the plots below.
models = list(zip(
    (tgt_file, nn_file, bart_file, transformer_file),
    ('Ground truth', 'Nearest-neighbor model', 'BART model', 'transformer model'),
))

In [None]:
# Split the (path, label) pairs into two parallel lists.
model_names = [label for _, label in models]
model_paths = [path for path, _ in models]

In [None]:
# One list of loaded entries per model file, in the same order as `model_paths`.
data = list(map(load_list_from_file, model_paths))

In [None]:
# Number of actions of every convertible action string, per model.
prediction_lengths = list(map(action_lengths, data))

In [None]:
# How many action strings were successfully measured for each model.
print(list(map(len, prediction_lengths)))

In [None]:
# Sequence lengths shown on the x axis: every integer from 5 to 25 inclusive.
shortest_shown, longest_shown = 5, 25
ind = np.arange(shortest_shown, longest_shown + 1)

In [None]:
# Histogram of the number of actions per sequence, one bar series per model.
#
# Use one bin per integer in `ind`, centred on that integer. Passing `ind`
# itself as bin edges yields one bin fewer than there are tick labels, and
# because the last histogram bin is closed on both sides, the two largest
# lengths in `ind` would be merged into a single bar.
bin_edges = np.arange(ind[0], ind[-1] + 2) - 0.5

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 5))
ax.hist(
    prediction_lengths,
    bin_edges,
    label=model_names,
    color=colorblind_color_palette(len(prediction_lengths))
)
ax.legend(loc='upper right')
ax.set_xlabel('Number of actions')
ax.set_ylabel('Count')
# Bins are centred on the integers, so the ticks sit on the integers too.
ax.set_xticks(ind)
ax.set_xticklabels(ind)
ax.margins(x=0.02)
fig.tight_layout()
fig.savefig('/tmp/sequence_length.pdf')

## Accuracy histogram

Plot the histogram of action sequence lengths for the predictions that reach different accuracy thresholds.

In [None]:
# One (threshold, bucket) pair per similarity threshold; each bucket starts
# empty and is its own list object.
lengths_for_accuracies: List[Tuple[float, List[int]]] = [
    (threshold, []) for threshold in (1.0, 0.75, 0.5)
]

In [None]:
# We are interested in the ground truth and in the transformer model
# We are interested in the ground truth and in the transformer model.
# Look the positions up by label instead of hard-coding them, so that
# reordering or extending `models` keeps selecting the right series.
# `list.index` raises ValueError if a label is missing.
gt_index = model_names.index('Ground truth')
transformer_index = model_names.index('transformer model')

In [None]:
# Bucket the transformer predictions by their normalized Levenshtein
# similarity to the ground truth.
#
# Empty the buckets first (in place, so existing references stay valid):
# without this, re-running the cell appends every prediction a second time.
for _, bucket in lengths_for_accuracies:
    bucket.clear()

for gt, pred in zip(data[gt_index], data[transformer_index]):
    n_actions = action_length(pred)
    similarity = textdistance.levenshtein.normalized_similarity(gt, pred)
    # Thresholds are cumulative: the length goes into every bucket whose
    # threshold the similarity reaches.
    for key, value in lengths_for_accuracies:
        if similarity >= key:
            value.append(n_actions)

In [None]:
# Report how many predictions reached each similarity threshold.
print('Number of action sequences fulfilling thresholds:')
for threshold, matching_lengths in lengths_for_accuracies:
    n_matching = len(matching_lengths)
    print(threshold, n_matching)

In [None]:
# Keep only the length buckets, dropping the thresholds.
accuracies_lengths = [bucket for _, bucket in lengths_for_accuracies]

In [None]:
# Series to plot: all transformer predictions, then one series per
# similarity threshold, then the ground truth.
#
# Build the list from scratch instead of inserting into / appending to the
# existing one: in-place edits would add the first and last series again each
# time this cell is re-run, and the series would no longer match the labels.
accuracies_lengths = [
    prediction_lengths[transformer_index],
    *(bucket for _, bucket in lengths_for_accuracies),
    prediction_lengths[gt_index],
]
# Labels in the same order as the series above.
accuracies_labels = [
    'All predictions',
    *(
        f'Predictions with {int(threshold * 100)}% accuracy'
        for threshold, _ in lengths_for_accuracies
    ),
    'Ground truth',
]

In [None]:
# Quick look at the first five lengths of the second series.
second_series_preview = accuracies_lengths[1][:5]
print(second_series_preview)

In [None]:
# Sequence lengths shown on the x axis: every integer from 5 to 25 inclusive.
shortest_shown, longest_shown = 5, 25
ind = np.arange(shortest_shown, longest_shown + 1)

In [None]:
# Normalized histogram of the number of actions, one bar series per entry of
# `accuracies_lengths`.
#
# Use one bin per integer in `ind`, centred on that integer. Passing `ind`
# itself as bin edges yields one bin fewer than there are tick labels, and
# because the last histogram bin is closed on both sides, the two largest
# lengths in `ind` would be merged into a single bar.
bin_edges = np.arange(ind[0], ind[-1] + 2) - 0.5

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 5))
ax.hist(
    accuracies_lengths,
    bin_edges,
    label=accuracies_labels,
    density=True,
    color=colorblind_color_palette(len(accuracies_lengths))
)
ax.legend(loc='upper right')
ax.set_xlabel('Number of actions')
ax.set_ylabel('Frequency')
# Bins are centred on the integers, so the ticks sit on the integers too.
ax.set_xticks(ind)
ax.set_xticklabels(ind)
ax.margins(x=0.02)
fig.tight_layout()
fig.savefig('/tmp/accuracy_histogram.pdf')

## Single action accuracy

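Starting from the full-sequence accuracy, one can back-calculate the single-action accuracy from a probabilistic perspective (for illustrative purposes). If every action is predicted correctly with the same probability $p$, independently of the others, a sequence of $n$ actions is fully correct with probability $p^n$. With $N_n$ the number of ground-truth sequences of length $n$ and $N_\text{correct}$ the number of fully correct predictions, $p$ is then a root of $\sum_n N_n p^n - N_\text{correct} = 0$.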

In [None]:
# Polynomial solving: collect the quantities the polynomial is built from.
# Use `gt_index` rather than a literal 0, consistent with the cells above.
gt_lengths = prediction_lengths[gt_index]
gt_number = len(data[gt_index])
# Number of ground-truth sequences for each sequence length.
cntr = Counter(gt_lengths)
max_idx = max(cntr.keys())
# The first entry of `lengths_for_accuracies` is the 1.0 threshold bucket,
# so its size is the number of predictions with similarity 1.0.
correct_preds = len(lengths_for_accuracies[0][1])
correct_preds_freq = correct_preds / gt_number

In [None]:
# Exponents in decreasing order, as np.poly1d expects the coefficients:
# [max_idx, max_idx-1, ..., 1, 0]
indices = list(range(max_idx, -1, -1))
# Coefficient of x**n is the number of ground-truth sequences of length n
# (a Counter returns 0 for lengths that do not occur).
coefficients = [cntr[idx] for idx in indices]
# Bring the number of correct predictions to the left-hand side of the
# equation. Subtract instead of overwriting, so that a non-zero count of
# zero-length sequences (cntr[0]) is not discarded from the constant term.
coefficients[-1] -= correct_preds

In [None]:
# Polynomial with the given coefficients (highest power first), then its
# printed form.
p = np.poly1d(coefficients)
print(p)

In [None]:
# All roots of the polynomial.
roots = p.r
print(roots)

# The root of interest is the real, positive one. Select it explicitly
# instead of relying on its position in the array of roots.
# NOTE(review): with non-negative coefficients and a negative constant term
# there should be exactly one such root (Descartes' rule of signs) - confirm
# that a single value is printed.
is_real_positive = np.isclose(roots.imag, 0) & (roots.real > 0)
print('Single-action accuracy:', roots[is_real_positive].real)