# Plot images

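Notebook for plotting the figures: entity counts per dataset, logit scores of a BERT question-answering example, and cross-validation box plots.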

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#### helper functions

In [None]:
def add_value_labels(ax, horizontal=False, spacing=5):
    """Add a value label to the end of each bar in a bar chart.

    Every patch in ``ax.patches`` is annotated with its own value: the bar
    width for horizontal charts, the bar height for vertical charts.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        The axes whose bars are annotated. All annotations are drawn on this
        axes object, not on whatever axes happens to be current in pyplot.
    horizontal : bool
        If True, treat the bars as horizontal: labels go to the right of
        positive bars and to the left of negative bars. If False, labels go
        above positive bars and below negative bars.
    spacing : int
        The distance in points between the labels and the bars.
    """
    for rect in ax.patches:
        if horizontal:
            # The bar length is its width; anchor at the bar's vertical center.
            value = rect.get_width()
            anchor = (value, rect.get_y() + rect.get_height() / 2)
        else:
            # The bar length is its height; anchor at the bar's horizontal center.
            value = rect.get_height()
            anchor = (rect.get_x() + rect.get_width() / 2, value)

        # Negative bars grow in the opposite direction, so the label offset and
        # the alignment have to flip to keep the label outside the bar.
        negative = value < 0
        offset = -spacing if negative else spacing

        if horizontal:
            xytext = (offset, 0)
            ha = 'right' if negative else 'left'
            va = 'center'
        else:
            xytext = (0, offset)
            ha = 'center'
            va = 'top' if negative else 'bottom'

        ax.annotate(
            value,                       # The bar's value is used as label text
            anchor,                      # Place label at the end of the bar
            xytext=xytext,               # Shift label away from the bar
            textcoords="offset points",  # Interpret `xytext` as offset in points
            ha=ha,
            va=va)

## Plot count of the entities

### `expected` or `expected-na`

In [None]:
# Suffix that is inserted into the dataset directory and file names below.
na = "-na"

# Read the three JSON-lines splits in the order train, val, test.
train, val, test = (
    pd.read_json(path_or_buf=f"../data/expected{na}/expected{na}-{split}.jsonl", lines=True)
    for split in ("train", "val", "test")
)

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Horizontal bar chart of how often each value of the `question` column
# (called "entity" in the plot title) occurs in the training split.
ax = train.question.value_counts().plot(kind="barh", legend=True, figsize=(10, 6), width=0.7)

plt.rcParams.update({'axes.titlesize': 'x-large'})
# Spelling corrected to "Trainingsdatensatz", matching the crawl plots.
plt.title(f"Anzahl der Entitäten im Trainingsdatensatz von 'new-expected{na}'\n\n")
plt.xlim([0, 40000])
# Invert the y-axis so the first (most frequent) entry is drawn at the top.
plt.gca().invert_yaxis()
# Legend is centered above the plot area, inside the gap left by the title's trailing newlines.
plt.legend(["Entitäten mit Häufigkeit"], loc="upper center", bbox_to_anchor=(0.5, 1.08))
add_value_labels(ax, horizontal=True)
plt.savefig(f"../misc/img/new-expected{na}-entities-count.png", dpi=300, facecolor="white")
plt.show()

In [None]:
# Restrict the training split to rows whose first answer text is not "EMPTY".
new_train = train.copy()
# Element-wise map over the `answers` column; avoids building a row object
# per record as the row-wise `apply(axis=1)` did.
new_train["answer"] = new_train["answers"].map(lambda answers: answers["text"][0])
new_train = new_train[new_train.answer != "EMPTY"]

# Horizontal bar chart of the remaining `question` value counts.
ax = new_train.question.value_counts().plot(kind="barh", legend=True, figsize=(10, 6), width=0.7)

plt.rcParams.update({'axes.titlesize': 'x-large'})
# Spelling corrected to "Trainingsdatensatz", matching the crawl plots.
plt.title(f"Anzahl der beantwortbaren Entitäten im Trainingsdatensatz von 'new-expected{na}'\n")
plt.xlim([0, 35000])
# Invert the y-axis so the first (most frequent) entry is drawn at the top.
plt.gca().invert_yaxis()
plt.legend(["Entitäten mit Häufigkeit"], loc="lower right")
add_value_labels(ax, horizontal=True)
plt.savefig(f"../misc/img/new-expected{na}-answerable-entities-count.png", dpi=300, facecolor="white")
plt.show()

In [None]:
# Row counts: rows kept in `new_train` versus the remainder of `train`.
ans = len(new_train)
no_ans = len(train) - ans

In [None]:
# Fraction of training rows that were filtered out of `new_train`.
no_ans / len(train)

In [None]:
# Rebuild the filtered frame: add the first answer text of each row as the
# `answer` column, then keep only rows where it differs from "EMPTY".
new_train = train.copy()
new_train["answer"] = new_train.apply(
    lambda row: row["answers"]["text"][0],
    axis=1,
)
new_train = new_train.loc[new_train["answer"].ne("EMPTY")]

In [None]:
# (rows, columns) of the filtered training frame.
new_train.shape

In [None]:
# (rows, columns) of the full training frame, for comparison.
train.shape

### `crawl-na`

In [None]:
# Suffix that is inserted into the dataset directory and file names below.
na = "-na"

# Read the three JSON-lines splits in the order train, val, test.
train, val, test = (
    pd.read_json(path_or_buf=f"../data/crawl{na}/crawl{na}-{split}.jsonl", lines=True)
    for split in ("train", "val", "test")
)

In [None]:
# Horizontal bar chart of how often each value of the `question` column
# (called "entity" in the plot title) occurs in the training split.
ax = train.question.value_counts().plot(
    kind="barh",
    legend=True,
    figsize=(10, 6),
    width=0.7,
)

plt.rcParams.update({'axes.titlesize': 'x-large'})
ax.set_title(f"Anzahl der Entitäten im Trainingsdatensatz von 'crawl{na}'\n\n")
ax.set_xlim([0, 3200])
# Invert the y-axis so the first (most frequent) entry is drawn at the top.
ax.invert_yaxis()
# Legend is centered above the plot area, inside the gap left by the title's trailing newlines.
ax.legend(
    ["Entitäten mit Häufigkeit"],
    loc="upper center",
    bbox_to_anchor=(0.5, 1.08),
)
add_value_labels(ax, horizontal=True)
plt.savefig(f"../misc/img/crawl{na}-entities-count.png", dpi=300, facecolor="white")
plt.show()

In [None]:
# Restrict the training split to rows whose first answer text is not "EMPTY".
new_train = train.copy()
new_train["answer"] = new_train.apply(
    lambda row: row["answers"]["text"][0],
    axis=1,
)
new_train = new_train.loc[new_train["answer"].ne("EMPTY")]

# Horizontal bar chart of the remaining `question` value counts.
ax = new_train.question.value_counts().plot(
    kind="barh",
    legend=True,
    figsize=(10, 6),
    width=0.7,
)

plt.rcParams.update({'axes.titlesize': 'x-large'})
ax.set_title(f"Anzahl der beantwortbaren Entitäten im Trainingsdatensatz von 'crawl{na}'\n")
ax.set_xlim([0, 3000])
# Invert the y-axis so the first (most frequent) entry is drawn at the top.
ax.invert_yaxis()
ax.legend(["Entitäten mit Häufigkeit"], loc="lower right")
add_value_labels(ax, horizontal=True)
plt.savefig(f"../misc/img/crawl{na}-answerable-entities-count.png", dpi=300, facecolor="white")
plt.show()

## Plot Logit Scores for QA Example

In [None]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [None]:
%%time
# Load the question-answering model and the tokenizer from the same
# pretrained checkpoint name so that vocabulary and weights match.
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [None]:
# Example input: the question and the passage that is passed to the tokenizer
# as the second segment (the text the answer span is selected from).
question = "How many residents does Würzburg have?"
answer_text = """
Würzburg is a city in the traditional region of Franconia in the north of the German state of Bavaria. 
At the next-down tier of local government it is the administrative seat of Lower Franconia.
The city has around 130000 residents."""

In [None]:
# Apply the tokenizer to the input text, treating them as a text-pair.
input_ids = tokenizer.encode(question, answer_text)

print(f'The input has a total of {len(input_ids)} tokens.')

# BERT only needs the token IDs, but for the purpose of inspecting the
# tokenizer's behavior, let's also get the token strings and display them.
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# For each token and its id...
# (`token_id` instead of `id`, so the builtin `id` is not shadowed in the
# notebook namespace.)
for token, token_id in zip(tokens, input_ids):

    # If this is the [SEP] token, add some space around it to make it stand out.
    if token_id == tokenizer.sep_token_id:
        print('')

    # Print the token string and its ID in two columns.
    print(f'{token:<12} {token_id:>6,}')

    if token_id == tokenizer.sep_token_id:
        print('')

# Search the input_ids for the first instance of the `[SEP]` token.
sep_index = input_ids.index(tokenizer.sep_token_id)

# The number of segment A tokens includes the [SEP] token itself.
num_seg_a = sep_index + 1

# The remainder are segment B.
num_seg_b = len(input_ids) - num_seg_a

# Construct the list of 0s and 1s.
segment_ids = [0]*num_seg_a + [1]*num_seg_b

# There should be a segment_id for every input token.
assert len(segment_ids) == len(input_ids)

# Run our example through the model. This is inference only, so gradient
# tracking is disabled to avoid building an autograd graph.
with torch.no_grad():
    outputs = model(torch.tensor([input_ids]), # The tokens representing our input text.
                    token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                    return_dict=True)

start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Find the tokens with the highest `start` and `end` scores. Convert the 0-d
# tensors to plain ints so they can be used directly as list indices.
answer_start = int(torch.argmax(start_scores))
answer_end = int(torch.argmax(end_scores))

# Start with the first token, then append the remaining answer tokens:
# subword tokens ('##...') are glued to the previous token, all others are
# separated by a space. An end index before the start index yields just the
# start token.
answer_parts = [tokens[answer_start]]
for answer_token in tokens[answer_start + 1:answer_end + 1]:
    if answer_token[0:2] == '##':
        answer_parts.append(answer_token[2:])
    else:
        answer_parts.append(' ' + answer_token)
answer = ''.join(answer_parts)

print()
print('Answer: "' + answer + '"')

In [None]:
# Switch to seaborn's white-grid theme for the following plots.
sns.set_theme(style="whitegrid")

# Tall figure and larger titles for the per-token bar charts.
params = {
    'figure.figsize': (14, 20),
    'axes.titlesize': 'xx-large',
}
plt.rcParams.update(params)

# Detach the logits from PyTorch and flatten them into 1D numpy arrays.
s_scores = start_scores.detach().numpy().flatten()
e_scores = end_scores.detach().numpy().flatten()

# The tokens serve as axis labels in the bar charts. Labels have to be unique,
# so each token gets its position in the sequence appended.
token_labels = [f'{token} - {i:>2}' for i, token in enumerate(tokens)]

## Saving figures

In [None]:
# Horizontal bar chart with one bar per token showing its start logit.
ax = sns.barplot(
    x=s_scores,
    y=token_labels,
    palette="Greens_d",
    ci=None,
)

# Grid lines help to line tokens up with their scores.
ax.grid(True)
ax.set_title('Logit-Scores der Starttokens der Voraussage\n')

# Uncomment to export the figure.
#plt.savefig("../misc/img/wurzburg_start_scores.png", dpi=600)
plt.show()

In [None]:
# Horizontal bar chart with one bar per token showing its end logit.
ax = sns.barplot(
    x=e_scores,
    y=token_labels,
    palette="Reds_d",
    ci=None,
)

# Grid lines help to line tokens up with their scores.
ax.grid(True)
ax.set_title('Logit-Scores der Endtokens der Voraussage\n')

# Uncomment to export the figure.
#plt.savefig("../misc/img/wurzburg_end_scores.png", dpi=600)
plt.show()

## Plot cross validation box plots

### expected

In [None]:
# F1 and EM scores that are summarized in the box plot below.
f1 = [95.54, 95.38, 95.46, 95.45, 95.59]
em = [94.43, 94.37, 94.45, 94.39, 94.57]

# Small figure: one box per metric.
plt.rcParams.update({"figure.figsize": (3, 4)})
sns.boxplot(data=[f1, em])
plt.xticks([0, 1], ["F1", "EM"])
plt.title("CV Verteilung \nnew-expected-na\n")
plt.savefig(
    "../misc/img/boxplot-new-expected.png",
    dpi=150,
    facecolor="white",
    bbox_inches='tight',
)
plt.show()

### crawl

In [None]:
# F1 and EM scores that are summarized in the box plot below.
f1 = [84.55, 86.92, 86.43, 85.7, 86.2]
em = [82.96, 85.49, 84.95, 84.2, 84.8]

# Small figure: one box per metric.
plt.rcParams.update({"figure.figsize": (3, 4)})
sns.boxplot(data=[f1, em])
plt.xticks([0, 1], ["F1", "EM"])
plt.title("CV Verteilung \ncrawl-na\n")
plt.savefig(
    "../misc/img/boxplot-crawl.png",
    dpi=150,
    facecolor="white",
    bbox_inches='tight',
)
plt.show()

In [None]:
# Mean F1 and mean EM of the most recently defined score lists.
np.mean(f1), np.mean(em)

In [None]:
# Standard deviation of F1 and EM of the most recently defined score lists.
# NOTE(review): np.std is called without `ddof`, i.e. with numpy's documented
# default (population standard deviation) — confirm this is the intended
# estimator for a handful of scores.
np.std(f1), np.std(em)