# Exp 005: Test Classifiers
Qualitative evaluation of the classifiers' scores.

In [2]:
from dotenv import load_dotenv
load_dotenv()
import os

import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import random

import nltk
from nltk.tokenize import sent_tokenize

import sys
sys.path.append('../source')
import models
import importlib # in case models.py has changed
importlib.reload(models)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


<module 'models' from '/mnt/qb/work/meurers/mpb672/grammarctg/experiments/../source/models.py'>

In [3]:
# Corpus of texts; its `text` column is split into sentences further down.
texts = pd.read_csv("../data/cefr_leveled_texts.csv")
# Construction inventory; later cells read its 'Level', 'Can-do statement' and 'Example' columns.
df = pd.read_json('../data/egp.json')

In [4]:
# Level whose classifiers are inspected throughout this notebook; it is matched
# against the 'Level' column of the data frames.
level = "B1"
# Keyed by level so that later cells can look the model up as level_models[level].
# A plain one-entry dict replaces the former comprehension over `[level]`, which
# reused `level` as its loop variable and was needlessly hard to read.
level_models = {level: models.load_model(level, df)}

In [4]:
# Tokenize each text into its sentences (one list per text) ...
sents_per_text = texts["text"].apply(sent_tokenize).tolist()
# ... and flatten them into a single corpus-wide list of sentences.
all_sents = [sent for doc_sents in sents_per_text for sent in doc_sents]
len(all_sents)

36241

In [7]:
# Score every corpus sentence with the model of the current level and keep the
# result as a NumPy array (one row per sentence).
index = (
    models.get_scores(level_models[level], all_sents, use_tqdm=True)
    .cpu()
    .numpy()
)
index.shape

Computing scores...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 284/284 [01:52<00:00,  2.53it/s]


(36241, 338)

In [10]:
# Load the per-level file and keep only the rows whose 'Level' equals the current level.
train_level = (
    pd.read_json(f"../../LLM-grammar/dat/egp_{level}.json")
    .loc[lambda frame: frame['Level'] == level]
)

In [11]:
# Quick check on the first construction: score its augmented examples with task 0.
augmented_examples = train_level.iloc[0]["augmented_examples"]
pos_scores = (
    models.get_scores(level_models[level], augmented_examples, use_tqdm=False, task_id=0)
    .cpu()
    .numpy()
)

In [26]:
def show_egp(egp_nr = 0, top_n=50, keywords = ['and']):
    """Print a qualitative report for one construction classifier of the current level.

    The report contains the can-do statement and example of the construction,
    the share of augmented positive / negative examples that pass the
    respective check, a random sample of both example sets, and the corpus
    sentences whose score lies above the threshold.

    Relies on the notebook globals `df`, `level`, `index`, `train_level`,
    `level_models`, `models` and `all_sents`.

    Args:
        egp_nr: Position of the construction among the rows of the current
            level; also used as the column of `index` and as `task_id`.
        top_n: The threshold is the `top_n`-th highest corpus score of this
            construction.
        keywords: Substrings to highlight with `**...**` in the printed
            sentences. Each sentence is prefixed with "Yes" if it contains at
            least one of them, otherwise with "No". The list is only read,
            never modified.
    """
    level_rows = df[df['Level']==level]
    print(f'NR: {egp_nr}')
    print(level_rows.iloc[egp_nr]['Can-do statement'])
    print(level_rows.iloc[egp_nr]['Example'])

    # Only the column of this construction is needed; slicing it first avoids
    # comparing the whole score matrix against the threshold.
    corpus_scores = index[:, egp_nr]
    threshold = np.partition(corpus_scores, -top_n)[-top_n]

    augmented_examples = train_level.iloc[egp_nr].augmented_examples
    pos_scores = models.get_scores(level_models[level], augmented_examples, use_tqdm=False, task_id=egp_nr).cpu().numpy()
    # presumably column 1 / column 0 hold the positive / negative class score -- TODO confirm
    print(f'Positive example: {((pos_scores[:,1] - pos_scores[:,0])>threshold).mean()}')
    # Cap the sample size: random.sample raises ValueError when asked for more
    # items than are available.
    print(random.sample(augmented_examples, min(10, len(augmented_examples))))
    negative_augmented_examples = train_level.iloc[egp_nr].augmented_negative_examples
    neg_scores = models.get_scores(level_models[level], negative_augmented_examples, use_tqdm=False, task_id=egp_nr).cpu().numpy()
    print(f'Negative example: {((neg_scores[:,1] - neg_scores[:,0])<0).mean()}')
    print(random.sample(negative_augmented_examples, min(10, len(negative_augmented_examples))))

    # Strict comparison: the sentence sitting exactly at the threshold is not
    # selected, so fewer than top_n sentences are printed.
    selected = corpus_scores > threshold
    print(selected.sum())

    for sent_idx in np.flatnonzero(selected):
        sent = all_sents[sent_idx]
        # Decide on the untouched sentence so that the inserted ** markup can
        # never produce a keyword hit.
        has_keyword = any(keyword in sent for keyword in keywords)
        for keyword in keywords:
            sent = sent.replace(keyword, f"**{keyword}**")
        print(f'{"Yes" if has_keyword else "No"}: {sent}')
    print("_" * 100)

# Substrings to highlight per construction number; constructions without an
# entry are shown without any highlighting.
keywords = {
    0: [', '],
    1: [' and '],
    2: [' and '],
    3: [' and '],
    4: ['-'],
    10: ['enough'],
    11: ['quite'],
    12: ['too'],
    13: [' a'],
    100: ['going to'],
    101: ['going to'],
    102: ['going to'],
    104: ['going to', 'but'],
    109: ['as soon as'],
    150: ['should'],
    151: ['should have'],
    200: ['?'],
    201: ['ing'],
    202: ['because'],
}
# Only a small slice is inspected here; use range(len(df[df['Level']==level]))
# to walk through every construction of the level.
for nr in range(250, 255):
    show_egp(nr, keywords=keywords.get(nr, []))

NR: 250
Can use the singular reflexive pronouns 'myself', 'yourself', 'himself' and 'herself' after prepositions where the object of the preposition is the same as the subject of the verb.
He told me about himself and how he started playing the guitar. 

Now she was very proud of herself.
Positive example: 0.8927272727272727
["I'm going to treat myself to a nice dinner.", 'Are you talking to yourself again?', 'He found himself daydreaming during the boring lecture.', 'He was feeling good about himself after completing the marathon.', 'My brother is taking care of himself.', 'I need to take care of myself.', 'You need to take care of yourself.', "She's so proud of herself for finishing the race.", 'We need to be honest with ourselves about our mistakes.', 'Do you ever find yourself questioning your decisions?']
Negative example: 0.9927272727272727
['You have to make the decision on your own.', 'They enjoyed the party with their friends.', 'She found it difficult to relax in front of new

# Saliency maps
The gradient of the loss with respect to each input token's embedding is assumed to reflect how sensitive the model output is to that token.

In [60]:
import torch
text = "It is mine."
task_id = 252
model = level_models[level]
# Place the inputs on whatever device the model lives on instead of assuming 'cuda'.
model_device = next(model.parameters()).device
for param in model.parameters():
    param.requires_grad = True

# Reset first so that a hook that never fires fails loudly below instead of
# silently reusing the gradients of a previous run of this cell.
saved_grads = None

def save_grad(grads):
    """Store the gradient handed over by the backward hook in the global `saved_grads`."""
    global saved_grads
    saved_grads = grads

embeddings = model.bert.get_input_embeddings()
# NOTE(review): register_backward_hook is deprecated in recent PyTorch versions in
# favour of register_full_backward_hook; kept as is so the captured gradient
# (gradient w.r.t. the embedding layer's output) stays the same -- confirm before switching.
hook = embeddings.register_backward_hook(lambda module, grad_input, grad_output: save_grad(grad_output[0]))

try:
    encoding = models.bert_tokenizer.batch_encode_plus(
        [text],
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(model_device)
    print(input_ids.shape)
    attention_mask = encoding['attention_mask'].to(model_device)

    model.eval()
    outputs = model.forward(input_ids, attention_mask=attention_mask, task_id=task_id)
    criterion = torch.nn.CrossEntropyLoss()
    print(outputs)
    # The loss is taken against class 0; its gradient is what the hook captures.
    loss = criterion(outputs, torch.tensor([0], device=model_device))
    loss.backward()
finally:
    # Always detach the hook, even if the forward/backward pass fails; otherwise
    # every re-run of this cell would leave one more hook on the embedding layer.
    hook.remove()

# Per token: sum the gradient over the embedding dimension, take the absolute
# value and scale so that the most salient token gets score 1.
token_grads = saved_grads.sum(dim=2).abs()
scores = token_grads / token_grads.max()
tokens = models.bert_tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())
# NOTE: the dict is keyed by token text, so repeated tokens (e.g. all the
# [PAD] positions) collapse into one entry holding the score of the last occurrence.
saliency_map = dict(zip(tokens, scores[0,:].cpu().numpy().tolist()))
print(saliency_map)

torch.Size([1, 128])
tensor([[ 2.5708, -2.5498]], device='cuda:0', grad_fn=<AddmmBackward0>)
{'[CLS]': 0.057692307978868484, 'it': 0.4615384638309479, 'is': 0.07692307978868484, 'mine': 1.0, '.': 0.6153846383094788, '[SEP]': 0.07692307978868484, '[PAD]': 0.0}


