In [1]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
import torch
from pathlib import Path
from scipy.special import softmax
import numpy as np
import pandas as pd

TASK_LABEL_IDS = {
    "Sub-task A": ["OAG", "NAG", "CAG"],
    "Sub-task B": ["GEN", "NGEN"],
    "Sub-task C": ["OAG-GEN", "OAG-NGEN", "NAG-GEN", "NAG-NGEN", "CAG-GEN", "CAG-NGEN"]
}

task = 'Sub-task A'

model_version="X" #databank: modello preso dal loro sito; altro: modello da HF

if model_version == "databank": #<-- questa opzione non va... prova anche tu. 
    # Devi scaricare il modello seguendo le istruzioni sotto. Noi siamo task A, ENG
    # Make sure you have downloaded the required model file from https://databank.illinois.edu/datasets/IDB-8882752
    # Unzip the file at some model_path (we are using: "databank_model")
    model_path = next(Path("databank_model").glob("./*/output/*/model"))
    # Assuming you get the following type of structure inside "databank_model"
    # 'databank_model/ALL/Sub-task C/output/bert-base-multilingual-uncased/model'
    lang, task, _, base_model, _ = model_path.parts
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    
else: #funziona tutto OK
    lang, task, base_model = "ENG", "Sub-task A", "bert-base-uncased"
    base_model = f"socialmediaie/TRAC2020_{lang}_{task.split()[-1]}_{base_model}"
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForSequenceClassification.from_pretrained(base_model)

# For doing inference set model in eval mode
model.eval()
# If you want to further fine-tune the model you can reset the model to model.train()

task_labels = TASK_LABEL_IDS[task]



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_NAG(sentence):
    processed_sentence = f"{tokenizer.cls_token} {sentence}"
    tokens = tokenizer.tokenize(sentence)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([indexed_tokens])

    with torch.no_grad():
        #logits, = model(tokens_tensor, labels=None)
        logits = model(tokens_tensor, labels=None) #<<- prova anche a commentare questo e de-commentare sopra + v.sotto
    logits


    #preds = logits.detach().cpu().numpy() #<- decommentare
    preds = logits[0] #<- commentare
    preds_probs = softmax(preds, axis=1)
    trust_score = preds_probs[0][1]
    #preds = np.argmax(preds_probs, axis=1) # <- decommenta, idem sotto
    #preds_labels = np.array(task_labels)[preds]
    
    return trust_score


In [3]:
get_NAG('I feel you\'ve been rude. I don\'t want to see you again')

0.92768884

In [4]:
get_NAG('You are rude. Go away from me!!')

0.028387599

In [5]:
df = pd.read_csv('QA.csv')

In [6]:
i_scores = []
r_scores = []

insts = df.instruction.tolist()
resps = df.response.tolist()


for i in range(len(df)):
    inst = insts[i]
    resp = resps[i]
    
    try:
        i_score = get_NAG(inst)
    except:
        i_score = 999
    
    try:
        r_score = get_NAG(resp)
    except:
        r_score = 999   
    
    i_scores.append(i_score)
    r_scores.append(r_score)

df['inst_trust'] = i_scores
df['resp_trust'] = r_scores

Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


In [16]:
df[df['resp_trust']<0.5]['response'].to_list()

['Steve Urkel',
 'Atlantic Ocean\nArctic Ocean\nPacific Ocean\nIndian Ocean',
 'Mississippi and Missouri rivers',
 'Stark, Lannister, Baratheon, Tyrell, Arryn, Martell, Karstark, Mormont.',
 'A fixed asset is one which is intended to be used for several years. Examples are buildings,\nmachinery and vehicles.',
 '8 marriages',
 'The X-Files, The Crown, Sex Education, Hannibal, The Fall, The First Lady, The Great',
 'The Mississippi River',
 'Andrew Jackson',
 'Some well known classical composers are Mozart, Bach and Beethoven',
 'Jeopardy!',
 'Tom Brady',
 'Katy Perry.',
 'Saw woman in half',
 '1990',
 'Sanya Richards-Ross, Marlo Hampton, Drew Sidora, Kenya Moore and Shereé Whitfield',
 'Maldon Massey',
 'Charles III',
 "Lilith is a figure who appears in both Mesopotamian and Judaic mythology and is purported to be Adam's first wife. She was banished from the Garden of Eden for not being subservient and obeying him.",
 'Tortola, Anegada, Virgin Gorda, and Jost Van Dyke',
 'Stephen Curry