In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import nltk
import pandas as pd
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import load
import progressbar
import itertools
import string



In [3]:
# load the model LSGBERT_2560
output_dir = '/content/drive/MyDrive/Thesis/calibration/saved_model_multi_2560'
device = torch.device('cuda')
model = AutoModelForSequenceClassification.from_pretrained(output_dir, output_hidden_states=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model.to(device)

LSGBertForSequenceClassification(
  (bert): LSGBertModel(
    (embeddings): LSGBertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (global_embeddings): Embedding(512, 768)
    )
    (encoder): LSGBertEncoder(
      (layer): ModuleList(
        (0-11): 12 x LSGBertLayer(
          (attention): LSGAttention(
            (self): LSGSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (attention): LSGAttentionProduct(
                (attention): BaseAttentionProduct(
                  (drop

In [4]:
# load the entire test set
test = pd.read_csv('/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation/ILDC_expert/anno_dataset.csv')

In [5]:
pred_labels = list(test['label'])

In [6]:
nltk.download('punkt')
nltk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#model, tokenizer, pred labels and all the libraries are ready

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# load the occlusion scores obtained previosly
chunk_scores = load("/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation/only_annotation/occlusion_anno/occwts_anno.npy", allow_pickle = True)
chunk_scores = list(chunk_scores)

In [8]:
# load the embedding of the entire test set
path_transformer_chunk_embeddings_test = '/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation/only_annotation/compute_embs_anno/LSGBERT_npy_files_cls_multi_anno/LSGBERT_cls_test_anno.npy'
x_test0 = load(path_transformer_chunk_embeddings_test, allow_pickle= True)

In [9]:
# check if the dimensions of test embedding, occlusion scores and number of chunks for each documents is the same
chunk_scores_anno = []
x_test0_anno = []
for i in range(len(test)):
  index = test.index[i]
  test.iloc[i]['text']
  all_toks = tokenizer.tokenize(test.iloc[i]['text'])
  if(len(all_toks) > 10000):
      all_toks = all_toks[len(all_toks)-10000:]
  splitted_toks = []
  l=0
  r=510
  while(l<len(all_toks)):
    splitted_toks.append(all_toks[l:min(r,len(all_toks))])
    l+=410
    r+=410

  CLS = tokenizer.cls_token
  SEP = tokenizer.sep_token
  e_sents = []
  for l_t in splitted_toks:
    l_t = [CLS] + l_t + [SEP]
    encoded_sent = tokenizer.convert_tokens_to_ids(l_t)
    e_sents.append(encoded_sent)

  e_sents = pad_sequences(e_sents, maxlen=512, value=0, dtype="long", padding="post")

  if len(e_sents) != len(x_test0[index]) or len(e_sents) != len(chunk_scores[index]) or len(chunk_scores[index]) != len(x_test0[index]):
    print(f'Dimension problem at index: {index}')

Token indices sequence length is longer than the specified maximum sequence length for this model (5458 > 4096). Running this sequence through the model will result in indexing errors


In [10]:
def lsgbert_tokenize(sents, tokenizer):
  tok_sents = []
  for sen in sents:
    tok_sents.append(tokenizer.tokenize(sen))

  return tok_sents

In [11]:
def sentence_marker(tokenized_sents):
  marker_array = []
  sent_num=1
  for tokenized_sentence in tokenized_sents: # for each sentence
    sentence_marker = []
    for i in range(len(tokenized_sentence)): # for each token in each sentence
      if(i==0):
        sentence_marker.append(sent_num) # give a num to the first token
      else:
        sentence_marker.append(0) # and 0 to the rest of the tokens

    sent_num+=1
    marker_array.append(sentence_marker)

  return marker_array

In [12]:
def chunked_tokens_maker(all_toks, markers):
  splitted_toks = []
  splitted_markers = []
  l=0
  r=510
  while(l<len(all_toks)):
    splitted_toks.append(all_toks[l:min(r,len(all_toks))])
    splitted_markers.append(markers[l:min(r,len(markers))])
    l+=410
    r+=410

  return splitted_toks, splitted_markers

In [13]:
def calculate_num_of_sents(chunk_marker_list):
  ct=0
  for i in range(len(chunk_marker_list)):
    if(chunk_marker_list[i] != 0):
      ct+=1

  return ct-1

In [14]:
def sentence_tokens_maker(marks, chunk_toks): # this function creates a list of index pairs that indicates the beginning and the end of the sentences of a chunk
  pair_of_ids = []
  st = -1000
  ed = -1000
  for i, mark in enumerate(marks):
    if(mark==-777): # if true it means we are at the beginning of a sentence
      st=i # we set st to the current index

    if(mark!=-777 and mark!=777 and mark!=0): # if true it means we have a token inside a sentence
      ed=i-1 # we set ed to the previous index
      pair_of_ids.append((st,ed))
      st=i

    if(mark==777): # if true it means that we are at the end of a sentence
      ed=i
      pair_of_ids.append((st,ed))

  return pair_of_ids # it will contains all the pairs of index that indicates the beginning and the end of the sentences inside each chunk

In [15]:
def att_masking(input_ids):
  attention_masks = []
  for sent in input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)
  return attention_masks

In [16]:
def get_output_for_one_vec(input_id, att_mask):
  input_ids = torch.tensor(input_id)
  att_masks = torch.tensor(att_mask)
  input_ids = input_ids.unsqueeze(0)
  att_masks = att_masks.unsqueeze(0)
  model.eval()
  input_ids = input_ids.to(device)
  att_masks = att_masks.to(device)
  with torch.no_grad():
      outputs = model(input_ids=input_ids, attention_mask=att_masks)

  logits = outputs.logits

  return logits

In [17]:
def get_LSGBERT_output_logits(encoded_sents, tokenizer, model):
  e_sents = []
  e_sents.append(encoded_sents)
  e_sents = pad_sequences(e_sents, maxlen=512, value=0, dtype="long", padding="pre")
  att_masks = att_masking(e_sents)
  return get_output_for_one_vec(e_sents[0], att_masks[0])

In [18]:
def lsgbert_detok(tokens):
    # Initialize an empty list for the detokenized sequence
    detokenized_sequence = []

    # Initialize a variable for handling '##' signs
    current_token = ""

    # Iterate through the tokens in the sequence
    for token in tokens:
        if token.startswith("##"):
            # If the token starts with '##', remove '##' and append the rest of the token to the previous token
            current_token += token[2:]
        else:
            # If the token does not start with '##', check if it is a punctuation mark
            if current_token and token in string.punctuation:
                # If the current token is a punctuation mark, attach it to the previous token
                current_token += token
            else:
                # If the current token is not a punctuation mark, add the previous token (if present) to the detokenized sequence
                if current_token:
                    detokenized_sequence.append(current_token)
                # Add the current token to the detokenized sequence
                current_token = token

    # Make sure to add the last token to the detokenized sequence
    if current_token:
        detokenized_sequence.append(current_token)


    tokens_concat = []

    prev_tok=""

    for token in detokenized_sequence:
      if token.endswith('-'):
          prev_tok = token
      else:
          if (prev_tok != ""):
              token = prev_tok + token
              prev_tok = ""
          tokens_concat.append(token)

    if (prev_tok!=""):
        tokens_concat.append(prev_tok)

    # Join the tokens in the detokenized sequence into a string
    #detokenized_text = " ".join(final_text)
    detokenized_text = " ".join(tokens_concat)

    return detokenized_text


In [20]:
def get_explanation(chunked_lsgbert_tokens, chunked_markers, chunk_scores, doc_num, tokenizer, predicted_label):
    explanation = ""
    for chunk_number,score in enumerate(chunk_scores[doc_num]): # we iterate over the chunk score of each document
        if(chunk_number == 0): # # we set specific markers for each chunk
          chunked_markers[chunk_number][0] = -777 # we set -777 for the first marker
          chunked_markers[chunk_number][-1] = 777 # we set 777 for the last marker of the chunk
        else: # for the rest of the chunks that are not the first one
          if(len(chunked_markers[chunk_number]) < 101): # if the number of markers in the current chunk is less than 101
            continue # if this is true the chunk is too small to be elaborated e it doesnt contains enough info, so we skip direclty to the next chunk
          # if the chunk is => 101
          chunked_markers[chunk_number][100] = -777 # we set the 100th token to -777 (overlap 100 token)
          chunked_markers[chunk_number][-1] = 777 # we set the last token to 777

        if(score < 0): # chunk with negative scores are avoided, we concetrate only to the positive ones
           #if (doc_num != 52): # the document 1999_1001.txt is composed of only 2 chunks the first chunk has occ score negative and the second chunk has lenght <101, we add an exception fro this doc or else we will obtain no explanation
           continue

        ct_sent = calculate_num_of_sents(chunked_markers[chunk_number])
        #print("ct_sent: {:}".format(ct_sent))
        top_k = 0.4*ct_sent
        dict_sent_to_score = {} # dizionario vuoto che verrà utilizzato per memorizzare le frasi del documento e i punteggi relativi alle frasi.
        #print(len(chunked_markers[chunk_number]))


        pair_of_ids = sentence_tokens_maker(chunked_markers[chunk_number], chunked_lsgbert_tokens[chunk_number]) # per ottenere una lista di coppie di indici
        # che rappresentano l'inizio e la fine di ciascuna frase nel chunk corrente. Questi indici saranno utilizzati per estrarre le frasi dal testo tokenizzato.

        #print(pair_of_ids)
        # lens = [len(t) for t in tokens_as_sentences_list]
        # print(str(sum(lens)) + "\n")

        CLS = tokenizer.cls_token
        SEP = tokenizer.sep_token
        PAD = tokenizer.pad_token

        original_logits = get_LSGBERT_output_logits(tokenizer.convert_tokens_to_ids(chunked_lsgbert_tokens[chunk_number] + [SEP] + [CLS]), tokenizer, model)
        #  Vengono ottenute le logits (uscite non normalizzate) dal modello per il chunk corrente. Per farlo, viene creato un input tokenizzato
        # che contiene i token del chunk corrente, il token [SEP] (separatore) e il token [CLS]

        original_score = float(original_logits[0][predicted_label]) # we access to the logit of the first dimensione and we take the value associated to the label

        for i in range(len(pair_of_ids)): # scorriamo tra tutte le coppie di indici
            if(pair_of_ids[i][0] == -1000): # Se l'indice iniziale della coppia è stato impostato a -1000 (questo accade se una frase inizia all'interno del chunk)
              pair_of_ids[i] = (0,pair_of_ids[i][1]) # viene aggiustato a 0.
            normalizing_length = pair_of_ids[i][1] - pair_of_ids[i][0] + 1 #  Calcola la lunghezza della frase normalizzata.
            if(normalizing_length == 0):
              continue
            pad_sentence = [PAD]*normalizing_length # Crea una lista di token di riempimento [PAD] con una lunghezza uguale alla lunghezza della frase normalizzata.
            #print("pad_length = {:}".format(len(pad_sentence)))
            left = []
            right = []
            left = chunked_lsgbert_tokens[chunk_number][:pair_of_ids[i][0]] # Estrae i token a sinistra della frase dalla lista di token del chunk corrente.
            right = chunked_lsgbert_tokens[chunk_number][pair_of_ids[i][1]+1:] # Estrae i token a destra della frase dalla lista di token del chunk corrente.

            final_tok_sequence = left + pad_sentence + right + [SEP] + [CLS]

            '''
            la frase di riferimento viene effettivamente sostituita con i token [PAD] per valutare quanto sia importante quella frase all'interno del contesto
            del documento. Questa sostituzione è fatta per determinare quanto la frase contribuisce o influenza la previsione della classe per il documento nel
            suo insieme.

            Quindi, per valutare l'importanza relativa delle frasi nel contesto del documento, vengono creati degli esempi in cui la frase di interesse viene
            sostituita con [PAD], e successivamente il modello calcola il punteggio previsto per questi esempi. Questi punteggi vengono quindi utilizzati per
            determinare quanto ciascuna frase contribuisce alla previsione della classe per il documento.
            '''

            #print("final_tok_sequence = {:}".format(len(final_tok_sequence)))
            encoded_sents = tokenizer.convert_tokens_to_ids(final_tok_sequence) # Converte la sequenza di token in ID di token utilizzando il tokenizer.
            logits = get_LSGBERT_output_logits(encoded_sents, tokenizer, model) #  Ottiene le logits del modello per la sequenza di token.
            score_for_predicted_label = float(logits[0][predicted_label]) # Estrae il punteggio previsto dal modello per la classe predicted_label dalla logits.

            sent_score = 100

            if(score_for_predicted_label > original_score): # verifica se il punteggio previsto per la frase è maggiore del punteggio originale del documento
              sent_score = -1*(score_for_predicted_label - original_score) # se il punteggio previsto è maggiore del punteggio originale
              # calcoliamo sent_score sottraendo il punteggio previsto dal punteggio originale. Questo punteggio risulterà negativo e rappresenterà quanto
              # il punteggio previsto supera il punteggio originale.
              # Questo è interessante perché ci indica quanto una frase contribuisce positivamente alla classe prevista rispetto al documento originale.
            else: # se il punteggio previsto è minore o uguale al punteggio originale
              sent_score = original_score - score_for_predicted_label # allora calcoliamo sent_score sottraendo il punteggio originale dal punteggio previsto.
              # misura quanto la frase contribuisce negativamente o non contribuisce affatto alla classe prevista rispetto al documento originale.

            sent_score_norm = sent_score/normalizing_length
            sentence_in_words = lsgbert_detok(chunked_lsgbert_tokens[chunk_number][pair_of_ids[i][0]:pair_of_ids[i][1]+1]) # Estrae la frase dal testo tokenizzato del chunk corrente utilizzando gli indici.
            dict_sent_to_score[sentence_in_words] = sent_score_norm # Aggiunge la frase e il suo punteggio normalizzato al dizionario

        sort_scores = sorted(dict_sent_to_score.items(), key=lambda x: x[1], reverse=True) #  Ordina le frasi e i punteggi nel dizionario in base ai punteggi in ordine decrescente.
        sorted_sentences =[]

        # Il codice successivo aggiunge le migliori frasi (quella con i punteggi più alti) alla spiegazione finale in base al valore top_k precedentemente
        # calcolato. La spiegazione viene costruita concatenando queste frasi.
        for i in sort_scores:
          sorted_sentences.append(i[0])

        for sentence in sorted_sentences[:int(top_k)]:
          explanation+=sentence


    return explanation

In [21]:
final_explanations = {}
for i in progressbar.progressbar(range(len(test["text"]))): # for each doc in df
  doc_text = test["text"].iloc[i] # extract the text of the doc
  doc_name = test["name"].iloc[i] # extract the doc name
  sents = nltk_tokenizer.tokenize(doc_text) # divide the text in sentences
  lsgbert_tokenized_sents = lsgbert_tokenize(sents, tokenizer) # tokenize the sentences
  marked_tokenized_sents = sentence_marker(lsgbert_tokenized_sents) # Assign numerical markers to words within tokenized sentences, so that words in the same sentence have the same numerical value while words in different sentences have a different numerical value.
  lsgbert_tokens = list(itertools.chain.from_iterable(lsgbert_tokenized_sents)) # convert in a 1D list
  markers = list(itertools.chain.from_iterable(marked_tokenized_sents)) # convert in a 1D list
  if(len(lsgbert_tokens) > 10000):
    lsgbert_tokens = lsgbert_tokens[len(lsgbert_tokens)-10000:]
    markers = markers[len(markers)-10000:]

  chunked_lsgbert_tokens, chunked_markers = chunked_tokens_maker(lsgbert_tokens, markers) # return tokens and mask in chunk of 510

  explanation_of_this_doc = get_explanation(chunked_lsgbert_tokens, chunked_markers, chunk_scores, i, tokenizer, pred_labels[i])
  final_explanations[doc_name] = explanation_of_this_doc

100% (56 of 56) |########################| Elapsed Time: 0:04:22 Time:  0:04:22


In [22]:
len(final_explanations)

56

In [None]:
tot = 0
mean= 0
for key, value in final_explanations.items():
    if isinstance(value, str):
        print(f"Length of value for {key}: {len(value)}")
        tot += len(value)
        mean =  tot/len(final_explanations)
    else:
        print(f"{key} is not a string.")
print(f'The mean lenght of the explanation is {mean}')

Length of value for 1962_213.txt: 4199
Length of value for 1962_47.txt: 2604
Length of value for 1951_35.txt: 6538
Length of value for 1953_74.txt: 6516
Length of value for 1960_100.txt: 3534
Length of value for 1960_72.txt: 2436
Length of value for 1963_37.txt: 6104
Length of value for 1951_33.txt: 8122
Length of value for 1952_42.txt: 10071
Length of value for 1953_26.txt: 5392
Length of value for 1962_128.txt: 2695
Length of value for 1959_5.txt: 5704
Length of value for 1951_30.txt: 3010
Length of value for 1962_118.txt: 5115
Length of value for 1952_60.txt: 1618
Length of value for 1951_40.txt: 5494
Length of value for 1959_26.txt: 1398
Length of value for 1961_363.txt: 12580
Length of value for 1954_144.txt: 7474
Length of value for 1951_64.txt: 4382
Length of value for 1960_12.txt: 3688
Length of value for 1954_0.txt: 2850
Length of value for 1961_344.txt: 3249
Length of value for 1959_76.txt: 6973
Length of value for 1960_327.txt: 2351
Length of value for 1951_36.txt: 1426
Leng

In [23]:
final_explanations

{'1960_12.txt': 'the relevant portion of this resolution was as follows from ancient times there has been a tradition in our family to grant a village to the maharani for her enjoyment in order to maintain her status and dignity.the question that arises for decision is whether the annual cash allowance paid to the appellant in circumstances stated below falls within paragraph 15( 1)( i) of the part b states( taxation concessions) order, 1950( hereinafter referred to as the order) and is therefore exempt from income-tax.the assessee, her highness maharani kesarkunverba saheb, the raj mata of morvi state, is the appellant and the commissioner of income-tax, bombay north, is the respondent.5, 000 per month be paid to the appellant and provision be made for the amount by the treasury office in the budget in the same manner as before.to this the appellant took objection and her son the maharaja of morvi also wrote a letter to the rajpramukh of saurashtra stating that the village had been il

In [2]:
import json
#json_file = open("/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation/only_annotation/occlusion_anno/occ_exp_anno_0.4.json", "w")
#json.dump(final_explanations, json_file)

In [3]:
json_file = open("/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation/only_annotation/occlusion_anno/occ_exp_anno_0.4.json","r")
data = json.load(json_file)

In [4]:
print(data)

{'1960_12.txt': 'the relevant portion of this resolution was as follows from ancient times there has been a tradition in our family to grant a village to the maharani for her enjoyment in order to maintain her status and dignity.the question that arises for decision is whether the annual cash allowance paid to the appellant in circumstances stated below falls within paragraph 15( 1)( i) of the part b states( taxation concessions) order, 1950( hereinafter referred to as the order) and is therefore exempt from income-tax.the assessee, her highness maharani kesarkunverba saheb, the raj mata of morvi state, is the appellant and the commissioner of income-tax, bombay north, is the respondent.5, 000 per month be paid to the appellant and provision be made for the amount by the treasury office in the budget in the same manner as before.to this the appellant took objection and her son the maharaja of morvi also wrote a letter to the rajpramukh of saurashtra stating that the village had been il