In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
import torch
from transformers import AutoTokenizer, BertForSequenceClassification
import nltk
import pandas as pd
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import load
import progressbar
import itertools
import string



In [3]:
output_dir = '/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation_caselawbert/CaseLawBERT_bigru_occ/saved_model_multi_caselawbert'
device = torch.device('cuda')
model = BertForSequenceClassification.from_pretrained(output_dir, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [4]:
# load the entire test set
test = pd.read_csv('/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation/ILDC_expert/anno_dataset.csv')

In [5]:
pred_labels = list(test['label'])

In [6]:
nltk.download('punkt')
nltk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#model, tokenizer, pred labels and all the libraries are ready

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# load the occlusion scores obtained previosly
chunk_scores = load("/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation_caselawbert/only_annotation/occlusion_anno/occwts_anno.npy", allow_pickle = True)
chunk_scores = list(chunk_scores)

In [8]:
# load the embedding of the entire test set
path_transformer_chunk_embeddings_test = '/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation_caselawbert/only_annotation/compute_embs_anno/CaseLawBERT_npy_files_cls_multi_anno/CaseLawBERT_cls_test_anno.npy'
x_test0 = load(path_transformer_chunk_embeddings_test, allow_pickle= True)

In [9]:
# check if the dimensions of test embedding, occlusion scores and number of chunks for each documents is the same
chunk_scores_anno = []
x_test0_anno = []
for i in range(len(test)):
  index = test.index[i]
  test.iloc[i]['text']
  all_toks = tokenizer.tokenize(test.iloc[i]['text'])
  if(len(all_toks) > 10000):
      all_toks = all_toks[len(all_toks)-10000:]
  splitted_toks = []
  l=0
  r=510
  while(l<len(all_toks)):
    splitted_toks.append(all_toks[l:min(r,len(all_toks))])
    l+=410
    r+=410

  CLS = tokenizer.cls_token
  SEP = tokenizer.sep_token
  e_sents = []
  for l_t in splitted_toks:
    l_t = [CLS] + l_t + [SEP]
    encoded_sent = tokenizer.convert_tokens_to_ids(l_t)
    e_sents.append(encoded_sent)

  e_sents = pad_sequences(e_sents, maxlen=512, value=0, dtype="long", padding="post")

  if len(e_sents) != len(x_test0[index]) or len(e_sents) != len(chunk_scores[index]) or len(chunk_scores[index]) != len(x_test0[index]):
    print(f'Dimension problem at index: {index}')

In [10]:
def caselawbert_tokenize(sents, tokenizer):
  tok_sents = []
  for sen in sents:
    tok_sents.append(tokenizer.tokenize(sen))

  return tok_sents

In [11]:
def sentence_marker(tokenized_sents):
  marker_array = []
  sent_num=1
  for tokenized_sentence in tokenized_sents: # for each sentence
    sentence_marker = []
    for i in range(len(tokenized_sentence)): # for each token in each sentence
      if(i==0):
        sentence_marker.append(sent_num) # give a num to the first token
      else:
        sentence_marker.append(0) # and 0 to the rest of the tokens

    sent_num+=1
    marker_array.append(sentence_marker)

  return marker_array

In [12]:
def chunked_tokens_maker(all_toks, markers):
  splitted_toks = []
  splitted_markers = []
  l=0
  r=510
  while(l<len(all_toks)):
    splitted_toks.append(all_toks[l:min(r,len(all_toks))])
    splitted_markers.append(markers[l:min(r,len(markers))])
    l+=410
    r+=410

  return splitted_toks, splitted_markers

In [13]:
def calculate_num_of_sents(chunk_marker_list):
  ct=0
  for i in range(len(chunk_marker_list)):
    if(chunk_marker_list[i] != 0):
      ct+=1

  return ct-1

In [14]:
def sentence_tokens_maker(marks, chunk_toks): # this function creates a list of index pairs that indicates the beginning and the end of the sentences of a chunk
  pair_of_ids = []
  st = -1000
  ed = -1000
  for i, mark in enumerate(marks):
    if(mark==-777): # if true it means we are at the beginning of a sentence
      st=i # we set st to the current index

    if(mark!=-777 and mark!=777 and mark!=0): # if true it means we have a token inside a sentence
      ed=i-1 # we set ed to the previous index
      pair_of_ids.append((st,ed))
      st=i

    if(mark==777): # if true it means that we are at the end of a sentence
      ed=i
      pair_of_ids.append((st,ed))

  return pair_of_ids # it will contains all the pairs of index that indicates the beginning and the end of the sentences inside each chunk

In [15]:
def att_masking(input_ids):
  attention_masks = []
  for sent in input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)
  return attention_masks

In [16]:
def get_output_for_one_vec(input_id, att_mask):
  input_ids = torch.tensor(input_id)
  att_masks = torch.tensor(att_mask)
  input_ids = input_ids.unsqueeze(0)
  att_masks = att_masks.unsqueeze(0)
  model.eval()
  input_ids = input_ids.to(device)
  att_masks = att_masks.to(device)
  with torch.no_grad():
      outputs = model(input_ids=input_ids, attention_mask=att_masks)

  logits = outputs.logits

  return logits

In [17]:
def get_caselawbert_output_logits(encoded_sents, tokenizer, model):
  e_sents = []
  e_sents.append(encoded_sents)
  e_sents = pad_sequences(e_sents, maxlen=512, value=0, dtype="long", padding="pre")
  att_masks = att_masking(e_sents)
  return get_output_for_one_vec(e_sents[0], att_masks[0])

In [18]:
def caselawbert_detok(tokens):
    # Initialize an empty list for the detokenized sequence
    detokenized_sequence = []

    # Initialize a variable for handling '##' signs
    current_token = ""

    # Iterate through the tokens in the sequence
    for token in tokens:
        if token.startswith("##"):
            # If the token starts with '##', remove '##' and append the rest of the token to the previous token
            current_token += token[2:]
        else:
            # If the token does not start with '##', check if it is a punctuation mark
            if current_token and token in string.punctuation:
                # If the current token is a punctuation mark, attach it to the previous token
                current_token += token
            else:
                # If the current token is not a punctuation mark, add the previous token (if present) to the detokenized sequence
                if current_token:
                    detokenized_sequence.append(current_token)
                # Add the current token to the detokenized sequence
                current_token = token

    # Make sure to add the last token to the detokenized sequence
    if current_token:
        detokenized_sequence.append(current_token)


    tokens_concat = []

    prev_tok=""

    for token in detokenized_sequence:
      if token.endswith('-'):
          prev_tok = token
      else:
          if (prev_tok != ""):
              token = prev_tok + token
              prev_tok = ""
          tokens_concat.append(token)

    if (prev_tok!=""):
        tokens_concat.append(prev_tok)

    # Join the tokens in the detokenized sequence into a string
    #detokenized_text = " ".join(final_text)
    detokenized_text = " ".join(tokens_concat)

    return detokenized_text


In [19]:
chunk_scores_anno

[]

In [24]:
def get_explanation(chunked_caselawbert_tokens, chunked_markers, chunk_scores, doc_num, tokenizer, predicted_label):
    explanation = ""
    for chunk_number,score in enumerate(chunk_scores[doc_num]): # we iterate over the chunk score of each document
        if(chunk_number == 0): # # we set specific markers for each chunk
          chunked_markers[chunk_number][0] = -777 # we set -777 for the first marker
          chunked_markers[chunk_number][-1] = 777 # we set 777 for the last marker of the chunk
        else: # for the rest of the chunks that are not the first one
          if(len(chunked_markers[chunk_number]) < 101): # if the number of markers in the current chunk is less than 101
            continue # if this is true the chunk is too small to be elaborated e it doesnt contains enough info, so we skip direclty to the next chunk
          # if the chunk is => 101
          chunked_markers[chunk_number][100] = -777 # we set the 100th token to -777 (overlap 100 token)
          chunked_markers[chunk_number][-1] = 777 # we set the last token to 777

        if(score < 0): # chunk with negative scores are avoided, we concetrate only to the positive ones
           if doc_num not in [21, 31, 55]: # the document 1999_1001.txt is composed of only 2 chunks the first chunk has occ score negative and the second chunk has lenght <101, we add an exception fro this doc or else we will obtain no explanation
            continue

        ct_sent = calculate_num_of_sents(chunked_markers[chunk_number])
        #print("ct_sent: {:}".format(ct_sent))
        top_k = 0.4*ct_sent
        dict_sent_to_score = {} # dizionario vuoto che verrà utilizzato per memorizzare le frasi del documento e i punteggi relativi alle frasi.
        #print(len(chunked_markers[chunk_number]))


        pair_of_ids = sentence_tokens_maker(chunked_markers[chunk_number], chunked_caselawbert_tokens[chunk_number]) # per ottenere una lista di coppie di indici
        # che rappresentano l'inizio e la fine di ciascuna frase nel chunk corrente. Questi indici saranno utilizzati per estrarre le frasi dal testo tokenizzato.

        #print(pair_of_ids)
        # lens = [len(t) for t in tokens_as_sentences_list]
        # print(str(sum(lens)) + "\n")

        CLS = tokenizer.cls_token
        SEP = tokenizer.sep_token
        PAD = tokenizer.pad_token

        original_logits = get_caselawbert_output_logits(tokenizer.convert_tokens_to_ids(chunked_caselawbert_tokens[chunk_number] + [SEP] + [CLS]), tokenizer, model)
        #  Vengono ottenute le logits (uscite non normalizzate) dal modello per il chunk corrente. Per farlo, viene creato un input tokenizzato
        # che contiene i token del chunk corrente, il token [SEP] (separatore) e il token [CLS]

        original_score = float(original_logits[0][predicted_label]) # we access to the logit of the first dimensione and we take the value associated to the label

        for i in range(len(pair_of_ids)): # scorriamo tra tutte le coppie di indici
            if(pair_of_ids[i][0] == -1000): # Se l'indice iniziale della coppia è stato impostato a -1000 (questo accade se una frase inizia all'interno del chunk)
              pair_of_ids[i] = (0,pair_of_ids[i][1]) # viene aggiustato a 0.
            normalizing_length = pair_of_ids[i][1] - pair_of_ids[i][0] + 1 #  Calcola la lunghezza della frase normalizzata.
            if(normalizing_length == 0):
              continue
            pad_sentence = [PAD]*normalizing_length # Crea una lista di token di riempimento [PAD] con una lunghezza uguale alla lunghezza della frase normalizzata.
            #print("pad_length = {:}".format(len(pad_sentence)))
            left = []
            right = []
            left = chunked_caselawbert_tokens[chunk_number][:pair_of_ids[i][0]] # Estrae i token a sinistra della frase dalla lista di token del chunk corrente.
            right = chunked_caselawbert_tokens[chunk_number][pair_of_ids[i][1]+1:] # Estrae i token a destra della frase dalla lista di token del chunk corrente.

            final_tok_sequence = left + pad_sentence + right + [SEP] + [CLS]

            '''
            la frase di riferimento viene effettivamente sostituita con i token [PAD] per valutare quanto sia importante quella frase all'interno del contesto
            del documento. Questa sostituzione è fatta per determinare quanto la frase contribuisce o influenza la previsione della classe per il documento nel
            suo insieme.

            Quindi, per valutare l'importanza relativa delle frasi nel contesto del documento, vengono creati degli esempi in cui la frase di interesse viene
            sostituita con [PAD], e successivamente il modello calcola il punteggio previsto per questi esempi. Questi punteggi vengono quindi utilizzati per
            determinare quanto ciascuna frase contribuisce alla previsione della classe per il documento.
            '''

            #print("final_tok_sequence = {:}".format(len(final_tok_sequence)))
            encoded_sents = tokenizer.convert_tokens_to_ids(final_tok_sequence) # Converte la sequenza di token in ID di token utilizzando il tokenizer.
            logits = get_caselawbert_output_logits(encoded_sents, tokenizer, model) #  Ottiene le logits del modello per la sequenza di token.
            score_for_predicted_label = float(logits[0][predicted_label]) # Estrae il punteggio previsto dal modello per la classe predicted_label dalla logits.

            sent_score = 100

            if(score_for_predicted_label > original_score): # verifica se il punteggio previsto per la frase è maggiore del punteggio originale del documento
              sent_score = -1*(score_for_predicted_label - original_score) # se il punteggio previsto è maggiore del punteggio originale
              # calcoliamo sent_score sottraendo il punteggio previsto dal punteggio originale. Questo punteggio risulterà negativo e rappresenterà quanto
              # il punteggio previsto supera il punteggio originale.
              # Questo è interessante perché ci indica quanto una frase contribuisce positivamente alla classe prevista rispetto al documento originale.
            else: # se il punteggio previsto è minore o uguale al punteggio originale
              sent_score = original_score - score_for_predicted_label # allora calcoliamo sent_score sottraendo il punteggio originale dal punteggio previsto.
              # misura quanto la frase contribuisce negativamente o non contribuisce affatto alla classe prevista rispetto al documento originale.

            sent_score_norm = sent_score/normalizing_length
            c = chunked_caselawbert_tokens[chunk_number][pair_of_ids[i][0]:pair_of_ids[i][1]+1]
            new_c = ["." if token == "[UNK]" else token for token in c]
            sentence_in_words = caselawbert_detok(new_c) # Estrae la frase dal testo tokenizzato del chunk corrente utilizzando gli indici.
            dict_sent_to_score[sentence_in_words] = sent_score_norm # Aggiunge la frase e il suo punteggio normalizzato al dizionario

        sort_scores = sorted(dict_sent_to_score.items(), key=lambda x: x[1], reverse=True) #  Ordina le frasi e i punteggi nel dizionario in base ai punteggi in ordine decrescente.
        sorted_sentences =[]

        # Il codice successivo aggiunge le migliori frasi (quella con i punteggi più alti) alla spiegazione finale in base al valore top_k precedentemente
        # calcolato. La spiegazione viene costruita concatenando queste frasi.
        for i in sort_scores:
          sorted_sentences.append(i[0])

        for sentence in sorted_sentences[:int(top_k)]:
          explanation+=sentence


    return explanation

In [25]:
final_explanations = {}
for i in progressbar.progressbar(range(len(test["text"]))): # for each doc in df
  doc_text = test["text"].iloc[i] # extract the text of the doc
  doc_name = test["name"].iloc[i] # extract the doc name
  sents = nltk_tokenizer.tokenize(doc_text) # divide the text in sentences
  caselawbert_tokenized_sents = caselawbert_tokenize(sents, tokenizer) # tokenize the sentences
  marked_tokenized_sents = sentence_marker(caselawbert_tokenized_sents) # Assign numerical markers to words within tokenized sentences, so that words in the same sentence have the same numerical value while words in different sentences have a different numerical value.
  caselawbert_tokens = list(itertools.chain.from_iterable(caselawbert_tokenized_sents)) # convert in a 1D list
  markers = list(itertools.chain.from_iterable(marked_tokenized_sents)) # convert in a 1D list
  if(len(caselawbert_tokens) > 10000):
    caselawbert_tokens = caselawbert_tokens[len(caselawbert_tokens)-10000:]
    markers = markers[len(markers)-10000:]

  chunked_caselawbert_tokens, chunked_markers = chunked_tokens_maker(caselawbert_tokens, markers) # return tokens and mask in chunk of 510

  explanation_of_this_doc = get_explanation(chunked_caselawbert_tokens, chunked_markers, chunk_scores, i, tokenizer, pred_labels[i])
  final_explanations[doc_name] = explanation_of_this_doc

100% (56 of 56) |########################| Elapsed Time: 0:03:18 Time:  0:03:18


In [26]:
len(final_explanations)

56

In [27]:
tot = 0
mean= 0
for key, value in final_explanations.items():
    if isinstance(value, str):
        print(f"Length of value for {key}: {len(value)}")
        tot += len(value)
        mean =  tot/len(final_explanations)
    else:
        print(f"{key} is not a string.")
print(f'The mean lenght of the explanation is {mean}')

Length of value for 1960_12.txt: 2991
Length of value for 1953_14.txt: 4970
Length of value for 1952_60.txt: 3505
Length of value for 1951_64.txt: 2751
Length of value for 1962_384.txt: 5274
Length of value for 1999_1001.txt: 943
Length of value for 1961_344.txt: 5374
Length of value for 1960_44.txt: 2430
Length of value for 1962_113.txt: 3633
Length of value for 1959_66.txt: 5931
Length of value for 1951_10.txt: 4612
Length of value for 1962_339.txt: 1564
Length of value for 1953_74.txt: 7639
Length of value for 1961_365.txt: 2828
Length of value for 1962_118.txt: 5349
Length of value for 1960_327.txt: 2372
Length of value for 1960_103.txt: 4139
Length of value for 1960_10.txt: 3342
Length of value for 1963_37.txt: 3901
Length of value for 2013_30.txt: 1009
Length of value for 1960_100.txt: 3391
Length of value for 1951_80.txt: 1896
Length of value for 1960_265.txt: 2724
Length of value for 1954_114.txt: 3243
Length of value for 1951_35.txt: 16384
Length of value for 1959_134.txt: 157

In [28]:
final_explanations

{'1960_12.txt': '5. 000 a month and not to insist on anything more.the following extract fromto this the appellant took objection and her son the maharaja of morvi also wrote a letter to the rajpramukh of saurashtra stating that the village had been illegally resumed and that her jiwai had also been stopped.the government of saurashtra refused to continue the maintenance allowance or to recognise the grant of the village mota dahisara to the appellant.she then made certain representations and after some conferences and some discussion a copy of the order of the political department was sent to the appellant in which it was stated that the village would be resumed and an amount calculated on the basis of average revenue of the village for 3 years would be paid to her as cash allowance for lifetime.on appeal to the appellate assistant commissioner the amount of rs.24. 193.60. 000 is exempt from income. tax and super. tax.if you look at the substance of the transaction. it means that the 

In [33]:
import json
json_file = open("/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation_caselawbert/only_annotation/occlusion_anno/occ_exp_anno_0.4.json", "w")
json.dump(final_explanations, json_file)

In [34]:
json_file = open("/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation_caselawbert/only_annotation/occlusion_anno/occ_exp_anno_0.4.json","r")
data = json.load(json_file)

In [35]:
print(data)

