In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
import textwrap
import keras
import random
import pandas as pd
from transformers import AdamW, get_linear_schedule_with_warmup, AutoTokenizer, BertForSequenceClassification
import progressbar
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
import time
import os
from sklearn.metrics import confusion_matrix



In [3]:
df_test = pd.read_csv('/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation/ILDC_expert/anno_dataset.csv')

In [4]:
df_test

Unnamed: 0,text,name,label
0,"The assessee, Her Highness Maharani Kesarkunve...",1960_12.txt,1
1,CIVIL APPELLATE JURISDICTION Civil Appeal No. ...,1953_14.txt,1
2,CIVIL APPELLATE JURISDICTION Civil - Appeal No...,1952_60.txt,1
3,CIVIL APPELLATE JURISDICTION. Civil Appeal No....,1951_64.txt,1
4,Appeal by special leave from the award dated D...,1962_384.txt,1
5,Special leave granted.\n Respondent 1 was sus...,1999_1001.txt,1
6,Appeal by special leave from the judgment and ...,1961_344.txt,0
7,These appeals are directed against three judgm...,1960_44.txt,1
8,Appeal by special leave from the Resolution da...,1962_113.txt,0
9,"The award made by the Industrial Tribunal, Bom...",1959_66.txt,1


In [5]:
output_dir = '/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation_caselawbert/CaseLawBERT_bigru_occ/saved_model_multi_caselawbert'
device = torch.device('cuda')
model = BertForSequenceClassification.from_pretrained(output_dir, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
def att_masking(input_ids):
  attention_masks = []
  for sent in input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)
  return attention_masks

In [7]:
def grouped_input_ids(all_toks):
  splitted_toks = []
  l=0
  r=510
  while(l<len(all_toks)):
    splitted_toks.append(all_toks[l:min(r,len(all_toks))])
    l+=410
    r+=410

  CLS = tokenizer.cls_token
  SEP = tokenizer.sep_token
  e_sents = []
  for l_t in splitted_toks:
    l_t = [CLS] + l_t + [SEP]
    encoded_sent = tokenizer.convert_tokens_to_ids(l_t)
    e_sents.append(encoded_sent)

  e_sents = pad_sequences(e_sents, maxlen=512, value=0, dtype="long", padding="post")
  att_masks = att_masking(e_sents)
  return e_sents, att_masks

In [8]:
def get_output_for_one_vec(input_id, att_mask):
  input_ids = torch.tensor(input_id)
  att_masks = torch.tensor(att_mask)
  input_ids = input_ids.unsqueeze(0)
  att_masks = att_masks.unsqueeze(0)
  model.eval()
  input_ids = input_ids.to(device)
  att_masks = att_masks.to(device)
  with torch.no_grad():
      outputs = model(input_ids=input_ids, attention_mask=att_masks)

  vec = outputs["hidden_states"][12][0][0]
  vec = vec.detach().cpu().numpy()
  return vec

In [9]:
def generate_np_files_for_emb(dataf, tokenizer):
  all_docs = []
  for i in progressbar.progressbar(range(len(dataf['text']))):
    text = dataf['text'].iloc[i]
    toks = tokenizer.tokenize(text)
    if(len(toks) > 10000):
      toks = toks[len(toks)-10000:]

    splitted_input_ids, splitted_att_masks = grouped_input_ids(toks)

    vecs = []
    for index,ii in enumerate(splitted_input_ids):
      vecs.append(get_output_for_one_vec(ii, splitted_att_masks[index]))

    one_doc = np.asarray(vecs)
    all_docs.append(one_doc)

  all_docs = np.asarray(all_docs)
  return all_docs

In [10]:
# CLS
path_val_npy_file = "/content/drive/MyDrive/Thesis/Models_whole_data/case_explanation_caselawbert/only_annotation/compute_embs_anno/CaseLawBERT_npy_files_cls_multi_anno/CaseLawBERT_cls_test_anno"
vecs_test = generate_np_files_for_emb(df_test, tokenizer)
np.save(path_val_npy_file, vecs_test)

print('npy file test saved')

100% (56 of 56) |########################| Elapsed Time: 0:00:21 Time:  0:00:21
  all_docs = np.asarray(all_docs)


npy file test saved
