In [2]:
# Necessary Imports
import json
import random
import torch
import gc
import re

import pandas as pd
import numpy as np

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
from transformers import DataCollatorWithPadding

In [6]:
# The text is clustered in the following labels
# Label set used throughout the notebook. A tag's position in this tuple is
# its integer class id.
_LABEL_TAGS = (
    "PREAMBLE",
    "FAC",
    "RLC",
    "ISSUE",
    "ARG_PETITIONER",
    "ARG_RESPONDENT",
    "ANALYSIS",
    "STA",
    "PRE_RELIED",
    "PRE_NOT_RELIED",
    "RATIO",
    "RPC",
    "NONE",
)

# Tag name -> class id.
encoded_rr = {tag: class_id for class_id, tag in enumerate(_LABEL_TAGS)}

# Class id -> tag name (exact inverse of `encoded_rr`).
id2label = {class_id: tag for tag, class_id in encoded_rr.items()}

In [7]:
# Parse the two annotated JSON files. The context managers close each file
# handle as soon as parsing finishes (or fails) instead of leaving it open
# for the lifetime of the kernel.
with open('rr_dev.json') as d:
    dev_dataset = json.load(d)
with open('train.json') as t:
    train_dataset = json.load(t)

In [8]:
# Combine the dev and train datasets, then split them 70:30 into train and test.
train_dataset.extend(dev_dataset)
# Seed the RNG so that a fresh run of the notebook yields the same shuffle,
# and therefore the same train/test split.
random.seed(42)
random.shuffle(train_dataset)
print("Total size:  %d"%(len(train_dataset)))
split = (len(train_dataset)*7)//10
print('Splitting data from 0 to %d for train' % (split))
print('Splitting data from %d to %d for test' % (split,len(train_dataset)))
train = train_dataset[:split]
test = train_dataset[split:]

Total size:  277
Splitting data from 0 to 193 for train
Splitting data from 193 to 277 for test


In [5]:
def _flatten_annotations(records):
    """Turn annotated documents into a flat list of ``{"label", "text"}`` dicts.

    Every entry of ``rec['annotations'][0]['result']`` yields one example:
    its text with newlines replaced by spaces, runs of spaces collapsed and
    lower-cased, together with the integer id of its first label as looked
    up in ``encoded_rr``.
    """
    examples = []
    for rec in records:
        for ele in rec['annotations'][0]['result']:
            value = ele['value']
            cleaned = re.sub(' {2,}', ' ', value['text'].replace('\n', ' ')).lower()
            examples.append({"label": encoded_rr[value['labels'][0]], "text": cleaned})
    return examples


# The test split is processed first, then the train split.
test_data = _flatten_annotations(test)
train_data = _flatten_annotations(train)

In [6]:
# Give every example an 'aux_text' field holding the text of the example that
# precedes it in the list (an empty string for the first one). Iterating with
# a running "previous" value also works when a list is empty, where indexing
# element 0 would raise IndexError.
# NOTE(review): both lists are flat across documents, so the first sentence of
# a document receives the last sentence of the preceding document -- confirm
# that this is intended.
for examples in (train_data, test_data):
    previous_text = ""
    for example in examples:
        example['aux_text'] = previous_text
        previous_text = example['text']

In [8]:
# Sequence-classification model with 13 labels and the label mappings defined
# above, plus the tokenizer from the same checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained(
    "law-ai/InLegalBERT",
    num_labels=13,
    label2id=encoded_rr,
    id2label=id2label,
)
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")

Downloading: 100%|██████████| 671/671 [00:00<00:00, 1.18MB/s]
Downloading: 100%|██████████| 510M/510M [00:47<00:00, 11.3MB/s] 
Some weights of the model checkpoint at law-ai/InLegalBERT were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceC

In [10]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Only ``examples["text"]`` is handed to the tokenizer; no other field is read.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [11]:
# Wrap the flat example lists as `Dataset` objects (via a pandas DataFrame),
# group them into a train/test `DatasetDict`, and tokenize both splits in batches.
test_df = Dataset.from_pandas(
    pd.DataFrame.from_records(test_data)
)
train_df = Dataset.from_pandas(
    pd.DataFrame.from_records(train_data)
)
dataset = DatasetDict({
    "train": train_df,
    "test": test_df,
})
tokenized_dataset = dataset.map(preprocess_function, batched=True)

100%|██████████| 23/23 [00:01<00:00, 22.02ba/s]
100%|██████████| 10/10 [00:00<00:00, 24.48ba/s]


In [12]:
# Batch collator built around the tokenizer; it is handed to the Trainer as
# `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
def compute_metrics(eval_preds):
    """Compute the micro-averaged F1 score for one evaluation pass.

    Args:
        eval_preds: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict: ``{'f1': score}`` where ``score`` is ``f1_score`` with
        ``average='micro'`` over the arg-max of the logits along the last axis.
    """
    logits, gold = eval_preds
    predicted = np.argmax(logits, axis=-1)
    micro_f1 = f1_score(y_true=gold, y_pred=predicted, average='micro')
    return {'f1': micro_f1}

In [14]:
# Fine-tuning configuration: 5 epochs, batch size 8 per device, evaluation
# every 1000 steps and a checkpoint every 2000 steps.
training_args = TrainingArguments(
    output_dir="./resultsInlegalBERT",
    # overwrite_output_dir=True,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy='steps',
    eval_steps=1000,
    save_strategy='steps',
    save_steps=2000,
)

# Train on the tokenized train split and evaluate on the tokenized test split.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

trainer.train()

# Release Python garbage and cached CUDA memory once training has returned.
gc.collect()
torch.cuda.empty_cache()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: aux_text, text. If aux_text, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 22202
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13880


Step,Training Loss,Validation Loss,F1
1000,1.1719,1.156417,0.642761
2000,1.0974,1.124946,0.640795
3000,0.9809,1.158272,0.634586
4000,0.8891,1.085436,0.653006
5000,0.8215,1.215878,0.627341
6000,0.6425,1.274375,0.621236
7000,0.6382,1.253493,0.638518
8000,0.6433,1.245121,0.634275
9000,0.4732,1.365753,0.633861
10000,0.4657,1.416671,0.639967


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: aux_text, text. If aux_text, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9663
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: aux_text, text. If aux_text, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9663
  Batch size = 8
Saving model checkpoint to ./resultsInlegalBERT/checkpoint-2000
Configuration saved in ./resultsInlegalBERT/checkpoint-2000/config.json
Model weights saved in ./resultsInlegalBERT/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ./resultsInlegalBERT/checkpoint-2000/tokenizer_config.jso

KeyboardInterrupt: 

In [22]:
# Class id -> tag name; the id of a tag is its position in the tuple.
id2label = dict(enumerate((
    "PREAMBLE",
    "FAC",
    "RLC",
    "ISSUE",
    "ARG_PETITIONER",
    "ARG_RESPONDENT",
    "ANALYSIS",
    "STA",
    "PRE_RELIED",
    "PRE_NOT_RELIED",
    "RATIO",
    "RPC",
    "NONE",
)))

In [15]:
# Reload the fine-tuned classifier and its tokenizer from the step-4000 checkpoint.
checkpoint_dir = "./resultsInlegalBERT/checkpoint-4000/"
model1 = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_dir,
    num_labels=13,
    label2id=encoded_rr,
    id2label=id2label,
)
tokenizer1 = AutoTokenizer.from_pretrained(checkpoint_dir)

loading configuration file ./resultsInlegalBERT/checkpoint-4000/config.json
Model config BertConfig {
  "_name_or_path": "./resultsInlegalBERT/checkpoint-4000/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "PREAMBLE",
    "1": "FAC",
    "2": "RLC",
    "3": "ISSUE",
    "4": "ARG_PETITIONER",
    "5": "ARG_RESPONDENT",
    "6": "ANALYSIS",
    "7": "STA",
    "8": "PRE_RELIED",
    "9": "PRE_NOT_RELIED",
    "10": "RATIO",
    "11": "RPC",
    "12": "NONE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "ANALYSIS": 6,
    "ARG_PETITIONER": 4,
    "ARG_RESPONDENT": 5,
    "FAC": 1,
    "ISSUE": 3,
    "NONE": 12,
    "PREAMBLE": 0,
    "PRE_NOT_RELIED": 9,
    "PRE_RELIED": 8,
    "RATIO": 10,
    "RLC": 2,
    "RPC": 11,
  

In [16]:
def preprocessrr_function(examples):
    """Tokenize ``examples["text"]`` with the checkpoint tokenizer.

    Truncation is enabled and the result is requested as PyTorch tensors
    (``return_tensors='pt'``).
    """
    texts = examples["text"]
    return tokenizer1(texts, truncation=True, return_tensors='pt')

In [21]:
# Read rr_test.json and flatten it into one cleaned text per annotated
# sentence (newlines -> spaces, runs of spaces collapsed, lower-cased).
# The context manager closes the file handle once the JSON has been parsed.
with open('rr_test.json') as rr_test:
    rrtest_dataset = json.load(rr_test)

rrtest_data = []
for rec in rrtest_dataset:
    for ele in rec['annotations'][0]['result']:
        processed_text = ele['value']['text']
        processed_text = processed_text.replace('\n', ' ')
        processed_text = re.sub(' {2,}', ' ', processed_text)
        processed_text = processed_text.lower()
        rrtest_data.append({"text": processed_text})

# rrtest_data[0]['aux_text'] = ""
# for i in range(1,len(rrtest_data)):
#   rrtest_data[i]['aux_text'] = rrtest_data[i-1]['text']

# rrtest_df = Dataset.from_pandas(pd.DataFrame.from_records(rrtest_data))
# rrtest_dataset = rrtest_df.map(preprocessrr_function)

In [6]:
# Build [previous sentence, sentence] pairs directly from the 'text' field
# (the first sentence is paired with an empty string). This does not rely on
# an 'aux_text' key, which is never set on `rrtest_data` unless the
# commented-out lines in the loading cell are run, so reading it would raise
# KeyError.
texts = [rec['text'] for rec in rrtest_data]
tdata = [[previous, current] for previous, current in zip([""] + texts[:-1], texts)]
# Only the first 1000 pairs are tokenized here.
rrtest_dataset = tokenizer1(tdata[:1000], truncation=True, padding=True, return_tensors='pt')

In [20]:
# Number of rows in the tokenized 'input_ids'.
len(rrtest_dataset['input_ids'])

1000

In [None]:
# Plain sentence texts (no context pair) for the classification pipeline.
tdata = [i['text'] for i in rrtest_data]
# Show a short preview instead of echoing every sentence into the cell output.
tdata[:5]

In [24]:
from transformers import pipeline

# Use the first GPU when CUDA is available and fall back to CPU (-1) otherwise,
# so the cell also runs on machines without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
pipe = pipeline(task='text-classification', model=model1, tokenizer=tokenizer1, device=pipeline_device)
# Smoke test on the first two sentences.
pipe(tdata[:2])

[{'label': 'PREAMBLE', 'score': 0.9895686507225037},
 {'label': 'PREAMBLE', 'score': 0.9906486868858337}]

In [25]:
# Classify every sentence in `tdata`, then store the string form of the raw
# prediction list in op.txt.
output = pipe(tdata, padding=True, truncation=True)
with open('op.txt', 'w+') as f:
    print(output, end='', file=f)

In [26]:
# Preview the first predictions only; the full list is already written to op.txt.
output[:20]

[{'label': 'PREAMBLE', 'score': 0.9895686507225037},
 {'label': 'PREAMBLE', 'score': 0.9906486868858337},
 {'label': 'PREAMBLE', 'score': 0.9823678731918335},
 {'label': 'PREAMBLE', 'score': 0.9919686913490295},
 {'label': 'PREAMBLE', 'score': 0.99114990234375},
 {'label': 'PREAMBLE', 'score': 0.9906103014945984},
 {'label': 'PREAMBLE', 'score': 0.9911736845970154},
 {'label': 'PREAMBLE', 'score': 0.9908678531646729},
 {'label': 'PREAMBLE', 'score': 0.7254161238670349},
 {'label': 'NONE', 'score': 0.8613502979278564},
 {'label': 'PREAMBLE', 'score': 0.9116681814193726},
 {'label': 'FAC', 'score': 0.8108745813369751},
 {'label': 'PREAMBLE', 'score': 0.9754899740219116},
 {'label': 'FAC', 'score': 0.9437655210494995},
 {'label': 'FAC', 'score': 0.9306872487068176},
 {'label': 'FAC', 'score': 0.9502750635147095},
 {'label': 'FAC', 'score': 0.6308172941207886},
 {'label': 'ARG_PETITIONER', 'score': 0.40371522307395935},
 {'label': 'ARG_PETITIONER', 'score': 0.48923760652542114},
 {'label':

In [9]:
# Reload the raw test file and overwrite the first label of every sentence
# with the corresponding prediction.
#
# `output` comes from `pipe(tdata)`, and `tdata` is built by walking every
# document's sentences in file order, so it is one flat list across all
# documents. A single running iterator is therefore kept over it; restarting
# the index at 0 for each document would give every document the predictions
# of the first one.
with open('rr_test.json') as rr_test:
    rrtest_dataset = json.load(rr_test)

n_sentences = sum(len(rec['annotations'][0]['result']) for rec in rrtest_dataset)
if n_sentences != len(output):
    raise ValueError(
        f"expected {n_sentences} predictions (one per sentence) but got {len(output)}"
    )

predictions = iter(output)
for rec in rrtest_dataset:
    for ele in rec['annotations'][0]['result']:
        ele['value']['labels'][0] = next(predictions)['label']


NameError: name 'output' is not defined

In [28]:
# Serialize the relabelled test records. The context manager flushes and
# closes the file even if `json.dump` raises part-way through.
with open("myfile.json", "w") as out_file:
    json.dump(rrtest_dataset, out_file, indent=6)

In [15]:
# Make sure the dev records are part of `train_dataset` exactly once, then shuffle.
# An earlier cell already extends `train_dataset` with `dev_dataset`; extending
# unconditionally a second time would duplicate every dev document.
# Assumes record ids are unique across the two files -- TODO confirm.
present_ids = {rec['id'] for rec in train_dataset}
train_dataset.extend(rec for rec in dev_dataset if rec['id'] not in present_ids)
random.shuffle(train_dataset)

In [11]:
import sent2vec
from sentence_transformers import SentenceTransformer
# model = sent2vec.Sent2vecModel()
# model.load_model('./semantic-segmentation/infer/sent2vec.bin')

# NOTE: this rebinds the notebook-level name `model` to the sentence encoder.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')


# One output file per document with a non-empty result list; each line is
# "<space-separated embedding values>\t<label>" for one sentence.
for count, rec in enumerate(train_dataset):
    annotated = rec['annotations'][0]['result']
    if annotated == []:
        continue
    with open(f'./semantic-segmentation/data/pretrained_emb/{rec["id"]}.txt','w+') as f:
        for ele in annotated:
            sentence = re.sub(' {2,}', ' ', ele['value']['text'].replace('\n', ' ')).lower()
            label = ele['value']['labels'][0]
            vector = " ".join(map(str, model.encode(sentence)))
            f.write(f'{vector}\t{label}\n')
# Index of the last record visited.
print(count)

276


In [15]:
# Number of records in the combined list.
len(train_dataset)

277

In [9]:
# Category file: the literal 'Legal', a tab, then the id of every record whose
# result list is not empty, each id followed by a single space.
with open(f'./semantic-segmentation/cat.txt','w+') as f:
    f.write('Legal\t')
    f.writelines(
        f'{rec["id"]} '
        for rec in train_dataset
        if rec['annotations'][0]['result'] != []
    )

In [7]:
# Dump the test documents, one file per document, for the inference script;
# each line is "<sentence id>\t<sentence text>".
# The file is read through a context manager so the handle is closed, and the
# name `test` (the test split created earlier) is no longer rebound to a file
# object.
# NOTE(review): unlike the cell that writes the training embeddings, the text
# is not lower-cased here -- confirm the inference script expects that.
with open('rr_test.json') as test_file:
    test_dataset = json.load(test_file)

for rec in test_dataset:
    sentences = rec['annotations'][0]['result']
    if sentences == []:
        # Flag documents with an empty result list; no file is written for them.
        print('yes')
        continue
    with open(f'./semantic-segmentation/infer/data/{rec["id"]}.txt','w+') as f:
        for ele in sentences:
            processed_text = ele['value']['text']
            processed_text = processed_text.replace('\n', ' ')
            processed_text = re.sub(' {2,}', ' ', processed_text)
            f.write(f'{ele["id"]}\t{processed_text}\n')

In [3]:
# Load the JSON content of pred_ids.txt; it is later indexed by str(record id).
pred_ids_path = './semantic-segmentation/infer/pred_ids.txt'
with open(pred_ids_path, 'r') as f:
    ids = json.loads(f.read())

In [4]:
# Parse the prediction file into {document id (int): list of label strings}.
# Each non-empty line has the form "<document id>\t<comma-separated labels>".
preds = dict()
with open('./semantic-segmentation/infer/predictions.txt','r') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. an extra newline at the end of the
            # file) instead of failing with IndexError on arr[1].
            continue
        arr = line.split('\t')
        arr[1] = arr[1].replace('\n','')
        preds[int(arr[0])] = arr[1].split(',')
len(preds)

50

In [5]:
# Sanity check: print the id of every document whose number of predicted
# labels differs from the number of lines in its inference input file.
for doc_id, doc_labels in preds.items():
    with open(f'./semantic-segmentation/infer/data/{doc_id}.txt') as f:
        n_lines = sum(1 for _ in f)
    if n_lines != len(doc_labels):
        print(doc_id)

In [8]:
# Write the predicted labels back onto the test records. Sentences whose id is
# listed in `ids` for their document consume the document's labels in order;
# every other sentence is labelled "NONE".
for rec in test_dataset:
    doc_labels = preds[int(rec['id'])]
    next_label = 0

    for ele in rec['annotations'][0]['result']:
        if ele['id'] not in ids[str(rec['id'])]:
            ele['value']['labels'][0] = "NONE"
            continue
        ele['value']['labels'][0] = doc_labels[next_label]
        next_label += 1

In [9]:
# Serialize the relabelled test records as the submission file. The context
# manager flushes and closes the file even if `json.dump` raises.
with open("submissionSTemb.json", "w") as out_file:
    json.dump(test_dataset, out_file, indent=6)

In [None]:
import glob
# Peek at the first matching text file only: print its lines, stripped.
for filename in glob.glob('./semantic-segmentation/data/text/*.txt')[:1]:
    with open(filename, 'r') as f:
        print([line.strip() for line in f])

In [2]:
# Load the sent2vec model from disk and embed one sample sentence to inspect
# the result. NOTE: this rebinds the notebook-level name `model`.
import sent2vec
model = sent2vec.Sent2vecModel()
model.load_model('./semantic-segmentation/infer/sent2vec.bin')
emb = model.embed_sentence("once upon a time .") 
emb

array([[ 0.3827696 , -0.18814288, -0.52750456,  0.0993287 ,  0.87292796,
        -0.61967283, -0.671658  ,  0.1319409 ,  0.33635676,  0.301886  ,
        -0.2546241 , -0.20579605,  0.24103308, -0.23798686, -0.08603713,
         0.36622182,  0.5031561 , -0.29081622, -0.42823997, -0.14480692,
         0.5837328 ,  0.01628563, -0.17452843, -0.53246737, -0.18692206,
         0.4429304 ,  0.50153977, -0.29309112, -0.0926116 ,  1.0668284 ,
         0.3150685 ,  0.00236383, -0.02039053, -0.38179708, -0.2799668 ,
        -0.501665  , -0.21791966, -0.27207178,  0.6428038 ,  0.03471715,
         0.06431601,  0.95592946,  0.63953406, -0.32514006,  0.5782566 ,
         0.12229088,  0.53094846,  0.3701749 , -0.21859041,  0.599999  ,
         0.44534752, -0.07129437,  0.06976022,  0.24513888, -0.5483347 ,
         0.1396512 ,  0.62140644, -1.1654739 , -0.5549987 ,  0.5456811 ,
         0.547042  , -0.15461458,  0.10945165,  0.5032859 ,  0.07112101,
         0.76943856, -0.18670817,  0.34202027,  1.0

In [4]:
# Length of the first row of `emb`, i.e. the embedding size.
len(emb[0])

200

In [10]:
from sentence_transformers import SentenceTransformer
# NOTE(review): `sentences` is defined but not used below; only a single
# ad-hoc string is encoded.
sentences = ["This is an example sentence", "Each sentence is converted"]

# NOTE: this rebinds the notebook-level name `model`.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# Encode one string and print the resulting embedding.
embeddings = model.encode("gello ajsda")
print(embeddings)

[ 3.23235542e-02  2.76791155e-02 -1.79122910e-02 -3.78156528e-02
  5.51560111e-02  1.03917988e-02  5.57913398e-03  2.40380820e-02
  4.48452719e-02  3.34757715e-02  4.10673060e-02 -5.51949218e-02
  5.09915203e-02  3.94745357e-03  2.13389099e-02 -4.15916741e-02
  8.36438965e-03 -9.72388126e-03 -6.58859238e-02 -3.77151258e-02
 -3.80968228e-02 -3.30935145e-04 -3.35505907e-03  2.88008489e-02
  4.87894341e-02 -6.14981726e-03  4.92815208e-03 -1.96512714e-02
  4.00273968e-03  8.38761702e-02 -1.54896630e-02 -1.56159159e-02
 -2.65765190e-02 -3.43008619e-03  1.97007284e-06 -6.20464887e-03
 -1.76943501e-03 -1.54109783e-02  1.38676278e-02 -2.41489410e-02
 -9.84460860e-02 -3.73590477e-02 -5.04393578e-02 -6.53715432e-02
 -1.68510713e-02 -6.50279000e-02  2.77446397e-02 -6.16434105e-02
 -3.36593240e-02  2.40848772e-02  1.13347005e-02 -1.23019628e-01
  1.35630909e-02 -1.11678503e-02  7.98444152e-02 -7.20923543e-02
 -1.17533952e-02 -4.16300185e-02 -3.58461626e-02 -1.49025340e-02
  8.09680205e-03  4.69611

In [3]:
# Embedding size. `embeddings` is produced from a single string and is printed
# as a flat vector, so `embeddings[0]` is a scalar and `len()` of it fails;
# the size of the last axis is correct for one vector or a batch of vectors.
embeddings.shape[-1]

768

In [None]:
# python3 -u run.py --dataset_size 275 --cat_path cat.txt --print_every 5 --pretrained True --data_path data/pretrained_emb/ --epoch 100 --emb_dim 768 &> out100_sentence_transformer_emb.txt &
# python3 infer.py --pretrained True --model_path infer/modelSTemb.tar --emb_dim 768