In [5]:
from transformers import AutoModelForSequenceClassification
import json 
from transformers import AutoTokenizer
from transformers import pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
import pyarrow as pa
from datasets import Dataset
from sklearn import preprocessing
from transformers import TrainingArguments, Trainer

model_checkpoint = "distilbert-base-uncased"
training_file = "./cuad-data/test_classification_data.json"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

In [6]:
label_y = dict()
def load_data():
    with open(training_file) as json_file:
        data = json.load(json_file)
    dataset = data["train"]
    return dataset

def process_data(row, ):
    text = row['sentence']
    text = str(text)
    text = ' '.join(text.split())

    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=128)

    label = label_y[row['label']]

    encodings['label'] = label
    encodings['text'] = text

    return encodings

def prepare_train_valid_df(): 
    processed_data = []    
    train_data = load_data()
    
    label_count = 0
    for i in range(len(train_data[:1000])):        
        key = train_data[i]["label"].lower().strip()
        if not key in label_y.keys():
            label_y.update({key : label_count})
            label_count += 1
        processed_data.append(process_data(train_data[i]))
        
    print (">>>>>> label_y : ", label_y)
    print (processed_data)
    new_df = pd.DataFrame(processed_data)

    train_df, valid_df = train_test_split(
        new_df,
        test_size=0.2,
        random_state=2022
    )

    train_hg = Dataset(pa.Table.from_pandas(train_df))
    valid_hg = Dataset(pa.Table.from_pandas(valid_df))
    return train_hg, valid_hg

In [7]:
def training(train_hg, valid_hg):
    training_args = TrainingArguments(output_dir="./result", evaluation_strategy="epoch")

    id2label = {0: "negative", 1: "positive"}
    label2id = {val: key for key, val in id2label.items()}
    num_labels = len(id2label)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels, id2label=id2label, label2id=label2id)  

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_hg,
        eval_dataset=valid_hg,
        tokenizer=tokenizer
    )
    trainer.train()
    trainer.evaluate()
    model.save_pretrained('./model/')
    return model


def predict(sentences, model): 
    #model = AutoModelForSequenceClassification.from_pretrained('./model/')
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
    results = classifier(sentences)
    return results 

print(process_data({
    'sentence': 'this is a sample review of a movie.',
    'label': 1
}))

train_hg, valid_hg = prepare_train_valid_df()
model = training(train_hg, valid_hg) 
print (train_hg)

In [8]:
sentences = [
    {"sentence" : "Licensing and Wireless are referred to "},
    {"sentence" : "The judge told that the jurors to think carefully."}
]

train_hg, valid_hg = prepare_train_valid_df()
model = training(train_hg, valid_hg) 
results = predict(sentences[0]["sentence"], model)

print (results[0])

>>>>>> label_y :  {'positive': 0, 'negative': 1}
[{'input_ids': [101, 5062, 1010, 13202, 1998, 9949, 3615, 2000, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label': 0, 'text': 'Broadcasting, Licensing and Wireless referred to'}, {'input_ids': [101, 13202, 1998, 9949, 2024, 36

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

{'eval_loss': 0.6975694894790649, 'eval_runtime': 0.1423, 'eval_samples_per_second': 7.029, 'eval_steps_per_second': 7.029, 'epoch': 1.0}


 67%|██████▋   | 2/3 [00:02<00:01,  1.45s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
                                             
 67%|██████▋   | 2/3 [00:02<00:01,  1.45s/it]

{'eval_loss': 0.6794772148132324, 'eval_runtime': 0.162, 'eval_samples_per_second': 6.174, 'eval_steps_per_second': 6.174, 'epoch': 2.0}


100%|██████████| 3/3 [00:04<00:00,  1.44s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
                                             
100%|██████████| 3/3 [00:04<00:00,  1.44s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 3/3 [00:04<00:00,  1.45s/it]
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1

{'eval_loss': 0.6756847500801086, 'eval_runtime': 0.0862, 'eval_samples_per_second': 11.597, 'eval_steps_per_second': 11.597, 'epoch': 3.0}
{'train_runtime': 4.4056, 'train_samples_per_second': 2.724, 'train_steps_per_second': 0.681, 'train_loss': 0.6671514511108398, 'epoch': 3.0}


100%|██████████| 1/1 [00:00<00:00, 280.57it/s]
Configuration saved in ./model/config.json
Model weights saved in ./model/pytorch_model.bin


{'label': 'positive', 'score': 0.5360666513442993}


In [9]:
test_data_file = './cuad-data/test_contract.json'
contract_terms_file = './cuad-data/contract_terms.json'
import re

def prep_data():
    with open(test_data_file) as json_file:
        data = json.load(json_file)
    contract = data['contract']
    return contract

def process_paragraph(article_text, model):
    return_value = {}
    catch_stmt = {}
    for c_sentence in article_text.split('.'):
        results = predict(c_sentence, model)
        score = (results[0]["score"]  * 100)
        try: 
            #print (">>>>>>>>  Found Sentence 1 : ", catch_stmt[results[0]["label"]])
            res = re.search(c_sentence, article_text) # TODO Revisite 
            #print (">>>>>> Found Sentences 2 - ", res)
            if res:
                #print(">>>>>>>> Index : ", res.start(), res.end())
                #print(">>>>>>>>  Input String : ", c_sentence)   
                #print(">>>>>>>>  Output String : ", article_text[res.start() : res.end()])   
                stmt_index = str(res.start()) + "-" + str(res.end())
                relevence = score
                return_value[stmt_index] = {"start_index" : res.start(), "end_index" : res.end(), "relevence" : relevence}
        except IndexError:
            #print (">>>>>>>> Word Not Found")
            print()
    #print ("return_value : \n", return_value)

    return return_value

def highlight_ranking(return_value):
    for r_key in return_value:
        score = return_value[r_key]['relevence'] 
        #print(return_value[r_key]['relevence'] , " >> " , score)
        if score > 67: 
            return_value[r_key]["relevence_degree"] = "HIGH"
        else: 
            if score > 34: 
                return_value[r_key]["relevence_degree"] = "MEDIUM"
            else:
                if score > 0: 
                    return_value[r_key]["relevence_degree"] = "LOW"

    #print ("return_value : \n", return_value)
    return return_value

article_text = prep_data()
return_value = process_paragraph(article_text, model)
return_value = highlight_ranking(return_value)
print (return_value)

{'0-87': {'start_index': 0, 'end_index': 87, 'relevence': 52.38409638404846, 'relevence_degree': 'MEDIUM'}, '88-113': {'start_index': 88, 'end_index': 113, 'relevence': 53.44003438949585, 'relevence_degree': 'MEDIUM'}, '479-689': {'start_index': 479, 'end_index': 689, 'relevence': 50.180864334106445, 'relevence_degree': 'MEDIUM'}, '690-734': {'start_index': 690, 'end_index': 734, 'relevence': 52.11862921714783, 'relevence_degree': 'MEDIUM'}, '0-0': {'start_index': 0, 'end_index': 0, 'relevence': 52.51891613006592, 'relevence_degree': 'MEDIUM'}}
