In [None]:
# utile folder ----> util

import torch
import json
from torch.utils.data import Dataset
import numpy as np
from transformers import AutoTokenizer, RobertaTokenizerFast

#from utils.utils import match_labels



import spacy
nlp = spacy.load("en_core_web_sm")

############################################################
#                                                          #
#                      DATASET CLASS                       #
#                   LegalNERTokenDataset                   #
############################################################
class LegalNERTokenDataset(Dataset):
    """Token-classification dataset built from a JSON file of annotated texts.

    Every entry of the JSON list is read through these keys:
    ``item["data"]["text"]`` for the raw text and
    ``item["annotations"][0]["result"][i]["value"]`` for the spans, each with
    ``start``, ``end`` (character offsets) and ``labels`` (only the first
    label of the list is used).

    Args:
        dataset_path: Path of the JSON annotation file.
        model_path: Identifier passed to ``AutoTokenizer.from_pretrained``
            when ``use_roberta`` is false.
        labels_list: BIO tags without the outside tag, e.g.
            ``["B-COURT", "I-COURT"]``. ``"O"`` is added automatically.
            When ``None``, items are returned without a ``"labels"`` entry.
        split: Name of the split; stored on the instance but not otherwise
            used by this class.
        use_roberta: When true, the ``roberta-base`` fast tokenizer is used
            (regardless of ``model_path``) and every item is padded to the
            tokenizer's maximum length.
    """

    def __init__(self, dataset_path, model_path, labels_list=None, split="train", use_roberta=False):
        # Context manager so the file handle is closed as soon as it is parsed.
        with open(dataset_path, encoding="utf-8") as fh:
            self.data = json.load(fh)
        self.split = split
        self.use_roberta = use_roberta

        ## Load the right tokenizer
        if self.use_roberta:
            self.tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        self.labels_list, self.labels_to_idx = self._build_label_vocab(labels_list)

    @staticmethod
    def _build_label_vocab(labels_list):
        """Return ``(vocab, label -> index)``, or ``(None, None)`` without labels.

        The vocabulary is the given tags plus ``"O"``, de-duplicated and
        sorted in descending order; indices follow that order.
        """
        if labels_list is None:
            return None, None
        # The set removes duplicates (including a caller-supplied "O") so the
        # resulting indices are contiguous.
        vocab = sorted(set(labels_list) | {"O"}, reverse=True)
        return vocab, {label: index for index, label in enumerate(vocab)}

    @staticmethod
    def _fit_labels(label_ids, seq_len, pad_id=0):
        """Return ``label_ids`` as a 1-D long tensor of exactly ``seq_len`` items.

        Longer inputs are truncated; shorter ones are right-padded with
        ``pad_id``.
        """
        labels = torch.tensor(label_ids, dtype=torch.long)
        if labels.shape[0] >= seq_len:
            return labels[:seq_len]
        padded = torch.full((seq_len,), pad_id, dtype=torch.long)
        padded[: labels.shape[0]] = labels
        return padded

    def __len__(self):
        """Return the number of annotated texts in the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and attach its per-token label ids.

        Returns the tokenizer output with ``input_ids`` and
        ``attention_mask`` (and ``token_type_ids`` when available) squeezed to
        1-D long tensors, plus a ``labels`` tensor of the same length when a
        label list was given.
        """
        item = self.data[idx]
        text = item["data"]["text"]

        ## Get the annotations (character span + first label of each result)
        annotations = [
            {
                "start": v["value"]["start"],
                "end": v["value"]["end"],
                "labels": v["value"]["labels"][0],
            }
            for v in item["annotations"][0]["result"]
        ]

        ## Tokenize the text
        tokenizer_kwargs = {
            "return_tensors": "pt",
            "truncation": True,
            "verbose": False,
        }
        if self.use_roberta:
            # NOTE(review): padded positions end up labelled "O" rather than
            # an ignore index -- confirm this is the intended training signal.
            tokenizer_kwargs["padding"] = "max_length"
        inputs = self.tokenizer(text, **tokenizer_kwargs)

        ## Match the labels
        # Must run before the squeeze below: match_labels reads
        # inputs["input_ids"][0] as the token sequence.
        label_ids = None
        if self.labels_to_idx is not None:
            aligned_labels = match_labels(inputs, annotations)
            label_ids = [self.labels_to_idx[tag] for tag in aligned_labels]

        ## Drop the batch dimension added by return_tensors="pt"
        inputs["input_ids"] = inputs["input_ids"].squeeze(0).long()
        inputs["attention_mask"] = inputs["attention_mask"].squeeze(0).long()
        if not self.use_roberta and "token_type_ids" in inputs:
            inputs["token_type_ids"] = inputs["token_type_ids"].squeeze(0).long()

        ## Get the labels
        if label_ids is not None:
            inputs["labels"] = self._fit_labels(
                label_ids,
                inputs["attention_mask"].shape[0],
                pad_id=self.labels_to_idx["O"],
            )

        return inputs


#### editing this section for preprocessing


# --> match label function

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
from nervaluate import Evaluator


############################################################
#                                                          #
#                  LABELS MATCHING FUNCTION                #
# Should be edited for preprocessing to enhance F1-scores  #
############################################################
def match_labels(tokenized_input, annotations):
    """Project character-level annotations onto tokens as BIO tags.

    Args:
        tokenized_input: Tokenizer output; must support
            ``tokenized_input["input_ids"][0]`` (the token sequence) and
            ``tokenized_input.char_to_token(char_index)``.
        annotations: Iterable of dicts with ``start`` / ``end`` character
            offsets (end exclusive) and a ``labels`` entity name.

    Returns:
        A list with one tag per token: ``"O"``, ``"B-<label>"`` or
        ``"I-<label>"``.
    """
    # Every token starts out as "outside"
    token_count = len(tokenized_input["input_ids"][0])
    token_tags = ["O" for _ in range(token_count)]

    for span in annotations:
        # Becomes True once the first token of this span got its "B-" tag
        span_opened = False

        for char_pos in range(span["start"], span["end"]):
            token_pos = tokenized_input.char_to_token(char_pos)

            # Characters without a token (char_to_token gave None) are skipped
            if token_pos is None:
                continue

            if not span_opened:
                # First token reached in this span: always tagged "B-",
                # overwriting whatever an earlier span left there
                token_tags[token_pos] = "B-" + span["labels"]
                span_opened = True
            elif token_tags[token_pos] == "O":
                # Later tokens become "I-" only if they are still untagged,
                # so the "B-" token and earlier spans are left alone
                token_tags[token_pos] = "I-" + span["labels"]

    return token_tags



###################

#1- modify both LegalNERTokenDataset and match_labels for preprocessing to improve F1-scores
#2- change the number of input parameters




###################

import os
import json
import numpy as np
from argparse import ArgumentParser
from nervaluate import Evaluator

from transformers import AutoModelForTokenClassification
from transformers import Trainer, DefaultDataCollator, TrainingArguments

# from utils.dataset import LegalNERTokenDataset ***************

#import LegalNERTokenDataset
#import datasets

import spacy
nlp = spacy.load("en_core_web_sm")


############################################################
#                                                          #
#                           MAIN                           #
#                                                          #
############################################################
if __name__ == "__main__":

    parser = ArgumentParser(description="Training of LUKE model")
    parser.add_argument(
        "--ds_train_path",
        help="Path of train dataset file",
        #default="data/NER_TRAIN/NER_TRAIN_ALL.json", ## we. should change the path  ---> I think ALL is the combination of that !!!!
        default = "//content//drive//MyDrive//DeepLearning_Project//NER_TRAIN//NER_TRAIN_JUDGEMENT.json",
        required=False,
        type=str,
    )
    parser.add_argument(
        "--ds_valid_path",
        help="Path of validation dataset file",
        #default="data/NER_DEV/NER_DEV_ALL.json",
        default = "//content//drive//MyDrive//DeepLearning_Project//NER_DEV//NER_DEV_JUDGEMENT.json",
        required=False,
        type=str,
    )

    ## Pay attention that we have just used Judgment for Train and DEV !!!!! **** !!!!!

    parser.add_argument(
        "--output_folder",
        help="Output folder",
        #default="results/",
        default = "//content//drive//MyDrive//DeepLearning_Project//",
        required=False,
        type=str,
    )
    parser.add_argument(
        "--batch",
        help="Batch size",
        default=1,
        required=False,
        type=int,
    )
    parser.add_argument(
        "--num_epochs",
        help="Number of training epochs",
        default=5,
        required=False,
        type=int,
    )
    parser.add_argument(
        "--lr",
        help="Learning rate",
        default=1e-5,
        required=False,
        type=float,
    )
    parser.add_argument(
        "--weight_decay",
        help="Weight decay",
        default=0.01,
        required=False,
        type=float,
    )
    parser.add_argument(
        "--warmup_ratio",
        help="Warmup ratio",
        default=0.06,
        required=False,
        type=float,
    )

    args = parser.parse_args()

    ## Parameters: unpack the parsed options into the names used below
    ds_train_path, ds_valid_path = (
        args.ds_train_path,             # e.g., 'data/NER_TRAIN/NER_TRAIN_ALL.json'
        args.ds_valid_path,             # e.g., 'data/NER_DEV/NER_DEV_ALL.json'
    )
    output_folder = args.output_folder  # e.g., 'results/'
    batch_size, num_epochs = (
        args.batch,                     # e.g., 256 for luke-based, 1 for bert-based
        args.num_epochs,                # e.g., 5
    )
    lr, weight_decay, warmup_ratio = (
        args.lr,                        # e.g., 1e-4 for luke-based, 1e-5 for bert-based
        args.weight_decay,              # e.g., 0.01
        args.warmup_ratio,              # e.g., 0.06
    )

    ## Define the labels: the entity types for Named Entity Recognition
    original_label_list = [
        "COURT",
        "PETITIONER",
        "RESPONDENT",
        "JUDGE",
        "DATE",
        "ORG",
        "GPE",
        "STATUTE",
        "PROVISION",
        "PRECEDENT",
        "CASE_NUMBER",
        "WITNESS",
        "OTHER_PERSON",
        "LAWYER"
    ]

    # BIO tags: every "B-<type>" first, then every "I-<type>", in the order
    # of original_label_list -> ['B-COURT', ..., 'B-LAWYER', 'I-COURT', ..., 'I-LAWYER']
    labels_list = [
        prefix + entity_type
        for prefix in ("B-", "I-")
        for entity_type in original_label_list
    ]

    # "O" (outside) is not part of labels_list, so it is counted here as the
    # one extra class.
    num_labels = 1 + len(labels_list)




    ## Compute metrics, this function is considered as Evaluator to assess the performance of the model
    # IDX to LABELS
    # LABELS to IDX
    def compute_metrics(pred):
        """Compute entity-level F1 scores for an evaluation pass.

        Args:
            pred: Object exposing ``predictions`` (scores whose last axis is
                reduced with argmax) and ``label_ids`` (gold label indices).

        Returns:
            Dict with ``f1-type-match``, ``f1-partial``, ``f1-strict`` and
            ``f1-exact``, each derived from the precision/recall reported by
            the ``nervaluate`` Evaluator under the matching scheme.

        Reads the module-level ``idx_to_labels`` mapping, which is assigned
        inside the model loop before training starts.
        """

        def _to_tags(indices):
            # -100 marks positions without a real label; treat them as "O"
            return [idx_to_labels[i] if i != -100 else "O" for i in indices]

        def _f1(scores):
            precision, recall = scores["precision"], scores["recall"]
            # The epsilon avoids a division by zero when both are 0
            return 2 * precision * recall / (precision + recall + 1e-9)

        # Preds: best class per token, then all sequences joined into one
        predictions = np.argmax(pred.predictions, axis=-1)
        predictions = np.concatenate(predictions, axis=0)
        prediction_ids = [_to_tags(predictions)]

        # Labels: same flattening, so both sides form one long "document"
        labels = np.concatenate(pred.label_ids, axis=0)
        labels_ids = [_to_tags(labels)]

        # Entity types present in the gold labels ("B-X"/"I-X" -> "X").
        # Set difference tolerates a gold sequence without any "O" tag
        # (list.remove would raise ValueError); sorting makes the tag order
        # deterministic.
        unique_labels = sorted(
            {tag.split("-")[-1] for tag in labels_ids[0]} - {"O"}
        )

        # Evaluator
        # NOTE(review): the two-value unpacking below matches the nervaluate
        # release this script was written for -- confirm against the
        # installed version.
        evaluator = Evaluator(
            labels_ids, prediction_ids, tags=unique_labels, loader="list"
        )
        results, results_per_tag = evaluator.evaluate()

        return {
            "f1-type-match": _f1(results["ent_type"]),
            "f1-partial": _f1(results["partial"]),
            "f1-strict": _f1(results["strict"]),
            "f1-exact": _f1(results["exact"]),
        }

    ## Define the models
    model_paths = [
        "dslim/bert-large-NER",                     # ft on NER
        "Jean-Baptiste/roberta-large-ner-english",  # ft on NER
        "nlpaueb/legal-bert-base-uncased",          # ft on Legal Domain
        "saibo/legal-roberta-base",                 # ft on Legal Domain
        "nlpaueb/bert-base-uncased-eurlex",         # ft on Eurlex
        "nlpaueb/bert-base-uncased-echr",           # ft on ECHR
        "studio-ousia/luke-base",                   # LUKE base
        "studio-ousia/luke-large",                  # LUKE large
    ]

    ## Train, save and evaluate every model in turn
    for model_path in model_paths:

        print("MODEL: ", model_path)

        ## Define the train and test datasets
        # LUKE and RoBERTa checkpoints are tokenized with the RoBERTa tokenizer
        use_roberta = "luke" in model_path or "roberta" in model_path

        train_ds = LegalNERTokenDataset(
            ds_train_path,
            model_path,
            labels_list=labels_list,
            split="train",
            use_roberta=use_roberta
        )

        val_ds = LegalNERTokenDataset(
            ds_valid_path,
            model_path,
            labels_list=labels_list,
            split="val",
            use_roberta=use_roberta
        )

        ## Define the model
        # ignore_mismatched_sizes lets checkpoints whose classification head
        # has a different number of labels be loaded with a fresh head
        model = AutoModelForTokenClassification.from_pretrained(
            model_path,
            num_labels=num_labels,
            ignore_mismatched_sizes=True
        )

        ## Map the labels
        # Inverse of the dataset's label -> index mapping; compute_metrics
        # reads this module-level name to turn indices back into tags
        idx_to_labels = {v[1]: v[0] for v in train_ds.labels_to_idx.items()}

        ## Output folder: <output_folder>/all/<model_path>
        new_output_folder = os.path.join(output_folder, 'all', model_path)
        os.makedirs(new_output_folder, exist_ok=True)

        ## Training Arguments
        # NOTE(review): `evaluation_strategy` may be named differently in
        # newer transformers releases -- confirm against the installed version.
        training_args = TrainingArguments(
            output_dir=new_output_folder,
            num_train_epochs=num_epochs,
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=1,
            gradient_checkpointing=True,
            warmup_ratio=warmup_ratio,
            weight_decay=weight_decay,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=False,
            save_total_limit=2,
            fp16=False,
            fp16_full_eval=False,
            metric_for_best_model="f1-strict",
            dataloader_num_workers=4,
            dataloader_pin_memory=True,
        )

        ## Collator
        data_collator = DefaultDataCollator()

        ## Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            compute_metrics=compute_metrics,
            data_collator=data_collator,
        )

        ## Train the model and save it
        trainer.train()
        # Save into the per-model folder: saving into the shared root
        # `output_folder` made every model overwrite the previous one
        trainer.save_model(new_output_folder)
        trainer.evaluate()




usage: colab_kernel_launcher.py [-h] [--ds_train_path DS_TRAIN_PATH]
                                [--ds_valid_path DS_VALID_PATH] [--output_folder OUTPUT_FOLDER]
                                [--batch BATCH] [--num_epochs NUM_EPOCHS] [--lr LR]
                                [--weight_decay WEIGHT_DECAY] [--warmup_ratio WARMUP_RATIO]
colab_kernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-4b0f6ef6-85c4-4e03-8c80-58bc95be4e79.json


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
