<a href="https://colab.research.google.com/github/shahtvisha/Brain-AI-exploration/blob/main/TNER(Preprocessing_and_Finetuning)_FSIL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import re
from bs4 import BeautifulSoup
import os

In [9]:
def get_json_files(dir_path: str) -> list:
    return [file for file in os.listdir(dir_path) if file.endswith('.json')]

def extract_json_ids(json_data):
    ids_list = []
    if isinstance(json_data, list):
        for entry in json_data:
            if isinstance(entry, dict) and 'id' in entry:
                ids_list.append(entry['id'])
    return ids_list

def partition_list(data, train_ratio, val_ratio):
    train_end = int(len(data) * train_ratio)
    val_end = int(len(data) * (train_ratio + val_ratio))
    train_set = data[:train_end]
    val_set = data[train_end:val_end]
    test_set = data[val_end:]
    return train_set, val_set, test_set

def decompose_label(label):
    # Decompose label based on camel case
    return ' '.join(re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', label))

def create_dataset(file_path: str, target_id: int, output_file: str) -> None:
    with open(file_path, 'r') as file:
        json_data = json.load(file)

    for item in json_data:
        if item.get('id') == target_id and 'annotations' in item:
            try:
                html_content = item['data']['html']
            except KeyError:
                print(f"Skipping id {target_id} due to missing 'html' key")
                continue
            soup = BeautifulSoup(html_content, 'lxml')
            text_content = soup.get_text()
            annotations = []
            for annotation in item['annotations']:
                for result in annotation['result']:
                    if 'value' in result:
                        start_pos = result['value']['globalOffsets']['start']
                        end_pos = result['value']['globalOffsets']['end']
                        label = decompose_label(result['value']['hypertextlabels'][0].replace(' ', ''))
                        annotation_text = result['value']['text']
                        annotations.append((start_pos, end_pos, label, annotation_text))
            annotations.sort(key=lambda x: x[0])

            # Add "O" labels in between
            all_labels = []
            current_pos = 0
            for start, end, label, text in annotations:
                if current_pos < start:
                    all_labels.append((current_pos, start, "O", text_content[current_pos:start]))
                all_labels.append((start, end, label, text))
                current_pos = end
            if current_pos < len(text_content):
                all_labels.append((current_pos, len(text_content), "O", text_content[current_pos:len(text_content)]))

            # Perform IOB tagging
            iob_tags = []
            previous_label = "O"

            for start, end, label, text in all_labels:
                if label == "O":
                    words = text.split()
                    iob_tags.extend([(word, "O") for word in words])
                else:
                    words = text.split()
                    for i, word in enumerate(words):
                        if i == 0 and label != previous_label:
                            iob_tags.append((word, f"B-{label}"))
                        else:
                            iob_tags.append((word, f"I-{label}"))

                previous_label = label

            with open(output_file, 'a', encoding='utf-8') as out_file:
                for word, tag in iob_tags:
                    out_file.write(f"{word}\t{tag}\n")
                    if word.endswith('.'):
                        out_file.write("\n")
                out_file.write("\n")

    print(f"IOB tagging for id {target_id} in file {file_path} has been added to {output_file}")


In [10]:
# Configurations
json_directory = '/content/all_json'
test_file = 'test.txt'
train_file = 'train.txt'
val_file = 'valid.txt'

# Clear the output files
open(test_file, 'w').close()
open(train_file, 'w').close()
open(val_file, 'w').close()

# Process each JSON file in the directory
json_file_list = get_json_files(json_directory)
print(json_file_list)

['shahtvisha03@gmail.com_AdditionalAnnotations.json', 'shahtvisha03@gmail.com_update2.0.json']


In [11]:
for json_file in json_file_list:
    json_path = f'{json_directory}/{json_file}'
    try:
        with open(json_path, 'r') as file:
            data = json.load(file)
            id_list = extract_json_ids(data)

            train_ids, val_ids, test_ids = partition_list(id_list, 0.7, 0.15)

            for ids in train_ids:
                create_dataset(json_path, ids, train_file)
            for ids in val_ids:
                create_dataset(json_path, ids, val_file)
            for ids in test_ids:
                create_dataset(json_path, ids, test_file)
    except json.JSONDecodeError:
        print(f"Error reading JSON file: {json_path}. File may be empty or malformed.")
    except Exception as e:
        print(f"An error occurred while processing file {json_path}: {str(e)}")

IOB tagging for id 11 in file /content/all_json/shahtvisha03@gmail.com_AdditionalAnnotations.json has been added to train.txt
IOB tagging for id 12 in file /content/all_json/shahtvisha03@gmail.com_AdditionalAnnotations.json has been added to train.txt
IOB tagging for id 13 in file /content/all_json/shahtvisha03@gmail.com_AdditionalAnnotations.json has been added to train.txt
IOB tagging for id 14 in file /content/all_json/shahtvisha03@gmail.com_AdditionalAnnotations.json has been added to valid.txt
IOB tagging for id 15 in file /content/all_json/shahtvisha03@gmail.com_AdditionalAnnotations.json has been added to test.txt
IOB tagging for id 1 in file /content/all_json/shahtvisha03@gmail.com_update2.0.json has been added to train.txt
IOB tagging for id 2 in file /content/all_json/shahtvisha03@gmail.com_update2.0.json has been added to train.txt
IOB tagging for id 3 in file /content/all_json/shahtvisha03@gmail.com_update2.0.json has been added to train.txt
IOB tagging for id 4 in file /co

Finetuning


In [3]:
%pip install tner -U
%pip list | grep tner

Collecting tner
  Downloading tner-0.2.4.tar.gz (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting allennlp>=2.0.0 (from tner)
  Downloading allennlp-2.10.1-py3-none-any.whl.metadata (21 kB)
Collecting seqeval (from tner)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets (from tner)
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting torch (from tner)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl.metadata (22 kB)
Collecting torchvision<0.14.0,>=0.8.1 (from allennlp>=2.0.0->tner)
  Downloading torchvision-0.13.1-cp310-cp310-manylinux1_x86_64.whl.metadata (10 kB)
Collecting cached-path<1.2.0,>=1.1.3 (from allennlp>=2.

In [10]:
!pip install --upgrade huggingface-hub

Collecting huggingface-hub
  Using cached huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Using cached huggingface_hub-0.24.5-py3-none-any.whl (417 kB)
Installing collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.10.1
    Uninstalling huggingface-hub-0.10.1:
      Successfully uninstalled huggingface-hub-0.10.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cached-path 1.1.6 requires huggingface-hub<0.11.0,>=0.8.1, but you have huggingface-hub 0.24.5 which is incompatible.[0m[31m
[0mSuccessfully installed huggingface-hub-0.24.5


In [4]:
pip install --upgrade huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.24.5-py3-none-any.whl (417 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.5/417.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.10.1
    Uninstalling huggingface-hub-0.10.1:
      Successfully uninstalled huggingface-hub-0.10.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cached-path 1.1.6 requires huggingface-hub<0.11.0,>=0.8.1, but you have huggingface-hub 0.24.5 which is incompatible.[0m[31m
[0mSuccessfully installed huggingface_hub-0.24.5


In [5]:
import logging
from tner import GridSearcher, TransformersNER

logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [6]:
local_dataset = {"train": "/content/train.txt", "test":"/content/test.txt", "validation":"/content/valid.txt" }

In [7]:
searcher = GridSearcher(
   checkpoint_dir='./ckpt_bert_custom_dataset',
   local_dataset=local_dataset,
   model="distilbert-base-cased",  # language model to fine-tune
   epoch=2,  # the total epoch (`L` in the figure)
   epoch_partial=1,  # the number of epoch at 1st stage (`M` in the figure)
   n_max_config=1,  # the number of models to pass to 2nd stage (`K` in the figure)
   batch_size=4,
   gradient_accumulation_steps=[1],
   crf=[True],
   lr=[1e-4],
   weight_decay=[None],
   random_seed=[42],
   lr_warmup_step_ratio=[0.1],
   max_grad_norm=[None, 10]
)
searcher.train()

INFO:root:INITIALIZE GRID SEARCHER: 2 configs to try
INFO:root:## 1st RUN: Configuration 0/2 ##
INFO:root:hyperparameters
INFO:root:	 * dataset: None
INFO:root:	 * dataset_split: train
INFO:root:	 * dataset_name: None
INFO:root:	 * local_dataset: {'train': '/content/train.txt', 'test': '/content/test.txt', 'validation': '/content/valid.txt'}
INFO:root:	 * model: distilbert-base-cased
INFO:root:	 * crf: True
INFO:root:	 * max_length: 128
INFO:root:	 * epoch: 2
INFO:root:	 * batch_size: 4
INFO:root:	 * lr: 0.0001
INFO:root:	 * random_seed: 42
INFO:root:	 * gradient_accumulation_steps: 1
INFO:root:	 * weight_decay: None
INFO:root:	 * lr_warmup_step_ratio: 0.1
INFO:root:	 * max_grad_norm: None
INFO:root:initialize language model with `distilbert-base-cased`


Downloading:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this 

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

INFO:root:dataset preprocessing


KeyError: 'I-'