In [2]:
import json
from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# BIO tagging scheme: "O" marks a token outside any entity; for each entity
# type (ADE, DRUG) the "B-" tag marks its first token and the "I-" tag marks
# every following token of the same entity.
label_map = {
    "O": 0,
    "B-ADE": 1,
    "I-ADE": 2,
    "B-DRUG": 3,
    "I-DRUG": 4,
}

# Reverse lookup: integer id -> tag string.
id2label = dict(zip(label_map.values(), label_map.keys()))

# ClassLabel assigns class ids by position in `names`, so order the tag
# strings by their integer id in `label_map` instead of relying on the dict's
# insertion order happening to match those ids.
ner_tags_feature = ClassLabel(names=sorted(label_map, key=label_map.get))

In [10]:
def _tokenize_with_offsets(text):
    """Split ``text`` on whitespace and locate each token in the original string.

    Returns:
        A ``(tokens, spans)`` pair: ``tokens`` equals ``text.split()`` and
        ``spans[k]`` is the half-open ``(start, end)`` character range that
        ``tokens[k]`` occupies inside ``text``.
    """
    tokens = text.split()
    spans = []
    cursor = 0
    for token in tokens:
        # Only whitespace lies between `cursor` and the next token, so the
        # first match at or after `cursor` is that token itself.
        begin = text.index(token, cursor)
        cursor = begin + len(token)
        spans.append((begin, cursor))
    return tokens, spans


def _apply_bio_tags(tags, spans, start, end, begin_id, inside_id):
    """Tag, in place, every token whose character range overlaps ``[start, end)``.

    The first overlapping token receives ``begin_id`` and each later one
    receives ``inside_id``. Empty or inverted ranges leave ``tags`` untouched.
    """
    if start >= end:
        return
    is_first = True
    for idx, (tok_start, tok_end) in enumerate(spans):
        if tok_start >= end:
            # Spans are in text order, so no later token can overlap either.
            break
        if tok_end > start:
            tags[idx] = begin_id if is_first else inside_id
            is_first = False


def load_and_prepare_data(filepath, is_weak_data=False):
    """Load an annotated JSON export and convert it into a BIO-tagged ``Dataset``.

    Each record's text is split on whitespace. Every annotation's
    ``value['start']`` / ``value['end']`` character range is mapped onto the
    tokens it overlaps: the first overlapping token gets the ``B-<label>`` id
    and the remaining ones the ``I-<label>`` id from ``label_map``; all other
    tokens keep the ``"O"`` id. Aligning by character offsets (rather than by
    searching for the span's words) tags the annotated occurrence even when
    the same words appear earlier in the text, and still tags tokens that
    carry attached punctuation (e.g. the span "pain" inside the token "pain,").

    Args:
        filepath: Path to a JSON file containing a list of records.
        is_weak_data: When True, text is read from ``record['data']['text']``
            and annotations from ``record['annotations'][0]['result']`` (both
            required). When False, text is read from
            ``record['data']['SYMPTOM_TEXT']`` and a missing or empty
            ``annotations`` list is treated as "no entities".

    Returns:
        A ``Dataset`` with columns ``id`` (index of the record in the input
        file), ``tokens`` and ``ner_tags``.

    Notes:
        Records whose text is not a string, or whose annotations are malformed
        (missing keys, empty ``labels``, a label absent from ``label_map``,
        non-numeric offsets), are skipped. The number of skipped records is
        printed so that data loss is visible instead of silent.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_data = {
        "id": [],
        "tokens": [],
        "ner_tags": []
    }
    skipped_no_text = 0
    skipped_malformed = 0

    print(f"Processing {filepath}... Found {len(data)} records.")

    for i, record in enumerate(data):
        if is_weak_data:
            text = record['data']['text']
            annotations = record['annotations'][0]['result']
        else:
            text = record['data']['SYMPTOM_TEXT']
            # `or [{}]` also covers an `annotations` key that is present but
            # empty (or None), which would otherwise fail on `[0]`.
            annotations = (record.get('annotations') or [{}])[0].get('result', [])

        if not isinstance(text, str):
            skipped_no_text += 1
            continue

        tokens, spans = _tokenize_with_offsets(text)
        tags = [label_map["O"]] * len(tokens)

        try:
            for ann in annotations:
                label_info = ann['value']
                label_name = label_info['labels'][0]
                _apply_bio_tags(
                    tags,
                    spans,
                    label_info['start'],
                    label_info['end'],
                    label_map[f"B-{label_name}"],
                    label_map[f"I-{label_name}"],
                )
        except (KeyError, IndexError, TypeError):
            # Best effort: drop the whole record rather than keep partial tags.
            skipped_malformed += 1
            continue

        processed_data["id"].append(i)
        processed_data["tokens"].append(tokens)
        processed_data["ner_tags"].append(tags)

    if skipped_no_text or skipped_malformed:
        print(
            f"Skipped {skipped_no_text} record(s) without text and "
            f"{skipped_malformed} record(s) with malformed annotations."
        )

    hf_dataset = Dataset.from_dict(processed_data)
    return hf_dataset

In [11]:
# Build the "weak" dataset from weak_data.json (read via the is_weak_data=True branch).
weak_dataset = load_and_prepare_data(r"C:\Users\Siddu\Downloads\New folder\ds-rpc-02\data\processed\weak_data.json", is_weak_data=True)

# Build the "gold" dataset from annotations_output.json.
# NOTE(review): this input is read from "Downloads\ds-rpc-02", whereas every
# other path in this cell is under "Downloads\New folder\ds-rpc-02" — confirm
# the two directories are intentionally different.
gold_dataset = load_and_prepare_data(r"C:\Users\Siddu\Downloads\ds-rpc-02\data\processed\annotations_output.json", is_weak_data=False)


# Hold out 20% of the gold records as a test split; the fixed seed keeps the
# split the same across runs.
gold_dataset_dict = gold_dataset.train_test_split(test_size=0.2, seed=42)

# Persist both results to disk. These are absolute, machine-specific paths,
# so the cell will not run unchanged on another machine.
weak_dataset.save_to_disk(r"C:\Users\Siddu\Downloads\New folder\ds-rpc-02\data\processed\hf_weak_dataset")
gold_dataset_dict.save_to_disk(r"C:\Users\Siddu\Downloads\New folder\ds-rpc-02\data\processed\hf_gold_dataset_dict")



Processing C:\Users\Siddu\Downloads\New folder\ds-rpc-02\data\processed\weak_data.json... Found 80331 records.
Processing C:\Users\Siddu\Downloads\ds-rpc-02\data\processed\annotations_output.json... Found 500 records.


Saving the dataset (1/1 shards): 100%|██████████| 80331/80331 [00:00<00:00, 229276.22 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 400/400 [00:00<00:00, 26358.13 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 55849.59 examples/s]
