Key enhancements for handling class imbalance:

Class Weight Calculation:

Added label counting during chunk creation

Calculated weights using inverse frequency with sqrt smoothing:
weight = sqrt(total_count / (count + 1))

Printed weights for verification

Converted weights to tensor for loss calculation

Custom Weighted Trainer:

Created WeightedTrainer subclass

Implemented compute_loss with weighted CrossEntropyLoss

Handled device placement automatically

Preserved ignore_index for special tokens (-100)

Enhanced Entity Detection:

Improved span matching logic in create_chunks_with_labels

Added separate metrics tracking for B-DATASET and I-DATASET

Set metric_for_best_model to B-DATASET_f1

Training Improvements:

Increased epochs to 5 for better convergence

Added logging steps for better monitoring

Set save_total_limit to prevent storage overflow

Class Weight Logic:

sqrt smoothing prevents extreme weights while still boosting minority classes

Automatic handling of unseen labels (weight=1.0)

Weights are applied during loss calculation only to relevant tokens

The weighting strategy will:

Boost learning for rare DATASET classes

Prevent the model from ignoring minority classes

Maintain performance on common "O" class

Improve overall entity detection F1 score

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
import re
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate

In [3]:
import pandas as pd

df= pd.read_csv("/content/drive/MyDrive/mapped_data (1).csv")

In [23]:
df

Unnamed: 0,article_id,dataset_id,type,text
0,10.1002_2017jc013030,https://doi.org/10.17882/49388,Primary,PUBLICATIONS\nJournal of Geophysical Research:...
1,10.1002_anie.201916483,https://doi.org/10.5517/ccdc.csd.cc1npvt0,Missing,Angewandte\nChemie Research Articles\nInternat...
2,10.1002_anie.202005531,https://doi.org/10.5517/ccdc.csd.cc24wxqp,Missing,Angewandte\nChemie Communications\nArenes\nHow...
3,10.1002_anie.202007717,https://doi.org/10.5517/ccdc.csd.cc24rrb0,Missing,Angewandte\nChemie Research Articles\nDynamic ...
4,10.1002_chem.201902131,https://doi.org/10.5517/ccdc.csd.cc221dk3,Missing,Full Paper DOI: 10.1002/chem.201902131\n& Nitr...
...,...,...,...,...
1061,10.7717_peerj.12422,https://doi.org/10.15468/dl.t3h8b4,Secondary,Spatial and temporal distribution patterns of ...
1062,10.7717_peerj.12422,https://doi.org/10.15468/dl.yak5vd,Secondary,Spatial and temporal distribution patterns of ...
1063,10.7717_peerj.13193,https://doi.org/10.6073/pasta/02e2764efb408a8b...,Secondary,Foliar nutrient concentrations of six northern...
1064,10.7717_peerj.13193,https://doi.org/10.6073/pasta/275ad28a2f31356c...,Secondary,Foliar nutrient concentrations of six northern...


In [4]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=4b08747c8548a10f5afdaa322f2a0ea5d58d11601da14ad4aa06a75fd9849696
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [5]:

import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
import re
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate
from collections import Counter
import math

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", use_fast=True)

# Configuration
MAX_LENGTH = 512  # Model's max token limit
STRIDE = 128      # Overlap size
CHUNK_SIZE = MAX_LENGTH - 2  # Account for [CLS] and [SEP]

def find_dataset_positions(text, dataset_ids):
    """Find character positions of dataset mentions in text"""
    positions = []
    for ds_id in dataset_ids:
        # Handle different formatting of the same dataset ID
        patterns = [re.escape(ds_id)]
        if ds_id.startswith("https://doi.org/"):
            patterns.append(re.escape(ds_id.replace("https://doi.org/", "doi:")))
            patterns.append(re.escape(ds_id.split("/")[-1]))

        for pattern in patterns:
            for match in re.finditer(pattern, text):
                positions.append((match.start(), match.end(), "DATASET"))
    return positions

def create_chunks_with_labels(text, positions):
    """Process text into chunks with sliding window and create NER labels"""
    # Tokenize entire text to get accurate offsets
    encoding = tokenizer(text, return_offsets_mapping=True,
                         add_special_tokens=False, truncation=False)
    tokens = encoding["input_ids"]
    offset_mapping = encoding["offset_mapping"]

    # Create token-level labels (without special tokens)
    token_labels = ["O"] * len(tokens)
    for start_char, end_char, label_type in positions:
        entity_tokens = []
        for token_idx, (token_start, token_end) in enumerate(offset_mapping):
            # Check if token is inside entity span
            if (token_start >= start_char and token_end <= end_char) or \
               (start_char <= token_start < end_char) or \
               (start_char < token_end <= end_char):
                entity_tokens.append(token_idx)

        # Apply IOB tagging to entity tokens
        if entity_tokens:
            # Mark first token as B- and subsequent as I-
            token_labels[entity_tokens[0]] = f"B-{label_type}"
            for token_idx in entity_tokens[1:]:
                token_labels[token_idx] = f"I-{label_type}"

    # Create sliding window chunks
    chunks = []
    start = 0

    while start < len(tokens):
        end = min(start + CHUNK_SIZE, len(tokens))

        # Extract chunk tokens and labels
        chunk_tokens = tokens[start:end]
        chunk_labels = token_labels[start:end]

        # Add special tokens
        input_ids = [tokenizer.cls_token_id] + chunk_tokens + [tokenizer.sep_token_id]
        labels = [-100] + chunk_labels + [-100]  # -100 ignores special tokens in loss

        # Create attention mask
        attention_mask = [1] * len(input_ids)

        chunks.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "token_offset": start  # Track original position
        })

        # Exit if at end of text
        if end == len(tokens):
            break

        # Apply stride (with overlap)
        start += (CHUNK_SIZE - STRIDE)

    return chunks

# Group by article to process each paper
df = df
grouped = df.groupby("article_id").agg({
    "dataset_id": list,
    "text": "first"
})

# Process articles with sliding window chunking
ner_data = []
label_counter = Counter()

for article_id, row in tqdm(grouped.iterrows(), total=len(grouped)):
    text = row["text"]
    dataset_ids = row["dataset_id"]

    # Find dataset positions in text
    positions = find_dataset_positions(text, dataset_ids)

    # Create chunks with sliding window
    chunks = create_chunks_with_labels(text, positions)

    for chunk in chunks:
        # Collect label counts for weighting
        for label in chunk["labels"]:
            if label != -100:  # Ignore special tokens
                label_counter[label] += 1

        ner_data.append({
            "input_ids": chunk["input_ids"],
            "attention_mask": chunk["attention_mask"],
            "labels": chunk["labels"],
            "article_id": article_id
        })

# Convert to DataFrame
ner_df = pd.DataFrame(ner_data)
ner_df.to_pickle("ner_training_data.pkl")  # Save processed data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

100%|██████████| 523/523 [00:38<00:00, 13.63it/s]


In [7]:

# Dataset class
class SciDataset(Dataset):
    def __init__(self, data, label2id):
        self.data = data
        self.label2id = label2id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]

        # Convert string labels to IDs
        label_ids = []
        for tag in item["labels"]:
            if tag == -100:  # Special token
                label_ids.append(-100)
            else:
                label_ids.append(self.label2id.get(tag, -100))

        return {
            "input_ids": torch.tensor(item["input_ids"]),
            "attention_mask": torch.tensor(item["attention_mask"]),
            "labels": torch.tensor(label_ids)
        }

# Prepare label mapping
all_tags = set()
for labels in ner_df["labels"]:
    for tag in labels:
        if tag != -100:
            all_tags.add(tag)

label2id = {tag: i for i, tag in enumerate(sorted(all_tags))}
id2label = {i: tag for tag, i in label2id.items()}

# Calculate class weights
total_count = sum(label_counter.values())
class_weights = []

# Calculate weights with smoothing
for label in sorted(label2id.keys()):
    count = label_counter.get(label, 0)

    # Handle zero-count labels
    if count == 0:
        weight = 1.0
    else:
        # Inverse frequency weighting with sqrt smoothing
        weight = math.sqrt(total_count / (count + 1))  # +1 to avoid division by zero

    class_weights.append(weight)

# Convert to tensor
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("\nClass weights:")
for label, weight in zip(sorted(label2id.keys()), class_weights):
    print(f"{label}: {weight:.2f}")

# Split data
train_df, val_df = train_test_split(ner_df, test_size=0.2, random_state=42)

train_dataset = SciDataset(train_df, label2id)
val_dataset = SciDataset(val_df, label2id)

# Data collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=MAX_LENGTH,
    label_pad_token_id=-100
)

# Initialize model
model = AutoModelForTokenClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# Custom Trainer with weighted loss
class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits

        # Move weights to correct device
        weights = self.class_weights.to(logits.device)

        # Flatten the tensors
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=weights,
            ignore_index=-100
        )
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# Metrics
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    # Calculate F1 for DATASET classes separately
    dataset_metrics = {}
    for label in ["B-DATASET", "I-DATASET"]:
        if label in results:
            dataset_metrics.update({
                f"{label}_precision": results[label]["precision"],
                f"{label}_recall": results[label]["recall"],
                f"{label}_f1": results[label]["f1-score"],
                f"{label}_support": results[label]["support"],
            })

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
        **dataset_metrics
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,  # Increased epochs
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # Focus on entity detection
    greater_is_better=True,
    report_to="none",
    fp16=True,
    gradient_accumulation_steps=2,
    logging_steps=100,
    save_total_limit=2,
)

# Initialize Trainer with weighted loss
trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()
trainer.save_model("ner_model")
tokenizer.save_pretrained("ner_model")


Class weights:
B-DATASET: 80.27
I-DATASET: 39.08
O: 1.00


Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0518,0.015331,0.108378,0.5,0.178143,0.998881
2,0.0151,0.006928,0.435484,0.861702,0.578571,0.999491
3,0.0067,0.006875,0.529661,0.886525,0.66313,0.999534
4,0.0021,0.00778,0.632754,0.904255,0.744526,0.999743
5,0.0016,0.011497,0.592326,0.875887,0.706724,0.999744
6,0.0045,0.011564,0.709402,0.882979,0.78673,0.999752
7,0.0003,0.01729,0.789137,0.875887,0.830252,0.99981
8,0.0002,0.01783,0.781646,0.875887,0.826087,0.999812




('ner_model/tokenizer_config.json',
 'ner_model/special_tokens_map.json',
 'ner_model/vocab.txt',
 'ner_model/added_tokens.json',
 'ner_model/tokenizer.json')

In [2]:
import os

folder_path = r'C:\Users\ADMIN\Desktop\make data count kaggle\code'
print("Files in directory:", os.listdir(folder_path))

Files in directory: ['Fine_tuning_NER_1.ipynb', 'Fine_tuning_NER_2.ipynb', 'Fine_tuning_NER_4.ipynb']


In [3]:

# Use raw strings (r'...') to avoid unicode escape errors
input_path = r'C:\Users\ADMIN\Desktop\make data count kaggle\code\Fine_tuning_NER_2.ipynb'
output_path = r'C:\Users\ADMIN\Desktop\make data count kaggle\code\NER_2.ipynb'

# Load the notebook
with open(input_path, 'r', encoding='utf-8') as f:
    notebook = json.load(f)

# Remove 'widgets' metadata if present
if 'widgets' in notebook.get('metadata', {}):
    del notebook['metadata']['widgets']

# Remove 'widgets' from each cell's metadata
for cell in notebook.get('cells', []):
    if 'widgets' in cell.get('metadata', {}):
        del cell['metadata']['widgets']

# Save the cleaned notebook
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(notebook, f, indent=2)


NameError: name 'json' is not defined