In [1]:
#!pip install transformers datasets seqeval scikit-learn
#!pip install --upgrade transformers
import os
# Turn off Weights & Biases logging for the Trainer.
# NOTE(review): transformers reports this env var as deprecated (to be removed in v5)
# and recommends `report_to="none"` on TrainingArguments instead.
os.environ["WANDB_DISABLED"] = "true"


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [5]:
# Read the tab-separated BIOES training data.
# keep_default_na=False keeps literal tokens such as "NA" or "null" as strings
# instead of letting pandas convert them to NaN.
df = pd.read_csv(
    "BIOES_train_data.tsv",
    sep="\t",
    keep_default_na=False,
    na_values=None,
)
df

Unnamed: 0,Record Number,Category,Title,Token,Tag
0,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,MINI,S-Kompatible_Fahrzeug_Marke
1,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,1.6,S-Kompatibles_Fahrzeug_Modell
2,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W10B16A,B-Herstellernummer
3,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W11B16A,I-Herstellernummer
4,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,R50,E-Herstellernummer
...,...,...,...,...,...
52683,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,Opel,S-Kompatible_Fahrzeug_Marke
52684,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,Saab,B-Kompatibles_Fahrzeug_Modell
52685,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,1.8,E-Kompatibles_Fahrzeug_Modell
52686,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,/,O


In [6]:
# Keep only the rows of category 2; .copy() detaches the result from the loaded frame
df = df.loc[df["Category"].eq(2)].copy()
df

Unnamed: 0,Record Number,Category,Title,Token,Tag
0,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,MINI,S-Kompatible_Fahrzeug_Marke
1,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,1.6,S-Kompatibles_Fahrzeug_Modell
2,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W10B16A,B-Herstellernummer
3,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W11B16A,I-Herstellernummer
4,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,R50,E-Herstellernummer
...,...,...,...,...,...
52683,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,Opel,S-Kompatible_Fahrzeug_Marke
52684,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,Saab,B-Kompatibles_Fahrzeug_Modell
52685,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,1.8,E-Kompatibles_Fahrzeug_Modell
52686,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,/,O


In [7]:

# Group the per-token rows by record so that each record becomes one sequence:
# sentences[i] is the list of tokens and labels[i] the parallel list of BIOES tags.
grouped = df.groupby("Record Number")
sentences = []
labels = []

for _, group in grouped:
    sentences.append(group["Token"].tolist())
    labels.append(group["Tag"].tolist())

# Train/validation split (90% / 10%).
# A fixed random_state makes the split identical on every run, so the trained
# model and any evaluation numbers are reproducible after a kernel restart.
train_texts, val_texts, train_tags, val_tags = train_test_split(
    sentences, labels, test_size=0.1, random_state=42
)


In [72]:
from transformers import AutoTokenizer

# Hugging Face checkpoint name to fine-tune.
# Options noted by the author: deepset/gbert-large, bert-large-german-cased
model_checkpoint = "deepset/gbert-large"
# Load the tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [None]:
# Collect every tag that occurs in either split so the mapping covers all labels
all_tags = {tag for doc in train_tags + val_tags for tag in doc}
# Sorting gives a deterministic tag -> id assignment
unique_tags = sorted(all_tags)
# Tag string -> integer id, e.g. {'B-Herstellernummer': 0, ...}
label2id = dict(zip(unique_tags, range(len(unique_tags))))
# Integer id -> tag string (inverse of label2id)
id2label = dict(enumerate(unique_tags))

In [74]:
def tokenize_and_align_labels(batch, label_all_subwords=True):
    """Tokenize pre-split words and build one label id per produced sub-token.

    Relies on the notebook-level ``tokenizer`` and ``label2id`` objects.

    Args:
        batch: Mapping with "tokens" (a list of word lists) and "tags"
            (a list of tag-string lists, parallel to "tokens").
        label_all_subwords: If True (default, the behaviour this notebook has
            always had) every sub-token of a word receives that word's label.
            If False, only the first sub-token of each word is labelled and
            the remaining sub-tokens get -100.

    Returns:
        The tokenizer output with an added "labels" entry: one list of label
        ids per example, aligned with the tokenizer's sub-tokens. Positions
        that belong to no word (``word_ids`` yields None) are set to -100.
    """
    texts = batch["tokens"]
    tags = batch["tags"]
    # NOTE(review): padding=True pads to the longest example of each map batch;
    # a padding data collator downstream would make this unnecessary.
    tokenized_inputs = tokenizer(
        texts,
        is_split_into_words=True,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors=None,
    )

    labels = []
    for i, word_tags in enumerate(tags):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                # Not part of any word (special / padding positions)
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_subwords:
                # First sub-token of a word, or a continuation sub-token when
                # every sub-token should carry the word's label.
                label_ids.append(label2id[word_tags[word_idx]])
            else:
                # Continuation sub-token that should not carry a label
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [75]:
from datasets import Dataset

def _to_token_dataset(texts, tags):
    """Wrap word/tag sequences in a Dataset and tokenize + align labels in batches."""
    raw = Dataset.from_dict({"tokens": texts, "tags": tags})
    return raw.map(tokenize_and_align_labels, batched=True)


train_dataset = _to_token_dataset(train_texts, train_tags)

val_dataset = _to_token_dataset(val_texts, val_tags)


Map:   0%|          | 0/2088 [00:00<?, ? examples/s]

Map:   0%|          | 0/232 [00:00<?, ? examples/s]

In [76]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint as a token-classification model.
# num_labels sizes the classification layer to the tag set; id2label/label2id
# are passed along so the model config carries the tag names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label2id), id2label=id2label, label2id=label2id
)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
from transformers import DataCollatorForTokenClassification

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Fine-tuning hyperparameters.
training_args = TrainingArguments(
    output_dir="./ner_model",
    # NOTE(review): evaluation is currently disabled, so eval_dataset below is
    # never scored during training.
    #evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    # Disable external logging integrations explicitly; this is the supported
    # replacement for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [78]:
# Run fine-tuning with the Trainer and TrainingArguments configured in the previous cell.
trainer.train()

Step,Training Loss
500,0.6344
1000,0.265


TrainOutput(global_step=1310, training_loss=0.3787853240966797, metrics={'train_runtime': 1110.6298, 'train_samples_per_second': 18.8, 'train_steps_per_second': 1.18, 'total_flos': 2121330102232320.0, 'train_loss': 0.3787853240966797, 'epoch': 10.0})

In [79]:
# Save the fine-tuned model and tokenizer into one directory so they can be
# reloaded together from that path.
model_dir = "./ner_model_artifacts_for_2_large"
model.save_pretrained(model_dir)
# Last expression of the cell: its return value (the written tokenizer files) is displayed.
tokenizer.save_pretrained(model_dir)


('./ner_model_artifacts_for_2_large/tokenizer_config.json',
 './ner_model_artifacts_for_2_large/special_tokens_map.json',
 './ner_model_artifacts_for_2_large/vocab.txt',
 './ner_model_artifacts_for_2_large/added_tokens.json',
 './ner_model_artifacts_for_2_large/tokenizer.json')

In [80]:
from google.colab import files
import shutil

# Bundle the saved artifact directory into ner_model_artifacts_for_2_large.zip;
# the cell displays the path of the archive that make_archive returns.
shutil.make_archive(
    base_name='ner_model_artifacts_for_2_large',
    format='zip',
    root_dir='ner_model_artifacts_for_2_large',
)



'/content/ner_model_artifacts_for_2_large.zip'

In [83]:
# Download the zipped file through the google.colab `files` helper
# (this cell only works when the notebook runs inside Google Colab).
files.download('ner_model_artifacts_for_2_large.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>