In [2]:
!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu114

Keyring is skipped due to an exception: 'keyring.backends'
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu114
[0m

In [3]:
!pip3 install omegaconf hydra-core fairseq sentencepiece
!pip3 install seqeval

Keyring is skipped due to an exception: 'keyring.backends'
[0mKeyring is skipped due to an exception: 'keyring.backends'
[0m

In [4]:
!pip -q install transformers
!pip  -q install datasets

Keyring is skipped due to an exception: 'keyring.backends'
[0mKeyring is skipped due to an exception: 'keyring.backends'
[0m

In [5]:
def load_dataset(filename):
    
    with open(filename) as file:
        lines = [x.strip() for x in file.readlines()]
    
    tokens_list = []
    ner_tags_list = []
    
    tokens = []
    ner_tags = []
    for l in lines:

        if l == "":
            tokens_list.append(tokens)
            ner_tags_list.append(ner_tags)
            tokens = []
            ner_tags = []
        else:
            t, n = l.split(" _ _ ")
            tokens += [t]
            ner_tags += [n]
    
    tokens_list.append(tokens)
    ner_tags_list.append(ner_tags) 
    
    return tokens_list, ner_tags_list

In [6]:
tokens_list, ner_tags_list = load_dataset('train.txt')

In [7]:
dev_tokens_list, dev_ner_tags_list = load_dataset('dev.txt')

In [8]:
from datasets import Dataset, Features, Sequence, Value, ClassLabel

features = Features({
  "tokens": Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
  "ner_tags": Sequence(feature=ClassLabel(names=["O", "B-PER", "B-LOC", "B-CORP", "B-GRP", "B-PROD", "B-CW", "I-PER", "I-LOC", "I-CORP", "I-GRP", "I-PROD", "I-CW"], id=None), length=-1, id=None)
})

In [9]:
train_ds = Dataset.from_dict(
  {"tokens": tokens_list, "ner_tags": ner_tags_list},
  features=features
)
dev_ds = Dataset.from_dict(
  {"tokens": dev_tokens_list, "ner_tags": dev_ner_tags_list},
  features=features
)

In [10]:
tags = train_ds.features["ner_tags"].feature

index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}


# separate dataset into train dataset and validation dataset
# ds = ds.train_test_split(test_size=0.1, shuffle=True)

In [11]:
dev_ds[1]['ner_tags'], dev_ds[1]['tokens']

([0, 0, 0, 5, 0, 0, 0],
 ['ভিনেগার', 'মাঝে', 'মাঝে', 'চাটনি', 'ব্যবহার', 'করা', 'হয়।'])

In [12]:
from transformers import AutoTokenizer
xlmr_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize_and_align_labels(data):
    text = ["".join(t) for t in data["tokens"]]
    tokenized_inputs = xlmr_tokenizer(text)

    labels = []
    for row_idx, label_old in enumerate(data["ner_tags"]):
        label_new = [[] for t in tokenized_inputs.tokens(batch_index=row_idx)]
        for char_idx in range(len(data["tokens"][row_idx])):
            token_idx = tokenized_inputs.char_to_token(row_idx, char_idx)
            if token_idx is not None:
                label_new[token_idx].append(data["ner_tags"][row_idx][char_idx])
                if (tokenized_inputs.tokens(batch_index=row_idx)[token_idx] == "▁") and (data["ner_tags"][row_idx][char_idx] != 0):
                    label_new[token_idx+1].append(data["ner_tags"][row_idx][char_idx])
        label_new = list(map(lambda i : max(i, default=0), label_new))
        labels.append(label_new)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# run conversion
tokenized_train_ds = train_ds.map(
  tokenize_and_align_labels,
  remove_columns=["tokens", "ner_tags"],
  batched=True,
  batch_size=128)
tokenized_dev_ds = dev_ds.map(
  tokenize_and_align_labels,
  remove_columns=["tokens", "ner_tags"],
  batched=True,
  batch_size=128)


  0%|          | 0/120 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [13]:
import torch
from transformers import AutoConfig
from transformers.models.roberta.modeling_roberta import RobertaForTokenClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

xlmr_config = AutoConfig.from_pretrained(
  "xlm-roberta-base",
  num_labels=tags.num_classes,
  id2label=index2tag,
  label2id=tag2index
)
model = (RobertaForTokenClassification
         .from_pretrained("xlm-roberta-base", config=xlmr_config)
         .to(device))


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to

In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir = "google-mt5-base-ner-ja",
  log_level = "error",
  num_train_epochs = 20,
  per_device_train_batch_size = 12,
  per_device_eval_batch_size = 12,
  evaluation_strategy = "epoch",
  fp16 = True,
  logging_steps = len(tokenized_train_ds),
  push_to_hub = False
)

In [17]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
  xlmr_tokenizer,
  return_tensors="pt")

In [18]:
import numpy as np
from seqeval.metrics import f1_score

def metrics_func(eval_arg):
    preds = np.argmax(eval_arg.predictions, axis=2)
    batch_size, seq_len = preds.shape
    y_true, y_pred = [], []
    for b in range(batch_size):
        true_label, pred_label = [], []
        for s in range(seq_len):
            if eval_arg.label_ids[b, s] != -100:  # -100 must be ignored
                true_label.append(index2tag[eval_arg.label_ids[b][s]])
                pred_label.append(index2tag[preds[b][s]])
        y_true.append(true_label)
        y_pred.append(pred_label)
    return {"f1": f1_score(y_true, y_pred)}

In [19]:
from transformers import Trainer

trainer = Trainer(
  model = model,
  args = training_args,
  data_collator = data_collator,
  compute_metrics = metrics_func,
  train_dataset = tokenized_train_ds,
  eval_dataset = tokenized_dev_ds,
  tokenizer = xlmr_tokenizer
)

In [20]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,No log,0.145011,0.151229
2,No log,0.128096,0.267096
3,No log,0.1151,0.327749
4,No log,0.104761,0.370889
5,No log,0.111345,0.338645
6,No log,0.105062,0.405333
7,No log,0.112907,0.394472
8,No log,0.117604,0.396746
9,No log,0.119834,0.404066
10,No log,0.131412,0.387742


TrainOutput(global_step=25500, training_loss=0.052216264313342524, metrics={'train_runtime': 6321.4147, 'train_samples_per_second': 48.407, 'train_steps_per_second': 4.034, 'total_flos': 9997567081562448.0, 'train_loss': 0.052216264313342524, 'epoch': 20.0})

In [None]:
trainer.evaluate()

In [None]:
test_tokens_list, test_ner_tags_list = load_dataset('bn_test.conll')
test_ds = Dataset.from_dict(
  {"tokens": test_tokens_list, "ner_tags": test_ner_tags_list},
  features=features
)

In [None]:
import os
import torch
from transformers import AutoConfig

# save fine-tuned model in local
os.makedirs("./trained_ner_classifier_jp", exist_ok=True)
if hasattr(trainer.model, "module"):
    trainer.model.module.save_pretrained("./trained_ner_classifier_jp")
else:
    trainer.model.save_pretrained("./trained_ner_classifier_jp")

# load from the saved checkpoint
xlmr_config = AutoConfig.from_pretrained(
  "xlm-roberta-base",
  num_labels=tags.num_classes,
  id2label=index2tag,
  label2id=tag2index
)
model = (RobertaForTokenClassification
         .from_pretrained("./trained_ner_classifier_jp", config=xlmr_config)
         .to(device))

In [None]:
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
import pandas as pd

# create dataset for prediction
sample_encoding = xlmr_tokenizer(
    # [
    #     "উপরন্তু, রুটটি ব্যবসায়িক রুট এর মানদণ্ড পূরণ করেনি।",
    #     "উপরন্তু, রুটটি ব্যবসায়িক রুট এর মানদণ্ড পূরণ করেনি।",
    # ], 
    [" ".join(test_ds['tokens'][i]) for i in range(len(test_ds['tokens']))],
    truncation=True, max_length=512)
sample_dataset = Dataset.from_dict(sample_encoding)
sample_dataset = sample_dataset.with_format("torch")

# predict
sample_dataloader = DataLoader(sample_dataset, batch_size=1)
tokens = []
labels = []
for batch in sample_dataloader:
  # predict
    with torch.no_grad():
        output = model(batch["input_ids"].to(device), batch["attention_mask"].to(device))
    predicted_label_id = torch.argmax(output.logits, axis=-1).cpu().numpy()
    # create output
    tokens.append(xlmr_tokenizer.convert_ids_to_tokens(batch["input_ids"][0]))
    labels.append([index2tag[i] for i in predicted_label_id[0]])

# show the first result
pd.DataFrame([tokens[0], labels[0]], index=["Tokens", "Tags"])

In [None]:
merged_labels = []
for i in range(len(labels)):
    ml = []
    for j in range(1, len(labels[i]), 1):
        if tokens[i][j].startswith('▁'):
            ml += [tag2index[labels[i][j]]]
    merged_labels += [ml]

In [None]:
merged_labels[:3], test_ds['ner_tags'][:3]

In [None]:
test_ds['tokens'][1], test_ds['ner_tags'][1]

In [None]:
index2tag

In [None]:
with open("dev_pred_labels.txt", "w") as file:
    for ml in merged_labels:
        for x in ml:
            file.write(x + "\n")
        file.write("\n")