<a href="https://colab.research.google.com/github/saribasmetehan/bert-base-turkish-uncased-ner/blob/main/bert_base_turkish_uncased_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q datasets

In [2]:
from collections import defaultdict
from datasets import load_dataset, DatasetDict

data_splits = defaultdict(DatasetDict)

dataset = load_dataset("turkish-nlp-suite/turkish-wikiNER")

for split in dataset:
    data_splits[split] = dataset[split]

train_data = data_splits["train"]
test_data = data_splits["test"]

print(train_data)
print(test_data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['tokens', 'tags'],
    num_rows: 17967
})
Dataset({
    features: ['tokens', 'tags'],
    num_rows: 1000
})


In [3]:
element = dataset["train"][0]

for key,value in element.items():
    print(f"{key}: {value}")

tokens: ['Orda', 'Spike', ',', "First'ün", 'etkisiyle', "Buffy'ye", 'saldırır', 've', 'insanları', 'öldürüp', 'buraya', 'gömdüğünü', 'hatırlar', '.']
tags: ['O', 'B-PERSON', 'O', 'B-PERSON', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [4]:
for key,value in dataset["train"].features.items():
    print(f"{key}: {value}")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
tags: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [5]:
from datasets import ClassLabel

In [6]:
unique_tags = set(tag for tags in dataset['train']['tags'] for tag in tags)

tags_classlabel = ClassLabel(names=list(unique_tags))

def encode_tags(example):
    example['tag_ids'] = [tags_classlabel.str2int(tag) for tag in example['tags']]
    return example

dataset = dataset.map(encode_tags)

print(dataset['train'][0])

Map:   0%|          | 0/17967 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'tokens': ['Orda', 'Spike', ',', "First'ün", 'etkisiyle', "Buffy'ye", 'saldırır', 've', 'insanları', 'öldürüp', 'buraya', 'gömdüğünü', 'hatırlar', '.'], 'tags': ['O', 'B-PERSON', 'O', 'B-PERSON', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [37, 36, 37, 36, 37, 36, 37, 37, 37, 37, 37, 37, 37, 37]}


In [7]:
tags_classlabel

ClassLabel(names=['I-LANGUAGE', 'I-MONEY', 'B-ORDINAL', 'I-DATE', 'I-FAC', 'I-EVENT', 'I-GPE', 'B-LAW', 'I-QUANTITY', 'B-FAC', 'I-PERSON', 'B-GPE', 'I-LOC', 'I-LAW', 'B-WORK_OF_ART', 'B-QUANTITY', 'I-ORDINAL', 'B-ORG', 'I-WORK_OF_ART', 'B-TIME', 'B-CARDINAL', 'I-ORG', 'I-CARDINAL', 'I-NORP', 'B-DATE', 'B-TITLE', 'B-EVENT', 'B-PERCENT', 'B-LOC', 'B-NORP', 'B-LANGUAGE', 'I-PERCENT', 'I-TITLE', 'B-PRODUCT', 'B-MONEY', 'I-TIME', 'B-PERSON', 'O', 'I-PRODUCT'], id=None)

In [8]:
import pandas as pd

In [9]:
element = dataset["train"][0]
pd.DataFrame([element['tokens'], element['tag_ids'], element["tags"]],
             index=['tokens', 'tags_ids',"tags"])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
tokens,Orda,Spike,",",First'ün,etkisiyle,Buffy'ye,saldırır,ve,insanları,öldürüp,buraya,gömdüğünü,hatırlar,.
tags_ids,37,36,37,36,37,36,37,37,37,37,37,37,37,37
tags,O,B-PERSON,O,B-PERSON,O,B-PERSON,O,O,O,O,O,O,O,O


In [10]:
from collections import Counter

In [11]:
from collections import Counter
import pandas as pd

split2freqs = defaultdict(Counter)

for row in dataset['train']:
    for tag in row["tags"]:
        if tag.startswith("B-"):
            tag_type = tag.split("-")[1]
            split2freqs['train'][tag_type] += 1

freqs_df = pd.DataFrame(split2freqs).fillna(0).astype(int)

print(freqs_df)

             train
CARDINAL      3876
DATE          6245
EVENT         2197
FAC            849
GPE           9209
LANGUAGE       755
LAW             71
LOC           1267
MONEY           92
NORP          3636
ORDINAL       1500
ORG           4081
PERCENT        176
PERSON       11492
PRODUCT        865
QUANTITY       559
TIME           121
TITLE         2136
WORK_OF_ART   2637


In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', 'tag_ids'],
        num_rows: 17967
    })
    validation: Dataset({
        features: ['tokens', 'tags', 'tag_ids'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tokens', 'tags', 'tag_ids'],
        num_rows: 1000
    })
})

In [13]:
from transformers import AutoTokenizer

In [14]:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")

In [15]:
text = "Mustafa Kemal Atatürk, Türk mareşal, devlet adamı, yazar, Türk Kurtuluş Savaşı'nın başkomutanı, Türkiye Cumhuriyeti'nin kurucusu ve ilk cumhurbaşkanıdır."

In [16]:
tokens = tokenizer(text).tokens()

In [17]:
pd.DataFrame([tokens], index=['tokens'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
tokens,[CLS],mustafa,kemal,ata,##tur,##k,",",turk,mar,##esa,...,nin,kurucusu,ve,ilk,cumhur,##bas,##kan,##ıdır,.,[SEP]


In [18]:
from transformers import AutoModelForTokenClassification

In [19]:
model_name = "dbmdz/bert-base-turkish-uncased"
index2tag = {index: tag for index, tag in enumerate(tags_classlabel.names)}
tag2index = {tag: index for index, tag in enumerate(tags_classlabel.names)}
num_labels = tags_classlabel.num_classes

In [20]:
index2tag

{0: 'I-LANGUAGE',
 1: 'I-MONEY',
 2: 'B-ORDINAL',
 3: 'I-DATE',
 4: 'I-FAC',
 5: 'I-EVENT',
 6: 'I-GPE',
 7: 'B-LAW',
 8: 'I-QUANTITY',
 9: 'B-FAC',
 10: 'I-PERSON',
 11: 'B-GPE',
 12: 'I-LOC',
 13: 'I-LAW',
 14: 'B-WORK_OF_ART',
 15: 'B-QUANTITY',
 16: 'I-ORDINAL',
 17: 'B-ORG',
 18: 'I-WORK_OF_ART',
 19: 'B-TIME',
 20: 'B-CARDINAL',
 21: 'I-ORG',
 22: 'I-CARDINAL',
 23: 'I-NORP',
 24: 'B-DATE',
 25: 'B-TITLE',
 26: 'B-EVENT',
 27: 'B-PERCENT',
 28: 'B-LOC',
 29: 'B-NORP',
 30: 'B-LANGUAGE',
 31: 'I-PERCENT',
 32: 'I-TITLE',
 33: 'B-PRODUCT',
 34: 'B-MONEY',
 35: 'I-TIME',
 36: 'B-PERSON',
 37: 'O',
 38: 'I-PRODUCT'}

In [21]:
tag2index

{'I-LANGUAGE': 0,
 'I-MONEY': 1,
 'B-ORDINAL': 2,
 'I-DATE': 3,
 'I-FAC': 4,
 'I-EVENT': 5,
 'I-GPE': 6,
 'B-LAW': 7,
 'I-QUANTITY': 8,
 'B-FAC': 9,
 'I-PERSON': 10,
 'B-GPE': 11,
 'I-LOC': 12,
 'I-LAW': 13,
 'B-WORK_OF_ART': 14,
 'B-QUANTITY': 15,
 'I-ORDINAL': 16,
 'B-ORG': 17,
 'I-WORK_OF_ART': 18,
 'B-TIME': 19,
 'B-CARDINAL': 20,
 'I-ORG': 21,
 'I-CARDINAL': 22,
 'I-NORP': 23,
 'B-DATE': 24,
 'B-TITLE': 25,
 'B-EVENT': 26,
 'B-PERCENT': 27,
 'B-LOC': 28,
 'B-NORP': 29,
 'B-LANGUAGE': 30,
 'I-PERCENT': 31,
 'I-TITLE': 32,
 'B-PRODUCT': 33,
 'B-MONEY': 34,
 'I-TIME': 35,
 'B-PERSON': 36,
 'O': 37,
 'I-PRODUCT': 38}

In [22]:
num_labels

39

In [23]:
import torch

In [24]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [25]:
model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                        num_labels=num_labels,
                                                        id2label = index2tag,
                                                        label2id = tag2index).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
input_ids = tokenizer.encode(text, return_tensors="pt")

In [27]:
pd.DataFrame([tokens,input_ids[0].numpy()], index=['tokens','input_ids'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
tokens,[CLS],mustafa,kemal,ata,##tur,##k,",",turk,mar,##esa,...,nin,kurucusu,ve,ilk,cumhur,##bas,##kan,##ıdır,.,[SEP]
input_ids,2,4257,4764,4172,3346,1021,16,6238,2822,31881,...,2527,11118,1992,2428,2916,9200,2977,28535,18,3


In [28]:
outputs = model(input_ids.to(device)).logits

In [29]:
outputs.shape

torch.Size([1, 41, 39])

In [30]:
predictions = torch.argmax(outputs, dim=-1)

In [31]:
preds = [tags_classlabel.names[p] for p in predictions[0].cpu().numpy()]

In [32]:
pd.DataFrame([tokens,preds],
             index = ["Tokens","Preds"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
Tokens,[CLS],mustafa,kemal,ata,##tur,##k,",",turk,mar,##esa,...,nin,kurucusu,ve,ilk,cumhur,##bas,##kan,##ıdır,.,[SEP]
Preds,I-GPE,I-LANGUAGE,I-CARDINAL,I-GPE,I-LOC,I-GPE,I-LOC,I-GPE,I-NORP,I-GPE,...,B-LOC,I-NORP,B-EVENT,I-DATE,B-ORG,I-QUANTITY,B-ORG,I-QUANTITY,I-QUANTITY,I-GPE


In [33]:
def tag_text(text, tags, model, tokenizer):
  tokens = tokenizer(text).tokens()
  input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
  outputs = model(input_ids)[0]
  predictions = torch.argmax(outputs, dim=2)
  preds = [tags_classlabel.names[p] for p in predictions[0].cpu().numpy()]
  return pd.DataFrame([tokens,preds],
             index = ["Tokens","Preds"])

In [34]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', 'tag_ids'],
        num_rows: 17967
    })
    validation: Dataset({
        features: ['tokens', 'tags', 'tag_ids'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tokens', 'tags', 'tag_ids'],
        num_rows: 1000
    })
})

In [35]:
example = dataset["train"][0]
word, tag = example["tokens"], example["tag_ids"]

In [36]:
pd.DataFrame([word,tag], index = ["Tokens","labels"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
Tokens,Orda,Spike,",",First'ün,etkisiyle,Buffy'ye,saldırır,ve,insanları,öldürüp,buraya,gömdüğünü,hatırlar,.
labels,37,36,37,36,37,36,37,37,37,37,37,37,37,37


In [37]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)

In [38]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

In [39]:
pd.DataFrame([tokens], index = ["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
Tokens,[CLS],orda,sp,##ike,",",first,',un,etkisiyle,bu,...,oldu,##rup,buraya,go,##m,##du,##gunu,hatırlar,.,[SEP]


In [40]:
word_ids = tokenized_input.word_ids()

In [41]:
pd.DataFrame([tokens,word_ids], index = ["Tokens","Word IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
Tokens,[CLS],orda,sp,##ike,",",first,',un,etkisiyle,bu,...,oldu,##rup,buraya,go,##m,##du,##gunu,hatırlar,.,[SEP]
Word IDs,,0,1,1,2,3,3,3,4,5,...,9,9,10,11,11,11,11,12,13,


In [42]:
tag

[37, 36, 37, 36, 37, 36, 37, 37, 37, 37, 37, 37, 37, 37]

In [43]:
previous_word_idx = None
label_ids = []
for word_idx in word_ids:
  if word_idx is None or word_idx == previous_word_idx:
    label_ids.append(-100)
  elif word_idx != previous_word_idx:
    label_ids.append(tag[word_idx])
  previous_word_idx = word_idx

labels = [index2tag[i] if i != -100 else "IGN" for i in label_ids]

index = ["Tokens","Word IDs", "Label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
Tokens,[CLS],orda,sp,##ike,",",first,',un,etkisiyle,bu,...,oldu,##rup,buraya,go,##m,##du,##gunu,hatırlar,.,[SEP]
Word IDs,,0,1,1,2,3,3,3,4,5,...,9,9,10,11,11,11,11,12,13,
Label IDs,-100,37,36,-100,37,36,-100,-100,37,36,...,37,-100,37,37,-100,-100,-100,37,37,-100
Labels,IGN,O,B-PERSON,IGN,O,B-PERSON,IGN,IGN,O,B-PERSON,...,O,IGN,O,O,IGN,IGN,IGN,O,O,IGN


In [44]:
dataset["train"]["tag_ids"]

[[37, 36, 37, 36, 37, 36, 37, 37, 37, 37, 37, 37, 37, 37],
 [37, 37, 37, 2, 37, 37, 37, 37, 37, 37, 37],
 [37, 11, 20, 37, 37, 37, 37, 20, 37, 37, 20, 37, 37, 20, 37, 37, 37, 37],
 [24,
  3,
  3,
  3,
  11,
  26,
  5,
  5,
  37,
  37,
  37,
  37,
  9,
  4,
  4,
  37,
  11,
  37,
  37,
  11,
  6,
  37,
  11,
  37,
  37,
  37,
  37,
  37,
  37,
  37,
  37],
 [36, 10, 10, 37, 37, 37, 26, 5, 5, 5, 37, 26, 5, 37, 30, 37, 37],
 [25, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37],
 [17, 21, 37, 11, 6, 6, 37, 37, 37, 37],
 [36, 37, 36, 37, 37, 37, 37, 37, 37, 36, 25, 32, 37],
 [11, 37, 11, 37, 17, 21, 37, 37, 37, 37, 37],
 [37, 37, 37, 37, 37, 37, 37, 37, 28, 12, 37, 37, 37, 37, 37],
 [17,
  21,
  21,
  21,
  37,
  37,
  37,
  37,
  37,
  37,
  37,
  37,
  37,
  37,
  37,
  36,
  37,
  37,
  2,
  37,
  37,
  26,
  5,
  5,
  37,
  37,
  37,
  37,
  37,
  37,
  37,
  37],
 [37, 37, 26, 5, 5, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37],
 [37, 29, 23, 37, 37, 37, 37, 29, 23, 37, 3

In [45]:
def tokenize_and_align_labels(example):
  tokenized_inputs = tokenizer(example["tokens"], is_split_into_words=True, truncation=True)

  labels = []
  for idx, label in enumerate(example["tag_ids"]):
    word_ids = tokenized_inputs.word_ids(batch_index=idx)
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
      if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
      else:
        label_ids.append(label[word_idx])
      previous_word_idx = word_idx

    labels.append(label_ids)

  tokenized_inputs["labels"] = labels
  return tokenized_inputs

In [46]:
def encode_dataset(corpus):
  return corpus.map(tokenize_and_align_labels, batched=True, remove_columns=['tokens', 'tags'])

In [47]:
dataset_encoded = encode_dataset(dataset)

Map:   0%|          | 0/17967 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [48]:
dataset_encoded["train"][0]

{'tag_ids': [37, 36, 37, 36, 37, 36, 37, 37, 37, 37, 37, 37, 37, 37],
 'input_ids': [2,
  13569,
  3863,
  16272,
  16,
  24710,
  11,
  2559,
  11704,
  2011,
  6457,
  1015,
  11,
  2968,
  4692,
  1024,
  1992,
  6108,
  2114,
  2648,
  4588,
  4333,
  1020,
  2046,
  11618,
  21831,
  18,
  3],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  37,
  36,
  -100,
  37,
  36,
  -100,
  -100,
  37,
  36,
  -100,
  -100,
  -100,
  -100,
  37,
  -100,
  37,
  37,
  37,
  -100,
  37,
  37,
  -100,
  -100,
  -100,
  37,
  37,
  -100]}

In [49]:
!pip install -q seqeval

In [50]:
from seqeval.metrics import classification_report

In [51]:
import numpy as np

In [52]:
def align_prediction(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []
    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            if label_ids[batch_idx][seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list


In [53]:
from huggingface_hub import notebook_login

In [54]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [55]:
from transformers import TrainingArguments

In [56]:
num_epochs = 4
batch_size = 16
logging_steps = 18967 // batch_size
model_name = "bert-base-turkish-uncased-ner"

In [57]:
!pip install -q tensorflow

In [58]:
!pip install -q accelerate -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [58]:
training_args = TrainingArguments(
    output_dir= model_name,
    log_level="error",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_steps = 1e6,
    weight_decay=0.01,
    logging_steps=logging_steps,
    learning_rate=2e-5,
    push_to_hub=True,
    report_to="tensorboard"
)



In [59]:
from seqeval.metrics import f1_score

In [60]:
def compute_metrics(eval_preds):
  y_pred, y_true = align_prediction(eval_preds.predictions, eval_preds.label_ids)
  return {"f1": f1_score(y_true, y_pred)}

In [61]:
from transformers import DataCollatorForTokenClassification

In [62]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [63]:
def model_init():
  return model

In [64]:
from transformers import Trainer

In [65]:
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['tag_ids', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 17967
    })
    validation: Dataset({
        features: ['tag_ids', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tag_ids', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [66]:
from datasets import concatenate_datasets

combined_dataset = concatenate_datasets([dataset_encoded['train'], dataset_encoded['test']])

dataset_encoded['combined'] = combined_dataset

print(dataset_encoded)

DatasetDict({
    train: Dataset({
        features: ['tag_ids', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 17967
    })
    validation: Dataset({
        features: ['tag_ids', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tag_ids', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    combined: Dataset({
        features: ['tag_ids', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 18967
    })
})


In [67]:
dataset_encoded['combined'][17967]

{'tag_ids': [36, 37, 37, 37, 37, 37, 37],
 'input_ids': [2,
  4689,
  9078,
  2312,
  16229,
  2412,
  2385,
  24811,
  2738,
  21335,
  27203,
  18,
  3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 36, -100, 37, 37, -100, 37, -100, -100, 37, 37, 37, -100]}

In [68]:
dataset_encoded["test"][0]

{'tag_ids': [36, 37, 37, 37, 37, 37, 37],
 'input_ids': [2,
  4689,
  9078,
  2312,
  16229,
  2412,
  2385,
  24811,
  2738,
  21335,
  27203,
  18,
  3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 36, -100, 37, 37, -100, 37, -100, -100, 37, 37, 37, -100]}

In [69]:
trainer = Trainer(model_init = model_init,
                  args= training_args,
                  data_collator = data_collator,
                  compute_metrics = compute_metrics,
                  train_dataset = dataset_encoded["combined"],
                  eval_dataset = dataset_encoded["validation"],
                  tokenizer = tokenizer,
                  )

In [70]:
trainer.train()
trainer.push_to_hub(commit_message="OK!")

Epoch,Training Loss,Validation Loss,F1
1,0.4,0.250248,0.770254
2,0.2227,0.243859,0.773996
3,0.1738,0.251133,0.778265
4,0.1474,0.260324,0.78215


events.out.tfevents.1717711486.b18afc294261.1365.0:   0%|          | 0.00/8.84k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/saribasmetehan/bert-base-turkish-uncased-ner/commit/20a2369841590bcddeef40d75ecd9768895d2a58', commit_message='OK!', commit_description='', oid='20a2369841590bcddeef40d75ecd9768895d2a58', pr_url=None, pr_revision=None, pr_num=None)

In [71]:
from transformers import pipeline

In [72]:
model_id = "saribasmetehan/bert-base-turkish-uncased-ner"

In [73]:
ner = pipeline("ner", model=model_id)

config.json:   0%|          | 0.00/2.26k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/766k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [77]:
text = "Bu toplam sıfır ise, Newton'ın birinci yasası cismin hareket durumunun değişmeyeceğini söyler."

In [78]:
preds= ner(text, aggregation_strategy = "simple")
pd.DataFrame(preds)

Unnamed: 0,entity_group,score,word,start,end
0,CARDINAL,0.948703,sıfır,10,15
1,PERSON,0.349816,new,21,24
2,LAW,0.568786,##ton,24,27
3,LAW,0.365599,ın,28,30
4,ORDINAL,0.841787,birinci,31,38
5,LAW,0.614421,yasası,39,45


In [76]:
from transformers import pipeline
text = "Lienen, Ocak 2002'de takım lig sonuncusuyken kovuldu."
model_id = "saribasmetehan/bert-base-turkish-uncased-ner"
ner = pipeline("ner",model = model_id)
preds= ner(text, aggregation_strategy = "simple")

pd.DataFrame(preds)

Unnamed: 0,entity_group,score,word,start,end
0,PERSON,0.946456,lienen,0,6
1,DATE,0.996089,ocak 2002,8,17
