In [9]:
import pandas as pd
from datasets import load_dataset
import torch

import xtagger
from xtagger import df_to_torchtext_data
from xtagger import LSTMForTagging

In [3]:
%%capture
dataset = load_dataset("wikiann", "tr")

In [4]:
print(dataset["train"][0])

{'langs': ['tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr', 'tr'], 'ner_tags': [0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'spans': ["ORG: Slovenya Millî Basketbol Takımı'nı"], 'tokens': ['3.lük', 'maçında', 'Slovenya', 'Millî', 'Basketbol', "Takımı'nı", 'yendikleri', 'maçta', '23', 'sayı', ',', '6', 'ribaund', ',', '2', 'blok', 'istatistikleriyle', 'oynamış', 've', '12', 'faul', 'yaptırmıştır', '.']}


In [5]:
ner_encoding = {0: "O", 1: "B-PER", 2: "I-PER", 3: "B-ORG", 4: "I-ORG", 5: "B-LOC", 6: "I-LOC"}


train_tokens = []
train_tags = []
for sample in dataset["train"]:
  train_tokens.append(' '.join(sample["tokens"]))
  train_tags.append(' '.join([ner_encoding[a] for a in sample["ner_tags"]]))

test_tokens = []
test_tags = []
for sample in dataset["train"]:
  test_tokens.append(' '.join(sample["tokens"]))
  test_tags.append(' '.join([ner_encoding[a] for a in sample["ner_tags"]]))

df_train = pd.DataFrame({"sentence": train_tokens, "tags": train_tags})
df_test = pd.DataFrame({"sentence": test_tokens, "tags": test_tags})

In [7]:
device = torch.device("cuda")

In [8]:
train_iterator, valid_iterator, test_iterator, TEXT, TAGS = df_to_torchtext_data(
    df_train, 
    df_test, 
    device,
    batch_size=32
)

Number of training examples: 20000
Number of testing examples: 20000
Unique tokens in TEXT vocabulary: 32442
Unique tokens in TAGS vocabulary: 8


In [10]:
model = LSTMForTagging(
    input_dim = len(TEXT.vocab),
    output_dim = len(TAGS.vocab),
    TEXT = TEXT,
    TAGS = TAGS,
    dropout = 0.2,
    device = device,
    cuda = True
)

The model has 3,877,040 trainable parameters


In [22]:
model.fit(
    train_iterator, 
    valid_iterator, 
    eval_metrics = ["acc", "avg_f1"], 
    epochs = 5
)

HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))

Evaluating...


HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


{'eval': {'acc': 99.82975712015809, 'avg_f1': {'weighted': 99.8296458569029, 'micro': 99.82975712015809, 'macro': 87.20380046360874}}, 'train': {'acc': 99.83109235843136, 'avg_f1': {'weighted': 99.83098245674992, 'micro': 99.83109235843136, 'macro': 87.20747522307533}}, 'eval_loss': 0.0057962334088981155, 'train_loss': 0.005736465491144918}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


{'eval': {'acc': 99.85712950476012, 'avg_f1': {'weighted': 99.85705554146296, 'micro': 99.85712950476012, 'macro': 87.24908394609692}}, 'train': {'acc': 99.85646188562349, 'avg_f1': {'weighted': 99.85638636006242, 'micro': 99.85646188562349, 'macro': 87.24720303053678}}, 'eval_loss': 0.0047501457734499125, 'train_loss': 0.00474989441386424}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


{'eval': {'acc': 99.86781141094629, 'avg_f1': {'weighted': 99.86777985125971, 'micro': 99.86781141094629, 'macro': 87.29681002043405}}, 'train': {'acc': 99.86914664921956, 'avg_f1': {'weighted': 99.86911738676734, 'micro': 99.86914664921956, 'macro': 87.29997387730441}}, 'eval_loss': 0.004236640985694248, 'train_loss': 0.004274631327704993}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


{'eval': {'acc': 99.89251331900178, 'avg_f1': {'weighted': 99.89248522297359, 'micro': 99.89251331900178, 'macro': 87.32144025584755}}, 'train': {'acc': 99.89251331900178, 'avg_f1': {'weighted': 99.89248504832744, 'micro': 99.89251331900178, 'macro': 87.32164676288659}}, 'eval_loss': 0.003552931280672783, 'train_loss': 0.003444847616250627}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


{'eval': {'acc': 99.9005247486414, 'avg_f1': {'weighted': 99.90047597514327, 'micro': 99.9005247486414, 'macro': 87.33014320068513}}, 'train': {'acc': 99.90119236777802, 'avg_f1': {'weighted': 99.90114337925291, 'micro': 99.90119236777802, 'macro': 87.33136477105805}}, 'eval_loss': 0.002831419196887873, 'train_loss': 0.0028605571439489722}



In [32]:
sentence = dataset["validation"][2]["tokens"]
sentence

["Avustralya'da",
 '25',
 'numaraya',
 'çıkmış',
 ',',
 'ayrıca',
 'Yeni',
 'Zelanda',
 'listesine',
 '32',
 'numaradan',
 'giriş',
 'yapmış',
 've',
 '8',
 'numaraya',
 'çıkmıştır',
 '.']

In [33]:
model.predict(sentence)

([("avustralya'da", 'B-LOC'),
  ('25', 'O'),
  ('numaraya', 'O'),
  ('çıkmış', 'O'),
  (',', 'O'),
  ('ayrıca', 'O'),
  ('yeni', 'B-LOC'),
  ('zelanda', 'I-LOC'),
  ('listesine', 'O'),
  ('32', 'O'),
  ('numaradan', 'O'),
  ('giriş', 'O'),
  ('yapmış', 'O'),
  ('ve', 'O'),
  ('8', 'O'),
  ('numaraya', 'O'),
  ('çıkmıştır', 'O'),
  ('.', 'O')],
 [])