In [1]:
import pandas as pd
import numpy as np

import joblib
import torch
import pickle as pkl

from sklearn import preprocessing
from sklearn import model_selection

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import config
import dataset
import engine
from model import Model

TRAINING_FILE = "./ner_dataset.csv"

MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 1  # 10
BASE_MODEL_PATH = "./bert-base-uncased"
MODEL_PATH = "model.bin"
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BASE_MODEL_PATH,
    do_lower_case=True
)

# 1. Train

## 1.1 Preprocessing

In [2]:
df = pd.read_csv(TRAINING_FILE, encoding="latin-1")#[:150_000]
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O


In [3]:
df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


In [4]:
enc_pos = preprocessing.LabelEncoder()
enc_tag = preprocessing.LabelEncoder()
df.loc[:, "POS"] = enc_pos.fit_transform(df["POS"])
df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,19,16
1,Sentence: 1,of,10,16
2,Sentence: 1,demonstrators,19,16
3,Sentence: 1,have,35,16
4,Sentence: 1,marched,34,16
...,...,...,...,...
1048570,Sentence: 47959,they,22,16
1048571,Sentence: 47959,responded,32,16
1048572,Sentence: 47959,to,29,16
1048573,Sentence: 47959,the,7,16


In [5]:
sentences = df.groupby("Sentence #")["Word"].apply(list).values
pos = df.groupby("Sentence #")["POS"].apply(list).values
tag = df.groupby("Sentence #")["Tag"].apply(list).values

## 1.2 Training model

In [6]:
meta_data = {
    "enc_pos": enc_pos,
    "enc_tag": enc_tag
}

with open("meta.bin", "wb") as fp:
    pkl.dump(meta_data, fp)

In [7]:
n_pos = len(list(enc_pos.classes_))
n_tag = len(list(enc_tag.classes_))

(
    train_sentences, test_sentences,
    train_pos,test_pos,
    train_tag, test_tag
) = model_selection.train_test_split(
    sentences, pos, tag, random_state=1337, test_size=0.1
)

train_dataset = dataset.Dataset(
    texts=train_sentences, pos=train_pos, tags=train_tag,
    tokenizer=TOKENIZER, max_len=MAX_LEN
)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4
)

valid_dataset = dataset.Dataset(
    texts=test_sentences, pos=test_pos, tags=test_tag,
    tokenizer=TOKENIZER, max_len=MAX_LEN
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
)

In [8]:
device = torch.device("cuda")
model = Model(BASE_MODEL_PATH, n_tag=n_tag, n_pos=n_pos)
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

num_train_steps = int(
    len(train_sentences) 
    / 
    TRAIN_BATCH_SIZE * EPOCHS
)

optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
)

Some weights of the model checkpoint at ./bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
best_loss = np.inf
for epoch in range(config.EPOCHS):
    train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
    test_loss = engine.eval_fn(valid_data_loader, model, device)
    print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
    if test_loss < best_loss:
        torch.save(model.state_dict(), config.MODEL_PATH)
        best_loss = test_loss

100%|██████████| 1349/1349 [10:42<00:00,  2.10it/s]
100%|██████████| 600/600 [00:32<00:00, 18.69it/s]


Train Loss = 0.245129476937212 Valid Loss = 0.11209187394939363


# Prediction

In [11]:
with open("meta.bin","rb") as fp:
    meta_data = joblib.load(fp)
enc_pos = meta_data["enc_pos"]
enc_tag = meta_data["enc_tag"]

n_pos = len(list(enc_pos.classes_))
n_tag = len(list(enc_tag.classes_))

sentence = "Ivan visited Vietnam a week ago. He took a room in hotel Matiott."
sentence_tok = sentence.split()
tokenized_sentence = TOKENIZER.encode(sentence_tok)

print(sentence)
print(TOKENIZER.tokenize(sentence))
print(tokenized_sentence)

test_dataset = dataset.Dataset(
    texts=[sentence_tok], 
    pos=[[0] * len(sentence_tok)], 
    tags=[[0] * len(sentence_tok)],
    tokenizer=TOKENIZER, 
    max_len=MAX_LEN
)

Ivan visited Vietnam a week ago. He took a room in hotel Matiott.
['ivan', 'visited', 'vietnam', 'a', 'week', 'ago', '.', 'he', 'took', 'a', 'room', 'in', 'hotel', 'mat', '##iot', '##t', '.']
[101, 100, 4716, 100, 1037, 2733, 100, 100, 2165, 1037, 2282, 1999, 3309, 100, 102]


In [12]:
device = torch.device("cuda")
model = Model(BASE_MODEL_PATH, n_tag=n_tag, n_pos=n_pos)
model.load_state_dict(torch.load(config.MODEL_PATH))
model.to(device)

with torch.no_grad():
    data = test_dataset[0]
    for k, v in data.items():
        data[k] = v.to(device).unsqueeze(0)
    tag, pos, _ = model(**data)

    print(
        enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[1:len(tokenized_sentence)-1]
    )
    print(
        enc_pos.inverse_transform(
            pos.argmax(2).cpu().numpy().reshape(-1)
        )[1:len(tokenized_sentence)-1]
    )

Some weights of the model checkpoint at ./bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['B-per' 'O' 'B-geo' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'B-org']
['NNP' 'VBD' 'NNP' 'DT' 'NN' 'RB' ',' 'PRP' 'VBD' 'DT' 'NN' 'IN' 'NNP']


# Conclusion
We used only 1 epoch, but we got quite sensible result. So fine tuning helps to get higher quality.