In [1]:
import os
import zipfile
import torch
import pandas as pd
from transformers import AlbertModel, BertTokenizer, BertForTokenClassification
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F

In [2]:
# Root folder holding the zipped corpora; get_data_path() builds its paths from it.
data_root = './data/'
local_zip = data_root + 'ner_datasets.zip'

# Unpack the archive next to itself. The context manager guarantees the
# archive handle is closed even when extraction fails part-way through.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall(data_root)


In [3]:
# Pretrained checkpoint identifier handed to the tokenizer and the model.
model_name = "clue/albert_chinese_tiny"
# Output location for the fine-tuned model (not referenced by the visible cells).
saved_model = "./models/ner_albert_chinese"

# max_len: fixed sequence length used by both the sentence and label tokenizers.
# batch_size: mini-batch size used by the DataLoaders.
max_len, batch_size = 150, 32


In [4]:
def get_data_path(dataset='msra', type='train'):
    """Return the sentence and label file paths for one split of an NER corpus.

    Args:
        dataset: Corpus name; one of 'msra', 'daily' or 'weibo'.
        type: Split name; one of 'train', 'val' or 'test'.

    Returns:
        Tuple ``(sentences_path, labels_path)`` pointing at ``sentences.txt``
        and ``labels.txt`` under ``<data_root>ner_datasets/<dataset>/<type>/``.

    Raises:
        ValueError: If ``dataset`` or ``type`` is not one of the supported names.
    """
    valid_types = ('train', 'val', 'test')
    valid_datasets = ('msra', 'daily', 'weibo')
    # Validate first so a bad argument fails before any path is built.
    if type not in valid_types or dataset not in valid_datasets:
        raise ValueError(
            f"data type not in {list(valid_types)} or dataset name not in {list(valid_datasets)}")

    split_dir = os.path.join(data_root + 'ner_datasets/' + dataset, type)
    sentences = os.path.join(split_dir, 'sentences.txt')
    labels = os.path.join(split_dir, 'labels.txt')
    return sentences, labels


In [5]:
class Label_Tokenizer(object):
    """Convert space-separated tag strings into fixed-length lists of tag ids.

    A tag's id is its position in the ``labels`` list given to the constructor.
    Every encoded sequence is exactly ``max_length`` long: one leading and one
    trailing slot plus all right-padding are filled with the id of the 'O' tag.
    """

    def __init__(self, labels, max_length):
        """Build the tag<->id lookup tables.

        Args:
            labels: Sequence of tag names; must contain 'O' for ``tokenize`` to work.
            max_length: Length of every sequence produced by ``tokenize``.
        """
        super().__init__()
        self.size = len(labels)
        self.max_length = max_length
        self.labels_to_ids = {}
        self.ids_to_labels = {}
        for index, name in enumerate(labels):
            self.labels_to_ids[name] = index
            self.ids_to_labels[index] = name

    def tokenize(self, labels):
        """Encode each tag string in ``labels``; returns a list of id lists."""
        return list(map(self._tokenize, labels))

    def _tokenize(self, label):
        """Encode one tag string (``str`` or UTF-8 ``bytes``) to ``max_length`` ids."""
        if hasattr(label, 'decode'):
            label = label.decode('utf-8')
        names = label.split(' ')
        filler = self.encode(['O'])[0]

        # Leave room for the leading and trailing filler slots.
        body = self.encode(names)[:self.max_length - 2]
        framed = [filler, *body, filler]
        # Right-pad with the filler id up to max_length (no-op when already full).
        framed.extend([filler] * (self.max_length - len(framed)))
        return framed

    def encode(self, labels):
        """Map tag names to ids; an unknown tag raises ``KeyError``."""
        return list(map(self.labels_to_ids.__getitem__, labels))

    def decode(self, ids):
        """Map ids back to tag names; an unknown id raises ``KeyError``."""
        return list(map(self.ids_to_labels.__getitem__, ids))


# NER tag vocabulary; a tag's position in this list is its integer id.
labels = [
    'O',
    'B-ORG', 'I-PER', 'B-PER',
    'I-LOC', 'I-ORG', 'B-LOC',
]
label_tokenizer = Label_Tokenizer(max_length=max_len, labels=labels)
# Number of distinct tags; used as num_labels for the classification head.
labels_num = label_tokenizer.size


In [None]:
class Sentence_Tokenizer(object):
    """Wrapper around a pretrained ``BertTokenizer`` with a fixed ``max_length``."""

    def __init__(self, model_name, max_length=128, padded_token=True):
        """Load the tokenizer for ``model_name``.

        Args:
            model_name: Checkpoint name passed to ``BertTokenizer.from_pretrained``.
            max_length: Truncation length (and padding length when padding to max).
            padded_token: Whether ``bert_pack_inputs`` pads every sentence to ``max_length``.
        """
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.padded_token = padded_token
        self.max_length = max_length

    def bert_pack_inputs(self, sentences):
        """Tokenize each sentence individually; returns a list of tokenizer outputs."""
        packed = []
        for sentence in sentences:
            packed.append(self.tokenize(sentence, self.padded_token))
        return packed

    def tokenize(self, sentence, padded_token=True):
        """Tokenize one sentence into PyTorch tensors.

        With ``padded_token`` truthy the tokenizer is asked for
        ``padding='max_length'``; otherwise ``padding=True`` is passed.
        """
        if padded_token:
            padding_mode = 'max_length'
        else:
            padding_mode = True
        return self.tokenizer(
            text=sentence,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding=padding_mode,
            return_tensors="pt",
        )

    def decode(self, tokens):
        """Turn token ids back into text using the wrapped tokenizer."""
        return self.tokenizer.decode(tokens)


# Shared sentence tokenizer; NERDataset reads it as a module-level global.
tokenizer = Sentence_Tokenizer(model_name=model_name, max_length=max_len)


In [None]:
class NERDataset(Dataset):
    """Torch dataset pairing tokenized sentences with encoded label sequences.

    Sentences and labels are read line by line from the files returned by
    ``get_data_path`` for every requested corpus. Sentences are tokenized with
    the module-level ``tokenizer`` and labels with the module-level
    ``label_tokenizer``.
    """

    def __init__(self, type='train', datasets=('msra', 'daily'), max_line=None):
        """Load, concatenate and tokenize the requested corpora.

        Args:
            type: Split name forwarded to ``get_data_path``.
            datasets: Iterable of corpus names, loaded in order.
            max_line: If not None, keep only the first ``max_line`` lines of
                each corpus (applied per corpus, before concatenation).

        Raises:
            ValueError: If a corpus has a different number of sentence and
                label lines, which would misalign every following pair.
        """
        sentences = []
        tags = []
        for dataset in datasets:
            sen_file, labels_file = get_data_path(dataset, type)
            corpus_sentences = self._read_lines(sen_file)
            corpus_tags = self._read_lines(labels_file)
            if len(corpus_sentences) != len(corpus_tags):
                raise ValueError(
                    f"{sen_file} has {len(corpus_sentences)} lines but "
                    f"{labels_file} has {len(corpus_tags)}")
            if max_line is not None:
                corpus_sentences = corpus_sentences[:max_line]
                corpus_tags = corpus_tags[:max_line]
            sentences.extend(corpus_sentences)
            tags.extend(corpus_tags)

        self.x_data = tokenizer.bert_pack_inputs(sentences)
        self.y_data = torch.tensor(label_tokenizer.tokenize(tags)).long()
        self.len = len(tags)

    @staticmethod
    def _read_lines(path):
        """Return the non-empty lines of a UTF-8 text file, without line endings.

        Plain line reading is used instead of ``pd.read_csv(sep="\\n")``: recent
        pandas rejects a newline separator, and CSV parsing would also interpret
        quote characters and convert tokens such as ``NA`` to NaN.
        """
        with open(path, encoding='utf-8') as handle:
            stripped = (raw.rstrip('\r\n') for raw in handle)
            # Blank lines are skipped, as read_csv did by default.
            return [text for text in stripped if text]

    def __getitem__(self, index):
        """Return ``(tokenized_sentence, label_id_tensor)`` for ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Number of (sentence, label) pairs."""
        return self.len

In [None]:
# Build the train/validation datasets; max_line caps the lines taken per corpus.
train_dataset = NERDataset(type='train', max_line=110829)
val_dataset = NERDataset(type='val', max_line=11082)

# Only the training loader shuffles; validation keeps file order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=batch_size)

In [None]:
class BertModel(torch.nn.Module):
    """Module wrapping a pretrained ``BertForTokenClassification`` head.

    The checkpoint (``model_name``) and the number of output labels
    (``labels_num``) are read from module-level variables.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertForTokenClassification.from_pretrained(
            model_name,
            num_labels=labels_num,
        )

    def forward(self, input_id, mask, label):
        """Run the wrapped model with labels supplied.

        Returns whatever the wrapped model returns with ``return_dict=False``;
        ``train_loop`` in this file unpacks it as ``(loss, logits)``.
        """
        return self.bert(
            labels=label,
            attention_mask=mask,
            input_ids=input_id,
            return_dict=False,
        )

In [None]:
def _sum_sample_accuracy(logits, labels, ignore_index=-100):
    """Sum each sample's token accuracy over one batch.

    Positions whose label equals ``ignore_index`` are excluded. A sample with
    no remaining positions contributes 0 instead of NaN.
    """
    total = 0.0
    for sample_logits, sample_labels in zip(logits, labels):
        keep = sample_labels != ignore_index
        if not keep.any():
            continue
        predictions = sample_logits[keep].argmax(dim=1)
        total += (predictions == sample_labels[keep]).float().mean().item()
    return total


def train_loop(model, train_dataloader, val_dataloader, epochs=None, lr=None):
    """Fine-tune ``model`` with SGD and print per-epoch train/validation metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask, labels)`` and
            returning ``(loss, logits)``.
        train_dataloader: Iterable of ``(encoding, labels)`` batches, where
            ``encoding`` has ``'input_ids'`` and ``'attention_mask'`` tensors
            with a singleton dimension 1 that is squeezed away here.
        val_dataloader: Same batch format, used for evaluation only.
        epochs: Number of epochs; defaults to the module-level ``EPOCHS``.
        lr: SGD learning rate; defaults to the module-level ``LEARNING_RATE``.

    Reported loss is the mean of the per-batch losses; reported accuracy is
    the mean over samples of each sample's token accuracy.
    """
    epochs = EPOCHS if epochs is None else epochs
    lr = LEARNING_RATE if lr is None else lr

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch_num in range(epochs):
        model.train()
        total_loss_train = 0.0
        total_acc_train = 0.0
        train_batches = 0
        train_samples = 0

        for train_data, train_label in train_dataloader:
            train_label = train_label.to(device)
            input_ids = train_data['input_ids'].squeeze(1).to(device)
            attention_mask = train_data['attention_mask'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_ids, attention_mask, train_label)
            loss.backward()
            optimizer.step()

            # Loss is counted once per batch, accuracy once per sample.
            total_loss_train += loss.item()
            total_acc_train += _sum_sample_accuracy(logits.detach(), train_label)
            train_batches += 1
            train_samples += logits.shape[0]

        model.eval()
        total_loss_val = 0.0
        total_acc_val = 0.0
        val_batches = 0
        val_samples = 0

        # No gradients are needed for evaluation.
        with torch.no_grad():
            for val_data, val_label in val_dataloader:
                val_label = val_label.to(device)
                input_ids = val_data['input_ids'].squeeze(1).to(device)
                attention_mask = val_data['attention_mask'].squeeze(1).to(device)

                # Use this batch's own loss, not the last training loss.
                val_batch_loss, logits = model(input_ids, attention_mask, val_label)

                total_loss_val += val_batch_loss.item()
                total_acc_val += _sum_sample_accuracy(logits, val_label)
                val_batches += 1
                val_samples += logits.shape[0]

        # max(..., 1) keeps the report well-defined for an empty loader.
        train_loss = total_loss_train / max(train_batches, 1)
        train_accuracy = total_acc_train / max(train_samples, 1)
        val_loss = total_loss_val / max(val_batches, 1)
        val_accuracy = total_acc_val / max(val_samples, 1)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {train_loss: .3f} | Accuracy: {train_accuracy: .3f} | Val_Loss: {val_loss: .3f} | Accuracy: {val_accuracy: .3f}')

# Hyper-parameters that train_loop reads as module-level globals.
EPOCHS = 3
LEARNING_RATE = 3e-5

# Token-classification model to be fine-tuned.
model = BertModel()

In [None]:
# Run the fine-tuning loop on the loaders built above.
train_loop(model=model, train_dataloader=train_loader, val_dataloader=val_loader)

In [None]:
# def train_loop(model, train_dataloader, val_dataloader, epochs=5, lr = 5e-3):
#     use_cuda = torch.cuda.is_available()
#     device = torch.device("cuda" if use_cuda else "cpu")

#     optimizer = optim.SGD(model.parameters(), lr=lr)
#     if use_cuda:
#         model = model.cuda()

#     best_acc = 0
#     best_loss = 1000
#     total_acc_train = 0
#     total_loss_train = 0
#     total_acc_val = 0
#     total_loss_val = 0
    
#     for epoch in range(epochs):
#         for batch_idx, (data, target) in enumerate(train_loader):
#             target = target.to(device)
#             attention_mask = data['attention_mask'].squeeze(1).to(device)
#             input_ids = data['input_ids'].squeeze(1).to(device)
#             optimizer.zero_grad()
#             loss, logits = model(input_ids, attention_mask, target)

#             for i in range(logits.shape[0]):
#               logits_clean = logits[i][target[i] != -100]
#               label_clean = target[i][target[i] != -100]

#               predictions = logits_clean.argmax(dim=1)
#               acc = (predictions == label_clean).float().mean()
#               total_acc_train += acc
#               total_loss_train += loss.item()
            
#             loss.backward()
#             optimizer.step()
        
#         with torch.no_grad():
#             for data, target in val_dataloader:
#                 target = target.to(device)
#                 attention_mask = data['attention_mask'].squeeze(1).to(device)
#                 input_ids = data['input_ids'].squeeze(1).to(device)
#                 loss, logits = model(input_ids, attention_mask, target)

#                 for i in range(logits.shape[0]):

#                     logits_clean = logits[i][target[i] != -100]
#                     label_clean = target[i][target[i] != -100]

#                     predictions = logits_clean.argmax(dim=1)
#                     acc = (predictions == label_clean).float().mean()
#                     total_acc_val += acc
#                     total_loss_val += loss.item()

#         val_accuracy = total_acc_val / len(val_dataloader)
#         val_loss = total_loss_val / len(val_dataloader)
#         print(f'Epochs: {epoch + 1} | Loss: {total_loss_train / len(train_dataloader): .3f} | Accuracy: {total_acc_train / len(train_dataloader): .3f} | Val_Loss: {total_loss_val / len(val_dataloader): .3f} | Accuracy: {total_acc_val / len(val_dataloader): .3f}')
