In [1]:
import os
import zipfile
import torch
from transformers import AlbertModel, BertTokenizerFast
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim


In [2]:
use_gpu = torch.backends.mps.is_available()
device = torch.device('mps' if use_gpu else 'cpu')

In [3]:
data_root = './data/'
local_zip = data_root + 'ner_datasets.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall(data_root)
zip_ref.close()

In [4]:
max_len = 150
batch_size = 32
model_name = 'clue/albert_chinese_tiny'
saved_model = './models/ner_albert_chinese'

In [5]:
def get_data_path(dataset='mara', type='train'):
    data_dir = data_root + 'ner_datasets/' + dataset
    if type in ['train', 'val', 'test'] and dataset in ['msra', 'daily', 'weibo']:
        sentences = os.path.join(data_dir, type, 'sentences.txt')
        labels = os.path.join(data_dir, type, 'labels.txt')
        return sentences, labels
    else:
        raise ValueError(
            "data type not in ['train', 'val', 'test'] or dataset name not in ['msra', 'daily']")


In [6]:
def get_file_lines(filename):
    with open(filename) as file:
        lines = [line.rstrip() for line in file]
        return lines

In [7]:
class Label_Tokenizer(object):
    def __init__(self, labels, max_length):
        super().__init__()
        self.size = len(labels)
        labels_to_ids = {k: v for v, k in enumerate(labels)}
        ids_to_labels = {v: k for v, k in enumerate(labels)}
        self.labels_to_ids = labels_to_ids
        self.ids_to_labels = ids_to_labels
        self.max_length = max_length

    def tokenize(self, labels):
        tokens = [self._tokenize(label) for label in labels]
        return tokens

    def _tokenize(self, label):
        label = label.decode('utf-8') if hasattr(label, 'decode') else label
        labels = [le for le in label.split(' ')]
        special_token = self.encode(['O'])[0]

        tokens = self.encode(labels)
        tokens = tokens[:self.max_length - 2]
        tokens = [special_token] + tokens + [special_token]
        # Add padded TAG tokens
        padding_len = self.max_length - len(tokens)
        tokens = tokens + ([special_token] * padding_len)
        return tokens

    def encode(self, labels):
        return [self.labels_to_ids[label] for label in labels]

    def decode(self, ids):
        return [self.ids_to_labels[id] for id in ids]


labels = ['O', 'B-ORG', 'I-PER', 'B-PER', 'I-LOC', 'I-ORG', 'B-LOC']
label_tokenizer = Label_Tokenizer(labels, max_length=max_len)
num_labels = label_tokenizer.size


In [8]:
class Sentence_Tokenizer(object):
    def __init__(self, model_name, max_length=128, padded_token=True):
        super().__init__()
        self.max_length = max_length
        self.padded_token = padded_token
        self.tokenizer = BertTokenizerFast.from_pretrained(model_name)

    def bert_pack_inputs(self, sentences):
        outputs = [self.tokenize(sentence, self.padded_token) for sentence in sentences]
        return outputs

    def tokenize(self, sentence, padded_token=True):
        padiding = 'max_length' if padded_token else True
        tokens = self.tokenizer(text=sentence, max_length=self.max_length, truncation=True, padding=padiding, add_special_tokens=True, return_tensors="pt")
        return tokens

    def decode(self, tokens):
        words = self.tokenizer.decode(tokens)
        return words


tokenizer = Sentence_Tokenizer(model_name, max_length=max_len)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizerFast'.


In [9]:
class NERDataset(Dataset):
  def __init__(self, type = 'train', datasets = ['msra', 'daily'], max_line = None):
    x_data = None
    y_data = None
    for dataset in datasets:
        sen_file, labels_file = get_data_path(dataset, type)
        sentences = get_file_lines(sen_file)
        labels = get_file_lines(labels_file)
        if(max_line is not None):
          sentences = sentences[:max_line]
          labels = labels[:max_line]
        x_data = sentences if x_data is None else [*x_data, *sentences]
        y_data = labels if y_data is None else [*y_data, *labels]

    self.x_data = tokenizer.bert_pack_inputs(x_data)
    self.y_data = torch.tensor(label_tokenizer.tokenize(y_data)).long()
    self.len = len(y_data)

  def __getitem__(self, index):
    return self.x_data[index], self.y_data[index]
  
  def __len__(self):
    return self.len

In [14]:
train_dataset = NERDataset('train')
test_dataset = NERDataset('val')
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [10]:
class Net(torch.nn.Module):
    def __init__(self, num_labels):
         super(Net, self).__init__()
         self.num_labels = num_labels
         self.encoder = AlbertModel.from_pretrained(model_name)
         self.bert_drop = torch.nn.Dropout(0.1)
         self.liner1 = torch.nn.Linear(312, 128)
         self.liner2 = torch.nn.Linear(128, num_labels)

    def forward(self, input_ids, token_type_ids, attention_mask):
         outputs = self.encoder(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
         logits = outputs[0] #(32,150, 312)
         logits = self.bert_drop(logits)
         logits = self.liner1(logits)
         logits = self.liner2(logits)
         return logits

model = Net(num_labels).to(device)


Some weights of the model checkpoint at clue/albert_chinese_tiny were not used when initializing AlbertModel: ['predictions.dense.bias', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-5)

def loss_fn(outputs, labels, num_labels):
    return criterion(outputs.view(-1, num_labels), labels.view(-1))


In [22]:
def train(epoch):
  running_loss = 0.0
  for i, data in enumerate(train_loader, 0):
    inputs, target = data
    input_ids = inputs['input_ids'].squeeze(1).to(device)
    token_type_ids = inputs['token_type_ids'].squeeze(1).to(device)
    attention_mask = inputs['attention_mask'].squeeze(1).to(device)

    target = target.long().to(device)
    optimizer.zero_grad()

    logits = model(input_ids,token_type_ids, attention_mask)
    loss = loss_fn(logits, target, model.num_labels)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()

    if i % 300 == 299:
      print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 300))
      running_loss = 0.0

In [23]:
def accuracy_fn(logits, labels): #(32, 150, 7) (32,150)
    total = 0
    correct = 0

    batch_size = logits.size(0)
  
    for i in range(batch_size):
        predicted = logits[i].argmax(dim=1)[labels[i] != 0]

        label = labels[i][labels[i] != 0]
        total += label.size(0)
        correct += (predicted == label).sum().item()
    return correct / total
        

In [24]:
def test():
  total_acc = 0
  with torch.no_grad():
    for data in test_loader:
      inputs, labels = data
      input_ids = inputs['input_ids'].squeeze(1).to(device)
      token_type_ids = inputs['token_type_ids'].squeeze(1).to(device)
      attention_mask = inputs['attention_mask'].squeeze(1).to(device)

      labels = labels.long().to(device)
      
      logits = model(input_ids,token_type_ids, attention_mask)
      
      acc = accuracy_fn(logits, labels)
      total_acc += acc

  print('Accuracy of the network on the test inputs: %d %%' % (100 * total_acc / len(test_loader)))

In [25]:
for epoch in range(3):
  train(epoch)
  test()

[1,   300] loss: 0.433
[1,   600] loss: 0.158
[1,   900] loss: 0.144
[1,  1200] loss: 0.134
[1,  1500] loss: 0.125
[1,  1800] loss: 0.122
Accuracy of the network on the test inputs: 17 %
[2,   300] loss: 0.115
[2,   600] loss: 0.107
[2,   900] loss: 0.106
[2,  1200] loss: 0.103
[2,  1500] loss: 0.101
[2,  1800] loss: 0.097
Accuracy of the network on the test inputs: 16 %
[3,   300] loss: 0.090
[3,   600] loss: 0.090
[3,   900] loss: 0.089
[3,  1200] loss: 0.088
[3,  1500] loss: 0.084
[3,  1800] loss: 0.084
Accuracy of the network on the test inputs: 16 %


In [26]:
def align_word_ids(sentence):
  
    tokenized_inputs = tokenizer.tokenize(sentence)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(0)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(0)
        else:
            try:
                label_ids.append(1)
            except:
                label_ids.append(0)
        previous_word_idx = word_idx

    return label_ids

def evaluate_one_text(model, sentence):
    text = tokenizer.tokenize(sentence)

    with torch.no_grad():
        input_id = text['input_ids'].to(device)
        type_id = text['token_type_ids'].to(device)
        mask = text['attention_mask'].to(device)

        label_ids = torch.Tensor(align_word_ids(sentence)).to(device)

        logits = model(input_id, type_id, mask)
        predictions = logits[0].argmax(dim=1)
        predictions = predictions[label_ids != 0]

        input_ids = input_id[0][label_ids != 0].tolist()
        prediction_label = label_tokenizer.decode(predictions.tolist())
    
        res = []
        words = {
            'word': '',
            'tag': None
        }
        for idx, tag in enumerate(prediction_label):
            if(tag != 'O'):
                _, suf = tag.split('-')
                words['tag'] = suf
                token = input_ids[idx]
                word = tokenizer.decode(token)
                words['word'] = words['word'] + word if words['word'] else word
            else:
                if(words['tag']):
                    res.append(words)
                words = {
                    'word': '',
                    'tag': None
                }
        return res

In [27]:
test_input= '李华住在朝阳区香河园街道西坝河北里社区，在5月4号去过天安门广场，5号下午去了太阳宫凯德茂商场。'

evaluate_one_text(model, test_input)

[{'word': '李华住在朝阳区香河园街道西坝河北里社区，在5', 'tag': 'ORG'},
 {'word': '4号去', 'tag': 'ORG'}]

In [127]:
torch.save(model.state_dict(), './model/model.pt')