In [1]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

texts, tags = read_wnut('wnut17train.conll')

In [2]:
print(texts[0][10:17], tags[0][10:17], sep='\n')

['for', 'two', 'weeks', '.', 'Empire', 'State', 'Building']
['O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location']


In [3]:
len(texts)

3394

In [4]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

In [5]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [6]:
id2tag

{0: 'B-person',
 1: 'B-corporation',
 2: 'I-person',
 3: 'B-location',
 4: 'I-location',
 5: 'I-corporation',
 6: 'B-group',
 7: 'B-creative-work',
 8: 'I-creative-work',
 9: 'B-product',
 10: 'I-group',
 11: 'I-product',
 12: 'O'}

In [7]:
max_length=100

In [8]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
train_encodings = tokenizer(train_texts, is_split_into_words=True, padding='max_length', truncation=True,max_length=max_length)
val_encodings = tokenizer(val_texts, is_split_into_words=True, padding='max_length', truncation=True,max_length=max_length)

In [9]:
assert(max_length==len(train_encodings['input_ids'][0]))
assert(max_length==len(val_encodings['input_ids'][0]))

In [10]:
import numpy as np

def encode_tags(tags):
    encoded_labels =[]
    for doc in tags:
        temp = [tag2id[tag] for tag in doc]
        temp.extend([4]*max_length)
        temp = temp[:max_length]
        encoded_labels.append(temp)
    return encoded_labels

train_labels = encode_tags(train_tags)
val_labels = encode_tags(val_tags)

In [11]:
assert(len(train_labels[2])==max_length)

In [12]:
import tensorflow as tf
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

In [13]:
len(unique_tags)

13

In [15]:
from transformers import TFBertForTokenClassification
model1 = TFBertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_tags))


optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model1.compile(optimizer=optimizer, loss=model1.compute_loss) # can also use any keras loss fn
model1.fit(train_dataset.shuffle(100).batch(16), epochs=5, batch_size=16,validation_data=val_dataset)

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
 28/170 [===>..........................] - ETA: 32s - loss: 0.0439

KeyboardInterrupt: 

In [16]:
model1.summary()

Model: "tf_bert_for_token_classification_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  107719680 
_________________________________________________________________
dropout_75 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  9997      
Total params: 107,729,677
Trainable params: 107,729,677
Non-trainable params: 0
_________________________________________________________________


In [17]:
sample_sentense = "i stayed for two weeks in Empire State Building"
print(sample_sentense)
print(sample_sentense.split())
inputs = tokenizer.encode(sample_sentense, return_tensors="tf")
outputs = model1(inputs)
outputs = outputs[0]
predictions = tf.argmax(outputs, axis=2)
np.vectorize(lambda x: id2tag[x])(predictions.numpy()).tolist()

i stayed for two weeks in Empire State Building
['i', 'stayed', 'for', 'two', 'weeks', 'in', 'Empire', 'State', 'Building']


[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-location',
  'O',
  'O',
  'I-location',
  'I-location']]

In [18]:
inputs

<tf.Tensor: shape=(1, 11), dtype=int32, numpy=array([[ 101,  178, 3523, 1111, 1160, 2277, 1107, 2813, 1426, 4334,  102]])>

In [21]:
tokenizer.decode([ 101,  178, 3523, 1111, 1160, 2277, 1107, 2813, 1426, 4334,  102])

'[CLS] i stayed for two weeks in Empire State Building [SEP]'

In [23]:
len('[CLS] i stayed for two weeks in Empire State Building [SEP]'.split())

11