In [8]:
import pandas as pd
import numpy as np
import math
import torch
import keras
import transformers
import torch.nn.functional as F
from seqeval.metrics import f1_score, classification_report, accuracy_score, f1_score

In [9]:
import os
from tqdm import tqdm, trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
#from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig, BertForTokenClassification, AdamW

In [10]:
from keras_preprocessing.sequence import pad_sequences

# Loading and Preprocessing the data

In [13]:

data_path = '../data/'
df = pd.read_csv(data_path + 'ner_dataset.zip', encoding="latin1")

In [14]:
df.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


We see NAN values under 'Sentence #' column but they simply belong to the sentence number above it, so let's
tie them up in their respective sentences

In [15]:
df.fillna(method='ffill')

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


In [16]:
df.columns

Index(['Sentence #', 'Word', 'POS', 'Tag'], dtype='object')

In [17]:
df['Tag'].dropna(inplace=True)

In [18]:
df['Word'].dropna(inplace=True)

In [19]:
df['POS'].dropna(inplace=True)

In [20]:
# convert the column to string
df['Word'] = df['Word'].astype(str)

In [21]:
# convert the column to string
df['Tag'] = df['Tag'].astype(str)

# Parse data into document structure

In [23]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        
        
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [24]:
# Get full document data structure
getter = SentenceGetter(df)

In [25]:
# get sentence data
sentences = [[s for s in sent] for sent in getter.sentences]
sentences[0:2]

[[('Thousands', 'NNS', 'O')], [('Iranian', 'JJ', 'B-gpe')]]

In [26]:
# get pos data
poses = [[s[1] for s in sent] for sent in getter.sentences]
print(poses[0])

['NNS']


In [27]:
print(poses[0:2])

[['NNS'], ['JJ']]


In [28]:
# get tag label data
labels = [[s[2] for s in sent] for sent in getter.sentences]
print(labels[0])

['O']


# Make tag name into index for for training

In [29]:
# set gives unique values
tags_vals = list(set(df["Tag"].values))

In [30]:
tags_vals

['B-nat',
 'B-art',
 'I-per',
 'I-geo',
 'I-art',
 'I-gpe',
 'I-tim',
 'I-org',
 'O',
 'I-eve',
 'B-org',
 'B-eve',
 'B-per',
 'I-nat',
 'B-gpe',
 'B-geo',
 'B-tim']

In [31]:
# Add X label for word piece support
# Add [CLS] and [SEP] as BERT need
tags_vals.append('X')
tags_vals.append('[CLS]')
tags_vals.append('[SEP]')

In [32]:
tags_vals

['B-nat',
 'B-art',
 'I-per',
 'I-geo',
 'I-art',
 'I-gpe',
 'I-tim',
 'I-org',
 'O',
 'I-eve',
 'B-org',
 'B-eve',
 'B-per',
 'I-nat',
 'B-gpe',
 'B-geo',
 'B-tim',
 'X',
 '[CLS]',
 '[SEP]']

In [33]:
tags_vals = set(tags_vals)

In [34]:
tags_vals

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O',
 'X',
 '[CLS]',
 '[SEP]'}

In [36]:
# set a dict for mapping id to tag name
# Let's set it by manual way as it is good for reusing
tag2idx = {
    'B-art': 14,
    'B-eve':16,
    'B-geo':0,
    'B-gpe':13,
    'B-nat':12,
    'B-org':10,
    'B-per':4,
    'B-tim':2,
    'I-art': 5,
    'I-eve':7,
    'I-geo':15,
    'I-gpe':8,
    'I-nat':11,
    'I-org':3,
    'I-per':6,
    'I-tim':1,
    'X':17,
    'O':9,
    '[CLS]':18,
    '[SEP]':19
}

In [37]:
tag2idx

{'B-art': 14,
 'B-eve': 16,
 'B-geo': 0,
 'B-gpe': 13,
 'B-nat': 12,
 'B-org': 10,
 'B-per': 4,
 'B-tim': 2,
 'I-art': 5,
 'I-eve': 7,
 'I-geo': 15,
 'I-gpe': 8,
 'I-nat': 11,
 'I-org': 3,
 'I-per': 6,
 'I-tim': 1,
 'X': 17,
 'O': 9,
 '[CLS]': 18,
 '[SEP]': 19}

In [38]:
# Mapping idx to name
tag2name = {tag2idx[key] : key for key in tag2idx.keys()}

In [39]:
tag2name

{14: 'B-art',
 16: 'B-eve',
 0: 'B-geo',
 13: 'B-gpe',
 12: 'B-nat',
 10: 'B-org',
 4: 'B-per',
 2: 'B-tim',
 5: 'I-art',
 7: 'I-eve',
 15: 'I-geo',
 8: 'I-gpe',
 11: 'I-nat',
 3: 'I-org',
 6: 'I-per',
 1: 'I-tim',
 17: 'X',
 9: 'O',
 18: '[CLS]',
 19: '[SEP]'}

In [40]:
#Make training data
#Make raw data into trainable data for BERT:
#set gpu environment
#Load tokenizer and tokenize
#set 3 embedding: token embedding, mask word embedding, segmentation embedding
#Split data set into train and validate, then send them to dataloader

In [41]:
# set up gpu environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [42]:
device

device(type='cpu')

In [43]:
n_gpu

0

# Load Tokenizer

In [44]:
# load tokenizer file into local folder
#vocab.txt, download from: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt
vocabulary = "../data/vocab.txt"

In [45]:
# Length of the model must not be bigger than the training model
# See model's 'max_position_embeddings' = 512
max_len = 45

In [46]:
# load tokenizer, with manual file address ( we could also do it from pretraied address)
tokenizer = BertTokenizer(vocab_file = vocabulary, do_lower_case=True)

# Tokenizer
We need to adjust the labels baed on the tokenize result, "##abc" need to set label "X" 

We also need to set "[CLS]" at front and "[SEP]" at the end

In [47]:
tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list, label in (zip(sentences, labels)):
    temp_label = []
    temp_token = []
    
    # Add [CLS] at the front
    temp_label.append('[CLS]')
    temp_token.append('[CLS]')
    
    for word, lab in zip(word_list, label):
        token_list = tokenizer.tokenize(word)
        for m,token in enumerate(token_list):
            temp_token.append(token)
            if m==0:
                temp_label.append(lab)
            else:
                temp_label.append('X')
                
    # Add [SEP] at the end
    temp_label.append('[SEP]')
    temp_token.append('[SEP]')
    
    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_label)
    
    if 5 > i_inc:
        print("No.%d,len:%d"%(i_inc, len(temp_token)))
        print("texts:%s"%(' '.join(temp_token)))
        print("No.%d,len:%d"%(i_inc, len(temp_label)))
        print("lables:%s"%(' '.join(temp_label)))
    i_inc +=1

TypeError: expected string or bytes-like object