In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [2]:
from torch import cuda

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# Load the token-level annotations: one row per word, with its sentence number and tag.
# NOTE(review): read_csv's default NA parsing can turn literal tokens such as
# "null" or "NA" into NaN — confirm whether the rows with a missing Word that are
# dropped later are genuinely empty in the CSV.
train_data = pd.read_csv("tag_data.csv", encoding='unicode_escape')
# Preview the first rows.
train_data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,1,Another,0,O
1,1,violent,0,T
2,1,and,0,T
3,1,aggressive,0,T
4,1,immigrant,0,T


In [4]:
train_data[train_data.isna().any(axis=1)]

Unnamed: 0,Sentence #,Word,POS,Tag
36149,901,,0,O
131944,3202,,0,O


In [5]:
# Discard every row that contains a missing value (rebinding instead of mutating in place).
train_data = train_data.dropna(axis=0)

In [6]:
train_data.shape

(329094, 4)

In [7]:
train_data[train_data['Sentence #'] == 1]

Unnamed: 0,Sentence #,Word,POS,Tag
0,1,Another,0,O
1,1,violent,0,T
2,1,and,0,T
3,1,aggressive,0,T
4,1,immigrant,0,T
5,1,killing,0,O
6,1,a,0,O
7,1,innocent,0,O
8,1,and,0,O
9,1,intelligent,0,O


In [8]:
# Two-way mapping between the tag strings in the data and the integer class ids
# used as training labels.
tag2id = dict(O=0, T=1)
id2tag = dict(zip(tag2id.values(), tag2id.keys()))

In [9]:
# Build one token list and one tag list per sentence.
# A single groupby pass replaces re-filtering the whole frame (plus iterrows)
# once per sentence id, whose cost grew with sentences x rows.
sentence_ids = range(1, max(train_data['Sentence #']) + 1)
rows_by_sentence = train_data.groupby('Sentence #', sort=False)
words_by_sentence = rows_by_sentence['Word'].agg(list)
tags_by_sentence = rows_by_sentence['Tag'].agg(list)

# Walk the ids 1..max in order; an id with no remaining rows still contributes
# an empty list, so texts[k] and tags[k] keep describing sentence k + 1.
texts = [words_by_sentence.get(i, []) for i in sentence_ids]
tags = [tags_by_sentence.get(i, []) for i in sentence_ids]

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentences for validation. Fixing random_state makes the
# split (and therefore every downstream metric) reproducible across kernel
# restarts; without it each run trains and validates on different sentences.
train_texts, val_texts, train_tags, val_tags = train_test_split(
    texts, tags, test_size=.2, random_state=42
)

In [11]:
# Length (in sub-tokens) every example is padded/truncated to by the tokenizer.
MAX_LEN = 128
# Batch size of the training DataLoader.
TRAIN_BATCH_SIZE = 4
# Batch size of the validation DataLoader.
VALID_BATCH_SIZE = 2
# Number of passes over the training loader.
EPOCHS = 3
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 1e-05
# max_norm passed to clip_grad_norm_ inside train().
MAX_GRAD_NORM = 10

In [14]:
from transformers import DistilBertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Both splits are tokenized with identical settings: the inputs are already
# split into words, offsets are requested so labels can be aligned to
# sub-tokens later, and every example is padded/truncated to MAX_LEN.
encoding_options = dict(
    is_split_into_words=True,
    return_offsets_mapping=True,
    truncation=True,
    padding='max_length',
    max_length=MAX_LEN,
)
train_encodings = tokenizer(train_texts, **encoding_options)
val_encodings = tokenizer(val_texts, **encoding_options)

In [15]:
def encode_tags(tags, encodings, tag_to_id=None):
    """Align word-level tags with the sub-token positions of ``encodings``.

    Args:
        tags: one list of tag strings per document (one tag per word).
        encodings: object exposing ``offset_mapping``: for every document, a
            sequence of ``(start, end)`` pairs, one per sub-token position.
        tag_to_id: optional mapping from tag string to integer id. Defaults to
            the notebook-level ``tag2id``.

    Returns:
        One list of ints per document, as long as that document's offset
        list. Positions whose offset starts at 0 and has a non-zero end
        receive the next tag id in order; every other position is -100.

    Raises:
        KeyError: if a tag is missing from the mapping.

    Note:
        Treating "start == 0 and end != 0" as "first sub-token of a word"
        assumes the offsets are relative to each word (inputs tokenized as
        pre-split words) — TODO confirm for other tokenizer settings.
    """
    if tag_to_id is None:
        tag_to_id = tag2id

    encoded_labels = []
    for doc_tags, doc_offset in zip(tags, encodings.offset_mapping):
        doc_labels = [tag_to_id[tag] for tag in doc_tags]
        arr_offset = np.array(doc_offset)

        # Start with every position ignored (-100) ...
        doc_enc_labels = np.full(len(doc_offset), -100, dtype=int)

        # ... then label only positions whose offset is (0, non-zero).
        # The mask is computed once and reused for both the count and the write.
        first_subtoken = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)
        n_slots = int(first_subtoken.sum())

        # Truncation can leave fewer slots than words: surplus tags are dropped.
        doc_enc_labels[first_subtoken] = doc_labels[:n_slots]
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

    
class dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-token labels.

    ``encodings`` is a mapping whose values are indexable by example position;
    ``labels`` holds one label sequence per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Turn each encoding field of example ``idx`` into a tensor, then
        # attach that example's labels under the 'labels' key.
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [16]:
# Build the per-position label ids for both splits. This has to run while the
# encodings still contain "offset_mapping" (it is popped in the next cell),
# because encode_tags reads encodings.offset_mapping.
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [17]:
# offset_mapping was only needed to build the labels and must not be passed to
# the model. Popping with a default keeps this cell re-runnable: a bare
# pop("offset_mapping") raises KeyError the second time the cell is executed.
train_encodings.pop("offset_mapping", None)
val_encodings.pop("offset_mapping", None)

train_dataset = dataset(train_encodings, train_labels)
val_dataset = dataset(val_encodings, val_labels)

In [18]:
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,   # reshuffle the training examples every epoch
    'num_workers': 0,
}

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    # Keep validation order fixed: valid() averages per-batch accuracies, so
    # shuffling changes which examples share a batch and makes the reported
    # accuracy (and the periodic loss prints) vary from run to run.
    'shuffle': False,
    'num_workers': 0,
}

training_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(val_dataset, **val_params)

In [19]:
# Load pretrained 'bert-base-uncased' with a 2-label token-classification head
# (one label per entry of tag2id) and move it to the selected device.
# The bare last expression displays the model's module tree.
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [20]:
# Adam over every parameter of the model, using the LEARNING_RATE constant.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [21]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    """Run one pass over ``training_loader`` and return the per-step losses.

    Uses the notebook-level ``model``, ``training_loader``, ``optimizer``,
    ``device`` and ``MAX_GRAD_NORM``.

    Args:
        epoch: index of the current epoch; accepted for the caller's
            convenience and not used inside the function.

    Returns:
        list[float]: the loss of every training step, in order.

    Side effects:
        Updates the model weights, prints the running mean loss every 100
        steps, and prints the epoch's mean loss and mean per-batch accuracy.
    """
    loss_history = []
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        # With labels supplied, the first output is the loss and the second the logits.
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]
        tr_logits = outputs[1]

        step_loss = loss.item()
        tr_loss += step_loss
        loss_history.append(step_loss)
        nb_tr_steps += 1

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy on the positions that carry a real label
        flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)

        # -100 marks positions that must be ignored (see encode_tags)
        active_positions = flattened_targets != -100
        active_targets = torch.masked_select(flattened_targets, active_positions)
        active_predictions = torch.masked_select(flattened_predictions, active_positions)

        tmp_tr_accuracy = accuracy_score(
            active_targets.cpu().numpy(), active_predictions.cpu().numpy()
        )
        tr_accuracy += tmp_tr_accuracy

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping has to sit between backward() and step(): clipping
        # before zero_grad()/backward() only rescales the previous step's
        # gradients, which are then discarded, leaving the applied ones unclipped.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
    epoch_loss = tr_loss / max(nb_tr_steps, 1)
    tr_accuracy = tr_accuracy / max(nb_tr_steps, 1)
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    return loss_history

And let's train the model!

In [None]:
loss_history = []

In [22]:

# Start from an empty history inside this cell so it does not depend on a list
# created by another cell, and so the recorded losses always correspond to the
# epochs run by this execution of the cell.
loss_history = []
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    # train() returns the list of per-step losses for this epoch.
    epoch_losses = train(epoch)
    loss_history.extend(epoch_losses)

Training epoch: 1
Training loss per 100 training steps: 0.6734738945960999
Training loss per 100 training steps: 0.3195598431595481
Training loss per 100 training steps: 0.2867564552681363
Training loss per 100 training steps: 0.2581686050788905
Training loss per 100 training steps: 0.24718341406927144
Training loss per 100 training steps: 0.24369145323059516
Training loss per 100 training steps: 0.23753200526567744
Training loss per 100 training steps: 0.23734472431614465
Training loss per 100 training steps: 0.23790080263540986
Training loss per 100 training steps: 0.23596234074674885
Training loss per 100 training steps: 0.23263676130732933
Training loss per 100 training steps: 0.22956223726665076
Training loss per 100 training steps: 0.22499417996252705
Training loss per 100 training steps: 0.2256259765077975
Training loss per 100 training steps: 0.2216354648713594
Training loss per 100 training steps: 0.21857728600889187
Training loss epoch: 0.2177516130814962
Training accuracy ep

In [23]:
len(loss_history)

4764

In [26]:
# Persist the per-step training losses as a single column, without index or header.
tr_loss = pd.DataFrame(loss_history)
tr_loss.to_csv('bert_tr_loss.csv',index=None,header=None)

#### **Evaluating the model**

In [22]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without updating its weights.

    Uses the notebook-level ``device``.

    Args:
        model: token-classification model returning ``(loss, logits, ...)``
            when called with ``labels``.
        testing_loader: iterable of batches with 'input_ids',
            'attention_mask' and 'labels' tensors.

    Returns:
        tuple: ``(labels, predictions)`` — two 1-D tensors holding the true
        and predicted class ids of every position whose label is not -100,
        concatenated over all batches of the loader.

    Side effects:
        Prints the running mean loss every 100 steps, then the mean loss and
        the mean of the per-batch accuracies.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, eval_logits = outputs[0], outputs[1]
            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)

            # only positions with a real label (not -100) are scored
            active_positions = flattened_targets != -100
            batch_labels = torch.masked_select(flattened_targets, active_positions)
            batch_predictions = torch.masked_select(flattened_predictions, active_positions)

            # Keep one tensor per batch; they are concatenated after the loop.
            eval_labels.append(batch_labels)
            eval_preds.append(batch_predictions)

            tmp_eval_accuracy = accuracy_score(
                batch_labels.cpu().numpy(), batch_predictions.cpu().numpy()
            )
            eval_accuracy += tmp_eval_accuracy

    # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
    eval_loss = eval_loss / max(nb_eval_steps, 1)
    eval_accuracy = eval_accuracy / max(nb_eval_steps, 1)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    # Return the results of the whole loader. The per-batch variables were
    # previously returned here, which exposed only the final batch.
    if eval_labels:
        all_labels = torch.cat(eval_labels)
        all_predictions = torch.cat(eval_preds)
    else:
        all_labels = torch.empty(0, dtype=torch.long)
        all_predictions = torch.empty(0, dtype=torch.long)
    return all_labels, all_predictions

As we can see below, performance is quite good! Accuracy on the held-out validation set is > 93%.

In [23]:
labels, predictions = valid(model, validation_loader)

Validation loss per 100 evaluation steps: 0.07867597043514252
Validation loss per 100 evaluation steps: 0.10876934364032333
Validation loss per 100 evaluation steps: 0.12304500615993394
Validation loss per 100 evaluation steps: 0.12523787105425055
Validation loss per 100 evaluation steps: 0.12634659503026868
Validation loss per 100 evaluation steps: 0.1277940470163902
Validation loss per 100 evaluation steps: 0.1307122356765059
Validation loss per 100 evaluation steps: 0.13215072208035009
Validation Loss: 0.13303520963515533
Validation Accuracy: 0.9558415829023744


#### **Saving the model**

In [None]:
import os

directory = "model/"

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

# save vocabulary of the tokenizer
tokenizer.save_vocabulary(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')

# Testing


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [2]:
from torch import cuda

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# NOTE(review): DistilBertTokenizerFast is imported but not used by any active
# code visible in this notebook.
from transformers import DistilBertTokenizerFast
# The tokenizer is loaded from the 'bert-base-uncased' checkpoint name ...
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# ... while the model weights are loaded from the 'model' path (the saving cell
# of the training part writes to "model/").
model = BertForTokenClassification.from_pretrained('model', num_labels=2)
model.to(device)


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [4]:
def test(model, sentences):
    """Predict one integer label per character for every sentence.

    Uses the notebook-level ``tokenizer``, ``device`` and ``MAX_LEN``
    (``MAX_LEN`` is looked up when the function is called).

    Args:
        model: token-classification model whose first output, when called
            without labels, is the logits tensor.
        sentences: iterable of strings.

    Returns:
        list[numpy.ndarray]: one int array per sentence, as long as the
        sentence. All characters of a word receive the class predicted for
        that word's first sub-token. Characters not covered by a labelled
        word (including text cut off by truncation to ``MAX_LEN`` tokens)
        stay 0.
    """
    # put model in evaluation mode
    model.eval()
    special_tokens = ('[CLS]', '[SEP]', '[PAD]')
    char_predictions_batch = []

    for sentence in sentences:
        char_predictions = np.zeros((len(sentence),), dtype=int)
        inputs = tokenizer(sentence,
                           return_offsets_mapping=True,
                           padding='max_length',
                           truncation=True,
                           max_length=MAX_LEN,
                           return_tensors="pt")
        # move to gpu
        ids = inputs["input_ids"].to(device)
        mask = inputs["attention_mask"].to(device)

        # forward pass; no_grad skips autograd bookkeeping that inference never uses
        with torch.no_grad():
            outputs = model(ids, attention_mask=mask)
        logits = outputs[0]

        active_logits = logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
        # predictions at the token level, shape (batch_size * seq_len,)
        token_predictions = torch.argmax(active_logits, dim=1).cpu().numpy()

        # Fetch the token strings and word ids once per sentence instead of
        # rebuilding both lists for every token inside the loop.
        tokens = inputs.tokens()
        word_ids = inputs.word_ids()

        for i, token_prediction in enumerate(token_predictions):
            word_index = word_ids[i]
            # Skip special/padding tokens, and any token with no source word.
            if tokens[i] in special_tokens or word_index is None:
                continue

            token_start, _ = inputs.token_to_chars(i)
            word_start, word_end = inputs.word_to_chars(word_index)
            if token_start != word_start:
                continue  # 2nd or later sub-token of the word

            # The first sub-token's prediction labels the whole word.
            char_predictions[word_start:word_end] = token_prediction

        char_predictions_batch.append(char_predictions)
    return char_predictions_batch

In [5]:
test_data = pd.read_csv('data/tsd_test.csv')

In [6]:
#test_sentenses = test_data.iloc[0:10]['text']
test_sentenses = test_data['text']

In [7]:
test_sentenses[2]

'tens years ago i contacted the PDR and suggested that the time might be good to work with alaska on building a gas line.. alaska rejected them without even consideration despite china being flush with cash and hungry for gas.. and set up another infamous boondoggle.. the transcanada-exxon rip off that we are still paying for and have yet to receive anything of value.. hundreds of millions of dollars on studies.. and buyouts.. i hope china says f you alaska.. you are nothing but ignorant people..'

In [8]:
len(test_sentenses)

2000

In [9]:
# Same values as in the training part. Of these, only MAX_LEN is read by the
# test() function defined above (as a global, at call time), so it has to be
# assigned before the call below.
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 3
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

# One per-character prediction array per test sentence.
char_predictions = test(model, test_sentenses)

In [10]:
len(char_predictions)

2000

In [64]:
# Write one line per sentence: its per-character predictions, comma-separated.
with open('BERT-NER-CEL-result.txt', 'w') as f:
    for char_prediction in char_predictions:
        line = ','.join(f'{c}' for c in char_prediction)
        f.write(line + '\n')
            