In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification , BertTokenizer

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [6]:
data = pd.read_csv("tag_data.csv", encoding='unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,1,Another,0,O
1,1,violent,0,T
2,1,and,0,T
3,1,aggressive,0,T
4,1,immigrant,0,T


In [7]:
data.count()

Sentence #    329096
Word          329094
POS           329096
Tag           329096
dtype: int64

In [8]:
print("Number of tags: {}".format(len(data.Tag.unique())))
frequencies = data.Tag.value_counts()
frequencies

Number of tags: 2


O    304475
T     24621
Name: Tag, dtype: int64

In [9]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:5] not in tags.keys():
            tags[tag[2:5]] = count
        else:
            tags[tag[2:5]] += count
    continue

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('', 24621)]


In [10]:
entities_to_remove = ["B-art", "I-art", "B-eve", "I-eve", "B-nat", "I-nat"]
data = data[~data.Tag.isin(entities_to_remove)]
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,1,Another,0,O
1,1,violent,0,T
2,1,and,0,T
3,1,aggressive,0,T
4,1,immigrant,0,T


In [11]:
labels_to_ids = {k: v for v, k in enumerate(data.Tag.unique())}
ids_to_labels = {v: k for v, k in enumerate(data.Tag.unique())}
labels_to_ids

{'O': 0, 'T': 1}

In [12]:
len(labels_to_ids)

2

In [11]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
data = data.fillna(method='ffill')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,1,Another,0,O
1,1,violent,0,T
2,1,and,0,T
3,1,aggressive,0,T
4,1,immigrant,0,T


In [12]:
# let's create a new column called "sentence" which groups the words by sentence 
data['sentence'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence 
data['word_labels'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,sentence,word_labels
0,1,Another,0,O,Another violent and aggressive immigrant killi...,"O,T,T,T,T,O,O,O,O,O,O,O,O,O,O,O,O"
1,1,violent,0,T,Another violent and aggressive immigrant killi...,"O,T,T,T,T,O,O,O,O,O,O,O,O,O,O,O,O"
2,1,and,0,T,Another violent and aggressive immigrant killi...,"O,T,T,T,T,O,O,O,O,O,O,O,O,O,O,O,O"
3,1,aggressive,0,T,Another violent and aggressive immigrant killi...,"O,T,T,T,T,O,O,O,O,O,O,O,O,O,O,O,O"
4,1,immigrant,0,T,Another violent and aggressive immigrant killi...,"O,T,T,T,T,O,O,O,O,O,O,O,O,O,O,O,O"


In [13]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,Another violent and aggressive immigrant killi...,"O,T,T,T,T,O,O,O,O,O,O,O,O,O,O,O,O"
1,"I am 56 years old , I am not your fucking juni...","O,O,O,O,O,O,O,O,O,O,T,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,"Damn , a whole family . Sad indeed .","T,O,O,O,O,O,O,O,O"
3,What a knucklehead . How can anyone not know t...,"O,O,T,O,O,O,O,O,O,O,O,O,O,O,O"
4,""" who do you think should do the killing ? "" A...","O,O,O,O,O,O,O,O,T,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [14]:
len(data)

7939

In [15]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 0.00001
MAX_GRAD_NORM = 1.0
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [16]:
class dataset(Dataset):
    
    def __init__(self, dataframe, tokenizer, max_len):      
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: get the sentence and word labels 
        sentence = self.data.sentence[index].strip().split()  
        word_labels = self.data.word_labels[index].split(",") 

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_split_into_words =True, 
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels] 
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        # print("l=" + str(len(encoding["offset_mapping"])))
        # print(encoding["offset_mapping"])
        # print(labels)
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                if i< len(labels):
                    encoded_labels[idx] = labels[i]
                else:
                    encoded_labels[idx] = labels[-1]
                i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        return item
    def __len__(self):
        return self.len

In [17]:
train_size = 0.95
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (7939, 2)
TRAIN Dataset: (7542, 2)
TEST Dataset: (397, 2)


In [18]:
len(training_set)

7542

In [19]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [20]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [21]:
#optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

In [22]:
from sadice import SelfAdjDiceLoss

In [23]:
criterion = SelfAdjDiceLoss(reduction="mean", alpha = 0.7, gamma = 0.25)
criterion.to(device)

SelfAdjDiceLoss()

In [24]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)
        bool_mask = (batch['attention_mask'] > 0).to(device)
        bool_labels = (batch['labels'] != -100).to(device)
        final_mask = torch.mul(bool_labels, bool_mask)
        #print(bool_mask)
        #loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        
        #loss = outputs[0]
        active_logits = torch.masked_select(outputs[1], final_mask.unsqueeze(-1).repeat(1, 1, 2))
        active_labels = torch.masked_select(labels, final_mask)
        loss = criterion(active_logits.view(-1, 2), active_labels)
        #loss = loss.reshape(-1, len(active_labels)).mean(-1).mean()

        tr_logits = outputs[1]
        #print(loss)
        #print(tr_logits)
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [25]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 0.45024070143699646
Training loss per 100 training steps: 0.4403795535021489
Training loss per 100 training steps: 0.4398441830677773
Training loss per 100 training steps: 0.4394330385317438
Training loss per 100 training steps: 0.43928600933189105
Training loss per 100 training steps: 0.4391351707443268
Training loss per 100 training steps: 0.43901967213871873
Training loss per 100 training steps: 0.43897035807243595
Training loss per 100 training steps: 0.4388954716079988
Training loss per 100 training steps: 0.4388465554878265
Training loss epoch: 0.4388317347268158
Training accuracy epoch: 0.9285367461021174
Training epoch: 2
Training loss per 100 training steps: 0.4392235279083252
Training loss per 100 training steps: 0.43871527939739796
Training loss per 100 training steps: 0.4384873295897868
Training loss per 100 training steps: 0.43846911399863486
Training loss per 100 training steps: 0.43844061770344017
Training loss per 

In [26]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            bool_mask = (batch['attention_mask'] > 0).to(device)
            bool_labels = (batch['labels'] != -100).to(device)
            final_mask = torch.mul(bool_labels, bool_mask)
            
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            active_logits = torch.masked_select(outputs[1], final_mask.unsqueeze(-1).repeat(1, 1, 2))
            active_labels = torch.masked_select(labels, final_mask)
            loss = criterion(active_logits.view(-1, 2), active_labels)
            #loss = loss.reshape(-1, len(active_labels)).mean(-1).mean()
            
            eval_logits = outputs[1]
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [27]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.43801334500312805
Validation Loss: 0.43833431124687194
Validation Accuracy: 0.937317832217786


In [28]:
#!pip install seqeval

In [29]:
from seqeval.metrics import classification_report

In [30]:
# print(classification_report(labels, predictions))

In [31]:
import sklearn
sklearn.metrics.f1_score(labels, predictions,pos_label='T')

0.46976241900647947

In [32]:
directory = './dice_loss_model4'
tokenizer.save_vocabulary(directory)
model.save_pretrained(directory)
print('All files saved')

All files saved


# Testing

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [2]:
model = BertForTokenClassification.from_pretrained('dice_loss_model4', num_labels=2)
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [3]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 0.00001
MAX_GRAD_NORM = 1.0
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [4]:
def test(model, sentences):
    # put model in evaluation mode
    model.eval()
    #print(len(sentences))
    char_predictions_batch=[]
    for sentence in sentences:
        char_predictions = np.zeros((len(sentence),), dtype=int)
        inputs = tokenizer(sentence,
              return_offsets_mapping=True, 
              padding='max_length', 
              truncation=True, 
              max_length=MAX_LEN,
              return_tensors="pt")
        # move to gpu
        ids = inputs["input_ids"].to(device)
        mask = inputs["attention_mask"].to(device)
        # forward pass
        outputs = model(ids, attention_mask=mask)
        logits = outputs[0]

        active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

        tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
        #print(ids.squeeze().tolist())
        token_predictions = flattened_predictions.cpu().numpy()
        for i, token_prediction in enumerate(token_predictions):
            token = inputs.tokens()[i]
            if token in ('[CLS]','[SEP]','[PAD]'):
                continue
            
            token_start, _ = inputs.token_to_chars(i)
            word_index = inputs.word_ids()[i]
            word_start, word_end = inputs.word_to_chars(word_index)
            if token_start != word_start:
                continue #2nd or other sub-token
            else:
                #print(f"{word_start}, {word_end}, {token}, {token_prediction}")
                char_predictions[word_start:word_end] = token_prediction
        char_predictions_batch.append(char_predictions)
    return char_predictions_batch

In [5]:
test_data = pd.read_csv('data/tsd_test.csv')

In [6]:
test_sentenses = test_data['text']
len(test_sentenses)

2000

In [7]:
char_predictions = test(model, test_sentenses)

In [8]:
with open('BERT-NER-DL-result.txt', 'w') as f:
    for index, char_prediction in enumerate(char_predictions):
        s = ''
        for c in char_prediction:
            s += f'{c},'
        s = s[:-1]
        f.write(s+ '\n')
            