In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [2]:
data = pd.read_csv("Data/training_dataset_2.csv", encoding="latin1").fillna(method="ffill")
data.tail(10)


Unnamed: 0,Sentence #,Word,Tag
165871,Sentence: 9999,year,modf
165872,Sentence: 9999,at,modf
165873,Sentence: 9999,7,modf
165874,Sentence: 9999,:05,modf
165875,Sentence: 9999,in,modf
165876,Sentence: 9999,the,modf
165877,Sentence: 9999,evening,modf
165878,Sentence: 9999,and,O
165879,Sentence: 9999,heathy,health
165880,Sentence: 9999,.,O


In [3]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [4]:
getter = SentenceGetter(data)

In [5]:
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
print(sentences[0])

cidrs operating and which is created at 18 august year at 21 :21 :44 and that have issue .


In [6]:
labels = [[s[1] for s in sent] for sent in getter.sentences]
print(labels[0])

['aci_object', 'oper', 'O', 'O', 'O', 'modf', 'modf', 'modf', 'modf', 'modf', 'modf', 'modf', 'modf', 'modf', 'O', 'health', 'health', 'health', 'O']


In [7]:
tags_vals = list(set(data["Tag"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

In [8]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_transformers import BertForTokenClassification

Using TensorFlow backend.


In [9]:
MAX_LEN = 75
bs = 32

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [12]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_texts[0])

['cid', '##rs', 'operating', 'and', 'which', 'is', 'created', 'at', '18', 'august', 'year', 'at', '21', ':', '21', ':', '44', 'and', 'that', 'have', 'issue', '.']


In [13]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [14]:
input_ids[0]

array([28744,  2869,  4082,  1998,  2029,  2003,  2580,  2012,  2324,
        2257,  2095,  2012,  2538,  1024,  2538,  1024,  4008,  1998,
        2008,  2031,  3277,  1012,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0])

In [15]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

In [16]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [17]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [18]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [19]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [20]:
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
if torch.cuda.device_count() > 1:
  print("Let's use", torch.cuda.device_count(), "GPUs!")
  # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
  model = nn.DataParallel(model)

Let's use 4 GPUs!


In [21]:
model.to(device)
model.cuda();

In [22]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [23]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [24]:
epochs = 7
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        outputs = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        # backward pass
        loss.mean().backward()
        # track train loss
        tr_loss += loss.mean().item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            tmp_eval_loss, logits = outputs[:2]
#             logits = model(b_input_ids, token_type_ids=None,
#                            attention_mask=b_input_mask)
#         print("Outside: input size", b_input_ids.size(),
#           "output_size", logits.size())
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))



Train loss: 0.3024880013937223


Epoch:  14%|█▍        | 1/7 [02:45<16:31, 165.29s/it]

Validation loss: 0.060298471013084054
Validation Accuracy: 0.7232682291666666
F1-Score: 0.4485258579023682
Train loss: 0.07531172805961142


Epoch:  29%|██▊       | 2/7 [05:19<13:29, 162.00s/it]

Validation loss: 0.03018348714977037
Validation Accuracy: 0.7369791666666667
F1-Score: 0.4873076923076923
Train loss: 0.04778049148434232


Epoch:  43%|████▎     | 3/7 [07:54<10:38, 159.75s/it]

Validation loss: 0.019987844178103842
Validation Accuracy: 0.7402604166666666
F1-Score: 0.5081839438815278
Train loss: 0.035523078300336255


Epoch:  57%|█████▋    | 4/7 [10:28<07:54, 158.20s/it]

Validation loss: 0.017251146884518676
Validation Accuracy: 0.7215234375000001
F1-Score: 0.49226706903055445
Train loss: 0.02672797687688565


Epoch:  71%|███████▏  | 5/7 [13:03<05:14, 157.09s/it]

Validation loss: 0.010453974427946378
Validation Accuracy: 0.7290755208333334
F1-Score: 0.50730966394532
Train loss: 0.022851186586091466


Epoch:  86%|████████▌ | 6/7 [15:37<02:36, 156.25s/it]

Validation loss: 0.015956621569785057
Validation Accuracy: 0.7346744791666665
F1-Score: 0.5136601433830653
Train loss: 0.01933627243976089


Epoch: 100%|██████████| 7/7 [18:11<00:00, 155.70s/it]

Validation loss: 0.008182447561921435
Validation Accuracy: 0.7249348958333333
F1-Score: 0.5099744729129243





In [25]:
output_dir = 'Data/model_bert/'

In [26]:
import os
from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME



model_to_save = model.module if hasattr(model, 'module') else model
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
 

In [27]:
# model = BertForTokenClassification.from_pretrained(output_dir)
# tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)  # Add specific options if needed
model.to(device)

DataParallel(
  (module): BertForTokenClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): BertLayerNorm()
        (dropout): Dropout(p=0.1)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): BertLayerNorm()
                (dropou

In [28]:
model.eval()
inputs = []
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)
        tmp_eval_loss, logits = outputs[:2]
#         logits = model(b_input_ids, token_type_ids=None,
#                        attention_mask=b_input_mask)
        
    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    ips = b_input_ids.to('cpu').numpy()
    inputs.append(ips)
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

Validation loss: 0.008182447561921435
Validation Accuracy: 0.7249348958333333
Validation F1-Score: 0.5103969754253309


In [29]:
test_sent = "cidrs operating and which is created at 18 august year at 21 :21 :44 and that have issue ."


In [30]:
tokenized_texts = tokenizer.tokenize(test_sent)
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(tokenized_texts)],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [31]:
import spacy
from spacy import displacy
import en_core_web_sm
nlp = spacy.load('Data/')

In [32]:
cls = {"ACI_OBJECT":'#E0FFFF',"OPER":'#FFB6C1',"MODF":'#FFFACD',"HEALTH":'#E6E6FA'}
displacy.render(nlp(str(sentences[1])), jupyter=True, style='ent',options={'colors':cls})