In [None]:
# Mount Google Drive (Colab only) and make the project folder the working
# directory, so the relative data/model paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/
%cd /content/drive/MyDrive/USC/NLP_CSCI_544/project/

Mounted at /content/drive
/content
/content/drive/MyDrive/USC/NLP_CSCI_544/project


# Import required libraries

In [None]:
import pandas as pd
import numpy as np
import os
import random
import tqdm 

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence

# Define flags and constants

In [None]:
# Run on the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# special token for unknown words
UNK = "<UNK>"
# special token for padding
PAD = "<PAD>"

# special tags marking the virtual start / end of a tag sequence (used by the CRF transitions)
START_TAG = "<START>"
STOP_TAG = "<STOP>"

# NOTE(review): this seed is not applied to any RNG in the cells shown here —
# confirm that random / numpy / torch are seeded with it elsewhere.
random_seed = 42

# file the best model's state_dict is written to during training
model_file_name = 'BLSTM1.pt'

# checks whether to use CNN for character-level representation
use_cnn_for_char_level = False

# vector dimension for word embeddings
word_embedding_dim = 100

# vector dimension for character embeddings
char_embedding_dim = 30

# None if not using glove embedding else contains glove embedding dictionary
pre_embeddings = None

# setting the hyperparameters as per the task description
lstm_hidden_dim = 200
dropout = 0.5

# number of filters in the CNN layer
out_channels = 30


# setting all the other hyperparameters
num_epochs = 100
batch_size = 10
learning_rate = 0.015
momentum = 0.9
# presumably the per-epoch learning-rate decay factor — TODO confirm where it is read
decay_rate = 0.05
# maximum gradient norm used when clipping gradients
grad_clip = 5.0

# creating dictionary for storing tag and corresponding index
tag_to_idx = {}
idx_to_tag = {}

# same pair of lookups for the part-of-speech tags
pos_tag_to_idx = {}
pos_idx_to_tag = {}

# creating dictionary for storing word and corresponding index in training data
word_to_idx = {}
idx_to_word = {}

# creating dictionary for storing character and corresponding index in training data
char_to_idx = {}
idx_to_char = {}

# Load data files

In [None]:
# A regex separator is only supported by pandas' python parser, so select that
# engine explicitly instead of relying on the warning-emitting fallback. The raw
# string keeps `\s` a regex class rather than an invalid string escape.
train = pd.read_csv(
    'english/eng.train.bioes.conll',
    header=None,
    names=['idx', 'word', 'pos', 'synt', 'tag'],
    sep=r'\s',
    engine='python',
    na_values=['<NAN>'],      # only this marker counts as missing ...
    keep_default_na=False,    # ... so tokens such as "NA" or "null" stay ordinary words
)
train.head(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,idx,word,pos,synt,tag
0,1,EU,NNP,I-NP,S-ORG
1,2,rejects,VBZ,I-VP,O
2,3,German,JJ,I-NP,S-MISC
3,4,call,NN,I-NP,O
4,5,to,TO,I-VP,O


In [None]:
# A regex separator is only supported by pandas' python parser, so select that
# engine explicitly instead of relying on the warning-emitting fallback. The raw
# string keeps `\s` a regex class rather than an invalid string escape.
dev = pd.read_csv(
    'english/eng.dev.bioes.conll',
    header=None,
    names=['idx', 'word', 'pos', 'synt', 'tag'],
    sep=r'\s',
    engine='python',
    na_values=['<NAN>'],      # only this marker counts as missing ...
    keep_default_na=False,    # ... so tokens such as "NA" or "null" stay ordinary words
)
dev.head(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,idx,word,pos,synt,tag
0,1,CRICKET,NNP,I-NP,O
1,2,-,:,O,O
2,3,LEICESTERSHIRE,NNP,I-NP,S-ORG
3,4,TAKE,NNP,I-NP,O
4,5,OVER,IN,I-PP,O


In [None]:
# A regex separator is only supported by pandas' python parser, so select that
# engine explicitly instead of relying on the warning-emitting fallback. The raw
# string keeps `\s` a regex class rather than an invalid string escape.
test = pd.read_csv(
    'english/eng.test.bioes.conll',
    header=None,
    names=['idx', 'word', 'pos', 'synt', 'tag'],
    sep=r'\s',
    engine='python',
    na_values=['<NAN>'],      # only this marker counts as missing ...
    keep_default_na=False,    # ... so tokens such as "NA" or "null" stay ordinary words
)
test.head(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,idx,word,pos,synt,tag
0,1,SOCCER,NN,I-NP,O
1,2,-,:,O,O
2,3,JAPAN,NNP,I-NP,S-LOC
3,4,GET,VB,I-VP,O
4,5,LUCKY,NNP,I-NP,O


# Define functions

- The *make_data* function groups the token rows into sentences and returns a list of 3-tuples, where the first element is the list of words in a sentence, the second element is the list of corresponding NER tags and the third element is the list of corresponding POS tags.

In [None]:
def make_data(df,has_tags=True):
    """Group the token rows of a CoNLL-style frame into sentences.

    A row whose first column equals 1 starts a new sentence. The word is read
    from column 1, the POS tag from column 2 and the NER tag from the last
    column.

    Args:
        df: DataFrame with one token per row.
        has_tags: when False the gold columns are ignored; every token gets the
            NER tag "O" and the POS list of each sentence stays empty.

    Returns:
        A list with one ``(words, ner_tags, pos_tags)`` tuple per sentence.
        An empty frame yields an empty list.
    """
    sentences = []
    sentence_tags = []
    pos_sentence_tags = []
    # Start from empty lists (not None) so a frame whose first row does not
    # carry index 1 is still collected instead of crashing on `None.append`.
    sentence, tags, pos_tags = [], [], []

    for row in df.values.tolist():
        if row[0] == 1 and sentence:
            # A new sentence begins: store the one collected so far.
            sentences.append(sentence)
            sentence_tags.append(tags)
            pos_sentence_tags.append(pos_tags)
            sentence, tags, pos_tags = [], [], []
        sentence.append(row[1])
        if has_tags:
            tags.append(row[-1])
            pos_tags.append(row[2])
        else:
            tags.append("O")

    # Store the trailing sentence; skipped for empty input so callers never
    # receive a (None, [], []) placeholder.
    if sentence:
        sentences.append(sentence)
        sentence_tags.append(tags)
        pos_sentence_tags.append(pos_tags)

    return list(zip(sentences,sentence_tags,pos_sentence_tags))

- The *make_tag_to_idx* function creates the tag to index dictionary.
- The *make_word_to_idx* function creates the word to index dictionary.
- Both functions use the output of the *make_data* function.

In [None]:
def make_tag_to_idx(data):
    """Fill the global NER and POS tag lookup tables.

    The special tags PAD, START_TAG and STOP_TAG are registered first (in that
    order) in both tag sets, followed by every tag seen in ``data`` in order of
    first appearance. Existing entries are kept, so the function can be called
    on several datasets in turn.

    Args:
        data: iterable of ``(words, ner_tags, pos_tags)`` tuples.
    """
    for special in (PAD, START_TAG, STOP_TAG):
        # Membership is tested on the NER table only; the POS table is then
        # updated together with it.
        if special in tag_to_idx:
            continue
        ner_slot = len(tag_to_idx)
        idx_to_tag[ner_slot] = special
        tag_to_idx[special] = ner_slot
        pos_slot = len(pos_tag_to_idx)
        pos_idx_to_tag[pos_slot] = special
        pos_tag_to_idx[special] = pos_slot

    for _, ner_tags, pos_tags in data:
        for ner_tag in ner_tags:
            if ner_tag in tag_to_idx:
                continue
            ner_slot = len(tag_to_idx)
            idx_to_tag[ner_slot] = ner_tag
            tag_to_idx[ner_tag] = ner_slot
        for pos_tag in pos_tags:
            if pos_tag in pos_tag_to_idx:
                continue
            pos_slot = len(pos_tag_to_idx)
            pos_idx_to_tag[pos_slot] = pos_tag
            pos_tag_to_idx[pos_tag] = pos_slot


def make_word_to_idx(data,is_valid_or_test=False):
  """Extend the global word (and, for training data, character) vocabularies.

  Args:
      data: iterable of ``(words, ner_tags, pos_tags)`` tuples.
      is_valid_or_test: when True only unseen words are appended to the word
          vocabulary; the character vocabulary is left untouched. When False
          (training data) PAD and UNK are pinned to indices 0 and 1 in both
          vocabularies before words and characters are added.

  Returns:
      ``(word_to_idx, idx_to_word)`` when ``is_valid_or_test`` is True,
      otherwise None (the global dictionaries are updated in place).
  """
  if not is_valid_or_test:
    # Reserve the first two slots of both vocabularies for the special tokens.
    for slot, token in enumerate((PAD, UNK)):
      idx_to_word[slot] = token
      word_to_idx[token] = slot
      idx_to_char[slot] = token
      char_to_idx[token] = slot

  for sentence,_,_ in data:
    for word in sentence:
      if word not in word_to_idx:
        idx_to_word[len(word_to_idx)] = word
        word_to_idx[word] = len(word_to_idx)
      if is_valid_or_test:
        continue
      # Iterate over the characters of the word. The previous code looped over
      # the sentence here, which registered whole words as "characters".
      for char in word:
        if char not in char_to_idx:
          idx_to_char[len(char_to_idx)] = char
          char_to_idx[char] = len(char_to_idx)

  if is_valid_or_test:
    return word_to_idx, idx_to_word

- The *get_embedding_data* function creates the embedding dictionary from the glove.6B.100d text file.
- The *build_embedding_table* function creates a numpy matrix with one embedding row per word in the vocabulary. If a GloVe embedding dictionary is provided, words found in it (after lower-casing) use their GloVe vector and the remaining words are initialized randomly; otherwise the whole table is created with random entries.

In [None]:
def get_embedding_data(filename,dim):
  """Read a GloVe-style text file into a word -> vector dictionary.

  Each non-blank line holds a word followed by ``dim`` whitespace-separated
  numbers.

  Args:
      filename: path of the UTF-8 encoded embedding file.
      dim: number of values expected after the word on every line.

  Returns:
      dict mapping each word to a float64 array of shape ``(1, dim)``.

  Raises:
      ValueError: if a line does not carry exactly ``dim`` numeric values.
  """
  embedding = dict()
  with open(filename,'r',encoding='utf-8') as f:
    # Stream the file line by line; embedding files are large and do not need
    # to be held in memory as a list of lines.
    for line in f:
      line_split = line.split() #word followed by dim numbers
      if not line_split:
        continue
      word, values = line_split[0], line_split[1:]
      if len(values) != dim:
        raise ValueError(
            "expected %d values for word %r, found %d" % (dim, word, len(values))
        )
      embedding[word] = np.asarray(values, dtype=np.float64).reshape(1, dim)
  return embedding

def build_embedding_table():
    """Build the global ``embeddings`` matrix, one row per vocabulary word.

    A word whose lower-cased form is a key of ``pre_embeddings`` receives that
    pretrained vector. Every other word (and every word when
    ``pre_embeddings`` is None) receives a row drawn uniformly from
    ``[-sqrt(3 / word_embedding_dim), sqrt(3 / word_embedding_dim)]``.
    """
    global embeddings
    bound = np.sqrt(3.0 / word_embedding_dim)
    table = np.empty([len(word_to_idx), word_embedding_dim])
    has_pretrained = pre_embeddings is not None
    for token, row in word_to_idx.items():
        if has_pretrained and token.lower() in pre_embeddings:
            table[row, :] = pre_embeddings[token.lower()]
        else:
            # Random rows are drawn one word at a time, in vocabulary order.
            table[row, :] = np.random.uniform(-bound, bound, [1, word_embedding_dim])
    embeddings = table


- The *make_numeric_data* function uses the word_to_idx, tag_to_idx and char_to_idx dictionaries in order to change the dataset into a list of indices.
- The *data_batching* function creates batches from entire dataset. All sentences in a batch are padded to the same length using the *padded_batching* function.



In [None]:
def make_numeric_data(data):
  """Convert sentences of strings into lists of vocabulary indices.

  Args:
      data: iterable of ``(words, ner_tags, pos_tags)`` tuples.

  Returns:
      A list with one ``(word_ids, char_ids, ner_tag_ids, pos_tag_ids)`` tuple
      per sentence, where ``char_ids`` holds one list of character indices per
      word. Unknown words and characters map to UNK, unknown NER tags map to
      the index of "O", and unknown POS tags map to the index of PAD.
  """
  def lookup(table, key, fallback_key):
    # The fallback entry is resolved lazily, only when the key is missing.
    return table[key] if key in table else table[fallback_key]

  numeric_data = []
  for sentence,tags,pos_tags in data:
    sentence_ids = [lookup(word_to_idx, word, UNK) for word in sentence]
    char_ids = [[lookup(char_to_idx, c, UNK) for c in word] for word in sentence]
    # Unknown NER tags previously appended the *string* "O", which cannot be
    # turned into a LongTensor later on; use its index instead.
    tag_ids = [lookup(tag_to_idx, tag, "O") for tag in tags]
    # Unknown POS tags used to be dropped, leaving this list shorter than the
    # sentence; a placeholder index keeps the positions aligned.
    pos_tag_ids = [lookup(pos_tag_to_idx, tag, PAD) for tag in pos_tags]
    numeric_data.append((sentence_ids, char_ids, tag_ids, pos_tag_ids))

  return numeric_data

def data_batching(data):
  """Split ``data`` into consecutive padded batches of ``batch_size`` items.

  Args:
      data: sequence of numeric sentences as produced by make_numeric_data.

  Returns:
      A list with the result of padded_batching for every batch. The last
      batch may be smaller; no empty batch is ever produced.
  """
  # Ceiling division. The previous `n // batch_size + 1` added an empty
  # trailing batch whenever n was an exact multiple of batch_size.
  num_batches = (len(data) + batch_size - 1) // batch_size
  return [
      padded_batching(data[i*batch_size:(i+1)*batch_size])
      for i in range(num_batches)
  ]

def padded_batching(data):
  """Pad one batch of numeric sentences into dense tensors on ``device``.

  Args:
      data: non-empty list of ``(word_ids, char_ids, ner_tag_ids, pos_tag_ids)``
          tuples.

  Returns:
      ``(word_seq_tensor, word_seq_len, char_seq_tensor, tag_seq_tensor,
      pos_tag_seq_tensor, data)`` where the word/tag tensors have shape
      (batch, max_sentence_len), the character tensor has shape
      (batch, max_sentence_len, max_word_len) and ``data`` is the unmodified
      input. Padding positions hold index 0.
  """
  batch_size = len(data)
  batch_data = data

  word_seq_len = torch.LongTensor([len(x[0]) for x in batch_data])
  max_word_seq_len = int(word_seq_len.max())
  # Character length of every word; padding words count as length 1 so that
  # the character tensor always has at least one column.
  char_seq_len = torch.LongTensor(
      [[len(chars) for chars in x[1]] + [1] * (max_word_seq_len - len(x[1])) for x in batch_data]
  )
  max_char_seq_len = int(char_seq_len.max())

  word_seq_tensor = torch.zeros((batch_size,max_word_seq_len),dtype=torch.long)
  char_seq_tensor = torch.zeros((batch_size,max_word_seq_len,max_char_seq_len), dtype=torch.long)
  tag_seq_tensor = torch.zeros((batch_size,max_word_seq_len),dtype=torch.long)
  pos_tag_seq_tensor = torch.zeros((batch_size,max_word_seq_len),dtype=torch.long)

  for idx in range(batch_size):
    sent_len = int(word_seq_len[idx])
    word_seq_tensor[idx,:sent_len] = torch.LongTensor(batch_data[idx][0])
    tag_seq_tensor[idx,:sent_len] = torch.LongTensor(batch_data[idx][2])
    pos_tag_seq_tensor[idx,:sent_len] = torch.LongTensor(batch_data[idx][3])

    for word_idx in range(sent_len):
      word_len = int(char_seq_len[idx,word_idx])
      char_seq_tensor[idx,word_idx,:word_len] = torch.LongTensor(
          batch_data[idx][1][word_idx]
      )

    for word_idx in range(sent_len,max_word_seq_len):
      char_seq_tensor[idx,word_idx,0:1] = torch.LongTensor([char_to_idx[PAD]])

  # Transfer to the target device once, after the tensors are filled. These
  # calls used to sit inside the loop above and ran once per sentence.
  word_seq_tensor = word_seq_tensor.to(device)
  word_seq_len = word_seq_len.to(device)
  char_seq_tensor = char_seq_tensor.to(device)
  tag_seq_tensor = tag_seq_tensor.to(device)
  pos_tag_seq_tensor = pos_tag_seq_tensor.to(device)
  return word_seq_tensor,word_seq_len,char_seq_tensor,tag_seq_tensor,pos_tag_seq_tensor,data

In [None]:
# CRF helper functions
def argmax(vec):
  """Return, as a Python int, the column index of the largest entry of a (1, N) tensor."""
  best = vec.max(dim=1)
  return best.indices.item()

def log_sum_exp(vec):
  """Compute log(sum(exp(vec))) over the last dimension.

  The maximum along that dimension is subtracted before exponentiating and
  added back afterwards, which keeps the exponentials from overflowing.
  """
  peak = torch.max(vec, -1).values
  shifted = vec - peak.unsqueeze(-1).expand_as(vec)
  return peak + shifted.exp().sum(-1).log()

- The *initialize_linear_layer* function initializes the weights of a linear layer using Xavier (normal) initialization and its bias by sampling a normal distribution.
- The *initialize_lstm_layer* function initializes the weights of the LSTM layer using an orthogonal matrix. It initializes the bias by sampling a normal distribution.
- The *BLSTM* class defines the feed-forward architecture of the model used in tasks 1 and 2. It also takes care of the character-level CNN for the bonus task.

In [None]:
def initialize_linear_layer(layer):
    """Re-initialize a linear layer in place.

    The weight matrix is filled with Xavier (Glorot) normal values and the bias
    with samples from a standard normal distribution.
    """
    weight, bias = layer.weight.data, layer.bias.data
    nn.init.xavier_normal_(weight)
    nn.init.normal_(bias)

def initialize_lstm_layer(layer):
    """Re-initialize every parameter of an LSTM layer in place.

    Parameters with two or more dimensions (the weight matrices) are made
    orthogonal; one-dimensional parameters (the biases) are drawn from a
    standard normal distribution.
    """
    for values in (param.data for param in layer.parameters()):
        init_fn = nn.init.orthogonal_ if values.dim() >= 2 else nn.init.normal_
        init_fn(values)

class BLSTM(nn.Module):
  """Bidirectional LSTM tagger with two CRF heads, one for NER and one for POS.

  Both heads share the word embeddings, the optional character-level CNN and
  the BiLSTM encoder; each head has its own linear emission layer and its own
  tag-transition matrix. A transition matrix is indexed as
  ``transitions[next_tag, previous_tag]``.

  The model reads its sizes and hyperparameters from the module-level
  configuration (``word_to_idx``, ``tag_to_idx``, ``pos_tag_to_idx``,
  ``embeddings``, ``lstm_hidden_dim``, ...), so those must be populated before
  an instance is created.
  """

  def __init__(self):
    super(BLSTM,self).__init__()

    self.vocab_size = len(word_to_idx)
    self.tag_size = len(tag_to_idx)
    self.tag_to_idx = tag_to_idx
    self.pos_tag_size = len(pos_tag_to_idx)
    self.pos_tag_to_idx = pos_tag_to_idx

    if use_cnn_for_char_level:
      self.char_embedd = nn.Embedding(len(char_to_idx),char_embedding_dim)
      nn.init.xavier_uniform_(self.char_embedd.weight)
      # The kernel spans the full character-embedding width, so the convolution
      # slides over windows of 3 consecutive characters.
      self.char_cnn = nn.Conv2d(
          in_channels = 1,
          out_channels = out_channels,
          kernel_size = (3,char_embedding_dim),
          padding = (2,0)
      )

    self.word_embedd = nn.Embedding(self.vocab_size,word_embedding_dim)
    # Start from the table built by build_embedding_table (GloVe and/or random).
    self.word_embedd.weight = nn.Parameter(torch.FloatTensor(embeddings))

    self.dropout = nn.Dropout(dropout)

    # The CNN output is concatenated to each word embedding when enabled.
    lstm_input_dim = word_embedding_dim + out_channels if use_cnn_for_char_level else word_embedding_dim
    self.lstm = nn.LSTM(lstm_input_dim,lstm_hidden_dim,num_layers=1,
                        batch_first = True, bidirectional=True)
    initialize_lstm_layer(self.lstm)

    self.out_layer = nn.Linear(2 * lstm_hidden_dim,self.tag_size)
    self.pos_out_layer = nn.Linear(2 * lstm_hidden_dim,self.pos_tag_size)
    initialize_linear_layer(self.out_layer)
    initialize_linear_layer(self.pos_out_layer)

    # Each head indexes its transition matrix with its own tag table; the POS
    # matrix used to be partly indexed through the NER table.
    self.transitions = self._init_transitions(self.tag_size, self.tag_to_idx)
    self.transitions_pos = self._init_transitions(self.pos_tag_size, self.pos_tag_to_idx)

  @staticmethod
  def _init_transitions(num_tags, tag_index):
    """Create a random transition matrix with the structural constraints applied."""
    transitions = nn.Parameter(torch.randn(num_tags, num_tags))
    # Nothing may move into START, out of STOP, or into/out of PAD
    # (PAD -> PAD is the only PAD transition left free).
    transitions.data[tag_index[START_TAG], :] = -10000.
    transitions.data[:, tag_index[STOP_TAG]] = -10000.
    transitions.data[:, tag_index[PAD]] = -10000.
    transitions.data[tag_index[PAD], :] = -10000.
    transitions.data[tag_index[PAD], tag_index[PAD]] = 0.0
    return transitions

  def init_hidden(self,batch_shape):
    """Return a random (h0, c0) pair for the BiLSTM, for ``batch_shape`` sentences.

    NOTE(review): the state is random on every call, including at inference
    time, so repeated predictions are not guaranteed to be identical.
    """
    return (
        torch.randn(2,batch_shape,lstm_hidden_dim).to(device),
        torch.randn(2,batch_shape,lstm_hidden_dim).to(device)
            )

  def get_mask_from_word_sequences(self, word_seq_len):
    """Build a float mask with 1 on real tokens and 0 on padding positions."""
    batch_num = len(word_seq_len)
    max_seq_len = max([x for x in word_seq_len])
    mask_tensor = torch.zeros(batch_num, max_seq_len, dtype=torch.float).to(device)
    for k, x in enumerate(word_seq_len):
        mask_tensor[k, :x] = 1
    return mask_tensor # batch_size x max_seq_len

  def _get_lstm_features(self,word_seq_tensor,word_seq_len,char_seq_tensor):
    """Encode a padded batch and return the (NER, POS) emission scores.

    Also stores the padding mask of the batch in ``self.mask``; the CRF
    methods rely on it.
    """
    self.hidden = self.init_hidden(word_seq_tensor.shape[0])
    word_embedds = self.word_embedd(word_seq_tensor)

    if use_cnn_for_char_level:
      batch_size = char_seq_tensor.size(0)
      sent_len = char_seq_tensor.size(1)
      # Treat every word as an independent sample for the character CNN.
      char_seq_tensor = char_seq_tensor.view(batch_size*sent_len,-1)
      char_embedds = self.char_embedd(char_seq_tensor).unsqueeze(1)
      char_embedds = self.dropout(char_embedds)
      cnn_out = self.char_cnn(char_embedds)
      # Max-pool over all character positions: one value per filter and word.
      char_embedds = nn.functional.max_pool2d(cnn_out,kernel_size=(cnn_out.size(2),1)).view(
          cnn_out.size(0),
          out_channels
      )

      char_features = char_embedds.view(batch_size,sent_len,-1)
      word_embedds = torch.cat([word_embedds,char_features],dim=2)

    word_embs = self.dropout(word_embedds)
    self.mask = self.get_mask_from_word_sequences(word_seq_len)
    # pack_padded_sequence is fed sentences sorted by decreasing length; the
    # second sort yields the permutation that restores the original order.
    sorted_seq_len,idx = word_seq_len.sort(0,descending=True)
    _,sorted_idx = idx.sort(0,descending=False)
    sorted_seq_tensor = word_embs[idx]
    packed_words = pack_padded_sequence(sorted_seq_tensor,sorted_seq_len.cpu(),True)
    output, self.hidden = self.lstm(packed_words,self.hidden)
    output, _ = pad_packed_sequence(output,batch_first=True)
    output = output[sorted_idx]
    output = self.dropout(output)
    ner_output = self.out_layer(output)
    pos_output = self.pos_out_layer(output)
    return ner_output,pos_output

  def _crf_log_partition(self, feats, transitions, start_idx):
    """Log-sum of the scores of all tag paths (forward algorithm) for one head."""
    batch_num, max_seq_len = self.mask.shape
    num_tags = transitions.size(0)
    score = torch.full((batch_num, num_tags), -10000.).to(device)
    score[:, start_idx] = 0.
    for n in range(max_seq_len):
      curr_mask = self.mask[:, n].unsqueeze(-1).expand_as(score)
      # Entry [b, i, j]: score of ending in tag j, plus moving to tag i and
      # emitting tag i at position n.
      prev_score = score.unsqueeze(1).expand(-1, *transitions.size())
      curr_emission = feats[:, n].unsqueeze(-1).expand_as(prev_score)
      curr_transition = transitions.unsqueeze(0).expand_as(prev_score)
      next_score = log_sum_exp(prev_score + curr_emission + curr_transition)
      # Padding positions keep the score reached at the sentence's last token.
      score = next_score * curr_mask + score*(1-curr_mask)
    return log_sum_exp(score)

  def _crf_gold_score(self, feats, tags, transitions, start_idx):
    """Score of the given tag sequences (emissions plus transitions) for one head."""
    batch_num, max_seq_len = self.mask.shape
    score = torch.zeros(batch_num, dtype=torch.float).to(device)
    start_column = torch.full((batch_num, 1), start_idx, dtype=torch.long).to(device)
    # After prepending START, tags[:, n] precedes the tag of position n.
    tags = torch.cat([start_column, tags], 1)
    for n in range(max_seq_len):
      curr_mask = self.mask[:,n]
      prev_tags, next_tags = tags[:, n], tags[:, n+1]
      curr_emission = feats[:, n].gather(1, next_tags.unsqueeze(1)).squeeze(1)
      curr_transition = transitions[next_tags, prev_tags]
      score = score + curr_emission*curr_mask + curr_transition*curr_mask
    return score

  def _crf_viterbi(self, feats, transitions, start_idx):
    """Best tag path per sentence for one head, as lists of tag indices."""
    batch_size, max_seq_len = self.mask.shape
    num_tags = transitions.size(0)
    seq_len_list = [int(self.mask[k].sum().item()) for k in range(batch_size)]

    backpointers = torch.zeros(batch_size, max_seq_len, num_tags, dtype=torch.long).to(device)
    score = torch.full((batch_size, num_tags), -10000.).to(device)
    score[:, start_idx] = 0

    for n in range(max_seq_len):
      # Entry [b, i, j]: best score ending in tag j, plus moving from j to i.
      candidates = score.unsqueeze(1) + transitions.unsqueeze(0)
      best_prev_score, best_prev_tag = candidates.max(2)
      curr_mask = self.mask[:, n].unsqueeze(1).expand(batch_size, num_tags)
      score = score * (1-curr_mask) + (best_prev_score + feats[:, n]) * curr_mask
      backpointers[:,n,:] = best_prev_tag

    _, last_best_tag_batch = torch.max(score, 1)
    best_path_batch = []
    for k, last_tag in enumerate(last_best_tag_batch.tolist()):
      path = [last_tag]
      curr_best_tag = last_tag
      # Walk the back-pointers from the last real token down to the second.
      for n in reversed(range(1, seq_len_list[k])):
        curr_best_tag = backpointers[k, n, curr_best_tag].item()
        path.insert(0, curr_best_tag)
      best_path_batch.append(path)
    return best_path_batch

  def _forward_alg(self, ner_feats,pos_feats):
    """Return the (NER, POS) log-partition values, one entry per sentence."""
    ner_score = self._crf_log_partition(ner_feats, self.transitions, self.tag_to_idx[START_TAG])
    pos_score = self._crf_log_partition(pos_feats, self.transitions_pos, self.pos_tag_to_idx[START_TAG])
    return ner_score,pos_score

  def _score_sentence(self, ner_feats, pos_feats, ner_tags, pos_tags):
    """Return the (NER, POS) scores of the gold tag sequences, one entry per sentence."""
    ner_score = self._crf_gold_score(ner_feats, ner_tags, self.transitions, self.tag_to_idx[START_TAG])
    pos_score = self._crf_gold_score(pos_feats, pos_tags, self.transitions_pos, self.pos_tag_to_idx[START_TAG])
    return ner_score,pos_score

  def _viterbi_decode(self, ner_feats, pos_feats):
    """Return the best (NER, POS) tag paths; each is a list of index lists."""
    best_ner_path_batch = self._crf_viterbi(ner_feats, self.transitions, self.tag_to_idx[START_TAG])
    # The POS lattice starts from the POS table's START index; it used to be
    # looked up in the NER table.
    best_pos_path_batch = self._crf_viterbi(pos_feats, self.transitions_pos, self.pos_tag_to_idx[START_TAG])
    return best_ner_path_batch,best_pos_path_batch

  def neg_log_likelihood(self, word_seq_tensor, word_seq_len, char_seq_tensor, ner_tags, pos_tags):
    """Return ``(ner_loss, pos_loss, ner_loss + pos_loss)`` averaged over the batch."""
    ner_lstm_feats,pos_lstm_feats = self._get_lstm_features(word_seq_tensor, word_seq_len, char_seq_tensor)
    ner_forward_score, pos_forward_score = self._forward_alg(ner_lstm_feats,pos_lstm_feats)
    ner_gold_score,pos_gold_score = self._score_sentence(ner_lstm_feats,pos_lstm_feats, ner_tags, pos_tags)
    ner_loss = torch.mean(ner_forward_score - ner_gold_score)
    pos_loss = torch.mean(pos_forward_score - pos_gold_score)
    loss =  ner_loss + pos_loss
    return ner_loss,pos_loss,loss

  def forward(self, word_seq_tensor, word_seq_len, char_seq_tensor):
    """Decode a padded batch and return the predicted (NER, POS) tag paths."""
    ner_lstm_feats,pos_lstm_feats = self._get_lstm_features(word_seq_tensor, word_seq_len, char_seq_tensor)
    ner_tag_seqs,pos_tag_seqs= self._viterbi_decode(ner_lstm_feats,pos_lstm_feats)
    return ner_tag_seqs,pos_tag_seqs

- The *evaluate_model* function creates the .out prediction file with the required output. It also runs the perl script to evaluate the NER F-1 score and measures the POS tagging accuracy.
- It returns the best F-1 score until the current epoch, the current F-1 score, the best POS accuracy until the current epoch, the current POS accuracy and a flag called save, which tells the training function when to save the model.

In [None]:
def evaluate_model(model,data,act_data,best_f1_score,best_acc,name="dev",model_num="1"):
  """Decode ``data``, write the predictions and score them.

  One line per token (position, word, gold NER tag, predicted NER tag) is
  written to ``<name><model_num>.out``, with a blank line after each sentence.
  The file is scored with the ``conlleval.v2`` perl script, whose report is
  stored in ``<name><model_num>_score.out``. POS accuracy is computed directly
  from the predictions.

  Args:
      model: tagger returning ``(ner_paths, pos_paths)`` for a padded batch.
      data: list of batches as produced by data_batching.
      act_data: the un-batched sentences (for the original words), in the same
          order as ``data``.
      best_f1_score: best NER F-1 seen so far.
      best_acc: best POS accuracy (in percent) seen so far.
      name: prefix of the output files.
      model_num: suffix identifying the model in the output file names.

  Returns:
      ``(best_f1_score, f1_score, best_acc, acc, save)`` where ``save`` is True
      when either the F-1 score or the POS accuracy improved.
  """
  save = False
  f1_score = 0.0
  correct,total = 0,0

  pred_file_name = name + model_num + '.out'
  score_file_name = name+model_num+'_score.out'

  # The context manager closes the file even if decoding fails; no_grad avoids
  # building autograd graphs that are never used during evaluation.
  with open(pred_file_name,'w') as pred_file, torch.no_grad():
    for batch_idx, batch in enumerate(data):
      rows = batch[5]
      ner_pred_tags,pos_pred_tags = model(*batch[:3])
      orig_data = act_data[batch_idx*batch_size:(batch_idx+1)*batch_size]
      for i,(row,ner_preds,pos_preds) in enumerate(zip(rows,ner_pred_tags,pos_pred_tags)):
        tokens = zip(orig_data[i][0],row[2],row[3],ner_preds,pos_preds)
        for token_idx,(word,ner_gold,pos_gold,ner_pred,pos_pred) in enumerate(tokens,start=1):
          if pos_pred == pos_gold:
            correct+=1
          total+=1
          pred_file.write(' '.join([str(token_idx),word,idx_to_tag[ner_gold],idx_to_tag[ner_pred]]))
          pred_file.write('\n')
        pred_file.write('\n')

  # Guard against an empty evaluation set instead of dividing by zero.
  acc = correct*100/total if total else 0.0

  os.system('perl conlleval.v2 < %s > %s' % (pred_file_name,score_file_name))
  with open(score_file_name,'r',encoding='utf-8') as score_file:
    eval_lines = [l.rstrip() for l in score_file]

  # The F-1 score is the last field of the second line of the report.
  if len(eval_lines) > 1:
    f1_score = float(eval_lines[1].strip().split()[-1])
    if f1_score>best_f1_score:
      best_f1_score = f1_score
      save = True
    if acc>best_acc:
      best_acc = acc
      save = True

  return best_f1_score,f1_score,best_acc,acc,save

- The *train_model* function contains the main loop for training the model.
- In each epoch, the model first back propagates and learns on the training batches, and the loss on the development dataset is reported. Every 5 epochs, the F-1 score and the POS accuracy on the development dataset are calculated. The model is saved whenever it beats its current best F-1 score or its current best POS accuracy.

In [None]:
def train_model(model,train_batches,dev_batches,model_num="1",curr_epoch = 0):
  """Train ``model`` with SGD and keep the best checkpoint.

  Each epoch visits the training batches in random order, clips the gradient
  norm to ``grad_clip`` and then reports the train and dev losses. Every fifth
  epoch the dev set is decoded with evaluate_model and the state_dict is saved
  to ``model_file_name`` when the NER F-1 or the POS accuracy improved. The
  learning rate follows ``learning_rate / (1 + epoch * decay_rate)``.

  Args:
      model: tagger exposing ``neg_log_likelihood``.
      train_batches: training batches from data_batching.
      dev_batches: development batches from data_batching.
      model_num: suffix passed to evaluate_model for its output file names.
      curr_epoch: number of epochs already completed; it only positions the
          learning-rate schedule.

  Returns:
      List with the summed training loss of every epoch.
  """
  # `import tqdm` does not load the notebook submodule, so `tqdm.notebook` is
  # not guaranteed to exist as an attribute; import it explicitly.
  from tqdm.notebook import tqdm as progress_bar

  optimizer = torch.optim.SGD(model.parameters(),lr=learning_rate,momentum=momentum)
  if curr_epoch>0:
    # A scheduler created with last_epoch >= 0 expects `initial_lr` to be set.
    for g in optimizer.param_groups:
      g['initial_lr'] = learning_rate
  # Use the configured decay rate rather than a second hard-coded copy of it.
  lambda_schedule = lambda x: 1/(1+x*decay_rate)
  # NOTE(review): `verbose` is reported as deprecated by recent PyTorch
  # releases — confirm against the installed version.
  scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,lambda_schedule,verbose=True,last_epoch=curr_epoch-1)

  losses = []
  best_dev_f1 = -1.
  best_dev_acc = -1.

  for epoch in range(1,num_epochs+1):
    total_ner_loss,total_pos_loss,total_loss = 0,0,0
    model.train()
    model.zero_grad()
    for idx in progress_bar(np.random.permutation(len(train_batches)),total=len(train_batches)):
      batch = train_batches[idx]
      ner_loss,pos_loss,loss = model.neg_log_likelihood(*batch[:5])
      total_ner_loss += ner_loss.data
      total_pos_loss += pos_loss.data
      total_loss += loss.data
      loss.backward()
      nn.utils.clip_grad_norm_(model.parameters(),grad_clip)
      optimizer.step()
      model.zero_grad()
    losses.append(total_loss)

    model.eval()
    total_dev_ner_loss,total_dev_pos_loss,total_dev_loss = 0,0,0
    # The dev loss is only reported, never back-propagated, so skip autograd.
    with torch.no_grad():
      for idx in progress_bar(np.random.permutation(len(dev_batches)),total=len(dev_batches)):
        batch = dev_batches[idx]
        ner_loss,pos_loss,loss = model.neg_log_likelihood(*batch[:5])
        total_dev_ner_loss += ner_loss.data
        total_dev_pos_loss += pos_loss.data
        total_dev_loss += loss.data

    # float() accepts both tensors and the initial integer 0.
    print("Epoch:",epoch,
          "train_ner_loss:",float(total_ner_loss)/len(train_batches),
          "train_pos_loss:",float(total_pos_loss)/len(train_batches),
          "train_loss:",float(total_loss)/len(train_batches),
          "val_ner_loss:",float(total_dev_ner_loss)/len(dev_batches),
          "val_pos_loss:",float(total_dev_pos_loss)/len(dev_batches),
          "val_loss:",float(total_dev_loss)/len(dev_batches))

    if epoch%5==0:
      best_dev_f1 , dev_f1, best_dev_acc,dev_acc, save = evaluate_model(model,
                                                                        dev_batches,
                                                                        validation_data,
                                                                        best_dev_f1,
                                                                        best_dev_acc,
                                                                        "dev",
                                                                        model_num)
      if save:
        print('NER F1 Score:',dev_f1,"POS Accuracy:",dev_acc,"saving model to",model_file_name)
        torch.save(model.state_dict(),model_file_name)

    model.zero_grad()
    scheduler.step()
  return losses

- The *infer* function creates the .out prediction file with required output without the column for gold tags.

In [None]:
def infer(model,data,act_data,filename):
  """Write the model's predicted tags for every sentence to `filename`.

  Each token produces one line `<position> <word> <tag>`, where position is
  1-based within its sentence, and a blank line follows every sentence.
  No gold-tag column is written.

  Args:
    model: callable invoked as `model(*batch[:3])`; it must return a pair
      whose second element is an iterable with one sequence of tag indices
      per sentence in the batch.
    data: sequence of batches. `batch[:3]` are passed to the model and
      `batch[4]` is only used to bound how many sentences of the batch are
      written.
    act_data: the un-batched sentences, in the same order in which `data` was
      batched; `act_data[k][0]` is the list of words of sentence k.
    filename: path of the prediction file to create (overwritten if it exists).

  Relies on the module-level `batch_size` (it has to match the batch size that
  was used to build `data`, otherwise words and predictions are misaligned)
  and on the module-level `idx_to_tag` lookup.
  """
  # `with` guarantees the file is flushed and closed even when the model or
  # the tag lookup raises part-way through the data.
  with open(filename,'w') as pred_file:
    for batch_idx,batch in enumerate(data):
      rows = batch[4]
      _,pred_tags = model(*batch[:3])
      # original sentences that correspond to this batch
      orig_data = act_data[batch_idx*batch_size:(batch_idx+1)*batch_size]
      for i,(_,preds) in enumerate(zip(rows,pred_tags)):
        # zip stops at the end of the word list, so predictions beyond the
        # real sentence length are ignored
        for position,(word,pred) in enumerate(zip(orig_data[i][0],preds),start=1):
          pred_file.write(' '.join([str(position),word,idx_to_tag[pred]]))
          pred_file.write('\n')
        pred_file.write('\n')

In [None]:
!gzip -d glove.6B.100d.gz

gzip: glove.6B.100d.gz: No such file or directory


# Bonus Task: Using BLSTM-CNN with GloVe word embeddings

- We use all the above functions to preprocess and prepare our data for training the model.
- For this task, we load the GloVe embeddings for the words in the train, dev and test sets into our embedding layer.
- Additionally, we set use_cnn_for_char_level = True, which gives the model a CNN layer that takes character-level embeddings as input and creates a character-level representation for each word. These representations are then concatenated with the word embeddings to form the inputs to the LSTM layer.
- After forming the initial training and validation data, we sort the sentences (and corresponding tags) according to length.
- The model is trained with the following parameters:

 - use_cnn_for_char_level = True 
 - word_embedding_dim = 100 (adding 1 more feature for capitalization makes the dimension  101)
 - pre_embeddings = GloVe Embeddings Dictionary
 - lstm_hidden_dim = 256
 - dropout = 0.33
 - output_dimension = 128
 - num_epochs = 100
 - batch_size = 10
 - learning_rate = 0.0125
 - momentum = 0.9
 - decay_rate = 0.05
 - grad_clip = 5.0
 - loss function = Cross Entropy Loss
 - optimizer = SGD

In [None]:
# Build the GloVe embedding dictionary from the decompressed 'glove.6B.100d' file.
# NOTE(review): the second argument (100) is presumably the vector dimension and
# should then agree with word_embedding_dim — confirm against get_embedding_data.
pre_embeddings = get_embedding_data('glove.6B.100d',100)

In [None]:
# ---- Settings for this run (override the defaults from the constants cell) ----
word_embedding_dim = 100
model_file_name = 'BLSTM3_HPS.pt'
batch_size = 10
use_cnn_for_char_level = True
learning_rate = 0.0125

# ---- Start from empty lookup tables ----
# (presumably repopulated by the make_* helpers called below)
word_to_idx = {}
idx_to_word = {}
char_to_idx = {}
idx_to_char = {}

pos_idx_to_tag = {}
pos_tag_to_idx = {}
tag_to_idx = {}
idx_to_tag = {}

# ---- Build the datasets, ordered from the shortest sentence to the longest ----
training_data = sorted(make_data(train),key=lambda x:len(x[0]))
validation_data = sorted(make_data(dev),key=lambda x:len(x[0]))
testing_data = sorted(make_data(test,has_tags=False),key=lambda x:len(x[0]))

# ---- Vocabulary: training words first, then the dev / test words ----
make_word_to_idx(training_data)
make_word_to_idx(validation_data,is_valid_or_test=True)
make_word_to_idx(testing_data,is_valid_or_test=True)
make_tag_to_idx(training_data)

build_embedding_table()

# ---- Convert to index tensors and group into batches ----
training_data_tensors = make_numeric_data(training_data)
validation_data_tensors = make_numeric_data(validation_data)

train_batches = data_batching(training_data_tensors)
dev_batches = data_batching(validation_data_tensors)

# ---- Seed every random number generator that takes part in training ----
# Seeding only Python's `random` is not enough for a reproducible run: the
# training code draws batch orders with np.random.permutation and the model is
# a PyTorch module, so NumPy and PyTorch have to be seeded as well.
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# ---- Create the model and train it ----
model = BLSTM()
model.to(device)
train_model(model,train_batches,dev_batches,model_num="3")

Adjusting learning rate of group 0 to 1.2500e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 1 train_ner_loss: 5.8892145544946635 train_pos_loss: 16.77828969312875 train_loss: 22.6675048990994 val_ner_loss: 2.5075551673383467 val_pos_loss: 7.818577483339337 val_loss: 10.326131771208573
Adjusting learning rate of group 0 to 1.1905e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 2 train_ner_loss: 2.6695156797239825 train_pos_loss: 7.892096032771848 train_loss: 10.561599660190128 val_ner_loss: 1.285613485990409 val_pos_loss: 4.840370376103206 val_loss: 6.12598430182817
Adjusting learning rate of group 0 to 1.1364e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 3 train_ner_loss: 1.6661161151705304 train_pos_loss: 5.7225103193795865 train_loss: 7.388624317253169 val_ner_loss: 1.019538373699793 val_pos_loss: 4.043476027782781 val_loss: 5.063016072473883
Adjusting learning rate of group 0 to 1.0870e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 4 train_ner_loss: 1.416321981581471 train_pos_loss: 4.935992484573049 train_loss: 6.352313000333556 val_ner_loss: 0.8782394365206232 val_pos_loss: 3.650063814278638 val_loss: 4.528303426693084
Adjusting learning rate of group 0 to 1.0417e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 5 train_ner_loss: 1.2543119566690293 train_pos_loss: 4.466818699445463 train_loss: 5.721130493245497 val_ner_loss: 0.8624512070537644 val_pos_loss: 3.486561360208033 val_loss: 4.349013094943263
NER F1 Score: 89.7 POS Accuracy: 93.0706890534724 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 1.0000e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 6 train_ner_loss: 1.1510076541913359 train_pos_loss: 4.15958620851401 train_loss: 5.310598667340727 val_ner_loss: 0.7773027667394633 val_pos_loss: 3.211924616129323 val_loss: 3.9892286141255404
Adjusting learning rate of group 0 to 9.6154e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 7 train_ner_loss: 1.0752672191617327 train_pos_loss: 3.921614083868412 train_loss: 4.996879104298699 val_ner_loss: 0.7397166315348073 val_pos_loss: 3.152667394632565 val_loss: 3.892383586432817
Adjusting learning rate of group 0 to 9.2593e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 8 train_ner_loss: 1.0126168096439292 train_pos_loss: 3.733393225692128 train_loss: 4.746013618453969 val_ner_loss: 0.7259990241410753 val_pos_loss: 3.0163127448442 val_loss: 3.742313439976585
Adjusting learning rate of group 0 to 8.9286e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 9 train_ner_loss: 0.959897825764051 train_pos_loss: 3.5779943790652102 train_loss: 4.537896357988659 val_ner_loss: 0.6716286607022244 val_pos_loss: 2.987437663229467 val_loss: 3.659065004728026
Adjusting learning rate of group 0 to 8.6207e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 10 train_ner_loss: 0.9142652719000167 train_pos_loss: 3.4366758828802535 train_loss: 4.350941643387258 val_ner_loss: 0.6519613128574162 val_pos_loss: 2.8616304864800974 val_loss: 3.5135906120542146
NER F1 Score: 91.16 POS Accuracy: 94.15060684788088 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 8.3333e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 11 train_ner_loss: 0.8747438885037941 train_pos_loss: 3.331582708785023 train_loss: 4.206326352985323 val_ner_loss: 0.6621316695419444 val_pos_loss: 2.824547495553404 val_loss: 3.486679209068804
Adjusting learning rate of group 0 to 8.0645e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 12 train_ner_loss: 0.8481885873134173 train_pos_loss: 3.2547342759756503 train_loss: 4.102925062020514 val_ner_loss: 0.6021796674481042 val_pos_loss: 2.811880677852576 val_loss: 3.4140617964247117
Adjusting learning rate of group 0 to 7.8125e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 13 train_ner_loss: 0.8202892097335723 train_pos_loss: 3.139653975045864 train_loss: 3.9599498624082723 val_ner_loss: 0.6045825529510762 val_pos_loss: 2.8046822231853388 val_loss: 3.409266579048091
Adjusting learning rate of group 0 to 7.5758e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 14 train_ner_loss: 0.7868950394012675 train_pos_loss: 3.0514789807788527 train_loss: 3.838375323132088 val_ner_loss: 0.6128248055317903 val_pos_loss: 2.740578423315922 val_loss: 3.3534030529538903
Adjusting learning rate of group 0 to 7.3529e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 15 train_ner_loss: 0.7591810881733656 train_pos_loss: 3.003551195588726 train_loss: 3.762729840727151 val_ner_loss: 0.5851240790199478 val_pos_loss: 2.74142412081232 val_loss: 3.326548991354467
NER F1 Score: 92.37 POS Accuracy: 94.46857187172826 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 7.1429e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 16 train_ner_loss: 0.7296010654238242 train_pos_loss: 2.9301533053285524 train_loss: 3.6597536378418947 val_ner_loss: 0.610427988365679 val_pos_loss: 2.68311320807817 val_loss: 3.293539701346362
Adjusting learning rate of group 0 to 6.9444e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 17 train_ner_loss: 0.7157225096679036 train_pos_loss: 2.88977254367495 train_loss: 3.6054957862533357 val_ner_loss: 0.5819910582616624 val_pos_loss: 2.5844930599333575 val_loss: 3.1664842061419307
Adjusting learning rate of group 0 to 6.7568e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 18 train_ner_loss: 0.7013791257974066 train_pos_loss: 2.833623131566878 train_loss: 3.5350034788817544 val_ner_loss: 0.5682579722115905 val_pos_loss: 2.5883386265647514 val_loss: 3.1565993251305837
Adjusting learning rate of group 0 to 6.5789e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 19 train_ner_loss: 0.6695236195875375 train_pos_loss: 2.795518301263342 train_loss: 3.4650453818170446 val_ner_loss: 0.5648289221508015 val_pos_loss: 2.5990041243583395 val_loss: 3.16383383803134
Adjusting learning rate of group 0 to 6.4103e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 20 train_ner_loss: 0.6519412599936416 train_pos_loss: 2.734436238742495 train_loss: 3.3863816111782854 val_ner_loss: 0.5411612211111987 val_pos_loss: 2.5554945008555476 val_loss: 3.0966557659402016
NER F1 Score: 92.68 POS Accuracy: 94.86215052929543 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 6.2500e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 21 train_ner_loss: 0.6422368737361366 train_pos_loss: 2.685778474712308 train_loss: 3.32801392334473 val_ner_loss: 0.5457853817458799 val_pos_loss: 2.5658936954025577 val_loss: 3.111678505493516
Adjusting learning rate of group 0 to 6.0976e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 22 train_ner_loss: 0.6397889934435457 train_pos_loss: 2.6599500578510673 train_loss: 3.299737911211641 val_ner_loss: 0.5440913483457538 val_pos_loss: 2.569529068916607 val_loss: 3.113620725076549
Adjusting learning rate of group 0 to 5.9524e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 23 train_ner_loss: 0.6161821096877085 train_pos_loss: 2.610764598273849 train_loss: 3.2269452421405935 val_ner_loss: 0.5288018658113067 val_pos_loss: 2.5493517609082312 val_loss: 3.0781563091003243
Adjusting learning rate of group 0 to 5.8140e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 24 train_ner_loss: 0.5999478574909315 train_pos_loss: 2.564514852349483 train_loss: 3.1644625062541696 val_ner_loss: 0.5560062639307907 val_pos_loss: 2.532302900418768 val_loss: 3.088309604084114
Adjusting learning rate of group 0 to 5.6818e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 25 train_ner_loss: 0.5748422125166778 train_pos_loss: 2.5279115113200468 train_loss: 3.1027540495747163 val_ner_loss: 0.531618277689909 val_pos_loss: 2.4732950963616713 val_loss: 3.0049144733879682
NER F1 Score: 92.95 POS Accuracy: 95.15297219744852 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 5.5556e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 26 train_ner_loss: 0.5812684937744955 train_pos_loss: 2.515671091925867 train_loss: 3.096940603631588 val_ner_loss: 0.5292872887866985 val_pos_loss: 2.469168979084114 val_loss: 2.9984560040300794
Adjusting learning rate of group 0 to 5.4348e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 27 train_ner_loss: 0.5683211822204386 train_pos_loss: 2.470155070828469 train_loss: 3.0384784517803536 val_ner_loss: 0.5353851318359375 val_pos_loss: 2.4831681924869415 val_loss: 3.018555742862932
Adjusting learning rate of group 0 to 5.3191e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 28 train_ner_loss: 0.567078166042987 train_pos_loss: 2.434197668237158 train_loss: 3.001272332596731 val_ner_loss: 0.5280270095517381 val_pos_loss: 2.4618405388823845 val_loss: 2.9898688676377883
Adjusting learning rate of group 0 to 5.2083e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 29 train_ner_loss: 0.5649056972226276 train_pos_loss: 2.4068384456825385 train_loss: 2.971745486574383 val_ner_loss: 0.529983388587446 val_pos_loss: 2.4764318108902197 val_loss: 3.006416254840598
Adjusting learning rate of group 0 to 5.1020e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 30 train_ner_loss: 0.5396405627840435 train_pos_loss: 2.3770185983363907 train_loss: 2.9166589575341892 val_ner_loss: 0.5221358417434033 val_pos_loss: 2.4861545177976403 val_loss: 3.0082916347712536
NER F1 Score: 92.99 POS Accuracy: 95.14715576408547 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 5.0000e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 31 train_ner_loss: 0.5293173150589977 train_pos_loss: 2.3747876188292194 train_loss: 2.904104038108739 val_ner_loss: 0.514654110075761 val_pos_loss: 2.46791010699973 val_loss: 2.9825654043587897
Adjusting learning rate of group 0 to 4.9020e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 32 train_ner_loss: 0.5315765930542237 train_pos_loss: 2.3569666561561875 train_loss: 2.888544185707138 val_ner_loss: 0.5082043474620632 val_pos_loss: 2.4567494680970823 val_loss: 2.9649535077449567
Adjusting learning rate of group 0 to 4.8077e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 33 train_ner_loss: 0.5166360500099024 train_pos_loss: 2.33567544378544 train_loss: 2.8523120231195795 val_ner_loss: 0.5161974904173046 val_pos_loss: 2.4549349474288547 val_loss: 2.9711323059257926
Adjusting learning rate of group 0 to 4.7170e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 34 train_ner_loss: 0.5062535994048115 train_pos_loss: 2.2952603167736823 train_loss: 2.801513509006004 val_ner_loss: 0.5128357637169039 val_pos_loss: 2.4471790852395534 val_loss: 2.9600133538589697
Adjusting learning rate of group 0 to 4.6296e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 35 train_ner_loss: 0.5000458476223733 train_pos_loss: 2.2753663575195966 train_loss: 2.7754134266594397 val_ner_loss: 0.5160024241686555 val_pos_loss: 2.5048548453822947 val_loss: 3.0208596001440924
NER F1 Score: 93.26 POS Accuracy: 95.15297219744852 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 4.5455e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 36 train_ner_loss: 0.4995009286790152 train_pos_loss: 2.256529418049533 train_loss: 2.7560307131837893 val_ner_loss: 0.4974853779465733 val_pos_loss: 2.4032573425117074 val_loss: 2.9007438197946684
Adjusting learning rate of group 0 to 4.4643e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 37 train_ner_loss: 0.484192953179724 train_pos_loss: 2.248788743276768 train_loss: 2.7329808413942627 val_ner_loss: 0.5190322392268552 val_pos_loss: 2.4068965856898417 val_loss: 2.9259288249166966
Adjusting learning rate of group 0 to 4.3860e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 38 train_ner_loss: 0.4757174949315168 train_pos_loss: 2.2216497196047365 train_loss: 2.6973700240264344 val_ner_loss: 0.5161439747219471 val_pos_loss: 2.415997156148685 val_loss: 2.932140339348433
Adjusting learning rate of group 0 to 4.3103e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 39 train_ner_loss: 0.48769622456637757 train_pos_loss: 2.2101081710723816 train_loss: 2.697806350066711 val_ner_loss: 0.510629659427346 val_pos_loss: 2.4247082976855188 val_loss: 2.935337913139409
Adjusting learning rate of group 0 to 4.2373e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 40 train_ner_loss: 0.4752349955308747 train_pos_loss: 2.1924600124041027 train_loss: 2.6676957815627085 val_ner_loss: 0.5089951221125271 val_pos_loss: 2.4147025776184257 val_loss: 2.9236979635716858
NER F1 Score: 93.43 POS Accuracy: 95.31777114273527 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 4.1667e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 41 train_ner_loss: 0.46163486432361156 train_pos_loss: 2.160689645961891 train_loss: 2.622324713871748 val_ner_loss: 0.5001855679822587 val_pos_loss: 2.448452556511167 val_loss: 2.9486386521748917
Adjusting learning rate of group 0 to 4.0984e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 42 train_ner_loss: 0.4500251143991828 train_pos_loss: 2.1645814006212474 train_loss: 2.6146102610073383 val_ner_loss: 0.5162354834828665 val_pos_loss: 2.4259244275711453 val_loss: 2.942161186284222
Adjusting learning rate of group 0 to 4.0323e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 43 train_ner_loss: 0.45974578755628753 train_pos_loss: 2.1531815481154104 train_loss: 2.6129255441127417 val_ner_loss: 0.5066795239187455 val_pos_loss: 2.4053638469245318 val_loss: 2.9120427112414444
Adjusting learning rate of group 0 to 3.9683e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 44 train_ner_loss: 0.45407415979142346 train_pos_loss: 2.1111233986720315 train_loss: 2.5651985763946796 val_ner_loss: 0.508465066079791 val_pos_loss: 2.3707532195605188 val_loss: 2.8792175820650217
Adjusting learning rate of group 0 to 3.9062e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 45 train_ner_loss: 0.4497010865316461 train_pos_loss: 2.1022191877918615 train_loss: 2.551919052806037 val_ner_loss: 0.5085632148324928 val_pos_loss: 2.426432584823037 val_loss: 2.934996503230818
NER F1 Score: 93.34 POS Accuracy: 95.37787428748692 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 3.8462e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 46 train_ner_loss: 0.43285391758568215 train_pos_loss: 2.1007339854173614 train_loss: 2.5335892873895096 val_ner_loss: 0.490800972974266 val_pos_loss: 2.397609919567273 val_loss: 2.8884104967804394
Adjusting learning rate of group 0 to 3.7879e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 47 train_ner_loss: 0.4321428315491578 train_pos_loss: 2.0744172873061206 train_loss: 2.5065587344688125 val_ner_loss: 0.5004070622776702 val_pos_loss: 2.4182505319029177 val_loss: 2.918659133251531
Adjusting learning rate of group 0 to 3.7313e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 48 train_ner_loss: 0.42930540933539024 train_pos_loss: 2.0542195773744996 train_loss: 2.48352335801993 val_ner_loss: 0.4923987045068219 val_pos_loss: 2.4014496817025397 val_loss: 2.893848606076639
Adjusting learning rate of group 0 to 3.6765e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 49 train_ner_loss: 0.43485871277466226 train_pos_loss: 2.0509980286336726 train_loss: 2.4858592251605236 val_ner_loss: 0.49225974838740544 val_pos_loss: 2.4142169347757565 val_loss: 2.9064763753489733
Adjusting learning rate of group 0 to 3.6232e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 50 train_ner_loss: 0.42875401634944127 train_pos_loss: 2.0632788395388593 train_loss: 2.4920366833097063 val_ner_loss: 0.5029784980356178 val_pos_loss: 2.385204304192183 val_loss: 2.8881811312364913
NER F1 Score: 93.6 POS Accuracy: 95.43603862111753 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 3.5714e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 51 train_ner_loss: 0.4309634971491411 train_pos_loss: 2.0379878903644095 train_loss: 2.4689527719000166 val_ner_loss: 0.4970287136110861 val_pos_loss: 2.3902844695605188 val_loss: 2.8873150300567363
Adjusting learning rate of group 0 to 3.5211e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 52 train_ner_loss: 0.4224735435602902 train_pos_loss: 2.0301869540631254 train_loss: 2.4526593575404436 val_ner_loss: 0.5084999370300117 val_pos_loss: 2.3680812165098164 val_loss: 2.8765819890354827
Adjusting learning rate of group 0 to 3.4722e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 53 train_ner_loss: 0.41546740795947296 train_pos_loss: 2.01098388509006 train_loss: 2.426449990097565 val_ner_loss: 0.49876812761730005 val_pos_loss: 2.419177844132745 val_loss: 2.9179464114846003
Adjusting learning rate of group 0 to 3.4247e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 54 train_ner_loss: 0.4105915173918029 train_pos_loss: 1.9789517881712808 train_loss: 2.3895428983905935 val_ner_loss: 0.5013220619399541 val_pos_loss: 2.390850319986041 val_loss: 2.892173393315472
Adjusting learning rate of group 0 to 3.3784e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 55 train_ner_loss: 0.40094372811040696 train_pos_loss: 1.9671301049658105 train_loss: 2.3680751360281853 val_ner_loss: 0.5145624693944749 val_pos_loss: 2.3544443443804033 val_loss: 2.8690053626508467
NER F1 Score: 93.43 POS Accuracy: 95.4515491100857 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 3.3333e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 56 train_ner_loss: 0.38729351468687456 train_pos_loss: 1.9708576876772015 train_loss: 2.358151365233072 val_ner_loss: 0.5040830232911563 val_pos_loss: 2.3865599178786923 val_loss: 2.890642941169849
Adjusting learning rate of group 0 to 3.2895e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 57 train_ner_loss: 0.39947318394554704 train_pos_loss: 1.9647276244058538 train_loss: 2.3641996682684288 val_ner_loss: 0.4996379665407736 val_pos_loss: 2.398658070852846 val_loss: 2.8982952018979646
Adjusting learning rate of group 0 to 3.2468e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 58 train_ner_loss: 0.39045700063062877 train_pos_loss: 1.9325748415610406 train_loss: 2.323032575102151 val_ner_loss: 0.5082067220286608 val_pos_loss: 2.4015379804012067 val_loss: 2.9097450102440563
Adjusting learning rate of group 0 to 3.2051e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 59 train_ner_loss: 0.38254173029415445 train_pos_loss: 1.943839349931204 train_loss: 2.3263800215768846 val_ner_loss: 0.4965475120874234 val_pos_loss: 2.373062705443984 val_loss: 2.8696074472037103
Adjusting learning rate of group 0 to 3.1646e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 60 train_ner_loss: 0.3898012493354945 train_pos_loss: 1.9317422552535024 train_loss: 2.321540369360824 val_ner_loss: 0.48946708217477936 val_pos_loss: 2.3625968471384184 val_loss: 2.8520636214990094
NER F1 Score: 93.6 POS Accuracy: 95.50583582147428 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 3.1250e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 61 train_ner_loss: 0.3845417615650017 train_pos_loss: 1.9140030528164609 train_loss: 2.298545221553953 val_ner_loss: 0.4893065790621623 val_pos_loss: 2.3956502864958575 val_loss: 2.884957173372208
Adjusting learning rate of group 0 to 3.0864e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 62 train_ner_loss: 0.3842599574846773 train_pos_loss: 1.919365514509673 train_loss: 2.3036262456220813 val_ner_loss: 0.4945798318736491 val_pos_loss: 2.386161518371758 val_loss: 2.8807427573959834
Adjusting learning rate of group 0 to 3.0488e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 63 train_ner_loss: 0.3815777087068462 train_pos_loss: 1.9061021149516344 train_loss: 2.287679416485991 val_ner_loss: 0.49928600300285936 val_pos_loss: 2.382477774056646 val_loss: 2.881765931758826
Adjusting learning rate of group 0 to 3.0120e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 64 train_ner_loss: 0.377748577176451 train_pos_loss: 1.8835592543466477 train_loss: 2.2613104374270345 val_ner_loss: 0.498092783287554 val_pos_loss: 2.3818797350616894 val_loss: 2.8799730460307096
Adjusting learning rate of group 0 to 2.9762e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 65 train_ner_loss: 0.37915478808789194 train_pos_loss: 1.8658037650100068 train_loss: 2.2449608332638427 val_ner_loss: 0.4971818291831772 val_pos_loss: 2.366726482292417 val_loss: 2.8639091909447045
NER F1 Score: 93.79 POS Accuracy: 95.54848966613673 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 2.9412e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 66 train_ner_loss: 0.3719182742922365 train_pos_loss: 1.8828603834848232 train_loss: 2.2547789020805538 val_ner_loss: 0.5028916064875271 val_pos_loss: 2.3592094839134545 val_loss: 2.862101530135537
Adjusting learning rate of group 0 to 2.9070e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 67 train_ner_loss: 0.36862807626959637 train_pos_loss: 1.8758988739889093 train_loss: 2.2445305333764174 val_ner_loss: 0.4929456023730187 val_pos_loss: 2.37098627887473 val_loss: 2.8639315294601047
Adjusting learning rate of group 0 to 2.8736e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 68 train_ner_loss: 0.373353760905708 train_pos_loss: 1.8632095876417611 train_loss: 2.2365633078302203 val_ner_loss: 0.4880215427717489 val_pos_loss: 2.394279545940652 val_loss: 2.8823001212963795
Adjusting learning rate of group 0 to 2.8409e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 69 train_ner_loss: 0.36487557460181785 train_pos_loss: 1.846368607717645 train_loss: 2.211243693712475 val_ner_loss: 0.4920982778587671 val_pos_loss: 2.3488335073509545 val_loss: 2.8409321809708215
Adjusting learning rate of group 0 to 2.8090e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 70 train_ner_loss: 0.36057514091425535 train_pos_loss: 1.8481966493287192 train_loss: 2.2087749254711473 val_ner_loss: 0.493638931846069 val_pos_loss: 2.363766365161203 val_loss: 2.857402702573397
NER F1 Score: 93.56 POS Accuracy: 95.56400015510489 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 2.7778e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 71 train_ner_loss: 0.358766480713705 train_pos_loss: 1.8431218142824384 train_loss: 2.2018891500583724 val_ner_loss: 0.49479609332785485 val_pos_loss: 2.3813652456322045 val_loss: 2.8761614269069704
Adjusting learning rate of group 0 to 2.7473e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 72 train_ner_loss: 0.36243665210400683 train_pos_loss: 1.8280754878252168 train_loss: 2.1905134021639427 val_ner_loss: 0.5030856173732439 val_pos_loss: 2.349525473646884 val_loss: 2.852612058436149
Adjusting learning rate of group 0 to 2.7174e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 73 train_ner_loss: 0.35002562743652016 train_pos_loss: 1.798645223117495 train_loss: 2.148672031354236 val_ner_loss: 0.496339605589765 val_pos_loss: 2.361827663454611 val_loss: 2.8581678406992976
Adjusting learning rate of group 0 to 2.6882e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 74 train_ner_loss: 0.34957586470407354 train_pos_loss: 1.8294942396493497 train_loss: 2.179069086422198 val_ner_loss: 0.489492410885154 val_pos_loss: 2.3804618549621757 val_loss: 2.869955716971362
Adjusting learning rate of group 0 to 2.6596e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 75 train_ner_loss: 0.3497351505821589 train_pos_loss: 1.8122091159731488 train_loss: 2.161943737231071 val_ner_loss: 0.49073430921571054 val_pos_loss: 2.3631145026567 val_loss: 2.8538484161113113
NER F1 Score: 93.73 POS Accuracy: 95.6124704331304 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 2.6316e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 76 train_ner_loss: 0.3555422039171948 train_pos_loss: 1.7907288126980487 train_loss: 2.1462695507942797 val_ner_loss: 0.49637333323014227 val_pos_loss: 2.3882106813985953 val_loss: 2.8845837507880043
Adjusting learning rate of group 0 to 2.6042e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 77 train_ner_loss: 0.34330129591602737 train_pos_loss: 1.787655963350567 train_loss: 2.1309583993495664 val_ner_loss: 0.49488223732720193 val_pos_loss: 2.3635421005380945 val_loss: 2.858424293891841
Adjusting learning rate of group 0 to 2.5773e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 78 train_ner_loss: 0.3420889398270722 train_pos_loss: 1.7856873657959473 train_loss: 2.1277772421197465 val_ner_loss: 0.49211054645285485 val_pos_loss: 2.367100256664265 val_loss: 2.8592115946393193
Adjusting learning rate of group 0 to 2.5510e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 79 train_ner_loss: 0.34613216265270597 train_pos_loss: 1.784185062176868 train_loss: 2.130315718291361 val_ner_loss: 0.49547935287959294 val_pos_loss: 2.3677097287576547 val_loss: 2.8631880262743157
Adjusting learning rate of group 0 to 2.5253e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 80 train_ner_loss: 0.340667887478371 train_pos_loss: 1.769841026830387 train_loss: 2.1105082424741495 val_ner_loss: 0.4914804508088301 val_pos_loss: 2.3483357278345642 val_loss: 2.839816486457583
Adjusting learning rate of group 0 to 2.5000e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 81 train_ner_loss: 0.3363188373955033 train_pos_loss: 1.7669533595313542 train_loss: 2.103273764540944 val_ner_loss: 0.4880565016688806 val_pos_loss: 2.361802158850414 val_loss: 2.8498594960149495
Adjusting learning rate of group 0 to 2.4752e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 82 train_ner_loss: 0.3452617907380962 train_pos_loss: 1.757636438615327 train_loss: 2.102896397077218 val_ner_loss: 0.49179855478600054 val_pos_loss: 2.3760771737662103 val_loss: 2.8678764761009545
Adjusting learning rate of group 0 to 2.4510e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 83 train_ner_loss: 0.32983707888592395 train_pos_loss: 1.7381766881045697 train_loss: 2.0680149070734655 val_ner_loss: 0.4957786362178044 val_pos_loss: 2.3485251654809076 val_loss: 2.8443017789197587
Adjusting learning rate of group 0 to 2.4272e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 84 train_ner_loss: 0.32951431325310626 train_pos_loss: 1.7509594612554202 train_loss: 2.0804745481362574 val_ner_loss: 0.5009420433374234 val_pos_loss: 2.3871023744258824 val_loss: 2.8880441099491176
Adjusting learning rate of group 0 to 2.4038e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 85 train_ner_loss: 0.3271972167643012 train_pos_loss: 1.7442417666465144 train_loss: 2.0714403677972815 val_ner_loss: 0.4867736640512428 val_pos_loss: 2.3766275455353925 val_loss: 2.8634012095866352
NER F1 Score: 93.84 POS Accuracy: 95.61440924425142 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 2.3810e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 86 train_ner_loss: 0.331847085882776 train_pos_loss: 1.7093301458785024 train_loss: 2.0411783311270013 val_ner_loss: 0.49175462530394454 val_pos_loss: 2.3733807214742435 val_loss: 2.8651327083708575
Adjusting learning rate of group 0 to 2.3585e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 87 train_ner_loss: 0.33315268447193963 train_pos_loss: 1.7289524396472649 train_loss: 2.0621083000646263 val_ner_loss: 0.4868638975819525 val_pos_loss: 2.363257680227846 val_loss: 2.8501219295974423
Adjusting learning rate of group 0 to 2.3364e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 88 train_ner_loss: 0.3254089966227485 train_pos_loss: 1.7277033973169613 train_loss: 2.05311076524975 val_ner_loss: 0.4893346781002342 val_pos_loss: 2.3776150134523597 val_loss: 2.866949691552594
Adjusting learning rate of group 0 to 2.3148e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 89 train_ner_loss: 0.3210937825737992 train_pos_loss: 1.7137875445609574 train_loss: 2.0348826300867247 val_ner_loss: 0.48944953676603026 val_pos_loss: 2.368659731290526 val_loss: 2.8581099716318445
Adjusting learning rate of group 0 to 2.2936e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 90 train_ner_loss: 0.31767793112074716 train_pos_loss: 1.7085101004836558 train_loss: 2.026189986032355 val_ner_loss: 0.4828489891733835 val_pos_loss: 2.3794942630471003 val_loss: 2.862343560034672
NER F1 Score: 93.83 POS Accuracy: 95.65124665555082 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 2.2727e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 91 train_ner_loss: 0.32688672738205676 train_pos_loss: 1.7175336943378918 train_loss: 2.04442300726526 val_ner_loss: 0.4921324892071551 val_pos_loss: 2.372444966340958 val_loss: 2.8645789946190563
Adjusting learning rate of group 0 to 2.2523e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 92 train_ner_loss: 0.32900084838460014 train_pos_loss: 1.699948891709056 train_loss: 2.0289509412525017 val_ner_loss: 0.4987045859740859 val_pos_loss: 2.369315463459114 val_loss: 2.8680217643979646
Adjusting learning rate of group 0 to 2.2321e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 93 train_ner_loss: 0.31792349684946214 train_pos_loss: 1.6944991322339893 train_loss: 2.0124212039797365 val_ner_loss: 0.4929328500709204 val_pos_loss: 2.3859923085149495 val_loss: 2.878924542957493
Adjusting learning rate of group 0 to 2.2124e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 94 train_ner_loss: 0.31065504037196673 train_pos_loss: 1.6842321964643094 train_loss: 1.9948898223815876 val_ner_loss: 0.4967863758977621 val_pos_loss: 2.3873619937072226 val_loss: 2.884147885896974
Adjusting learning rate of group 0 to 2.1930e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 95 train_ner_loss: 0.3075744954644242 train_pos_loss: 1.6910679710953136 train_loss: 1.9986421611803702 val_ner_loss: 0.4897275369517741 val_pos_loss: 2.3877741129322767 val_loss: 2.877502969087716
NER F1 Score: 93.85 POS Accuracy: 95.66869595564 saving model to BLSTM3_HPS.pt
Adjusting learning rate of group 0 to 2.1739e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 96 train_ner_loss: 0.31167185187260255 train_pos_loss: 1.6955341647035524 train_loss: 2.00720499864493 val_ner_loss: 0.49276711411709745 val_pos_loss: 2.361565933447406 val_loss: 2.85433230001576
Adjusting learning rate of group 0 to 2.1552e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 97 train_ner_loss: 0.32053408326905436 train_pos_loss: 1.6645595761757839 train_loss: 1.9850948809623081 val_ner_loss: 0.4912425544145128 val_pos_loss: 2.3890505743988655 val_loss: 2.8802914138486133
Adjusting learning rate of group 0 to 2.1368e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 98 train_ner_loss: 0.3048905976380087 train_pos_loss: 1.6625455381712808 train_loss: 1.9674362986782856 val_ner_loss: 0.4898734848506169 val_pos_loss: 2.3711955925229646 val_loss: 2.861068329824838
Adjusting learning rate of group 0 to 2.1186e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 99 train_ner_loss: 0.30698423786430534 train_pos_loss: 1.6758882549303702 train_loss: 1.9828720449049366 val_ner_loss: 0.500335429518642 val_pos_loss: 2.3870073917619776 val_loss: 2.8873422935991533
Adjusting learning rate of group 0 to 2.1008e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 100 train_ner_loss: 0.30840254179869914 train_pos_loss: 1.6747022429015177 train_loss: 1.9831038074862408 val_ner_loss: 0.4978932757198982 val_pos_loss: 2.368497029505133 val_loss: 2.866390700986131
Adjusting learning rate of group 0 to 2.0833e-03.


[tensor(33978.5898, device='cuda:0'),
 tensor(15831.8379, device='cuda:0'),
 tensor(11075.5479, device='cuda:0'),
 tensor(9522.1172, device='cuda:0'),
 tensor(8575.9746, device='cuda:0'),
 tensor(7960.5874, device='cuda:0'),
 tensor(7490.3218, device='cuda:0'),
 tensor(7114.2744, device='cuda:0'),
 tensor(6802.3066, device='cuda:0'),
 tensor(6522.0615, device='cuda:0'),
 tensor(6305.2832, device='cuda:0'),
 tensor(6150.2847, device='cuda:0'),
 tensor(5935.9648, device='cuda:0'),
 tensor(5753.7246, device='cuda:0'),
 tensor(5640.3320, device='cuda:0'),
 tensor(5485.9707, device='cuda:0'),
 tensor(5404.6382, device='cuda:0'),
 tensor(5298.9702, device='cuda:0'),
 tensor(5194.1030, device='cuda:0'),
 tensor(5076.1860, device='cuda:0'),
 tensor(4988.6929, device='cuda:0'),
 tensor(4946.3071, device='cuda:0'),
 tensor(4837.1909, device='cuda:0'),
 tensor(4743.5293, device='cuda:0'),
 tensor(4651.0283, device='cuda:0'),
 tensor(4642.3140, device='cuda:0'),
 tensor(4554.6792, device='cuda:0')

In [None]:
# Evaluate the saved 'BLSTM3_HPS.pt' checkpoint on the dev and test splits and
# report POS accuracy and NER F1 for each split.
word_embedding_dim = 100
model_file_name = 'BLSTM3_HPS.pt'
batch_size = 10
use_cnn_for_char_level = True

# Clear the word/character lookup tables before the vocabulary helpers run
# (presumably make_word_to_idx repopulates them -- confirm against its definition).
word_to_idx = {}
idx_to_word = {}
char_to_idx = {}
idx_to_char = {}

# Order every split by the length of each example's first field, shortest first.
training_data = sorted(make_data(train), key=lambda x: len(x[0]))
validation_data = sorted(make_data(dev), key=lambda x: len(x[0]))
testing_data = sorted(make_data(test), key=lambda x: len(x[0]))

make_word_to_idx(training_data)
make_word_to_idx(validation_data, is_valid_or_test=True)
make_word_to_idx(testing_data, is_valid_or_test=True)
make_tag_to_idx(training_data)

build_embedding_table()

training_data_tensors = make_numeric_data(training_data)
validation_data_tensors = make_numeric_data(validation_data)
testing_data_tensors = make_numeric_data(testing_data)

train_batches = data_batching(training_data_tensors)
dev_batches = data_batching(validation_data_tensors)
test_batches = data_batching(testing_data_tensors)

random.seed(random_seed)
model = BLSTM()
model.to(device)
# map_location remaps the checkpoint's tensors onto `device`, so a checkpoint
# written during a CUDA run can still be loaded on a CPU-only runtime.
model.load_state_dict(torch.load(model_file_name, map_location=device))
model.eval()

best_dev_f1, dev_f1, best_dev_acc, dev_acc, save = evaluate_model(
    model, dev_batches, validation_data, 0, 0, "dev", "_hps"
)
best_test_f1, test_f1, best_test_acc, test_acc, save = evaluate_model(
    model, test_batches, testing_data, 0, 0, "test", "_hps"
)

print("POS Accuracy on Development Data:",dev_acc)
print("POS Accuracy on Testing Data:",test_acc)
print("NER F1-Score on Development Data:",dev_f1)
print("NER F1-Score on Testing Data:",test_f1)

POS Accuracy on Development Data: 95.63185854434062
POS Accuracy on Testing Data: 95.34136201945742
NER F1-Score on Development Data: 93.77
NER F1-Score on Testing Data: 90.41


In [None]:
# Display the current dev-set NER F1 score and POS accuracy as this cell's output.
dev_f1,dev_acc

(93.74, 95.65124665555082)

In [None]:
# Save the current model's parameters (its state_dict) to the relative path "model.pt".
torch.save(model.state_dict(),"model.pt")

In [None]:
# Run infer on the dev batches with the current model.
# NOTE(review): 'dev1.out' is presumably the file the predictions are written to --
# confirm against infer's definition.
infer(model,dev_batches,validation_data,'dev1.out')

- Download the model with the best F1 score.

In [None]:
# Trigger a browser download of the 'BLSTM3.pt' checkpoint from the Colab runtime.
# NOTE(review): the evaluation cell above loads 'BLSTM3_HPS.pt', while this cell
# downloads 'BLSTM3.pt' -- confirm this is the intended file.
from google.colab import files
files.download('BLSTM3.pt') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

- Load the best model and evaluate it on the dev dataset to verify that it was saved and restored correctly.

In [None]:
# Configure the globals for the GloVe-based 'BLSTM3.pt' checkpoint and rebuild
# the dev batches used to verify it.
word_embedding_dim = 101 #including 1 dimension for capitalization
pre_embeddings = get_embedding_data('glove.6B.100d',100)
model_file_name = 'BLSTM3.pt'
batch_size = 16
use_cnn_for_char_level = True

# Clear the word/character lookup tables before the vocabulary helpers run
# (presumably make_word_to_idx repopulates them -- confirm against its definition).
word_to_idx = {}
idx_to_word = {}
char_to_idx = {}
idx_to_char = {}


# Order every split by the length of each example's first field, shortest first.
# NOTE(review): make_data is called here with num="3" / has_tags=False, whereas the
# 'BLSTM3_HPS.pt' evaluation cell calls it with no extra arguments -- confirm the
# current make_data signature still accepts these keywords.
training_data = sorted(make_data(train,num="3"),key=lambda x:len(x[0]))
validation_data = sorted(make_data(dev,num="3"),key=lambda x:len(x[0]))
testing_data = sorted(make_data(test,has_tags=False,num="3"),key=lambda x:len(x[0]))

make_word_to_idx(training_data)
make_word_to_idx(validation_data,is_valid_or_test=True)
make_word_to_idx(testing_data,is_valid_or_test=True)

make_tag_to_idx(training_data)

build_embedding_table()

# Rebuild the dev split WITHOUT sorting, so dev_batches follow make_data's own
# order rather than the length-sorted order used for the vocabulary pass above.
validation_data = make_data(dev,num="3")
validation_data_tensors = make_numeric_data(validation_data)
dev_batches = data_batching(validation_data_tensors)

In [None]:
# Build a fresh BLSTM and restore the weights stored in `model_file_name`.
model = BLSTM()
# Move the model to `device`, matching the 'BLSTM3_HPS.pt' evaluation cell.
model.to(device)
# map_location remaps the checkpoint's tensors onto `device`, so a checkpoint
# written during a CUDA run can still be loaded on a CPU-only runtime.
# load_state_dict stays the last expression so its key-matching report is displayed.
model.load_state_dict(torch.load(model_file_name, map_location=device))

<All keys matched successfully>

In [None]:
# Put the model in evaluation mode before scoring the reloaded checkpoint.
model.eval()
# Call evaluate_model with the same 7-argument / 5-value convention used by the
# 'BLSTM3_HPS.pt' evaluation cell; the previous 6-argument / 3-value form cannot
# unpack what that convention returns.
# NOTE(review): the 4th/5th arguments are presumably the best F1 / best accuracy
# seen so far (-1 so any score counts as an improvement) -- confirm against
# evaluate_model's definition.
best_dev_f1, dev_f1, best_dev_acc, dev_acc, save = evaluate_model(
    model, dev_batches, validation_data, -1, -1, "dev", "3"
)

processed 51578 tokens with 5942 phrases; found: 6024 phrases; correct: 5413.
accuracy:  98.35%; precision:  89.86%; recall:  91.10%; FB1:  90.47
Best F1 Score: 90.47
              LOC: precision:  94.39%; recall:  93.36%; FB1:  93.87  1817
             MISC: precision:  84.99%; recall:  81.67%; FB1:  83.30  886
              ORG: precision:  81.67%; recall:  88.37%; FB1:  84.89  1451
              PER: precision:  94.12%; recall:  95.55%; FB1:  94.83  1870


####  What are the precision, recall and F1 score on the dev data?
- Precision = 89.86%
- Recall = 91.10%
- F1-Score = 90.47

# References

- https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
- Xuezhe Ma & Eduard Hovy (2016). End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF. In Proceedings of ACL 2016.

In [None]:
!apt-get install texlive-xetex texlive-fonts-recommended texlive-latex-recommended
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Convert the CSCI_544_HW4.ipynb notebook stored on the mounted Drive to PDF,
# with nbconvert's log level set to CRITICAL.
!jupyter nbconvert --Application.log_level=CRITICAL --to pdf "/content/drive/MyDrive/Colab Notebooks/CSCI_544_HW4.ipynb"