**Nidhi Rajkumar Saini** <br>
**USCID: 3730422872**

In [1]:
#path that contains folder you want to copy
from google.colab import drive
drive.mount('/content/drive')
%cd /content/
%cd /content/drive/MyDrive/USC/NLP_CSCI_544/project/

Mounted at /content/drive
/content
/content/drive/MyDrive/USC/NLP_CSCI_544/project


# Import required libraries

In [2]:
import pandas as pd
import numpy as np
import os
import random
import tqdm 

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence

# Define flags and constants

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# special token for unknown words
UNK = "<UNK>"    
# special token for padding                 
PAD = "<PAD>"

START_TAG = "<START>"
STOP_TAG = "<STOP>"

random_seed = 42

model_file_name = 'BLSTM1.pt'

# checks whether to use CNN for character-level representation
use_cnn_for_char_level = False

# vector dimension for word embeddings
word_embedding_dim = 100

# vector dimension for character embeddings
char_embedding_dim = 30

# None if not using glove embedding else contains glove embedding dictionary
pre_embeddings = None

# setting the hyperparameters as per the task description
lstm_hidden_dim = 200
dropout = 0.5

# number of filters in the CNN layer
out_channels = 30


# setting all the other hyperparameters
num_epochs = 100
batch_size = 10
learning_rate = 0.015
momentum = 0.9
decay_rate = 0.05
grad_clip = 5.0

# creating dictionary for storing tag and corresponding index
tag_to_idx = {}
idx_to_tag = {}

# creating dictionary for storing word and corresponding index in training data
word_to_idx = {}
idx_to_word = {}

# creating dictionary for storing character and corresponding index in training data
char_to_idx = {}
idx_to_char = {}

# Load data files

In [4]:
train = pd.read_csv('english/eng.train.bioes.conll', header = None, names = ['idx','word','pos','synt','tag'], sep ='\s',na_values=['<NAN>'], keep_default_na=False)
train.head(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,idx,word,pos,synt,tag
0,1,EU,NNP,I-NP,S-ORG
1,2,rejects,VBZ,I-VP,O
2,3,German,JJ,I-NP,S-MISC
3,4,call,NN,I-NP,O
4,5,to,TO,I-VP,O


In [5]:
dev = pd.read_csv('english/eng.dev.bioes.conll', header = None, names = ['idx','word','pos','synt','tag'], sep = '\s',na_values=['<NAN>'], keep_default_na=False)
dev.head(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,idx,word,pos,synt,tag
0,1,CRICKET,NNP,I-NP,O
1,2,-,:,O,O
2,3,LEICESTERSHIRE,NNP,I-NP,S-ORG
3,4,TAKE,NNP,I-NP,O
4,5,OVER,IN,I-PP,O


In [6]:
test = pd.read_csv('english/eng.test.bioes.conll', header = None, names = ['idx','word','pos','synt','tag'], sep = '\s',na_values=['<NAN>'], keep_default_na=False)
test.head(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,idx,word,pos,synt,tag
0,1,SOCCER,NN,I-NP,O
1,2,-,:,O,O
2,3,JAPAN,NNP,I-NP,S-LOC
3,4,GET,VB,I-VP,O
4,5,LUCKY,NNP,I-NP,O


# Define functions

- The *make_data* function creates a list of 3-tuples where first element is list of preprocessed words, second element is list of actual words and third element is list of corresponding tags.

In [7]:
def make_data(df,has_tags=True):
    sentences = []
    sentence_tags = []
    tags = []
    sentence = None
    for row in df.values.tolist():
        if row[0] == 1:
            if sentence:
                sentence_tags.append(tags)
                sentences.append(sentence)

            sentence = []
            tags = []
        sentence.append(row[1])
        if has_tags:
          tags.append(row[-1])
        else:
          tags.append("O")
    sentence_tags.append(tags)
    sentences.append(sentence)
    
    return list(zip(sentences,sentence_tags))

- The *make_tag_to_idx* function creates the tag to index dictionary.
- The *make_word_to_idx* function creates the word to index dictionary.
- Both functions use the output of the *make_data* function.

In [8]:
def make_tag_to_idx(data):
    if PAD not in tag_to_idx:
      idx_to_tag[len(tag_to_idx)] = PAD
      tag_to_idx[PAD] = len(tag_to_idx)
    
    if START_TAG not in tag_to_idx:
      idx_to_tag[len(tag_to_idx)] = START_TAG
      tag_to_idx[START_TAG] = len(tag_to_idx)
    
    if STOP_TAG not in tag_to_idx:
      idx_to_tag[len(tag_to_idx)] = STOP_TAG
      tag_to_idx[STOP_TAG] = len(tag_to_idx)

    for _,tags in data:
      for tag in tags:
        if tag not in tag_to_idx:
          idx_to_tag[len(tag_to_idx)] = tag
          tag_to_idx[tag] = len(tag_to_idx)


def make_word_to_idx(data,is_valid_or_test=False):
  if is_valid_or_test:
    for sentence,_ in data:
      for word in sentence:
        if word not in word_to_idx:
          idx_to_word[len(word_to_idx)] = word
          word_to_idx[word] = len(word_to_idx)
    return word_to_idx, idx_to_word

  else:
    idx_to_word[0] = PAD
    word_to_idx[PAD] = 0
    idx_to_word[1] = UNK
    word_to_idx[UNK] = 1

    idx_to_char[0] = PAD
    char_to_idx[PAD] = 0
    idx_to_char[1] = UNK
    char_to_idx[UNK] = 1

    for sentence,_ in data:
      for word in sentence:
        if word not in word_to_idx:
          idx_to_word[len(word_to_idx)] = word
          word_to_idx[word] = len(word_to_idx)

    for sentence,_ in data:
      for word in sentence:
        for char in sentence:
          if char not in char_to_idx:
            idx_to_char[len(char_to_idx)] = char
            char_to_idx[char] = len(char_to_idx)

- The *get_embedding_data* function creates the embedding dictionary from the glove.6B.100d text file.
- The *build_embedding_table* function creates a numpy matrix for all word embeddings. If glove embedding dictionary is provided then, that is used for creating a table otherwise the table is created with random entries.

In [9]:
def get_embedding_data(filename,dim):
  embedding = dict()
  with open(filename,'r',encoding='utf-8') as f:
    for line in f.readlines():
      line = line.strip()
      if len(line) == 0:
        continue
      line_split = line.split() #word followed by dim numbers
      embedd = np.empty([1,dim])
      embedd[:] = line_split[1:]
      word = line_split[0]
      embedding[word] = embedd
  return embedding

def build_embedding_table():
    global embeddings
    scale = np.sqrt(3.0 / word_embedding_dim)
    if pre_embeddings is not None:
        embeddings = np.empty([len(word_to_idx), word_embedding_dim])
        for word in word_to_idx:
            if word.lower() in pre_embeddings:
                embeddings[word_to_idx[word], :] = pre_embeddings[word.lower()]
            else:
                embeddings[word_to_idx[word], :] = np.random.uniform(-scale, scale, [1, word_embedding_dim])
    else:
        embeddings = np.empty([len(word_to_idx), word_embedding_dim])
        for word in word_to_idx:
            embeddings[word_to_idx[word], :] = np.random.uniform(-scale, scale, [1, word_embedding_dim])


- The *make_numeric_data* function uses the word_to_idx, tag_to_idx and char_to_idx dictionaries in order to change the dataset into a list of indices.
- The *data_batching* function creates batches from entire dataset. All sentences in a batch are padded to the same length using the *padded_batching* function.



In [10]:
def make_numeric_data(data):
  list_sent_ids = []
  list_tag_ids = []
  list_char_ids = []
  for sentence,tags in data:
    sentence_ids = []
    tag_ids = []
    char_ids = []
    for word in sentence:
      if word in word_to_idx:
        sentence_ids.append(word_to_idx[word])
      else:
        sentence_ids.append(word_to_idx[UNK])
      
      char_id = []
      for c in word:
        if c in char_to_idx:
          char_id.append(char_to_idx[c])
        else:
          char_id.append(char_to_idx[UNK])
      char_ids.append(char_id)
  
    for tag in tags:
      if tag in tag_to_idx:
        tag_ids.append(tag_to_idx[tag])
      else:
        tag_ids.append("O")
    list_sent_ids.append(sentence_ids)
    list_tag_ids.append(tag_ids)
    list_char_ids.append(char_ids)
  
  return list(zip(list_sent_ids,list_char_ids,list_tag_ids))

def data_batching(data):
  num_instances = len(data)
  num_batches = num_instances // batch_size + 1 if num_instances != 0 else num_instances//batch_size
  batched_data = []
  for i in range(num_batches):
    batch = data[i*batch_size:(i+1)*batch_size]
    batched_data.append(padded_batching(batch))
  return batched_data

def padded_batching(data):
  batch_size = len(data)
  batch_data = data
  
  word_seq_len = torch.LongTensor(list(map(lambda x: len(x[0]), batch_data)))
  max_word_seq_len = word_seq_len.max()
  # print(batch_data)
  char_seq_len = torch.LongTensor(
      [list(map(len,x[1])) + [1]* (int(max_word_seq_len) - len(x[1])) for x in batch_data]
  )

  max_char_seq_len = char_seq_len.max()

  word_seq_tensor = torch.zeros((batch_size,max_word_seq_len),dtype=torch.long)
  char_seq_tensor = torch.zeros((batch_size,max_word_seq_len,max_char_seq_len), dtype=torch.long)
  tag_seq_tensor = torch.zeros((batch_size,max_word_seq_len),dtype=torch.long)

  for idx in range(batch_size):
    word_seq_tensor[idx,:word_seq_len[idx]] = torch.LongTensor(batch_data[idx][0])
    tag_seq_tensor[idx,:word_seq_len[idx]] = torch.LongTensor(batch_data[idx][2])
    
    for word_idx in range(word_seq_len[idx]):
      char_seq_tensor[idx,word_idx,:char_seq_len[idx,word_idx]] = torch.LongTensor(
          batch_data[idx][1][word_idx]
      )

    for word_idx in range(word_seq_len[idx],max_word_seq_len):
      char_seq_tensor[idx,word_idx,0:1] = torch.LongTensor([char_to_idx[PAD]])
    
    word_seq_tensor = word_seq_tensor.to(device)
    word_seq_len = word_seq_len.to(device)
    char_seq_tensor = char_seq_tensor.to(device)
    tag_seq_tensor = tag_seq_tensor.to(device)
  return word_seq_tensor,word_seq_len,char_seq_tensor,tag_seq_tensor,data

In [11]:
# CRF helper functions
def argmax(vec):
  _, idx = torch.max(vec, 1)
  return idx.item()

def log_sum_exp(vec):
  max_score,_ = torch.max(vec,-1)
  max_score_broadcast = max_score.unsqueeze(-1).expand_as(vec)
  return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast),-1))

- The *initialize_linear_layer* function initializes the weights and bias of a linear layer using xavier initialization.
- The *initialize_lstm_layer* function initializes the weights of the LSTM layer using an orthogonal matrix. It initializes the bias by sampling a normal distribution.
- The *BLSTM* class defines feed-forward architecture of the model used in tasks 1 and 2. It also takes care of character-level CNN for the bonus task.

In [12]:
def initialize_linear_layer(layer):
    nn.init.xavier_normal_(layer.weight.data)
    nn.init.normal_(layer.bias.data)

def initialize_lstm_layer(layer):
    for param in layer.parameters():
        if len(param.shape) >= 2:
            nn.init.orthogonal_(param.data)
        else:
            nn.init.normal_(param.data)

class BLSTM(nn.Module):
  def __init__(self):
    super(BLSTM,self).__init__()

    self.vocab_size = len(word_to_idx)
    self.tag_size = len(tag_to_idx)
    self.tag_to_idx = tag_to_idx

    if use_cnn_for_char_level:
      self.char_embedd = nn.Embedding(len(char_to_idx),char_embedding_dim)
      nn.init.xavier_uniform_(self.char_embedd.weight)
      self.char_cnn = nn.Conv2d(
          in_channels = 1,
          out_channels = out_channels,
          kernel_size = (3,char_embedding_dim),
          padding = (2,0)
      )

    self.word_embedd = nn.Embedding(self.vocab_size,word_embedding_dim)
    self.word_embedd.weight = nn.Parameter(torch.FloatTensor(embeddings))

    self.dropout = nn.Dropout(dropout)
    
    if use_cnn_for_char_level:
      self.lstm = nn.LSTM(word_embedding_dim+out_channels,lstm_hidden_dim,num_layers=1,
                          batch_first = True, bidirectional=True)

    else:
      self.lstm = nn.LSTM(word_embedding_dim,lstm_hidden_dim,num_layers=1,
                          batch_first = True, bidirectional=True)
    
    initialize_lstm_layer(self.lstm)
    self.dropout = nn.Dropout(dropout)
    self.out_layer = nn.Linear(2 * lstm_hidden_dim,self.tag_size)
    initialize_linear_layer(self.out_layer)

    self.transitions = nn.Parameter(torch.randn(self.tag_size, self.tag_size))
    self.transitions.data[self.tag_to_idx[START_TAG], :] = -10000.
    self.transitions.data[:, self.tag_to_idx[STOP_TAG]] = -10000.
    self.transitions.data[:,self.tag_to_idx[PAD]] = -10000.
    self.transitions.data[self.tag_to_idx[PAD],:] = -10000.
    self.transitions.data[self.tag_to_idx[PAD], tag_to_idx[PAD]] = 0.0

  def init_hidden(self,batch_shape):
    return (
        torch.randn(2,batch_shape,lstm_hidden_dim).to(device),
        torch.randn(2,batch_shape,lstm_hidden_dim).to(device)
            )
  
  def get_mask_from_word_sequences(self, word_seq_len):
    batch_num = len(word_seq_len)
    max_seq_len = max([x for x in word_seq_len])
    mask_tensor = torch.zeros(batch_num, max_seq_len, dtype=torch.float).to(device)
    for k, x in enumerate(word_seq_len):
        mask_tensor[k, :x] = 1
    return mask_tensor # batch_size x max_seq_len

  def _get_lstm_features(self,word_seq_tensor,word_seq_len,char_seq_tensor):
    self.hidden = self.init_hidden(word_seq_tensor.shape[0])
    word_embedds = self.word_embedd(word_seq_tensor)
    

    if use_cnn_for_char_level:
      batch_size = char_seq_tensor.size(0)
      sent_len = char_seq_tensor.size(1)
      char_seq_tensor = char_seq_tensor.view(batch_size*sent_len,-1)
      char_embedds = self.char_embedd(char_seq_tensor).unsqueeze(1)
      char_embedds = self.dropout(char_embedds)
      cnn_out = self.char_cnn(char_embedds)
      char_embedds = nn.functional.max_pool2d(cnn_out,kernel_size=(cnn_out.size(2),1)).view(
          cnn_out.size(0),
          out_channels
      )

      char_features = char_embedds.view(batch_size,sent_len,-1)
      word_embedds = torch.cat([word_embedds,char_features],axis=2)
    
    word_embs = self.dropout(word_embedds)
    self.mask = self.get_mask_from_word_sequences(word_seq_len)
    sorted_seq_len,idx = word_seq_len.sort(0,descending=True)
    _,sorted_idx = idx.sort(0,descending=False)
    sorted_seq_tensor = word_embs[idx]
    packed_words = pack_padded_sequence(sorted_seq_tensor,sorted_seq_len.cpu(),True)
    output, self.hidden = self.lstm(packed_words,self.hidden)
    output, _ = pad_packed_sequence(output,batch_first=True)
    output = output[sorted_idx]
    output = self.dropout(output)
    output = self.out_layer(output)
    return output

  def _forward_alg(self, feats):
    batch_num, max_seq_len = self.mask.shape
    score = torch.full((batch_num, self.tag_size), -10000.).to(device)
    score[:, self.tag_to_idx[START_TAG]] = 0.
    for n in range(max_seq_len):
      curr_mask = self.mask[:, n].unsqueeze(-1).expand_as(score)
      curr_score = score.unsqueeze(1).expand(-1, *self.transitions.size())
      curr_emission = feats[:, n].unsqueeze(-1).expand_as(curr_score)
      curr_transition = self.transitions.unsqueeze(0).expand_as(curr_score)
      curr_score = log_sum_exp(curr_score + curr_emission + curr_transition)
      score = curr_score * curr_mask + score*(1-curr_mask)
    score = log_sum_exp(score)
    return score

  def _score_sentence(self, feats, tags):
    batch_num, max_seq_len = self.mask.shape
    score = torch.zeros(batch_num, dtype=torch.float).to(device)
    start_tag_tensor = torch.zeros(batch_num, 1, dtype=torch.long).fill_(self.tag_to_idx[START_TAG]).to(device)
    tags = torch.cat([start_tag_tensor, tags], 1)
    for n in range(max_seq_len):
      curr_mask = self.mask[:,n]
      curr_emission = torch.zeros(batch_num, dtype=torch.float).to(device)
      curr_transition = torch.zeros(batch_num, dtype=torch.float).to(device)
      for k in range(batch_num):
        curr_emission[k] = feats[k, n, tags[k, n+1]].unsqueeze(0)
        curr_tags = tags[k]
        curr_transition[k] = self.transitions[curr_tags[n+1], curr_tags[n]].unsqueeze(0)
      score = score + curr_emission*curr_mask + curr_transition*curr_mask
    return score
  
  def _viterbi_decode(self, feats):
    batch_size, max_seq_len = self.mask.shape
    seq_len_list = [int(self.mask[k].sum().item()) for k in range(batch_size)]
    backpointers = torch.LongTensor(batch_size, max_seq_len, self.tag_size).to(device)
    score = torch.full((batch_size, self.tag_size), -10000.).to(device)
    score[:,self.tag_to_idx[START_TAG]] = 0

    for n in range(max_seq_len):
      curr_emission = feats[:, n]
      curr_score = torch.Tensor(batch_size, self.tag_size).to(device)
      curr_backpointers = torch.LongTensor(batch_size, self.tag_size).to(device)
      for curr_tag in range(self.tag_size):
        T = self.transitions[curr_tag, :].unsqueeze(0).expand(batch_size, self.tag_size)
        max_values, max_indices = torch.max(score + T, 1)
        curr_score[:, curr_tag] = max_values
        curr_backpointers[:, curr_tag] = max_indices
      curr_mask = self.mask[:, n].unsqueeze(1).expand(batch_size, self.tag_size)
      score = score * (1-curr_mask) + (curr_score + curr_emission) * curr_mask
      backpointers[:,n,:] = curr_backpointers
    best_score_batch, last_best_tag_batch = torch.max(score, 1)
    best_path_batch = [[tag] for tag in last_best_tag_batch.tolist()]
    for k in range(batch_size):
      curr_best_tag = last_best_tag_batch[k]
      curr_seq_len = seq_len_list[k]
      for n in reversed(range(1, curr_seq_len)):
        curr_best_tag = backpointers[k, n, curr_best_tag].item()
        best_path_batch[k].insert(0, curr_best_tag)
    return best_path_batch

  def neg_log_likelihood(self, word_seq_tensor, word_seq_len, char_seq_tensor, tags):
    lstm_feats = self._get_lstm_features(word_seq_tensor, word_seq_len, char_seq_tensor)
    forward_score = self._forward_alg(lstm_feats)
    gold_score = self._score_sentence(lstm_feats, tags)
    loss = torch.mean(forward_score - gold_score)
    return loss
  
  def forward(self, word_seq_tensor, word_seq_len, char_seq_tensor):
    lstm_feats = self._get_lstm_features(word_seq_tensor, word_seq_len, char_seq_tensor)
    tag_seqs = self._viterbi_decode(lstm_feats)
    return tag_seqs

- The *evaluate_model* function creates the .out prediction file with required output. It also runs the perl script to evaluate F-1 score.
- It returns the best F-1 score until current epoch, the current F-1 score and a flag called save, which tells the training function when to save the model.

In [13]:
def evaluate_model(model,data,act_data,best_f1_score,name="dev",model_num="1"):
  save = False
  f1_score = 0.0

  pred_file_name = name + model_num + '.out'
  score_file_name = name+model_num+'_score.out'
  
  pred_file = open(pred_file_name,'w')

  for idx in range(len(data)):
    batch = data[idx]
    rows = batch[4]
    pred_tags = model(*batch[:3])
    orig_data = act_data[idx*batch_size:(idx+1)*batch_size]
    for i,(row,preds) in enumerate(list(zip(rows,pred_tags))):
      for idx,(word,gold,pred) in enumerate(zip(orig_data[i][0],row[2],preds),start=1):
        #print(str(idx),word,gold,pred.item())
        pred_file.write(' '.join([str(idx),word,idx_to_tag[gold],idx_to_tag[pred]]))
        pred_file.write('\n')
      pred_file.write('\n')
    
  pred_file.close()
  os.system('perl conlleval.v2 < %s > %s' % (pred_file_name,score_file_name))
  eval_lines = [l.rstrip() for l in open(score_file_name,'r',encoding='utf-8')]

  for i, line in enumerate(eval_lines):
    #print(line)
    if i==1:
      f1_score = float(line.strip().split()[-1])
      if f1_score>best_f1_score:
        best_f1_score = f1_score
        save = True
  
  return best_f1_score,f1_score,save

- The *train_model* function contains the main loop for training the model.
- First, the model back propagates and learns on training batches and after each epoch, the F-1 score on development dataset is calculated. The model is saved whenever it beats its current best F-1 score.

In [14]:
def train_model(model,train_batches,dev_batches,model_num="1",curr_epoch = 0):
  optimizer = torch.optim.SGD(model.parameters(),lr=learning_rate,momentum=momentum)
  if curr_epoch>0:
    for g in optimizer.param_groups:
      g['initial_lr'] = learning_rate
  lambda_schedule = lambda x: 1/(1+x*0.05)
  scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,lambda_schedule,verbose=True,last_epoch=curr_epoch-1)

  losses = []
  best_dev_f1 = -1.

  # loss_function = loss_func(ignore_index=tag_to_idx[PAD])

  for epoch in range(1,num_epochs+1):
    total_loss = 0
    model.train()
    model.zero_grad()
    for idx in tqdm.notebook.tqdm(np.random.permutation(len(train_batches)),total=len(train_batches)):
      batch = train_batches[idx]
      loss = model.neg_log_likelihood(*batch[:4])
      total_loss += loss.data
      loss.backward()
      nn.utils.clip_grad_norm_(model.parameters(),grad_clip)
      optimizer.step()
      model.zero_grad()
    losses.append(total_loss)
    
    model.eval()
    total_dev_loss = 0
    for idx in tqdm.notebook.tqdm(np.random.permutation(len(dev_batches)),total=len(dev_batches)):
      batch = dev_batches[idx]
      loss = model.neg_log_likelihood(*batch[:4])
      total_dev_loss += loss.data
      model.zero_grad()

    print("Epoch:",epoch,"train_loss:",total_loss/len(train_batches),"val_loss:",total_dev_loss/len(dev_batches))
    
    if epoch%5==0:
      best_dev_f1 , dev_f1, save = evaluate_model(model,dev_batches,validation_data,best_dev_f1,"dev",model_num)
      if save:
        print('F1 Score:',dev_f1,'-->',best_dev_f1,"saving model to",model_file_name)
        torch.save(model.state_dict(),model_file_name)
        
    
    model.zero_grad()
    scheduler.step()
  return losses

- The *infer* function creates the .out prediction file with required output without the column for gold tags.

In [15]:
def infer(model,data,act_data,filename):
  
  pred_file = open(filename,'w')

  for idx in range(len(data)):
    batch = data[idx]
    rows = batch[4]
    scores,pred_tags = model(*batch[:3])
    # actual_pred_tags = []
    # for i,len in enumerate(batch[2]):
    #   actual_pred_tags.append(pred_tags[i][:len])
    orig_data = act_data[idx*batch_size:(idx+1)*batch_size]
    for i,(row,preds) in enumerate(list(zip(rows,pred_tags))):
      for idx,(word,pred) in enumerate(zip(orig_data[i][0],preds),start=1):
        pred_file.write(' '.join([str(idx),word,idx_to_tag[pred]]))
        pred_file.write('\n')
      pred_file.write('\n')
    
  pred_file.close()

In [None]:
!gzip -d glove.6B.100d.gz

# Bonus Task: Using BLSTM-CNN with GloVe word embeddings

- We use all the above functions to preprocess and prepare our data for training the model.
- For this task, we load the glove embeddings for words in the train, dev and test sets into our embedding layer.
- Additionaly, we also make use_cnn_for_char_level = True, this in turn gives the model a CNN layer which takes character level embedding as input and creates a character-level representation for a word. These representations are then concatenated with the word embeddings to create the inputs for the LSTM Layer.
- After forming the initial training and validation data, we sort the sentences (and corresponding tags) according to length.
- The model is trained with the following parameters:

 - use_cnn_for_char_level = True 
 - word_embedding_dim = 100 (adding 1 more feature for capitalization makes the dimension  101)
 - pre_embeddings = GloVe Embeddings Dictionary
 - lstm_hidden_dim = 256
 - dropout = 0.33
 - output_dimension = 128
 - num_epochs = 100
 - batch_size = 16
 - learning_rate = 0.015
 - momentum = 0.9
 - decay_rate = 0.05
 - grad_clip = 5.0
 - loss function = Cross Entropy Loss
 - optimizer = SGD

In [16]:
pre_embeddings = get_embedding_data('glove.6B.100d',100)

In [None]:
word_embedding_dim = 100
model_file_name = 'BLSTM3.pt'
batch_size = 10
use_cnn_for_char_level = True

word_to_idx = {}
idx_to_word = {}
char_to_idx = {}
idx_to_char = {}

training_data = sorted(make_data(train),key=lambda x:len(x[0]))
validation_data = sorted(make_data(dev),key=lambda x:len(x[0]))
testing_data = sorted(make_data(test,has_tags=False),key=lambda x:len(x[0]))

make_word_to_idx(training_data)
make_word_to_idx(validation_data,is_valid_or_test=True)
make_word_to_idx(testing_data,is_valid_or_test=True)
make_tag_to_idx(training_data)

build_embedding_table()

training_data_tensors = make_numeric_data(training_data)
validation_data_tensors = make_numeric_data(validation_data)

train_batches = data_batching(training_data_tensors)
dev_batches = data_batching(validation_data_tensors)

random.seed(random_seed)
model = BLSTM()
model.to(device)
train_model(model,train_batches,dev_batches,model_num="3")

Adjusting learning rate of group 0 to 1.5000e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 1 train_loss: tensor(4.6447, device='cuda:0') val_loss: tensor(1.3942, device='cuda:0')
Adjusting learning rate of group 0 to 1.4286e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 2 train_loss: tensor(1.5713, device='cuda:0') val_loss: tensor(0.9437, device='cuda:0')
Adjusting learning rate of group 0 to 1.3636e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 3 train_loss: tensor(1.1990, device='cuda:0') val_loss: tensor(0.8233, device='cuda:0')
Adjusting learning rate of group 0 to 1.3043e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 4 train_loss: tensor(1.0127, device='cuda:0') val_loss: tensor(0.7077, device='cuda:0')
Adjusting learning rate of group 0 to 1.2500e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 5 train_loss: tensor(0.9037, device='cuda:0') val_loss: tensor(0.6870, device='cuda:0')
F1 Score: 91.93 --> 91.93 saving model to BLSTM3.pt
Adjusting learning rate of group 0 to 1.2000e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 6 train_loss: tensor(0.8161, device='cuda:0') val_loss: tensor(0.6206, device='cuda:0')
Adjusting learning rate of group 0 to 1.1538e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 7 train_loss: tensor(0.7568, device='cuda:0') val_loss: tensor(0.5853, device='cuda:0')
Adjusting learning rate of group 0 to 1.1111e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 8 train_loss: tensor(0.6933, device='cuda:0') val_loss: tensor(0.5981, device='cuda:0')
Adjusting learning rate of group 0 to 1.0714e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 9 train_loss: tensor(0.6500, device='cuda:0') val_loss: tensor(0.5726, device='cuda:0')
Adjusting learning rate of group 0 to 1.0345e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 10 train_loss: tensor(0.6161, device='cuda:0') val_loss: tensor(0.5660, device='cuda:0')
F1 Score: 93.13 --> 93.13 saving model to BLSTM3.pt
Adjusting learning rate of group 0 to 1.0000e-02.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 11 train_loss: tensor(0.5854, device='cuda:0') val_loss: tensor(0.5827, device='cuda:0')
Adjusting learning rate of group 0 to 9.6774e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 12 train_loss: tensor(0.5446, device='cuda:0') val_loss: tensor(0.5564, device='cuda:0')
Adjusting learning rate of group 0 to 9.3750e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 13 train_loss: tensor(0.5288, device='cuda:0') val_loss: tensor(0.5797, device='cuda:0')
Adjusting learning rate of group 0 to 9.0909e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 14 train_loss: tensor(0.5024, device='cuda:0') val_loss: tensor(0.5203, device='cuda:0')
Adjusting learning rate of group 0 to 8.8235e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 15 train_loss: tensor(0.4906, device='cuda:0') val_loss: tensor(0.5120, device='cuda:0')
F1 Score: 93.41 --> 93.41 saving model to BLSTM3.pt
Adjusting learning rate of group 0 to 8.5714e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 16 train_loss: tensor(0.4721, device='cuda:0') val_loss: tensor(0.5218, device='cuda:0')
Adjusting learning rate of group 0 to 8.3333e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 17 train_loss: tensor(0.4540, device='cuda:0') val_loss: tensor(0.4983, device='cuda:0')
Adjusting learning rate of group 0 to 8.1081e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 18 train_loss: tensor(0.4292, device='cuda:0') val_loss: tensor(0.4880, device='cuda:0')
Adjusting learning rate of group 0 to 7.8947e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 19 train_loss: tensor(0.4241, device='cuda:0') val_loss: tensor(0.4966, device='cuda:0')
Adjusting learning rate of group 0 to 7.6923e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 20 train_loss: tensor(0.3970, device='cuda:0') val_loss: tensor(0.5091, device='cuda:0')
F1 Score: 93.75 --> 93.75 saving model to BLSTM3.pt
Adjusting learning rate of group 0 to 7.5000e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 21 train_loss: tensor(0.3948, device='cuda:0') val_loss: tensor(0.5110, device='cuda:0')
Adjusting learning rate of group 0 to 7.3171e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 22 train_loss: tensor(0.3833, device='cuda:0') val_loss: tensor(0.4873, device='cuda:0')
Adjusting learning rate of group 0 to 7.1429e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 23 train_loss: tensor(0.3638, device='cuda:0') val_loss: tensor(0.4702, device='cuda:0')
Adjusting learning rate of group 0 to 6.9767e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 24 train_loss: tensor(0.3565, device='cuda:0') val_loss: tensor(0.5138, device='cuda:0')
Adjusting learning rate of group 0 to 6.8182e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 25 train_loss: tensor(0.3500, device='cuda:0') val_loss: tensor(0.4949, device='cuda:0')
F1 Score: 93.92 --> 93.92 saving model to BLSTM3.pt
Adjusting learning rate of group 0 to 6.6667e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 26 train_loss: tensor(0.3464, device='cuda:0') val_loss: tensor(0.5021, device='cuda:0')
Adjusting learning rate of group 0 to 6.5217e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 27 train_loss: tensor(0.3251, device='cuda:0') val_loss: tensor(0.4944, device='cuda:0')
Adjusting learning rate of group 0 to 6.3830e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 28 train_loss: tensor(0.3329, device='cuda:0') val_loss: tensor(0.4824, device='cuda:0')
Adjusting learning rate of group 0 to 6.2500e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 29 train_loss: tensor(0.3201, device='cuda:0') val_loss: tensor(0.5059, device='cuda:0')
Adjusting learning rate of group 0 to 6.1224e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 30 train_loss: tensor(0.3169, device='cuda:0') val_loss: tensor(0.4697, device='cuda:0')
Adjusting learning rate of group 0 to 6.0000e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 31 train_loss: tensor(0.3075, device='cuda:0') val_loss: tensor(0.4797, device='cuda:0')
Adjusting learning rate of group 0 to 5.8824e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 32 train_loss: tensor(0.2949, device='cuda:0') val_loss: tensor(0.5000, device='cuda:0')
Adjusting learning rate of group 0 to 5.7692e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 33 train_loss: tensor(0.2852, device='cuda:0') val_loss: tensor(0.4847, device='cuda:0')
Adjusting learning rate of group 0 to 5.6604e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 34 train_loss: tensor(0.2871, device='cuda:0') val_loss: tensor(0.5043, device='cuda:0')
Adjusting learning rate of group 0 to 5.5556e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 35 train_loss: tensor(0.2822, device='cuda:0') val_loss: tensor(0.5192, device='cuda:0')
Adjusting learning rate of group 0 to 5.4545e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 36 train_loss: tensor(0.2779, device='cuda:0') val_loss: tensor(0.4922, device='cuda:0')
Adjusting learning rate of group 0 to 5.3571e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 37 train_loss: tensor(0.2765, device='cuda:0') val_loss: tensor(0.4936, device='cuda:0')
Adjusting learning rate of group 0 to 5.2632e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 38 train_loss: tensor(0.2756, device='cuda:0') val_loss: tensor(0.4928, device='cuda:0')
Adjusting learning rate of group 0 to 5.1724e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 39 train_loss: tensor(0.2603, device='cuda:0') val_loss: tensor(0.5061, device='cuda:0')
Adjusting learning rate of group 0 to 5.0847e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 40 train_loss: tensor(0.2548, device='cuda:0') val_loss: tensor(0.4921, device='cuda:0')
F1 Score: 94.2 --> 94.2 saving model to BLSTM3.pt
Adjusting learning rate of group 0 to 5.0000e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 41 train_loss: tensor(0.2637, device='cuda:0') val_loss: tensor(0.4910, device='cuda:0')
Adjusting learning rate of group 0 to 4.9180e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 42 train_loss: tensor(0.2515, device='cuda:0') val_loss: tensor(0.5114, device='cuda:0')
Adjusting learning rate of group 0 to 4.8387e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 43 train_loss: tensor(0.2396, device='cuda:0') val_loss: tensor(0.4963, device='cuda:0')
Adjusting learning rate of group 0 to 4.7619e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 44 train_loss: tensor(0.2450, device='cuda:0') val_loss: tensor(0.4968, device='cuda:0')
Adjusting learning rate of group 0 to 4.6875e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 45 train_loss: tensor(0.2315, device='cuda:0') val_loss: tensor(0.4868, device='cuda:0')
F1 Score: 94.41 --> 94.41 saving model to BLSTM3.pt
Adjusting learning rate of group 0 to 4.6154e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 46 train_loss: tensor(0.2291, device='cuda:0') val_loss: tensor(0.4947, device='cuda:0')
Adjusting learning rate of group 0 to 4.5455e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 47 train_loss: tensor(0.2291, device='cuda:0') val_loss: tensor(0.4980, device='cuda:0')
Adjusting learning rate of group 0 to 4.4776e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 48 train_loss: tensor(0.2202, device='cuda:0') val_loss: tensor(0.5204, device='cuda:0')
Adjusting learning rate of group 0 to 4.4118e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 49 train_loss: tensor(0.2255, device='cuda:0') val_loss: tensor(0.5098, device='cuda:0')
Adjusting learning rate of group 0 to 4.3478e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 50 train_loss: tensor(0.2232, device='cuda:0') val_loss: tensor(0.5147, device='cuda:0')
Adjusting learning rate of group 0 to 4.2857e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 51 train_loss: tensor(0.2165, device='cuda:0') val_loss: tensor(0.5079, device='cuda:0')
Adjusting learning rate of group 0 to 4.2254e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 52 train_loss: tensor(0.2207, device='cuda:0') val_loss: tensor(0.5042, device='cuda:0')
Adjusting learning rate of group 0 to 4.1667e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 53 train_loss: tensor(0.2094, device='cuda:0') val_loss: tensor(0.5053, device='cuda:0')
Adjusting learning rate of group 0 to 4.1096e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 54 train_loss: tensor(0.2107, device='cuda:0') val_loss: tensor(0.5119, device='cuda:0')
Adjusting learning rate of group 0 to 4.0541e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 55 train_loss: tensor(0.2119, device='cuda:0') val_loss: tensor(0.4936, device='cuda:0')
Adjusting learning rate of group 0 to 4.0000e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 56 train_loss: tensor(0.2007, device='cuda:0') val_loss: tensor(0.4893, device='cuda:0')
Adjusting learning rate of group 0 to 3.9474e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 57 train_loss: tensor(0.2182, device='cuda:0') val_loss: tensor(0.5216, device='cuda:0')
Adjusting learning rate of group 0 to 3.8961e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 58 train_loss: tensor(0.2079, device='cuda:0') val_loss: tensor(0.4982, device='cuda:0')
Adjusting learning rate of group 0 to 3.8462e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 59 train_loss: tensor(0.1954, device='cuda:0') val_loss: tensor(0.5204, device='cuda:0')
Adjusting learning rate of group 0 to 3.7975e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 60 train_loss: tensor(0.1986, device='cuda:0') val_loss: tensor(0.5188, device='cuda:0')
Adjusting learning rate of group 0 to 3.7500e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 61 train_loss: tensor(0.2017, device='cuda:0') val_loss: tensor(0.5077, device='cuda:0')
Adjusting learning rate of group 0 to 3.7037e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 62 train_loss: tensor(0.1894, device='cuda:0') val_loss: tensor(0.5077, device='cuda:0')
Adjusting learning rate of group 0 to 3.6585e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 63 train_loss: tensor(0.1911, device='cuda:0') val_loss: tensor(0.5286, device='cuda:0')
Adjusting learning rate of group 0 to 3.6145e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

Epoch: 64 train_loss: tensor(0.1902, device='cuda:0') val_loss: tensor(0.5030, device='cuda:0')
Adjusting learning rate of group 0 to 3.5714e-03.


  0%|          | 0/1499 [00:00<?, ?it/s]

In [19]:
word_embedding_dim = 100
model_file_name = 'BLSTM3_NER.pt'
batch_size = 10
use_cnn_for_char_level = True

word_to_idx = {}
idx_to_word = {}
char_to_idx = {}
idx_to_char = {}

training_data = sorted(make_data(train),key=lambda x:len(x[0]))
validation_data = sorted(make_data(dev),key=lambda x:len(x[0]))
testing_data = sorted(make_data(test),key=lambda x:len(x[0]))

make_word_to_idx(training_data)
make_word_to_idx(validation_data,is_valid_or_test=True)
make_word_to_idx(testing_data,is_valid_or_test=True)
make_tag_to_idx(training_data)

build_embedding_table()

training_data_tensors = make_numeric_data(training_data)
validation_data_tensors = make_numeric_data(validation_data)
testing_data_tensors = make_numeric_data(testing_data)

train_batches = data_batching(training_data_tensors)
dev_batches = data_batching(validation_data_tensors)
test_batches = data_batching(testing_data_tensors)

# random.seed(random_seed)
model = BLSTM()
model.to(device)
model.load_state_dict(torch.load(model_file_name))
model.eval()
best_dev_f1 , dev_f1, save = evaluate_model(model,dev_batches,validation_data,0,"dev","_ner")
best_test_f1 , test_f1, save = evaluate_model(model,test_batches,testing_data,0,"test","_ner")

print("NER F1-Score on Development Data:",dev_f1)
print("NER F1-Score on Testing Data:",test_f1)

NER F1-Score on Development Data: 94.32
NER F1-Score on Testing Data: 90.34


In [None]:
torch.save(model.state_dict(),"model.pt")

In [None]:
infer(model,dev_batches,validation_data,'dev1.out')

- Downloading the model with best F-1 score

In [None]:
from google.colab import files
files.download('BLSTM3.pt') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

- Loading the best model and running evaluation of loaded model on dev dataset to verify correctness.

In [None]:
word_embedding_dim = 101 #including 1 dimension for capitalization
pre_embeddings = get_embedding_data('glove.6B.100d',100)
model_file_name = 'BLSTM3.pt'
batch_size = 16
use_cnn_for_char_level = True

word_to_idx = {}
idx_to_word = {}
char_to_idx = {}
idx_to_char = {}


training_data = sorted(make_data(train,num="3"),key=lambda x:len(x[0]))
validation_data = sorted(make_data(dev,num="3"),key=lambda x:len(x[0]))
testing_data = sorted(make_data(test,has_tags=False,num="3"),key=lambda x:len(x[0]))

make_word_to_idx(training_data)
make_word_to_idx(validation_data,is_valid_or_test=True)
make_word_to_idx(testing_data,is_valid_or_test=True)

make_tag_to_idx(training_data)

build_embedding_table()

validation_data = make_data(dev,num="3")
validation_data_tensors = make_numeric_data(validation_data)
dev_batches = data_batching(validation_data_tensors)

In [None]:
model = BLSTM()
model.load_state_dict(torch.load(model_file_name))

<All keys matched successfully>

In [None]:
model.eval()
best_dev_f1 , dev_f1, save = evaluate_model(model,dev_batches,validation_data,-1,"dev","3")

processed 51578 tokens with 5942 phrases; found: 6024 phrases; correct: 5413.
accuracy:  98.35%; precision:  89.86%; recall:  91.10%; FB1:  90.47
Best F1 Score: 90.47
              LOC: precision:  94.39%; recall:  93.36%; FB1:  93.87  1817
             MISC: precision:  84.99%; recall:  81.67%; FB1:  83.30  886
              ORG: precision:  81.67%; recall:  88.37%; FB1:  84.89  1451
              PER: precision:  94.12%; recall:  95.55%; FB1:  94.83  1870


####  What are the precision, recall and F1 score on the dev data?
- Precision = 89.86%
- Recall = 91.10%
- F1-Score = 90.47

# References

- https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
- Xuezhe Ma, & Eduard Hovy. (2016). End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF.

In [None]:
!apt-get install texlive-xetex texlive-fonts-recommended texlive-latex-recommended
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
!jupyter nbconvert --Application.log_level=CRITICAL --to pdf "/content/drive/MyDrive/Colab Notebooks/CSCI_544_HW4.ipynb"