In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, TensorDataset, DataLoader

import numpy as np
import math
import os.path
from collections import defaultdict
import pickle

from sklearn.metrics import classification_report



In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:

SEED = 544

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Credit: From PyTorch's documentation
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [4]:
def reduceTag(tag,flag=False):
  if flag:
    return tag
  else:
    if tag == 'O':
      return tag
    else:
      return 'X'

In [5]:
def read_data(fname, test_dataset=False):
    sentences = []
    with open(fname, 'r') as f:
        lines = [line.strip() for line in f.readlines()]
        sentence_words = []
        sentence_tags = []
        for line in lines:
            if line:
                # test data has only index and word
                if test_dataset:
                    word = line
                    sentence_words.append(word)
                # train/dev data has index, word, and tag
                else:
                    if len(line) < 2:
                      continue
                    word, tag = line.split()
                    sentence_words.append(word)
                    sentence_tags.append(reduceTag(tag))
            else:
                # Create a sentence upon reaching an empty new line
                if test_dataset:
                    sentences.append(sentence_words)
                else:
                    sentences.append((sentence_words, sentence_tags))
                sentence_words = []
                sentence_tags = []
        # Create a sentence for the last sentence in the document
        # incase it missed a newline in the document at the end
        if len(sentence_words) > 0:
            if test_dataset:
                sentences.append(sentence_words)
            else:
                sentences.append((sentence_words, sentence_tags))
    return sentences

In [6]:
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
train_Path = '/content/drive/MyDrive/data/train.tagged'
test_Path = '/content/drive/MyDrive/data/dev.tagged'
eval_Path = '/content/drive/MyDrive/data/test.untagged'

In [8]:
# Read all datasets given
train_data = read_data(train_Path)
dev_data = read_data(test_Path)
test_data = read_data(eval_Path, test_dataset=True)



In [9]:
test_data


[['&',
  'gt',
  ';',
  '*',
  'The',
  'soldier',
  'was',
  'killed',
  'when',
  'another',
  'avalanche',
  'hit',
  'an',
  'army',
  'barracks',
  'in',
  'the',
  'northern',
  'area',
  'of',
  'Sonmarg',
  ',',
  'said',
  'a',
  'military',
  'spokesman',
  '.'],
 ['&',
  'gt',
  ';',
  '*',
  'Police',
  'last',
  'week',
  'evacuated',
  '80',
  'villagers',
  'from',
  'Waltengoo',
  'Nar',
  'where',
  'dozens',
  'were',
  'killed',
  'after',
  'a',
  'series',
  'of',
  'avalanches',
  'hit',
  'the',
  'area',
  'in',
  '2005',
  'in',
  'the',
  'south',
  'of',
  'the',
  'territory',
  '.'],
 ['&',
  'gt',
  ';',
  '*',
  'The',
  'army',
  'on',
  'Thursday',
  'recovered',
  'the',
  'bodies',
  'of',
  'ten',
  'of',
  'its',
  'men',
  'who',
  'were',
  'killed',
  'in',
  'an',
  'avalanche',
  'the',
  'previous',
  'day',
  '.'],
 ['&',
  'gt',
  ';',
  '*',
  'The',
  'four',
  'civilians',
  'killed',
  'included',
  'two',
  'children',
  'of',
  'a',
  

In [10]:
# Converts a sequence of words to a series of indices as given by the to_ix mapping (word -> index).
def prepare_sequence(seq, to_ix, use_unk=False):
    if use_unk:
        indices = [to_ix[w] if w in to_ix else to_ix[''] for w in seq]
    else:
        indices = [to_ix[w] for w in seq]
    return indices

# Returns the spelling features for each word in the sentence.
# Currently there are only 5 features:
# 0 - PAD = special token used for the word ''
# 1 - ALL_LOWER = when the word is all lower case, such as 'cat'
# 2 - ALL_UPPER = when the word is all upper case, such as 'IBM'
# 3 - FIRST_UPPER = when the first character is capitalized, such as 'John'
# 4 - OTHERS = all other words that did not fit the categories above
def get_spelling_feature(sentence):
    result = []
    for word in sentence:
        # PAD = 0
        if word == '':
            result.append(0)
        ## ALL LOWER = 1
        elif word.islower():
            result.append(1)
        # ALL UPPER = 2
        elif word.isupper():
            result.append(2)
        # FIRST UPPER = 3
        elif word[0].isupper():
            result.append(3)
        # OTHERS = 4
        else:
            result.append(4)
    return result


In [11]:
# The NERDataset is responsible for converting the data, as retrieved from read_data(), into PyTorch tensors of indices
# It will pad all sentences to the same length and convert words to indices using a vocabulary lookup
class NERDataset(Dataset):
    def __init__(self, data):
        # Retrieves longest sentence, for padding
        max_sentence_len = max([len(sentence) for sentence, tags in data])
        self.X = []
        self.X_original = []
        self.y = []
        self.X_spelling = []
        
        for sentence, tags in data:
            # Pad the sentences to the same length
            padded_sentence = sentence.copy()
            padded_tags = tags.copy()
            while len(padded_sentence) < max_sentence_len:
                padded_sentence.append('')
                padded_tags.append('')
            # Convert to indices
            transformed_sentence = prepare_sequence(padded_sentence, word_to_ix, use_unk=True)
            transformed_tags = prepare_sequence(padded_tags, tag_to_ix)
            # Get spelling indices
            spelling_sentence = get_spelling_feature(padded_sentence)
            # Add to dataset
            self.X.append(transformed_sentence)
            self.X_original.append(padded_sentence)
            self.y.append(transformed_tags)
            self.X_spelling.append(spelling_sentence)
            
        self.X = torch.from_numpy(np.array(self.X, dtype=np.int64)).to(device)
        self.y = torch.from_numpy(np.array(self.y, dtype=np.int64)).to(device)
        self.X_spelling = torch.from_numpy(np.array(self.X_spelling, dtype=np.int64)).to(device)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        return self.X[index], self.y[index], self.X_original[index], self.X_spelling[index]

In [12]:
VOCAB_THRESHOLD = 0

# Generate vocab
words_freq = defaultdict(int)
for sentence, tags in train_data:
    for word in sentence:
        words_freq[word] += 1
        
vocab = {key for key, val in words_freq.items() if val >= VOCAB_THRESHOLD}

# Generate word/tag to index mappings
word_to_ix = {'': 0, '': 1}
tag_to_ix = {'': 0}
for sentence, tags in train_data:
    for word in sentence:
        if word not in vocab:
            word = ''
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)
            
# Generate index to word/tag mappings
ix_to_word = {v: k for k, v in word_to_ix.items()}
ix_to_tag = {v: k for k, v in tag_to_ix.items()}

# Calculate the size of vocabulary & tags
VOCAB_SIZE = len(word_to_ix)
TAGS_SIZE = len(tag_to_ix)


# Utility Functions for Prediction


# Using GloVe word embeddings


In [13]:
BATCH_SIZE = 1

EMBEDDING_DIM = 200
LSTM_HIDDEN_DIM = 256
LSTM_DROPOUT = 0.25
LINEAR_DIM = 164

LEARNING_RATE = 0.01
MOMENTUM = 0.9

ELU_ALPHA = 0.5

SCHEDULER_STEP_SIZE = 5
SCHEDULER_GAMMA = 0.5

NUM_EPOCHS = 100

SPELLING_EMBEDDING_DIM = 15

In [14]:
for sentence,tag in train_data:
  print(sentence)
  break
for sentence in test_data:
  print(sentence)
  break

['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
['&', 'gt', ';', '*', 'The', 'soldier', 'was', 'killed', 'when', 'another', 'avalanche', 'hit', 'an', 'army', 'barracks', 'in', 'the', 'northern', 'area', 'of', 'Sonmarg', ',', 'said', 'a', 'military', 'spokesman', '.']


In [15]:
embeddings_dict = {}
vocab = set(['', ''])

with open('/content/drive/MyDrive/data/glove.twitter.27B.200d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

for sentence, tags in train_data:
    vocab.update(sentence)
for sentence, tags in dev_data:
    vocab.update(sentence)
for sentence in test_data:
    vocab.update(sentence)

word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {v: k for k, v in word_to_ix.items()}

embedding_matrix = np.zeros((len(vocab), EMBEDDING_DIM))

for word in vocab:
    index = word_to_ix[word]
    if word in embeddings_dict:
        vector = embeddings_dict[word]
    elif word.lower() in embeddings_dict:
        vector = embeddings_dict[word.lower()]
    else:
        vector = np.random.rand(EMBEDDING_DIM)
    embedding_matrix[index] = vector

VOCAB_SIZE = len(word_to_ix)

In [16]:
class BLSTM2(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, linear_dim, tags_size, lstm_dropout, elu_alpha, embeddings, spelling_embedding_dim):
        super(BLSTM2, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.embeddings_word = nn.Embedding.from_pretrained(torch.from_numpy(embeddings).float(), freeze=False, padding_idx=word_to_ix[''])
        self.embeddings_spelling = nn.Embedding(num_embeddings=5, embedding_dim=spelling_embedding_dim, padding_idx=0)
        self.dropout_pre_lstm = nn.Dropout(lstm_dropout)
        self.lstm = nn.LSTM(embedding_dim+spelling_embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout_post_lstm = nn.Dropout(lstm_dropout)
        self.linear = nn.Linear(hidden_dim * 2, linear_dim)
        self.elu = nn.ELU(alpha=elu_alpha)
        #self.ReLu = nn.ReLU()
        self.linear2 = nn.Linear(linear_dim, tags_size)
    
    def forward(self, x_word, x_spelling):
        x1 = self.embeddings_word(x_word)
        x2 = self.embeddings_spelling(x_spelling)
        x = torch.cat((x1, x2), dim=2).to(device)
        x = self.dropout_pre_lstm(x)
        
        h0 = torch.zeros(2, x.size(0), self.hidden_dim).to(device)
        c0 = torch.zeros(2, x.size(0), self.hidden_dim).to(device)
        out, _ = self.lstm(x, (h0, c0))
        
        out = self.dropout_post_lstm(out)
        out = self.linear(out)
        # out = self.ReLu(out)
        out = self.elu(out)
        out = self.linear2(out)
    
        return out


# Utils

In [17]:
# Used to predict on a development data loader
# Writes the output to a file, i.e. to dev.out
def predict_dev2(model, data_loader, fname):
    outputs = []
    model.eval()
    with torch.no_grad():
        for X, y, X_original, X_spelling in data_loader:
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X, X_spelling)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()

            idx = 1
            output = []
            for i in range(len(y_pred_flat)):
                word = X_original[i][0]
                pred = ix_to_tag[y_pred_flat[i]]
                if word == '':
                    break
                output.append((idx, word, pred))
                idx += 1
            outputs.append(output)

    with open(fname, 'w') as f:
        for i in range(len(outputs)):
            for j in range(len(outputs[i])):
                idx, word, pred = outputs[i][j]
                f.write(f'{idx} {word} {pred}\n')
            if i != len(outputs)-1:
                f.write('\n')

# Used to predict on a test data, list of sentences
# Writes the output to a file, i.e. to test.out
def predict_test2(model, sentences, fname):
    outputs = []
    model.eval()
    with torch.no_grad():
        for sentence in sentences:
            spelling_sentence = [get_spelling_feature(sentence)]
            spelling_sentence = torch.from_numpy(np.array(spelling_sentence, dtype=np.int64)).to(device)
            
            transformed_sentence = [prepare_sequence(sentence, word_to_ix, use_unk=True)]
            transformed_sentence = torch.from_numpy(np.array(transformed_sentence, dtype=np.int64)).to(device)
            
            y_pred_scores = model(transformed_sentence, spelling_sentence)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()

            idx = 1
            output = []
            for i in range(len(y_pred_flat)):
                word = sentence[i]
                pred = ix_to_tag[y_pred_flat[i]]
                if word == '':
                    break
                output.append((word, pred))
                idx += 1
            outputs.append(output)

    with open(fname, 'w') as f:
        for i in range(len(outputs)):
            for j in range(len(outputs[i])):
                word, pred = outputs[i][j]
                f.write(f'{word} {pred}\n')
            if i != len(outputs)-1:
                f.write('\n')
                
# Used to predict on a development data loader
# Writes statistics to console
def predict2(model, data_loader, message):
    all_y = []
    all_y_pred = []
    model.eval()
    with torch.no_grad():
        for X, y, X_original, X_spelling in data_loader:
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X, X_spelling)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()
            y_flat = torch.flatten(y).tolist()
            
            for i in range(len(y_pred_flat)):
                if y_flat[i] == tag_to_ix['']:
                    break
                all_y.append(y_flat[i])
                all_y_pred.append(y_pred_flat[i])

    print(message, classification_report(all_y, all_y_pred))
                
# Used to predict on a development data loader
# Writes the output to a file for PERL script, i.e. to prediction.txt
def predict_perl2(model, data_loader, fname):
    outputs = []
    model.eval()
    with torch.no_grad():
        for X, y, X_original, X_spelling in data_loader:
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X, X_spelling)
            y_pred = torch.argmax(y_pred_scores, dim=2)
            y_pred_flat = torch.flatten(y_pred).tolist()
            y_flat = torch.flatten(y).tolist()

            idx = 1
            output = []
            for i in range(len(y_pred_flat)):
                word = X_original[i][0]
                gold = ix_to_tag[y_flat[i]]
                pred = ix_to_tag[y_pred_flat[i]]
                if word == '':
                    break
                output.append((idx, word, gold, pred))
                idx += 1
            outputs.append(output)

    with open(fname, 'w') as f:
        for i in range(len(outputs)):
            for j in range(len(outputs[i])):
                idx, word, gold, pred = outputs[i][j]
                f.write(f'{idx} {word} {gold} {pred}\n')
            if i != len(outputs)-1:
                f.write('\n')

# Data

In [18]:
train_dataset = NERDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

dev_dataset = NERDataset(dev_data)
dev_loader = DataLoader(dev_dataset, batch_size=1, shuffle=False)


In [19]:
model = BLSTM2(VOCAB_SIZE, EMBEDDING_DIM, LSTM_HIDDEN_DIM, LINEAR_DIM, TAGS_SIZE, LSTM_DROPOUT, ELU_ALPHA,
               embedding_matrix, SPELLING_EMBEDDING_DIM).to(device)
ratio = float(2000/14483)

weights = [0,ratio,1-ratio]
class_weights = torch.FloatTensor(weights).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=SCHEDULER_STEP_SIZE, gamma=SCHEDULER_GAMMA)


In [20]:
# optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=SCHEDULER_STEP_SIZE, gamma=SCHEDULER_GAMMA)

In [21]:
from sklearn.metrics import f1_score

In [22]:
%%time

if os.path.isfile('blstm2.pt'):
    print('Task 2', 'blstm2.pt exists. Loading existing model...')
    model = torch.load('blstm2.pt')
    model.to(device)
else:
    print('Task 2', 'blstm2.pt does not exist. Training a new model...')
    total_loss = []
    for epoch in range(3):
        model.train()
        for i, (X, y, X_original, X_spelling) in enumerate(train_loader):
            X, y = X.to(device), y.to(device)

            y_pred_scores = model(X, X_spelling)
            y_pred = torch.flatten(y_pred_scores, start_dim=0, end_dim=1)
            y = torch.flatten(y)
            loss = criterion(y_pred, y) 
            # loss = loss_fn(y_pred,y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss.append(loss.item())
        print(f'Epoch {epoch+1} / {NUM_EPOCHS}, training loss: {np.average(total_loss):.5f}, learning rate: {optimizer.param_groups[0]["lr"]:.5f}')
        total_loss = []
        scheduler.step()
        predict2(model, dev_loader, f'Epoch {epoch+1} / {NUM_EPOCHS}')


Task 2 blstm2.pt does not exist. Training a new model...
Epoch 1 / 100, training loss: 0.22706, learning rate: 0.01000
Epoch 1 / 100               precision    recall  f1-score   support

           1       0.97      0.97      0.97     14483
           2       0.63      0.70      0.66      1250

    accuracy                           0.94     15733
   macro avg       0.80      0.83      0.82     15733
weighted avg       0.95      0.94      0.95     15733

Epoch 2 / 100, training loss: 0.17293, learning rate: 0.01000
Epoch 2 / 100               precision    recall  f1-score   support

           1       0.96      0.99      0.98     14483
           2       0.85      0.53      0.65      1250

    accuracy                           0.96     15733
   macro avg       0.91      0.76      0.82     15733
weighted avg       0.95      0.96      0.95     15733

Epoch 3 / 100, training loss: 0.15392, learning rate: 0.01000
Epoch 3 / 100               precision    recall  f1-score   support

      