In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.utils import class_weight
import warnings
import gensim
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics import confusion_matrix, classification_report
from torch.optim.lr_scheduler import StepLR
from torch.optim.lr_scheduler import ReduceLROnPlateau as lr_scheduler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
warnings.filterwarnings('ignore')

In [2]:
#config variables
train_path = "data/train"
dev_path = "data/dev"
test_path = "data/test"
glove_path = "data/glove.6B.100d.gz"
unk = "<unk>"
pad = "<pad>"
num = "<num>"
sym = "<sym>"
max_len = 128
batch_size = 8
numbers = ['one','two','three','four','five', 'six','seven','eight','nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety', 'zero', 'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion', 'quintillion', 'sextillion', 'septillion', 'octillion', 'nonillion', 'decillion']

In [3]:
def is_number(s):
    try:
        if "," in s:
            s = s.replace(",", "")
        if ":" in s:
            s = s.replace(":", "")
        if "-" in s:
            s = s.replace("-", "")
        if "/" in s:
            s = s.replace("/", "")
        if "." in s:
            s = s.replace(".", "")
        if s.lower() in numbers:
            return True
        float(s)
        return True
    except ValueError:
        return False
def is_symbol(s):
    flag = 1
    for char in s:
        if char.isalnum():
            flag = 0
            break
    if flag == 1:
        return True
    else:
        return False

In [4]:
#Function to read the datafile in the "path" line by line and return a list of lists 
#after removing trailing white spaces and empty lists
def read_data(path):
    with open(path, "r") as file_obj:
        lines = file_obj.readlines()
    line_list = [line.rstrip().split() for line in lines]
    line_list = [x for x in line_list if x != []]
    return line_list

In [5]:
train_set = read_data(train_path)
dev_set = read_data(dev_path)
test_set = read_data(test_path)

In [6]:
vocab = {}
tag_to_idx = {}
idx_to_tag = {}
just_tags = []
idx = 0
threshold = 1

In [7]:
#creating vocabulary from the training_data for task1,
#removed the process of deleting words that are rare to increase performance
#also, creating the tag to idx and idx to tag dictionaries
for line in train_set:
    word = line[1]
    tag = line[2]
    if is_number(word):
        word = num
    if word in vocab:
        vocab[word] += 1
    else:
        vocab[word] = 1
    if tag not in tag_to_idx:
        tag_to_idx[tag] = idx
        idx_to_tag[idx] = tag
        idx += 1
    just_tags.append(tag_to_idx[tag])
# for word, freq in  list(vocab.items()):
#     if freq <= threshold:
#         del vocab[word]
vocab["<unk>"] = 1

In [8]:
print("Length of training set vocabulary :", len(vocab))

Length of training set vocabulary : 20194


In [9]:
#creating word to idx and idx to word dictionaries to represent words as indices
word_to_idx = {}
idx_to_word = {}
idx = 1
for word in vocab:
    if word not in word_to_idx:
        word_to_idx[word] = idx
        idx_to_word[idx] = word
        idx += 1
word_to_idx[pad] = 0
idx_to_word[0] = pad
pad_idx = 0

In [10]:
#function to return a numpy array of indices of the words padded to max_len = 128
def dataset_creation(sentence, test = False):
    global word_to_idx, tag_to_idx, unk, max_len
    sen_len = len(sentence)
    pad_len = max_len - sen_len
    words = []
    tags = []
    if test:
        for idx, word in sentence:
            if word not in word_to_idx:
                if is_number(word):
                    word = num
                else:
                    word = unk
            words.append(word_to_idx[word])
        words = np.array(words)
        pad_seq = pad_idx * np.ones(pad_len)
        words = np.concatenate((words, pad_seq), axis = 0)
        return words
    else:
        for idx, word, tag in sentence:
            if word not in word_to_idx:
                if is_number(word):
                    word = num
    #             elif word.isupper():
    #                 temp = word[0] + word[1:].lower()
    #                 if temp in word_to_idx:
    #                     word = temp
    #                 else:
    #                     word = unk
                else:
                    word = unk
            words.append(word_to_idx[word])
            tags.append(tag_to_idx[tag])
        words = np.array(words)
        tags = np.array(tags)
        pad_seq = pad_idx * np.ones(pad_len)
        pad_tag = -1 * np.ones(pad_len)
        words = np.concatenate((words, pad_seq), axis = 0)
        tags = np.concatenate((tags, pad_tag), axis = 0)
        return words, tags

In [11]:
#function to create the input and output data that is used to train the network
#function calls dataset_creation on each sentence
def create_lstm_ip(dataset, test = False):
    x_lstm = []
    y_lstm = []
    sentence = []
    for i in range(len(dataset)):
        if i == len(dataset) - 1 or (dataset[i][0] == '1' and i != 0):
            if test:
                x = dataset_creation(sentence, test)
                x_lstm.append(x)
            else:
                x, y = dataset_creation(sentence, test)
                x_lstm.append(x)
                y_lstm.append(y)
            sentence = []
            sentence.append(dataset[i])
        else:
            sentence.append(dataset[i])
    if sentence != []:
        if test:
            last_x = dataset_creation(sentence, test)
            x_lstm.append(last_x)
        else:
            last_x, last_y = dataset_creation(sentence)
            x_lstm.append(last_x)
            y_lstm.append(last_y)
    if test:
        return np.array(x_lstm)
    else:
        return np.array(x_lstm), np.array(y_lstm)

In [12]:
x_lstm_train, y_lstm_train = create_lstm_ip(train_set, False)
x_lstm_dev, y_lstm_dev = create_lstm_ip(dev_set, False)
x_lstm_test = create_lstm_ip(test_set, True)

In [13]:
#dataset class definition
class Dataset(object):
    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

    def __add__(self, other):
        return ConcatDataset([self, other])

class data(Dataset):
    def __init__(self, inputs, transform = None):
        self.data = inputs
        self.transform = transform
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        inputs = self.data[index][0]
        label = self.data[index][1]
        if self.transform is not None:
            inputs = self.transform(inputs)
            
        return inputs, label

In [14]:
#convertin the arrays into tensors
x_lstm_train, x_lstm_dev, x_lstm_test = torch.LongTensor(x_lstm_train), torch.LongTensor(x_lstm_dev), torch.LongTensor(x_lstm_test)
y_lstm_train, y_lstm_dev = torch.LongTensor(y_lstm_train), torch.LongTensor(y_lstm_dev)

In [15]:
#function to return the original lengths of the padded sentences in a batch
#function returns a tensor of batch_size containing the lengths of the corresponding sentence
def get_lengths(seq, idx):
    lens = []
    for x in seq:
        length = 0
        for i in range(len(x)):
            if x[i] == idx:
                break
            length += 1
        lens.append(length)
    return torch.Tensor(lens)

In [16]:
#creating the data loaders
lstm_train_dataset = TensorDataset(x_lstm_train, y_lstm_train)
lstm_train_dataset = data(lstm_train_dataset)
lstm_dev_dataset = TensorDataset(x_lstm_dev, y_lstm_dev)
lstm_dev_dataset = data(lstm_dev_dataset)

lstm_train_loader = DataLoader(lstm_train_dataset, batch_size = batch_size, drop_last = True, shuffle = True)
lstm_dev_loader = DataLoader(lstm_dev_dataset, batch_size = batch_size, drop_last = True, shuffle = True)

In [17]:
#initializing network hyperparameters
input_dim = len(word_to_idx)
embed_dim = 100
hidden_dim = 256
linear_dim = 128
output_dim = len(tag_to_idx)
pad_idx = word_to_idx[pad]
class_weights = class_weight.compute_class_weight('balanced', np.unique(just_tags), just_tags)

In [18]:
#accuracy function to determine sentence level accuracy in a batch
def accuracy(pred, targ):
    pred = pred.argmax(dim = 1, keepdim = True)
    non_pad_elements = (targ != -1).nonzero()
    correct = pred[non_pad_elements].squeeze(1).eq(targ[non_pad_elements])
    return correct.sum() / torch.FloatTensor([targ[non_pad_elements].shape[0]]).to(device)

In [19]:
#bi-LSTM model with an embedding layer
class bLSTM(torch.nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, linear_dim, output_dim, pad_idx):
        super(bLSTM, self).__init__()
        self.embedding_dim = embed_dim
        self.embedding = torch.nn.Embedding(num_embeddings = input_dim, embedding_dim = embed_dim)
        self.blstm = torch.nn.LSTM(input_size = embed_dim, hidden_size = hidden_dim, num_layers = 1, bidirectional = True, batch_first = True, dropout = 0.33)
        self.linear = torch.nn.Linear(hidden_dim*2, linear_dim)
        self.elu = torch.nn.ELU()
        self.classifier = torch.nn.Linear(linear_dim, output_dim)
    
    def forward(self, x):
        emb = self.embedding(x)
        lens = get_lengths(x, 0)
        packed = pack_padded_sequence(emb, lens, batch_first = True, enforce_sorted = False)
        blstm_out, _ = self.blstm(packed)
        blstm_out, _ = pad_packed_sequence(blstm_out, batch_first = True, padding_value = 0, total_length = 128)
        lin_out = self.elu(self.linear(blstm_out))
        class_out = self.classifier(lin_out)
        return class_out
    
    def init_weights(self):
        for name, param in self.named_parameters():
            torch.nn.init.normal_(param.data, mean=0, std=0.1)

    def init_embeddings(self, padding_idx):
        self.embedding.weight.data[padding_idx] = torch.zeros(self.embedding_dim)

In [20]:
#initializing the model and the weights and printing the model architecture
model = bLSTM(input_dim, embed_dim, hidden_dim, linear_dim, output_dim, pad_idx).to(device)
model.init_weights()
model.init_embeddings(padding_idx = pad_idx)
print(model)

bLSTM(
  (embedding): Embedding(20195, 100)
  (blstm): LSTM(100, 256, batch_first=True, dropout=0.33, bidirectional=True)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)


In [21]:
#initializing the loss function, optimizer (with learning rate and momentum) and the scheduler (with step size)
class_weights = torch.FloatTensor(class_weights)
loss_fn = torch.nn.CrossEntropyLoss(weight = class_weights, ignore_index = -1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.017, momentum = 0.9)
scheduler = StepLR(optimizer, step_size = 25) 
# scheduler = lr_scheduler(optimizer, 'max', patience = 4, factor = 0.9)

In [22]:
#training the model
epochs = 100
dev_max_acc = 0
train_loader = lstm_train_loader
dev_loader = lstm_dev_loader

for epoch in range(epochs):
    model.train()
    train_acc = 0.0
    dev_acc = 0.0
    batch = 0
    for inputs, target in train_loader:
        print(batch, end = "\r")
        batch += 1
        inputs, target = inputs.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(inputs)
        output = output.view(-1, output.shape[-1])
        target = target.view(-1)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        train_acc += float(accuracy(output, target).item())
    
    model.eval()
    for inputs, target in dev_loader:
        inputs, target = inputs.to(device), target.to(device)
        inputs, target = inputs, target
        output = model(inputs)
        output = output.view(-1, output.shape[-1])
        target = target.view(-1)
        loss = loss_fn(output, target)
        dev_acc += float(accuracy(output, target).item())
    
    train_acc = (train_acc * batch_size)/len(train_loader.dataset)
    dev_acc = (dev_acc * batch_size)/len(dev_loader.dataset)
    
    print('Epoch: {} \tTraining Acc: {:.6f} \tDev Set Acc: {:.6f}'.format(epoch+1, train_acc,dev_acc))
    if dev_acc >= dev_max_acc:
        print('Dev Set acc increased ({:.6f} --> {:.6f}). Saving model...'.format(dev_max_acc, dev_acc))
        torch.save(model.state_dict(), 'model/blstm1.pt')
        dev_max_acc = dev_acc
    scheduler.step(dev_max_acc)

Epoch: 1 	Training Acc: 0.580134 	Dev Set Acc: 0.699054
Dev Set acc increased (0.000000 --> 0.699054). Saving model...
Epoch: 2 	Training Acc: 0.806081 	Dev Set Acc: 0.787716
Dev Set acc increased (0.699054 --> 0.787716). Saving model...
Epoch: 3 	Training Acc: 0.893995 	Dev Set Acc: 0.865295
Dev Set acc increased (0.787716 --> 0.865295). Saving model...
Epoch: 4 	Training Acc: 0.932915 	Dev Set Acc: 0.928622
Dev Set acc increased (0.865295 --> 0.928622). Saving model...
Epoch: 5 	Training Acc: 0.955312 	Dev Set Acc: 0.942170
Dev Set acc increased (0.928622 --> 0.942170). Saving model...
Epoch: 6 	Training Acc: 0.969550 	Dev Set Acc: 0.948110
Dev Set acc increased (0.942170 --> 0.948110). Saving model...
Epoch: 7 	Training Acc: 0.978083 	Dev Set Acc: 0.936674
Epoch: 8 	Training Acc: 0.981275 	Dev Set Acc: 0.952127
Dev Set acc increased (0.948110 --> 0.952127). Saving model...
Epoch: 9 	Training Acc: 0.983808 	Dev Set Acc: 0.944377
Epoch: 10 	Training Acc: 0.988801 	Dev Set Acc: 0.95729

In [28]:
# del model, optimizer, loss_fn

In [20]:
#function to create the output file in the required format
def create_op_file(x, y, model, dataset, file, pad_idx, test = False):
    count = 0
    model.eval()
    line_num = 0
    global idx_to_word, idx_to_tag
    with torch.no_grad():
        with open(file, "w") as fp:
            for i in range(len(x)):
                idx = 1
                print(i, end = "\r")
                ip = x[i].to(device)
                ip = torch.unsqueeze(ip, 0)
                op = model(ip)
                op = op.view(-1, op.shape[-1])
                _, pred = torch.max(op, 1)
                if test:
                    for j in range(len(pred)):
                        if x[i][j] == pad_idx:
                            if i != len(x) - 1:
                                fp.write("\n")
                            break
                        pred_tag = int(pred[j].item())
                        z = dataset[line_num][1]
                        fp.write("{} {} {}\n".format(idx, z, idx_to_tag[pred_tag]))
                        line_num += 1
                        idx += 1
                else:
                    target = y[i]
                    for j in range(len(target)):
                        if target[j] == -1:
                            if i != len(x) - 1:
                                fp.write("\n")
                            break
                        pred_tag = int(pred[j].item())
                        targ_tag = int(target[j].item())
                        if pred_tag == targ_tag:
                            count += 1
                        z = dataset[line_num][1]
                        fp.write("{} {} {}\n".format(idx, z, idx_to_tag[pred_tag]))
                        line_num += 1
                        idx += 1
#     print(count / line_num)

In [27]:
#loading the model to predict on given dataset and store it as a file
model = bLSTM(input_dim, embed_dim, hidden_dim, linear_dim, output_dim, pad_idx).to(device)
print(model)
model.load_state_dict(torch.load('model/blstm1.pt'))
create_op_file(x_lstm_dev, y_lstm_dev, model, dev_set, "outputs/dev1.out", pad_idx, False)

bLSTM(
  (embedding): Embedding(20195, 100)
  (blstm): LSTM(100, 256, batch_first=True, dropout=0.33, bidirectional=True)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)
3465

In [30]:
del model

In [31]:
#loading the model to predict on given dataset and store it as a file
model = bLSTM(input_dim, embed_dim, hidden_dim, linear_dim, output_dim, pad_idx).to(device)
print(model)
model.load_state_dict(torch.load('model/blstm1.pt'))
create_op_file(x_lstm_test, None, model, test_set, "outputs/test1.out", pad_idx, True)

bLSTM(
  (embedding): Embedding(20195, 100)
  (blstm): LSTM(100, 256, batch_first=True, dropout=0.33, bidirectional=True)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)
3683

In [21]:
#function to create the list of lists which just contain the words, from the given dataset
def create_corpus(dataset, test = False):
    sentences = []
    sent = []
    if test:
        for idx, word in dataset:
            if idx == '1' and sent != []:
                sentences.append(sent)
                sent = [word]
            else:
                sent.append(word)
        if sent != []:
            sentences.append(sent)
    else:
        for idx, word, tag in dataset:
            if idx == '1' and sent != []:
                sentences.append(sent)
                sent = [word]
            else:
                sent.append(word)
        if sent != []:
            sentences.append(sent)
    return sentences

In [22]:
#creating a corpus that contains sentences from train, dev and test dataset
train_corpus = create_corpus(train_set, False)
dev_corpus = create_corpus(dev_set, False)
test_corpus = create_corpus(test_set, True)
corpus = train_corpus + dev_corpus + test_corpus

In [23]:
#loading the glove model
glove_w2v_file = 'data/glove.6B.100d.txt.word2vec'
glove2word2vec(glove_path, glove_w2v_file)
glove_vec = KeyedVectors.load_word2vec_format(glove_w2v_file)

In [24]:
#function to create a global vocabulary consisting of words from train, dev and test sets
#function creates the global word to idx and idx to word dictionaries
def create_global_word2idx(corpus):
    idx = 0
    global all_word_to_idx, all_idx_to_word
    for sent in corpus:
        for word in sent:
            if is_number(word):
                word = num
            if word not in all_word_to_idx:
                all_word_to_idx[word] = idx
                all_idx_to_word[idx] = word
                idx += 1
    all_word_to_idx[pad] = idx
    all_idx_to_word[idx] = pad

In [25]:
#function to create the weight matrix, for the embedding layer, from the glove vectors of each word in the corpus
#words that do not have a glove embedding are assigned random embedding from a normal distribution
#words are first converted into lower case and then assigned an embedding which is stored in a embedding dictionary
#two words, capitalised and lower case are assigned the same embedding initially but they differ by their indices
#since the embedding layer with the weight matrix is trainable, at the end of the training,
#the embeddings of both those words should look different if they are not semantically similar to each other
def create_weight_matrix(weight_matrix, model):
    global all_word_to_idx, embedding_dict
    for word, idx in all_word_to_idx.items():
#         embed = np.zeros(100, dtype = float)
        word = word.lower()
        if word in embedding_dict:
            weight_matrix[idx] = embedding_dict[word]
        else:
            try:
                weight_matrix[idx] = model[word]
            except KeyError:
                rand_embed = np.random.normal(scale = 0.6, size = (100,))
                weight_matrix[idx] = rand_embed
                embedding_dict[word] = rand_embed
    
    return weight_matrix

In [26]:
#function to pad the sentences
def pad_glove(sentence):
    global max_len, glove_pad
    diff = max_len - len(sentence)
    sentence = np.concatenate((sentence, glove_pad * np.ones(diff, dtype = float)))
    return sentence

In [27]:
#function to create the dataset for bi-LSTM with GloVe embeddings
def create_glove_data(sentences, test = False):
    global all_word_to_idx
    glove_sentences = []
    glove_sent = []
    if test:
        for idx, word, in sentences:
            if is_number(word):
                word = num
            if idx == '1' and glove_sent != []:
                temp = np.array(glove_sent)
                temp = pad_glove(temp)
                glove_sentences.append(temp)
                glove_sent = [all_word_to_idx[word]]
            else:
                glove_sent.append(all_word_to_idx[word])
        if glove_sent != []:
            temp = np.array(glove_sent)
            temp = pad_glove(temp)
            glove_sentences.append(temp)
    else:
        for idx, word, tag in sentences:
            if is_number(word):
                word = num
            if idx == '1' and glove_sent != []:
                temp = np.array(glove_sent)
                temp = pad_glove(temp)
                glove_sentences.append(temp)
                glove_sent = [all_word_to_idx[word]]
            else:
                glove_sent.append(all_word_to_idx[word])
        if glove_sent != []:
            temp = np.array(glove_sent)
            temp = pad_glove(temp)
            glove_sentences.append(temp)
    return np.array(glove_sentences)

In [28]:
#creating the global corpus and dictionaries
all_word_to_idx = {}
all_idx_to_word = {}
create_global_word2idx(corpus)
glove_pad = all_word_to_idx[pad]

In [29]:
#creating the weight matrix
weight_matrix = np.zeros((len(all_word_to_idx), 100), dtype = float)
embedding_dict = {}
embedding_dict[pad] = np.zeros(100, dtype = float)
weight_matrix = create_weight_matrix(weight_matrix, glove_vec)

In [30]:
train_sent = create_glove_data(train_set, False)
dev_sent = create_glove_data(dev_set, False)
test_sent = create_glove_data(test_set, True)

In [31]:
glove_train_x, glove_dev_x, glove_test_x = torch.LongTensor(train_sent).to(device), torch.LongTensor(dev_sent).to(device), torch.LongTensor(test_sent).to(device)

In [32]:
#since the tags and their indices are the same in both tasks, the tag tensors can be reused
lstm_glove_train_dataset = TensorDataset(glove_train_x, y_lstm_train)
lstm_glove_train_dataset = data(lstm_glove_train_dataset)
lstm_glove_dev_dataset = TensorDataset(glove_dev_x, y_lstm_dev)
lstm_glove_dev_dataset = data(lstm_glove_dev_dataset)

lstm_glove_train_loader = DataLoader(lstm_glove_train_dataset, batch_size = batch_size, drop_last = True, shuffle = True)
lstm_glove_dev_loader = DataLoader(lstm_glove_dev_dataset, batch_size = batch_size, drop_last = True, shuffle = True)

In [33]:
input_dim = len(all_word_to_idx)
embed_dim = 100
hidden_dim = 256
linear_dim = 128
output_dim = len(tag_to_idx)
class_weights = class_weight.compute_class_weight('balanced', np.unique(just_tags), just_tags)

In [34]:
#function to create the embedding layer and load the weights from the weight matrix
#returns an embedding layer
def create_embedding(input_dim, embed_dim, pad_idx, weight_matrix):
    weight_matrix = torch.FloatTensor(weight_matrix).to(device)
    embedding = torch.nn.Embedding(num_embeddings = input_dim, embedding_dim = embed_dim, padding_idx = pad_idx)
    embedding.load_state_dict({'weight': weight_matrix})
    return embedding

In [35]:
#bi-LSTM with GloVe embeddings model
#initialising weights of the network didn't improve performance
class glove_bLSTM(torch.nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, linear_dim, output_dim, pad_idx, weight_matrix):
        super(glove_bLSTM, self).__init__()
        self.pad_idx = pad_idx
        self.embedding = create_embedding(input_dim, embed_dim, pad_idx, weight_matrix)
        self.blstm = torch.nn.LSTM(input_size = embed_dim, hidden_size = hidden_dim, num_layers = 1, bidirectional = True, batch_first = True, dropout = 0.33)
        self.linear = torch.nn.Linear(hidden_dim * 2, linear_dim)
        self.elu = torch.nn.ELU()
        self.classifier = torch.nn.Linear(linear_dim, output_dim)
    
    def forward(self, x):
        emb = self.embedding(x)
        pad_idx = self.pad_idx
        lens = get_lengths(x, pad_idx)
        packed = pack_padded_sequence(emb, lens, batch_first = True, enforce_sorted = False)
        blstm_out, _ = self.blstm(packed)
        blstm_out, _ = pad_packed_sequence(blstm_out, batch_first = True, padding_value = 0, total_length = 128)
        lin_out = self.elu(self.linear(blstm_out))
        class_out = self.classifier(lin_out)
        return class_out
    
#     def init_weights(self):
#         for name, param in self.named_parameters():
#             torch.nn.init.normal_(param.data, mean=0, std=0.1)

In [36]:
model = glove_bLSTM(input_dim, embed_dim, hidden_dim, linear_dim, output_dim, glove_pad, weight_matrix).to(device)
# model.init_weights()
print(model)

glove_bLSTM(
  (embedding): Embedding(25455, 100, padding_idx=25454)
  (blstm): LSTM(100, 256, batch_first=True, dropout=0.33, bidirectional=True)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)


In [37]:
class_weights = torch.FloatTensor(class_weights)
loss_fn = torch.nn.CrossEntropyLoss(weight = class_weights, ignore_index = -1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.017, momentum = 0.9)
scheduler = StepLR(optimizer, step_size = 25) 
# scheduler = lr_scheduler(optimizer, 'max', patience = 4, factor = 0.9)

In [38]:
epochs = 100
dev_max_acc = 0
train_loader = lstm_glove_train_loader
dev_loader = lstm_glove_dev_loader

for epoch in range(epochs):
    model.train()
    train_acc = 0.0
    dev_acc = 0.0
    batch = 0
    for inputs, target in train_loader:
        print(batch, end = "\r")
        batch += 1
        inputs, target = inputs.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(inputs)
        output = output.view(-1, output.shape[-1])
        target = target.view(-1)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        train_acc += float(accuracy(output, target).item())
    
    model.eval()
    for inputs, target in dev_loader:
        inputs, target = inputs.to(device), target.to(device)
        inputs, target = inputs, target
        output = model(inputs)
        output = output.view(-1, output.shape[-1])
        target = target.view(-1)
        loss = loss_fn(output, target)
        dev_acc += float(accuracy(output, target).item())
    
    train_acc = (train_acc * batch_size)/len(train_loader.dataset)
    dev_acc = (dev_acc * batch_size)/len(dev_loader.dataset)
    
    print('Epoch: {} \tTraining Acc: {:.6f} \tDev Set Acc: {:.6f}'.format(epoch+1, train_acc,dev_acc))
    if dev_acc >= dev_max_acc:
        print('Dev Set acc increased ({:.6f} --> {:.6f}). Saving model...'.format(dev_max_acc, dev_acc))
        torch.save(model.state_dict(), 'model/blstm2.pt')
        dev_max_acc = dev_acc
    scheduler.step(dev_max_acc)

Epoch: 1 	Training Acc: 0.760463 	Dev Set Acc: 0.609282
Dev Set acc increased (0.000000 --> 0.609282). Saving model...
Epoch: 2 	Training Acc: 0.852551 	Dev Set Acc: 0.890627
Dev Set acc increased (0.609282 --> 0.890627). Saving model...
Epoch: 3 	Training Acc: 0.889948 	Dev Set Acc: 0.915076
Dev Set acc increased (0.890627 --> 0.915076). Saving model...
Epoch: 4 	Training Acc: 0.909056 	Dev Set Acc: 0.913437
Epoch: 5 	Training Acc: 0.926034 	Dev Set Acc: 0.940713
Dev Set acc increased (0.915076 --> 0.940713). Saving model...
Epoch: 6 	Training Acc: 0.940557 	Dev Set Acc: 0.943279
Dev Set acc increased (0.940713 --> 0.943279). Saving model...
Epoch: 7 	Training Acc: 0.949746 	Dev Set Acc: 0.947391
Dev Set acc increased (0.943279 --> 0.947391). Saving model...
Epoch: 8 	Training Acc: 0.958728 	Dev Set Acc: 0.949210
Dev Set acc increased (0.947391 --> 0.949210). Saving model...
Epoch: 9 	Training Acc: 0.965428 	Dev Set Acc: 0.957698
Dev Set acc increased (0.949210 --> 0.957698). Saving m

In [39]:
del model, optimizer, loss_fn

In [40]:
model = glove_bLSTM(input_dim, embed_dim, hidden_dim, linear_dim, output_dim, glove_pad, weight_matrix).to(device)
model.load_state_dict(torch.load('blstm2.pt'))
create_op_file(glove_dev_x, y_lstm_dev, model, dev_set, "outputs/dev2.out", glove_pad, False)

3465

In [41]:
del model

In [42]:
model = glove_bLSTM(input_dim, embed_dim, hidden_dim, linear_dim, output_dim, glove_pad, weight_matrix).to(device)
model.load_state_dict(torch.load('blstm2.pt'))
create_op_file(glove_test_x, None, model, test_set, "outputs/test2.out", glove_pad, True)

3683