# Data And Input

## Data Download and Load

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate
drive = None  # Module-level GoogleDrive client; stays None until authenticate() has run.
def authenticate():
    """Authenticate the Colab user and publish a GoogleDrive client in the global ``drive``.

    Takes no arguments and returns nothing; its only result is rebinding the
    module-level ``drive`` name, which ``downloadFiles`` reads afterwards.
    """
    global drive
    # Trigger Colab's user authentication first so application-default
    # credentials can be fetched below.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)

#Download files
def downloadFiles(fileIds):
    """Fetch files from Google Drive into the working directory.

    Args:
        fileIds: iterable of two-item sequences ``[local_file_name, drive_file_id]``.

    Calls ``authenticate()`` first, then downloads each entry through the
    module-level ``drive`` client.
    """
    authenticate()
    for entry in fileIds:
        local_name, drive_file_id = entry[0], entry[1]
        drive.CreateFile({"id": drive_file_id}).GetContentFile(local_name)

In [None]:
# Download each dataset from Google Drive only when it is not already on disk.
# Checking with os.path.isfile avoids leaking open file handles and avoids a
# bare ``except`` that would also hide unrelated errors.
import os

REQUIRED_FILES = [
    ["train.csv", "1pRTJ3aTh1yZV2ZN7Fof2NSt1c137SzYw"],
    ["val.csv", "1khO0wHBC8bBzLVH4G09NhKTHHfAwBIna"],
    ["test.csv", "1-E3dhTaMhG5oRKS9HGF9ZTTzsOKc_D14"],
    ["dota.csv", "1DCsO0uICmtabGiy8MQAYPl0xeDwhFP5S"],
]

# Collect everything that is missing and fetch it in a single call, so the
# user is authenticated once rather than once per file.
missing_files = [entry for entry in REQUIRED_FILES if not os.path.isfile(entry[0])]
if missing_files:
    downloadFiles(missing_files)

In [None]:
import pandas as pd


def read_data(file_name, test=False):
    """Read a CSV split and tokenise it on whitespace.

    Args:
        file_name: path of a CSV file with a ``sents`` column and, unless
            ``test`` is truthy, a ``labels`` column.
        test: when truthy only the tokenised sentences are returned.

    Returns:
        ``(sentences, tags)`` for labelled data, or just ``sentences`` for
        test data. Sentences are lower-cased before splitting; label strings
        are split as they are.
    """
    frame = pd.read_csv(file_name)

    tokens_per_sentence = []
    for raw_sentence in frame['sents'].tolist():
        tokens_per_sentence.append(raw_sentence.lower().split())

    if test:
        return tokens_per_sentence

    tags_per_sentence = []
    for raw_labels in frame['labels'].tolist():
        tags_per_sentence.append(raw_labels.split())

    return tokens_per_sentence, tags_per_sentence

# Load the three splits. The test split is read with test=True, so only its
# tokenised sentences are returned (no label column is read).
train_data, target_y_train = read_data('train.csv')
validation_data, target_y_validation = read_data('val.csv')
test_data = read_data('test.csv', True)

# Sanity check: number of training sentences, then the type, tokens and tags
# of the training example at index 2.
print(len(train_data))
print(type(train_data[2]))
print(train_data[2])
print(target_y_train[2])

26078
<class 'list'>
['wpe', 'wpe']
['O', 'O']


## Data Preprocessing

#### Generate word_to_ix and tag_to_ix

In [None]:
# Special tags that bracket every tag sequence; they take indices 0 and 1.
START_TAG = "<START>"
STOP_TAG = "<STOP>"

# Word vocabulary: every distinct lower-cased token from all three splits,
# numbered in order of first appearance.
word_to_ix = {}
for token_list in train_data + validation_data + test_data:
    for token in token_list:
        # len() is evaluated before the insert, so a new word gets the next free id.
        word_to_ix.setdefault(token.lower(), len(word_to_ix))
word_list = list(word_to_ix)

# Tag vocabulary: the two special tags followed by every tag seen in the
# labelled splits, again numbered in order of first appearance.
tag_to_ix = {START_TAG: 0, STOP_TAG: 1}
for tag_sequence in target_y_train + target_y_validation:
    for tag_name in tag_sequence:
        tag_to_ix.setdefault(tag_name, len(tag_to_ix))

### Convert datasets into index sequences

In [None]:
def to_index(data, to_ix):
    """Map every item of every sequence in ``data`` to its integer id.

    Args:
        data: list of sequences (e.g. tokenised sentences or tag sequences).
        to_ix: dict from item to integer id; a missing item raises KeyError.

    Returns:
        A list with one list of ids per input sequence.
    """
    return [[to_ix[item] for item in sequence] for sequence in data]

# Encode each split as lists of integer ids: sentences through word_to_ix,
# tag sequences through tag_to_ix. The test split has inputs only.
train_input_index =  to_index(train_data,word_to_ix)
train_output_index = to_index(target_y_train,tag_to_ix)
val_input_index = to_index(validation_data,word_to_ix)
val_output_index = to_index(target_y_validation,tag_to_ix)
test_input_index = to_index(test_data,word_to_ix)
# For kaggle leaderboard
# Train and validation examples concatenated into one larger labelled set.
train_val_input_index = train_input_index + val_input_index
train_val_output_index = train_output_index + val_output_index
# test_output_index = to_index(target_y_test,tag_to_ix)

## Input Embedding

### Aspect 1) Syntactic Textual Feature Embedding: PoS tag information, Dependency Path, etc.


In [None]:
import spacy
import numpy as np
import en_core_web_sm

nlp = spacy.load("en_core_web_sm")

# Tag every vocabulary word on its own. When spaCy splits a word into several
# tokens, only the first token's tag is kept.
labels = [nlp(word)[0].tag_ for word in word_list]

# Sort the distinct tags so that each tag gets the same one-hot column on
# every run; iterating a raw set gives an order that can change between
# interpreter sessions. Computing the set once also avoids rebuilding it
# inside the loops.
unique_labels = sorted(set(labels))
syntactic_embedding_dim = len(unique_labels)

# one-hot encoding
syntactic_labels = dict()
for idx, label in enumerate(unique_labels):
    one_hot = [0] * syntactic_embedding_dim
    one_hot[idx] = 1
    syntactic_labels[label] = one_hot

# One row per vocabulary word, in word_list order. Every label is a key of
# syntactic_labels by construction, so no fallback branch is needed.
syntactic_embedding_matrix = np.array([syntactic_labels[label] for label in labels])
syntactic_embedding_matrix.shape

(11243, 38)

### Aspect 2) Semantic Textual Feature Embedding: Word Embeddings (Word2Vec, ELMO, etc.)

In [None]:
import numpy as np
import gensim.downloader as api

# Pre-trained GloVe vectors; api.load returns the keyed vectors themselves.
word_emb_model = api.load("glove-twitter-200")

semantic_embedding_dim = 200

# One row per vocabulary word, in word_list order. Index the keyed vectors
# directly: the ``.wv`` indirection is deprecated on this object, and combined
# with a bare ``except`` it would silently turn every row into zeros once the
# attribute disappears. Only a genuinely unknown word (KeyError) falls back
# to a zero vector.
semantic_embedding_matrix = []
for word in word_list:
    try:
        semantic_embedding_matrix.append(word_emb_model[word])
    except KeyError:
        semantic_embedding_matrix.append(np.zeros(semantic_embedding_dim))
semantic_embedding_matrix = np.array(semantic_embedding_matrix)
semantic_embedding_matrix.shape

  # This is added back by InteractiveShellApp.init_path()


(11243, 200)

### Aspect 3) Domain Feature Embedding: A custom feature embedding for in-game chat word slot filling (tagging).

In [None]:
# Load the Dota table and keep only the rows that have a 'key' value.
dota = pd.read_csv('dota.csv').dropna(subset=['key'])
corpus = dota['key'].tolist()

# Import libraries
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from gensim.models import FastText

# Prepare the training corpus for gensim: a list of token lists, one per
# lower-cased 'key' entry.
sentences = []
for entry in corpus:
    sentences.append(word_tokenize(entry.lower()))

# Train a skip-gram (sg=1) FastText model with 100-dimensional vectors on the
# domain corpus.
domain_model = FastText(sentences=sentences, size=100, window=2, min_count=1, workers=2, sg=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Save and load model
# Write the trained FastText model to "domain.model", then rebind
# domain_model to the copy loaded back from that file.
domain_model.save("domain.model")
from gensim.models import FastText
domain_model = FastText.load("domain.model")

In [None]:
domain_embedding_dim = 100

# One row per vocabulary word, in word_list order. Vectors are read through
# ``domain_model.wv``: indexing the model object itself is deprecated, and
# combined with a bare ``except`` it would silently turn every row into zeros
# once that shortcut is removed. Only a word the model cannot represent
# (KeyError) falls back to a zero vector.
domain_embedding_matrix = []
for word in word_list:
    try:
        domain_embedding_matrix.append(domain_model.wv[word])
    except KeyError:
        domain_embedding_matrix.append(np.zeros(domain_embedding_dim))
domain_embedding_matrix = np.array(domain_embedding_matrix)
domain_embedding_matrix.shape

  


(11243, 100)

## Generate Embedding Matrix

In [None]:
# Variant (disabled): syntactic + semantic + domain features side by side.
# EMBEDDING_DIM = syntactic_embedding_dim + semantic_embedding_dim + domain_embedding_dim
# embedding_matrix = np.concatenate((syntactic_embedding_matrix, semantic_embedding_matrix, domain_embedding_matrix), axis=1)
# embedding_matrix.shape
# Variant (active): semantic and domain vectors concatenated column-wise, so
# each vocabulary word keeps one row and EMBEDDING_DIM is the sum of both widths.
EMBEDDING_DIM = semantic_embedding_dim + domain_embedding_dim
embedding_matrix = np.concatenate((semantic_embedding_matrix, domain_embedding_matrix), axis=1)
embedding_matrix.shape
# Variant (disabled): domain features only.
# EMBEDDING_DIM = domain_embedding_dim
# embedding_matrix = domain_embedding_matrix
# embedding_matrix.shape

(11243, 300)

# Best Model

## Attention

In [None]:
import torch
import torch.nn as nn
# Reference
# https://github.com/ROBINADC/BiGRU-CRF-with-Attention-for-NER
class CosineAttention(nn.Module):
    """Attention whose scores are the cosine similarity between queries and keys.

    Args:
        dropout_rate: dropout probability applied to the attention weights.
        eps: small constant added to the vector norms to avoid division by zero.
        **kwargs: accepted and ignored, so all attention classes share one
            constructor signature.
    """

    def __init__(self, dropout_rate=0.0, eps=1e-10, **kwargs):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.eps = eps

    def forward(self, q, k, v, attn_mask=None):
        """Attend over ``v`` using cosine similarity of ``q`` and ``k``.

        Args:
            q, k, v: 3-D tensors with the batch on the first axis (required
                by ``torch.bmm``); ``q`` and ``k`` share their last dimension.
            attn_mask: optional boolean tensor broadcastable to the score
                matrix; True entries are excluded from the softmax.

        Returns:
            ``(output, attention)``: the weighted sum of ``v`` and the
            attention weights.
        """
        # L2-normalise along the feature axis so the dot product is a cosine.
        q_norm = q / (q.norm(p=2, dim=-1, keepdim=True) + self.eps)
        k_norm = k / (k.norm(p=2, dim=-1, keepdim=True) + self.eps)
        attention = torch.bmm(q_norm, k_norm.permute(0, 2, 1))
        if attn_mask is not None:
            attention = attention.masked_fill(attn_mask, float("-inf"))
        # torch.softmax / float("-inf") keep this class self-contained: it no
        # longer needs ``F`` or ``np`` to have been imported by another cell.
        attention = torch.softmax(attention, dim=-1)
        attention = self.dropout(attention)
        output = attention.bmm(v)
        return output, attention

In [None]:
# Reference
# https://github.com/ROBINADC/BiGRU-CRF-with-Attention-for-NER
class DotProductAttention(nn.Module):
    """Attention whose scores are the plain dot product of queries and keys.

    Args:
        dropout_rate: dropout probability applied to the attention weights.
        **kwargs: accepted and ignored, so all attention classes share one
            constructor signature.
    """

    def __init__(self, dropout_rate=0.0, **kwargs):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, q, k, v, attn_mask=None):
        """Attend over ``v`` using the dot product of ``q`` and ``k``.

        Args:
            q, k, v: 3-D tensors with the batch on the first axis (required
                by ``torch.bmm``); ``q`` and ``k`` share their last dimension.
            attn_mask: optional boolean tensor broadcastable to the score
                matrix; True entries are excluded from the softmax.

        Returns:
            ``(output, attention)``: the weighted sum of ``v`` and the
            attention weights.
        """
        attention = torch.bmm(q, k.permute(0, 2, 1))
        if attn_mask is not None:
            attention = attention.masked_fill(attn_mask, float("-inf"))
        # torch.softmax / float("-inf") keep this class self-contained: it no
        # longer needs ``F`` or ``np`` to have been imported by another cell.
        attention = torch.softmax(attention, dim=-1)
        attention = self.dropout(attention)
        output = attention.bmm(v)
        return output, attention

In [None]:
# Reference
# https://github.com/ROBINADC/BiGRU-CRF-with-Attention-for-NER
class ScaledDotProductAttention(nn.Module):
    """Dot-product attention with scores scaled by ``1 / sqrt(key dimension)``.

    Args:
        dropout_rate: dropout probability applied to the attention weights.
        **kwargs: accepted and ignored, so all attention classes share one
            constructor signature.
    """

    def __init__(self, dropout_rate=0.0, **kwargs):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, q, k, v, attn_mask=None):
        """Attend over ``v`` using the scaled dot product of ``q`` and ``k``.

        Args:
            q, k, v: 3-D tensors with the batch on the first axis (required
                by ``torch.bmm``); ``q`` and ``k`` share their last dimension.
            attn_mask: optional boolean tensor broadcastable to the score
                matrix; True entries are excluded from the softmax.

        Returns:
            ``(output, attention)``: the weighted sum of ``v`` and the
            attention weights.
        """
        attention = torch.bmm(q, k.permute(0, 2, 1))
        # Scale by the inverse square root of the key feature size.
        attention = attention * (k.size(-1) ** -0.5)
        if attn_mask is not None:
            attention = attention.masked_fill(attn_mask, float("-inf"))
        # torch.softmax / float("-inf") keep this class self-contained: it no
        # longer needs ``F`` or ``np`` to have been imported by another cell.
        attention = torch.softmax(attention, dim=-1)
        attention = self.dropout(attention)
        output = attention.bmm(v)
        return output, attention

## Slot Filling/Tagging model

In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# from flair.data import Sentence

# Seed PyTorch's random number generator so random draws made after this
# point (e.g. parameter initialisation) repeat from run to run.
torch.manual_seed(1)

def argmax(vec):
    """Return, as a Python int, the column index of the largest entry of a one-row 2-D tensor."""
    # torch.max over dim 1 yields (values, indices); only the index is needed.
    best_index = torch.max(vec, 1)[1]
    return best_index.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """Numerically stable ``log(sum(exp(vec)))`` for a one-row 2-D tensor.

    Args:
        vec: tensor of shape ``(1, K)``.

    Returns:
        A 0-dimensional tensor holding the log-sum-exp of the row.

    Uses ``torch.logsumexp`` instead of a hand-rolled max-shift, which removes
    the ``.item()`` call (a host synchronisation) previously made on every
    invocation.
    """
    return torch.logsumexp(vec, dim=1).squeeze(0)

class BiGRU_CRF(nn.Module):
    """Bidirectional-GRU sequence tagger with optional self-attention and a CRF layer.

    The model handles one sentence at a time (batch size 1): a 1-D tensor of
    word indices goes in, the GRU/attention stack produces per-word tag scores
    ("emissions"), and a linear-chain CRF scores or decodes tag paths.

    Module-level names read at run time: ``embedding_matrix`` (initial
    embedding weights), ``START_TAG`` / ``STOP_TAG`` (keys of ``tag_to_ix``),
    the helpers ``argmax`` / ``log_sum_exp`` and the attention classes
    ``CosineAttention``, ``DotProductAttention``, ``ScaledDotProductAttention``.

    Args:
        vocab_size: number of rows of the embedding table.
        tag_to_ix: dict mapping tag name to index; must contain START_TAG and
            STOP_TAG.
        embedding_dim: width of each word embedding.
        hidden_dim: total GRU output width (``hidden_dim // 2`` per direction).
        crf: if True ``forward`` decodes with Viterbi, otherwise it takes the
            per-word argmax of the emissions.
        attention_method: None, 'cosine', 'dot' or 'scaled_dot'.
        n_layer: number of stacked GRU layers (used when ``attention_position``
            is false).
        attention_position: if true, attention sits between two single-layer
            GRUs; otherwise it follows the GRU stack.

    Raises:
        ValueError: for an unknown ``attention_method``, or when
            ``attention_position`` is requested without an attention method.
            Both configurations previously failed later with AttributeError.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, crf=True, attention_method=None, n_layer = 1, attention_position=False):
        super(BiGRU_CRF, self).__init__()
        if attention_method not in (None, 'cosine', 'dot', 'scaled_dot'):
            raise ValueError(
                "attention_method must be None, 'cosine', 'dot' or 'scaled_dot', "
                "got {!r}".format(attention_method))
        if attention_position and attention_method is None:
            raise ValueError(
                "attention_position=True requires an attention_method")

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.crf = crf
        self.n_layer = n_layer
        self.attention_position = attention_position

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        # Use the pre-computed embedding matrix as the initial embedding weights.
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))

        if self.attention_position:
            # Attention is sandwiched between two single-layer BiGRUs.
            self.gru_1 = nn.GRU(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
            self.gru_2 = nn.GRU(hidden_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        else:
            self.gru = nn.GRU(embedding_dim, hidden_dim // 2,
                                num_layers=self.n_layer, bidirectional=True)

        self.attention_method = attention_method

        if self.attention_method is not None:
            self.linear_q = nn.Linear(self.hidden_dim, self.hidden_dim)
            self.linear_k = nn.Linear(self.hidden_dim, self.hidden_dim)
            self.linear_v = nn.Linear(self.hidden_dim, self.hidden_dim)

            if self.attention_method == 'cosine':
                self.attention = CosineAttention()
            elif self.attention_method == 'dot':
                self.attention = DotProductAttention()
            elif self.attention_method == 'scaled_dot':
                self.attention = ScaledDotProductAttention()

        # Maps the output of the GRU into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random initial hidden state for one sentence.

        Shape is ``(num_layers * 2, 1, hidden_dim // 2)``: two directions per
        layer, batch size 1. ``gru_1`` / ``gru_2`` are always single-layer,
        whereas ``gru`` has ``n_layer`` layers. The tensor is placed on the
        same device as the model parameters.

        The attention_position branch previously built the tensor but did not
        return it, so those GRUs silently started from zeros.
        """
        num_layers = 1 if self.attention_position else self.n_layer
        hidden = torch.randn(num_layers * 2, 1, self.hidden_dim // 2)
        return hidden.to(self.transitions.device)

    def _forward_alg(self, feats):
        """Forward algorithm: log of the CRF partition function.

        Args:
            feats: emission scores of shape ``(L, tagset_size)``.

        Returns:
            A 0-dimensional tensor with the log-sum-exp over all tag paths.
        """
        init_alphas = torch.full((1, self.tagset_size), -10000., device=feats.device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_gru_features(self, sentence):
        """Compute emission scores for one sentence.

        Args:
            sentence: 1-D LongTensor of word indices (length L).

        Returns:
            Tensor of shape ``(L, tagset_size)``.

        Tensors flow through the GRUs as ``(L, 1, features)`` (sequence first,
        batch size 1) and through the attention layer as ``(1, L, features)``.
        """
        self.hidden = self.init_hidden()

        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)

        if self.attention_method is None:
            gru_out, self.hidden = self.gru(embeds, self.hidden)
            gru_out = gru_out.view(len(sentence), self.hidden_dim)
            return self.hidden2tag(gru_out)

        first_gru = self.gru_1 if self.attention_position else self.gru
        gru_out, self.hidden = first_gru(embeds, self.hidden)
        # Attention expects the batch on the first axis: (1, L, hidden_dim).
        gru_out = gru_out.permute(1, 0, 2)
        q = self.linear_q(gru_out)
        k = self.linear_k(gru_out)
        v = self.linear_v(gru_out)
        output, attention = self.attention(q, k, v)

        if self.attention_position:
            # Restore the (L, 1, hidden_dim) layout before the second GRU.
            # Without this, gru_2 received (1, L, hidden_dim) and treated
            # every word as a separate length-1 sequence, so it never ran
            # across the sentence.
            output = output.permute(1, 0, 2).contiguous()
            self.hidden = self.init_hidden()
            gru_out, self.hidden = self.gru_2(output, self.hidden)
            gru_out = gru_out.view(len(sentence), self.hidden_dim)
            return self.hidden2tag(gru_out)

        output = output.view(len(sentence), self.hidden_dim)
        return self.hidden2tag(output)

    def _score_sentence(self, feats, tags):
        """Score a given tag sequence: sum of its transition and emission scores.

        Args:
            feats: emission scores of shape ``(L, tagset_size)``.
            tags: 1-D LongTensor with the L gold tag indices.

        Returns:
            Tensor of shape ``(1,)``.
        """
        score = torch.zeros(1, device=feats.device)
        start = torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long,
                             device=tags.device)
        tags = torch.cat([start, tags])

        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag path with the Viterbi algorithm.

        Args:
            feats: emission scores of shape ``(L, tagset_size)``.

        Returns:
            ``(path_score, best_path)`` where ``best_path`` is a list of L tag
            indices.
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000., device=feats.device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """CRF training loss: log-partition minus the gold path score.

        The CRF loss is used regardless of the ``crf`` flag, which only
        affects decoding in ``forward``.
        """
        feats = self._get_gru_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Predict tags for one sentence.

        Returns:
            ``(score, tag_seq)``: with ``crf=True`` the Viterbi path score and
            path; otherwise ``None`` and the per-word argmax tag indices.
        """
        # Get the emission scores from the BiGRU
        gru_feats = self._get_gru_features(sentence)

        if self.crf:
            # Find the best path, given the features.
            score, tag_seq = self._viterbi_decode(gru_feats)
        else:
            score = None
            tag_seq = torch.max(F.softmax(gru_feats, dim=1), dim=1).indices.tolist()

        return score, tag_seq

#### Function for F1 score

In [None]:
# def cal_acc(model, input_index, output_index):
#     ground_truth = []
#     predicted = []
#     for i,idxs in enumerate(input_index):
#         ground_truth += output_index[i]
#         score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
#         predicted += pred
#     accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
#     return predicted, ground_truth, accuracy

from sklearn.metrics import f1_score
def cal_f1(model, input_index, output_index, labels=(3, 6, 8, 7, 4, 2)):
    """Run the model over a dataset and compute token-level F1 scores.

    Args:
        model: callable returning ``(score, predicted_tag_indices)`` for a
            1-D LongTensor of word indices.
        input_index: list of sentences, each a list of word indices.
        output_index: list of gold tag-index sequences aligned with
            ``input_index``.
        labels: tag indices whose individual F1 scores are reported, in this
            order. The default keeps the original hard-coded ids, annotated
            there as T,S,C,D,P,O (this depends on the order in which
            ``tag_to_ix`` was built; verify against ``tag_to_ix``).

    Returns:
        ``(predicted, ground_truth, each_score, mean_score)``: the flattened
        predictions and gold tags, the per-label F1 array and the
        micro-averaged F1 over all tokens.
    """
    ground_truth = []
    predicted = []
    # Evaluation needs no gradients; skipping graph construction saves time
    # and memory without changing the predictions.
    with torch.no_grad():
        for i, idxs in enumerate(input_index):
            ground_truth += output_index[i]
            score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
            predicted += pred
    each_score = f1_score(ground_truth, predicted, average=None, labels=list(labels))
    mean_score = f1_score(ground_truth, predicted, average='micro')
    return predicted, ground_truth, each_score, mean_score

#### Initialize model

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Passed to the model as hidden_dim; BiGRU_CRF gives each GRU direction
# hidden_dim // 2 units.
HIDDEN_DIM = 100

# model = BiGRU_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, crf=True, attention_method=None, n_layer = 1, attention_position=True).to(device)

# Active configuration: single-layer BiGRU with CRF decoding and no attention.
model = BiGRU_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,
                  crf = True, attention_method = None,
                  n_layer = 1, attention_position = False).to(device)

# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
# Plain SGD over all model parameters with L2 weight decay.
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
# optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4)

#### Train the model

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime
import pickle  # required by pickle.dump below

# bptt = 100
# Per-epoch metrics, keyed by "<epoch><metric name>".
dictionary_data = {}

for epoch in range(2):
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    # Call the cal_f1 functions you implemented as required
    _, _, train_each_f1, train_mean_f1 = cal_f1(model,train_input_index,train_output_index)
    _, _, val_each_f1, val_mean_f1 = cal_f1(model,val_input_index,val_output_index)

    # Validation loss is only reported, never back-propagated, so compute it
    # without building autograd graphs.
    val_loss = 0
    with torch.no_grad():
        for i, idxs in enumerate(val_input_index):
            tags_index = val_output_index[i]
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)
            loss = model.neg_log_likelihood(sentence_in, targets)
            val_loss+=loss.item()
    time2 = datetime.datetime.now()
    dictionary_data[str(epoch)+'tloss'] = train_loss
    dictionary_data[str(epoch)+'vloss'] = val_loss
    dictionary_data[str(epoch)+'tef1'] = train_each_f1
    dictionary_data[str(epoch)+'vef1'] = val_each_f1
    dictionary_data[str(epoch)+'tmf1'] = train_mean_f1
    dictionary_data[str(epoch)+'vmf1'] = val_mean_f1

    print("Epoch:{:d}, Training loss: {:.2f}, train each f1: {}, train mean f1: {:.4f}, val loss: {:.2f}, val each f1: {}, val mean f1: {:.4f}, time: {:.2f}s".format(epoch+1, train_loss, train_each_f1.round(4), train_mean_f1, val_loss, val_each_f1.round(4), val_mean_f1, (time2-time1).total_seconds()))

    # The log below is the sample output for this section
    # Please make sure you keep your own running log for submission

    # Persist the metrics collected so far after every epoch. Despite its
    # name, this file holds the metrics dictionary, not model weights.
    filename = 'best_model.pkl'
    with open(filename, "wb") as a_file:
        pickle.dump(dictionary_data, a_file)


# The log below is the sample output for this section
# Please make sure you keep your own running log for submission

Epoch:1, Training loss: 9702.79, train each f1: [0.9758 0.9939 0.986  0.9329 0.999  0.9946], train mean f1: 0.9937, val loss: 1117.81, val each f1: [0.9737 0.9924 0.9733 0.9256 0.9989 0.993 ], val mean f1: 0.9918, time: 786.66s
Epoch:2, Training loss: 2229.80, train each f1: [0.9873 0.9984 0.9949 0.9728 0.9997 0.9978], train mean f1: 0.9974, val loss: 830.07, val each f1: [0.9758 0.9949 0.9832 0.9543 0.9991 0.9949], val mean f1: 0.9941, time: 765.04s


In [None]:
import os
import torch
# Serialise the whole model object to the file 'model', load it back into
# `model`, and switch it to evaluation mode.
# NOTE(review): torch.load is called without extra arguments; recent PyTorch
# releases may refuse to unpickle a full model object by default, so confirm
# against the installed version.
torch.save(model,'model')
model = torch.load('model')
model.eval()

BiGRU_CRF(
  (word_embeds): Embedding(11243, 300)
  (gru): GRU(300, 50, bidirectional=True)
  (hidden2tag): Linear(in_features=100, out_features=9, bias=True)
)

## Test

In [None]:
def test(model, input_index):
    """Predict tags for every sentence and return them as one flat list.

    Args:
        model: callable returning ``(score, predicted_tag_indices)`` for a
            1-D LongTensor of word indices.
        input_index: list of sentences, each a list of word indices.

    Returns:
        List of predicted tag indices, concatenated in sentence order.
    """
    predicted = []
    # Inference needs no gradients; skipping graph construction saves time
    # and memory without changing the predictions.
    with torch.no_grad():
        for idxs in input_index:
            score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
            predicted += pred
    return predicted

# Flat list of predicted tag indices for the whole test split.
y_pred = test(model,test_input_index)

In [None]:
def decode_output(output_list):
    """Translate tag indices back into tag strings using the module-level ``tag_to_ix``.

    Args:
        output_list: iterable of tag indices.

    Returns:
        List with the tag string for each index, in the same order.
    """
    # Invert the tag -> index mapping.
    ix_to_tag = dict((index, tag) for tag, index in tag_to_ix.items())
    decoded = []
    for output in output_list:
        decoded.append(ix_to_tag[output])
    return decoded

# Predicted tag indices converted back to their tag strings.
y_pred_decode = decode_output(y_pred)

In [None]:
# Flatten the decoded predictions into one token per entry; each decoded
# string is split on whitespace before being added.
results = [token for decoded_tag in y_pred_decode for token in decoded_tag.split()]

In [None]:
# Prepend the header cell exactly once, so re-running this cell does not
# insert a second 'Predicted' entry.
if not results or results[0] != 'Predicted':
    results.insert(0, 'Predicted')
# One id per prediction, starting at 0, preceded by the 'Id' header. The
# count is derived from the predictions instead of a hard-coded 2326, so the
# two columns always have the same length.
id = ['Id'] + list(range(len(results) - 1))

In [None]:
import csv
# Write the submission as (id, prediction) rows; the first row is the header.
# The context manager closes the file even if a row fails to write, and
# newline="" leaves line endings to the csv module (otherwise blank rows can
# appear on some platforms).
with open("sample.csv", "w", newline="") as file:
    writer = csv.writer(file)

    for w in range(len(results)):
        writer.writerow([id[w], results[w]])

# Testing and Evaluation

## Performance Comparison

### Baseline model

In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# from flair.data import Sentence

# Seed PyTorch's random number generator so random draws made after this
# point (e.g. parameter initialisation) repeat from run to run.
torch.manual_seed(1)

def argmax(vec):
    """Return, as a Python int, the column index of the largest entry of a one-row 2-D tensor."""
    # torch.max over dim 1 yields (values, indices); only the index is needed.
    best_index = torch.max(vec, 1)[1]
    return best_index.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """Numerically stable ``log(sum(exp(vec)))`` for a one-row 2-D tensor.

    Args:
        vec: tensor of shape ``(1, K)``.

    Returns:
        A 0-dimensional tensor holding the log-sum-exp of the row.

    Uses ``torch.logsumexp`` instead of a hand-rolled max-shift, which removes
    the ``.item()`` call (a host synchronisation) previously made on every
    invocation.
    """
    return torch.logsumexp(vec, dim=1).squeeze(0)

class BiLSTM_CRF(nn.Module):
    """BiLSTM sequence tagger with an optional attention stage and a CRF output layer.

    Sentences are processed one at a time (batch size 1).

    The class relies on notebook-level names defined outside of it:
    ``embedding_matrix`` (NumPy array copied into the embedding layer),
    ``START_TAG`` / ``STOP_TAG`` (keys of ``tag_to_ix``), the helpers ``argmax``
    and ``log_sum_exp``, and - when attention is enabled - the classes
    ``CosineAttention``, ``DotProductAttention`` and ``ScaledDotProductAttention``.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, crf=True, attention_method=None, n_layer = 1, attention_position=False):
        """Build the embedding, recurrent, attention and CRF parameters.

        Args:
            vocab_size: number of rows of the embedding table.
            tag_to_ix: mapping tag -> index; must contain START_TAG and STOP_TAG.
            embedding_dim: width of one word embedding.
            hidden_dim: width of the concatenated BiLSTM output; each direction
                gets ``hidden_dim // 2`` units, so the value should be even.
            crf: if True ``forward`` decodes with Viterbi, otherwise with a
                per-token argmax over the emission scores.
            attention_method: None, 'cosine', 'dot' or 'scaled_dot'.
            n_layer: number of stacked LSTM layers (only used when
                ``attention_position`` is falsy).
            attention_position: if truthy, use two single-layer BiLSTMs with the
                attention stage placed between them.

        Raises:
            ValueError: for an unknown ``attention_method`` or when
                ``attention_position`` is set without an attention method
                (that combination has no usable forward path).
        """
        super(BiLSTM_CRF, self).__init__()
        if attention_position and attention_method is None:
            # Only lstm_1 / lstm_2 would exist, but the attention-free forward
            # path needs self.lstm; fail here with a clear message instead.
            raise ValueError("attention_position=True requires an attention_method")

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.crf = crf
        self.n_layer = n_layer
        self.attention_position = attention_position

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        # Use the notebook-level embedding matrix as the initial weights of nn.Embedding.
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))

        if self.attention_position:
            # Two single-layer BiLSTMs so the attention stage can sit between them.
            self.lstm_1 = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
            self.lstm_2 = nn.LSTM(hidden_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        else:
            self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                                num_layers=self.n_layer, bidirectional=True)

        self.attention_method = attention_method

        if self.attention_method is not None:
            # Learned projections producing the query / key / value inputs.
            self.linear_q = nn.Linear(self.hidden_dim, self.hidden_dim)
            self.linear_k = nn.Linear(self.hidden_dim, self.hidden_dim)
            self.linear_v = nn.Linear(self.hidden_dim, self.hidden_dim)

            if self.attention_method == 'cosine':
                self.attention = CosineAttention()
            elif self.attention_method == 'dot':
                self.attention = DotProductAttention()
            elif self.attention_method == 'scaled_dot':
                self.attention = ScaledDotProductAttention()
            else:
                raise ValueError(
                    "unknown attention_method: {!r}".format(self.attention_method))

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self, num_layers=1):
        """Draw a fresh random (h_0, c_0) pair for a bidirectional LSTM, batch size 1.

        Args:
            num_layers: number of stacked layers of the LSTM the state is for;
                the leading dimension is ``2 * num_layers`` (two directions).

        Returns:
            Tuple of two tensors of shape ``(2 * num_layers, 1, hidden_dim // 2)``
            created on the device the embedding weights currently live on, so
            the state stays valid after the model is moved with ``.to(...)``.
        """
        state_shape = (2 * num_layers, 1, self.hidden_dim // 2)
        target_device = self.word_embeds.weight.device
        return (torch.randn(state_shape, device=target_device),
                torch.randn(state_shape, device=target_device))

    def _forward_alg(self, feats):
        """Run the CRF forward algorithm and return the log partition function.

        Args:
            feats: emission scores, one row per token, one column per tag.

        Returns:
            Log-sum-exp over the scores of all tag paths (including the final
            transition to STOP_TAG).
        """
        init_alphas = torch.full((1, self.tagset_size), -10000., device=feats.device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """Compute per-token emission scores for one sentence.

        Args:
            sentence: 1-D tensor of word indices.

        Returns:
            Tensor with one row per token and ``tagset_size`` columns.

        Shape legend used in the comments below: L = sequence length,
        N = batch size (always 1 here), D = 2 (bidirectional),
        H_in = input size, H_out = per-direction hidden size.
        """
        # The stacked self.lstm needs 2 * n_layer state slices; lstm_1 / lstm_2
        # are single-layer, so they need 2.
        num_layers = 1 if self.attention_position else self.n_layer
        self.hidden = self.init_hidden(num_layers)

        # embeds: (L, N, H_in)
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)

        if self.attention_method is None:
            # lstm_out: (L, N, D*H_out); hidden tensors: (D*num_layers, N, H_out)
            lstm_out, self.hidden = self.lstm(embeds, self.hidden)
            lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
            lstm_feats = self.hidden2tag(lstm_out)
        elif self.attention_position:
            # BiLSTM -> attention -> BiLSTM
            lstm_out, self.hidden = self.lstm_1(embeds, self.hidden)
            lstm_out = lstm_out.permute(1,0,2)
            q = self.linear_q(lstm_out)
            k = self.linear_k(lstm_out)
            v = self.linear_v(lstm_out)
            output, attention = self.attention(q,k,v)
            # Fresh state for the second (single-layer) LSTM.
            self.hidden = self.init_hidden()
            # NOTE(review): the attention input was permuted to (N, L, H) above and
            # lstm_2 is not batch_first; if the attention module keeps that layout,
            # `output` may need permuting back to (L, N, H) here - confirm against
            # the attention classes.
            lstm_out, self.hidden = self.lstm_2(output, self.hidden)
            lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
            lstm_feats = self.hidden2tag(lstm_out)
        else:
            # BiLSTM -> attention
            lstm_out, self.hidden = self.lstm(embeds, self.hidden)
            lstm_out = lstm_out.permute(1,0,2)
            q = self.linear_q(lstm_out)
            k = self.linear_k(lstm_out)
            v = self.linear_v(lstm_out)
            output, attention = self.attention(q,k,v)
            output = output.view(len(sentence), self.hidden_dim)
            lstm_feats = self.hidden2tag(output)

        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Score one given tag sequence: transitions plus emissions, ending with STOP_TAG.

        Args:
            feats: emission scores, one row per token.
            tags: 1-D long tensor with the gold tag index of every token.

        Returns:
            Tensor of shape (1,) holding the path score.
        """
        score = torch.zeros(1, device=feats.device)
        # Prefix the sequence with START_TAG so tags[i] is the predecessor of tags[i + 1].
        start = torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long, device=tags.device)
        tags = torch.cat([start, tags])

        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag path with the Viterbi algorithm.

        Args:
            feats: emission scores, one row per token.

        Returns:
            ``(path_score, best_path)`` where ``best_path`` is a list of tag
            indices, one per token (the START_TAG entry is removed).
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000., device=feats.device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Training loss: log partition function minus the score of the gold path.

        Args:
            sentence: 1-D tensor of word indices.
            tags: 1-D long tensor of gold tag indices, same length as ``sentence``.
        """
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Predict the tag sequence for one sentence.

        Returns:
            ``(score, tag_seq)``: with ``crf`` enabled the Viterbi path score and
            path; otherwise ``score`` is None and ``tag_seq`` is the per-token
            argmax of the emission scores.
        """
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        if self.crf:
            # Find the best path, given the features.
            score, tag_seq = self._viterbi_decode(lstm_feats)
        else:
            score = None
            tag_seq = torch.max(F.softmax(lstm_feats, dim=1), dim=1).indices.tolist()

        return score, tag_seq

In [None]:
# Baseline for the comparison table: BiLSTM encoder with a CRF head, no attention
# and a single recurrent layer. BiGRU_CRF accepts the same argument list if the
# GRU variant should be built here instead.
HIDDEN_DIM = 100
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = BiLSTM_CRF(
    len(word_to_ix),
    tag_to_ix,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    crf=True,
    attention_method=None,
    n_layer=1,
    attention_position=False,
)
model = model.to(device)

# Plain SGD with weight decay; optim.AdamW with the same lr / weight_decay is the
# commented-out alternative.
optimizer = optim.SGD(model.parameters(), weight_decay=1e-4, lr=0.01)
# optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime
import pickle  # needed for the metrics dump below; imported here so the cell runs on a fresh kernel

# Per-epoch metrics, keyed '<epoch index><metric>' (e.g. '1vmf1'); read back by the
# comparison-table cells.
dictionary_data = {}

for epoch in range(2):
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()


    model.eval()
    # Call the cal_f1 functions you implemented as required
    _, _, train_each_f1, train_mean_f1 = cal_f1(model,train_input_index,train_output_index)
    _, _, val_each_f1, val_mean_f1 = cal_f1(model,val_input_index,val_output_index)


    val_loss = 0
    # The validation loss is only reported, never back-propagated, so skip
    # building the autograd graph for it.
    with torch.no_grad():
        for i, idxs in enumerate(val_input_index):
            tags_index = val_output_index[i]
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)
            loss = model.neg_log_likelihood(sentence_in, targets)
            val_loss+=loss.item()
    time2 = datetime.datetime.now()
    dictionary_data[str(epoch)+'tloss'] = train_loss
    dictionary_data[str(epoch)+'vloss'] = val_loss
    dictionary_data[str(epoch)+'tef1'] = train_each_f1
    dictionary_data[str(epoch)+'vef1'] = val_each_f1
    dictionary_data[str(epoch)+'tmf1'] = train_mean_f1
    dictionary_data[str(epoch)+'vmf1'] = val_mean_f1

    print("Epoch:{:d}, Training loss: {:.2f}, train each f1: {}, train mean f1: {:.4f}, val loss: {:.2f}, val each f1: {}, val mean f1: {:.4f}, time: {:.2f}s".format(epoch+1, train_loss, train_each_f1.round(4), train_mean_f1, val_loss, val_each_f1.round(4), val_mean_f1, (time2-time1).total_seconds()))

    # The log below is the sample output for this section
    # Please make sure you keep your own running log for submission

    # Rewritten after every epoch, so the file always holds the metrics of all
    # epochs finished so far; `with` closes the handle even if the dump fails.
    filename = 'baseline.pkl'
    with open(filename, "wb") as a_file:
        pickle.dump(dictionary_data, a_file)


# The log below is the sample output for this section
# Please make sure you keep your own running log for submission

Epoch:1, Training loss: 12183.13, train each f1: [0.9722 0.9912 0.9851 0.919  0.9988 0.9937], train mean f1: 0.9925, val loss: 1203.29, val each f1: [0.9688 0.9887 0.9727 0.9175 0.9987 0.9919], val mean f1: 0.9905, time: 875.52s
Epoch:2, Training loss: 2563.02, train each f1: [0.9852 0.9972 0.9944 0.9588 0.9995 0.9972], train mean f1: 0.9967, val loss: 836.34, val each f1: [0.9764 0.9941 0.9841 0.9542 0.9991 0.995 ], val mean f1: 0.9941, time: 832.26s


In [None]:
!pip install tabulate
from tabulate import tabulate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Second-epoch validation scores (keys prefixed with '1') of the baseline and the
# best model, one table row per saved metrics file.
dim_list = []

for label, path in (("baseline", "baseline.pkl"), ("best_model", "best_model.pkl")):
    with open(path, "rb") as f:
        output = pickle.load(f)
    vef1 = output['1vef1']
    vmf1 = output['1vmf1']
    # Row layout: label, the six per-tag F1 values, then the mean F1.
    dim_list.append([label] + [vef1[k] for k in range(6)] + [vmf1])

# Column headers
col_names = ["Model", "T-F1(T)", "T-F1(S)", "T-F1(C)", "T-F1(D)", "T-F1(P)", "T-F1(O)", "T-F1"]

# Render the table
print(tabulate(dim_list, headers=col_names))

Model         T-F1(T)    T-F1(S)    T-F1(C)    T-F1(D)    T-F1(P)    T-F1(O)      T-F1
----------  ---------  ---------  ---------  ---------  ---------  ---------  --------
baseline     0.976438   0.994115   0.984088   0.954198   0.999111   0.994955  0.994094
best_model   0.975795   0.99487    0.983237   0.954315   0.999111   0.9949    0.994064


## Ablation Study - different input embedding model

In [None]:
import datetime
import pickle
# One entry per run. The ['semantic', 'domain'] combination is the best model and
# was already trained, so it is not repeated here.
configs = [
    {
        "embedding_method": ['semantic'],
        "attention_method": None,
        "crf": True,
        "n_layer": 1,
        "attention_position": False,
    },
    {
        "embedding_method": ['domain'],
        "attention_method": None,
        "crf": True,
        "n_layer": 1,
        "attention_position": False,
    },
    {
        "embedding_method": ['semantic', 'domain', 'syntactic'],
        "attention_method": None,
        "crf": True,
        "n_layer": 1,
        "attention_position": False,
    },
]

# `count` numbers the runs from 1 and becomes part of the output file name.
for count, config in enumerate(configs, start=1):
    dictionary_data = {}
    print(config)

    ### input embedding
    # Collect the requested embedding tables in the configured order, then join
    # them column-wise. The model constructor reads the notebook-level
    # `embedding_matrix`, so it has to be rebound before the model is built.
    embedding_parts = []
    for embedding_method in config['embedding_method']:
        if embedding_method == 'semantic':
            embedding_parts.append(semantic_embedding_matrix)
        elif embedding_method == 'domain':
            embedding_parts.append(domain_embedding_matrix)
        elif embedding_method == 'syntactic':
            embedding_parts.append(syntactic_embedding_matrix)
        else:
            # Fail loudly instead of silently training without the requested table.
            raise ValueError("unknown embedding_method: {!r}".format(embedding_method))
    if len(embedding_parts) == 1:
        embedding_matrix = embedding_parts[0]
    else:
        embedding_matrix = np.concatenate(embedding_parts, axis=1)

    EMBEDDING_DIM = embedding_matrix.shape[1]
    HIDDEN_DIM = 100
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = BiGRU_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,
                      crf = config['crf'], attention_method = config['attention_method'],
                      n_layer = config['n_layer'], attention_position = config['attention_position']).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

    print('Starting Training')

    for epoch in range(2):
        time1 = datetime.datetime.now()
        train_loss = 0

        model.train()
        for i, idxs in enumerate(train_input_index):
            tags_index = train_output_index[i]

            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is,
            # turn them into Tensors of word indices.
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)

            # Step 3. Run our forward pass.
            loss = model.neg_log_likelihood(sentence_in, targets)

            # Step 4. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            loss.backward()
            optimizer.step()

            train_loss+=loss.item()

        model.eval()
        # Call the cal_f1 functions you implemented as required
        _, _, train_each_f1, train_mean_f1 = cal_f1(model,train_input_index,train_output_index)
        _, _, val_each_f1, val_mean_f1 = cal_f1(model,val_input_index,val_output_index)


        val_loss = 0
        # Reporting only: no gradients are needed for the validation loss.
        with torch.no_grad():
            for i, idxs in enumerate(val_input_index):
                tags_index = val_output_index[i]
                sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
                targets = torch.tensor(tags_index, dtype=torch.long).to(device)
                loss = model.neg_log_likelihood(sentence_in, targets)
                val_loss+=loss.item()
        time2 = datetime.datetime.now()
        dictionary_data[str(epoch)+'tloss'] = train_loss
        dictionary_data[str(epoch)+'vloss'] = val_loss
        dictionary_data[str(epoch)+'tef1'] = train_each_f1
        dictionary_data[str(epoch)+'vef1'] = val_each_f1
        dictionary_data[str(epoch)+'tmf1'] = train_mean_f1
        dictionary_data[str(epoch)+'vmf1'] = val_mean_f1

        print("Epoch:{:d}, Training loss: {:.2f}, train each f1: {}, train mean f1: {:.4f}, val loss: {:.2f}, val each f1: {}, val mean f1: {:.4f}, time: {:.2f}s".format(epoch+1, train_loss, train_each_f1.round(4), train_mean_f1, val_loss, val_each_f1.round(4), val_mean_f1, (time2-time1).total_seconds()))
    # The log below is the sample output for this section
    # Please make sure you keep your own running log for submission

    # One metrics file per configuration, written once both epochs are done;
    # `with` closes the handle even if the dump fails.
    filename = 'embedding' + str(count) + '.pkl'
    with open(filename, "wb") as a_file:
        pickle.dump(dictionary_data, a_file)



{'embedding_method': ['semantic'], 'attention_method': None, 'crf': True, 'n_layer': 1, 'attention_position': False}
Starting Training
Epoch:1, Training loss: 16337.94, train each f1: [0.9651 0.9842 0.9647 0.832  0.998  0.9887], train mean f1: 0.9866, val loss: 2076.62, val each f1: [0.9642 0.9791 0.9467 0.8345 0.997  0.9865], val mean f1: 0.9839, time: 705.06s
Epoch:2, Training loss: 4225.64, train each f1: [0.9826 0.995  0.9904 0.9275 0.999  0.9956], train mean f1: 0.9948, val loss: 1410.38, val each f1: [0.9732 0.9899 0.9724 0.9016 0.9976 0.9917], val mean f1: 0.9903, time: 688.81s
{'embedding_method': ['domain'], 'attention_method': None, 'crf': True, 'n_layer': 1, 'attention_position': False}
Starting Training
Epoch:1, Training loss: 11739.29, train each f1: [0.9709 0.993  0.987  0.9226 0.9987 0.9951], train mean f1: 0.9935, val loss: 1158.01, val each f1: [0.9628 0.9929 0.978  0.9253 0.9985 0.9934], val mean f1: 0.9918, time: 529.11s
Epoch:2, Training loss: 2140.28, train each f1

In [None]:
# Second-epoch validation scores (keys prefixed with '1') for every embedding
# combination, one table row per saved metrics file.
dim_list = []

embedding_runs = (
    ('sementic', "embedding1.pkl"),
    ('domain', "embedding2.pkl"),
    ('sementic_domain_syntactic', "embedding3.pkl"),
    ('sementic_domain', "best_model.pkl"),
)
for label, path in embedding_runs:
    with open(path, "rb") as f:
        output = pickle.load(f)
    vef1 = output['1vef1']
    vmf1 = output['1vmf1']
    # Row layout: label, the six per-tag F1 values, then the mean F1.
    dim_list.append([label] + [vef1[k] for k in range(6)] + [vmf1])

# Column headers
col_names = ["Model", "T-F1(T)", "T-F1(S)", "T-F1(C)", "T-F1(D)", "T-F1(P)", "T-F1(O)", "T-F1"]

# Render the table
print(tabulate(dim_list, headers=col_names))

Model                        T-F1(T)    T-F1(S)    T-F1(C)    T-F1(D)    T-F1(P)    T-F1(O)      T-F1
-------------------------  ---------  ---------  ---------  ---------  ---------  ---------  --------
sementic                    0.973214   0.989896   0.972443   0.901554   0.99759    0.991693  0.990316
domain                      0.971546   0.995324   0.98564    0.945545   0.99873    0.995289  0.994094
sementic_domain_syntactic   0.974341   0.994876   0.982606   0.954082   0.999365   0.994717  0.993884
sementic_domain             0.975795   0.99487    0.983237   0.954315   0.999111   0.9949    0.994064


## Ablation Study - different attention strategy

In [None]:
import datetime
import pickle
# One entry per run. The attention-free ['semantic', 'domain'] model is the best
# model and was already trained, so it is not repeated here.
configs = [

    # Attention score calculation

    {
        "embedding_method": ['semantic', 'domain'],
        "attention_method": 'cosine',
        "crf": True,
        "n_layer": 1,
        "attention_position": False,
    },
    {
        "embedding_method": ['semantic', 'domain'],
        "attention_method": 'dot',
        "crf": True,
        "n_layer": 1,
        "attention_position": False,
    },
    {
        "embedding_method": ['semantic', 'domain'],
        "attention_method": 'scaled_dot',
        "crf": True,
        "n_layer": 1,
        "attention_position": False,
    },

    # Attention position

    {
        "embedding_method": ['semantic', 'domain'],
        "attention_method": 'cosine',
        "crf": True,
        "n_layer": 2,
        "attention_position": False,
    },
    {
        "embedding_method": ['semantic', 'domain'],
        "attention_method": 'cosine',
        "crf": True,
        "n_layer": 2,
        "attention_position": True,
    },
]

# `count` numbers the runs from 1 and becomes part of the output file name.
for count, config in enumerate(configs, start=1):
    dictionary_data = {}
    print(config)

    ### input embedding
    # Collect the requested embedding tables in the configured order, then join
    # them column-wise. The model constructor reads the notebook-level
    # `embedding_matrix`, so it has to be rebound before the model is built.
    embedding_parts = []
    for embedding_method in config['embedding_method']:
        if embedding_method == 'semantic':
            embedding_parts.append(semantic_embedding_matrix)
        elif embedding_method == 'domain':
            embedding_parts.append(domain_embedding_matrix)
        elif embedding_method == 'syntactic':
            embedding_parts.append(syntactic_embedding_matrix)
        else:
            # Fail loudly instead of silently training without the requested table.
            raise ValueError("unknown embedding_method: {!r}".format(embedding_method))
    if len(embedding_parts) == 1:
        embedding_matrix = embedding_parts[0]
    else:
        embedding_matrix = np.concatenate(embedding_parts, axis=1)

    EMBEDDING_DIM = embedding_matrix.shape[1]
    HIDDEN_DIM = 100
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = BiGRU_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,
                      crf = config['crf'], attention_method = config['attention_method'],
                      n_layer = config['n_layer'], attention_position = config['attention_position']).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

    print('Starting Training')

    for epoch in range(2):
        time1 = datetime.datetime.now()
        train_loss = 0

        model.train()
        for i, idxs in enumerate(train_input_index):
            tags_index = train_output_index[i]

            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is,
            # turn them into Tensors of word indices.
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)

            # Step 3. Run our forward pass.
            loss = model.neg_log_likelihood(sentence_in, targets)

            # Step 4. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            loss.backward()
            optimizer.step()

            train_loss+=loss.item()

        model.eval()
        # Call the cal_f1 functions you implemented as required
        _, _, train_each_f1, train_mean_f1 = cal_f1(model,train_input_index,train_output_index)
        _, _, val_each_f1, val_mean_f1 = cal_f1(model,val_input_index,val_output_index)


        val_loss = 0
        # Reporting only: no gradients are needed for the validation loss.
        with torch.no_grad():
            for i, idxs in enumerate(val_input_index):
                tags_index = val_output_index[i]
                sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
                targets = torch.tensor(tags_index, dtype=torch.long).to(device)
                loss = model.neg_log_likelihood(sentence_in, targets)
                val_loss+=loss.item()
        time2 = datetime.datetime.now()
        dictionary_data[str(epoch)+'tloss'] = train_loss
        dictionary_data[str(epoch)+'vloss'] = val_loss
        dictionary_data[str(epoch)+'tef1'] = train_each_f1
        dictionary_data[str(epoch)+'vef1'] = val_each_f1
        dictionary_data[str(epoch)+'tmf1'] = train_mean_f1
        dictionary_data[str(epoch)+'vmf1'] = val_mean_f1

        print("Epoch:{:d}, Training loss: {:.2f}, train each f1: {}, train mean f1: {:.4f}, val loss: {:.2f}, val each f1: {}, val mean f1: {:.4f}, time: {:.2f}s".format(epoch+1, train_loss, train_each_f1.round(4), train_mean_f1, val_loss, val_each_f1.round(4), val_mean_f1, (time2-time1).total_seconds()))
    # The log below is the sample output for this section
    # Please make sure you keep your own running log for submission

    # One metrics file per configuration, written once both epochs are done;
    # `with` closes the handle even if the dump fails.
    filename = 'attention' + str(count) + '.pkl'
    with open(filename, "wb") as a_file:
        pickle.dump(dictionary_data, a_file)


{'embedding_method': ['semantic', 'domain'], 'attention_method': 'cosine', 'crf': True, 'n_layer': 1, 'attention_position': False}
Starting Training
Epoch:1, Training loss: 27135.23, train each f1: [0.9448 0.9757 0.9625 0.78   0.9946 0.9893], train mean f1: 0.9840, val loss: 2567.61, val each f1: [0.9396 0.9697 0.9427 0.7967 0.9948 0.987 ], val mean f1: 0.9811, time: 530.85s
Epoch:2, Training loss: 6112.79, train each f1: [0.965  0.9881 0.9805 0.9014 0.9964 0.995 ], train mean f1: 0.9916, val loss: 2028.54, val each f1: [0.952  0.9832 0.9634 0.8629 0.9962 0.9913], val mean f1: 0.9870, time: 508.64s
{'embedding_method': ['semantic', 'domain'], 'attention_method': 'dot', 'crf': True, 'n_layer': 1, 'attention_position': False}
Starting Training
Epoch:1, Training loss: 48442.99, train each f1: [0.3488 0.608  0.555  0.0277 0.8459 0.8825], train mean f1: 0.8275, val loss: 19255.22, val each f1: [0.3529 0.6091 0.5458 0.0487 0.8353 0.8822], val mean f1: 0.8257, time: 499.01s
Epoch:2, Training 

In [None]:
# Second-epoch mean validation F1 (key '1vmf1') for each attention scoring
# function, one table row per saved metrics file.
dim_list = []

attention_score_runs = (
    ('cosine', "attention1.pkl"),
    ('dot', "attention2.pkl"),
    ('scaled_dot', "attention3.pkl"),
)
for label, path in attention_score_runs:
    with open(path, "rb") as f:
        output = pickle.load(f)
    # Only the mean F1 is shown in this table; the per-tag scores are not needed.
    vmf1 = output['1vmf1']
    dim_list.append([label, vmf1])

#define header names
col_names = ["Model", "T-F1"]

#display table
print(tabulate(dim_list, headers=col_names))

Model           T-F1
----------  --------
cosine      0.986958
dot         0.83909
scaled_dot  0.992295


In [None]:
# Second-epoch mean validation F1 (key '1vmf1') for the two attention placements:
# attention4.pkl is the run with attention_position=False (n_layer=2),
# attention5.pkl the run with attention_position=True.
dim_list = []

attention_position_runs = (
    ('cosine_attention_after_layer_2', "attention4.pkl"),
    ('cosine_attention_after_layer_1', "attention5.pkl"),
)
for label, path in attention_position_runs:
    with open(path, "rb") as f:
        output = pickle.load(f)
    # Only the mean F1 is shown in this table; the per-tag scores are not needed.
    vmf1 = output['1vmf1']
    dim_list.append([label, vmf1])


#define header names
col_names = ["Model", "T-F1"]

#display table
print(tabulate(dim_list, headers=col_names))

Model                               T-F1
------------------------------  --------
cosine_attention_after_layer_2  0.989716
cosine_attention_after_layer_1  0.988427


## Ablation Study - different Stacked layer or # of encoder/decoder strategy

In [None]:
import datetime
import pickle
# Stacked-layer ablation: train the otherwise unchanged configuration with 2 and
# 3 layers. The 1-layer variant is the best model and was already tested, so it
# is not repeated here:
#     {"embedding_method": ['semantic', 'domain'], "attention_method": None,
#      "crf": True, "n_layer": 1, "attention_position": False}
configs = [
    {
        "embedding_method": ['semantic', 'domain'],
        "attention_method": None,
        "crf": True,
        "n_layer": 2,
        "attention_position": False,
    },
    {
        "embedding_method": ['semantic', 'domain'],
        "attention_method": None,
        "crf": True,
        "n_layer": 3,
        "attention_position": False,
    },
]

# Loop-invariant settings, computed once instead of once per configuration.
HIDDEN_DIM = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `count` numbers the configurations from 1 and names the output file
# (layer1.pkl, layer2.pkl, ...).
for count, config in enumerate(configs, start=1):
    dictionary_data = {}  # per-epoch metrics, keyed by str(epoch) + metric name
    print(config)

    ### input embedding
    # Concatenate the requested embedding tables column-wise, in the order given.
    # NOTE(review): embedding_matrix is not passed to BiGRU_CRF below, so the
    # model presumably reads this variable as a global - confirm before renaming.
    embedding_matrix = None
    for embedding_method in config['embedding_method']:
        if embedding_method == 'semantic':
            selected_matrix = semantic_embedding_matrix
        elif embedding_method == 'domain':
            selected_matrix = domain_embedding_matrix
        elif embedding_method == 'syntactic':
            selected_matrix = syntactic_embedding_matrix
        else:
            # A misspelt method would otherwise be skipped silently and the
            # experiment would run with the wrong embeddings.
            raise ValueError("Unknown embedding method: {!r}".format(embedding_method))

        if embedding_matrix is None:
            embedding_matrix = selected_matrix
        else:
            embedding_matrix = np.concatenate((embedding_matrix, selected_matrix), axis=1)

    if embedding_matrix is None:
        raise ValueError("config['embedding_method'] must list at least one embedding")

    EMBEDDING_DIM = embedding_matrix.shape[1]

    model = BiGRU_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,
                      crf = config['crf'], attention_method = config['attention_method'],
                      n_layer = config['n_layer'], attention_position = config['attention_position']).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

    print('Starting Training')

    # Two epochs: the results-table cell reads the metrics stored for epoch
    # index 1 (key '1vmf1'), so at least two epochs are required.
    for epoch in range(2):
        time1 = datetime.datetime.now()
        train_loss = 0

        model.train()
        # One optimisation step per sentence.
        for idxs, tags_index in zip(train_input_index, train_output_index):
            # Pytorch accumulates gradients; clear them before each instance.
            model.zero_grad()

            # Turn the word / tag indices into tensors on the target device.
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)

            # Forward pass, backward pass and parameter update.
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        model.eval()
        # Evaluation never calls backward(), so skip autograd bookkeeping.
        with torch.no_grad():
            _, _, train_each_f1, train_mean_f1 = cal_f1(model, train_input_index, train_output_index)
            _, _, val_each_f1, val_mean_f1 = cal_f1(model, val_input_index, val_output_index)

            val_loss = 0
            for idxs, tags_index in zip(val_input_index, val_output_index):
                sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
                targets = torch.tensor(tags_index, dtype=torch.long).to(device)
                loss = model.neg_log_likelihood(sentence_in, targets)
                val_loss += loss.item()
        time2 = datetime.datetime.now()

        dictionary_data[str(epoch)+'tloss'] = train_loss
        dictionary_data[str(epoch)+'vloss'] = val_loss
        dictionary_data[str(epoch)+'tef1'] = train_each_f1
        dictionary_data[str(epoch)+'vef1'] = val_each_f1
        dictionary_data[str(epoch)+'tmf1'] = train_mean_f1
        dictionary_data[str(epoch)+'vmf1'] = val_mean_f1

        print("Epoch:{:d}, Training loss: {:.2f}, train each f1: {}, train mean f1: {:.4f}, val loss: {:.2f}, val each f1: {}, val mean f1: {:.4f}, time: {:.2f}s".format(epoch+1, train_loss, train_each_f1.round(4), train_mean_f1, val_loss, val_each_f1.round(4), val_mean_f1, (time2-time1).total_seconds()))

    # Persist this configuration's metrics; the context manager closes the file
    # even if pickling fails.
    filename = 'layer' + str(count) + '.pkl'
    with open(filename, "wb") as a_file:
        pickle.dump(dictionary_data, a_file)


{'embedding_method': ['semantic', 'domain'], 'attention_method': None, 'crf': True, 'n_layer': 2, 'attention_position': False}
Starting Training
Epoch:1, Training loss: 11983.37, train each f1: [0.9728 0.9932 0.9841 0.9261 0.999  0.9939], train mean f1: 0.9929, val loss: 1127.59, val each f1: [0.9705 0.9905 0.9749 0.9315 0.9991 0.9927], val mean f1: 0.9915, time: 906.04s
Epoch:2, Training loss: 2442.34, train each f1: [0.9864 0.9981 0.9952 0.9658 0.9995 0.9975], train mean f1: 0.9971, val loss: 829.60, val each f1: [0.978  0.9943 0.983  0.9486 0.999  0.9949], val mean f1: 0.9940, time: 888.38s
{'embedding_method': ['semantic', 'domain'], 'attention_method': None, 'crf': True, 'n_layer': 3, 'attention_position': False}
Starting Training
Epoch:1, Training loss: 16251.68, train each f1: [0.9639 0.9899 0.9855 0.9119 0.9985 0.9929], train mean f1: 0.9914, val loss: 1355.67, val each f1: [0.9617 0.9891 0.9745 0.9016 0.998  0.991 ], val mean f1: 0.9895, time: 897.46s
Epoch:2, Training loss: 2

In [None]:
# Summarise the stacked-layer ablation in one table.
# layer1.pkl / layer2.pkl are written by the layer-ablation training cell for
# its first (n_layer=2) and second (n_layer=3) configuration; best_model.pkl is
# used as the 1-layer baseline.
layer_runs = [
    ('1_layer', "best_model.pkl"),
    ('2_layer', "layer1.pkl"),
    ('3_layer', "layer2.pkl"),
]

dim_list = []
for label, path in layer_runs:
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by this notebook.
    with open(path, "rb") as f:
        output = pickle.load(f)
    # Metrics are stored under str(epoch) + name, so '1vmf1' is the validation
    # mean F1 of epoch index 1, i.e. the second (last) training epoch.
    dim_list.append([label, output['1vmf1']])

#define header names
col_names = ["Model", "T-F1"]

#display table
print(tabulate(dim_list, headers=col_names))

Model        T-F1
-------  --------
1_layer  0.994064
2_layer  0.994004
3_layer  0.993554


## Ablation Study - with/without CRF

In [None]:
import datetime
import pickle
# CRF ablation: train the otherwise unchanged configuration without the CRF
# layer. The variant with CRF is the best model and was already tested, so it
# is not repeated here:
#     {"embedding_method": ['semantic', 'domain'], "attention_method": None,
#      "crf": True, "n_layer": 1, "attention_position": False}
configs = [
    {
        "embedding_method": ['semantic', 'domain'],
        "attention_method": None,
        "crf": False,
        "n_layer": 1,
        "attention_position": False,
    },
]

# Loop-invariant settings, computed once instead of once per configuration.
HIDDEN_DIM = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `count` numbers the configurations from 1 and names the output file
# (crf1.pkl, ...).
for count, config in enumerate(configs, start=1):
    dictionary_data = {}  # per-epoch metrics, keyed by str(epoch) + metric name
    print(config)

    ### input embedding
    # Concatenate the requested embedding tables column-wise, in the order given.
    # NOTE(review): embedding_matrix is not passed to BiGRU_CRF below, so the
    # model presumably reads this variable as a global - confirm before renaming.
    embedding_matrix = None
    for embedding_method in config['embedding_method']:
        if embedding_method == 'semantic':
            selected_matrix = semantic_embedding_matrix
        elif embedding_method == 'domain':
            selected_matrix = domain_embedding_matrix
        elif embedding_method == 'syntactic':
            selected_matrix = syntactic_embedding_matrix
        else:
            # A misspelt method would otherwise be skipped silently and the
            # experiment would run with the wrong embeddings.
            raise ValueError("Unknown embedding method: {!r}".format(embedding_method))

        if embedding_matrix is None:
            embedding_matrix = selected_matrix
        else:
            embedding_matrix = np.concatenate((embedding_matrix, selected_matrix), axis=1)

    if embedding_matrix is None:
        raise ValueError("config['embedding_method'] must list at least one embedding")

    EMBEDDING_DIM = embedding_matrix.shape[1]

    model = BiGRU_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,
                      crf = config['crf'], attention_method = config['attention_method'],
                      n_layer = config['n_layer'], attention_position = config['attention_position']).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

    print('Starting Training')

    # Two epochs: the results-table cell reads the metrics stored for epoch
    # index 1 (key '1vmf1'), so at least two epochs are required.
    for epoch in range(2):
        time1 = datetime.datetime.now()
        train_loss = 0

        model.train()
        # One optimisation step per sentence.
        for idxs, tags_index in zip(train_input_index, train_output_index):
            # Pytorch accumulates gradients; clear them before each instance.
            model.zero_grad()

            # Turn the word / tag indices into tensors on the target device.
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)

            # Forward pass, backward pass and parameter update.
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        model.eval()
        # Evaluation never calls backward(), so skip autograd bookkeeping.
        with torch.no_grad():
            _, _, train_each_f1, train_mean_f1 = cal_f1(model, train_input_index, train_output_index)
            _, _, val_each_f1, val_mean_f1 = cal_f1(model, val_input_index, val_output_index)

            val_loss = 0
            for idxs, tags_index in zip(val_input_index, val_output_index):
                sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
                targets = torch.tensor(tags_index, dtype=torch.long).to(device)
                loss = model.neg_log_likelihood(sentence_in, targets)
                val_loss += loss.item()
        time2 = datetime.datetime.now()

        dictionary_data[str(epoch)+'tloss'] = train_loss
        dictionary_data[str(epoch)+'vloss'] = val_loss
        dictionary_data[str(epoch)+'tef1'] = train_each_f1
        dictionary_data[str(epoch)+'vef1'] = val_each_f1
        dictionary_data[str(epoch)+'tmf1'] = train_mean_f1
        dictionary_data[str(epoch)+'vmf1'] = val_mean_f1

        print("Epoch:{:d}, Training loss: {:.2f}, train each f1: {}, train mean f1: {:.4f}, val loss: {:.2f}, val each f1: {}, val mean f1: {:.4f}, time: {:.2f}s".format(epoch+1, train_loss, train_each_f1.round(4), train_mean_f1, val_loss, val_each_f1.round(4), val_mean_f1, (time2-time1).total_seconds()))

    # Persist this configuration's metrics; the context manager closes the file
    # even if pickling fails.
    filename = 'crf' + str(count) + '.pkl'
    with open(filename, "wb") as a_file:
        pickle.dump(dictionary_data, a_file)


{'embedding_method': ['semantic', 'domain'], 'attention_method': None, 'crf': False, 'n_layer': 1, 'attention_position': False}
Starting Training
Epoch:1, Training loss: 9962.71, train each f1: [0.9722 0.9921 0.9787 0.9256 0.998  0.9934], train mean f1: 0.9919, val loss: 1129.10, val each f1: [0.9693 0.9901 0.9691 0.9244 0.998  0.9919], val mean f1: 0.9903, time: 672.84s
Epoch:2, Training loss: 2280.04, train each f1: [0.9855 0.998  0.9912 0.9682 0.9987 0.9973], train mean f1: 0.9966, val loss: 838.07, val each f1: [0.9768 0.9946 0.9809 0.9497 0.998  0.9944], val mean f1: 0.9934, time: 673.03s


In [None]:
# Summarise the CRF ablation in one table.
# crf1.pkl is written by the CRF-ablation training cell for its only
# configuration (crf=False); best_model.pkl is used as the with-CRF baseline.
crf_runs = [
    ('use_crf', "best_model.pkl"),
    ('no_crf', "crf1.pkl"),
]

dim_list = []
for label, path in crf_runs:
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by this notebook.
    with open(path, "rb") as f:
        output = pickle.load(f)
    # Metrics are stored under str(epoch) + name, so '1vmf1' is the validation
    # mean F1 of epoch index 1, i.e. the second (last) training epoch.
    dim_list.append([label, output['1vmf1']])

#define header names
col_names = ["Model", "T-F1"]

#display table
print(tabulate(dim_list, headers=col_names))

Model        T-F1
-------  --------
use_crf  0.994064
no_crf   0.993434
