## 01. Dependencies & Setup

In [1]:
# Load dependencies for entire notebook
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from nltk import word_tokenize, pos_tag, download, word_tokenize, download, corpus
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

from gensim.models import Word2Vec
from gensim.models import FastText

import tensorflow_hub as hub
import tensorflow as tf

import datetime
from tabulate import tabulate
import string
import re
import numpy as np
import pickle
import pandas as pd
import gensim.downloader as api

import requests
from bs4 import BeautifulSoup
import csv

download('punkt')
download('averaged_perceptron_tagger')
download('wordnet')
download('stopwords')

stpwrds = corpus.stopwords.words('english')
stemmer = PorterStemmer()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# prepare train, val and test datasets
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

train_doc_id = '1yKFzWzdYMrmEpCzp4H7I30pZgxuVgwdZ'
val_doc_id = '1oDPDN2LVeaRJ4YPeEhOn5T6T9fEdRTvv'
test_doc_id = '1Cdw2CZUV1IzHBMjy22GeqqYxmW1t9VXi'

downloaded = drive.CreateFile({'id':train_doc_id}) 
downloaded.GetContentFile('train.csv') 
downloaded = drive.CreateFile({'id':val_doc_id})
downloaded.GetContentFile('val.csv')
downloaded = drive.CreateFile({'id':test_doc_id}) 
downloaded.GetContentFile('test_without_labels.csv')

df_train = pd.read_csv("/content/train.csv")
df_val = pd.read_csv("/content/val.csv")
df_test = pd.read_csv("/content/test_without_labels.csv")

In [3]:
df_train.head(5)

Unnamed: 0,sents,labels
0,wow,O
1,WTF,T
2,wpe wpe,O O
3,hahaha,O
4,wtf,T


## 02. Pre-processing
1. Basic conversion to tokens

In [4]:
X_train = [sent.lower().split(' ') for sent in df_train['sents'].tolist() if sent not in ['', ' ']]
Y_train = [tags.split(' ') for tags in df_train['labels'].tolist() if tags not in ['', ' ']]

X_val = [sent.lower().split(' ') for sent in df_val['sents'].tolist() if sent not in ['', ' ']]
Y_val = [tags.split(' ') for tags in df_val['labels'].tolist() if tags not in ['', ' ']]

X_test = [sent.lower().split(' ') for sent in df_test['sents'].tolist() if sent not in ['', ' ']]

X_combine = X_train + X_val + X_test
Y_combine = Y_train + Y_val

## A. Input Embedding

### 1. Syntactic Textual Feature Embedding: **POS Tagging**

In [5]:
# X_train_pos = 
def create_pos_tags(input_list):
    pos = [pos_tag(sent) for sent in input_list]
    pos = [[pos for _, pos in l] for l in pos]
    return pos

X_train_pos = create_pos_tags(X_train)
X_val_pos = create_pos_tags(X_val)
X_test_pos = create_pos_tags(X_test)

print(X_train_pos[:6])
print(X_train[:6])

[['NN'], ['NN'], ['NN', 'NN'], ['NN'], ['NN'], ['NN', 'NN', 'NNP', 'NN', 'NN', 'IN', 'CD', 'NN']]
[['wow'], ['wtf'], ['wpe', 'wpe'], ['hahaha'], ['wtf'], ['i', 'cant', '[sepa]', 'play', '[sepa]', 'with', '4', 'trash']]


In [6]:
#  Create POS tags
pos_combine = X_train_pos + X_val_pos + X_test_pos
tag_list = list(sorted(set([tag for sublist in pos_combine for tag in sublist])))
tag_list.append('LS')
tag_list = sorted(tag_list)
pos_idx = {t:i for i,t in enumerate(tag_list)}
pos_idx['NN']

15

In [7]:
# Associate POS tag idx to source dataset
[[pos_idx[tag] for tag in doc] for doc in X_train_pos][:6]

[[15], [15], [15, 15], [15], [15], [15, 15, 16, 15, 15, 9, 5, 15]]

In [8]:
# Convert feature to one-hot encoded
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(tag_list)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

In [9]:
def get_one_hot_pos_encoding(tag):
    """ returns pos tag as a one hot vector """
    idx = pos_idx[tag]
    return onehot_encoded[idx]

In [10]:
get_one_hot_pos_encoding('LS')

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])

### 2. Semantic Textual Feature Embedding: **FastText Skip-Gram**

In [11]:
# Build FastText Skip-Gram Model
fast_text_sg_plain = FastText(sentences=X_combine,
                            size=100,
                            window=5,
                            min_count=5,
                            workers=4,
                            sg=1)

### 3. Domain Feature Embedding:

In [12]:
# Step 1: Build a list of Dota strategy guides, glossaries, blogs, and discussion forums
# along with the html tag that has relevant data embedded

urls = {"https://www.pcinvasion.com/a-beginners-guide-to-dota-2-part-one-the-basics/2/": 'p', 
        "https://www.pcinvasion.com/a-beginners-guide-to-dota-2-part-one-the-basics/": 'p',
        "https://www.pcgamesn.com/dota/dota-2-beginner-s-guide-everything-you-need-know": 'p',
        "https://www.killping.com/blog/dota2-guide-tips-to-improve-gameplay/": 'p',
        "https://dota2.fandom.com/wiki/Glossary": ["dd", "dt"]}

headers = {'User-Agent': 
           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Step 2: Add to the list of URLS the in depth guide for each Dota hero
url_hero_directory = "https://dota2.fandom.com/wiki/Heroes"
response = requests.get(url_hero_directory, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
url_heroes = ["https://dota2.fandom.com/" + a['href'] + '/Guide' for a in 
              soup.find('body').find_all("a") 
              if a.find("img", {'class': "lazyload"})]

for url in url_heroes:
    urls[url] = 'li'

# Preview list of URLS
dict(list(urls.items())[:10])

{'https://dota2.fandom.com//wiki/Axe/Guide': 'li',
 'https://dota2.fandom.com//wiki/Beastmaster/Guide': 'li',
 'https://dota2.fandom.com//wiki/Brewmaster/Guide': 'li',
 'https://dota2.fandom.com//wiki/Bristleback/Guide': 'li',
 'https://dota2.fandom.com//wiki/Centaur_Warrunner/Guide': 'li',
 'https://dota2.fandom.com/wiki/Glossary': ['dd', 'dt'],
 'https://www.killping.com/blog/dota2-guide-tips-to-improve-gameplay/': 'p',
 'https://www.pcgamesn.com/dota/dota-2-beginner-s-guide-everything-you-need-know': 'p',
 'https://www.pcinvasion.com/a-beginners-guide-to-dota-2-part-one-the-basics/': 'p',
 'https://www.pcinvasion.com/a-beginners-guide-to-dota-2-part-one-the-basics/2/': 'p'}

In [13]:
try:
    # Try preload saved corpus data 

    dota_corpus_id = '1w-PrA7yAvzj68Nn1BM6TKiUz4Ludpfk2'
    downloaded = drive.CreateFile({'id':dota_corpus_id}) 
    downloaded.GetContentFile('corpus.pickle') 
    infile = open('corpus.pickle','rb')
    dota_corpus = pickle.load(infile)
    infile.close()

except:
    # Otherwise generate corpus from scratch

    dota_corpus = []

    # Loop through and retreive relevant data
    for url, tag in urls.items():
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")
        text = [para.get_text() for para in soup.find('body').find_all(tag)]
        dota_corpus.append(text)

    # Rough cleaning procedure for html tags
    c1 = [[txt for txt in l if txt[:1] != '\n'] for l in dota_corpus]
    c2 = [[txt for txt in l if not txt[:1].isdigit()] for l in c1]

In [14]:
# Step 1 - Flatten Array + add words from train, val and test
dota_corpus = [doc for sublist in dota_corpus for doc in sublist if doc != '']
# Step 2 - convert to lower case
dota_corpus = [sent.lower() for sent in dota_corpus]
# Step 3 - tockenize sentences
dota_corpus = [word_tokenize(sent) for sent in dota_corpus] + X_combine
# Step 4 - remove digits
dota_corpus = [[wrd for wrd in l if not wrd.isdigit()] for l in dota_corpus]
# Step 5 - remove stop words
dota_corpus = [[wrd for wrd in l if wrd not in stpwrds] for l in dota_corpus]
# Step 6 - Stemming
dota_corpus = [[stemmer.stem(wrd) for wrd in l ] for l in dota_corpus]
# Step 7 - remove punctuation & symbols
# dota_corpus = [[wrd for wrd in l if wrd not in string.punctuation] for l in dota_corpus]
# cleaned corpus
dota_corpus[:2]

[['stuff', 'mention', '?', 'tower', ',', 'rainforest', ',', 'roshan', '?'],
 ['tower', ',', 'jungl', ',', 'roshan', '.', 'tower', '–']]

In [15]:
# Build FastText Semantic basis Skip-Gram Model
fast_text_dota_2v_model = FastText(sentences=dota_corpus,
                            size=100,
                            window=5,
                            min_count=5,
                            workers=4,
                            sg=1)

In [16]:
# Load specialsied list for custom features
d2h_doc_id = '1h3v3h4wEkUaWD6KW2ZwJ-EtTVzjxYAZ_'
d2t_doc_id = '1R550xevyE4T2UD0P9nv1JkOXq9vCYckb'
bw_doc_id = '1ek241QhWqXuQoa6BGBXHu91yKQg5lq3W'

downloaded = drive.CreateFile({'id':d2h_doc_id}) 
downloaded.GetContentFile('d2_heroes.pickle') 
downloaded = drive.CreateFile({'id':d2t_doc_id})
downloaded.GetContentFile('d2_terms.pickle')
downloaded = drive.CreateFile({'id':bw_doc_id}) 
downloaded.GetContentFile('bad_wrds.pickle')

infile = open('d2_heroes.pickle','rb')
d2_heroes = pickle.load(infile) # list of dota heroes
infile.close()
infile = open('d2_terms.pickle','rb')
d2_terms = pickle.load(infile) # dota terms and acronyms
infile.close()
infile = open('bad_wrds.pickle','rb')
bad_wrds = pickle.load(infile) # warning explicit! - list of offensive words in gaming domain
infile.close()

# prepare
d2_heroes = [hero.lower() for hero in d2_heroes]

In [17]:
# create helper function that builds customised features
def extract_features(word):

    is_pronoun = pos_tag([word])[0][1] in ['PRP', 'PRP$']
    is_profane = word in bad_wrds
    is_hero = word in d2_heroes
    is_dota2_term = word in d2_terms

    features = [int(is_pronoun), int(is_profane), int(is_hero), int(is_dota2_term)]

    return np.array(features)


### Concatenate Inputs & Build embeddings

In [18]:
# Word statistics

## Define universal function to generate word_to_ix objects with
#  CRF, N to M and Attention compatability
def get_word_statistics(all_docs, add_bos_eos=False):
    """
    Take all docs and returns word index, word list and vocab size
    for the purposes of building a memory efficient embedding lookup table
    """

    word_set = set() 
    for doc in all_docs:
        for word in doc:
            word_set.add(word)

    # Sort for safety to ensure order is preserved
    word_list = list(word_set)
    word_list.sort()

    #  IF attention add BOE EOS indicators
    word_index = {} if not add_bos_eos else {BOS:0, EOS:1}

    # Create and return integer based lookup table
    for i, word in enumerate(word_list, len(word_index)):
        word_index[word] = i

    vocab_size = len(word_list)

    return word_index, word_list, vocab_size

word_index, word_list, vocab_size = get_word_statistics(X_combine)

In [19]:
def build_pos_ft_embedding_table(word_list, model_ft):

    # Builds an embedding table from FT + POS tags

    embedding_table = []

    dim_size = model_ft.vector_size + onehot_encoded.shape[1]

    for word in word_list:

            # Case 1: word is not present in pre-trained model == out of vocab error
            # apply associative array of zeros
            if word not in model_ft:
                ft_vector = np.zeros((model_ft.vector_size))
                pos_vector = get_one_hot_pos_encoding(pos_tag([word])[0][1])
                embedding = np.concatenate((ft_vector, pos_vector))
                embedding_table.append(embedding)

            else:
                # get POS vector 
                pos_vector = get_one_hot_pos_encoding(pos_tag([word])[0][1])
                # get fast text vector
                ft_vector = model_ft.wv[word]
                # concat
                embedding = np.concatenate((ft_vector, pos_vector))
                # append to embedding
                embedding_table.append(embedding)

    return np.array(embedding_table), dim_size

In [20]:
# Build domain specific input embedding - FastText + websites + custom features
def build_custom_ft_embedding_table(word_list, model_ft):

    # Builds an embedding table from FT + custom features

    embedding_table = []

    dim_size = model_ft.vector_size + 4

    for word in word_list:

            # Case 1: word is not present in pre-trained model == out of vocab error
            # apply associative array of zeros
            if word not in model_ft:
                ft_vector = np.zeros((model_ft.vector_size))
                custom_feats = extract_features(word)
                embedding = np.concatenate((ft_vector, custom_feats))
                embedding_table.append(embedding)

            else:
                # get POS vector 
                custom_feats = extract_features(word)
                # get fast text vector
                ft_vector = model_ft.wv[word]
                # concat
                embedding = np.concatenate((ft_vector, custom_feats))
                # append to embedding
                embedding_table.append(embedding)

    return np.array(embedding_table), dim_size

### Universal Supporting Functions

In [21]:
START_TAG = '<START>'
STOP_TAG = '<STOP>'

BOS = '<BOS>'
EOS = '<EOS>'

## Define universal function to generate tag_to_ix objects with
#  CRF compatability
def get_tag_statistics(all_labels):
    """ Takes all domain labels and generates a reference index list """

    tag_index = {'<START>':0, '<STOP>':1}

    tags = set([doc for sublist in all_labels for doc in sublist])

    for ix, tag in enumerate(tags, 2):
        tag_index[tag] = ix
    return tag_index

## Define universal function to generate word_to_ix objects with
#  CRF, N to M and Attention compatability
def get_word_statistics(all_docs, add_bos_eos=False):
    """
    Take all docs and returns word index, word list and vocab size
    for the purposes of building a memory efficient embedding lookup table
    """

    word_set = set() 
    for doc in all_docs:
        for word in doc:
            word_set.add(word)

    # Sort for safety to ensure order is preserved
    word_list = list(word_set)
    word_list.sort()

    #  IF attention add BOE EOS indicators
    word_index = {} if not add_bos_eos else {BOS:0, EOS:1}

    # Create and return integer based lookup table
    for i, word in enumerate(word_list, len(word_index)):
        word_index[word] = i

    vocab_size = len(word_list)

    return word_index, word_list, vocab_size

## Define a universal helper function that will create and 
# embedding table for one or more models
def build_embedding_table(word_list, model_a, model_b=None):
    """ 
    Builds an embedding table from one or more models

    If two models is supplied function will assume requirements is to concatenate
    embeddings

    If concatenating 2+ models call function recursively

    return embedding table and resultant vector size
    """

    embedding_table = []

    dim_size = (model_a.vector_size + model_b.vector_size) if model_b else model_a.vector_size

    # If model_b has been supplied - assume input concatenation for embeddings
    for word in word_list:

        # Case 1: word is not present in pre-trained model == out of vocab error
        # apply associative array of zeros
        if word not in model_a:
            embedding_table.append(np.zeros((dim_size)))

        # Case 2: word is present in pre-trained model concatenate arrays if model_b
        # is supplied else just append model_a embeddings
        else:
            if model_b and word in model_b:
                embedding = np.concatenate((model_a.wv[word], model_b.wv[word]))
            elif model_b and word not in model_b:
                embedding_table.append(np.zeros((dim_size)))
                continue
            else:
                embedding = model_a.wv[word]

            embedding_table.append(embedding)

    return np.array(embedding_table), dim_size

# Create universal helper function that will take a coporpus or labels and
#  convert dataset to their relevant indexes
def convert_to_idx(docs, idx_reference):
    """ Converts a word or tag to its reference based on given idx reference """

    input_index_list = []

    for sent in docs:
        input_index_list.append([idx_reference[wrd] for wrd in sent])
    return input_index_list

# EOS and BOE tagger for attention based models
def label_sentence_bos_eos(docs):
    """ appends eos (<STOP>) and bos (<START>) labels to sentences """
    output = [[START_TAG] + s for s in docs]
    output = [s + [STOP_TAG] for s in docs]
    return output


# Saves predictions in kaggle readable format
def save_predictions_to_csv(predictions):
    """ convenience function that converts prediction dict to csv """

    with open('281.csv', 'w', newline='') as csvfile:
        fieldnames = ['ID', 'Predicted']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for key in predictions:
            writer.writerow({'ID': key, 'Predicted': predictions[key]})

# Generates test predictions in format ready for kaggle
def make_test_preds(model, crf=False):
    """ generates prediction dict for fwd functions """
    
    predicted = {}
    i = 0

    if crf:
        for idxs in test_input_index:
            _, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
            for p in pred:
                tag = index_2_tag[p]
                predicted[i] = tag
                i += 1

    else:

        for idxs in test_input_index:
            input = torch.tensor(idxs, dtype=torch.long).to(device)
            pred = model(input)
            pred = pred.argmax(dim=1).cpu()
            for p in pred:
                tag = index_2_tag[int(p)]
                predicted[i] = tag
                i += 1

    return predicted

# Save baseline model
def save_pytorch_model(model, name):
    torch.save(model, name + '.pt')

In [22]:
# Generate word, tag and vocabulary indexs and statistics
word_index, word_list, vocab_size = get_word_statistics(X_combine)
tag_index = get_tag_statistics(Y_combine)

train_input_index =  convert_to_idx(X_train, word_index)
train_output_index = convert_to_idx(Y_train, tag_index)
val_input_index = convert_to_idx(X_val, word_index)
val_output_index = convert_to_idx(Y_val, tag_index)
test_input_index = convert_to_idx(X_test ,word_index)

index_2_tag = dict([(value, key) for key, value in tag_index.items()])

## B. Baseline Model

Reference: code used in this section adapted from:
1. https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html 
2. COMP5046_Lab09
3. Pytorch: ADVANCED: MAKING DYNAMIC DECISIONS AND THE BI-LSTM CRF

In [23]:
# Generate word, tag and vocabulary indexs and statistics
word_index, word_list, vocab_size = get_word_statistics(X_combine)
tag_index = get_tag_statistics(Y_combine)

train_input_index =  convert_to_idx(X_train, word_index)
train_output_index = convert_to_idx(Y_train, tag_index)
val_input_index = convert_to_idx(X_val, word_index)
val_output_index = convert_to_idx(Y_val, tag_index)
test_input_index = convert_to_idx(X_test ,word_index)

index_2_tag = dict([(value, key) for key, value in tag_index.items()])

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# baseline_embedding_matrix, dim_size = build_embedding_table(word_list, fast_text_sg_plain)
# baseline_embedding_matrix, dim_size = build_pos_ft_embedding_table(word_list, fast_text_sg_plain)
custom_embedding_matrix, dim_size = build_custom_ft_embedding_table(word_list, fast_text_dota_2v_model)

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, embeddings):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(embeddings))
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

HIDDEN_DIM = 50
model_bi_ltsm_crf = BiLSTM_CRF(vocab_size, tag_index, dim_size, HIDDEN_DIM, custom_embedding_matrix).to(device)
optimizer = optim.SGD(model_bi_ltsm_crf.parameters(), lr=0.1)

  


In [25]:
def calculate_f1_accuracy_crf(model, input_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        _, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    f1 = f1_score(ground_truth, predicted, average='weighted')
    return f1, accuracy

In [26]:
def calculate_f1_classes_crf(model, input_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        _, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        predicted += pred
    f1_classes = f1_score(ground_truth, predicted, average=None, labels=list(tag_index.values())[2:])
    return f1_classes

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_baseline(optimizer, model_bi_ltsm_crf):

    for epoch in range(2):  
        train_loss = 0

        model_bi_ltsm_crf.train()
        for i, idxs in enumerate(train_input_index): # use training data
            tags_index = train_output_index[i]

            model_bi_ltsm_crf.zero_grad()

            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)

            loss = model_bi_ltsm_crf.neg_log_likelihood(sentence_in, targets)

            loss.backward()
            optimizer.step()

            train_loss+=loss.item()

        model_bi_ltsm_crf.eval()

        train_f1, train_acc = calculate_f1_accuracy_crf(model_bi_ltsm_crf,train_input_index,train_output_index)
        val_f1, val_acc = calculate_f1_accuracy_crf(model_bi_ltsm_crf,val_input_index,val_output_index)

        print(f'Epoch: {epoch + 1}, train F1: {train_f1}, validation F1: {val_f1}')

    baseline_f1 = calculate_f1_classes_crf(model_bi_ltsm_crf,val_input_index,val_output_index)

    return val_f1, baseline_f1

## C. Model Design

### 2. Stacked Seq2Seq: Bi_LTSM Multi Stacks

Reference: code used in this section adapted from:
1. https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html 
2. COMP5046_Lab09

In [28]:
# Generate word, tag and vocabulary indexs and statistics
word_index, word_list, vocab_size = get_word_statistics(X_combine)
tag_index = get_tag_statistics(Y_combine)

train_input_index =  convert_to_idx(X_train, word_index)
train_output_index = convert_to_idx(Y_train, tag_index)
val_input_index = convert_to_idx(X_val, word_index)
val_output_index = convert_to_idx(Y_val, tag_index)
test_input_index = convert_to_idx(X_test ,word_index)

index_2_tag = dict([(value, key) for key, value in tag_index.items()])

In [29]:
def calculate_f1_accuracy_non_crf(model, input_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        predicted += pred.argmax(dim=1).cpu()
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    f1 = f1_score(ground_truth, predicted, average='weighted')
    return f1, accuracy

In [30]:
def calculate_f1_classes_non_crf(model, input_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        predicted += pred.argmax(dim=1).cpu()
    f1_classes = f1_score(ground_truth, predicted, average=None, labels=list(tag_index.values())[2:])
    return f1_classes

In [31]:
baseline_embedding_matrix, dim_size = build_embedding_table(word_list, fast_text_sg_plain)
# baseline_embedding_matrix, dim_size = build_custom_ft_embedding_table(word_list, fast_text_dota_2v_model)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BiLSTM_Seq2Seq(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, embeddings, stacks=1):
        super(BiLSTM_Seq2Seq, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(baseline_embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=stacks, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeds(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

model_bi_ltsm_s2s = BiLSTM_Seq2Seq(vocab_size, tag_index, dim_size, 50, baseline_embedding_matrix, 1).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model_bi_ltsm_s2s.parameters(), lr=0.1)



In [32]:
def train_stacks(optimizer, model_bi_ltsm_s2s):

    for epoch in range(2):  
        time1 = datetime.datetime.now()
        train_loss = 0

        model_bi_ltsm_s2s.train()
        for i, idxs in enumerate(train_input_index):
            tags_index = train_output_index[i]

            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model_bi_ltsm_s2s.zero_grad()

            # Step 2. Get our inputs ready for the network, that is,
            # turn them into Tensors of word indices.
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)

            # Step 3. Run our forward pass.
            tag_scores = model_bi_ltsm_s2s(sentence_in).to(device)

            # Step 4. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

            train_loss+=loss.item()

        model_bi_ltsm_s2s.eval()
        train_f1, train_acc = calculate_f1_accuracy_non_crf(model_bi_ltsm_s2s,train_input_index,train_output_index)
        val_f1, val_acc = calculate_f1_accuracy_non_crf(model_bi_ltsm_s2s,val_input_index,val_output_index)

        print(f'Epoch: {epoch + 1}, train F1: {train_f1}, validation F1: {val_f1}')

    baseline_f1 = calculate_f1_classes_non_crf(model_bi_ltsm_s2s,val_input_index,val_output_index)

    return val_f1, baseline_f1

### 2. Attention

Reference: code used in this section adapted from:
1. https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
2. COMP5046_Lab10

In [33]:
input_token_list = X_train + X_val
answer_token_list = Y_train + Y_val
output_token_list = [["<BOS>"] + s for s in answer_token_list]
target_token_list = [s + ["<EOS>"] for s in answer_token_list]
n_data = len(X_train)
MAX_LENGTH = max([len(s) for s in input_token_list] + [len(s) for s in target_token_list])

In [None]:
word_to_ix = {"<BOS>": 0, "<EOS>":1}
for sentence in input_token_list+output_token_list:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
word_list = list(word_to_ix.keys())

In [None]:
tag_index = get_tag_statistics(Y_combine)

def to_index(data, to_ix):
    input_index_list = []
    for sent in data:
        input_index_list.append([to_ix[w] for w in sent])
    return input_index_list

input_index = to_index(input_token_list, word_to_ix)
output_index = to_index(output_token_list, word_to_ix)
target_index = to_index(target_token_list, word_to_ix)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = embedding
        self.lstm = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden) 
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)    

class AttnDecoderRNN(nn.Module):
    ATTN_TYPE_DOT_PRODUCT = "Dot Product"
    ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product" 
    ATTN_TYPE_COSINE = "Cosine" 

    def __init__(self, hidden_size, output_size, embedding, method, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        self.method= method

        self.embedding = embedding
        self.lstm = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size*2, self.output_size)


    def cal_attention(self, hidden, encoder_hiddens, method = ATTN_TYPE_DOT_PRODUCT):
        # Dot Product Attention
        if method == AttnDecoderRNN.ATTN_TYPE_DOT_PRODUCT:
            attn_weights = F.softmax(torch.bmm(hidden, encoder_hiddens.T.unsqueeze(0)),dim=-1)
            attn_output = torch.bmm(attn_weights, encoder_hiddens.unsqueeze(0))
            concat_output = torch.cat((attn_output[0], hidden[0]), 1)
        # Scaled Dot Product Attention
        elif method == AttnDecoderRNN.ATTN_TYPE_SCALE_DOT_PRODUCT:
            attn_weights = F.softmax(1/np.sqrt(self.hidden_size)*torch.bmm(hidden, encoder_hiddens.T.unsqueeze(0)),dim=-1)
            attn_output = torch.bmm(attn_weights, encoder_hiddens.unsqueeze(0))
            concat_output = torch.cat((attn_output[0], hidden[0]), 1)
        # Cosine
        elif method == AttnDecoderRNN.ATTN_TYPE_COSINE:
            cos = nn.CosineSimilarity(dim=2, eps=1e-6)
            attn_weights = F.softmax(cos(encoder_hiddens.unsqueeze(0),hidden),dim=-1)
            attn_weights = F.softmax(torch.bmm(hidden, encoder_hiddens.T.unsqueeze(0)),dim=-1)
            attn_output = torch.bmm(attn_weights, encoder_hiddens.unsqueeze(0))
            concat_output = torch.cat((attn_output[0], hidden[0]), 1)

        return concat_output

    def forward(self, input, hidden, encoder_hiddens):
        embedded = self.embedding(input).view(1, 1, -1)

        _, hidden = self.lstm(embedded, hidden)

        concat_output = self.cal_attention(hidden, encoder_hiddens, self.method)

        output = F.log_softmax(self.out(concat_output), dim=1)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_hiddens = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for i in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
        encoder_hiddens[i] = encoder_hidden[0, 0]

    decoder_input = torch.tensor([[0]], device=device)

    decoder_hidden = encoder_hidden

    # Teacher forcing: Feed the target as the next input
    for i in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_hiddens)
        loss += criterion(decoder_output, target_tensor[i])
        decoder_input = target_tensor[i]  # Teacher forcing

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

import random
def trainIters(encoder, decoder, learning_rate=0.1):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.NLLLoss()

    for epoch in range(2):
        
        for ix, idxs in enumerate(input_index):
            input_index_r = [[ind] for ind in input_index[ix]]
            target_index_r = [[ind] for ind in target_index[ix]]
            
            input_tensor = torch.LongTensor(input_index_r).to(device)
            target_tensor = torch.LongTensor(target_index_r).to(device)
            loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss

        print(f'Epoch train: {epoch}, Loss: {plot_loss_total}')

In [None]:
# |hidden_size = 50
# embedding = nn.Embedding(len(word_to_ix), hidden_size)
# encoder1 = EncoderRNN(hidden_size, embedding).to(device)
# attn_decoder1 = AttnDecoderRNN(hidden_size, len(word_to_ix), embedding,"Dot Product").to(device)
# trainIters(encoder1, attn_decoder1)

In [None]:
def evaluate(encoder, decoder, input_sent, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_index = [word_to_ix[word] for word in input_sent]
        input_tensor = torch.LongTensor([[ind] for ind in input_index]).to(device)

        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_hiddens = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_hiddens[ei] += encoder_hidden[0, 0]

        decoder_input = torch.tensor([[0]], device=device) 

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(input_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_hiddens)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == 1:
                if di != input_length:
                    decoded_words.append('O')
                else:
                    decoded_words.append('<EOS>')
                    break
            else:
                decoded_words.append(word_list[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [None]:
def calculate_f1_att_crf(ground_truth, predicted):
    f1 = f1_score(ground_truth, predicted, average='weighted')
    return f1

def calculate_f1_classes_att_crf(ground_truth, predicted):
    f1_classes = f1_score(ground_truth, predicted, average=None, labels=list(tag_index.values())[2:])
    return f1_classes

In [None]:
def evaluate_attention(encoder, decoder):
    predictions = [evaluate(encoder, decoder, input, max_length=MAX_LENGTH) for input in X_val]
    predictions = [[token for token in sublist if token !='<EOS>'] for sublist in predictions]
    predictions = [tag_index[doc] for sublist in predictions for doc in sublist]
    ground_truth = [[tag_index[token] for token in sublist] for sublist in Y_val]
    ground_truth = [doc for sublist in ground_truth for doc in sublist]
    f1 = calculate_f1_att_crf(ground_truth, predictions)
    f1_classes = calculate_f1_classes_att_crf(ground_truth, predictions)
    return f1, f1_classes

### 3. CRF Attachment

In [None]:
# Could not successfully implement as mentioned in report

## E. Ablation Studies

**NOTE**: First run *all* cells in the following sections before running code in this section (view table of contents in colab for best results):
1. .01 Dependencies & Setup
2. .02 Pre-processing
3. A. Input Embedding
4. B. Baseline Model
5. C. Model Design

### 0. Input Embeddings

In [None]:
# Generate word, tag and vocabulary indexs and statistics
word_index, word_list, vocab_size = get_word_statistics(X_combine)
tag_index = get_tag_statistics(Y_combine)

train_input_index =  convert_to_idx(X_train, word_index)
train_output_index = convert_to_idx(Y_train, tag_index)
val_input_index = convert_to_idx(X_val, word_index)
val_output_index = convert_to_idx(Y_val, tag_index)
test_input_index = convert_to_idx(X_test ,word_index)

index_2_tag = dict([(value, key) for key, value in tag_index.items()])

ft_embedding_matrix, ft_dim_size = build_embedding_table(word_list, fast_text_sg_plain)
ft_pos_embedding_matrix, ft_pos_dim_size = build_pos_ft_embedding_table(word_list, fast_text_sg_plain)
domain_embedding_matrix, domain_dim_size = build_custom_ft_embedding_table(word_list, fast_text_dota_2v_model)

  del sys.path[0]
  


In [None]:
# Test FastText EMbedding
HIDDEN_DIM = 50
baseline_ft = BiLSTM_CRF(vocab_size, tag_index, ft_dim_size, HIDDEN_DIM, ft_embedding_matrix).to(device)
optimizer = optim.SGD(baseline_ft.parameters(), lr=0.1)
f1_ft, f1_class_ft = train_baseline(optimizer, baseline_ft)

Epoch: 1, train F1: 0.9985813183353329, validation F1: 0.9950886016756344
Epoch: 2, train F1: 0.9998493860107002, validation F1: 0.9956652084932874


In [None]:
# Test FastText + POS EMbedding
HIDDEN_DIM = 50
baseline_ft_pos = BiLSTM_CRF(vocab_size, tag_index, ft_pos_dim_size, HIDDEN_DIM, ft_pos_embedding_matrix).to(device)
optimizer = optim.SGD(baseline_ft_pos.parameters(), lr=0.1)
f1_ft_pos, f1_class_ft_pos = train_baseline(optimizer, baseline_ft_pos)

Epoch: 1, train F1: 0.9985396195485442, validation F1: 0.9953262782721041
Epoch: 2, train F1: 0.9997993051918642, validation F1: 0.9957256107591622


In [None]:
#  Test custom embeddings
HIDDEN_DIM = 50
baseline_domain = BiLSTM_CRF(vocab_size, tag_index, domain_dim_size, HIDDEN_DIM, domain_embedding_matrix).to(device)
optimizer = optim.SGD(baseline_domain.parameters(), lr=0.1)
f1_domain, f1_class_domain  = train_baseline(optimizer, baseline_domain)

Epoch: 1, train F1: 0.9990163573677321, validation F1: 0.9959790003483713
Epoch: 2, train F1: 0.9999297357323451, validation F1: 0.9969927310695462


In [None]:
headers = ['Model Variant', 'T-F1(C)', 'T-F1(D)', 'T-F1(O)', 'T-F1(P)', 'T-F1(S)', 'T-F1(T)', 'F1-Weighted']
model_name = ['FastText (CONDA Corpus)', 'FastText (CONDA Corpus) + POS', 'Domain Specific Embeddings']
all_res = [f1_class_ft, f1_class_ft_pos, f1_class_domain]
results = [[f'{res[0]*100:.2f}%', f'{res[1]*100:.2f}%', f'{res[2]*100:.2f}%', 
             f'{res[3]*100:.2f}%', f'{res[4]*100:.2f}%', f'{res[6]*100:.2f}%'] for res in all_res]
all_f1 = [f1_ft, f1_ft_pos, f1_domain]
[res.append(f'{f1*100:.2f}%') for res, f1 in zip(results, all_f1)]

table = [(model, res[0], res[1], res[2], res[3], res[4], res[5], res[6]) for model, res in zip(model_name, results)]

print('\n')
print('Table 1 Metrics: Emmbedding performance on baseline Bi-LSTM CRF Model')
print('\n')
print(tabulate(table, headers, tablefmt="github"))
print('\n')



Table 1 Metrics: Emmbedding performance on baseline Bi-LSTM CRF Model


| Model Variant                 | T-F1(C)   | T-F1(D)   | T-F1(O)   | T-F1(P)   | T-F1(S)   | T-F1(T)   | F1-Weighted   |
|-------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|---------------|
| FastText (CONDA Corpus)       | 99.62%    | 99.89%    | 98.58%    | 98.07%    | 97.95%    | 100.00%   | 99.57%        |
| FastText (CONDA Corpus) + POS | 99.65%    | 99.94%    | 98.64%    | 98.11%    | 97.95%    | 100.00%   | 99.57%        |
| Domain Specific Embeddings    | 99.68%    | 99.94%    | 98.86%    | 99.11%    | 98.21%    | 100.00%   | 99.70%        |




In [None]:
#save models
save_pytorch_model(baseline_ft, 'baseline_ft')
save_pytorch_model(baseline_ft_pos, 'baseline_ft_pos')
save_pytorch_model(baseline_domain, 'BEST_MODEL_baseline_domain')

### 1. Different Attention Strategy

In [None]:
# Note - All cells in Attention section in table of contents must be run before running below

In [None]:
# Dot Product
hidden_size = 50
embedding = nn.Embedding(len(word_to_ix), hidden_size)
encoder_dprod = EncoderRNN(hidden_size, embedding).to(device)
attn_decoder_dprod = AttnDecoderRNN(hidden_size, len(word_to_ix), embedding,"Dot Product").to(device)
trainIters(encoder_dprod, attn_decoder_dprod)
f1_att_dprod, f1_class_att_dprod = evaluate_attention(encoder_dprod, attn_decoder_dprod)

Epoch train: 0, Loss: 319447.7608731874
Epoch train: 1, Loss: 661008.0890895978


In [None]:
# Scale Dot Product
hidden_size = 50
embedding = nn.Embedding(len(word_to_ix), hidden_size)
encoder_sdp = EncoderRNN(hidden_size, embedding).to(device)
attn_decoder_sdp = AttnDecoderRNN(hidden_size, len(word_to_ix), embedding,"Scale Dot Product").to(device)
trainIters(encoder_sdp, attn_decoder_sdp)
f1_att_sdp, f1_class_att_sdp = evaluate_attention(encoder_sdp, attn_decoder_sdp)

Epoch train: 0, Loss: 183314.9353084668
Epoch train: 1, Loss: 353773.96675233793


In [None]:
# Cosine
hidden_size = 50
embedding = nn.Embedding(len(word_to_ix), hidden_size)
encoder_cos = EncoderRNN(hidden_size, embedding).to(device)
attn_decoder_cos = AttnDecoderRNN(hidden_size, len(word_to_ix), embedding,"Cosine").to(device)
trainIters(encoder_cos, attn_decoder_cos)
f1_att_cos, f1_class_att_cos = evaluate_attention(encoder_cos, attn_decoder_cos)

Epoch train: 0, Loss: 355756.92934421136
Epoch train: 1, Loss: 645679.6649882921


In [None]:
headers = ['Model Variant', 'T-F1(C)', 'T-F1(D)', 'T-F1(O)', 'T-F1(P)', 'T-F1(S)', 'T-F1(T)', 'F1-Weighted']
model_name = ['Attention - Dot Product', 'Attention - Scaled Dot Product', 'Attention - Cosine']
all_res = [f1_class_att_dprod, f1_class_att_sdp, f1_class_att_cos]
results = [[f'{res[0]*100:.2f}%', f'{res[1]*100:.2f}%', f'{res[2]*100:.2f}%', 
             f'{res[3]*100:.2f}%', f'{res[4]*100:.2f}%', f'{res[6]*100:.2f}%'] for res in all_res]
all_f1 = [f1_att_dprod, f1_att_sdp, f1_att_cos]
[res.append(f'{f1*100:.2f}%') for res, f1 in zip(results, all_f1)]

table = [(model, res[0], res[1], res[2], res[3], res[4], res[5], res[6]) for model, res in zip(model_name, results)]

print('\n')
print('Table 3 Metrics: GRU Attention Based Scores')
print('\n')
print(tabulate(table, headers, tablefmt="github"))
print('\n')



Table 3 Metrics: GRU Attention Based Scores


| Model Variant                  | T-F1(C)   | T-F1(D)   | T-F1(O)   | T-F1(P)   | T-F1(S)   | T-F1(T)   | F1-Weighted   |
|--------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|---------------|
| Attention - Dot Product        | 38.90%    | 10.96%    | 10.38%    | 3.01%     | 1.36%     | 13.93%    | 44.14%        |
| Attention - Scaled Dot Product | 35.82%    | 21.18%    | 1.36%     | 2.16%     | 0.00%     | 8.22%     | 38.34%        |
| Attention - Cosine             | 33.68%    | 5.80%     | 5.95%     | 9.15%     | 0.00%     | 10.71%    | 42.63%        |




In [None]:
# save models
save_pytorch_model(encoder_dprod, 'encoder_dprod')
save_pytorch_model(attn_decoder_dprod, 'attn_decoder_dprod')
save_pytorch_model(encoder_sdp, 'encoder_sdp')
save_pytorch_model(attn_decoder_sdp, 'attn_decoder_sdp')
# save_pytorch_model(encoder_cos, 'encoder_cos')
# save_pytorch_model(attn_decoder_cos, 'attn_decoder_cos')

### 2. Different Stacked layer

In [None]:
# Note - Stacked Seq2Seq section in table of contents must be run before running below

In [None]:
# 1 layers LSTM
model_bi_ltsm_s2s_1 = BiLSTM_Seq2Seq(vocab_size, tag_index, dim_size, 50, domain_embedding_matrix, 1).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model_bi_ltsm_s2s_1.parameters(), lr=0.1)
f1_s1, f1_class_s1  = train_stacks(optimizer, model_bi_ltsm_s2s_1)

Epoch: 1, train F1: 0.9962362394263239, validation F1: 0.9934645308777651
Epoch: 2, train F1: 0.99910722508622, validation F1: 0.9950956929487292


In [None]:
# 2 layers LSTM
model_bi_ltsm_s2s_2 = BiLSTM_Seq2Seq(vocab_size, tag_index, dim_size, 50, domain_embedding_matrix, 2).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model_bi_ltsm_s2s_2.parameters(), lr=0.1)
f1_s2, f1_class_s2  = train_stacks(optimizer, model_bi_ltsm_s2s_2)

Epoch: 1, train F1: 0.9945056230519578, validation F1: 0.9916305304132739
Epoch: 2, train F1: 0.9983135606993488, validation F1: 0.9941024199688637


In [None]:
headers = ['Model Variant', 'T-F1(C)', 'T-F1(D)', 'T-F1(O)', 'T-F1(P)', 'T-F1(S)', 'T-F1(T)', 'F1-Weighted']
model_name = ['Bi-LSTM - 1 Stack', 'Bi-LSTM - 2 Stack']
all_res = [f1_class_s1, f1_class_s2]
results = [[f'{res[0]*100:.2f}%', f'{res[1]*100:.2f}%', f'{res[2]*100:.2f}%', 
             f'{res[3]*100:.2f}%', f'{res[4]*100:.2f}%', f'{res[6]*100:.2f}%'] for res in all_res]
all_f1 = [f1_s1, f1_s2]
[res.append(f'{f1*100:.2f}%') for res, f1 in zip(results, all_f1)]

table = [(model, res[0], res[1], res[2], res[3], res[4], res[5], res[6]) for model, res in zip(model_name, results)]

print('\n')
print('Table 2 Metrics: Stacked Bi-LSTM Models')
print('\n')
print(tabulate(table, headers, tablefmt="github"))
print('\n')



Table 2 Metrics: Stacked Bi-LSTM Models


| Model Variant     | T-F1(C)   | T-F1(D)   | T-F1(O)   | T-F1(P)   | T-F1(S)   | T-F1(T)   | F1-Weighted   |
|-------------------|-----------|-----------|-----------|-----------|-----------|-----------|---------------|
| Bi-LSTM - 1 Stack | 99.59%    | 99.86%    | 98.37%    | 98.00%    | 97.34%    | 100.00%   | 99.51%        |
| Bi-LSTM - 2 Stack | 99.53%    | 99.75%    | 97.94%    | 97.64%    | 95.23%    | 100.00%   | 99.41%        |




In [None]:
# save models
save_pytorch_model(model_bi_ltsm_s2s_1, 'model_bi_ltsm_s2s_1')
save_pytorch_model(model_bi_ltsm_s2s_2, 'model_bi_ltsm_s2s_2')

### 3. With/without CRF 

In [None]:
headers = ['Model Variant', 'T-F1(C)', 'T-F1(D)', 'T-F1(O)', 'T-F1(P)', 'T-F1(S)', 'T-F1(T)', 'F1-Weighted']
model_name = ['Without CRF - Attention - Dot Product', 'With CRF - Attention - Dot Product']
all_res = [f1_class_att_dprod, [0,0,0,0,0,0,0]]
results = [[f'{res[0]*100:.2f}%', f'{res[1]*100:.2f}%', f'{res[2]*100:.2f}%', 
             f'{res[3]*100:.2f}%', f'{res[4]*100:.2f}%', f'{res[6]*100:.2f}%'] for res in all_res]
all_f1 = [f1_att_dprod, 'error']
[res.append(f1) for res, f1 in zip(results, all_f1)]

table = [(model, res[0], res[1], res[2], res[3], res[4], res[5], res[6]) for model, res in zip(model_name, results)]

print('\n')
print('Table 4 Metrics: With CRF Attachment')
print('\n')
print(tabulate(table, headers, tablefmt="github"))
print('\n')



Table 4 Metrics: With CRF Attachment


| Model Variant                         | T-F1(C)   | T-F1(D)   | T-F1(O)   | T-F1(P)   | T-F1(S)   | T-F1(T)   | F1-Weighted         |
|---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|---------------------|
| Without CRF - Attention - Dot Product | 38.90%    | 10.96%    | 10.38%    | 3.01%     | 1.36%     | 13.93%    | 0.44143093999186306 |
| With CRF - Attention - Dot Product    | 0.00%     | 0.00%     | 0.00%     | 0.00%     | 0.00%     | 0.00%     | error               |




## F. Kaggle Leaderboard Submission Format / Best Model

**NOTE**: First run *all* cells in the following sections before running code in this section (view table of contents in colab for best results):
1. .01 Dependencies & Setup
2. .02 Pre-processing
3. A. Input Embedding
4. B. Baseline Model

### Replication of Best Model: Due to stachastic optimisation outputs may vary to results in Kaggle

In [34]:
# Train Model
# Generate word, tag and vocabulary indexs and statistics
word_index, word_list, vocab_size = get_word_statistics(X_combine)
tag_index = get_tag_statistics(Y_combine)

train_input_index =  convert_to_idx(X_train, word_index)
train_output_index = convert_to_idx(Y_train, tag_index)
val_input_index = convert_to_idx(X_val, word_index)
val_output_index = convert_to_idx(Y_val, tag_index)
test_input_index = convert_to_idx(X_test ,word_index)

index_2_tag = dict([(value, key) for key, value in tag_index.items()])

# Embedding Matrix
domain_embedding_matrix, domain_dim_size = build_custom_ft_embedding_table(word_list, fast_text_dota_2v_model)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train for 2 epoch
HIDDEN_DIM = 50
BEST_MODEL_baseline_domain = BiLSTM_CRF(vocab_size, tag_index, domain_dim_size, HIDDEN_DIM, domain_embedding_matrix).to(device)
optimizer = optim.SGD(BEST_MODEL_baseline_domain.parameters(), lr=0.1)
f1_domain, f1_class_domain  = train_baseline(optimizer, BEST_MODEL_baseline_domain)

  


Epoch: 1, train F1: 0.9989868782670633, validation F1: 0.9959436309835752
Epoch: 2, train F1: 0.999899607376557, validation F1: 0.9967251948316809


In [35]:
# Using trained model save model and generate test set predictions
save_pytorch_model(BEST_MODEL_baseline_domain, 'BEST_MODEL_baseline_domain')
save_predictions_to_csv(make_test_preds(BEST_MODEL_baseline_domain, True)) # saves predictions to 281.csv

In [None]:
# load model
# For some reason saving and loading model generates very different results. Have tried loading state dict and full model
# suggest training model to validate kaggle score instead (see above)

In [None]:
word_index, word_list, vocab_size = get_word_statistics(X_combine)
tag_index = get_tag_statistics(Y_combine)

train_input_index =  convert_to_idx(X_train, word_index)
train_output_index = convert_to_idx(Y_train, tag_index)
val_input_index = convert_to_idx(X_val, word_index)
val_output_index = convert_to_idx(Y_val, tag_index)
test_input_index = convert_to_idx(X_test ,word_index)

index_2_tag = dict([(value, key) for key, value in tag_index.items()])

domain_embedding_matrix, domain_dim_size = build_custom_ft_embedding_table(word_list, fast_text_dota_2v_model)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = '1n6BlmP3zzOA6VE0SAlt7yX7f8T32VTOM'
downloaded = drive.CreateFile({'id':model_id}) 
downloaded.GetContentFile('best_model.pt')
model = BiLSTM_CRF(vocab_size, tag_index, domain_dim_size, HIDDEN_DIM, domain_embedding_matrix).to(device)
model.load_state_dict(torch.load("best_model.pt"))
model.train(False)
make_test_preds(model, True)
save_predictions_to_csv(make_test_preds(model, True)) # saves predictions to 281.csv

  
