# Data And Input

## Data Download and Load

In [69]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate
drive = None
def authenticate():
    """Authenticate the Colab user and bind a PyDrive client to the global `drive`.

    Runs the Colab authentication flow, attaches the application-default
    Google credentials to a fresh `GoogleAuth` object and stores the resulting
    `GoogleDrive` client in the module-level `drive` variable.
    """
    global drive
    auth.authenticate_user()
    google_auth = GoogleAuth()
    google_auth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(google_auth)

#Download files
def downloadFiles(fileIds):
    """Download Google Drive files into the working directory.

    Args:
        fileIds: iterable of `[local_file_name, drive_file_id]` entries.

    Re-authenticates first (which rebinds the global `drive` client), then
    fetches every entry's content into its local file name.
    """
    authenticate()
    for entry in fileIds:
        remote_file = drive.CreateFile({"id": entry[1]})
        local_name = entry[0]
        remote_file.GetContentFile(local_name)

In [70]:
# Download every required file that is not already present locally.
#
# The existence check uses os.path.isfile instead of open()/bare-except so that
# no file handle is leaked, unrelated errors (e.g. KeyboardInterrupt) are not
# swallowed, and IPython's `_` (last output) is not overwritten.
import os

# [local file name, Google Drive file id]
REQUIRED_FILES = [
    ["train.csv", "1pRTJ3aTh1yZV2ZN7Fof2NSt1c137SzYw"],
    ["val.csv", "1khO0wHBC8bBzLVH4G09NhKTHHfAwBIna"],
    ["test.csv", "1-E3dhTaMhG5oRKS9HGF9ZTTzsOKc_D14"],
    ["dota.csv", "1DCsO0uICmtabGiy8MQAYPl0xeDwhFP5S"],
    # Pre-trained domain embedding, dimension 50.
    # The dimension-25 variant has the id "1KHFu7kJBwlopIqlKrVaujvRt1AoJzcbt".
    ["domain.model", "14zkvDnEk4dZZ4vzdK1RJbpGAgw_G_rty"],
]

missing_files = [entry for entry in REQUIRED_FILES if not os.path.isfile(entry[0])]
if missing_files:
    # A single call authenticates once and fetches everything that is missing.
    downloadFiles(missing_files)


In [71]:
import pandas as pd


def read_data(file_name, test=False):
    """Read a CSV split and tokenize it on whitespace.

    Args:
        file_name: path of a CSV file with a `sents` column and, unless
            `test` is truthy, a `labels` column.
        test: when truthy, only the sentences are returned.

    Returns:
        `input_data` (list of lower-cased token lists) when `test` is truthy,
        otherwise the tuple `(input_data, target_data)` where `target_data`
        holds the whitespace-split label sequences.
    """
    frame = pd.read_csv(file_name)

    input_data = [sentence.lower().split() for sentence in frame['sents'].tolist()]
    if test:
        return input_data

    target_data = [label_line.split() for label_line in frame['labels'].tolist()]
    return input_data, target_data

# Load the three splits; only the train and validation files are read with labels.
train_data, target_y_train = read_data('train.csv')
validation_data, target_y_validation = read_data('val.csv')
test_data = read_data('test.csv', True)

# Quick sanity check: corpus size, then one tokenized sentence and its tags.
print(len(train_data))
print(type(train_data[2]))
print(train_data[2])
print(target_y_train[2])

26078
<class 'list'>
['wpe', 'wpe']
['O', 'O']


## Data Preprocessing

#### Generate word_to_ix and tag_to_ix

In [72]:
# Vocabulary: every distinct (lower-cased) token of all three splits gets the
# next free integer index, in order of first appearance.
word_to_ix = {}
for sentence in train_data+validation_data+test_data:
    for word in sentence:
        word_to_ix.setdefault(word.lower(), len(word_to_ix))
word_list = list(word_to_ix)

# Tag set: the two CRF boundary tags take indices 0 and 1, followed by every
# label seen in the train and validation targets.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
for tags in target_y_train+target_y_validation:
    for tag in tags:
        tag_to_ix.setdefault(tag, len(tag_to_ix))

## Input Embedding

### Aspect 1) Syntactic Textual Feature Embedding: PoS tag information, Dependency Path, etc.


In [73]:
import spacy
import numpy as np
import en_core_web_sm

nlp = spacy.load("en_core_web_sm")

# Fine-grained PoS tag of each vocabulary word, tagged in isolation.
labels = [nlp(word)[0].tag_ for word in word_list]

# Sort the tag inventory so that the one-hot column assigned to each tag is
# reproducible: plain set iteration order for strings can change between
# kernel restarts, which would silently permute the feature columns.
unique_labels = sorted(set(labels))
syntactic_embedding_dim = len(unique_labels)

# one-hot encoding: tag -> vector with a single 1 at the tag's column
syntactic_labels = dict()
for idx, label in enumerate(unique_labels):
    one_hot = [0]*syntactic_embedding_dim
    one_hot[idx] = 1
    syntactic_labels[label] = one_hot

# Every label comes from `labels`, so the lookup cannot miss and no fallback
# row is needed.
syntactic_embedding_matrix = np.array([syntactic_labels[label] for label in labels])
syntactic_embedding_matrix.shape

(11243, 38)

### Aspect 2) Semantic Textual Feature Embedding: Word Embeddings (Word2Vec, ELMO, etc.)

In [74]:
import numpy as np
import gensim.downloader as api

word_emb_model = api.load("glove-twitter-200")

semantic_embedding_dim = 200

# One row per vocabulary word, in `word_list` order. Words missing from the
# pre-trained vectors get an all-zero row.
#
# The vectors are indexed directly (not through `.wv`) and only KeyError is
# caught: with a bare `except`, any other failure - for instance `.wv` not
# existing on the loaded object - would silently turn every row into zeros.
semantic_embedding_matrix = []
for word in word_list:
    try:
        semantic_embedding_matrix.append(word_emb_model[word])
    except KeyError:
        semantic_embedding_matrix.append([0]*semantic_embedding_dim)
semantic_embedding_matrix = np.array(semantic_embedding_matrix)
semantic_embedding_matrix.shape



  # This is added back by InteractiveShellApp.init_path()


(11243, 200)

### Aspect 3) Domain Feature Embedding: a custom feature embedding for the in-game chat word slot filling (tagging) task.

In [75]:
# Domain corpus: the non-null entries of the `key` column of dota.csv.
dota = pd.read_csv('dota.csv').dropna(subset=['key'])
corpus = dota['key'].tolist()

# Import libraries (the punkt download must run before word_tokenize is used)
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import FastText

# Prepare the training corpus for the Gensim FastText model: a list of
# lower-cased token lists, one per corpus entry.
sentences=[]
sentences=[word_tokenize(sentence.lower()) for sentence in corpus]

# Train a skip-gram (sg=1) FastText model with 200-dimensional vectors.
# The earlier Word2Vec variant is kept below for reference.
# NOTE(review): `size=` looks like the gensim 3.x keyword (later versions use
# `vector_size`) - confirm against the installed gensim version.
# domain_model = Word2Vec(sentences=sentences, size=50, window=2, min_count=1, workers=2, sg=1)
domain_model = FastText(sentences=sentences, size=200, window=2, min_count=1, workers=2, sg=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [76]:
# Save and load model
# domain_model.save("domain.model")
# from gensim.models import Word2Vec
# domain_model = Word2Vec.load("domain.model")

In [77]:
domain_embedding_dim = 200

# One row per vocabulary word, in `word_list` order, taken from the FastText
# model trained above. Words the model cannot produce a vector for get an
# all-zero row.
#
# Vectors are read through `domain_model.wv` and only KeyError is caught:
# indexing the model object itself inside a bare `except` would turn every
# row into zeros without any visible error if that indexing is unsupported.
domain_embedding_matrix = []
for word in word_list:
    try:
        domain_embedding_matrix.append(domain_model.wv[word])
    except KeyError:
        domain_embedding_matrix.append([0]*domain_embedding_dim)
domain_embedding_matrix = np.array(domain_embedding_matrix)
domain_embedding_matrix.shape

  


(11243, 200)

## Generate Embedding Matrix

In [78]:
# Final input embedding = semantic (GloVe) columns followed by domain
# (FastText) columns, concatenated per vocabulary word.
# Alternative 1 (disabled): also prepend the syntactic one-hot features.
# EMBEDDING_DIM = syntactic_embedding_dim + semantic_embedding_dim + domain_embedding_dim
# embedding_matrix = np.concatenate((syntactic_embedding_matrix, semantic_embedding_matrix, domain_embedding_matrix), axis=1)
# embedding_matrix.shape
EMBEDDING_DIM = semantic_embedding_dim + domain_embedding_dim
embedding_matrix = np.concatenate((semantic_embedding_matrix, domain_embedding_matrix), axis=1)
embedding_matrix.shape
# Alternative 2 (disabled): domain embedding only.
# EMBEDDING_DIM = domain_embedding_dim
# embedding_matrix = domain_embedding_matrix
# embedding_matrix.shape

(11243, 400)

## Convert dataset into indices

In [79]:
def to_index(data, to_ix):
    """Map every item of every sequence in `data` through the `to_ix` lookup.

    Args:
        data: iterable of sequences (sentences of words, or tag sequences).
        to_ix: mapping from item to integer index.

    Returns:
        A list of index lists with the same nesting as `data`.

    Raises:
        KeyError: if an item is missing from `to_ix`.
    """
    return [[to_ix[token] for token in sequence] for sequence in data]

# Convert every split from tokens/tags to integer indices.
train_input_index =  to_index(train_data,word_to_ix)
train_output_index = to_index(target_y_train,tag_to_ix)
val_input_index = to_index(validation_data,word_to_ix)
val_output_index = to_index(target_y_validation,tag_to_ix)
test_input_index = to_index(test_data,word_to_ix)
# Train and validation concatenated (train first); used by the training loop.
train_val_input_index = train_input_index + val_input_index
train_val_output_index = train_output_index + val_output_index
# test_output_index = to_index(target_y_test,tag_to_ix)

# Model

## Attention

In [80]:
import torch
import torch.nn as nn
class CosineAttention(nn.Module):
    """Attention whose scores are the cosine similarity between queries and keys.

    Args:
        dropout_rate: dropout probability applied to the attention weights.
        eps: added to the vector norms to avoid division by zero.
        **kwargs: accepted and ignored.
    """

    def __init__(self, dropout_rate=0.0, eps=1e-10, **kwargs):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.eps = eps

    def forward(self, q, k, v, attn_mask=None):
        """Compute attention-weighted values.

        Args:
            q: queries, (B, T_q, D).
            k: keys, (B, T_k, D).
            v: values, (B, T_k, D_v).
            attn_mask: optional boolean tensor broadcastable to (B, T_q, T_k);
                True marks positions that must receive zero weight. A row that
                is masked everywhere yields NaN weights.

        Returns:
            Tuple `(output, attention)` of shapes (B, T_q, D_v) and (B, T_q, T_k).
        """
        q_norm = q / (q.norm(p=2, dim=-1, keepdim=True) + self.eps)  # (B, T_q, D)
        k_norm = k / (k.norm(p=2, dim=-1, keepdim=True) + self.eps)  # (B, T_k, D)
        attention = torch.bmm(q_norm, k_norm.permute(0, 2, 1))  # (B, T_q, T_k)
        if attn_mask is not None:
            # Out-of-place fill: masked positions become -inf before the softmax.
            attention = attention.masked_fill(attn_mask, float("-inf"))
        # torch.softmax keeps this cell independent of the `F` alias, which is
        # only imported in a later cell.
        attention = torch.softmax(attention, dim=-1)
        attention = self.dropout(attention)
        output = attention.bmm(v)  # (B, T_q, D_v)
        return output, attention

## Slot Filling/Tagging model

In [81]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# from flair.data import Sentence

torch.manual_seed(1)

def argmax(vec):
    """Return, as a Python int, the column index of the maximum of a (1, N) tensor."""
    return torch.max(vec, 1)[1].item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """Numerically stable log(sum(exp(vec))) for a (1, N) tensor.

    The row maximum is subtracted before exponentiating and added back
    afterwards, so large scores do not overflow.
    """
    peak = vec[0, argmax(vec)]
    shifted = vec - peak.view(1, -1).expand(1, vec.size()[1])
    return peak + torch.log(torch.sum(torch.exp(shifted)))

class BiLSTM_CRF(nn.Module):
    """Bidirectional GRU tagger with an optional CRF layer.

    Word indices are embedded (initialized from the notebook-level
    `embedding_matrix`), encoded by a single-layer bidirectional GRU and
    projected to per-tag emission scores. With `crf=True` the best tag
    sequence is found by Viterbi decoding over a learned transition matrix;
    otherwise the highest-scoring tag is chosen independently per token.

    The model processes one sentence at a time (batch size 1). All helper
    tensors are created on the device of the model / its inputs, so the class
    does not depend on a notebook-level `device` variable.

    Relies on names defined elsewhere in the notebook: `embedding_matrix`,
    `START_TAG`, `STOP_TAG`, `argmax` and `log_sum_exp`.

    Note: attention variants (a Transformer encoder and cosine attention over
    the GRU outputs) were tried during development and are not part of this
    model.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, crf=True):
        """
        Args:
            vocab_size: number of rows of the embedding table.
            tag_to_ix: mapping tag -> index; must contain START_TAG and STOP_TAG.
            embedding_dim: width of the embedding; must match `embedding_matrix`.
            hidden_dim: total GRU output width (hidden_dim // 2 per direction).
            crf: whether `forward` decodes with Viterbi (True) or per-token max.
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.crf = crf

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        # Use the pre-computed embedding matrix as the initial weights.
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))

        # Kept under the attribute name `lstm` although the encoder is a GRU.
        self.lstm = nn.GRU(embedding_dim, hidden_dim // 2,
                           num_layers=1, bidirectional=True)

        # Maps the output of the GRU into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a random initial GRU state of shape (2, 1, hidden_dim // 2).

        The state is drawn anew on every call (also during inference, since
        `_get_lstm_features` calls this each time), so outputs are not strictly
        deterministic across calls. It is placed on the model's device.
        """
        return torch.randn(2, 1, self.hidden_dim // 2).to(self.transitions.device)

    def _forward_alg(self, feats):
        """Log partition function over all tag sequences for emissions `feats` (L, tagset_size)."""
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000., device=feats.device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """Return emission scores (L, tagset_size) for a 1-D tensor of word indices.

        Shape legend: L sequence length, N batch size (always 1 here),
        H_in input size, H_out hidden size per direction.
        """
        self.hidden = self.init_hidden()

        # sentence: (L,) -> embeds: (L, N, H_in)
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)

        # lstm_out: (L, N, 2 * H_out); self.hidden: (2, N, H_out) - a GRU has a
        # single state tensor, not an (h, c) pair.
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        # Drop the batch dimension: (L, hidden_dim)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        # Project to tag space: (L, tagset_size)
        lstm_feats = self.hidden2tag(lstm_out)

        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Score of the given tag sequence `tags` (1-D long tensor) under emissions `feats`."""
        # Gives the score of a provided tag sequence
        score = torch.zeros(1, device=feats.device)
        start = torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long,
                             device=tags.device)
        tags = torch.cat([start, tags])

        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Return `(path_score, best_path)` for emissions `feats`; `best_path` is a list of tag indices."""
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000., device=feats.device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """CRF training loss: log partition function minus the gold sequence score."""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Predict tags for a 1-D tensor of word indices.

        Returns:
            `(score, tag_seq)`: with `crf=True`, the Viterbi path score and
            best path; otherwise `score` is None and `tag_seq` is the
            per-token highest-scoring tag. `tag_seq` is a list of tag indices.
        """
        # Get the emission scores from the encoder
        lstm_feats = self._get_lstm_features(sentence)

        if self.crf:
            # Find the best path, given the features.
            score, tag_seq = self._viterbi_decode(lstm_feats)
        else:
            tag_seq = torch.max(F.softmax(lstm_feats, dim=1), dim=1).indices.tolist()
            score = None

        return score, tag_seq

#### Function for accuracy

In [82]:
def cal_acc(model, input_index, output_index):
    """Token-level accuracy of `model` on an indexed data set.

    Args:
        model: module whose call returns `(score, predicted_tag_indices)` for
            a 1-D long tensor of word indices.
        input_index: list of word-index lists, one per sentence.
        output_index: list of gold tag-index lists aligned with `input_index`.

    Returns:
        `(predicted, ground_truth, accuracy)`: the flattened predictions, the
        flattened gold tags and the fraction of matching positions
        (0.0 when there are no tokens at all).
    """
    # Run on whatever device the model lives on instead of relying on a
    # notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    ground_truth = []
    predicted = []
    # Evaluation only: skip autograd bookkeeping.
    with torch.no_grad():
        for i, idxs in enumerate(input_index):
            ground_truth += output_index[i]
            score, pred = model(torch.tensor(idxs, dtype=torch.long).to(model_device))
            predicted += pred

    if not ground_truth:
        return predicted, ground_truth, 0.0
    accuracy = np.sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return predicted, ground_truth, accuracy
# def cal_acc(model, input_index, output_index):
#     ground_truth = []
#     predicted = []
#     for i,idxs in enumerate(input_index):
#         ground_truth += output_index[i]
#         score, pred = model(idxs)
#         predicted += pred
#     accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
#     return predicted, ground_truth, accuracy

#### Initialize model

In [83]:
# Use the GPU when one is available; `device` is read by the training loop
# and other cells of this notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 100

# CRF decoding is enabled by default (crf=True).
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
# HIDDEN_DIM = 818
# EMBEDDING_DIM = 818
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
# Plain SGD with L2 weight decay; the AdamW alternative is kept below for reference.
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
# optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4)

#### Train the model

In [84]:
"""Each epoch will take about 1-2 minutes"""

import datetime

# NOTE(review): the model is trained on train + validation
# (`train_val_input_index`), so the validation loss/accuracy reported below is
# measured on sentences seen during training and is not a held-out estimate.
for epoch in range(2):
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_val_input_index):
        tags_index = train_val_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    # Evaluation needs no gradients: disabling autograd avoids building a
    # graph for every sentence and does not change the reported numbers.
    with torch.no_grad():
        # Call the cal_acc functions you implemented as required
        _, _, train_acc = cal_acc(model,train_input_index,train_output_index)
        _, _, val_acc = cal_acc(model,val_input_index,val_output_index)

        val_loss = 0
        for i, idxs in enumerate(val_input_index):
            tags_index = val_output_index[i]
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)
            loss = model.neg_log_likelihood(sentence_in, targets)
            val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

# The log below is the sample output for this section
# Please make sure you keep your own running log for submission

Epoch:1, Training loss: 10681.41, train acc: 0.9947, val loss: 509.13, val acc: 0.9964, time: 1210.00s
Epoch:2, Training loss: 2198.06, train acc: 0.9979, val loss: 195.85, val acc: 0.9992, time: 1192.55s


In [85]:
# for i, idxs in enumerate(range(0,511,100)):
#   seq_len = min(100, 511 - 1 - idxs)
#   print(i,idxs,seq_len,511 - 1 - idxs)
#   print()

In [86]:
import os
import torch
# Persist the trained model to the file 'model' and immediately load it back,
# then switch the reloaded model to evaluation mode.
# NOTE(review): the whole module object is saved (not just a state_dict), so
# loading presumably requires the BiLSTM_CRF class definition to be available
# - confirm before reusing the file outside this notebook.
torch.save(model,'model')
model = torch.load('model')
model.eval()

BiLSTM_CRF(
  (word_embeds): Embedding(11243, 400)
  (lstm): GRU(400, 50, bidirectional=True)
  (hidden2tag): Linear(in_features=100, out_features=9, bias=True)
)

## Test

In [87]:
# Call the cal_acc functions you implemented as required

def test(model, input_index):
    predicted = []
    for i,idxs in enumerate(input_index):
        score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        predicted += pred
    return predicted

# Flat list of predicted tag indices for all test sentences.
y_pred = test(model,test_input_index)

# def test(model, input_index):
#     predicted = []
#     for i,idxs in enumerate(input_index):
#         score, pred = model(test_data[i])
#         predicted += pred
#     return predicted

# y_pred = test(model,test_data)



In [88]:
# Number of test sentences that were converted to index lists.
len(test_input_index)

500

In [89]:
def decode_output(output_list):
    """Translate tag indices back to tag strings using the inverse of `tag_to_ix`."""
    ix_to_tag = dict((ix, tag) for tag, ix in tag_to_ix.items())
    return list(map(ix_to_tag.__getitem__, output_list))

# Tag strings for the flat list of test predictions.
y_pred_decode = decode_output(y_pred)

In [90]:
# Flatten the decoded predictions into one list of tag tokens.
results = []
for decoded_tag in y_pred_decode:
    results.extend(decoded_tag.split())

In [91]:
# Number of flattened predicted tags.
len(results)

2326

In [92]:
# Prepend the CSV header for the prediction column.
# Not idempotent: re-running this cell inserts a second header entry.
results.insert(0, 'Predicted')

In [93]:
# Length again, now including the header entry.
len(results)

2327

In [94]:
# Row ids 0..N-1, one per predicted tag. The count is derived from the decoded
# predictions instead of a hard-coded 2326 so the cell keeps working when the
# test set changes. (The name `id` shadows the builtin but is kept because
# later cells refer to it.)
id = list(range(len(y_pred_decode)))

In [95]:
# Prepend the CSV header for the id column.
# Not idempotent: re-running this cell inserts a second header entry.
id.insert(0, 'Id')

In [98]:
import csv
# NOTE: this import rebinds the notebook-level name `drive` that
# authenticate() assigns; authenticate() sets it again before every download,
# so downloadFiles() keeps working.
from google.colab import drive
drive.mount('/content/gdrive')

# Write one (id, prediction) row per entry; the first row is the header pair
# inserted earlier. The `with` block guarantees the file is flushed and closed
# even if writing fails, and newline="" is what the csv module expects so that
# it controls line endings itself.
with open("/content/gdrive/My Drive/COMP5046/2022-comp5046-a2/sample_200+200+fasttext+sgd+lr=0.01+crf+true.csv", "w", newline="") as file:
    writer = csv.writer(file)
    for w in range(len(results)):
        writer.writerow([id[w], results[w]])

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Slot Filling/Tagging Model

### Baseline model

### Stacked Seq2Seq model

### Attention

### CRF Attachment

# Testing and Evaluation