<a href="https://colab.research.google.com/github/shaangao/neural-net-pos-tagging/blob/main/NNPOS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import joblib
import math
from copy import deepcopy

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns

## load raw datasets

### load tweets

In [2]:
# func to load_dataset into a list of lists of (word, tag) tuples (each inner list is a tweet)

def load_dataset(data_path):
  """Read a tab-separated POS-tagging file into tweets, a vocabulary and a tag set.

  Each non-blank line is expected to hold one ``word<TAB>tag`` pair; a blank
  line (exactly ``'\\n'``) closes the current tweet.

  Args:
    data_path: path of the ``.tsv`` file to read.

  Returns:
    A ``(tweets, vocab, tags)`` tuple: ``tweets`` is a list of tweets (each a
    list of ``(word, tag)`` tuples), ``vocab`` is the set of all words and
    ``tags`` is the set of all tags found in the file.

  Raises:
    ValueError: if a non-blank line does not split into exactly a word and a tag.
  """

  tweets = []
  vocab = set()
  tags = set()

  with open(data_path, 'r') as file:

    tweet = []

    for line in file:

      # a blank line closes the current tweet and starts a new one
      if line == '\n':
        tweets.append(tweet)
        tweet = []

      # otherwise, append the new word and tag to the current tweet as a tuple
      else:
        word, tag = line.strip('\n').split('\t')  # split string into word and tag
        vocab.add(word)
        tags.add(tag)
        tweet.append((word, tag))

  # keep the final tweet when the file does not end with a blank line
  # (previously those tokens were read into vocab/tags but the tweet itself was dropped)
  if tweet:
    tweets.append(tweet)

  return tweets, vocab, tags

In [3]:
# load datasets

twpos_train, vocab_train, tags_train = load_dataset(
    '/content/drive/MyDrive/postag/data/twpos-data/twpos-train.tsv'
)
twpos_dev, vocab_dev, tags_dev = load_dataset(
    '/content/drive/MyDrive/postag/data/twpos-data/twpos-dev.tsv'
)
twpos_devtest, vocab_devtest, tags_devtest = load_dataset(
    '/content/drive/MyDrive/postag/data/twpos-data/twpos-devtest.tsv'
)

# report the number of tweets and distinct words in each split
print(
    f'twpos_train: {len(twpos_train)}, vocab_train: {len(vocab_train)}\n'
    f'twpos_dev: {len(twpos_dev)}, vocab_dev: {len(vocab_dev)}\n'
    f'twpos_devtest: {len(twpos_devtest)}, vocab_devtest: {len(vocab_devtest)}'
)

twpos_train: 1173, vocab_train: 4420
twpos_dev: 327, vocab_dev: 1750
twpos_devtest: 327, vocab_devtest: 1705


In [4]:
# get all_vocab in train, dev, and devtest
# (sorted: the iteration order of a set of strings changes between Python processes,
#  so an unsorted list would give every word a different embedding row after each
#  kernel restart and make previously saved checkpoints inconsistent with the vocab)
all_vocab = sorted(vocab_train.union(vocab_dev).union(vocab_devtest))
all_vocab += ['<s>', '</s>']   # add beginning and end of sentence markers
print(len(all_vocab))

# get all_tags in train, dev, and devtest (sorted for the same reason)
all_tags = sorted(tags_train.union(tags_dev).union(tags_devtest))
print(len(all_tags))

5991
25


### load embeddings

In [5]:
# load pretrained embeddings

emb_pretrained_vocab = []
emb_pretrained = []

# each line of the embedding file is read as: <word> <value> <value> ...
with open('/content/drive/MyDrive/postag/data/twitter-embeddings.txt', 'r') as file:
  for line in file:
    fields = line.strip().split()
    emb_pretrained_vocab.append(fields[0])
    emb_pretrained.append([float(value) for value in fields[1:]])

# stack the per-word vectors into a single tensor (one row per word)
emb_pretrained = torch.tensor(emb_pretrained)
print(len(emb_pretrained_vocab), emb_pretrained.shape)

30001 torch.Size([30001, 50])


## construct data class with context window

### word & tag encoders

In [6]:
# func: get idx in emb matrix given a word
def get_word2idx(vocab_list):
  """Map every word in ``vocab_list`` to its position in the list.

  When a word occurs more than once, the position of its last occurrence wins.
  """
  return {word: position for position, word in enumerate(vocab_list)}

In [7]:
# word -> index lookups used to encode the words in context windows:
# one over the full dataset vocabulary, one over the pretrained-embedding vocabulary
word2idx_all_vocab, word2idx_emb_pretrained_vocab = map(
    get_word2idx, (all_vocab, emb_pretrained_vocab)
)

In [8]:
# label encoder used to turn target tags into integer class ids
le = LabelEncoder().fit(all_tags)
le

### data class

In [9]:
# reference: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html


class POSDataset(Dataset):
    """Fixed-window POS-tagging dataset.

    Every token of every tweet becomes one observation: the indices of the
    ``2 * w + 1`` words centred on that token (padded with sentence markers
    at the tweet boundaries) paired with the encoded tag of the centre token.

    Attributes set by ``__init__``:
        wins: torch tensor of window word indices, one row per token.
        center_words: numpy array holding the centre word of each window.
        tags: numpy array holding the tag string of each centre word.
        tags_encoded: torch tensor holding the encoded tag of each centre word.
    """


    def __init__(self, dataset:list, word2idx:dict, tag2idx:'LabelEncoder', w:int):

        """
        Build the windows and encoded targets.

        Args:
            dataset: list of tweets, each a list of ``(word, tag)`` tuples.
            word2idx: mapping from word to index. It must contain ``'</s>'``
                whenever padding after a tweet is needed, and ``'UUUNKKK'``
                whenever a tweet holds a word that is not in the mapping.
            tag2idx: fitted encoder whose ``transform`` turns the list of tag
                strings into integer ids.
            w: number of context words taken on each side of the centre word.

        Raises:
            KeyError: if a required marker (``'</s>'`` or ``'UUUNKKK'``) is
                missing from ``word2idx``.
        """

        wins = []
        center_words = []
        tags = []

        # encode context words in each window with idx in emb
        for tweet in dataset:

            last_pos = len(tweet) - 1

            # process every center word in each tweet
            for center, (word, tag) in enumerate(tweet):

                # center word for curr obs
                center_words.append(word)

                # target of curr obs
                tags.append(tag)

                # idx for words in context window
                win = []
                for pos in range(center - w, center + w + 1):
                    if pos < 0:   # before first token
                        # if '<s>' not in emb vocab, use emb for '</s>'
                        key = '<s>' if '<s>' in word2idx else '</s>'
                    elif pos > last_pos:    # after last token
                        key = '</s>'
                    else:
                        context_word = tweet[pos][0]
                        # use emb for unknown words when the word is not in the vocab
                        key = context_word if context_word in word2idx else 'UUUNKKK'
                    # explicit membership tests replace the former bare `except:`,
                    # so only a genuinely missing marker raises (as a KeyError)
                    win.append(word2idx[key])
                wins.append(win)

        # encode all target tags
        tags_encoded = tag2idx.transform(tags)

        # set attributes
        self.wins = torch.tensor(wins)
        self.center_words = np.array(center_words)
        self.tags_encoded = torch.tensor(tags_encoded)
        self.tags = np.array(tags)


    def __len__(self):
        """Number of observations (one per token)."""
        return len(self.wins)


    def __getitem__(self, idx):
        """Return the ``(window indices, encoded tag)`` pair at ``idx``."""
        return self.wins[idx], self.tags_encoded[idx]

### instantiate encoded datasets

In [10]:
# encode datasets


# helper: encode the train / dev / devtest splits with one vocabulary and one window size
def _encode_splits(word2idx, w):
    return tuple(
        POSDataset(dataset=split, word2idx=word2idx, tag2idx=le, w=w)
        for split in (twpos_train, twpos_dev, twpos_devtest)
    )


# all vocab encoding, w = 0 then w = 1
train_w0_allvocab, dev_w0_allvocab, devtest_w0_allvocab = _encode_splits(word2idx_all_vocab, w=0)
train_w1_allvocab, dev_w1_allvocab, devtest_w1_allvocab = _encode_splits(word2idx_all_vocab, w=1)

# pretrained 30k vocab encoding, w = 0 then w = 1
train_w0_30k, dev_w0_30k, devtest_w0_30k = _encode_splits(word2idx_emb_pretrained_vocab, w=0)
train_w1_30k, dev_w1_30k, devtest_w1_30k = _encode_splits(word2idx_emb_pretrained_vocab, w=1)

## 1.1 baseline neural network tagger

### model architecture

In [11]:
# references:
# - https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html
# - https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
# - https://discuss.pytorch.org/t/how-to-create-mlp-model-with-arbitrary-number-of-hidden-layers/13124/2
# - https://www.deeplearningwizard.com/deep_learning/practical_pytorch/pytorch_feedforward_neuralnetwork/
# - https://machinelearningmastery.com/activation-functions-in-pytorch/


class FeedForwardNN(nn.Module):
    """Feed-forward tagger over the concatenated embeddings of a context window.

    The embedding table is either taken from ``pretrained_emb`` or created with
    shape ``(vocab_size, emb_dim)`` and initialised uniformly in [-0.01, 0.01].
    It is followed by one linear layer per entry of ``layer_sizes`` (the last
    one being the output layer); linear weights are initialised uniformly in
    [-0.01, 0.01] and biases are zeroed. ``layer_acts[i]`` is applied after
    the i-th linear layer.
    """


    def __init__(self, w, vocab_size, emb_dim, layer_sizes:list, layer_acts:list, pretrained_emb=None, emb_freeze=False):

        super().__init__()

        # embedding table: random init unless pretrained vectors are supplied
        if pretrained_emb is None:
            self.emb = nn.Embedding(vocab_size, emb_dim)
            self.emb.weight.data.uniform_(-0.01, 0.01)
        else:
            self.emb = nn.Embedding.from_pretrained(pretrained_emb, freeze=emb_freeze)

        # actual embedding width (taken from the table, so pretrained vectors decide it)
        self.emb_dim = self.emb.weight.shape[1]

        # input width of every linear layer: the concatenated window first,
        # then the output width of the preceding layer
        fan_ins = [(1 + 2 * w) * self.emb_dim] + list(layer_sizes[:-1])

        # linear layers (the last one is the output layer)
        self.layers = nn.ModuleList()
        for fan_in, fan_out in zip(fan_ins, layer_sizes):
            linear = nn.Linear(fan_in, fan_out)
            linear.weight.data.uniform_(-0.01, 0.01)
            linear.bias.data.zero_()
            self.layers.append(linear)

        # activation applied after each linear layer
        self.layer_acts = layer_acts


    def forward(self, x):
        """Map a batch of window indices to one row of output scores per window."""

        # look up the window embeddings and lay them side by side, one row per observation
        hidden = self.emb(x).view(x.shape[0], -1)

        # linear layer followed by its activation, layer by layer
        for depth, layer in enumerate(self.layers):
            hidden = self.layer_acts[depth](layer(hidden))

        return hidden

### train and eval func

In [12]:
# run one epoch of training

def train1epoch(model, optimizer, criterion, train_dataloader):
    """Run one pass over ``train_dataloader``, updating ``model`` after every batch.

    Args:
        model: module to train (switched to training mode here).
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(model_output, targets)``.
        train_dataloader: iterable yielding ``(inputs, targets)`` batches.

    Returns:
        The sum of the per-batch loss values over the epoch (also printed).
    """

    model.train()

    running_loss = 0

    for inputs, targets in train_dataloader:

        # clear gradients left over from the previous batch
        optimizer.zero_grad()

        # forward pass, loss, backward pass, parameter update
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    print(f'  epoch loss: {running_loss}')
    return running_loss

In [13]:
# eval

def eval(model, eval_data):
    """Compute and print the accuracy of ``model`` on ``eval_data``.

    Args:
        model: module to evaluate (switched to eval mode here).
        eval_data: object exposing ``wins`` (model inputs) and
            ``tags_encoded`` (true class ids), e.g. a ``POSDataset``.

    Returns:
        The fraction of observations whose highest-scoring class equals the true class.
    """

    model.eval()

    # no gradients are needed for scoring, which reduces memory consumption
    with torch.no_grad():
        predictions = model(eval_data.wins).argmax(dim=1)
        n_correct = (predictions == eval_data.tags_encoded).sum().item()
        n_total = len(eval_data.tags_encoded)
        yaccu = n_correct / n_total

    print(f'  accuracy: {yaccu}')
    return yaccu

### train & eval wrapper

In [14]:
# wrapper for train & eval
# reference: https://pytorch.org/tutorials/beginner/saving_loading_models.html

def main_process(model, name, optimizer, criterion, train_data, batch_size, shuffle, val_data, test_data, max_epochs=10, early_stopping=3, checkpoint_dir='/content/drive/MyDrive/postag/models/'):
    """Train ``model``, checkpoint the best epoch on the dev set, and score it on the test set.

    After every epoch the model is evaluated on ``train_data`` and ``val_data``.
    Whenever the dev score improves, the model's ``state_dict`` is saved to
    ``<checkpoint_dir>/<name>_best.pth.tar``. After training, the best checkpoint
    is loaded back into ``model`` (in place) and evaluated on ``test_data``.

    Args:
        model: module to train; modified in place.
        name: checkpoint file name prefix.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss function passed to ``train1epoch``.
        train_data: training dataset (batched with a ``DataLoader``, and also
            evaluated whole with ``eval``).
        batch_size: mini-batch size for the training ``DataLoader``.
        shuffle: whether the training ``DataLoader`` shuffles each epoch.
        val_data: dataset used for model selection and early stopping.
        test_data: dataset the best model is finally evaluated on.
        max_epochs: maximum number of training epochs.
        early_stopping: stop once the dev score has not improved for this many
            consecutive epochs; ``None`` disables early stopping.
        checkpoint_dir: directory the best checkpoint is written to.

    Returns:
        ``(epoch_losses, train_evals, dev_evals, devtest_eval)``: the per-epoch
        training losses, per-epoch train scores, per-epoch dev scores, and the
        score of the best model on ``test_data``.
    """
    import os   # local import: this notebook's import cell does not import os

    checkpoint_path = os.path.join(checkpoint_dir, name + '_best.pth.tar')

    # create batched iterator for train_data
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)

    # initialize vars: track metrics
    epoch_losses = []
    train_evals = []
    dev_evals = []

    # initialize vars: track best model
    # (-inf rather than 0 so the first epoch is always checkpointed, even at 0 dev
    #  accuracy; otherwise no checkpoint from THIS run would exist and the load
    #  below would fail or pick up a stale file left by an earlier run)
    best_dev_eval = float('-inf')
    best_model_epoch = -1

    # train and eval
    for epoch in range(max_epochs):

        print(f'epoch {epoch+1}')

        # train
        epoch_loss = train1epoch(
                          model=model,
                          optimizer=optimizer,
                          criterion=criterion,
                          train_dataloader=train_dataloader
                    )
        epoch_losses.append(epoch_loss)

        # eval on training set
        train_eval = eval(model=model, eval_data=train_data)
        train_evals.append(train_eval)

        # eval on dev set
        dev_eval = eval(model=model, eval_data=val_data)
        dev_evals.append(dev_eval)

        # update best model based on dev eval
        if dev_eval > best_dev_eval:

            # save state_dict of best model so far
            torch.save(model.state_dict(), checkpoint_path)

            # update which epoch best_model is from
            best_model_epoch = epoch

            # update best dev score
            best_dev_eval = dev_eval

        print(f'  best model from epoch {best_model_epoch+1}')

        # early stopping based on dev eval
        if early_stopping is not None and epoch - best_model_epoch >= early_stopping:
            print('========= EARLY STOPPING =========')
            break

    # load state_dict of best model (modifies input model in place);
    # skipped when no epoch ran, because then this run wrote no checkpoint
    if best_model_epoch >= 0:
        print('load best model')
        model.load_state_dict(torch.load(checkpoint_path))

    # eval best model on devtest set
    print('eval best model on devtest')
    devtest_eval = eval(model=model, eval_data=test_data)

    return epoch_losses, train_evals, dev_evals, devtest_eval


### run model: w=0, all vocab

In [15]:
# baseline tagger, w=0, all vocab (random init): one 128-unit tanh hidden layer
tagger_w0 = FeedForwardNN(
    w=0,
    vocab_size=len(all_vocab),
    emb_dim=50,
    layer_sizes=[128, len(all_tags)],        # hidden layer, then the output layer
    layer_acts=[nn.Tanh(), nn.Identity()],   # identity on the output: nn.CrossEntropyLoss() already includes softmax transformation
    pretrained_emb=None,
    emb_freeze=False,
)

# plain SGD over all model parameters
sgd = optim.SGD(tagger_w0.parameters(), lr=0.02)

# train with dev-based model selection, then score the best model on devtest
epoch_losses, train_evals, dev_evals, devtest_eval = main_process(
    model=tagger_w0,
    name='tagger_w0',                  # file name prefix of the checkpoint
    optimizer=sgd,
    criterion=nn.CrossEntropyLoss(),   # objective: log loss
    train_data=train_w0_allvocab,
    batch_size=1,
    shuffle=True,
    val_data=dev_w0_allvocab,
    test_data=devtest_w0_allvocab,
    max_epochs=20,
    early_stopping=3,                  # stop when dev eval doesn't improve for 3 consecutive epochs
)

epoch 1
  epoch loss: 46347.013986587524
  accuracy: 0.13736135434909516
  accuracy: 0.13690105787181084
  best model from epoch 1
epoch 2
  epoch loss: 34884.007500339765
  accuracy: 0.5943374197314653
  accuracy: 0.5627463181912467
  best model from epoch 2
epoch 3
  epoch loss: 23141.247784628024
  accuracy: 0.7304728546409808
  accuracy: 0.6755859780128604
  best model from epoch 3
epoch 4
  epoch loss: 15073.23813212157
  accuracy: 0.8451838879159369
  accuracy: 0.750674133997096
  best model from epoch 4
epoch 5
  epoch loss: 10207.699771025225
  accuracy: 0.9008756567425569
  accuracy: 0.7678904791537026
  best model from epoch 5
epoch 6
  epoch loss: 7751.990728748206
  accuracy: 0.9253356684179802
  accuracy: 0.7751503837378138
  best model from epoch 6
epoch 7
  epoch loss: 6622.862369486495
  accuracy: 0.9164623467600701
  accuracy: 0.7749429578925534
  best model from epoch 6
epoch 8
  epoch loss: 5879.283340677779
  accuracy: 0.914302393461763
  accuracy: 0.764986517320058

With a window size of 0, the best tagging accuracy on DEV is 77.52% from epoch 6; this best model has a tagging accuracy of 78.72% on DEVTEST.   
(In the cell outputs above, the first accuracy score in each epoch is the accuracy on TRAIN, and the second accuracy score in each epoch is the accuracy on DEV.)

### run model: w=1, all vocab

In [16]:
# baseline tagger, w=1, all vocab (random init): one 128-unit tanh hidden layer
tagger_w1 = FeedForwardNN(
    w=1,
    vocab_size=len(all_vocab),
    emb_dim=50,
    layer_sizes=[128, len(all_tags)],        # hidden layer, then the output layer
    layer_acts=[nn.Tanh(), nn.Identity()],   # identity on the output: nn.CrossEntropyLoss() already includes softmax transformation
    pretrained_emb=None,
    emb_freeze=False,
)

# plain SGD over all model parameters
sgd = optim.SGD(tagger_w1.parameters(), lr=0.02)

# train with dev-based model selection, then score the best model on devtest
epoch_losses, train_evals, dev_evals, devtest_eval = main_process(
    model=tagger_w1,
    name='tagger_w1',                  # file name prefix of the checkpoint
    optimizer=sgd,
    criterion=nn.CrossEntropyLoss(),   # objective: log loss
    train_data=train_w1_allvocab,
    batch_size=1,
    shuffle=True,
    val_data=dev_w1_allvocab,
    test_data=devtest_w1_allvocab,
    max_epochs=20,
    early_stopping=3,                  # stop when dev eval doesn't improve for 3 consecutive epochs
)

epoch 1
  epoch loss: 46249.25761397183
  accuracy: 0.16351430239346176
  accuracy: 0.15308027380211575
  best model from epoch 1
epoch 2
  epoch loss: 26344.955107870977
  accuracy: 0.7582603619381203
  accuracy: 0.716863721219664
  best model from epoch 2
epoch 3
  epoch loss: 12880.483928764414
  accuracy: 0.8875072971395213
  accuracy: 0.7660236465463597
  best model from epoch 3
epoch 4
  epoch loss: 7795.16006897964
  accuracy: 0.925569176882662
  accuracy: 0.791537025513379
  best model from epoch 4
epoch 5
  epoch loss: 5983.9946808077175
  accuracy: 0.9436660828955049
  accuracy: 0.7994192076332711
  best model from epoch 5
epoch 6
  epoch loss: 4676.601857609816
  accuracy: 0.9543490951546993
  accuracy: 0.8043974279195187
  best model from epoch 6
epoch 7
  epoch loss: 3786.175114842786
  accuracy: 0.9607705779334501
  accuracy: 0.7940261356565028
  best model from epoch 6
epoch 8
  epoch loss: 2924.863934574468
  accuracy: 0.9640396964389959
  accuracy: 0.7969300974901473
 

With a window size of 1, the best tagging accuracy on DEV is 80.44% from epoch 6; this best model has a tagging accuracy of 82.07% on DEVTEST.   
(In the cell outputs above, the first accuracy score in each epoch is the accuracy on TRAIN, and the second accuracy score in each epoch is the accuracy on DEV.)

## 1.2 feature engineering

In [25]:
# check the errors the best tagger so far made on DEV

# get a mask for correct predictions
# (inference only: run the forward pass under no_grad so no autograd graph is built)
with torch.no_grad():
    ycorrect_mask = torch.eq(
        torch.argmax(tagger_w1(dev_w1_allvocab.wins), dim=1),  # pred
        dev_w1_allvocab.tags_encoded  # true
    )

# get center words with wrong pred
# (center_words is a numpy array, so convert the torch mask to numpy explicitly
#  instead of relying on an implicit tensor -> array conversion when indexing)
yerror = dev_w1_allvocab.center_words[~ycorrect_mask.numpy()]

In [77]:
# scratch check: raw output scores of `best_model` on the w=0 DEV windows
# NOTE(review): `best_model` is only assigned further down in this notebook (the manual
# w=0 training cell that keeps a deepcopy of the best model), so this cell fails on a
# fresh Restart & Run All; move it below that cell or remove it
best_model(dev_w0_allvocab.wins)

tensor([[ 14.9545,   7.8195,  -3.3007,  ...,   4.8590,   5.0649,  -7.5874],
        [ -4.2738,  -6.1849,   4.4028,  ...,  -7.7645,   5.7791, -10.8850],
        [ 11.2687,   4.8644,   1.0903,  ...,   0.6950,   9.4577, -15.1235],
        ...,
        [ -1.0855,  -1.5283,   4.3029,  ...,  -1.9557,   5.5269,  -6.1479],
        [ -7.5095,   0.5310,   4.1376,  ...,   1.6492,   1.7842,  11.8887],
        [  5.6387,   2.1828,   3.6610,  ...,  -0.0710,   8.9411, -10.3145]],
       grad_fn=<AddmmBackward0>)

In [78]:
# scratch check: raw output scores of `tagger_w0` on the w=0 DEV windows
# NOTE(review): `tagger_w0` is re-created by several cells of this notebook, so which
# model is scored here depends on the order the cells were executed in
tagger_w0(dev_w0_allvocab.wins)

tensor([[ 13.7580,   7.4531,  -0.7973,  ...,   3.3517,   0.4652,  -9.2348],
        [ -4.9620,  -5.3379,   2.0708,  ...,  -6.7571,   7.6977,  -6.9138],
        [  8.4014,   5.4101,   1.4895,  ...,  -0.2391,   7.4384, -13.2695],
        ...,
        [  0.6930,  -2.8784,   3.5722,  ...,  -2.9129,   5.8869,  -7.7423],
        [ -7.5352,  -0.0978,   1.8020,  ...,   3.1513,   5.3963,  11.3258],
        [  4.7839,   2.3918,   2.8345,  ...,  -0.3393,   9.4155,  -9.6965]],
       grad_fn=<AddmmBackward0>)

In [79]:
# scratch check: DEV accuracy of `best_model` (w=0, all vocab)
# NOTE(review): `best_model` is only assigned further down in this notebook, so this
# cell fails on a fresh Restart & Run All
eval(best_model, dev_w0_allvocab)

  accuracy: 0.7718315702136486


0.7718315702136486

In [80]:
# scratch check: DEV accuracy of whichever `tagger_w0` is currently in the kernel (w=0, all vocab)
# NOTE(review): `tagger_w0` is re-created by several cells, so the result depends on execution order
eval(tagger_w0, dev_w0_allvocab)

  accuracy: 0.7641568139390168


0.7641568139390168

In [64]:
# TODO
# - model state saving
# - additional feature

In [67]:
# Manual train / eval loop for the w=0, all-vocab tagger.
# This cell does by hand what main_process does, except that the best model is kept
# in memory as a deepcopy instead of being checkpointed to disk, and there is no
# early stopping. It re-binds the notebook-level names `tagger_w0` and `sgd`.

# wrap encoded datasets in dataloaders: w = 0, all vocab
train_w0_allvocab_dataloader = DataLoader(train_w0_allvocab, batch_size=1, shuffle=True)


# instantiate model w=0, all vocab
tagger_w0 = FeedForwardNN(w=0, vocab_size=len(all_vocab), emb_dim=50,
                          layer_sizes=[128, len(all_tags)],
                          layer_acts=[nn.Tanh(), nn.Identity()],   # nn.CrossEntropyLoss() already includes softmax transformation
                          pretrained_emb=None, emb_freeze=False)


# instantiate optimizer
sgd = optim.SGD(tagger_w0.parameters(), lr=0.02)



# train and eval
max_epochs = 10

# per-epoch metric history
epoch_losses = []
train_accus = []
dev_accus = []

# best-model tracking; best_model starts as an alias of tagger_w0 and is replaced by a
# deepcopy the first time the dev accuracy exceeds best_dev_accu
best_dev_accu = 0
best_model = tagger_w0
best_model_epoch = -1

for epoch in range(max_epochs):

    print(f'epoch {epoch+1}')

    # train
    epoch_loss = train1epoch(
                      model=tagger_w0,
                      optimizer=sgd,
                      criterion=nn.CrossEntropyLoss(),
                      train_dataloader=train_w0_allvocab_dataloader
                 )
    epoch_losses.append(epoch_loss)

    # eval on training set
    train_accu = eval(model=tagger_w0, eval_data=train_w0_allvocab)
    train_accus.append(train_accu)

    # eval on dev set
    dev_accu = eval(model=tagger_w0, eval_data=dev_w0_allvocab)
    dev_accus.append(dev_accu)

    # update best model (only on a strict improvement of the dev accuracy)
    if dev_accu > best_dev_accu:
        best_model = deepcopy(tagger_w0)  # snapshot the weights so later epochs cannot change them
        best_model_epoch = epoch  # update which epoch best_model is from
        best_dev_accu = dev_accu  # update best_dev_accu
        print(f'  best model from epoch {best_model_epoch+1}')


# eval best model on devtest set
print(f'best model on devtest')
devtest_accu = eval(model=best_model, eval_data=devtest_w0_allvocab)


epoch 1
  epoch loss: 46368.374962091446
  accuracy: 0.11722124927028604
  accuracy: 0.10371292263015972
  best model from epoch 1
epoch 2
  epoch loss: 34184.30587075767
  accuracy: 0.6823701109165208
  accuracy: 0.6533914125700062
  best model from epoch 2
epoch 3
  epoch loss: 18312.380577613396
  accuracy: 0.8474022183304145
  accuracy: 0.7307612528521054
  best model from epoch 3
epoch 4
  epoch loss: 12563.383699736762
  accuracy: 0.8927028604786924
  accuracy: 0.7542003733665215
  best model from epoch 4
epoch 5
  epoch loss: 9912.72281500377
  accuracy: 0.9061879743140688
  accuracy: 0.765608794855839
  best model from epoch 5
epoch 6
  epoch loss: 8242.175429728086
  accuracy: 0.9116170461179218
  accuracy: 0.7703795892968264
  best model from epoch 6
epoch 7
  epoch loss: 6970.244936785886
  accuracy: 0.9151196730881495
  accuracy: 0.7670607757726613
epoch 8
  epoch loss: 6286.66907957042
  accuracy: 0.9092819614711033
  accuracy: 0.7539929475212611
epoch 9
  epoch loss: 5822

In [None]:
# # wrap encoded datasets in dataloaders


# # w = 0, all vocab
# train_w0_allvocab_dataloader = DataLoader(train_w0_allvocab, batch_size=1, shuffle=True)
# dev_w0_allvocab_dataloader = DataLoader(dev_w0_allvocab, batch_size=1, shuffle=True)
# devtest_w0_allvocab_dataloader = DataLoader(devtest_w0_allvocab, batch_size=1, shuffle=True)

# # w = 1, all vocab
# train_w1_allvocab_dataloader = DataLoader(train_w1_allvocab, batch_size=1, shuffle=True)
# dev_w1_allvocab_dataloader = DataLoader(dev_w1_allvocab, batch_size=1, shuffle=True)
# devtest_w1_allvocab_dataloader = DataLoader(devtest_w1_allvocab, batch_size=1, shuffle=True)


# # w = 0, pretrained 30k
# train_w0_30k_dataloader = DataLoader(train_w0_30k, batch_size=1, shuffle=True)
# dev_w0_30k_dataloader = DataLoader(dev_w0_30k, batch_size=1, shuffle=True)
# devtest_w0_30k_dataloader = DataLoader(devtest_w0_30k, batch_size=1, shuffle=True)

# # w = 1, pretrained 30k
# train_w1_30k_dataloader = DataLoader(train_w1_30k, batch_size=1, shuffle=True)
# dev_w1_30k_dataloader = DataLoader(dev_w1_30k, batch_size=1, shuffle=True)
# devtest_w1_30k_dataloader = DataLoader(devtest_w1_30k, batch_size=1, shuffle=True)

In [None]:
# # temb = nn.Embedding.from_pretrained(emb_pretrained, freeze=True)
# temb = nn.Embedding(len(all_vocab), 50)
# temb.weight.data.uniform_(-0.01, 0.01)
# temb.weight
# # print(temb.weight.shape)

# curr_input = Xtrain_w1[0].unsqueeze(0)
# # curr_input = Xtrain_w1[:1]
# print(curr_input.shape)
# concatemb = temb(curr_input).view((curr_input.shape[0], -1))
# print(concatemb.shape)

Parameter containing:
tensor([[ 1.0425e-03, -5.8820e-03, -8.6644e-03,  ..., -7.9195e-03,
         -3.7283e-03,  7.7972e-03],
        [ 6.4962e-03, -6.2669e-03, -3.7780e-03,  ...,  2.8969e-03,
          7.5352e-03, -4.0234e-03],
        [-9.9592e-03, -4.3420e-03, -5.1301e-03,  ...,  4.6443e-04,
         -6.4169e-03, -6.4338e-03],
        ...,
        [ 5.9787e-03,  1.8511e-03,  8.4176e-03,  ...,  6.5441e-03,
         -5.4045e-03, -2.6694e-03],
        [ 8.0650e-03, -3.3898e-03,  4.1751e-03,  ..., -3.1981e-04,
          5.9205e-03,  3.9072e-03],
        [-5.9091e-03, -6.2013e-03, -2.4727e-03,  ..., -3.6859e-03,
          5.9371e-04, -7.3646e-05]], requires_grad=True)

In [None]:
# concatemb_flat = concatemb.view((concatemb.shape[0], -1))
# concatemb_flat.shape

torch.Size([2, 150])

In [None]:
# scratch run, w=0, all vocab: plain training loop without checkpointing or early stopping
tagger_w0 = FeedForwardNN(
    w=0, vocab_size=len(all_vocab), emb_dim=50,
    layer_sizes=[128, len(all_tags)],
    layer_acts=[nn.Tanh(), nn.Identity()],   # nn.CrossEntropyLoss() already includes softmax transformation
    pretrained_emb=None, emb_freeze=False,
)

# optimizer and loss func
sgd = optim.SGD(tagger_w0.parameters(), lr=0.02)
criterion = nn.CrossEntropyLoss()

max_epochs = 10
for epoch in range(max_epochs):

    print(f'epoch {epoch+1}')

    # one pass over the training mini-batches
    tagger_w0.train()
    epoch_loss = 0
    for xtrain_batch, ytrain_batch in train_w0_allvocab_dataloader:
        sgd.zero_grad()   # zero the gradient buffers
        output = tagger_w0(xtrain_batch)
        loss = criterion(output, ytrain_batch)
        loss.backward()
        sgd.step()   # does the update
        epoch_loss += loss.item()
    print(f'  epoch loss: {epoch_loss}')

    # accuracy on the training set (eval mode, no gradient tracking)
    tagger_w0.eval()
    with torch.no_grad():
        ytrain_pred = tagger_w0(train_w0_allvocab.wins).argmax(dim=1)
        ytrain_correct = (ytrain_pred == train_w0_allvocab.tags_encoded).sum().item()
        ytrain_total = len(train_w0_allvocab.tags_encoded)
        ytrain_accu = ytrain_correct / ytrain_total
    print(f'  train accu: {ytrain_accu}')

    # accuracy on the dev set (the model is still in eval mode)
    with torch.no_grad():
        ydev_pred = tagger_w0(dev_w0_allvocab.wins).argmax(dim=1)
        ydev_correct = (ydev_pred == dev_w0_allvocab.tags_encoded).sum().item()
        ydev_total = len(dev_w0_allvocab.tags_encoded)
        ydev_accu = ydev_correct / ydev_total
    print(f'  dev accu: {ydev_accu}')


# eval best model on devtest set



epoch 1
  epoch loss: 46359.56888842583
  train accu: 0.1514302393461763
  dev accu: 0.1557768097904999
epoch 2
  epoch loss: 36511.22461539088
  train accu: 0.6244016345592528
  dev accu: 0.6050611906243518
epoch 3
  epoch loss: 20647.9830187872
  train accu: 0.7897256275539989
  dev accu: 0.7313835303878863
epoch 4
  epoch loss: 13142.984029729269
  train accu: 0.883420899007589
  dev accu: 0.751296411532877
epoch 5
  epoch loss: 9927.773577833897
  train accu: 0.9056625802685347
  dev accu: 0.7635345364032359
epoch 6
  epoch loss: 8590.880470508913
  train accu: 0.9119089316987741
  dev accu: 0.7624974071769343
epoch 7
  epoch loss: 7707.652470903413
  train accu: 0.9249270286047869
  dev accu: 0.7712092926778676
epoch 8
  epoch loss: 6816.784264039488
  train accu: 0.9195563339171045
  dev accu: 0.7660236465463597
epoch 9
  epoch loss: 6216.87197396993
  train accu: 0.9262697022767076
  dev accu: 0.7778469197261979
epoch 10
  epoch loss: 5744.520718641375
  train accu: 0.9217746643

In [None]:
# scratch run, w=1, all vocab: plain training loop without checkpointing or early stopping

# batched iterator over the w=1 training windows
# (defined here because its only other definition in this notebook sits in a
#  commented-out cell, which made this cell fail with a NameError on a fresh kernel)
train_w1_allvocab_dataloader = DataLoader(train_w1_allvocab, batch_size=1, shuffle=True)

# instantiate model w=1
tagger_w1 = FeedForwardNN(w=1, vocab_size=len(all_vocab), emb_dim=50,
                          layer_sizes=[128, len(all_tags)],
                          layer_acts=[nn.Tanh(), nn.Identity()],   # nn.CrossEntropyLoss() already includes softmax transformation
                          pretrained_emb=None, emb_freeze=False)


# instantiate optimizer
sgd = optim.SGD(tagger_w1.parameters(), lr=0.02)
# set loss func
criterion = nn.CrossEntropyLoss()


max_epochs = 10
for epoch in range(max_epochs):


    # run 1 epoch
    print(f'epoch {epoch+1}')

    tagger_w1.train()  # turn on training mode
    epoch_loss = 0


    for xtrain_batch, ytrain_batch in train_w1_allvocab_dataloader:

        sgd.zero_grad()   # zero the gradient buffers
        output = tagger_w1(xtrain_batch)
        loss = criterion(output, ytrain_batch)
        loss.backward()
        sgd.step()   # does the update

        epoch_loss += loss.item()

    print(f'  epoch loss: {epoch_loss}')


    # eval on training set
    tagger_w1.eval()  # turn on eval mode
    with torch.no_grad():  # turn off gradient calc to reduce memory consumption
        ytrain_pred = torch.argmax(tagger_w1(train_w1_allvocab.wins), dim=1)
        ytrain_correct = torch.sum(torch.eq(ytrain_pred, train_w1_allvocab.tags_encoded)).item()
        ytrain_total = len(train_w1_allvocab.tags_encoded)
        ytrain_accu = ytrain_correct / ytrain_total
    print(f'  train accu: {ytrain_accu}')

    # eval on dev set (the model is still in eval mode)
    with torch.no_grad():  # turn off gradient calc to reduce memory consumption
        ydev_pred = torch.argmax(tagger_w1(dev_w1_allvocab.wins), dim=1)
        ydev_correct = torch.sum(torch.eq(ydev_pred, dev_w1_allvocab.tags_encoded)).item()
        ydev_total = len(dev_w1_allvocab.tags_encoded)
        ydev_accu = ydev_correct / ydev_total
    print(f'  dev accu: {ydev_accu}')


# eval best model on devtest set



epoch 1
  epoch loss: 36041.978227217594
  train accu: 0.5828371278458844
  dev accu: 0.5718730553827007
epoch 2
  epoch loss: 16047.907971054286
  train accu: 0.8486281377699941
  dev accu: 0.7600082970338105
epoch 3
  epoch loss: 9350.579022063437
  train accu: 0.9163455925277292
  dev accu: 0.796100394109106
epoch 4
  epoch loss: 6425.25555942486
  train accu: 0.9440163455925278
  dev accu: 0.8048122796100394
epoch 5
  epoch loss: 4911.637813769848
  train accu: 0.9469352014010508
  dev accu: 0.7981746525617092
epoch 6
  epoch loss: 4130.616831530634
  train accu: 0.9600700525394046
  dev accu: 0.8025305953121759
epoch 7
  epoch loss: 3199.3462827699595
  train accu: 0.9615878575598366
  dev accu: 0.7987969300974902
epoch 8
  epoch loss: 2535.824724089276
  train accu: 0.9718038528896672
  dev accu: 0.8029454470026965
epoch 9
  epoch loss: 2146.7702989513787
  train accu: 0.9804436660828955
  dev accu: 0.7998340593237917
epoch 10
  epoch loss: 1670.3467638214952
  train accu: 0.9821

In [None]:
# scratch run, w=1, fixed pretrained embeddings: plain training loop without
# checkpointing or early stopping

# batched iterator over the w=1 training windows encoded with the pretrained 30k vocab
# (defined here because its only other definition in this notebook sits in a
#  commented-out cell, which made this cell fail with a NameError on a fresh kernel)
train_w1_30k_dataloader = DataLoader(train_w1_30k, batch_size=1, shuffle=True)

# instantiate model w=1, fixed pretrained embedding
tagger_w1_fixedpretrained = FeedForwardNN(w=1, vocab_size=len(emb_pretrained_vocab), emb_dim=50,
                          layer_sizes=[128, len(all_tags)],
                          layer_acts=[nn.Tanh(), nn.Identity()],   # nn.CrossEntropyLoss() already includes softmax transformation
                          pretrained_emb=emb_pretrained, emb_freeze=True)


# instantiate optimizer
sgd = optim.SGD(tagger_w1_fixedpretrained.parameters(), lr=0.02)
# set loss func
criterion = nn.CrossEntropyLoss()


max_epochs = 10
for epoch in range(max_epochs):


    # run 1 epoch
    print(f'epoch {epoch+1}')

    tagger_w1_fixedpretrained.train()  # turn on training mode
    epoch_loss = 0


    for xtrain_batch, ytrain_batch in train_w1_30k_dataloader:

        sgd.zero_grad()   # zero the gradient buffers
        output = tagger_w1_fixedpretrained(xtrain_batch)
        loss = criterion(output, ytrain_batch)
        loss.backward()
        sgd.step()   # does the update

        epoch_loss += loss.item()

    print(f'  epoch loss: {epoch_loss}')


    # eval on training set
    tagger_w1_fixedpretrained.eval()  # turn on eval mode
    with torch.no_grad():  # turn off gradient calc to reduce memory consumption
        ytrain_pred = torch.argmax(tagger_w1_fixedpretrained(train_w1_30k.wins), dim=1)
        ytrain_correct = torch.sum(torch.eq(ytrain_pred, train_w1_30k.tags_encoded)).item()
        ytrain_total = len(train_w1_30k.tags_encoded)
        ytrain_accu = ytrain_correct / ytrain_total
    print(f'  train accu: {ytrain_accu}')

    # eval on dev set (the model is still in eval mode)
    with torch.no_grad():  # turn off gradient calc to reduce memory consumption
        ydev_pred = torch.argmax(tagger_w1_fixedpretrained(dev_w1_30k.wins), dim=1)
        ydev_correct = torch.sum(torch.eq(ydev_pred, dev_w1_30k.tags_encoded)).item()
        ydev_total = len(dev_w1_30k.tags_encoded)
        ydev_accu = ydev_correct / ydev_total
    print(f'  dev accu: {ydev_accu}')


# eval best model on devtest set



epoch 1
  epoch loss: 16369.005858714794
  train accu: 0.8449503794512551
  dev accu: 0.8344741754822651
epoch 2
  epoch loss: 9320.458782576461
  train accu: 0.8655575014594279
  dev accu: 0.8477494295789255
epoch 3
  epoch loss: 8422.924471781236
  train accu: 0.8650321074138938
  dev accu: 0.8471271520431446
epoch 4
  epoch loss: 7934.014341046579
  train accu: 0.880502043199066
  dev accu: 0.8574984443061605
epoch 5
  epoch loss: 7540.962700240924
  train accu: 0.8785755983654407
  dev accu: 0.8531425015556938
epoch 6
  epoch loss: 7192.139447728374
  train accu: 0.8861646234676007
  dev accu: 0.8514830947936113
epoch 7
  epoch loss: 6820.51915438598
  train accu: 0.8892002335084647
  dev accu: 0.8581207218419415
epoch 8
  epoch loss: 6506.891840068924
  train accu: 0.8973730297723292
  dev accu: 0.8657954781165733
epoch 9
  epoch loss: 6227.353143249253
  train accu: 0.906713368359603
  dev accu: 0.8684920141049575
epoch 10
  epoch loss: 5917.889376106414
  train accu: 0.909573847

In [None]:
# Display the embedding weights of the w=1 model that was built with emb_freeze=True.
# NOTE(review): presumably shown so it can be compared against `emb_pretrained`
# to check that the frozen embeddings were not changed by training — confirm.
tagger_w1_fixedpretrained.emb.weight

Parameter containing:
tensor([[ 8.0050e-03,  8.8390e-03, -7.6610e-03,  ...,  3.3940e-03,
          4.0300e-04,  2.6620e-03],
        [ 2.0712e-01, -3.1345e-02,  9.1379e-02,  ...,  1.5570e-01,
         -6.7304e-02, -2.5445e-02],
        [-3.2129e-01,  5.0717e-02,  2.1766e-01,  ..., -1.6058e-01,
          5.7263e-02,  3.8416e-01],
        ...,
        [-8.6111e-01,  3.4025e-01,  1.6014e-01,  ...,  4.0112e-01,
         -3.7185e-01, -4.0742e-01],
        [-1.0521e-01, -2.7858e-01,  4.9961e-01,  ..., -2.5552e-01,
         -2.2169e-01,  5.1709e-01],
        [-2.3829e-01,  5.1810e-02,  3.3805e-01,  ...,  1.7722e-01,
         -6.0529e-02, -1.6389e-01]])

In [None]:
# Display the pretrained embedding tensor that is passed as `pretrained_emb`
# to the w=1 pretrained-embedding models (both the emb_freeze=True and emb_freeze=False runs).
emb_pretrained

tensor([[ 8.0050e-03,  8.8390e-03, -7.6610e-03,  ...,  3.3940e-03,
          4.0300e-04,  2.6620e-03],
        [ 2.0712e-01, -3.1345e-02,  9.1379e-02,  ...,  1.5570e-01,
         -6.7304e-02, -2.5445e-02],
        [-3.2129e-01,  5.0717e-02,  2.1766e-01,  ..., -1.6058e-01,
          5.7263e-02,  3.8416e-01],
        ...,
        [-8.6111e-01,  3.4025e-01,  1.6014e-01,  ...,  4.0112e-01,
         -3.7185e-01, -4.0742e-01],
        [-1.0521e-01, -2.7858e-01,  4.9961e-01,  ..., -2.5552e-01,
         -2.2169e-01,  5.1709e-01],
        [-2.3829e-01,  5.1810e-02,  3.3805e-01,  ...,  1.7722e-01,
         -6.0529e-02, -1.6389e-01]])

In [None]:
# Build the w=1 tagger for the "fine-tuned pretrained embedding" run:
# `emb_pretrained` is passed as `pretrained_emb` together with emb_freeze=False.
tagger_w1_tunedpretrained = FeedForwardNN(
    w=1,
    vocab_size=len(emb_pretrained_vocab),
    emb_dim=50,
    layer_sizes=[128, len(all_tags)],
    # output layer stays linear: nn.CrossEntropyLoss() already includes softmax transformation
    layer_acts=[nn.Tanh(), nn.Identity()],
    pretrained_emb=emb_pretrained,
    emb_freeze=False,
)

# optimizer (plain SGD over the model's parameters) and training objective
sgd = optim.SGD(tagger_w1_tunedpretrained.parameters(), lr=0.02)
criterion = nn.CrossEntropyLoss()

max_epochs = 10
for epoch in range(max_epochs):
    print(f'epoch {epoch+1}')

    # ---- training pass: one optimizer step per mini-batch ----
    tagger_w1_tunedpretrained.train()
    epoch_loss = 0  # running sum of the per-batch loss values
    for xtrain_batch, ytrain_batch in train_w1_30k_dataloader:
        sgd.zero_grad()  # clear gradients accumulated by the previous step
        output = tagger_w1_tunedpretrained(xtrain_batch)
        loss = criterion(output, ytrain_batch)
        loss.backward()
        sgd.step()  # apply the parameter update
        epoch_loss += loss.item()
    print(f'  epoch loss: {epoch_loss}')

    # ---- evaluation pass: accuracy on train_w1_30k, then on dev_w1_30k ----
    tagger_w1_tunedpretrained.eval()
    with torch.no_grad():  # gradients are not needed while scoring

        # predicted class = index of the largest logit along dim 1
        ytrain_pred = tagger_w1_tunedpretrained(train_w1_30k.wins).argmax(dim=1)
        ytrain_correct = (ytrain_pred == train_w1_30k.tags_encoded).sum().item()
        ytrain_total = len(train_w1_30k.tags_encoded)
        ytrain_accu = ytrain_correct / ytrain_total
        print(f'  train accu: {ytrain_accu}')

        ydev_pred = tagger_w1_tunedpretrained(dev_w1_30k.wins).argmax(dim=1)
        ydev_correct = (ydev_pred == dev_w1_30k.tags_encoded).sum().item()
        ydev_total = len(dev_w1_30k.tags_encoded)
        ydev_accu = ydev_correct / ydev_total
        print(f'  dev accu: {ydev_accu}')


# TODO: eval best model on devtest set (no code for this step follows in this cell)



epoch 1
  epoch loss: 14670.984842691923
  train accu: 0.891009924109749
  dev accu: 0.8550093341630367
epoch 2
  epoch loss: 6710.139597005856
  train accu: 0.9273204903677759
  dev accu: 0.8682845882596971
epoch 3
  epoch loss: 4973.363695524482
  train accu: 0.9360186806771745
  dev accu: 0.8624766645924082
epoch 4
  epoch loss: 4005.304377012074
  train accu: 0.9389375364856976
  dev accu: 0.8651732005807924
epoch 5
  epoch loss: 3292.6318096834343
  train accu: 0.9558669001751313
  dev accu: 0.8695291433312591
epoch 6
  epoch loss: 2835.4244742415253
  train accu: 0.9660828955049621
  dev accu: 0.8732628085459448
epoch 7
  epoch loss: 2353.4613265826006
  train accu: 0.9709281961471103
  dev accu: 0.8579132959966812
epoch 8
  epoch loss: 2089.4233974509316
  train accu: 0.9732049036777584
  dev accu: 0.8660029039618337
epoch 9
  epoch loss: 1726.4095906020575
  train accu: 0.9776999416228839
  dev accu: 0.8572910184609003
epoch 10
  epoch loss: 1580.652004197805
  train accu: 0.98

In [None]:
# Display the embedding weights of the w=1 model that was built with emb_freeze=False.
# NOTE(review): presumably shown to see how far training moved the embeddings
# away from `emb_pretrained` — confirm.
tagger_w1_tunedpretrained.emb.weight

Parameter containing:
tensor([[ 5.5893e-02, -2.1001e-01, -2.9847e-02,  ..., -1.1455e-01,
         -1.9788e-01,  2.1989e-01],
        [ 2.8332e-01, -2.4272e-01, -6.2081e-03,  ...,  2.9925e-01,
         -6.3570e-02,  5.4027e-02],
        [-4.9772e-01, -1.5044e-01, -1.3602e-02,  ..., -2.1467e-01,
         -2.3371e-03,  4.4186e-01],
        ...,
        [-8.6111e-01,  3.4025e-01,  1.6014e-01,  ...,  4.0112e-01,
         -3.7185e-01, -4.0742e-01],
        [-1.0521e-01, -2.7858e-01,  4.9961e-01,  ..., -2.5552e-01,
         -2.2169e-01,  5.1709e-01],
        [-4.5604e-03,  8.8334e-02,  1.7233e-01,  ..., -4.9501e-04,
          4.8545e-02,  1.7906e-01]], requires_grad=True)