# Data Preparation

In [454]:
def read_file(file_path):
    fileobj = open(file_path, 'r', encoding='utf-8')
    samples = []
    tokens = []
    tags = []

    for content in fileobj:
 
        content = content.strip('\n')

        if content == '' or content == '\t':
            if len(tokens) != 0 :
                samples.append((tokens, tags))
                tokens = []
                tags = []
        else:
            contents = content.split('\t')
            tokens.append(contents[0])
            tags.append(contents[-1])
                
    return samples



In [455]:
path = '.\emerging_entities_17-master\emerging_entities_17-master\wnut17train.conll'
train = read_file(path)

path2 = '.\emerging_entities_17-master\emerging_entities_17-master\emerging.test.conll'
test = read_file(path2)

In [456]:
length = len(test)
for i in range(length):
    
    l = len(test[i][1])
    for j in range(l):
        la = test[i][1][j].split(',')[0]
        test[i][1][j]=la
#test[7]

# Q1a. Named Entity Recognition (NER) using CRF

In [457]:
def word2features2(sent, i):
    
    word = sent[0][i]
    #postag = sent[i][1]


    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        #'postag': postag,
        #'postag[:2]': postag[:2],
    }
    
    if i > 0:
        word1 = sent[0][i-1]
        #postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            #'-1:postag': postag1,
            #'-1:postag[:2]': postag1[:2],
        })
    else:
        # Indicate that it is the 'beginning of a document'
        features['BOS'] = True
        
    
    if i < len(sent[0])-1:
        word1 = sent[0][i+1]
        #postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            #'+1:postag': postag1,
            #'+1:postag[:2]': postag1[:2],
        })
    else:
        # Features for words that are not at the end of a document
        features['EOS'] = True

    return features

In [458]:
def sent2features2(sent):
    j=len(sent[0])
    return [word2features2(sent, i) for i in range(j)]

In [459]:
def sent2labels2(sent):
    return sent[1]

### Feature Extraction:

In [460]:
# sentence representations for sequence labelling
X_train = [sent2features2(s) for s in train]
y_train = [sent2labels2(s) for s in train]

X_test = [sent2features2(s) for s in test]
y_test = [sent2labels2(s) for s in test]

### training

In [461]:
# train CRF model
!pip install sklearn_crfsuite
import sklearn_crfsuite
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)





In [462]:
crf.fit(X_train, y_train)
# training model parameters

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

### Evaluation

In [463]:
labels = ['B-location',
 'B-product',
 'O',
 'I-corporation',
 'I-creative-work',
 'I-person',
 'B-person',
 'B-corporation',
 'I-group',
 'I-product',
 'B-creative-work',
 'B-group',
 'I-location']

In [464]:
# evaluate CRF model
from sklearn_crfsuite import metrics

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.9164468422523492

### Inspect per-class results in more detail:

In [465]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

                 precision    recall  f1-score   support

              O      0.945     0.997     0.970     21934
  B-corporation      0.000     0.000     0.000       117
  I-corporation      0.000     0.000     0.000        24
B-creative-work      0.455     0.022     0.041       231
I-creative-work      0.474     0.038     0.070       237
        B-group      0.200     0.031     0.054        97
        I-group      0.267     0.103     0.148        39
     B-location      0.294     0.164     0.211       122
     I-location      0.300     0.154     0.203        39
       B-person      0.500     0.101     0.169       355
       I-person      0.462     0.178     0.257       101
      B-product      0.000     0.000     0.000        56
      I-product      0.000     0.000     0.000        42

       accuracy                          0.939     23394
      macro avg      0.300     0.137     0.163     23394
   weighted avg      0.908     0.939     0.916     23394



# Q1b. Named Entity Recognition (NER) using Softmax

In [466]:
def sent2labels2(sent):
    return sent[1]

In [467]:
def sent2tokens2(sent):
    return sent[0]

In [468]:
X_train2 = [sent2tokens2(s) for s in train]
y_train2 = [sent2labels2(s) for s in train]

X_test2 = [sent2tokens2(s) for s in test]
y_test2 = [sent2labels2(s) for s in test]

In [469]:
len(X_train2)

3394

In [470]:
# sentence representations for sequence labelling
#train_sent_tokens = [sent2tokens(s) for s in train_sents]
#train_labels = [sent2labels(s) for s in train_sents]

train_sent_tokens=X_train2

train_labels = y_train2
train_labels = [i for i in train_labels  if i != '']

train_id_2_label = list(set([label for sent in train_labels for label in sent]))
#train_id_2_label = [i for i in train_id_2_label  if i != '']
train_label_2_id = {label:i for i, label in enumerate(train_id_2_label)}
print("Number of unique labels in training data:", len(train_id_2_label))

def convert_labels_to_inds(sent_labels, label_2_id):
    return [label_2_id[label] for label in sent_labels]

train_label_inds = [convert_labels_to_inds(sent_labels, train_label_2_id) for sent_labels in train_labels]

#test_sent_tokens = [sent2tokens(s) for s in test_sents]
#test_labels = [sent2labels(s) for s in test_sents]

test_sent_tokens = X_test2
test_labels =y_test2
test_label_inds = [convert_labels_to_inds(s, train_label_2_id) for s in test_labels]
test_id_2_label = list(set([label for sent in test_labels for label in sent]))


Number of unique labels in training data: 13


In [471]:
window_size = 2

# converting tokenized sentence lists to vocabulary indices
id_2_word = list(set([token for sent in train_sent_tokens for token in sent])) + ["<pad>", "<unk>"]
word_2_id = {w:i for i,w in enumerate(id_2_word)}

def convert_tokens_to_inds(sentence, word_2_id):
    return [word_2_id.get(t, word_2_id["<unk>"]) for t in sentence]

# padding for windows
def pad_sentence_for_window(sentence, window_size, pad_token="<pad>"):
    return [pad_token]*window_size + sentence + [pad_token]*window_size 


In [472]:
import pprint
pp = pprint.PrettyPrinter()

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from functools import partial

In [473]:
# Batching sentences together with a DataLoader

def my_collate(data, window_size, word_2_id):
    """
    For some chunk of sentences and labels
        -add winow padding
        -pad for lengths using pad_sequence
        -convert our labels to one-hots
        -return padded inputs, one-hot labels, and lengths
    """
    
    x_s, y_s = zip(*data)

    # deal with input sentences as we've seen
    window_padded = [convert_tokens_to_inds(pad_sentence_for_window(sentence, window_size), word_2_id)
                                                                                  for sentence in x_s]
    # append zeros to each list of token ids in batch so that they are all the same length
    padded = nn.utils.rnn.pad_sequence([torch.LongTensor(t) for t in window_padded], batch_first=True)
    
    # convert labels to one-hots
    labels = []
    lengths = []
    for y in y_s:
        lengths.append(len(y))
        one_hot = torch.zeros(len(y), len(train_id_2_label))
        y = torch.tensor(y)
        y = y.unsqueeze(1)
        label = one_hot.scatter_(1, y, 1)
        labels.append(label)
    padded_labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)
    
    return padded.long(), padded_labels, torch.LongTensor(lengths)

In [474]:
batch_size = 4

# Shuffle True is good practice for train loaders.
# Use functools.partial to construct a partially populated collate function
train_loader = DataLoader(list(zip(train_sent_tokens, train_label_inds)), 
                            batch_size=batch_size, shuffle=True, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [475]:
'''
for batched_input, batched_labels, batch_lengths in train_loader:
    pp.pprint(("inputs", batched_input, batched_input.size()))
    pp.pprint(("labels", batched_labels, batched_labels.size()))
    pp.pprint(batch_lengths)
    break
'''

'\nfor batched_input, batched_labels, batch_lengths in train_loader:\n    pp.pprint(("inputs", batched_input, batched_input.size()))\n    pp.pprint(("labels", batched_labels, batched_labels.size()))\n    pp.pprint(batch_lengths)\n    break\n'

In [476]:
class SoftmaxWordWindowClassifier(nn.Module):
    """
    A one-layer, binary word-window classifier.
    """
    def __init__(self, config, vocab_size, pad_idx=0):
        super(SoftmaxWordWindowClassifier, self).__init__()
        """
        Instance variables.
        """
        self.window_size = 2*config["half_window"]+1
        self.embed_dim = config["embed_dim"]
        self.hidden_dim = config["hidden_dim"]
        self.num_classes = config["num_classes"]
        self.freeze_embeddings = config["freeze_embeddings"]
        
        """
        Embedding layer
        -model holds an embedding for each layer in our vocab
        -sets aside a special index in the embedding matrix for padding vector (of zeros)
        -by default, embeddings are parameters (so gradients pass through them)
        """
        self.embed_layer = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_idx)
        if self.freeze_embeddings:
            self.embed_layer.weight.requires_grad = False
        
        """
        Hidden layer
        -we want to map embedded word windows of dim (window_size+1)*self.embed_dim to a hidden layer.
        -nn.Sequential allows you to efficiently specify sequentially structured models
            -first the linear transformation is evoked on the embedded word windows
            -next the nonlinear transformation tanh is evoked.
        """
        self.hidden_layer = nn.Sequential(nn.Linear(self.window_size*self.embed_dim, 
                                                    self.hidden_dim), 
                                          nn.Tanh())
        
        """
        Output layer
        -we want to map elements of the output layer (of size self.hidden dim) to a number of classes.
        """
        self.output_layer = nn.Linear(self.hidden_dim, self.num_classes)
        
        """
        Softmax
        -The final step of the softmax classifier: mapping final hidden layer to class scores.
        -pytorch has both logsoftmax and softmax functions (and many others)
        -since our loss is the negative LOG likelihood, we use logsoftmax
        -technically you can take the softmax, and take the log but PyTorch's implementation
         is optimized to avoid numerical underflow issues.
        """
        self.log_softmax = nn.LogSoftmax(dim=2)
        
    def forward(self, inputs):
        """
        Let B:= batch_size
            L:= window-padded sentence length
            D:= self.embed_dim
            S:= self.window_size
            H:= self.hidden_dim
            
        inputs: a (B, L) tensor of token indices
        """
        B, L = inputs.size()
        
        """
        Reshaping.
        Takes in a (B, L) LongTensor
        Outputs a (B, L~, S) LongTensor
        """
        # Fist, get our word windows for each word in our input.
        token_windows = inputs.unfold(1, self.window_size, 1)
        _, adjusted_length, _ = token_windows.size()
        
        # Good idea to do internal tensor-size sanity checks, at the least in comments!
        assert token_windows.size() == (B, adjusted_length, self.window_size)
        
        """
        Embedding.
        Takes in a torch.LongTensor of size (B, L~, S) 
        Outputs a (B, L~, S, D) FloatTensor.
        """
        embedded_windows = self.embed_layer(token_windows)
        
        """
        Reshaping.
        Takes in a (B, L~, S, D) FloatTensor.
        Resizes it into a (B, L~, S*D) FloatTensor.
        -1 argument "infers" what the last dimension should be based on leftover axes.
        """
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)
        
        """
        Layer 1.
        Takes in a (B, L~, S*D) FloatTensor.
        Resizes it into a (B, L~, H) FloatTensor
        """
        layer_1 = self.hidden_layer(embedded_windows)
        
        """
        Layer 2
        Takes in a (B, L~, H) FloatTensor.
        Resizes it into a (B, L~, 2) FloatTensor.
        """
        output = self.output_layer(layer_1)
        
        """
        Softmax.
        Takes in a (B, L~, 2) FloatTensor of unnormalized class scores.
        Outputs a (B, L~, 2) FloatTensor of (log-)normalized class scores.
        """
        output = self.log_softmax(output)
        
        return output

In [477]:
def loss_function(outputs, labels, lengths):
    """Computes negative LL loss on a batch of model predictions."""
    B, L, num_classes = outputs.size()
    num_elems = lengths.sum().float()
        
    # get only the values with non-zero labels
    loss = outputs*labels
    
    # rescale average
    return -loss.sum() / num_elems

In [478]:
def train_epoch(loss_function, optimizer, model, train_data):
    
    ## For each batch, we must reset the gradients
    ## stored by the model.   
    total_loss = 0
    for batch, labels, lengths in train_data:
        # clear gradients
        optimizer.zero_grad()
        # evoke model in training mode on batch
        outputs = model.forward(batch)
        # compute loss w.r.t batch
        loss = loss_function(outputs, labels, lengths)
        # pass gradients back, startiing on loss value
        loss.backward()
        # update parameters
        optimizer.step()
        total_loss += loss.item()
    
    # return the total to keep track of how you did this time around
    return total_loss

In [479]:
config = {"batch_size": 4,
          "half_window": 2,
          "embed_dim": 25,
          "hidden_dim": 25,
          "num_classes": 13,
          "freeze_embeddings": False,
         }
learning_rate = .0002
num_epochs = 5
model = SoftmaxWordWindowClassifier(config, len(word_2_id))
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [480]:
for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, train_loader)
    print(epoch, epoch_loss)

0 2019.0639123916626
1 1827.551206946373
2 1640.8150030374527
3 1462.7690185308456
4 1294.7638195753098


### Evaluation
There is much more O entities in data set, but we’re more interested in other entities. To account for this we’ll use averaged F1 score computed for all labels except for O. sklearn-crfsuite.metrics package provides some useful metrics for sequence classification task, including this one.

In [481]:
test_loader = DataLoader(list(zip(test_sent_tokens, test_label_inds)), 
                            batch_size=batch_size, shuffle=False, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [482]:
test_outputs = []
for test_instance, labs, _ in test_loader:
    outputs_full = model.forward(test_instance)
    outputs = torch.argmax(outputs_full, dim=2)
    for i in range(outputs.size(0)):
        test_outputs.append(outputs[i].tolist())


In [483]:
y_test = test_labels
y_pred = []
for test, pred in zip(test_labels, test_outputs):
    y_pred.append([train_id_2_label[id] for id in pred[:len(test)]])

assert len(y_pred) == len(y_test), '{} vs. {}'.format(len(y_pred), len(y_test))
for i, pred, test in zip(list(range(len(y_pred))), y_pred, y_test):
    assert len(pred) == len(test), '{}: {} vs. {}'.format(i, len(pred), len(test))


In [484]:
# evaluate Softmax model
!pip install sklearn-crfsuite
from sklearn_crfsuite import metrics

metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=test_id_2_label)



0.8977781306441406

In [485]:
#print(y_pred[223:1582])


### Inspect per-class results in more detail:

In [486]:
# group B and I results
sorted_labels = sorted(
    test_id_2_label,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

                 precision    recall  f1-score   support

              O      0.937     0.978     0.957     21934
  B-corporation      0.000     0.000     0.000       117
  I-corporation      0.000     0.000     0.000        24
B-creative-work      0.000     0.000     0.000       231
I-creative-work      0.000     0.000     0.000       237
        B-group      0.008     0.010     0.009        97
        I-group      0.000     0.000     0.000        39
     B-location      0.000     0.000     0.000       122
     I-location      0.000     0.000     0.000        39
       B-person      0.042     0.003     0.005       355
       I-person      0.000     0.000     0.000       101
      B-product      0.000     0.000     0.000        56
      I-product      0.000     0.000     0.000        42

       accuracy                          0.917     23394
      macro avg      0.076     0.076     0.075     23394
   weighted avg      0.880     0.917     0.898     23394



# Q1c. alternative values of the hyper-parameters using Softmax

### tuning hyper-parameters 'batch size':

In [487]:
config = {"batch_size": 8,
          "half_window": 2,
          "embed_dim": 25,
          "hidden_dim": 25,
          "num_classes": 13,
          "freeze_embeddings": False,
         }
learning_rate = .0002
num_epochs = 5
model = SoftmaxWordWindowClassifier(config, len(word_2_id))
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [488]:
for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, train_loader)
    print(epoch, epoch_loss)

0 2042.8751380443573
1 1879.1225023269653
2 1718.3085926771164
3 1560.2284051179886
4 1405.0115276575089


In [489]:
test_loader = DataLoader(list(zip(test_sent_tokens, test_label_inds)), 
                            batch_size=batch_size, shuffle=False, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [490]:
test_outputs = []
for test_instance, labs, _ in test_loader:
    outputs_full = model.forward(test_instance)
    outputs = torch.argmax(outputs_full, dim=2)
    for i in range(outputs.size(0)):
        test_outputs.append(outputs[i].tolist())


In [491]:
y_test = test_labels
y_pred = []
for test, pred in zip(test_labels, test_outputs):
    y_pred.append([train_id_2_label[id] for id in pred[:len(test)]])

assert len(y_pred) == len(y_test), '{} vs. {}'.format(len(y_pred), len(y_test))
for i, pred, test in zip(list(range(len(y_pred))), y_pred, y_test):
    assert len(pred) == len(test), '{}: {} vs. {}'.format(i, len(pred), len(test))


In [492]:
# evaluate Softmax model
from sklearn_crfsuite import metrics

metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=test_id_2_label)

0.898585932490903

### tuning hyper-parameters 'embedding size':

In [493]:
config = {"batch_size": 4,
          "half_window": 2,
          "embed_dim": 50,
          "hidden_dim": 25,
          "num_classes": 13,
          "freeze_embeddings":False,
         }
learning_rate = .0002
num_epochs = 5
model = SoftmaxWordWindowClassifier(config, len(word_2_id))
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [494]:
for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, train_loader)
    print(epoch, epoch_loss)

0 2123.584525823593
1 1916.9141519069672
2 1709.5205612182617
3 1503.984883069992
4 1309.1455084085464


In [495]:
test_loader = DataLoader(list(zip(test_sent_tokens, test_label_inds)), 
                            batch_size=batch_size, shuffle=False, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [496]:
test_outputs = []
for test_instance, labs, _ in test_loader:
    outputs_full = model.forward(test_instance)
    outputs = torch.argmax(outputs_full, dim=2)
    for i in range(outputs.size(0)):
        test_outputs.append(outputs[i].tolist())


In [497]:
y_test = test_labels
y_pred = []
for test, pred in zip(test_labels, test_outputs):
    y_pred.append([train_id_2_label[id] for id in pred[:len(test)]])

assert len(y_pred) == len(y_test), '{} vs. {}'.format(len(y_pred), len(y_test))
for i, pred, test in zip(list(range(len(y_pred))), y_pred, y_test):
    assert len(pred) == len(test), '{}: {} vs. {}'.format(i, len(pred), len(test))


In [498]:
# evaluate Softmax model
from sklearn_crfsuite import metrics

metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=test_id_2_label)

0.8776186523574464

### tuning hyper-parameters 'hidden layer size':

In [499]:
config = {"batch_size": 4,
          "half_window": 2,
          "embed_dim": 25,
          "hidden_dim": 50,
          "num_classes": 13,
          "freeze_embeddings":False,
         }
learning_rate = .0002
num_epochs = 5
model = SoftmaxWordWindowClassifier(config, len(word_2_id))
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [500]:
for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, train_loader)
    print(epoch, epoch_loss)
    
test_loader = DataLoader(list(zip(test_sent_tokens, test_label_inds)), 
                            batch_size=batch_size, shuffle=False, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

test_outputs = []
for test_instance, labs, _ in test_loader:
    outputs_full = model.forward(test_instance)
    outputs = torch.argmax(outputs_full, dim=2)
    for i in range(outputs.size(0)):
        test_outputs.append(outputs[i].tolist())

y_test = test_labels
y_pred = []
for test, pred in zip(test_labels, test_outputs):
    y_pred.append([train_id_2_label[id] for id in pred[:len(test)]])

assert len(y_pred) == len(y_test), '{} vs. {}'.format(len(y_pred), len(y_test))
for i, pred, test in zip(list(range(len(y_pred))), y_pred, y_test):
    assert len(pred) == len(test), '{}: {} vs. {}'.format(i, len(pred), len(test))


0 1989.6052763462067
1 1812.6522456407547
2 1637.9923903942108
3 1466.5012859106064
4 1301.9076025485992


In [501]:
# evaluate Softmax model
from sklearn_crfsuite import metrics

metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=test_id_2_label)

0.8986037288753662

### tuning hyper-parameters 'Freeze embeddings or not':

In [502]:
config = {"batch_size": 4,
          "half_window": 2,
          "embed_dim": 25,
          "hidden_dim": 25,
          "num_classes": 13,
          "freeze_embeddings":True,
         }
learning_rate = .0002
num_epochs = 5
model = SoftmaxWordWindowClassifier(config, len(word_2_id))
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [503]:
for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, train_loader)
    print(epoch, epoch_loss)
    
test_loader = DataLoader(list(zip(test_sent_tokens, test_label_inds)), 
                            batch_size=batch_size, shuffle=False, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

test_outputs = []
for test_instance, labs, _ in test_loader:
    outputs_full = model.forward(test_instance)
    outputs = torch.argmax(outputs_full, dim=2)
    for i in range(outputs.size(0)):
        test_outputs.append(outputs[i].tolist())

y_test = test_labels
y_pred = []
for test, pred in zip(test_labels, test_outputs):
    y_pred.append([train_id_2_label[id] for id in pred[:len(test)]])

assert len(y_pred) == len(y_test), '{} vs. {}'.format(len(y_pred), len(y_test))
for i, pred, test in zip(list(range(len(y_pred))), y_pred, y_test):
    assert len(pred) == len(test), '{}: {} vs. {}'.format(i, len(pred), len(test))


0 2073.004519701004
1 1899.479402065277
2 1730.0330951213837
3 1564.0777504444122
4 1400.8539677858353


In [504]:
# evaluate Softmax model
from sklearn_crfsuite import metrics

metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=test_id_2_label)

0.8867768645400548

### tuning hyper-parameters 'learning rate':

In [505]:
config = {"batch_size": 4,
          "half_window": 2,
          "embed_dim": 25,
          "hidden_dim": 25,
          "num_classes": 13,
          "freeze_embeddings":False,
         }
learning_rate = .0005
num_epochs = 5
model = SoftmaxWordWindowClassifier(config, len(word_2_id))
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [506]:
for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, train_loader)
    print(epoch, epoch_loss)
    
test_loader = DataLoader(list(zip(test_sent_tokens, test_label_inds)), 
                            batch_size=batch_size, shuffle=False, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

test_outputs = []
for test_instance, labs, _ in test_loader:
    outputs_full = model.forward(test_instance)
    outputs = torch.argmax(outputs_full, dim=2)
    for i in range(outputs.size(0)):
        test_outputs.append(outputs[i].tolist())

y_test = test_labels
y_pred = []
for test, pred in zip(test_labels, test_outputs):
    y_pred.append([train_id_2_label[id] for id in pred[:len(test)]])

assert len(y_pred) == len(y_test), '{} vs. {}'.format(len(y_pred), len(y_test))
for i, pred, test in zip(list(range(len(y_pred))), y_pred, y_test):
    assert len(pred) == len(test), '{}: {} vs. {}'.format(i, len(pred), len(test))


0 1982.3485270738602
1 1515.8935358524323
2 1111.8003361225128
3 822.4663741588593
4 639.1291683316231


In [507]:
# evaluate Softmax model
from sklearn_crfsuite import metrics

metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=test_id_2_label)

0.9045238301676479