In [1]:
from collections import Counter
import os
import json

def build_vocab(data_dir, min_count_word=1, min_count_tag=1):
    PAD_WORD = '<pad>'
    PAD_TAG = 'O'
    UNK_WORD = 'UNK'

    def save_vocab_to_txt_file(vocab, txt_path):
        with open(txt_path, "w") as f:
            for token in vocab:
                f.write(token + '\n')

    def save_dict_to_json(d, json_path):
        with open(json_path, 'w') as f:
            d = {k: v for k, v in d.items()}
            json.dump(d, f, indent=4)

    def update_vocab(txt_path, vocab):
        with open(txt_path) as f:
            for i, line in enumerate(f):
                vocab.update(line.strip().split(' '))
        return i + 1

    words = Counter()
    size_train_sentences = update_vocab(os.path.join(data_dir, 'train/sentences.txt'), words)
    size_dev_sentences = update_vocab(os.path.join(data_dir, 'valid/sentences.txt'), words)
    size_test_sentences = update_vocab(os.path.join(data_dir, 'test/sentences.txt'), words)

    tags = Counter()
    size_train_tags = update_vocab(os.path.join(data_dir, 'train/tags.txt'), tags)
    size_dev_tags = update_vocab(os.path.join(data_dir, 'valid/tags.txt'), tags)
    size_test_tags = update_vocab(os.path.join(data_dir, 'test/tags.txt'), tags)

    assert size_train_sentences == size_train_tags
    assert size_dev_sentences == size_dev_tags
    assert size_test_sentences == size_test_tags

    words = [tok for tok, count in words.items() if count >= min_count_word]
    tags = [tok for tok, count in tags.items() if count >= min_count_tag]

    if PAD_WORD not in words: words.append(PAD_WORD)
    if PAD_TAG not in tags: tags.append(PAD_TAG)

    words.append(UNK_WORD)

    save_vocab_to_txt_file(words, os.path.join(data_dir, 'words.txt'))
    save_vocab_to_txt_file(tags, os.path.join(data_dir, 'tags.txt'))

    sizes = {
        'train_size': size_train_sentences,
        'dev_size': size_dev_sentences,
        'test_size': size_test_sentences,
        'vocab_size': len(words),
        'number_of_tags': len(tags),
        'pad_word': PAD_WORD,
        'pad_tag': PAD_TAG,
        'unk_word': UNK_WORD
    }
    save_dict_to_json(sizes, os.path.join(data_dir, 'dataset_params.json'))

    to_print = "\n".join("- {}: {}".format(k, v) for k, v in sizes.items())
    return to_print

# Example usage:
data_dir = 'data/'
result = build_vocab(data_dir)
print("Characteristics of the dataset:\n{}".format(result))

Characteristics of the dataset:
- train_size: 14041
- dev_size: 3250
- test_size: 3453
- vocab_size: 30291
- number_of_tags: 10
- pad_word: <pad>
- pad_tag: O
- unk_word: UNK


In [2]:
vocab = {}
words_path = 'data/words.txt'
with open(words_path) as f:
    for i, l in enumerate(f.read().splitlines()):
        vocab[l] = i

In [3]:
#write vocab to a json file
vocab_json_path = 'data/vocab.json'
with open(vocab_json_path, 'w') as f:
    json.dump(vocab, f, indent=4)
    

### Preparing data


In [4]:
train_sentences = []
train_tags = []

train_sentences_path = 'data/train/sentences.txt'

with open(train_sentences_path) as f:
    for sentence in f.read().splitlines():
        s = [vocab[token] if token in vocab else vocab['UNK'] for token in sentence.split()]
        train_sentences.append(s)

train_tags_path = 'data/train/tags.txt'

In [5]:
tag_map = {}
train_tags_path = 'data/tags.txt'
with open(train_tags_path) as f:
    for i, tag in enumerate(f.read().splitlines()):
        tag_map[tag] = i

tag_map

{'B-ORG': 0,
 '0': 1,
 'B-MISC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'I-MISC': 7,
 'I-LOC': 8,
 'O': 9}

In [6]:
train_tags_file = 'data/train/tags.txt'
train_tags = []
with open(train_tags_file) as f:
    for sentence in f.read().splitlines():
        temp = []
        for tag in sentence.split(' '):
            temp.append(tag_map[tag])
        train_tags.append(temp)


In [7]:
train_tags

[[0, 1, 2, 1, 1, 1, 2, 1, 1],
 [3, 4],
 [5, 1],
 [1,
  0,
  6,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [5,
  1,
  1,
  1,
  1,
  0,
  6,
  1,
  1,
  1,
  3,
  4,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  5,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  3,
  4,
  4,
  4,
  1,
  1,
  1,
  1,
  1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 6, 1],
 [1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  3,
  4,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [3,
  1,
  2,
  1,
  1,
  1,
  1,
  5,
  1,
  5,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  7,
  7,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1],
 [1,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,

In [8]:
import torch 
import numpy as np
from torch.autograd import Variable

batch_sentences = train_sentences
batch_tags = train_tags

#compute length of longest sentence in batch
batch_max_len = max([len(s) for s in batch_sentences])

#prepare a numpy array with the data, initializing the data with 'PAD' 
#and all labels with -1; initializing labels to -1 differentiates tokens 
#with tags from 'PAD' tokens
vocab['PAD'] = len(vocab)
batch_data = vocab['PAD']*np.ones((len(batch_sentences), batch_max_len))
batch_labels = -1*np.ones((len(batch_sentences), batch_max_len))

#copy the data to the numpy array
for j in range(len(batch_sentences)):
    cur_len = len(batch_sentences[j])
    batch_data[j][:cur_len] = batch_sentences[j]
    batch_labels[j][:cur_len] = batch_tags[j]

#since all data are indices, we convert them to torch LongTensors
batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)

#convert Tensors to Variables
batch_data, batch_labels = Variable(batch_data), Variable(batch_labels)

In [9]:
batch_data, batch_labels

(tensor([[    0,     1,     2,  ..., 30291, 30291, 30291],
         [    9,    10, 30291,  ..., 30291, 30291, 30291],
         [   11,    12, 30291,  ..., 30291, 30291, 30291],
         ...,
         [ 4124,   693, 10550,  ..., 30291, 30291, 30291],
         [ 3105,  1049, 30291,  ..., 30291, 30291, 30291],
         [10567,  1396,  5452,  ..., 30291, 30291, 30291]]),
 tensor([[ 0,  1,  2,  ..., -1, -1, -1],
         [ 3,  4, -1,  ..., -1, -1, -1],
         [ 5,  1, -1,  ..., -1, -1, -1],
         ...,
         [ 0,  1,  0,  ..., -1, -1, -1],
         [ 1,  1, -1,  ..., -1, -1, -1],
         [ 0,  1,  0,  ..., -1, -1, -1]]))

In [10]:
from mixture_of_experts import MoE
import torch.nn as nn
moe = MoE(
    dim = 512,
    num_experts = 16,               # increase the experts (# parameters) of your model without increasing computation
    hidden_dim = 512 * 4,           # size of hidden dimension in each expert, defaults to 4 * dimension
    activation = nn.LeakyReLU,      # use your preferred activation, will default to GELU
    second_policy_train = 'random', # in top_2 gating, policy for whether to use a second-place expert
    second_policy_eval = 'random',  # all (always) | none (never) | threshold (if gate value > the given threshold) | random (if gate value > threshold * random_uniform(0, 1))
    second_threshold_train = 0.2,
    second_threshold_eval = 0.2,
    capacity_factor_train = 1.25,   # experts have fixed capacity per batch. we need some extra capacity in case gating is not perfectly balanced.
    capacity_factor_eval = 2.,      # capacity_factor_* should be set to a value >=1
    loss_coef = 1e-2                # multiplier on the auxiliary expert balancing auxiliary loss
)


In [11]:
inputs = torch.randn(16, 3, 512)
moe(inputs)[0].shape

torch.Size([16, 3, 512])

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Expert(nn.Module):
    def __init__(self, input_size=512, hidden_size=1024, output_size=512):
        super(Expert, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class MoE(nn.Module):
    def __init__(self, num_experts=10, input_size=512, hidden_size=1024, output_size=512, dropout=0.5):
        super(MoE, self).__init__()
        self.num_experts = num_experts
        self.experts = nn.ModuleList([Expert(input_size, hidden_size, output_size) for _ in range(num_experts)])
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)  # Stack expert outputs along dim=1
        gate_values = self.sigmoid(expert_outputs)  # Apply sigmoid to get gate values
        weighted_sum = torch.sum(gate_values * expert_outputs, dim=1)  # Weighted sum of expert outputs
        output = self.dropout(weighted_sum)  # Apply dropout
        return output


In [13]:
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self, params, num_experts):
        super(MyNet, self).__init__()

        # Maps each token to an embedding_dim vector
        self.embedding = nn.Embedding(params.vocab_size, params.embedding_dim)

        # The first LSTM layer takes embedded sentence
        self.lstm1 = nn.LSTM(params.embedding_dim, params.lstm_hidden_dim, batch_first=True)

        # Mixture of Experts layer
        self.moe = MoE(num_experts=num_experts, input_size=params.lstm_hidden_dim, hidden_size=params.lstm_hidden_dim, output_size=params.lstm_hidden_dim)
        
        self.expert_layers = nn.ModuleList([nn.Linear(params.lstm_hidden_dim, params.lstm_hidden_dim) for _ in range(num_experts)])

        # The second LSTM layer
        self.lstm2 = nn.LSTM(params.lstm_hidden_dim, params.lstm_hidden_dim, batch_first=True)

        # Fully connected layer transforms the output to give the final output layer
        self.fc = nn.Linear(params.lstm_hidden_dim, params.number_of_tags)

    def forward(self, s):
        # Apply the embedding layer that maps each token to its embedding
        s = self.embedding(s)
        #print("Shape of s after embedding:", s.shape)

        # Run the first LSTM along the sentences of length s
        s, _ = self.lstm1(s)
        #print("Shape of s after LSTM1:", s.shape)

        # Apply the Mixture of Experts layer
        s = self.moe(s)
        #print("Shape of s after MoE:", s.shape)
        
        # Run the second LSTM layer
        s, _ = self.lstm2(s)
        #print("Shape of s after LSTM2:", s.shape)
        

        # Reshape the Variable so that each row contains one token
        s = s.contiguous().view(-1, s.shape[2])
        #print("Shape of s after reshaping:", s.shape)
        
        # Apply the fully connected layer and obtain the output for each token
        s = self.fc(s)
        #print("Shape of s after FC:", s.shape)

        return F.log_softmax(s, dim=1)



In [14]:
def loss_fn(outputs, labels):
    #reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)  

    #mask out 'PAD' tokens
    mask = (labels >= 0).float()

    #the number of tokens is the sum of elements in mask
    num_tokens = int(torch.sum(mask).item())

    #pick the values corresponding to labels and multiply by mask
    outputs = outputs[range(outputs.shape[0]), labels]*mask

    #cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/num_tokens

In [15]:
#create dataloader for my dataset
from torch.utils.data import Dataset
class MyDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

params = {'vocab_size': len(vocab), 'embedding_dim': 150, 'lstm_hidden_dim': 64, 'number_of_tags': len(tag_map) , 'intermediate_dim': 64, 'num_experts': 25}
params = type('params', (object,), params)

#initialize the model
model = MyNet(params , params.num_experts)

#initialize the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

#initialize the dataset and dataloader
dataset = MyDataset(batch_data, batch_labels)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)


In [19]:
%%time
# training loop
for epoch in range(10):
    av_loss = 0
    for i, (data, labels) in enumerate(train_loader):
        model.zero_grad()
        outputs = model(data)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        av_loss += loss.item()
    av_loss /= i
    print(f'Epoch: {epoch}, loss: {loss.item()}')

#save the model
torch.save(model, 'model3.pth')

Epoch: 0, loss: 0.23989692330360413
Epoch: 1, loss: 0.2870084047317505
Epoch: 2, loss: 0.24411998689174652
Epoch: 3, loss: 0.2580723166465759
Epoch: 4, loss: 0.10683100670576096
Epoch: 5, loss: 0.25379642844200134
Epoch: 6, loss: 0.15155842900276184
Epoch: 7, loss: 0.1501251459121704
Epoch: 8, loss: 0.3469434976577759
Epoch: 9, loss: 0.12868747115135193
CPU times: user 27min 57s, sys: 32min 29s, total: 1h 27s
Wall time: 17min 8s


In [20]:
def sentence_eval(sentence, labels=None):
    model.eval()
    line = sentence
    sentence = [vocab[token] if token in vocab
                else vocab['UNK']
                for token in sentence.split(' ')]   

    #prepare input
    sentence = torch.LongTensor(sentence).view(1, -1)
    sentence = Variable(sentence)

    #predict
    outputs = model(sentence)
    outputs = outputs.view(1, -1, len(tag_map))
    outputs = torch.argmax(outputs, 2)

    #convert the outputs to tags
    outputs = outputs.view(-1).numpy()
    outputs = [list(tag_map.keys())[list(tag_map.values()).index(output)] for output in outputs]

    if labels is None:
        return outputs
    
    label = [list(tag_map.keys())[list(tag_map.values()).index(label)] for label in labels]

    corr = 0
    for i in range(len(line.split(' '))):
        print(f'{line.split(" ")[i]} - {outputs[i]} - {label[i]}')
        if outputs[i] == label[i]:
            corr += 1
    
    return corr


# write a loop that will go through each sentence in the test set and call sentence_Eval function

test_sentences = []
test_labels = []

test_sentences_file = 'data/test/sentences.txt'
test_labels_file = 'data/test/tags.txt'

with open(test_sentences_file) as f:
    for sentence in f.read().splitlines():
        test_sentences.append(sentence)

with open(test_labels_file) as f:
    for sentence in f.read().splitlines():
        l = [tag_map[label] for label in sentence.split(' ')]
        test_labels.append(l)

correct = 0

for i in range(len(test_sentences)):
    correct += sentence_eval(test_sentences[i], test_labels[i])

total = 0 
for i in range(len(test_sentences)):
    total += len(test_sentences[i].split(' '))

print(f'Wrong predictions: {correct}')
print(f'total predictions: {total}')
print(f'F1 score: {(correct/total)*100}')

SOCCER - 0 - 0
- - 0 - 0
JAPAN - B-PER - B-LOC
GET - 0 - 0
LUCKY - B-PER - 0
WIN - 0 - 0
, - 0 - 0
CHINA - B-ORG - B-PER
IN - 0 - 0
SURPRISE - 0 - 0
DEFEAT - B-PER - 0
. - 0 - 0
Nadim - B-MISC - B-PER
Ladki - I-MISC - I-PER
AL-AIN - 0 - B-LOC
, - 0 - 0
United - B-ORG - B-LOC
Arab - I-LOC - I-LOC
Emirates - I-LOC - I-LOC
1996-12-06 - I-PER - 0
Japan - B-LOC - B-LOC
began - 0 - 0
the - 0 - 0
defence - 0 - 0
of - 0 - 0
their - 0 - 0
Asian - B-LOC - B-MISC
Cup - I-MISC - I-MISC
title - 0 - 0
with - 0 - 0
a - 0 - 0
lucky - 0 - 0
2-1 - 0 - 0
win - 0 - 0
against - 0 - 0
Syria - B-LOC - B-LOC
in - 0 - 0
a - 0 - 0
Group - 0 - 0
C - 0 - 0
championship - 0 - 0
match - 0 - 0
on - 0 - 0
Friday - 0 - 0
. - 0 - 0
But - 0 - 0
China - B-ORG - B-LOC
saw - 0 - 0
their - 0 - 0
luck - 0 - 0
desert - B-PER - 0
them - 0 - 0
in - 0 - 0
the - 0 - 0
second - 0 - 0
match - 0 - 0
of - 0 - 0
the - 0 - 0
group - 0 - 0
, - 0 - 0
crashing - 0 - 0
to - 0 - 0
a - 0 - 0
surprise - 0 - 0
2-0 - 0 - 0
defeat - 0 - 0
to - 0

In [21]:
model.parameters

<bound method Module.parameters of MyNet(
  (embedding): Embedding(30292, 150)
  (lstm1): LSTM(150, 64, batch_first=True)
  (moe): MoE(
    (experts): ModuleList(
      (0-24): 25 x Expert(
        (fc1): Linear(in_features=64, out_features=64, bias=True)
        (fc2): Linear(in_features=64, out_features=64, bias=True)
      )
    )
    (sigmoid): Sigmoid()
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (expert_layers): ModuleList(
    (0-24): 25 x Linear(in_features=64, out_features=64, bias=True)
  )
  (lstm2): LSTM(64, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=10, bias=True)
)>

In [22]:
model.state_dict()

OrderedDict([('embedding.weight',
              tensor([[ 1.4567,  1.0668, -0.2822,  ...,  1.5922,  0.6758, -0.3296],
                      [-0.1364,  1.3730,  3.1861,  ...,  0.5842, -0.4462,  1.9046],
                      [-0.9111,  2.1084,  0.7806,  ..., -0.5133,  0.9992, -0.1061],
                      ...,
                      [-0.3167, -1.3075,  0.9658,  ...,  0.0783,  0.1280, -1.4817],
                      [ 0.5962, -1.3913, -2.3620,  ...,  1.7516, -1.0086, -0.4323],
                      [-0.4600,  0.4281, -1.5992,  ...,  0.3814, -0.9793, -0.8416]])),
             ('lstm1.weight_ih_l0',
              tensor([[ 1.0315, -0.3099,  0.4278,  ...,  0.3111, -0.6435,  0.8631],
                      [ 0.4188, -0.4876,  0.6634,  ...,  0.3398,  1.1406,  0.8840],
                      [ 0.7097,  0.3698,  0.1400,  ..., -0.2507,  0.7502, -0.4720],
                      ...,
                      [ 1.5731,  0.8867, -0.0671,  ...,  0.7128,  0.8839, -0.4627],
                      [ 0.1859, -