In [1]:
import os
import sys
from pathlib import Path
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import brown

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.vocab import vocab
from torchinfo import summary
from torch.utils.data import DataLoader



## Define Paths

In [2]:
# Root folder of the POS-tagging project (machine-specific absolute path).
base_path = Path("C:/Users/shaur/Desktop/Research/low_resource_training/pos_tagging")
# Sub-folder for artifacts; kept as a plain string rather than a Path object.
artifacts_path = str(base_path / "artifacts")

### Download dataset

In [3]:
# Tagged sentences of the Brown corpus, requested with the 'universal' tagset.
tagged_sentences = brown.tagged_sents(tagset='universal')
# Second name bound to the very same object.
brown_corpus = tagged_sentences

In [4]:
# Flatten the tagged sentences into two parallel, token-level lists.
X, Y = [], []  # X: lower-cased words (inputs), Y: their tags (outputs)

for sentence in tagged_sentences:
    for word, tag in sentence:
        X.append(word.lower())
        Y.append(tag)

### Develop Tokenizer

#### Create Vocab

In [5]:
def _iter_ngrams(word, n):
    """Yield every contiguous substring of length `n` in `word`, left to right."""
    for start in range(len(word) - n + 1):
        yield word[start:start + n]


def _vocab_with_unk(counter, min_freq):
    """Build a torchtext vocab from `counter` whose fallback token is '<unk>'."""
    built = vocab(counter, min_freq=min_freq)
    # insert_token raises RuntimeError when the token already exists, which
    # happens if the corpus itself contains the literal string '<unk>'.
    if '<unk>' not in built:
        built.insert_token('<unk>', 0)
    # Look the index up instead of assuming 0, so a pre-existing '<unk>' works too.
    built.set_default_index(built['<unk>'])
    return built


def get_vocab(dataset, min_freq=1):
    """Build word, character, bigram and trigram vocabularies from a word list.

    Args:
        dataset: Iterable of word strings. It is consumed in a single pass, so
            one-shot iterables such as generators are supported.
        min_freq: Forwarded unchanged to torchtext's `vocab` as `min_freq` for
            each of the four vocabularies.

    Returns:
        Tuple `(word_vocab, character_vocab, bigram_vocab, trigram_vocab)`.
        Each vocabulary contains '<unk>' and uses its index as the default
        index for unknown tokens.
    """
    word_counts = Counter()
    # One counter per n-gram size: 1 = characters, 2 = bigrams, 3 = trigrams.
    ngram_counts = {n: Counter() for n in (1, 2, 3)}

    for word in dataset:
        word_counts[word] += 1
        for n, counts in ngram_counts.items():
            counts.update(_iter_ngrams(word, n))

    return (
        _vocab_with_unk(word_counts, min_freq),
        _vocab_with_unk(ngram_counts[1], min_freq),
        _vocab_with_unk(ngram_counts[2], min_freq),
        _vocab_with_unk(ngram_counts[3], min_freq),
    )

In [6]:
vocab_word, vocab_character, vocab_bigram, vocab_trigram = get_vocab(X, min_freq=2)

### Collate Function for Dataloaders

In [7]:
def tokenizer(x, word_vocab=None, ch_vocab=None, bigram_vocab=None, trigram_vocab=None):
    """Convert words into word-, character-, bigram- and trigram-level indices.

    Args:
        x: Iterable of word strings (assumed; each item is indexed and sliced
            like a string).
        word_vocab: Mapping used for whole-word lookups. Defaults to the
            notebook-level `vocab_word`.
        ch_vocab: Mapping used for single characters. Defaults to
            `vocab_character`.
        bigram_vocab: Mapping used for 2-character substrings. Defaults to
            `vocab_bigram`.
        trigram_vocab: Mapping used for 3-character substrings. Defaults to
            `vocab_trigram`.

    Returns:
        Tuple of four lists aligned with `x`:
        - one word index per word,
        - one list of character indices per word,
        - one list of bigram indices per word (empty for words shorter than 2),
        - one list of trigram indices per word (empty for words shorter than 3).
    """
    # Resolve the defaults at call time so the vocabularies built earlier in the
    # notebook are picked up without being captured at definition time.
    if word_vocab is None:
        word_vocab = vocab_word
    if ch_vocab is None:
        ch_vocab = vocab_character
    if bigram_vocab is None:
        bigram_vocab = vocab_bigram
    if trigram_vocab is None:
        trigram_vocab = vocab_trigram

    word_indices_list, ch_indices_lists, bigram_indices_lists, trigram_indices_lists = [], [], [], []

    for word in x:
        # Scalar lookup via [...]; calling the vocab object expects a list of tokens.
        word_indices_list.append(word_vocab[word])

        length = len(word)
        # Store indices, not the raw substrings.
        ch_indices_lists.append([ch_vocab[ch] for ch in word])
        bigram_indices_lists.append(
            [bigram_vocab[word[i:i + 2]] for i in range(length - 1)]
        )
        trigram_indices_lists.append(
            [trigram_vocab[word[i:i + 3]] for i in range(length - 2)]
        )

    return word_indices_list, ch_indices_lists, bigram_indices_lists, trigram_indices_lists
        


In [8]:
def collate_batch_emb(batch):
    """Split a batch of 2-item samples into two parallel groups.

    The first items of every sample are bound to `labels`, the second items to
    `texts`. A ValueError is raised when the batch does not transpose into
    exactly two groups (for example, an empty batch).

    NOTE(review): nothing is returned — the function ends right after the
    unpacking, so callers receive None. This looks unfinished; confirm.
    """
    columns = tuple(zip(*batch))
    labels, texts = columns

### Define Custom Model

In [11]:
class SimpleMLP(nn.Module):
    """Feed-forward classifier over four concatenated embedding lookups.

    Word, character, bigram and trigram indices are each embedded into
    `embedding_dim` features, concatenated along the last axis, and passed
    through two hidden stages (Linear -> ReLU -> BatchNorm1d -> Dropout)
    followed by a final Linear layer producing `num_outputs` values.
    """

    def __init__(self,
                 word_vocab_size,
                 ch_vocab_size,
                 bigram_vocab_size,
                 trigram_vocab_size,
                 embedding_dim,
                 hidden_dim1,
                 hidden_dim2,
                 drop_prob1,
                 drop_prob2,
                 num_outputs):
        """Create the embedding tables and the three linear stages.

        Args:
            word_vocab_size: Number of rows in the word embedding table.
            ch_vocab_size: Number of rows in the character embedding table.
            bigram_vocab_size: Number of rows in the bigram embedding table.
            trigram_vocab_size: Number of rows in the trigram embedding table.
            embedding_dim: Width shared by all four embedding tables.
            hidden_dim1: Output width of the first linear layer.
            hidden_dim2: Output width of the second linear layer.
            drop_prob1: Dropout probability after the first hidden stage.
            drop_prob2: Dropout probability after the second hidden stage.
            num_outputs: Output width of the final linear layer.
        """
        super().__init__()

        # One lookup table per feature type, all with the same width.
        self.embedding_word = nn.Embedding(word_vocab_size, embedding_dim)
        self.embedding_ch = nn.Embedding(ch_vocab_size, embedding_dim)
        self.embedding_bigram = nn.Embedding(bigram_vocab_size, embedding_dim)
        self.embedding_trigram = nn.Embedding(trigram_vocab_size, embedding_dim)

        # Hidden stage 1: consumes the four concatenated embeddings.
        self.linear1 = nn.Linear(embedding_dim * 4, hidden_dim1)
        self.batchnorm1 = nn.BatchNorm1d(num_features=hidden_dim1)
        self.dropout1 = nn.Dropout(p=drop_prob1)

        # Hidden stage 2.
        self.linear2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.batchnorm2 = nn.BatchNorm1d(num_features=hidden_dim2)
        self.dropout2 = nn.Dropout(p=drop_prob2)

        # Output projection.
        self.linear3 = nn.Linear(hidden_dim2, num_outputs)

    def forward(self, input_tuple):
        """Run the network on a 4-tuple of index tensors.

        Args:
            input_tuple: `(word, character, bigram, trigram)` index tensors, in
                that order. Their embeddings are concatenated on the last axis,
                so the leading shapes must match.

        Returns:
            Output of the final linear layer (no activation applied).
        """
        word_idx, ch_idx, bigram_idx, trigram_idx = input_tuple

        features = torch.cat(
            (
                self.embedding_word(word_idx),
                self.embedding_ch(ch_idx),
                self.embedding_bigram(bigram_idx),
                self.embedding_trigram(trigram_idx),
            ),
            dim=-1,
        )

        # Each hidden stage applies ReLU before batch normalization.
        hidden = self.dropout1(self.batchnorm1(F.relu(self.linear1(features))))
        hidden = self.dropout2(self.batchnorm2(F.relu(self.linear2(hidden))))

        return self.linear3(hidden)



In [12]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Build a deliberately tiny model (this runs SimpleMLP.__init__) and place it
# on the selected device.
model = SimpleMLP(
    word_vocab_size=7,
    ch_vocab_size=7,
    bigram_vocab_size=7,
    trigram_vocab_size=7,
    embedding_dim=20,
    hidden_dim1=10,
    hidden_dim2=5,
    drop_prob1=0.5,
    drop_prob2=0.5,
    num_outputs=2,
).to(device)

# Dummy index tensors (one independent tensor per feature type), all on the device.
dummy_indices = [1, 2, 4, 5, 4]
word_ind, ch_ind, bigram_ind, trigram_ind = (
    torch.tensor(dummy_indices, dtype=torch.int32).to(device) for _ in range(4)
)

# Layer-by-layer summary of the model for the dummy input.
summary(model, input_data=[(word_ind, ch_ind, bigram_ind, trigram_ind)], device=device, depth=10, verbose=False)


Layer (type:depth-idx)                   Output Shape              Param #
SimpleMLP                                [5, 2]                    --
├─Embedding: 1-1                         [5, 20]                   140
├─Embedding: 1-2                         [5, 20]                   140
├─Embedding: 1-3                         [5, 20]                   140
├─Embedding: 1-4                         [5, 20]                   140
├─Linear: 1-5                            [5, 10]                   810
├─BatchNorm1d: 1-6                       [5, 10]                   20
├─Dropout: 1-7                           [5, 10]                   --
├─Linear: 1-8                            [5, 5]                    55
├─BatchNorm1d: 1-9                       [5, 5]                    10
├─Dropout: 1-10                          [5, 5]                    --
├─Linear: 1-11                           [5, 2]                    12
Total params: 1,467
Trainable params: 1,467
Non-trainable params: 0
Total mult-a

In [13]:
# Run the model once on the dummy index tensors and show the raw result.
dummy_batch = (word_ind, ch_ind, bigram_ind, trigram_ind)
output = model(dummy_batch)

print(output)

tensor([[-0.4844,  0.3000],
        [ 0.3034,  0.4267],
        [ 0.3154,  0.0952],
        [-0.0282,  0.1706],
        [ 0.5052,  0.1715]], device='cuda:0', grad_fn=<AddmmBackward0>)
