# **POS Tagging with Penn Treebank dataset**

In [26]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 01. Penn Treebank dataset preparation





In [12]:
# Penn Treebank English Dataset Preparation
import pickle
import gzip

class PennTreeBankDataset():

    def load_file(self, filename:str): # given file names, returns each line
        file = open(filename)
        roh_daten = file.readlines()
        return roh_daten

    def preprocessing(self, file:str):
        #step 1: fetch file contents
        raw_dataset = self.load_file(file)

        #step 2: form sentences and corresponding POS tags:
        all_samples, all_labels = [], []
        for sample in raw_dataset:

            sample = sample.replace('\n','')           
            sentence_dirty = list(filter(None,sample.split(')'))) # spliiting on closing brackets and viewing a subset of the result 
            sentence_clean, tags_clean = [],[]
            
            for word in sentence_dirty:
                word = word.replace("(","")
                sentence_clean.append(word.split(' ')[-1])
                tags_clean.append(word.split(' ')[-2]) if len(word) > 2 else ''
            
            if len(sentence_clean) != len(tags_clean):
                print("Mismatch in no. of tokens in the line!")
                break
            
            # add these two to the big list
            all_samples.append(sentence_clean) 
            all_labels.append(tags_clean)

        if len(all_samples) != len(all_labels):
            print(f"Total no. of samples: {len(all_samples)} \nTotal no. of POS tags: {len(all_labels)}")
            print("Mismatch in no. of lines")
            exit

        return list(zip(all_samples,all_labels))

    def export_files(self): # one time to create datasets

        list_of_files = ["PennTreeBankTrain.pklz","PennTreeBankTest.pklz","PennTreeBankValid.pklz"]
        list_of_files_tags = ["PennTreeBankTrainPOS.pkl","PennTreeBankTestPOS.pkl","PennTreeBankValidPOS.pkl"]
        list_of_datasets = ["02-21.10way.clean.txt","23.auto.clean.txt","22.auto.clean.txt"]

        for final,pos,raw in zip(list_of_files,list_of_files_tags,list_of_datasets):
            # print(f"Processing: {raw} => {final} \t {pos}")
            dataset = self.preprocessing(file="/content/"+str(raw))
            # print(len(dataset))
            with gzip.open('/content/' +str(final), 'wb') as f:
                pickle.dump(dataset, f)
                f.close()

ds = PennTreeBankDataset()
a = ds.export_files()

## 02. Prepare Dictionary:

In [28]:


class PennTreeBankDictionary():

    def load_corpus(self):
        ds1 = PennTreeBankDataset()

        print("preparing train/test/valid datasets")
        valid_ds = ds1.preprocessing(file="/content/22.auto.clean.txt")
        test_ds = ds1.preprocessing(file="/content/23.auto.clean.txt")
        train_ds = ds1.preprocessing(file="/content/02-21.10way.clean.txt")
        complete_ds = valid_ds + test_ds + train_ds
        print("done")

        return complete_ds

    def tokens_and_tags(self):

        #Step 1: fetch dataset
        dataset = self.load_corpus()
        tokens, tags = [], []

        #Step 2: split sentences and pos tags to two separate list
        for sample in dataset:
            tokens.append(sample[0])
            tags.append(sample[1])

        #Step 3: list of lists to a single flat list and take unique words and pos tags
        all_tokens = [item for sublist in tokens for item in sublist]
        all_tokens = list(set(all_tokens))
        all_tags = [item for sublist in tags for item in sublist]
        all_tags = list(set(all_tags))

        return all_tokens, all_tags

    def vocabulary(self):
        
        print("preparing look-up dictionaries")
        words_in_corpus, pos_tags_in_corpus = self.tokens_and_tags()
        words_in_corpus.append('PADDING')
        pos_tags_in_corpus.append('PADDING')
        word_to_idx, pos_to_idx = {}, {}
        idx_to_word, idx_to_pos = {}, {} # for reverse look-up

        for idx, word in enumerate(words_in_corpus):
            word_to_idx[word] = idx
            idx_to_word[idx] = word

        for idx,pos in enumerate(pos_tags_in_corpus):
            pos_to_idx[pos] = idx
            idx_to_pos[idx] = pos

        print("done!")
        print(f"Total words in the dictionary: {len(word_to_idx)} \nTotal POS tags in the dictionary: {len(pos_to_idx)}")
        return word_to_idx, idx_to_word, pos_to_idx, idx_to_pos

# ex = PennTreeBankDictionary()
# a,b,c,d = ex.vocabulary()
# print(list(a.items())[:10])
# print(list(c.items())[:10])


## 03. Fetch dataset

In [32]:
from typing import List, Tuple
# from prepareDictionary import PennTreeBankDictionary
ds = PennTreeBankDictionary()
# from prepareDataset import PennTreeBankDataset
import torch
import pickle, gzip
from torch.utils.data import Dataset, DataLoader

class myDataset(Dataset):

    def compose_dictionaries(self): 

        word_to_idx, idx_to_word, pos_to_idx, idx_to_pos = ds.vocabulary()
        return word_to_idx, idx_to_word, pos_to_idx, idx_to_pos

    def file_parser(self,filename) -> Tuple[List,List]:

        filepath = "/content/"+filename+'.pklz'
        samples_with_labels = []
        file = gzip.open(filepath,'rb')
        samples_with_labels = pickle.load(file)
        
        return samples_with_labels

    def file_tensor(self, sentences_and_tags) -> Tuple[List, List]:

        def token_pipeline(x):
            if len(x) < 50:
                for i in range(0,50-len(x)):
                    x.append('PADDING')
            return [self.word_to_idx[tok] for tok in x]

        def pos_pipeline(x):
            if len(x) < 50:
                for i in range(0,50-len(x)):
                    x.append('PADDING')
            return [self.pos_to_idx[pos] for pos in x]

        sent_to_idx, tags_to_idx = [], []
        for sent_tag in sentences_and_tags:
            if len(sent_tag[0]) >50:
                continue
            sent_to_idx.append(torch.tensor(token_pipeline(sent_tag[0])))
            tags_to_idx.append(torch.tensor(pos_pipeline(sent_tag[1])))

        return sent_to_idx, tags_to_idx

    def __init__(self, raw_dataset=None):

        print("STEP 01: Look-up Tables...")
        self.word_to_idx, self.idx_to_word, self.pos_to_idx, self.idx_to_pos = self.compose_dictionaries()
        print("dictionaries ready!")

        print("STEP 02: Fetching the dataset...")
        self.samples_with_labels = self.file_parser(raw_dataset)
        print("done!")

        print("STEP 03: Tokens and tags to Numbers...")
        self.samples_to_idx, self.labels_to_idx = self.file_tensor(self.samples_with_labels)

    def __len__(self):    
        return len(self.samples_to_idx)

    def __getitem__(self, index):
        return self.samples_to_idx[index], self.labels_to_idx[index]

# validation_dataset = DataLoader(dataset=myDataset("PennTreeBankValid")
#                                         ,shuffle=False
#                                         ,batch_size=16)



## 04. Neural Network Model

In [48]:
# from typing import final
from torch import nn
import torch

"""RNN Many-to-many multi-class classification neural network model structure definition"""

class RNNPOSTagger(nn.Module):

    def __init__(self, 
                embedding_dimension, 
                vocabulary_size,
                hidden_dimension,
                num_of_layers,
                dropout,
                output_dimension
                ):
        super(RNNPOSTagger, self).__init__()

        self.embedding = nn.Embedding(num_embeddings=vocabulary_size,
                                    embedding_dim=embedding_dimension,
                                    padding_idx=45)

        self.lstm = nn.LSTM(embedding_dimension,
                            hidden_dimension,
                            num_of_layers,
                            dropout=dropout,
                            batch_first=True)
                            # bidirectional=True)

        self.fc = nn.Linear(hidden_dimension, output_dimension)

        # self.activation_fn = nn.Tanh()
        self.activation_fn = nn.LogSoftmax(dim=1)
        
        # self.dropout = nn.Dropout(dropout)

    def forward(self, sample):

        # (1)- Embedding layer
        embedded = self.embedding(sample)

        #-------------------------------------------------------------------------

        #(2)- LSTM layer 1
        output, (hidden, cell) = self.lstm(embedded)       

        #-------------------------------------------------------------------------

        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-1,:,:], hidden[0,:,:]), dim = 1)


        #(3)- LSTM to linear layer: Final set of tags
        dense_output = self.fc(output)

        #activation function
        outputs=self.activation_fn(dense_output)
 
        return outputs
        # return dense_output

## 05. Train Model

In [145]:
from torch.utils.data import DataLoader
# from fetchDataset import myDataset
train_test_ds = myDataset
# from prepareDictionary import PennTreeBankDictionary
ds = PennTreeBankDictionary()
import time
import os
# from model import RNNPOSTagger

import torch
from torch import nn
import torch.optim as optim

################################### 01. Train/Test dataset  ########################################
print("="*100)
print("01. Preparing train/test datasets:")

train_dataset = DataLoader(dataset=train_test_ds("PennTreeBankTrain"), batch_size=8, shuffle=True)
test_dataset = DataLoader(dataset=train_test_ds("PennTreeBankTest"), batch_size=8, shuffle=True)
# validation_dataset = DataLoader(dataset=myDataset("PennTreeBankValid"),batch_size=16,shuffle=True)
print("datasets ready!")
print("="*100)

################################# 02.Model Parameters ####################################
print("02. Loading Model Parameters:")
word_to_idx, idx_to_word, pos_to_idx, idx_to_pos = ds.vocabulary()

# read this seq2seq model: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html --> for understanding embedding dimension and output dimension  
VOCAB_SIZE = len(word_to_idx)+1
EMBED_DIM = 100
HIDDEN_DIM = 64
NUM_LAYERS = 2
NUM_OF_CLASSES = len(pos_to_idx)
N_EPOCHS = 5
LEARNING_RATE = 0.025#0.1
BATCH_SIZE = 128#16

print(f"Size of vocabulary: {VOCAB_SIZE}" + f"\tNumber of classes: {NUM_OF_CLASSES}")
##################################### 03. NN Model  ########################################

print("Step 02. builing the model...")
model = RNNPOSTagger(embedding_dimension= EMBED_DIM,
                    vocabulary_size=VOCAB_SIZE,
                    hidden_dimension=HIDDEN_DIM,
                    num_of_layers=NUM_LAYERS,
                    dropout=0.1,
                    output_dimension=NUM_OF_CLASSES)

print("Done! here is our model:")
print(model)
print("="*100)

############################# 04. Optimizer and Loss  ####################################

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
# optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = nn.CrossEntropyLoss(ignore_index=45)
criterion = nn.NLLLoss(ignore_index=45)


#define metric
def train_accuracy(preds, y):
    predicted_labels_dirty = preds.permute(0,2,1)
    predicted_labels_final = torch.argmax(predicted_labels_dirty, dim=2).tolist()
    actual_labels_final = y.tolist()
    accuracy_of_all_lines = []
    for predicted, actual in zip(predicted_labels_final, actual_labels_final):
        counter = 0
        # print(predicted)
        # print(actual)
        for pred,act in zip(predicted,actual):
            if pred == act:
                counter = counter+1
        accuracy_of_this_line = counter/50
        accuracy_of_all_lines.append(accuracy_of_this_line)
    accuracy = sum(accuracy_of_all_lines)/len(predicted_labels_final)

    return accuracy

def training_accuracy(preds, y):
    
    predsx = preds.permute(0,2,1) #reshape
    predsx2 = torch.argmax(predsx, dim=2) #find POS index with max value for each token

    for pred,act in zip(predsx2.tolist()[0],y.tolist()[0]):
        counter = 0
        if pred == act:
            counter = counter+1
        
    # correct = (predsx2 == y)
    # acc = correct.sum() / len(preds)
    acc = counter/len(preds)
    # print(type(acc))

    return acc
    
#push to cuda if available
# model = model.to(device)
# criterion = criterion.to(device)

############################## 05. NN Model Train Definition #############################

def train(model, dataset, optimizer, criterion):

    t = time.localtime()
    start_time = time.strftime("%H:%M:%S", t)
    print(start_time)

    epoch_loss = 0
    epoch_accuracy = 0

    epoch_dataset_length.append(len(dataset))

    model.train()

    for idx, (sample,label) in enumerate(dataset):
       
       current_samples = sample
       current_labels = label

       optimizer.zero_grad()

       predicted_labels = model(current_samples).permute(0,2,1)
      
       loss = criterion(predicted_labels, current_labels)
       accuracy = train_accuracy(predicted_labels, current_labels)

       loss.backward()
       optimizer.step()

       epoch_loss += loss.item()
       epoch_accuracy += accuracy

    return epoch_loss/len(dataset), epoch_accuracy/sum(epoch_dataset_length)

##########################################################################################
################################ 06. NN Model Eval Definition ############################
def evaluate(model, dataset, criterion):
    
    # start_time = time.time()
    # print(start_time)

    t = time.localtime()
    start_time = time.strftime("%H:%M:%S", t)
    print(start_time)

    epoch_loss = 0
    epoch_accuracy = 0
    model.eval()

    with torch.no_grad():

        for idx, (sample,label) in enumerate(dataset):
            current_samples = sample
            current_labels = label

            predicted_labels = model(current_samples).permute(0,2,1)

            loss = criterion(predicted_labels, current_labels)
            accuracy = train_accuracy(predicted_labels, current_labels)

            epoch_loss += loss.item()
            epoch_accuracy += accuracy

    return epoch_loss/len(dataset), epoch_accuracy/len(dataset)

##########################################################################################

################################## 06. NN Model training #####################################
#N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    print(f"Epoch #: {epoch}")
    epoch_dataset_length = []
    #train the model
    train_loss, train_acc = train(model, train_dataset, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, test_dataset, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print("-------------------------------------------------------------------")
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print("-------------------------------------------------------------------")

# modelpath = "notebooks"
# torch.save(model.state_dict(), os.path.join(modelpath, "PennPOSmodel.pth"))
torch.save(model.state_dict(),"/content/PennPOSmodel.pth")


01. Preparing train/test datasets:
STEP 01: Look-up Tables...
preparing look-up dictionaries
preparing train/test/valid datasets
done
done!
Total words in the dictionary: 46349 
Total POS tags in the dictionary: 46
dictionaries ready!
STEP 02: Fetching the dataset...
done!
STEP 03: Tokens and tags to Numbers...
STEP 01: Look-up Tables...
preparing look-up dictionaries
preparing train/test/valid datasets
done
done!
Total words in the dictionary: 46349 
Total POS tags in the dictionary: 46
dictionaries ready!
STEP 02: Fetching the dataset...
done!
STEP 03: Tokens and tags to Numbers...
datasets ready!
02. Loading Model Parameters:
preparing look-up dictionaries
preparing train/test/valid datasets
done
done!
Total words in the dictionary: 46349 
Total POS tags in the dictionary: 46
Size of vocabulary: 46350	Number of classes: 46
Step 02. builing the model...
Done! here is our model:
RNNPOSTagger(
  (embedding): Embedding(46350, 100, padding_idx=45)
  (lstm): LSTM(100, 64, num_layers=2, ba

In [146]:
############################################################################################
################################## 07. Model Predictions #####################################
from typing import Tuple, List
# from model import RNNPOSTagger
# from dataset import WSJDataset, vocabulary
# from fetchDataset import myDataset
# from prepareDictionary import PennTreeBankDictionary
ds2 = PennTreeBankDictionary()
valid_ds = myDataset
import torch
from torch.utils.data import DataLoader
import numpy as np
import pickle,gzip

############################### 01. Look-up dictionaries ####################################
word_to_idx, idx_to_word, pos_to_idx, idx_to_pos = ds2.vocabulary()

validation_dataset = DataLoader(dataset=valid_ds("PennTreeBankValid"),batch_size=16,shuffle=False)

# read this seq2seq model: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html --> for understanding embedding dimension and output dimension  
# VOCAB_SIZE = len(word_to_idx)+1
# EMBED_DIM = 100
# HIDDEN_DIM = 64
# NUM_LAYERS = 1
# NUM_OF_CLASSES = len(pos_to_idx)+1
# N_EPOCHS = 5
# LEARNING_RATE = 0.005
# BATCH_SIZE = 64

VOCAB_SIZE = len(word_to_idx)+1
EMBED_DIM = 100
HIDDEN_DIM = 64
NUM_LAYERS = 2
NUM_OF_CLASSES = len(pos_to_idx)
N_EPOCHS = 20
LEARNING_RATE = 0.06
BATCH_SIZE = 32

print(f"Our vocab size to the model is therefore: {VOCAB_SIZE}")
################################### 02. NN Model  ########################################

print("Step 02. builing the model...")
model = RNNPOSTagger(embedding_dimension= EMBED_DIM,
                            vocabulary_size=VOCAB_SIZE,
                            hidden_dimension=HIDDEN_DIM,
                            num_of_layers=NUM_LAYERS,
                            dropout=0.1,
                            output_dimension=NUM_OF_CLASSES)
print("----------------------------------------------------------------")
print("Done! here is our model:")
print(model)
print("----------------------------------------------------------------")

################################## 03. load trained model ###############################
model.load_state_dict(torch.load("/content/PennPOSmodel.pth"))
model.eval()

################################## 03. Predictions ###############################
print("Lets make predictions")

def token_pipeline(x):
    
    if len(x) < 50:
        for i in range(1,60-len(x)):
            x.append('PADDING')
    return [word_to_idx[tok] for tok in x]

def token_reverse_pipeline(x):
    return [idx_to_word[idx] for idx in x]

def pos_reverse_pipeline(x):
    return [idx_to_pos[idx] for idx in x]

def pos_pipeline(x):
    return [pos_to_idx[pos] for pos in x]
##############################################################################################
def predict_full_validation_dataset(example_sentence) -> Tuple[List, List]:
    sentence_to_tensor = example_sentence.unsqueeze(1).T
    with torch.no_grad():
        output = model(sentence_to_tensor)
        predicted_output = torch.argmax(output, dim=2)
        example_predicted_labels = pos_reverse_pipeline(predicted_output.tolist()[0])
        example_sentence_words = token_reverse_pipeline(sentence_to_tensor.tolist()[0])

    # return example_predicted_labels
    return example_sentence_words,example_predicted_labels
###############################################################################################
def predict_example(example_sentence, example_actual_labels):

    

    # preprocessing:-
    sentence_to_token = token_pipeline(example_sentence)
    sentence_to_tensor = torch.tensor(sentence_to_token).unsqueeze(1).T

    # predicted labels:-
    with torch.no_grad():
        output = model(sentence_to_tensor)
        predicted_output = torch.argmax(output, dim=2)
        print(predicted_output)
        #-------------
        print(pos_pipeline(example_actual_labels))
        # print(predicted_output.tolist()[0][:-1])
        print(predicted_output.tolist()[0][:len(example_actual_labels)])

        example_predicted_labels = pos_reverse_pipeline(predicted_output.tolist()[0])
        print("-"*100)
        print(f"Actual lables:- \n{example_actual_labels}")
        print(f"Predicted lables:- \n{example_predicted_labels[:len(example_actual_labels)]}")
        print("-"*100)
    # return example_predicted_labels[:len(example_actual_labels)]
###################################################################################################
example = [['This', 'time', ',', 'the', 'firms', 'were', 'ready', '.'],
            ['We', "'re", 'about', 'to', 'see', 'if', 'advertising', 'works', '.']]
example_labels = [['DT', 'NN', ',', 'DT', 'NNS', 'VBD', 'JJ', '.'],
                ['PRP', 'VBP', 'IN', 'TO', 'VB', 'IN', 'NN', 'VBZ', '.']]

predict_example(example_sentence=example[0],example_actual_labels=example_labels[0])
print("EXAMPLE 2")
predict_example(example_sentence=example[1],example_actual_labels=example_labels[1])

##################################################################################################
# print("Composing the result of first nn network to POS tag the dataset:")
# all_results = []
# for idx, (sample, label) in enumerate(validation_dataset):
#     for item in sample:
#         all_results.append(predict_full_validation_dataset(item))
# # print(all_results[:2])
# with gzip.open('C:/Users/rahin/projects/WSJ-POS-tagger/data/interim/validation_dataset_pos_tagged.pklz', 'wb') as f:
#     pickle.dump(all_results, f)
#     f.close()
# print("done!")
#########################################################################################    




preparing look-up dictionaries
preparing train/test/valid datasets
done
done!
Total words in the dictionary: 46349 
Total POS tags in the dictionary: 46
STEP 01: Look-up Tables...
preparing look-up dictionaries
preparing train/test/valid datasets
done
done!
Total words in the dictionary: 46349 
Total POS tags in the dictionary: 46
dictionaries ready!
STEP 02: Fetching the dataset...
done!
STEP 03: Tokens and tags to Numbers...
Our vocab size to the model is therefore: 46350
Step 02. builing the model...
----------------------------------------------------------------
Done! here is our model:
RNNPOSTagger(
  (embedding): Embedding(46350, 100, padding_idx=45)
  (lstm): LSTM(100, 64, num_layers=2, batch_first=True, dropout=0.1)
  (fc): Linear(in_features=64, out_features=46, bias=True)
  (activation_fn): LogSoftmax(dim=1)
)
----------------------------------------------------------------
Lets make predictions
tensor([[40, 28, 43, 18, 14,  4, 29, 44, 35, 35, 35, 45, 45, 45, 45, 45, 45, 45,

In [147]:
# result comparison
all_actual_labels, all_predicted_labels = [],[] 

def pos_reverse_pipeline(x):
    return [idx_to_pos[idx] for idx in x]

for idx, (sample,label) in enumerate(validation_dataset):
  for sam in sample:
    all_predicted_labels.append(predict_full_validation_dataset(sam)[1])
  for lab in label:
    all_actual_labels.append(pos_reverse_pipeline(lab.tolist()))





In [149]:
def train_accuracy(preds, y):

    # print(len(preds)) #i get 10 samples
    accuracy_of_all_lines = []

    
    
    for pred,act in zip(preds,y):
        
        counter = 0
        
        for itemx,itemj in zip(pred,act):
          
          
          if itemx == itemj:
              counter = counter+1
        accuracy_of_this_line = counter/50

        accuracy_of_all_lines.append(accuracy_of_this_line)
    
    # print(accuracy_of_all_lines)
    acc = sum(accuracy_of_all_lines)/len(preds)

    return acc*100
train_accuracy(all_predicted_labels,all_actual_labels)

79.32258064516083