# POS and BIOES NN model stacking experiments:
# (0) Library imports:-

In [5]:
from POSTaggerModel import RNNPOSTagger
from ConLLBIOESTaggerModel import RNNBIOESTagger
import torch
import pickle, gzip
from typing import Tuple, List

# (1) Step 1.1: Load all 8 dictionaries:-

In [2]:
# (1.1) POS dictionary from neural network #1:

def load_pickles(filename):
    data_raw_filepath = "C:/Users/rahin/projects/nlp-pos-bioes-tagging/data/raw/"

    if "pklz" in filename:     
        file = gzip.open(data_raw_filepath+str(filename),"rb")
    else:
        # print(f"loading: {data_raw_filepath+str(filename)}")
        file = open(data_raw_filepath+str(filename),"rb")

    return pickle.load(file)

list_of_lookup_tables = ["nn1_wsj_idx_to_pos.pkl",
                         "nn1_wsj_pos_to_idx.pkl",
                         "nn2_ConLL2003_pos_tags.pkl",
                         "nn1_wsj_word_to_idx.pkl",
                         "nn1_wsj_idx_to_word.pkl",
                         "nn2_ConLL2003_vocabulary.pkl",
                         "nn2_ConLL2003_BIOES_tags.pkl"]
# (a) Vocabulory: 
nn1_vocab = load_pickles(list_of_lookup_tables[3])
nn1_vocab_reverse = load_pickles(list_of_lookup_tables[4])
nn2_vocab = load_pickles(list_of_lookup_tables[5])
nn2_vocab_reverse = {}
for key,value in nn2_vocab.items():
    nn2_vocab_reverse[value] = key

# (b) POS tags:
nn1_pos_tags = load_pickles(list_of_lookup_tables[0])
nn1_pos_tags_reverse = load_pickles(list_of_lookup_tables[1])
nn2_pos_tags_reverse = load_pickles(list_of_lookup_tables[2])
nn2_pos_tags = {}
for key,value in nn2_pos_tags_reverse.items():
    nn2_pos_tags[value] = key

# (c) BIOES tags:
nn2_bioes_tags_reverse = load_pickles(list_of_lookup_tables[6])
nn2_bioes_tags = {}
for key,value in nn2_bioes_tags_reverse.items():
    nn2_bioes_tags[value] = key


In [16]:
# checking Vocab
# print(list(nn1_vocab.keys())[:10])
# print(list(nn1_vocab_reverse.keys())[:10])
print(list(nn2_vocab.keys())[:10])
# print(list(nn2_vocab_reverse.keys())[:10])

# checking POS tags
# print(list(nn1_pos_tags.keys())[:10])
# print(list(nn1_pos_tags_reverse.keys())[:10])
# print(list(nn2_pos_tags.keys())[:10])
print(list(nn2_pos_tags_reverse.keys())[:10])

# checking BIOES tags
# print(list(nn2_bioes_tags.keys())[:10])
print(list(nn2_bioes_tags_reverse.keys())[:10])

['Forecasts', '31.80', 'Johns', 'concluding', 'Seizinger', 'extraordinary', 'sorghum', 'shorter', 'adversories', 'Kamiel']
['CC', 'NNPS', '$', 'VBP', 'VB', 'FW', 'RBR', 'PDT', 'LS', 'JJR']
['I-LST', 'E-PRT', 'I-CONJP', 'E-ADJP', 'I-ADVP', 'E-INTJ', 'E-PP', 'B-CONJP', '-X-', 'B-ADVP']


# (1) Step 1.2: Sync POS tags from both corpus': 

In [3]:
# (a) Check what needs to be replaced:
in_penn_not_in_conll, in_conll_not_in_penn = [], []
penn_tags = list(nn1_pos_tags_reverse.keys())
conll_tags = list(nn2_pos_tags_reverse.keys())
for item in penn_tags:
    if item not in conll_tags:
        in_penn_not_in_conll.append(item)
for item in conll_tags:
    if item not in penn_tags:
        in_conll_not_in_penn.append(item)

print(f"Tag in Penn but not in Conll:\n{in_penn_not_in_conll}")
print(f"Tag in Conll but not in Penn:\n{in_conll_not_in_penn}")

############################################################################

# (b) Change in Tags -> IDX table
nn2_pos_tags_reverse['-LRB-'] = nn2_pos_tags_reverse.pop('(')
nn2_pos_tags_reverse['-RRB-'] = nn2_pos_tags_reverse.pop(')')
nn2_pos_tags_reverse['``'] = nn2_pos_tags_reverse.pop('"')
nn2_pos_tags_reverse['#'] = nn2_pos_tags_reverse.pop('-X-')
nn2_pos_tags_reverse['-NONE-'] = nn2_pos_tags_reverse.pop('NN|SYM')

print(nn1_pos_tags_reverse.keys())
print(nn2_pos_tags_reverse.keys())

# (c) Change in Idx -> Tags table
nn2_pos_tags['11'] = '#'
nn2_pos_tags['3'] = '-RRB-'
nn2_pos_tags['33'] = '-LRB-'
nn2_pos_tags['45'] = '-NONE'
nn2_pos_tags['29'] = '``'


nn2_pos_tags.values()

Tag in Penn but not in Conll:
['``', '-RRB-', '#', '-LRB-']
Tag in Conll but not in Penn:
[')', '-X-', '"', '(', 'NN|SYM']
dict_keys(['NN', 'UH', 'PDT', ',', 'CD', '``', 'JJS', 'FW', 'RBS', 'WRB', 'WP$', 'NNPS', 'NNP', 'VBG', 'VBP', 'WP', 'NNS', 'VBD', ':', 'DT', 'CC', 'RB', 'RBR', '-RRB-', '.', "''", 'VB', 'PRP', 'EX', 'VBZ', 'IN', 'JJ', 'JJR', 'SYM', 'WDT', 'TO', 'POS', 'VBN', 'MD', 'RP', 'PRP$', '#', '-LRB-', 'LS', '$', 'PADDING'])
dict_keys(['CC', 'NNPS', '$', 'VBP', 'VB', 'FW', 'RBR', 'PDT', 'LS', 'JJR', 'NNP', '.', 'WP$', ':', 'DT', 'NN', 'VBZ', 'EX', 'MD', 'RP', 'UH', 'VBD', 'PRP$', ',', 'WRB', 'TO', 'RBS', 'WDT', 'JJS', 'POS', 'VBG', 'PRP', 'WP', 'IN', 'NNS', 'SYM', "''", 'VBN', 'JJ', 'CD', 'RB', 'PADDING', '-LRB-', '-RRB-', '``', '#', '-NONE-'])


dict_values(['CC', 'NNPS', '$', ')', 'VBP', 'VB', 'FW', 'RBR', 'PDT', 'LS', 'JJR', '-X-', 'NNP', '.', 'WP$', ':', 'DT', 'NN', 'VBZ', 'EX', 'MD', 'RP', 'UH', 'VBD', 'PRP$', ',', 'WRB', 'TO', 'RBS', '"', 'WDT', 'JJS', 'POS', '(', 'VBG', 'PRP', 'WP', 'IN', 'NNS', 'SYM', "''", 'VBN', 'JJ', 'CD', 'RB', 'NN|SYM', 'PADDING', '#', '-RRB-', '-LRB-', '-NONE', '``'])

In [4]:
# nn1_vocab # word to idx
# nn1_pos_tags_reverse # Tags to idx
len(nn1_pos_tags_reverse)

47

# (2) Step 2: Load pre-trained model weights:-

In [4]:
# model 1: Hyperparameters:
POS_VOCAB_SIZE = len(nn1_vocab)+1
POS_EMBED_DIM = 100
POS_HIDDEN_DIM = 32
POS_NUM_LAYERS = 2
POS_NUM_OF_CLASSES = len(nn1_pos_tags_reverse)
POS_N_EPOCHS = 10
POS_LEARNING_RATE = 0.02
POS_BATCH_SIZE = 128
# model 1: POS tagger
pos_tagger_model = RNNPOSTagger(embedding_dimension = POS_EMBED_DIM,
                    vocabulary_size = POS_VOCAB_SIZE,
                    hidden_dimension = POS_HIDDEN_DIM,
                    num_of_layers = POS_NUM_LAYERS,
                    dropout = 0.1,
                    output_dimension = POS_NUM_OF_CLASSES)
pos_tagger_model.load_state_dict(torch.load("C:/Users/rahin/projects/nlp-pos-bioes-tagging/data/processed/PennPOSmodel.pth"))  

###########################################################################################
# model 2: Hyperparameters:
BIOES_VOCAB_SIZE = len(nn2_vocab)+len(nn2_pos_tags_reverse)+2
BIOES_EMBED_DIM = 100
BIOES_HIDDEN_DIM = 64
BIOES_NUM_LAYERS = 2
BIOES_NUM_OF_CLASSES = len(nn2_bioes_tags_reverse)+1
BIOES_N_EPOCHS = 10
BIOES_LEARNING_RATE = 0.01
BIOES_BATCH_SIZE = 32

################################### 02. NN Model  ########################################

bioes_tagger_model = RNNBIOESTagger(embedding_dimension= BIOES_EMBED_DIM,
                            vocabulary_size=BIOES_VOCAB_SIZE,
                            hidden_dimension=BIOES_HIDDEN_DIM,
                            num_of_layers=BIOES_NUM_LAYERS,
                            dropout=0.2,
                            output_dimension=BIOES_NUM_OF_CLASSES)
bioes_tagger_model.load_state_dict(torch.load("C:/Users/rahin/projects/nlp-pos-bioes-tagging/data/processed/ConLLBIOESmodel.pth"))                              

<All keys matched successfully>

In [7]:
validation_dataset = load_pickles("PennTreeBankValid.pklz")
# validation_dataset[0] #everything is in text format

# 3. Validation Dataset through nn 1 and 2:- 
## (3.1) Functions that converts Token & Tags --> Index and Index --> Token & Tags:- 

In [34]:
def token_pipeline(x):
    
    if len(x) < 50:
        for i in range(0,50-len(x)):
            x.append('PADDING')
    return [nn1_vocab[tok] for tok in x]

def token_reverse_pipeline(x):
    return [nn1_vocab_reverse[idx] for idx in x]

def pos_reverse_pipeline(x):
    return [nn1_pos_tags[idx] for idx in x]

def pos_pipeline(x):

    if len(x) <50:
        for i in range(0,50-len(x)):
            x.append('PADDING')
    return [nn1_pos_tags_reverse[pos] for pos in x]
#######################################################################################################

## (3.2) Function returns predicted tags of the validation dataset:-

In [111]:
def predict_dataset(input_dataset) -> Tuple[List, List]:

    # convert text to numbers after expanding sentence length to 50 elements
    sentence_to_idx = token_pipeline(input_dataset) 
    idx_to_tensor = torch.tensor(sentence_to_idx).unsqueeze(1).T #input sentence as tensors {A}
    

    with torch.no_grad():
        output = pos_tagger_model(idx_to_tensor)
        predicted_output_tensor = torch.argmax(output, dim=2) # predicted labels as tensors {B}
        predicted_output = predicted_output_tensor.squeeze(1).tolist()[0]

    predicted_output_tags = pos_reverse_pipeline(predicted_output)
        
    # return idx_to_tensor, predicted_output # return Tuple({A},{B})
    return predicted_output_tags, input_dataset

# tag the validation dataset:
predictions_pos_tagger = []

for idx, (sentence,label) in enumerate(validation_dataset):
    predictions_pos_tagger.append(predict_dataset(sentence))

## (3.3) File 1: POS Tags | Sentence tree construction:-

In [160]:
def write_line_to_file(newline):
    with open(r"C:\Users\rahin\projects\nlp-pos-bioes-tagging\data\interim\pos_tags_tree.txt","a") as f:
        f.write(newline+'\n')
        f.close()

for idx, samples in enumerate(predictions_pos_tagger):
    pos_tags_final = predictions_pos_tagger[idx][0]
    tokens_final = predictions_pos_tagger[idx][1]
    line = ""
    for item in zip(pos_tags_final, tokens_final):
        line = line + str(item) + ' '

    write_line_to_file(line)
# f.close()
    
