<h1> <div align="center"> Compositional Neural Network </div></h1>

In [2]:
import torch
from datasetConLL2003 import SlidingWindowDataset


In [3]:
ds = SlidingWindowDataset("C:/Users/rahin/projects/paper-draft-03/data/raw/ConLL2003-bioes-valid.txt")
x1, x2, y = ds.load_dictionaries()

loading dictionaries...
done
Performing dataset pre-processing activities...
loading dataset: ConLL2003-bioes-valid.txt
done
Parsing the dataset now...
converting tokens to indices to tensors
done
Length matches! Hurray!
loading dictionaries...
done


In [4]:
x2.keys() #VBG, -X-, (

dict_keys(['VBZ', '"', 'WDT', 'DT', 'VBG', 'MD', 'NN', 'WP$', 'NNS', 'PDT', 'WP', '-X-', 'WRB', 'NNPS', 'PRP', 'JJ', 'SYM', '(', 'PRP$', 'EX', 'VBD', "''", 'POS', 'RP', ')', ',', 'VBN', 'CD', 'JJS', 'LS', 'NNP', 'TO', 'VB', 'VBP', ':', 'JJR', 'RB', 'CC', '.', 'RBR', '$', 'FW', 'NN|SYM', 'UH', 'RBS', 'IN', 'PADDING'])

In [3]:
sentence = "Leicestershire beat Somerset by an innings and 39 runs"
pos = "VB NNP IN DT NN CC CD NNS ."
tok_in_sent, pos_in_sent = sentence.split(' '), pos.split(' ')

for padding in range(50-len(sentence.split(' '))):
    tok_in_sent.append('PADDING')
    pos_in_sent.append('PADDING')



In [4]:
sent_with_tags_idx = []
for word,pos in zip(tok_in_sent, pos_in_sent):
    features = x1[word]+x2[pos]
    sent_with_tags_idx.append(features)

sent_with_tags_tensor = torch.tensor(sent_with_tags_idx, dtype=torch.int64)


In [6]:
sent_with_tags_tensor

tensor([30003, 20574, 12800, 23952,  9699, 13617, 29459, 25035,  1505, 30338,
        30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338,
        30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338,
        30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338,
        30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338, 30338])

# Predict tags

In [5]:
from modelConLL2003 import RNNBIOESTagger
from datasetConLL2003 import SlidingWindowDataset
import torch
from torch.utils.data import DataLoader
import numpy as np

# read this seq2seq model: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html --> for understanding embedding dimension and output dimension  
VOCAB_SIZE = len(x1)+len(x2)+2
EMBED_DIM = 100
HIDDEN_DIM = 64
NUM_LAYERS = 2
NUM_OF_CLASSES = len(y)+1
N_EPOCHS = 10
LEARNING_RATE = 0.01
BATCH_SIZE = 32

print(f"Our vocab size to the model is therefore: {VOCAB_SIZE}")
################################### 02. NN Model  ########################################

print("Step 02. builing the model...")
model = RNNBIOESTagger(embedding_dimension= EMBED_DIM,
                            vocabulary_size=VOCAB_SIZE,
                            hidden_dimension=HIDDEN_DIM,
                            num_of_layers=NUM_LAYERS,
                            dropout=0.2,
                            output_dimension=NUM_OF_CLASSES)
print("----------------------------------------------------------------")
print("Done! here is our model:")
print(model)
print("----------------------------------------------------------------")


model.load_state_dict(torch.load("C:/Users/rahin/projects/paper-draft-03/notebooks/conLLmodel.pth"))
model.eval()

idx_to_BIOES = {}
print("Lets make predictions")

validation_dataset = DataLoader(dataset=SlidingWindowDataset("C:/Users/rahin/projects/paper-draft-03/data/raw/ConLL2003-bioes-valid.txt"),
                                batch_size=64,
                                shuffle=True)

for key, value in y.items():
    idx_to_BIOES[value] = key

# print(idx_to_BIOES)

def predict(sentence, model):

    # token idx to tensor conversion

    idx_to_torch01 = torch.tensor(sentence, dtype=torch.int64)
    idx_to_torch = idx_to_torch01.unsqueeze(1).T


    with torch.no_grad():
        output = model(idx_to_torch)
        predicted_ouput=torch.argmax(output,dim=2)
        
        predicted_labels = []

        for pred in predicted_ouput:
            for i in pred:
                predicted_labels.append(idx_to_BIOES[int(i)])

        return output, predicted_labels

model = model.to("cpu")

Our vocab size to the model is therefore: 30340
Step 02. builing the model...
----------------------------------------------------------------
Done! here is our model:
RNNBIOESTagger(
  (embedding): Embedding(30340, 100)
  (lstm): LSTM(100, 64, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=64, out_features=35, bias=True)
  (activation_fn): Tanh()
)
----------------------------------------------------------------
Lets make predictions
loading dictionaries...
done
Performing dataset pre-processing activities...
loading dataset: ConLL2003-bioes-valid.txt
done
Parsing the dataset now...
converting tokens to indices to tensors
done
Length matches! Hurray!


In [8]:
probs, preds = predict(sent_with_tags_tensor, model)

  idx_to_torch01 = torch.tensor(sentence, dtype=torch.int64)


In [12]:
preds_only_tokens = []

for tag in preds:
    if tag !="PADDING":
        preds_only_tokens.append(tag)
preds_only_tokens
preds_only_tokens = ['B-VP', 'B-NP', 'I-NP', 'E-NP', 'B-VP', 'B-NP', 'B-VP', 'E-VP', 'B-NP'] #corrected

['B-VP', 'B-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'B-VP', 'I-VP', 'B-NP']