## CS310 Natural Language Processing
## Assignment 4. Dependency Parsing

**Total points**: 50

In this assignment, you will train feed-forward neural network-based dependency parser and evaluate its performance on the provided treebank dataset.

### 0. Import Necessary Libraries

In [1]:
import torch.nn as nn
from dep_utils import conll_reader, DependencyTree
import copy
from pprint import pprint
from collections import Counter, defaultdict
from typing import List, Dict, Tuple
import numpy as np

### 1. Read Data and Generate Training Instances

In [2]:
print('In train.conll:')
with open('fake_data/valid_train.conll') as f:
    train_trees = list(conll_reader(f))
print(f'{len(train_trees)} trees read.')

print('In dev.conll:')
with open('fake_data/valid_dev.conll') as f:
    dev_trees = list(conll_reader(f))
print(f'{len(dev_trees)} trees read.')

print('In test.conll:')
with open('fake_data/valid_test.conll') as f:
    test_trees = list(conll_reader(f))
print(f'{len(test_trees)} trees read.')

In train.conll:
92 trees read.
In dev.conll:
65 trees read.
In test.conll:
60 trees read.


In [3]:
# Re-use the code from Lab 7
class RootDummy(object):
    def __init__(self):
        self.head = None
        self.id = 0
        self.deprel = None
    def __repr__(self):
        return "<ROOT>"

class State(object):
    def __init__(self, sentence=[],words=[]):
        self.stack = []
        self.buffer = []
        self.stack_words = []
        self.buffer_words = []
        if sentence:
            self.buffer = list(reversed(sentence))
        if words:
            self.buffer_words = list(reversed(words))
        self.deps = set()

    def shift(self):
        ### START YOUR CODE ###
        if self.buffer:
            front_of_buffer=self.buffer[-1]
            self.buffer.pop()
            self.stack.append(front_of_buffer)

            front_of_buffer_word=self.buffer_words[-1]
            self.buffer_words.pop()
            self.stack_words.append(front_of_buffer_word)

        ### END YOUR CODE ###

    def left_arc(self, label: str):
        assert len(self.stack) >= 2
        ### START YOUR CODE ###

        s1=self.stack[-1]
        s2 = self.stack[-2]  

        s1_word=self.stack_words[-1]
        s2_word = self.stack_words[-2]  
        

        self.stack.pop(-2)
        self.stack_words.pop(-2)

        self.deps.add((s1, s2, label))
        ### END YOUR CODE ###

    def right_arc(self, label: str):
        assert len(self.stack) >= 2
        ### START YOUR CODE ###
        s1=self.stack[-1]
        s2 = self.stack[-2] 

        s1_word=self.stack_words[-1]
        s2_word = self.stack_words[-2] 
        
        self.stack.pop()
        self.stack_words.pop()

        self.deps.add((s2, s1, label))
        ### END YOUR CODE ###

    def __repr__(self):
        return "({},{},{},{},{})".format(self.stack, self.buffer, self.deps,self.stack_words,self.buffer_words)

def get_training_instances(dep_tree) -> List[Tuple[State, Tuple[str, str]]]:
    deprels = dep_tree.deprels

    word_ids = list(deprels.keys())
    words_list=[]
    words = list(deprels.values())
    for i in words:
        words_list.append((i.word,i.pos))
    state = State(word_ids,words_list)
    state.stack.append(0) # ROOT
    state.stack_words.append(("<ROOT>",'<ROOT>')) # ROOT

    childcount = defaultdict(int)
    for _, rel in deprels.items():
        childcount[rel.head] += 1

    seq = []


    dep_relation=[]
    for i in range(1,len(deprels)+1):
        dep_relation.append((deprels[i].head,i))
    

    while len(state.buffer) > 0 or len(state.stack) > 1:
    

        if state.stack[-1] == 0:
            seq.append((copy.deepcopy(state), ("shift", None)))
            state.shift()
            continue
        
        stack_top1 = deprels[state.stack[-1]]
        if state.stack[-2] == 0:
            stack_top2 = RootDummy()
        else:
            stack_top2 = deprels[state.stack[-2]]

        ### START YOUR CODE ###

        if (int(state.stack[-1]), int(state.stack[-2])) in dep_relation:
            relation = stack_top2.deprel
            
            action = "left_arc"
            
            seq.append((copy.deepcopy(state), (action, relation)))
            childcount[state.stack[-1]] -= 1 
            state.left_arc(relation)
        elif (int(state.stack[-2]), int(state.stack[-1])) in dep_relation and childcount[state.stack[-1]] == 0:
            relation = stack_top1.deprel
            
            action = "right_arc"
            
            seq.append((copy.deepcopy(state), (action, relation)))
            childcount[state.stack[-2]] -= 1
            state.right_arc(relation)
        else:
            
            seq.append((copy.deepcopy(state), ("shift", None)))
            state.shift()
            
        ### END YOUR CODE ###
    
    seq.append((copy.deepcopy(state), ("done", None)))

    return seq


In [4]:

words_set=[]
tags_set=[]
actions_set=[]
for tree in train_trees:
    words_set.extend(tree.words())
    tags_set.extend(tree.pos())
    
    for i in tree.deprels:
        actions_set.append(tree.deprels[i].deprel)

words_set=set(words_set)
tags_set=set(tags_set)
actions_set=set(actions_set)

#TODO 记得改回去
actions_set=list({'acl', 'mark', 'nmod:npmod', 'advcl', 'expl', 'cop', 'iobj', 'root', 'amod', 'xcomp', 'det:predet', 'auxpass', 'appos', 'parataxis', 'nsubjpass', 'case', 'discourse', 'nmod:poss', 'neg', 'csubjpass', 'cc', 'punct', 'mwe', 'conj', 'dobj', 'nummod', 'aux', 'det', 'compound:prt', 'csubj', 'acl:relcl', 'ccomp', 'dep', 'nmod:tmod', 'advmod', 'nmod', 'nsubj', 'compound', 'cc:preconj'})


def create_action_vocab():
    action_vocab=[]
    # Define the list of transition actions
    # actions = ['shift', 'left_arc', 'right_arc']  # Add any additional actions as needed
    for i in actions_set:
        if i !='root':
            action_vocab.append(('left_arc',i))
            action_vocab.append(('right_arc',i))
        else:
            action_vocab.append(('right_arc',i))

    
    action_vocab.append(('shift', None))
    
    return action_vocab

action_vocab=create_action_vocab()
print(len(words_set))
print(len(tags_set))
print(len(actions_set))
print(len(action_vocab))


987
40
39
78


### 1) Implement the Feature Extractor (10 points) 


In [5]:
DIMENSION=100
NUM_FEATURES = 12  # Assuming there are 12 features as stack and buffer words/tags

In [6]:
import numpy as np
from collections import Counter

class BagOfWordsEmbedding:
    def __init__(self, dimension):
        self.vocab = {}
        self.vocab_size = 0
        self.dimension = dimension

    def build_vocab(self, corpus):
        """Build vocabulary."""
        word_counts = Counter()
        for document in corpus:
            if document is not None:
                word_counts.update(document.split())
        # Add special tokens <NULL> and <ROOT>
        self.vocab = {'<NULL>': 0, '<ROOT>': 1}
        idx = 2
        for word, _ in word_counts.items():
            self.vocab[word] = idx
            idx += 1
        self.vocab_size = len(self.vocab)

    def text_to_bow_vector(self, text):
        """Convert text to a bag-of-words vector."""
        if text is None:
            return np.zeros(self.dimension)
        bow_vector = np.zeros(self.dimension)
        words = text.split()
        for word in words:
            if word in self.vocab:
                # Consider revising if dimension < vocab_size to avoid potential collisions
                bow_vector[self.vocab[word] % self.dimension] += 1
        return bow_vector



words_embedding = BagOfWordsEmbedding(DIMENSION)
words_embedding.build_vocab(words_set)
tags_embedding = BagOfWordsEmbedding(DIMENSION)
tags_embedding.build_vocab(tags_set)


word = "<ROOT>"
embedding1 = words_embedding.text_to_bow_vector(word)
print(embedding1)
word = "<NULL>"
embedding2 = tags_embedding.text_to_bow_vector(word)
print(embedding2)


[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [21]:
# import numpy as np

# class FeatureExtractor:
#     def __init__(self,embedding_size,words_embedding,tags_embedding):
#         self.embedding_size = embedding_size
#         self.word_embeddings = words_embedding # Dictionary to store word embeddings
#         self.tag_embeddings = tags_embedding  # Dictionary to store tag embeddings


#     def get_word_embedding(self, word):
#         # Return word embedding or a zero vector if word is not found
#         return self.word_embeddings.text_to_bow_vector(word)

#     def get_tag_embedding(self, tag):
#         # Return tag embedding or a zero vector if tag is not found
#         return self.tag_embeddings.text_to_bow_vector(tag)

#     def extract_features(self, stack, buffer):
 
#         features = []
#         stack_extended=[]
#         buffer_extended=[]
#         stack_pos_extended=[]
#         buffer_pos_extended=[]
#         # reversed(stack)
#         reversed(buffer)
        
#         # Get embeddings for the top 3 words on stack and buffer along with their POS tags
#         for i in range(3,0,-1):
           

#             if i > len(stack):
#                 stack_word, stack_tag = ("<NULL>", "<NULL>")  # Stack[-1] is the top word
#             else:
#                 stack_word, stack_tag = stack[ len(stack)-i]
#             if i > len(buffer):
#                 buffer_word, buffer_tag = ("<NULL>", "<NULL>")
#             else:
#                 buffer_word, buffer_tag = buffer[ len(buffer)-i]
#             stack_extended.append(stack_word)
#             stack_pos_extended.append(stack_tag)
#             buffer_extended.append(buffer_word)
#             buffer_pos_extended.append(buffer_tag)

#         feature_words=[]
#         for i in stack_extended:
#             feature_words.append(i)
#             features.extend(self.get_word_embedding(i))
#         for i in buffer_extended:
#             feature_words.append(i)
#             features.extend(self.get_word_embedding(i))
#         for i in stack_pos_extended:
#             feature_words.append(i)
#             features.extend(self.get_tag_embedding(i))
#         for i in buffer_pos_extended:
#             feature_words.append(i)
#             features.extend(self.get_tag_embedding(i))
    

#         return np.array(features),feature_words

# # Example usage
# fe = FeatureExtractor(DIMISION,words_embedding,tags_embedding)
# # fe.load_embeddings('word_embeddings.txt', 'tag_embeddings.txt')
# stack = [("<ROOT>", "<ROOT>"), ("the", "DT")]
# buffer = [("apple", "NN"), ("trees", "NNS"), ("grow", "VB")]
# features,feature_words = fe.extract_features(stack, buffer)
# print(features.shape)  # Output should be (300,)
# print(features)
# print(feature_words)


(600,)
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [7]:
import numpy as np

class FeatureExtractor:
    def __init__(self, embedding_size, words_embedding, tags_embedding):
        self.embedding_size = embedding_size
        self.word_embeddings = words_embedding
        self.tag_embeddings = tags_embedding

    def get_word_embedding(self, word):
        return self.word_embeddings.text_to_bow_vector(word)

    def get_tag_embedding(self, tag):
        return self.tag_embeddings.text_to_bow_vector(tag)

    def extract_features(self, stack, buffer):
        features = []
        stack_extended = []
        buffer_extended = []
        stack_pos_extended = []
        buffer_pos_extended = []

        # Ensure at least 3 elements by appending <NULL> tuples if necessary
        while len(stack) < 3:
            stack.append(("<NULL>", "<NULL>"))
        while len(buffer) < 3:
            buffer.append(("<NULL>", "<NULL>"))

        # Extract embeddings for stack and buffer elements along with their tags
        for i in range(3):
            stack_word, stack_tag = stack[-(i + 1)]
            buffer_word, buffer_tag = buffer[i]

            stack_extended.append(stack_word)
            stack_pos_extended.append(stack_tag)
            buffer_extended.append(buffer_word)
            buffer_pos_extended.append(buffer_tag)

        # Concatenate all embeddings
        for word in stack_extended + buffer_extended:
            features.extend(self.get_word_embedding(word))
        for tag in stack_pos_extended + buffer_pos_extended:
            features.extend(self.get_tag_embedding(tag))

        return np.array(features), stack_extended + buffer_extended + stack_pos_extended + buffer_pos_extended

# Example usage
fe = FeatureExtractor(DIMENSION, words_embedding, tags_embedding)
stack = [("<ROOT>", "<ROOT>"), ("the", "DT")]
buffer = [("apple", "NN"), ("trees", "NNS"), ("grow", "VB")]
features, feature_words = fe.extract_features(stack, buffer)
print(features.shape)  # Output should now be (600,) if each embedding is 50 in dimension
print(features)
print(feature_words)


(1200,)
[1. 0. 0. ... 0. 0. 0.]
['<NULL>', 'the', '<ROOT>', 'apple', 'trees', 'grow', '<NULL>', 'DT', '<ROOT>', 'NN', 'NNS', 'VB']


### 2) Implement the scoring function (5 points)

In [22]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F


# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class ScoringFunction(nn.Module):
#     def __init__(self, input_size, hidden_size, num_actions):
#         super(ScoringFunction, self).__init__()
#         # Input to hidden layer
#         self.hidden = nn.Linear(input_size, hidden_size)
#         # Hidden layer to output (scores for each action)
#         self.output = nn.Linear(hidden_size, num_actions)
#         # Optional: Initialize weights and biases here if needed

#     def forward(self, x):
#         # x is the feature vector phi(c)
#         x = self.hidden(x)
#         x = torch.tanh(x)  # Using tanh activation function as suggested
#         x = self.output(x)
#         return F.softmax(x, dim=-1)  # Apply softmax to convert to probabilities

# # Example usage
# input_size = DIMISION*12 # Size of the feature vector phi(c)
# hidden_size = 200  # As suggested in the prompt
# num_actions = len(action_vocab)  # Number of possible transition actions t

# # Create an instance of the scoring function
# mlp = ScoringFunction(input_size, hidden_size, num_actions)

# # Example feature vector (randomly generated for demonstration)
# phi_c = torch.randn(1, input_size)
# print(phi_c.size())
# # Compute scores for each action
# scores = mlp(phi_c)
# print("Scores for each action:", scores)
# print(scores.size())



torch.Size([1, 600])
Scores for each action: tensor([[0.0124, 0.0120, 0.0140, 0.0190, 0.0133, 0.0070, 0.0155, 0.0122, 0.0087,
         0.0166, 0.0151, 0.0102, 0.0156, 0.0187, 0.0133, 0.0092, 0.0127, 0.0117,
         0.0100, 0.0136, 0.0130, 0.0114, 0.0090, 0.0200, 0.0131, 0.0091, 0.0100,
         0.0088, 0.0127, 0.0118, 0.0169, 0.0159, 0.0114, 0.0152, 0.0078, 0.0119,
         0.0121, 0.0102, 0.0127, 0.0102, 0.0115, 0.0112, 0.0121, 0.0196, 0.0073,
         0.0135, 0.0132, 0.0272, 0.0174, 0.0112, 0.0138, 0.0128, 0.0127, 0.0107,
         0.0115, 0.0124, 0.0121, 0.0120, 0.0101, 0.0119, 0.0115, 0.0131, 0.0076,
         0.0107, 0.0153, 0.0089, 0.0132, 0.0112, 0.0164, 0.0132, 0.0134, 0.0190,
         0.0107, 0.0169, 0.0154, 0.0136, 0.0102, 0.0118]],
       grad_fn=<SoftmaxBackward0>)
torch.Size([1, 78])


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ScoringFunction(nn.Module):
    def __init__(self, input_size, hidden_size, num_actions):
        super(ScoringFunction, self).__init__()
        # Input to hidden layer
        self.hidden = nn.Linear(input_size, hidden_size)
        # Hidden layer to output (scores for each action)
        self.output = nn.Linear(hidden_size, num_actions)
        # Optional: Initialize weights and biases here if needed

    def forward(self, x):
        # x is the feature vector phi(c)
        x = self.hidden(x)  # Apply linear transformation
        x = torch.tanh(x)  # Using tanh activation function as suggested
        x = self.output(x)  # Apply second linear transformation
        return F.softmax(x, dim=-1)  # Apply softmax to convert logits to probabilities

# Example usage

input_size = DIMENSION * NUM_FEATURES  # Total size of the input feature vector
hidden_size = 200  # As suggested in the prompt
num_actions = len(action_vocab)  # Number of possible transition actions t (e.g., SHIFT, LEFT-ARC, RIGHT-ARC)

# Create an instance of the scoring function
mlp = ScoringFunction(input_size, hidden_size, num_actions)

# Example feature vector (randomly generated for demonstration)
phi_c = torch.randn(1, input_size)  # Ensure input is in the shape (batch_size, input_size)
print("Feature vector size:", phi_c.size())

# Compute scores for each action
scores = mlp(phi_c)
print("Scores for each action:", scores)
print("Scores size:", scores.size())


Feature vector size: torch.Size([1, 1200])
Scores for each action: tensor([[0.0098, 0.0132, 0.0109, 0.0083, 0.0081, 0.0123, 0.0082, 0.0099, 0.0143,
         0.0181, 0.0149, 0.0109, 0.0133, 0.0101, 0.0125, 0.0099, 0.0164, 0.0155,
         0.0099, 0.0132, 0.0209, 0.0117, 0.0235, 0.0116, 0.0154, 0.0112, 0.0131,
         0.0084, 0.0110, 0.0133, 0.0078, 0.0128, 0.0135, 0.0094, 0.0111, 0.0108,
         0.0213, 0.0120, 0.0109, 0.0179, 0.0151, 0.0106, 0.0158, 0.0194, 0.0112,
         0.0108, 0.0090, 0.0083, 0.0150, 0.0084, 0.0172, 0.0118, 0.0145, 0.0142,
         0.0186, 0.0086, 0.0121, 0.0094, 0.0117, 0.0111, 0.0130, 0.0087, 0.0147,
         0.0176, 0.0071, 0.0107, 0.0128, 0.0271, 0.0143, 0.0180, 0.0102, 0.0086,
         0.0082, 0.0115, 0.0125, 0.0148, 0.0112, 0.0188]],
       grad_fn=<SoftmaxBackward0>)
Scores size: torch.Size([1, 78])


### 3) Implement the Training Step (15 points)

In [9]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class Parser(nn.Module):
#     def __init__(self, embedding_size, hidden_size, num_actions):
#         super(Parser, self).__init__()
#         # Assuming FeatureExtractor and ScoringFunction are properly defined to handle batched input
#         self.feature_extractor = FeatureExtractor(embedding_size,words_embedding,tags_embedding)  # Assuming FeatureExtractor is already defined
#         self.scoring_function = ScoringFunction(embedding_size * 12, hidden_size, num_actions)  # Adjust input size

#     def forward(self, X):

#         scores_list = []
#         for instance in X:
#                 # Compute scores using the scoring function for the current instance
#             cur_scores = self.scoring_function(instance)
                
#                 # Append the current scores to the list
#             scores_list.append(cur_scores)
            
#             # Stack the list of scores along a new dimension
#         scores = torch.stack(scores_list)
#         return scores

#     def compute_loss(self, predicted_scores, y):
#         # Compute cross-entropy loss between predicted scores and ground truth y
#         loss = F.cross_entropy(predicted_scores, y)
#         return loss
    
   
#     def parse_sentence(self, sentence):
#         stack = [( '<ROOT>', '<ROOT>')]  # Include index and label for the root
#         buffer = list(reversed(sentence) ) # Include index and placeholder for label
#         tree = [{} for _ in sentence]  # Initialize the tree with empty dicts for each word
#         location=list(sentence)
#         location.insert(0,('<ROOT>', '<ROOT>'))
#         while len(buffer) > 0 or len(stack) > 1:
#             print("buffer",len(buffer))
#             print("stack",len(stack))
            
#             # Get current configuration
#             current_config = (stack, buffer)

#             # Extract features based on the current configuration
#             features, _ = self.feature_extractor.extract_features(stack, buffer)

#             # Convert features to a torch tensor
#             feature_tensor = torch.tensor(features, dtype=torch.float32)
#             scores = self.scoring_function(feature_tensor)
#             # Predict scores for all transitions
#             if len(buffer) > 0:
#                 value,indices=torch.topk(scores,1)
#                 index=indices[0]
               
#             else:
#                 for i in range(0,len(action_vocab)):
#                     value,indices=torch.topk(scores,i+1)
#                     index=indices[i]
#                     if action_vocab[index][0]!='shift':
#                         break

#             best_transition=action_vocab[index]
#             print(best_transition)

#             if best_transition[0] == 'shift':
                
#                     front_of_buffer=buffer[-1]
#                     buffer.pop()
#                     stack.append(front_of_buffer)

#             elif best_transition[0] == 'left_arc':

#                 s1=stack[-1]
#                 s2 = stack[-2] 

#                 stack.pop(-2)

#                 tree[location.index(s1) - 1] = {
#                     'deprel': best_transition[1],
#                     'head': (location.index(s1), location.index(s2))
#                 }

                
#             elif best_transition[0] == 'right_arc':
#                 s1=stack[-1]
#                 s2 = stack[-2]

#                 stack.pop()
                
#                 tree[location.index(s2) - 1] = {
#                     'deprel': best_transition[1],
#                     'head': (location.index(s2), location.index(s1))
#                 }

#         return tree



In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Parser(nn.Module):
    def __init__(self, embedding_size, hidden_size, num_actions):
        super(Parser, self).__init__()
        # Assuming FeatureExtractor and ScoringFunction are properly defined to handle batched input
        self.feature_extractor = FeatureExtractor(embedding_size, words_embedding, tags_embedding)
        self.scoring_function = ScoringFunction(embedding_size * 12, hidden_size, num_actions)

    # def forward(self, X):
    #     # Assuming X is already batched
    #     feature_tensors = [self.feature_extractor.extract_features(*x)[0] for x in X]  # Extract features for each instance
    #     feature_tensors = torch.stack(feature_tensors)  # Convert list of tensors to a single tensor
    #     scores = self.scoring_function(feature_tensors)
    #     return scores
    
    def forward(self, X):

        scores_list = []
        for instance in X:
            print(instance)
                # Compute scores using the scoring function for the current instance
            cur_scores = self.scoring_function(instance)
                
                # Append the current scores to the list
            scores_list.append(cur_scores)
            
            # Stack the list of scores along a new dimension
        scores = torch.stack(scores_list)
        print(scores)
        return scores

    def compute_loss(self, predicted_scores, y):
        # Compute cross-entropy loss between predicted scores and ground truth y
        loss = F.cross_entropy(predicted_scores, y)
        return loss

    def parse_sentence(self, sentence):
        stack = [('<ROOT>', '<ROOT>')]  # Stack initially contains ROOT
        buffer = list(reversed(sentence))  # Buffer contains the sentence in reverse order
        tree = [{} for _ in sentence]  # Dependency tree initialization
        
        location=list(sentence)
        location.insert(0,('<ROOT>', '<ROOT>'))

        while len(buffer) > 0 or len(stack) > 1:
            features, _ = self.feature_extractor.extract_features(stack, buffer)
            feature_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)  # Add batch dimension
            scores = self.scoring_function(feature_tensor)
            _, predicted_action = scores.max(dim=1)  # Get the index of the max score

            action = action_vocab[predicted_action.item()]  # Get action from vocab using index
            print("action",action)
            if action == 'shift' and len(buffer) > 0:
                stack.append(buffer.pop())

            elif action == 'left_arc' and len(stack) > 1:
                s1=stack[-1]
                s2 = stack[-2] 

                stack.pop(-2)

                tree[location.index(s1) ] = {
                    'deprel': action[1],
                    'head': (location.index(s1), location.index(s2))
                }

               

            elif action == 'right_arc' and len(stack) > 1:
                s1=stack[-1]
                s2 = stack[-2]

                stack.pop()
                
                tree[location.index(s2) ] = {
                    'deprel': action[1],
                    'head': (location.index(s2), location.index(s1))
                }

        return tree

# Example usage
# Note: Define words_embedding, tags_embedding, action_vocab, and feature_extractor before using this class.


In [12]:
# Step 1: Preprocess the Trees




import torch.optim as optim



def process(dep_trees: List[DependencyTree], word_vocab: dict, pos_vocab: dict, action_vocab, words_embedding, tags_embedding) -> List[Tuple[torch.Tensor, torch.Tensor]]:
    tensor_data = []
    embedding_size=50
    for tree in dep_trees:
        instances = get_training_instances(tree)
        for state, action in instances:
            if action==('done', None):
                continue
            feature_extractor = FeatureExtractor(embedding_size, words_embedding, tags_embedding)
            feature_vector, _ = state_to_feature_vector(state, feature_extractor)

           
            action_tensor = torch.tensor(action_vocab.index(action), dtype=torch.long)

            tensor_data.append({'X': feature_vector, 'y': action_tensor})

    return tensor_data

import numpy as np

def state_to_feature_vector(state, feature_extractor):
    stack = state.stack_words
    buffer = state.stack_words
   
    
    features, feature_words = feature_extractor.extract_features(stack, buffer)

    # Convert features to a torch tensor
    feature_tensor = torch.tensor(features, dtype=torch.float32)

    return feature_tensor, feature_words




def train_model(model, train_loader, dev_loader, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        total_loss = 0.0
        for batch in train_loader:
            X_batch, y_batch = batch['X'], batch['y']
            optimizer.zero_grad()  # Zero the gradients
            predicted_scores = model(X_batch)
            loss = model.compute_loss(predicted_scores, y_batch)
            loss.backward()  # Backpropagation
            optimizer.step()  # Update model parameters
            total_loss += loss.item() * X_batch.size(0)
        train_loss = total_loss / len(train_loader.dataset)
        
        # Validate the model
        model.eval()  # Set model to evaluation mode
        total_loss = 0.0
        for batch in dev_loader:
            X_batch, y_batch = batch['X'], batch['y']
            predicted_scores = model(X_batch)
            loss = model.compute_loss(predicted_scores, y_batch)
            total_loss += loss.item() * X_batch.size(0)
        dev_loss = total_loss / len(dev_loader.dataset)
        
        print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Dev Loss: {dev_loss:.4f}')

# Example usage



train_data = process(train_trees,words_set,tags_set,action_vocab,words_embedding, tags_embedding)
print("train_data process finish")
dev_data = process(dev_trees,words_set,tags_set,action_vocab,words_embedding, tags_embedding)
print("dev_data process finish")
# test_data = process(test_trees,words_set,tags_set,action_vocab,words_embedding, tags_embedding)
# print("test_data process finish")





train_data process finish
dev_data process finish


In [62]:
# import torch
# import torch.optim as optim
# import torch.nn.functional as F
# from torch.utils.data import DataLoader, Dataset

# # class DependencyDataset(Dataset):
# #     def __init__(self, dep_trees, word_vocab, pos_vocab, action_vocab, feature_extractor):
# #         self.data = []
# #         for tree in dep_trees:
# #             instances = get_training_instances(tree)
# #             for state, action in instances:
# #                 if action == ('done', None):
# #                     continue
# #                 feature_vector, _ = state_to_feature_vector(state, feature_extractor)
# #                 action_index = action_vocab.index(action)  # Assuming action_vocab is a list
# #                 self.data.append((feature_vector, action_index))
                
# #     def __len__(self):
# #         return len(self.data)
    
# #     def __getitem__(self, idx):
# #         return self.data[idx]

# def process(trees: List[DependencyTree], word_vocab: Dict[str, int], pos_vocab: Dict[str, int], action_vocab: List[str], words_embedding, tags_embedding) -> List[Dict[str, torch.Tensor]]:
#     tensor_data = []
#     embedding_size = 50  # Define embedding size as per your embedding dimensions
#     feature_extractor = FeatureExtractor(embedding_size, words_embedding, tags_embedding)

#     for tree in trees:
#         instances = get_training_instances(tree)
#         for state, action in instances:
#             if action == ('done', None):
#                 continue  # Skip processing for 'done' actions

#             feature_vector, _ = state_to_feature_vector(state, feature_extractor)
#             if action in action_vocab:
#                 action_index = action_vocab.index(action)
#                 action_tensor = torch.tensor(action_index, dtype=torch.long)
#                 tensor_data.append({'X': feature_vector, 'y': action_tensor})
#             else:
#                 print(f"Warning: Action {action} not found in action vocabulary.")

#     return tensor_data

# def state_to_feature_vector(state, feature_extractor):
#     stack = state.stack_words
#     buffer = state.buffer_words  # Corrected from stack_words to buffer_words
    
#     features, feature_words = feature_extractor.extract_features(stack, buffer)
#     feature_tensor = torch.tensor(features, dtype=torch.float32)
#     return feature_tensor, feature_words

# def train_model(model, train_loader, dev_loader, optimizer, num_epochs):
#     for epoch in range(num_epochs):
#         model.train()
#         total_train_loss = 0.0
#         for X_batch, y_batch in train_loader:
#             optimizer.zero_grad()
#             predicted_scores = model(X_batch)
#             loss = F.cross_entropy(predicted_scores, y_batch)
#             loss.backward()
#             optimizer.step()
#             total_train_loss += loss.item() * len(X_batch)
        
#         train_loss = total_train_loss / len(train_loader.dataset)
#         dev_loss = evaluate(model, dev_loader)

#         print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Dev Loss: {dev_loss:.4f}')

# def evaluate(model, data_loader):
#     model.eval()
#     total_loss = 0.0
#     with torch.no_grad():
#         for X_batch, y_batch in data_loader:
#             predicted_scores = model(X_batch)
#             loss = F.cross_entropy(predicted_scores, y_batch)
#             total_loss += loss.item() * len(X_batch)
#     return total_loss / len(data_loader.dataset)

# # Instantiate the feature extractor only once
# feature_extractor = FeatureExtractor(DIMENSION, words_embedding, tags_embedding)

# # Create the datasets
# train_dataset = process(train_trees,words_set,tags_set,action_vocab,words_embedding, tags_embedding)
# dev_dataset = process(dev_trees,words_set,tags_set,action_vocab,words_embedding, tags_embedding)

# # train_dataset = DependencyDataset(train_trees, words_set, tags_set, action_vocab, feature_extractor)
# # dev_dataset = DependencyDataset(dev_trees, words_set, tags_set, action_vocab, feature_extractor)

# # Create DataLoaders
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False)




In [13]:
from torch.utils.data import DataLoader, Dataset

learning_rate=0.0001
batch_size=32
num_epochs=1
hidden_size = 200
num_actions = len(action_vocab)
model = Parser(DIMENSION, hidden_size, num_actions)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=32, shuffle=False)
train_model(model, train_loader, dev_loader, optimizer, num_epochs)


# # Initialize model and optimizer
# model = Parser(DIMENSION, hidden_size, len(action_vocab))
# optimizer = optim.Adam(model.parameters())
# num_epochs=1
# # Train the model
# train_model(model, train_loader, dev_loader, optimizer, num_epochs)


tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 1., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([1., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([1., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([1., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0., 0., 0.,  ...,

In [14]:


# Example sentence
sentence = [('The','DT'), ('equity','NN'), ('market','NN'), ('was','VBD'), ('illiquid','JJ'), ('.','.')]

# Assuming 'model' is an instance of your Parser class

parsed_tree = model.parse_sentence(sentence)
print("Parsed Dependency Tree:", parsed_tree)


action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('shift', None)
action ('sh

KeyboardInterrupt: 

In [38]:
test_trees_modified=[]
for tree in test_trees:
    # instances = get_training_instances(tree)
    tempt=[]
    for i in tree.deprels:
        tempt.append(({'head':(i,tree.deprels[i].head),'deprel':tree.deprels[i].deprel}))
       
    test_trees_modified.append(tempt)
       
pprint(test_trees_modified)
 

[[{'deprel': 'discourse', 'head': (1, 7)},
  {'deprel': 'punct', 'head': (2, 7)},
  {'deprel': 'nsubj', 'head': (3, 7)},
  {'deprel': 'cop', 'head': (4, 7)},
  {'deprel': 'neg', 'head': (5, 7)},
  {'deprel': 'compound', 'head': (6, 7)},
  {'deprel': 'root', 'head': (7, 0)},
  {'deprel': 'punct', 'head': (8, 7)}],
 [{'deprel': 'cc', 'head': (1, 33)},
  {'deprel': 'mark', 'head': (2, 10)},
  {'deprel': 'det', 'head': (3, 7)},
  {'deprel': 'compound', 'head': (4, 7)},
  {'deprel': 'compound', 'head': (5, 7)},
  {'deprel': 'compound', 'head': (6, 7)},
  {'deprel': 'nsubj', 'head': (7, 10)},
  {'deprel': 'aux', 'head': (8, 10)},
  {'deprel': 'neg', 'head': (9, 10)},
  {'deprel': 'advcl', 'head': (10, 33)},
  {'deprel': 'advmod', 'head': (11, 10)},
  {'deprel': 'nmod:tmod', 'head': (12, 10)},
  {'deprel': 'mark', 'head': (13, 19)},
  {'deprel': 'det', 'head': (14, 18)},
  {'deprel': 'compound', 'head': (15, 18)},
  {'deprel': 'compound', 'head': (16, 18)},
  {'deprel': 'compound', 'head': (1

In [39]:
predicted_trees = []
test_sentences=[]



for tree in test_trees:
    sentence=[]
    for i in range(1,len(tree.words())):
        sentence.append((tree.words()[i],tree.pos()[i]))
        # words_set.extend(tree.words())
        # tags_set.extend(tree.pos())
    # print(sentence)
    test_sentences.append(sentence)

for processed_instance in test_sentences:
    # Depending on how your process function works, you may need to extract the sentence
    # or the necessary input format that your parse_sentence function expects.
    # sentence = processed_instance['words']  # or however you access the words in your processed_instance
    parsed_tree = model.parse_sentence(processed_instance)
    predicted_trees.append(parsed_tree)

pprint(test_sentences[:1])
print("----")
pprint(test_trees_modified[:1])
print("----")
pprint(predicted_trees[:1])

buffer 8
stack 1
('shift', None)
buffer 7
stack 2
('shift', None)
buffer 6
stack 3
('shift', None)
buffer 5
stack 4
('shift', None)
buffer 4
stack 5
('shift', None)
buffer 3
stack 6
('shift', None)
buffer 2
stack 7
('shift', None)
buffer 1
stack 8
('shift', None)
buffer 0
stack 9
('right_arc', 'punct')
buffer 0
stack 8
('left_arc', 'case')
buffer 0
stack 7
('left_arc', 'case')
buffer 0
stack 6
('left_arc', 'case')
buffer 0
stack 5
('left_arc', 'case')
buffer 0
stack 4
('left_arc', 'case')
buffer 0
stack 3
('right_arc', 'nmod')
buffer 0
stack 2
('left_arc', 'case')
buffer 40
stack 1
('shift', None)
buffer 39
stack 2
('shift', None)
buffer 38
stack 3
('shift', None)
buffer 37
stack 4
('shift', None)
buffer 36
stack 5
('shift', None)
buffer 35
stack 6
('shift', None)
buffer 34
stack 7
('shift', None)
buffer 33
stack 8
('shift', None)
buffer 32
stack 9
('shift', None)
buffer 31
stack 10
('shift', None)
buffer 30
stack 11
('shift', None)
buffer 29
stack 12
('shift', None)
buffer 28
stack 13

In [40]:
def calculate_uas_las(gold_tree, predicted_tree):
    correct_heads = 0
    correct_labels = 0
    total = 0
    
    for gold_token, predicted_token in zip(gold_tree, predicted_tree):
        if not predicted_token:
            continue
        
        if gold_token['head'] == predicted_token['head']:
            correct_heads += 1
            if gold_token['deprel'] == predicted_token['deprel']:
                correct_labels += 1
        total += 1
    
    return correct_heads, correct_labels, total

total_correct_heads = 0
total_correct_labels = 0
total_tokens = 0

for gold_tree, predicted_tree in zip(test_trees_modified, predicted_trees):
    print("#########")
    print("gold_tree",gold_tree)
    print("predicted_tree",predicted_tree)
    print("#########")
    correct_heads, correct_labels, tokens = calculate_uas_las(gold_tree, predicted_tree)
    total_correct_heads += correct_heads
    total_correct_labels += correct_labels
    total_tokens += tokens

UAS_score = total_correct_heads / total_tokens
LAS_score = total_correct_labels / total_tokens

print(f"UAS (Unlabeled Attachment Score): {UAS_score:.4f}")
print(f"LAS (Labeled Attachment Score): {LAS_score:.4f}")


#########
gold_tree [{'head': (1, 7), 'deprel': 'discourse'}, {'head': (2, 7), 'deprel': 'punct'}, {'head': (3, 7), 'deprel': 'nsubj'}, {'head': (4, 7), 'deprel': 'cop'}, {'head': (5, 7), 'deprel': 'neg'}, {'head': (6, 7), 'deprel': 'compound'}, {'head': (7, 0), 'deprel': 'root'}, {'head': (8, 7), 'deprel': 'punct'}]
predicted_tree [{'deprel': 'case', 'head': (1, 0)}, {}, {}, {}, {}, {}, {'deprel': 'case', 'head': (7, 2)}, {}]
#########
#########
gold_tree [{'head': (1, 33), 'deprel': 'cc'}, {'head': (2, 10), 'deprel': 'mark'}, {'head': (3, 7), 'deprel': 'det'}, {'head': (4, 7), 'deprel': 'compound'}, {'head': (5, 7), 'deprel': 'compound'}, {'head': (6, 7), 'deprel': 'compound'}, {'head': (7, 10), 'deprel': 'nsubj'}, {'head': (8, 10), 'deprel': 'aux'}, {'head': (9, 10), 'deprel': 'neg'}, {'head': (10, 33), 'deprel': 'advcl'}, {'head': (11, 10), 'deprel': 'advmod'}, {'head': (12, 10), 'deprel': 'nmod:tmod'}, {'head': (13, 19), 'deprel': 'mark'}, {'head': (14, 18), 'deprel': 'det'}, {'he

In [22]:
# Assuming 'model' is your instance of Parser that has been trained
model_path = "parser_model.pth"
torch.save(model.state_dict(), model_path)


In [11]:
# Initialize the model again
model = Parser(50, hidden_size, num_actions)

# Load the state dictionary
model_path = "parser_model.pth"
model.load_state_dict(torch.load(model_path))

# Make sure to call eval() for inference to set dropout and batch normalization layers to evaluation mode
# loaded_model.eval()


<All keys matched successfully>

### 2. Build the Model

In [None]:
class Parser(nn.Module):
    def __init__(self):
        pass

    def forward(self):
        pass

    def parse_sentence(self, sentence):
        pass

### 3. Train and Evaluate