## CS310 Natural Language Processing
## Assignment 4. Dependency Parsing

**Total points**: 50

In this assignment, you will train a feed-forward neural network-based dependency parser and evaluate its performance on the provided treebank dataset.

### 0. Import Necessary Libraries

In [59]:
import torch.nn as nn

### 1. Read Data and Generate Training Instances

In [60]:
import copy
from collections import defaultdict
from dep_utils import conll_reader


class State(object):
    """Configuration of a transition-based parser: a stack, a buffer and arcs."""

    def __init__(self, sentence):
        # Tokens currently being processed; the last element is the stack top.
        self.stack = []
        # Remaining tokens in reverse order, so the next token sits at the end.
        self.buffer = list(reversed(sentence)) if sentence else []
        # Arcs built so far, stored as (head, dependent, label) triples.
        self.deps = set()

    def shift(self):
        """Move the next buffer token onto the stack; do nothing if it is empty."""
        ### START YOUR CODE ###
        if not self.buffer:
            return
        self.stack.append(self.buffer.pop())
        ### END YOUR CODE ###

    def left_arc(self, label: str):
        """Attach the second-from-top stack item as a dependent of the top item.

        The dependent is removed from the stack; the head stays on top.
        """
        assert len(self.stack) >= 2
        ### START YOUR CODE ###
        dependent, head = self.stack[-2:]
        del self.stack[-2]
        self.deps.add((head, dependent, label))
        ### END YOUR CODE ###

    def right_arc(self, label: str):
        """Attach the top stack item as a dependent of the item beneath it.

        The dependent is popped; the head becomes the new stack top.
        """
        assert len(self.stack) >= 2
        ### START YOUR CODE ###
        dependent = self.stack.pop()
        self.deps.add((self.stack[-1], dependent, label))
        ### END YOUR CODE ###

    def __repr__(self):
        parts = (self.stack, self.buffer, self.deps)
        return "({},{},{})".format(*parts)


class RootDummy(object):
    """Placeholder record for the artificial ROOT node (id 0, no head/label)."""

    def __init__(self):
        # ROOT has neither a head nor an incoming relation label.
        self.head = self.deprel = None
        self.id = 0

    def __repr__(self):
        return "<ROOT>"




# Re-use the code from Lab 7
def get_training_instances(dep_tree):
    """Derive the arc-standard oracle transition sequence for a gold tree.

    Args:
        dep_tree: Object whose ``deprels`` attribute maps token ids to
            relation records exposing ``id``, ``head`` and ``deprel``.

    Returns:
        A list of ``(state, (action, label))`` pairs. ``state`` is a deep
        copy of the parser state *before* the action is applied; ``action``
        is ``"shift"``, ``"left_arc"`` or ``"right_arc"``. The list always
        ends with a ``(final_state, ("done", None))`` entry.

    Raises:
        ValueError: If the oracle gets stuck, i.e. no arc can be built and
            the buffer is empty (for example a non-projective or malformed
            tree). Without this check the loop would never terminate,
            because shifting from an empty buffer leaves the state unchanged.
    """
    deprels = dep_tree.deprels

    state = State(list(deprels.keys()))
    state.stack.append(0)  # ROOT

    # Number of dependents each head still has to collect. A token may only
    # be attached (and thereby removed from the stack) once this reaches 0.
    pending_children = defaultdict(int)
    for rel in deprels.values():
        pending_children[rel.head] += 1

    seq = []
    while state.buffer or len(state.stack) > 1:
        if state.stack[-1] == 0:
            # Only ROOT is available: nothing to attach yet, so shift.
            seq.append((copy.deepcopy(state), ("shift", None)))
            state.shift()
            continue

        stack_top1 = deprels[state.stack[-1]]
        if state.stack[-2] == 0:
            stack_top2 = RootDummy()
        else:
            stack_top2 = deprels[state.stack[-2]]

        if (stack_top2.head == stack_top1.id
                and pending_children[stack_top2.id] == 0):
            # Left-Arc: second-from-top is a finished dependent of the top.
            relation = stack_top2.deprel
            seq.append((copy.deepcopy(state), ("left_arc", relation)))
            state.left_arc(relation)
            # Only the head's counter matters; the dependent has left the
            # stack and is never inspected again.
            pending_children[stack_top1.id] -= 1
        elif (stack_top1.head == stack_top2.id
                and pending_children[stack_top1.id] == 0):
            # Right-Arc: top is a finished dependent of second-from-top.
            relation = stack_top1.deprel
            seq.append((copy.deepcopy(state), ("right_arc", relation)))
            state.right_arc(relation)
            pending_children[stack_top2.id] -= 1
        else:
            if not state.buffer:
                raise ValueError(
                    "oracle is stuck: no arc applies and the buffer is empty "
                    "(tree may be non-projective or malformed)"
                )
            # Shift
            seq.append((copy.deepcopy(state), ("shift", None)))
            state.shift()

    seq.append((copy.deepcopy(state), ("done", None)))

    return seq


###  Implement the Feature Extractor

In [61]:
import torch
from typing import List
from dep_utils import DependencyTree


def _pad_to_three(items):
    """Return the last three ids of ``items``, right-padded with -1 to length 3."""
    if len(items) >= 3:
        return items[-3:]
    return items + [-1] * (3 - len(items))


def _lookup_vectors(token_ids, tree, word_vectors, pos_vectors):
    """Return flattened (word, POS) vector lists for the given token ids.

    Id -1 maps to the "<NULL>" vectors and id 0 to the "<ROOT>" vectors.
    Words or POS tags missing from the lookup tables fall back to "<NULL>".
    """
    word_feats = []
    pos_feats = []
    for token_id in token_ids:
        if token_id == -1:
            word_key = pos_key = "<NULL>"
        elif token_id == 0:
            word_key = pos_key = "<ROOT>"
        else:
            token = tree.deprels[token_id]
            word_key = token.word if token.word in word_vectors else "<NULL>"
            pos_key = token.pos if token.pos in pos_vectors else "<NULL>"
        word_feats.extend(word_vectors[word_key])
        pos_feats.extend(pos_vectors[pos_key])
    return word_feats, pos_feats


def _featurize_state(state, tree, word_vectors, pos_vectors):
    """Build the feature tensor for one parser state."""
    stack_words, stack_pos = _lookup_vectors(
        _pad_to_three(state.stack), tree, word_vectors, pos_vectors)
    buffer_words, buffer_pos = _lookup_vectors(
        _pad_to_three(state.buffer), tree, word_vectors, pos_vectors)
    # Layout: stack words, buffer words, stack POS tags, buffer POS tags.
    return torch.tensor(stack_words + buffer_words + stack_pos + buffer_pos)


def feature_extract(
        tree: DependencyTree,
        word_vectors: dict,
        pos_vectors: dict,):
    """Extract one (feature tensor, action) training pair from a gold tree.

    The oracle transition sequence is obtained from
    ``get_training_instances``; the features describe the state of the last
    transition before the terminal ``("done", None)`` marker, and the
    returned action is the one the oracle takes in that same state.

    NOTE(review): only the final transition of each tree is returned, so
    earlier transitions never reach the caller. Confirm whether a full
    per-transition dataset is intended.
    NOTE(review): short stacks/buffers are padded on the right, so the
    stack top does not always occupy the same feature slot; confirm.

    Args:
        tree: Gold tree; ``tree.deprels`` maps token ids to records that
            expose ``word`` and ``pos``.
        word_vectors: Maps a word (plus "<NULL>" and "<ROOT>") to a list of
            floats.
        pos_vectors: Maps a POS tag (plus "<NULL>" and "<ROOT>") to a list
            of floats.

    Returns:
        ``(data_vector, action)``: a 1-D tensor concatenating the vectors of
        the three stack and three buffer positions (words first, then POS
        tags), and the ``(action_name, label)`` tuple for that state.

    Raises:
        ValueError: If the tree produces no transition before "done".
    """
    last_transition = None
    for state, action in get_training_instances(tree):
        if action[0] == "done":
            break
        last_transition = (state, action)

    if last_transition is None:
        raise ValueError("tree yields no transitions; cannot extract features")

    # Unpack here rather than reading the loop variables after the loop:
    # after the `break` they refer to the terminal "done" entry.
    state, action = last_transition
    data_vector = _featurize_state(state, tree, word_vectors, pos_vectors)
    return data_vector, action


In [62]:
from collections import Counter
import random

def _read_treebank(header, path):
    """Print ``header``, read every tree from ``path`` and report the count."""
    print(header)
    with open(path) as f:
        trees = list(conll_reader(f))
    print(f"{len(trees)} trees read.")
    return trees


train_trees = _read_treebank("In train.conll:", "data/train.conll")
dev_trees = _read_treebank("In dev.conll:", "data/dev.conll")
test_trees = _read_treebank("In test.conll:", "data/test.conll")

# Frequency tables over the training split only.
relation_counter = Counter(
    item.deprel for tree in train_trees for item in tree.deprels.values()
)
word_counter = Counter(word for tree in train_trees for word in tree.words())
pos_counter = Counter(tag for tree in train_trees for tag in tree.pos())


In train.conll:
39832 trees read.
In dev.conll:
1700 trees read.
In test.conll:
2416 trees read.


In [69]:
import torch.nn as nn

# Two extra rows are reserved in each table for the <NULL> and <ROOT> symbols.
vocab_size = len(word_counter) + 2
relation_num = len(relation_counter)
pos_size = len(pos_counter) + 2

# Dimension of every word / POS embedding vector.
embedding_dim = 50

# Initialise both embedding tables uniformly, strictly inside (-0.1, 0.1).
word_embeddings = nn.Embedding(vocab_size, embedding_dim)
word_embeddings.weight.data.uniform_(-0.1 + 1e-5, 0.1 - 1e-5)
pos_embeddings = nn.Embedding(pos_size, embedding_dim)
pos_embeddings.weight.data.uniform_(-0.1 + 1e-5, 0.1 - 1e-5)

# Reserved rows: <NULL> is a constant -0.1 vector, <ROOT> a constant 0.1 one.
null_index = 0
root_index = 1
first_regular_index = 2
for embeddings in (word_embeddings, pos_embeddings):
    embeddings.weight.data[null_index] = torch.tensor(-0.1)
    embeddings.weight.data[root_index] = torch.tensor(0.1)

# Snapshot the tables as plain Python lists, one list of floats per row.
word_rows = word_embeddings.weight.data.tolist()
pos_rows = pos_embeddings.weight.data.tolist()

# Vocabulary ids are the embedding row indices of each symbol.
word_vocab = {"<NULL>": null_index, "<ROOT>": root_index}
word_vectors = {"<NULL>": word_rows[null_index], "<ROOT>": word_rows[root_index]}
pos_vocab = {"<NULL>": null_index, "<ROOT>": root_index}
pos_vectors = {"<NULL>": pos_rows[null_index], "<ROOT>": pos_rows[root_index]}

# Regular entries start after the reserved rows. Starting at 1 would make
# the most frequent word/tag share the <ROOT> row and leave the last row
# of each table unused.
for index, (word, _) in enumerate(word_counter.most_common(),
                                  start=first_regular_index):
    word_vocab[word] = index
    word_vectors[word] = word_rows[index]

for index, (pos, _) in enumerate(pos_counter.most_common(),
                                 start=first_regular_index):
    pos_vocab[pos] = index
    pos_vectors[pos] = pos_rows[index]

# Action ids: three fixed actions first, then a left/right arc pair for
# every non-root relation, ordered by relation frequency.
action_vocab = {}
action_vocab[("right_arc", "root")] = 0
action_vocab[("shift", None)] = 1
action_vocab[('done', None)] = 2
index = 3
for rel, _ in relation_counter.most_common():
    if rel == "root":
        continue
    action_vocab[("left_arc", rel)] = index
    index += 1
    action_vocab[("right_arc", rel)] = index
    index += 1


print(f"Word vocab size: {len(word_vocab)}")
print(f"POS vocab size: {len(pos_vocab)}")
print(f"Action vocab size: {len(action_vocab)}")

Word vocab size: 44392
POS vocab size: 48
Action vocab size: 79


In [70]:
# Smoke test: run the feature extractor on the first training tree only
# (if there is one) and display the resulting feature tensor and action.
for tree in train_trees[:1]:
    feature, action = feature_extract(tree, word_vectors, pos_vectors)
    print(feature)
    print(action)

tensor([ 0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,
         0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,
         0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,
         0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,
         0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,
         0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,  0.1000,
         0.1000,  0.1000,  0.0745, -0.0161, -0.0628, -0.0883,  0.0488, -0.0656,
         0.0749, -0.0987,  0.0494, -0.0593, -0.0508,  0.0150,  0.0687, -0.0580,
         0.0120,  0.0388, -0.0362,  0.0031, -0.0363,  0.0921,  0.0031, -0.0253,
         0.0389,  0.0870,  0.0698, -0.0687,  0.0085,  0.0377,  0.0865,  0.0039,
        -0.0469, -0.0062,  0.0829,  0.0998,  0.0346, -0.0943,  0.0204,  0.0665,
         0.0149, -0.0654, -0.0252, -0.0850,  0.0929, -0.0892,  0.0353, -0.0690,
         0.0504, -0.0381,  0.0367, -0.02

### Implement the scoring function

In [71]:
import torch.nn as nn

class ScoringFunction(nn.Module):
    """One-hidden-layer feed-forward network producing action probabilities.

    Args:
        input_size: Length of the input feature vector.
        hidden_size: Number of hidden units.
        output_size: Number of actions to score.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(ScoringFunction, self).__init__()

        self.hidden_layer = nn.Linear(input_size, hidden_size)
        self.activation = nn.ReLU()
        self.output_layer = nn.Linear(hidden_size, output_size)
        # Normalise over the last (action) dimension so that both a single
        # 1-D feature vector and a 2-D batch are accepted; dim=1 raised on
        # 1-D input. For 2-D input the result is unchanged.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, features):
        """Return a probability distribution over actions.

        Args:
            features: Tensor whose last dimension has size ``input_size``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            size ``output_size`` that sums to 1. Because the output is
            already normalised, pair it with a loss that expects
            (log-)probabilities rather than raw logits.
        """
        hidden = self.activation(self.hidden_layer(features))
        return self.softmax(self.output_layer(hidden))


### 2. Build the Model

In [72]:
import torch
import torch.nn as nn

class Parser(nn.Module):
    """Wrap a scoring network and feed it features extracted from a tree."""

    def __init__(self,  scoring_function):
        """Store ``scoring_function``, the module used to score features."""
        super().__init__()
        self.scoring_function = scoring_function

    def forward(self, tree):
        """Extract features for ``tree`` and return the scoring output.

        Uses the module-level ``word_vectors`` and ``pos_vectors`` tables.
        The action returned by ``feature_extract`` is not used here.
        """

        feature,action=feature_extract(tree,word_vectors,pos_vectors)

        scores = self.scoring_function(feature)

        return scores


    def parse_sentence(self, sentence):
        """Start a parse of ``sentence`` from an initial state with ROOT (0).

        NOTE(review): this method appears unfinished. The loop below only
        computes the padded ``stack``/``buffer`` windows; it never scores
        them, never applies a transition to ``state`` and returns nothing.
        Since ``state`` is not modified, the loop cannot terminate once its
        condition is true (i.e. for any non-empty ``sentence``).
        """
        state = State(sentence)
        state.stack.append(0)
        while len(state.buffer) > 0 or len(state.stack) > 1:
            # Take the last three stack ids, right-padding with -1 if fewer.
            if len(state.stack) >= 3:
                stack = state.stack[-3:]
            else:
                stack= state.stack + [-1] * (3 - len(state.stack))
            # Same windowing for the buffer.
            if len(state.buffer) >= 3:
                buffer = state.buffer[-3:]
            else:
                buffer= state.buffer + [-1] * (3 - len(state.buffer))
            # TODO: score the features and apply the chosen transition here.






### 3. Train and Evaluate

In [None]:
def _build_dataset(trees):
    """Featurise ``trees`` into a ``(features, tags)`` pair of tensors.

    Trees whose action is not in ``action_vocab`` are skipped, since they
    cannot be mapped to an output class.
    """
    features = []
    tags = []
    for tree in trees:
        feature, action = feature_extract(tree, word_vectors, pos_vectors)
        tag = action_vocab.get(action)
        if tag is None:
            continue
        features.append(feature)
        tags.append(tag)
    return torch.stack(features), torch.tensor(tags)


# Features and gold labels are kept in separate, aligned tensors, and the
# dev split gets its own labels instead of re-using the training ones.
train_data, train_tags = _build_dataset(train_trees)
dev_data, dev_tags = _build_dataset(dev_trees)

# Derive the input width from the data rather than hard-coding it.
input_size = train_data.shape[1]
hidden_size = 200
action_size = len(action_vocab)
mlp = ScoringFunction(input_size, hidden_size, action_size)

model = Parser(mlp)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# ScoringFunction.forward already applies a softmax, so the loss is the
# negative log-likelihood of those probabilities. CrossEntropyLoss would
# apply a second softmax on top of them.
loss_fn = nn.NLLLoss()
epochs = 1
steps = 1000


def _nll(features, tags):
    """Return the mean NLL of ``tags`` for a 2-D batch of ``features``."""
    # Score the precomputed features directly: Parser.forward would
    # re-extract them from the tree on every call.
    probs = model.scoring_function(features)
    # Clamp before the log so a zero probability cannot yield -inf.
    return loss_fn(torch.log(probs.clamp_min(1e-12)), tags)


for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for step, (feature, tag) in enumerate(zip(train_data, train_tags), start=1):
        optimizer.zero_grad()
        # unsqueeze(0) turns one example into a batch of size 1.
        loss = _nll(feature.unsqueeze(0), tag.unsqueeze(0))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if step % steps == 0:
            # Average over exactly the `steps` examples seen since the last print.
            print(f"Epoch {epoch + 1} Train Loss: {total_loss / steps}")
            total_loss = 0.0

    model.eval()
    with torch.no_grad():
        dev_loss = _nll(dev_data, dev_tags).item()
    print(f"Epoch {epoch + 1} Dev Loss: {dev_loss}")

model_path = "parser.pth"
torch.save(model.state_dict(), model_path)