In [1]:
import numpy as np

In [2]:
train_path_en = "Treebanks/english/train/wsj_train.only-projective.first-1k.conll06"

In [3]:
train_path_en.split('/')[-1]

'wsj_train.only-projective.first-1k.conll06'

In [4]:
class Token:
    
    def __init__(self):
        self.id = None
        self.form = None
        self.lemma = None
        self.pos = None
        self.xpos = None
        self.morph = None
        self.head = None
        self.deprel = None
        self.x = None
        self.y = None

In [5]:
class Sentence:
    
    def __init__(self, token_items):
        # create ROOT token
        root = Token()
        root.id = '0'
        root.form = 'ROOT'
        root.lemma = '_'
        root.pos = 'ROOT'
        root.xpos = '_'
        root.morph = '_'
        root.head = '_'
        root.deprel = '_'
        root.x = '_'
        root.y = '_'
        
        # initialize tokens with ROOT
        self.tokens = [root]
        
        # add each token in token_items to sentence
        for token in token_items:
            self.tokens.append(token)

In [6]:
class Reader:
    
    def __init__(self, filepath):
        self.filepath = filepath
        self.sentences = []
    
    def read_file(self):
        f = open(self.filepath)
        sentences = []
    
        # init token_items
        token_items = []
    
        for line in f:
            # init current sentence and token
            token = Token()
        
            # if not at end of sentence
            if line != '\n':
                items = line.split('\t')
                #print(items)
            
                # add token data
                token.id = items[0]
                token.form = items[1]
                token.lemma = items[2]
                token.pos = items[3]
                token.xpos = items[4]
                token.morph = items[5]
                token.head = items[6]
                token.deprel = items[7]
                token.x = items[8]
                token.y = items[9]
            
                # add Token to Sentence
                token_items.append(token)
        
            # add sentence and reset token_items
            else:
                sentences.append(Sentence(token_items))
                token_items = []
                
        f.close()
        self.sentences = sentences

In [7]:
class Writer:
    
    def __init__(self, filepath, sentences):
        self.filepath = filepath
        self.sentences = sentences
        
    def write_file(self):
        target_filename = self.filepath.split('/')[-1]
        with open(target_filename+'.pred', 'w') as f:
            for sentence in self.sentences:
                for token in sentence.tokens:
                    if token.form == 'ROOT':
                        continue
                    else:
                        line = ""
                        line+=token.id+'\t'
                        line+=token.form+'\t'
                        line+=token.lemma+'\t'
                        line+=token.pos+'\t'
                        line+=token.xpos+'\t'
                        line+=token.morph+'\t'
                        line+=token.head+'\t'
                        line+=token.deprel+'\t'
                        line+=token.x+'\t'
                        line+=token.y
                    
                        f.write(str(line))
                f.write('\n')
            f.close()

In [8]:
train_1k_sentences = Reader(train_path_en)
train_1k_sentences.read_file()
train_1k_sentences.sentences

[<__main__.Sentence at 0x1116cddf0>,
 <__main__.Sentence at 0x1116cf2b0>,
 <__main__.Sentence at 0x1116cf5e0>,
 <__main__.Sentence at 0x1116cf9a0>,
 <__main__.Sentence at 0x1116cfbb0>,
 <__main__.Sentence at 0x1107dffa0>,
 <__main__.Sentence at 0x1107df7c0>,
 <__main__.Sentence at 0x1107dfdc0>,
 <__main__.Sentence at 0x107728940>,
 <__main__.Sentence at 0x1116e9430>,
 <__main__.Sentence at 0x1116e9b50>,
 <__main__.Sentence at 0x1116f4310>,
 <__main__.Sentence at 0x1116f46d0>,
 <__main__.Sentence at 0x1116f4fa0>,
 <__main__.Sentence at 0x1117002e0>,
 <__main__.Sentence at 0x111700910>,
 <__main__.Sentence at 0x111700d00>,
 <__main__.Sentence at 0x11170b670>,
 <__main__.Sentence at 0x11170bd90>,
 <__main__.Sentence at 0x111716130>,
 <__main__.Sentence at 0x111716370>,
 <__main__.Sentence at 0x111716d00>,
 <__main__.Sentence at 0x111721130>,
 <__main__.Sentence at 0x1117213a0>,
 <__main__.Sentence at 0x111721910>,
 <__main__.Sentence at 0x111721d30>,
 <__main__.Sentence at 0x11172c220>,
 

In [None]:
from difflib import Differ
 
with open(train_path_en) as file_1, open(train_path_en.split('/')[-1]+'.pred') as file_2:
    differ = Differ()
 
    for line in differ.compare(file_1.readlines(), file_2.readlines()):
        print(line)

# Evaluation

# Decoder (Eisner's Algorithm)

In [32]:
test_matrix = np.array([[-10000, 9, 10, 9], 
               [-10000, -10000, 20, 3], 
               [-10000, 30, -10000, 30],
               [-10000, 11, 0, -10000]])

In [33]:
# Some constants
L, R = 0, 1
I, C = 0, 1
DIRECTIONS = (L, R)
COMPLETENESS = (I, C)
NEG_INF = -float('inf')


class Span(object):
    def __init__(self, left_idx, right_idx, head_side, complete):
        self.data = (left_idx, right_idx, head_side, complete)

    @property
    def left_idx(self):
        return self.data[0]

    @property
    def right_idx(self):
        return self.data[1]

    @property
    def head_side(self):
        return self.data[2]

    @property
    def complete(self):
        return self.data[3]

    def __str__(self):
        return "({}, {}, {}, {})".format(
            self.left_idx,
            self.right_idx,
            "L" if self.head_side == L else "R",
            "C" if self.complete == C else "I",
        )

    def __repr__(self):
        return self.__str__()

    def __hash__(self):
        return hash(self.data)

    def __eq__(self, other):
        return isinstance(other, Span) and hash(other) == hash(self)

In [50]:
def eisner(weight):
    """
    `N` denotes the length of sentence.

    :param weight: size N x N
    :return: the projective tree with maximum score
    """
    N = weight.shape[0]

    btp = {}  # Back-track pointer
    dp_s = {}

    # Init
    for i in range(N):
        for j in range(i + 1, N):
            for dir in DIRECTIONS:
                for comp in COMPLETENESS:
                    dp_s[Span(i, j, dir, comp)] = NEG_INF
    print(dp_s)

    # base case
    for i in range(N):
        for dir in DIRECTIONS:
            dp_s[Span(i, i, dir, C)] = 0.
            btp[Span(i, i, dir, C)] = None

    rules = [
        # span_shape_tuple := (span_direction, span_completeness),
        # rule := (span_shape, (left_subspan_shape, right_subspan_shape))
        ((L, I), ((R, C), (L, C))),
        ((R, I), ((R, C), (L, C))),
        ((L, C), ((L, C), (L, I))),
        ((R, C), ((R, I), (R, C))),
    ]

    for size in range(1, N):
        for i in range(0, N - size):
            j = i + size
            for rule in rules:
                ((dir, comp), ((l_dir, l_comp), (r_dir, r_comp))) = rule

                if comp == I:
                    edge_w = weight[i, j] if (dir == R) else weight[j, i]
                    k_start, k_end = (i, j)
                    offset = 1
                else:
                    edge_w = 0.
                    k_start, k_end = (i + 1, j + 1) if dir == R else (i, j)
                    offset = 0

                span = Span(i, j, dir, comp)
                for k in range(k_start, k_end):
                    l_span = Span(i, k, l_dir, l_comp)
                    r_span = Span(k + offset, j, r_dir, r_comp)
                    s = edge_w + dp_s[l_span] + dp_s[r_span]
                    if s > dp_s[span]:
                        dp_s[span] = s
                        btp[span] = (l_span, r_span)

    # recover tree
    return back_track(btp, Span(0, N - 1, R, C), set())


def back_track(btp, span, edge_set):
    if span.complete == I:
        if span.head_side == L:
            edge = (span.right_idx, span.left_idx)
        else:
            edge = (span.left_idx, span.right_idx)
        edge_set.add(edge)

    if btp[span] is not None:
        l_span, r_span = btp[span]

        back_track(btp, l_span, edge_set)
        back_track(btp, r_span, edge_set)
    else:
        return

    return edge_set

In [51]:
eisner(test_matrix)

{(0, 1, L, I): -inf, (0, 1, L, C): -inf, (0, 1, R, I): -inf, (0, 1, R, C): -inf, (0, 2, L, I): -inf, (0, 2, L, C): -inf, (0, 2, R, I): -inf, (0, 2, R, C): -inf, (0, 3, L, I): -inf, (0, 3, L, C): -inf, (0, 3, R, I): -inf, (0, 3, R, C): -inf, (1, 2, L, I): -inf, (1, 2, L, C): -inf, (1, 2, R, I): -inf, (1, 2, R, C): -inf, (1, 3, L, I): -inf, (1, 3, L, C): -inf, (1, 3, R, I): -inf, (1, 3, R, C): -inf, (2, 3, L, I): -inf, (2, 3, L, C): -inf, (2, 3, R, I): -inf, (2, 3, R, C): -inf}


{(0, 2), (2, 1), (2, 3)}

In [47]:
test_list = []
test_list.append((2, 1))
test_list

[(2, 1)]

In [78]:
tree = []

def Eisner(edge_scores):
    # Get length of sentence
    #n = len(sentence.tokens)
    n=4
    
    # Initialize matrices with zeros
    O_r = np.zeros([n, n], dtype=int)
    O_l = np.zeros([n, n], dtype=int)
    C_r = np.zeros([n, n], dtype=int)
    C_l = np.zeros([n, n], dtype=int)
    
    # Initialize backtracking matrices
    b_table = [[None]*n]*n
    
    for m in np.arange(1, n):
        for s in np.arange(0, n-m):
            t = s+m
            
            # O_r
            q=s
            max_q_score = -10000; max_q = q
            while q < t:
                curr_q_score = C_l[s][q] + C_r[q+1][t] + edge_scores[t][s]
                #print("O_r step: " + str((C_l[s][q], C_r[q+1][t], edge_scores[t][s])))
                if curr_q_score > max_q_score:
                    max_q_score = curr_q_score; max_q = q
                q += 1
            #print("O_r max_q: " + str(max_q_score))
            O_r[s][t] = max_q_score; b_table[s][t] = (max_q, "Open", "R")
            
            # O_l
            q=s
            max_q_score = -10000; max_q = q
            while q < t:
                curr_q_score = C_l[s][q] + C_r[q+1][t] + edge_scores[s][t]
                #print("O_l step: " + str((C_l[s][q], C_r[q+1][t], edge_scores[s][t])))
                if curr_q_score > max_q_score:
                    max_q_score = curr_q_score; max_q = q
                q += 1
            #print("O_l max_q: " + str(max_q_score))
            O_l[s][t] = max_q_score; b_table[s][t] = (max_q, "Open", "L")
            
            # C_r
            q=s
            max_q_score = -10000; max_q = q
            while q < t:
                curr_q_score = C_r[s][q] + O_r[q][t]
                #print("C_r step: " + str((C_r[s][q], O_r[q][t])))
                if curr_q_score > max_q_score:
                    max_q_score = curr_q_score; max_q = q
                q += 1
            #print("C_r max_q: " + str(max_q_score))
            C_r[s][t] = max_q_score; b_table[s][t] = (max_q, "Closed", "R")
                
            # C_l
            q=s+1
            max_q_score = -10000; max_q = q
            while q <= t:
                curr_q_score = O_l[s][q] + C_l[q][t]
                #print("C_l step: " + str((O_l[s][q], C_l[q][t])))
                if curr_q_score > max_q_score:
                    max_q_score = curr_q_score; max_q = q
                q += 1
            #print("C_l max_q: " + str(max_q_score))
            C_l[s][t] = max_q_score; b_table[s][t] = (max_q, "Closed", "L")
            
            #print("s: " + str(s))
            #print("")

    print(O_r)
    print(O_l)
    print(C_r)
    print(C_l)
    print("")
    
    for line in b_table:
        print(line)
    
    d_index = np.argmax(C_l[0])
    tree.append((0, d_index))
    #return C_l
    #return backtrack(b_Or, b_Ol, b_Cr, b_Cl, 0, d_index, n)
    return C_l[0][n-1]

def backtrack(C_r, C_l, b_Cr, b_Cl, h_index, d_index, n):
    if h_index > d_index:
        direction = "L"
    else:
        direction = "R"
    for n in np.arange(0, n):
        if direction == "L":
            next_index = np.argmax(C_l[h_index])
            tree.append((h_index, next_index))
            h_index = b_Cl[h_index][next_index]
        else:
            next_index = np.argmax(C_r[h_index])
            h_index = b_Cr[h_index][next_index]

In [79]:
Eisner(test_matrix)

[[     0 -10000  -9970  -9960]
 [     0      0     30     31]
 [     0      0      0      0]
 [     0      0      0      0]]
[[ 0  9 40 49]
 [ 0  0 20 23]
 [ 0  0  0 30]
 [ 0  0  0  0]]
[[     0 -10000  -9970  -9960]
 [     0      0     30     31]
 [     0      0      0      0]
 [     0      0      0      0]]
[[ 0  9 40 70]
 [ 0  0 20 50]
 [ 0  0  0 30]
 [ 0  0  0  0]]

[None, (1, 'Closed', 'L'), (2, 'Closed', 'L'), (2, 'Closed', 'L')]
[None, (1, 'Closed', 'L'), (2, 'Closed', 'L'), (2, 'Closed', 'L')]
[None, (1, 'Closed', 'L'), (2, 'Closed', 'L'), (2, 'Closed', 'L')]
[None, (1, 'Closed', 'L'), (2, 'Closed', 'L'), (2, 'Closed', 'L')]


70

# Feature Templates, Extraction, and Mapping

In [52]:
### Feature templates

unigram_features = ["hform, hpos", "hform", "hpos", "dform, dpos", "dform", "dpos"]
bigram_features = ["hform, hpos, dform, dpos", 
                   "hpos, dform, dpos", 
                   "hform, dform, dpos", 
                   "hform, hpos, dform", 
                   "hform, hpos, dpos", 
                   "hform, dform", 
                   "hpos, dpos"]

# Combine features with direction of edge and distance between head and dependent

In [53]:
test_sentence = train_1k_sentences.sentences[0]
vars(test_sentence.tokens[1])

{'id': '1',
 'form': 'In',
 'lemma': 'in',
 'pos': 'IN',
 'xpos': '_',
 'morph': '_',
 'head': '43',
 'deprel': 'ADV',
 'x': '_',
 'y': '_\n'}

In [54]:
test_data = [test_sentence]
len(test_data[0].tokens)

50

In [55]:
getattr(test_sentence.tokens[0], "form")

'ROOT'

In [59]:
### Feature extraction
test_features = ["dform", "dpos", "dform, dpos"]

class FeatureMapping:
    
    '''
        IMPORTANT: m features for each n token
        AKA n*m feature vectors
        .map -> dictionary
            keys = feature_name, values = feature_vector_id
                ex. feature_name = 'hform=likes'
                    feature_vector_id = 1
    '''
    
    def __init__(self, features, sentences):
        self.sentences = sentences
        self.features = features
        self.feature_dict = {"dform": "form", "dpos": "pos", "dform, dpos": "form, pos"}
        self.map = {}
        self.vectors = {}
        self.frozen = False
    
    def arcs(self, sentence):
        '''
            arc_list = list of (head_int, token(dependency)_int) tuples
        '''
        arc_list = []
        for token in sentence.tokens:
            if token.form == "ROOT":
                continue
            else:
                arc_list.append((int(token.head), int(token.id)))
        return arc_list
    
    def create_mapping(self):
        i=0
        for sentence in self.sentences:
            sentence_arcs = self.arcs(sentence)
            #print(len(sentence_arcs))
            for arc in sentence_arcs:
                # Get current arc
                token = sentence.tokens[arc[1]]
                head = sentence.tokens[arc[0]]
                token_vector = []
                #print("feature: " + str(feature))
                #print("token: " + str(token.form))
                #print("head: " + str(head.form))
                for feature_name in self.features:
                    full_feature = feature_name+"="
                    if feature_name in ["dform", "dpos"]:
                        feature_val = getattr(token, feature_name[1:])
                        if feature_val == "_":
                            full_feature += "_NULL_"
                        else:
                            full_feature += feature_val
                    if feature_name == "dform, dpos":
                        form_val = "_NULL_" if token.form == "_" else token.form
                        pos_val = "_NULL_" if token.pos == "_" else token.pos
                        full_feature += form_val+"+"+pos_val
                    #print(full_feature, i)
                    #print("")
                    if full_feature not in self.map.keys():
                        self.map[full_feature] = i
                        token_vector.append(i)
                        i += 1
                self.vectors[token] = token_vector

In [60]:
test_mapping = FeatureMapping(test_features, test_data)
test_mapping.create_mapping()
test_mapping.map

{'dform=In': 0,
 'dpos=IN': 1,
 'dform, dpos=In+IN': 2,
 'dform=an': 3,
 'dpos=DT': 4,
 'dform, dpos=an+DT': 5,
 'dform=Oct.': 6,
 'dpos=NNP': 7,
 'dform, dpos=Oct.+NNP': 8,
 'dform=19': 9,
 'dpos=CD': 10,
 'dform, dpos=19+CD': 11,
 'dform=review': 12,
 'dpos=NN': 13,
 'dform, dpos=review+NN': 14,
 'dform=of': 15,
 'dform, dpos=of+IN': 16,
 'dform=``': 17,
 'dpos=``': 18,
 'dform, dpos=``+``': 19,
 'dform=The': 20,
 'dform, dpos=The+DT': 21,
 'dform=Misanthrope': 22,
 'dform, dpos=Misanthrope+NN': 23,
 "dform=''": 24,
 "dpos=''": 25,
 "dform, dpos=''+''": 26,
 'dform=at': 27,
 'dform, dpos=at+IN': 28,
 'dform=Chicago': 29,
 'dform, dpos=Chicago+NNP': 30,
 "dform='s": 31,
 'dpos=POS': 32,
 "dform, dpos='s+POS": 33,
 'dform=Goodman': 34,
 'dform, dpos=Goodman+NNP': 35,
 'dform=Theatre': 36,
 'dform, dpos=Theatre+NNP': 37,
 'dform=(': 38,
 'dpos=-LRB-': 39,
 'dform, dpos=(+-LRB-': 40,
 'dform=Revitalized': 41,
 'dpos=VBN': 42,
 'dform, dpos=Revitalized+VBN': 43,
 'dform=Classics': 44,
 'd

# Feature Vector Representation

In [61]:
test_mapping.vectors

{<__main__.Token at 0x1077788b0>: [0, 1, 2],
 <__main__.Token at 0x107778190>: [3, 4, 5],
 <__main__.Token at 0x1077789d0>: [6, 7, 8],
 <__main__.Token at 0x1107d45b0>: [9, 10, 11],
 <__main__.Token at 0x10771ea60>: [12, 13, 14],
 <__main__.Token at 0x10771ee80>: [15, 16],
 <__main__.Token at 0x10771ec10>: [17, 18, 19],
 <__main__.Token at 0x10771ec40>: [20, 21],
 <__main__.Token at 0x1116cd880>: [22, 23],
 <__main__.Token at 0x1116cd850>: [24, 25, 26],
 <__main__.Token at 0x1116cdfd0>: [27, 28],
 <__main__.Token at 0x1116cd8b0>: [29, 30],
 <__main__.Token at 0x1116cd910>: [31, 32, 33],
 <__main__.Token at 0x1116cd8e0>: [34, 35],
 <__main__.Token at 0x1116cd2e0>: [36, 37],
 <__main__.Token at 0x1116cd100>: [38, 39, 40],
 <__main__.Token at 0x1116cd820>: [],
 <__main__.Token at 0x1116cd580>: [41, 42, 43],
 <__main__.Token at 0x1116cd730>: [44, 45, 46],
 <__main__.Token at 0x1116cdf70>: [47, 48, 49],
 <__main__.Token at 0x1116cd5b0>: [50, 51],
 <__main__.Token at 0x1116cd3d0>: [52, 53],


# Weight Vector