In [1]:
import numpy as np

In [2]:
# Relative path of the English training treebank; the cells below read it
# both raw and through Reader.
train_path_en = "Treebanks/english/train/wsj_train.only-projective.first-1k.conll06"

In [3]:
# Peek at the raw file: each non-blank line is one token, split here into
# its tab-separated columns. The `with` block closes the handle once the
# loop finishes instead of leaving it open for the rest of the session.
with open(train_path_en, "r", encoding="utf-8") as f:
    for element in f:
        print(element.split('\t'))

['1', 'In', 'in', 'IN', '_', '_', '43', 'ADV', '_', '_\n']
['2', 'an', 'an', 'DT', '_', '_', '5', 'NMOD', '_', '_\n']
['3', 'Oct.', 'oct.', 'NNP', '_', '_', '5', 'NMOD', '_', '_\n']
['4', '19', '0', 'CD', '_', '_', '3', 'NMOD', '_', '_\n']
['5', 'review', 'review', 'NN', '_', '_', '20', 'NMOD', '_', '_\n']
['6', 'of', 'of', 'IN', '_', '_', '5', 'NMOD', '_', '_\n']
['7', '``', '``', '``', '_', '_', '9', 'P', '_', '_\n']
['8', 'The', 'the', 'DT', '_', '_', '9', 'NMOD', '_', '_\n']
['9', 'Misanthrope', 'misanthrope', 'NN', '_', '_', '6', 'PMOD', '_', '_\n']
['10', "''", "''", "''", '_', '_', '9', 'P', '_', '_\n']
['11', 'at', 'at', 'IN', '_', '_', '9', 'NMOD', '_', '_\n']
['12', 'Chicago', 'chicago', 'NNP', '_', '_', '15', 'NMOD', '_', '_\n']
['13', "'s", "'s", 'POS', '_', '_', '12', 'NMOD', '_', '_\n']
['14', 'Goodman', 'goodman', 'NNP', '_', '_', '15', 'NMOD', '_', '_\n']
['15', 'Theatre', 'theatre', 'NNP', '_', '_', '11', 'PMOD', '_', '_\n']
['16', '(', '(', '-LRB-', '_', '_', '20', 'P

In [4]:
class Token:
    """One token row of a CoNLL-06 file, with one attribute per column.

    Every attribute starts out as ``None`` and is meant to be filled in by
    whoever creates the token. The names ``unknown``, ``unknown2`` and
    ``unknown3`` are placeholders for columns that have no descriptive name
    in this notebook.
    # NOTE(review): in the CoNLL-X layout these are presumably POSTAG, PHEAD
    # and PDEPREL -- confirm against the format description.
    """

    def __init__(self):
        # Names are listed in the column order of the file (column 0 first).
        for column in (
            "id", "form", "lemma", "pos", "unknown",
            "features", "head", "deprel", "unknown2", "unknown3",
        ):
            setattr(self, column, None)

In [5]:
class Sentence:
    """A sequence of tokens that always starts with an artificial ROOT token.

    ``token_items`` is an iterable of tokens; they are stored, in the order
    given, after the ROOT entry in ``self.tokens``.
    """

    def __init__(self, token_items):
        # Artificial ROOT: id '0', with 'ROOT' as both form and POS tag.
        root = Token()
        root.id = '0'
        root.form = root.pos = 'ROOT'

        # All remaining columns carry the '_' placeholder value.
        for placeholder in ('lemma', 'unknown', 'features', 'head',
                            'deprel', 'unknown2', 'unknown3'):
            setattr(root, placeholder, '_')

        # ROOT first, then the caller's tokens in their original order.
        self.tokens = [root]
        self.tokens.extend(token_items)

In [6]:
def Reader(filepath):
    """Read a CoNLL-06 file and return its sentences.

    Each non-blank line must hold at least ten tab-separated columns, which
    are stored (in order) on a ``Token`` as id, form, lemma, pos, unknown,
    features, head, deprel, unknown2 and unknown3. A blank line ends the
    current sentence.

    Args:
        filepath: path of the file to read; it is decoded as UTF-8.

    Returns:
        A list of ``Sentence`` objects in file order.

    Raises:
        IndexError: if a non-blank line has fewer than ten columns.
    """
    sentences = []
    token_items = []

    # `with` guarantees the handle is closed even if a line is malformed.
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            # A blank (or whitespace-only) line closes the current sentence.
            if line.strip() == '':
                sentences.append(Sentence(token_items))
                token_items = []
                continue

            items = line.split('\t')

            token = Token()
            token.id = items[0]
            token.form = items[1]
            token.lemma = items[2]
            token.pos = items[3]
            token.unknown = items[4]
            token.features = items[5]
            token.head = items[6]
            token.deprel = items[7]
            token.unknown2 = items[8]
            # The last column is stored exactly as read, i.e. including the
            # line's trailing newline, so stored values stay unchanged.
            token.unknown3 = items[9]

            token_items.append(token)

    # A file that does not end with a blank line still has a pending
    # sentence at EOF; without this flush it would be silently lost.
    if token_items:
        sentences.append(Sentence(token_items))

    return sentences

In [7]:
def Writer(target_filepath, sentences):
    """Write sentences to ``target_filepath`` in tab-separated CoNLL-06 form.

    One line is written per token (ten columns joined by tabs), followed by
    one empty line after each sentence. The artificial ROOT token (id '0'
    and form 'ROOT') is not written.

    Args:
        target_filepath: path of the output file; it is overwritten and
            encoded as UTF-8.
        sentences: iterable of objects exposing a ``tokens`` sequence whose
            items carry the ten string attributes written below.
    """
    with open(target_filepath, 'w', encoding="utf-8") as f:
        for sentence in sentences:
            for token in sentence.tokens:
                # Skip only the artificial ROOT. Checking the id as well as
                # the form keeps a real word spelled "ROOT" in the output.
                if token.id == '0' and token.form == 'ROOT':
                    continue

                line = '\t'.join((
                    token.id,
                    token.form,
                    token.lemma,
                    token.pos,
                    token.unknown,
                    token.features,
                    token.head,
                    token.deprel,
                    token.unknown2,
                    token.unknown3,
                ))

                # The last column may or may not still carry the newline it
                # was read with; terminate the line exactly once either way.
                if not line.endswith('\n'):
                    line += '\n'

                f.write(line)

            # Empty line marks the end of the sentence.
            f.write('\n')

In [9]:
# Target file for the read -> write round trip of the training treebank.
# NOTE(review): Writer opens this path with mode 'w' and does not create
# directories, so the "test/" folder presumably has to exist beforehand.
train_path_en_write_test = "test/wsj_trainWriteTest.only-projective.first-1k.conll06"

# Parse the treebank into Sentence objects, then write them back out.
train_1k_read = Reader(train_path_en)
Writer(train_path_en_write_test, train_1k_read)

In [14]:
# Token count of the first parsed sentence; the artificial ROOT that
# Sentence puts at index 0 is included in this number.
len(train_1k_read[0].tokens)

50

In [10]:
from difflib import Differ

# Round-trip check: compare the original treebank with the file produced by
# Writer, line by line. Differ prefixes unchanged lines with two spaces and
# differing lines with "- ", "+ " or "? ".
with open(train_path_en, encoding="utf-8") as file_1, \
        open(train_path_en_write_test, encoding="utf-8") as file_2:
    differ = Differ()

    for line in differ.compare(file_1.readlines(), file_2.readlines()):
        # Lines from readlines() keep their own '\n'; suppress print's
        # newline so the listing is not double-spaced.
        print(line, end="")

  1	In	in	IN	_	_	43	ADV	_	_

  2	an	an	DT	_	_	5	NMOD	_	_

  3	Oct.	oct.	NNP	_	_	5	NMOD	_	_

  4	19	0	CD	_	_	3	NMOD	_	_

  5	review	review	NN	_	_	20	NMOD	_	_

  6	of	of	IN	_	_	5	NMOD	_	_

  7	``	``	``	_	_	9	P	_	_

  8	The	the	DT	_	_	9	NMOD	_	_

  9	Misanthrope	misanthrope	NN	_	_	6	PMOD	_	_

  10	''	''	''	_	_	9	P	_	_

  11	at	at	IN	_	_	9	NMOD	_	_

  12	Chicago	chicago	NNP	_	_	15	NMOD	_	_

  13	's	's	POS	_	_	12	NMOD	_	_

  14	Goodman	goodman	NNP	_	_	15	NMOD	_	_

  15	Theatre	theatre	NNP	_	_	11	PMOD	_	_

  16	(	(	-LRB-	_	_	20	P	_	_

  17	``	``	``	_	_	20	P	_	_

  18	Revitalized	revitalize	VBN	_	_	19	NMOD	_	_

  19	Classics	classic	NNS	_	_	20	SBJ	_	_

  20	Take	take	VBP	_	_	1	PMOD	_	_

  21	the	the	DT	_	_	22	NMOD	_	_

  22	Stage	stage	NN	_	_	20	OBJ	_	_

  23	in	in	IN	_	_	20	ADV	_	_

  24	Windy	windy	NNP	_	_	25	NMOD	_	_

  25	City	city	NNP	_	_	23	PMOD	_	_

  26	,	,	,	_	_	20	P	_	_

  27	''	''	''	_	_	20	P	_	_

  28	Leisure	leisure	NN	_	_	20	NMOD	_	_

  29	&	and	CC	_	_	28	COORD	_	_

  30	Arts	ar

# Decoder (Eisner's Algorithm)

In [None]:
def Eisner(sentence, edge_scores):
    """Return the score of the best projective dependency tree (Eisner).

    Only the score is computed here; the tree itself is not recovered
    (no backtracking yet).

    Args:
        sentence: object with a ``tokens`` sequence; index 0 is treated as
            the root of the tree, because the result is read from the
            complete span that starts at 0 and is headed there.
        edge_scores: 2-d array-like with at least ``n x n`` entries, where
            ``n = len(sentence.tokens)``. ``edge_scores[h][d]`` is used as
            the score of the arc from head ``h`` to dependent ``d``.

    Returns:
        The maximum total arc score as a float (``0.0`` for a sentence with
        no tokens or with only the root).

    Raises:
        ValueError: if ``edge_scores`` is not 2-d or is smaller than n x n.
    """
    n = len(sentence.tokens)
    if n == 0:
        return 0.0

    scores = np.asarray(edge_scores, dtype=float)
    if scores.ndim != 2 or scores.shape[0] < n or scores.shape[1] < n:
        raise ValueError(
            "edge_scores must be a 2-d array of at least %d x %d entries" % (n, n)
        )

    # Chart tables indexed [s][t] for the span s..t (s <= t):
    #   O_r / O_l: open spans, with the arc t -> s / s -> t just added
    #   C_r / C_l: complete spans headed by t / by s
    # Zero initialisation also provides the base case C[i][i] = 0.
    O_r = np.zeros((n, n))
    O_l = np.zeros((n, n))
    C_r = np.zeros((n, n))
    C_l = np.zeros((n, n))

    for m in range(1, n):            # span width
        for s in range(n - m):       # span start; keeps t = s + m <= n - 1
            t = s + m

            # Best split q (s <= q < t) into a span headed by s on the left
            # and a span headed by t on the right; shared by both open spans.
            best_split = np.max(C_l[s, s:t] + C_r[s + 1:t + 1, t])
            O_r[s, t] = best_split + scores[t, s]
            O_l[s, t] = best_split + scores[s, t]

            # Complete span headed by t: q ranges over s .. t-1.
            C_r[s, t] = np.max(C_r[s, s:t] + O_r[s:t, t])
            # Complete span headed by s: q ranges over s+1 .. t.
            C_l[s, t] = np.max(O_l[s, s + 1:t + 1] + C_l[s + 1:t + 1, t])

    # Whole sentence (0 .. n-1) as a complete span headed by index 0.
    return float(C_l[0, n - 1])

# implement backtrackers