In [1]:
import numpy as np
import timeit
import random
import pickle, gzip

In [2]:
from IO import Reader, Writer, Data, Sentence, Token
from feature import FeatureMapping
from eisner import Eisner
from model import Model

# Decoder (Eisner's Algorithm)

In [3]:
# Toy 4x4 score matrix used to sanity-check the decoder below.
# NO_ARC fills column 0 and the diagonal; presumably these are arcs the decoder
# must never select — confirm against Eisner.decode.
NO_ARC = -10000
test_matrix = np.array(
    [
        [NO_ARC, 9, 10, 9],
        [NO_ARC, NO_ARC, 20, 3],
        [NO_ARC, 30, NO_ARC, 30],
        [NO_ARC, 11, 0, NO_ARC],
    ]
)
# Spot-check a single entry (row 0, column 2).
test_matrix[0, 2]

10

In [4]:
# Run the Eisner decoder on the toy score matrix as a quick sanity check.
# The displayed result is a dict from one token index to another (both strings);
# presumably dependent -> head — confirm against Eisner.decode.
test_decoder = Eisner()
test_decoder.decode(test_matrix)

{'1': '2', '2': '0', '3': '2'}

# Main: English (WSJ)

In [None]:
# Load the first-1k projective WSJ training sentences (CoNLL-06 file, per its
# name) and wrap the parsed result in a Data container.
train_path_en_1k = "Treebanks/english/train/wsj_train.only-projective.first-1k.conll06"
reader_en_1k = Reader(train_path_en_1k)
train_data_en_1k = Data(reader_en_1k.read_file())
# Show the sentence count instead of the raw list: displaying `.sentences`
# directly prints one `<IO.Sentence at 0x...>` repr per sentence.
len(train_data_en_1k.sentences)

In [5]:
# Load the first-5k projective WSJ training sentences (CoNLL-06 file, per its
# name) and wrap the parsed result in a Data container.
train_path_en_5k = "Treebanks/english/train/wsj_train.only-projective.first-5k.conll06"
reader_en_5k = Reader(train_path_en_5k)
train_data_en_5k = Data(reader_en_5k.read_file())
# Displays the raw sentence list: one `<IO.Sentence at 0x...>` repr per
# sentence, which makes for very long output.
train_data_en_5k.sentences

[<IO.Sentence at 0x11046bd00>,
 <IO.Sentence at 0x11046be80>,
 <IO.Sentence at 0x11046b490>,
 <IO.Sentence at 0x11046b1c0>,
 <IO.Sentence at 0x11045cee0>,
 <IO.Sentence at 0x11045c910>,
 <IO.Sentence at 0x11045c6d0>,
 <IO.Sentence at 0x11045c4c0>,
 <IO.Sentence at 0x110446b50>,
 <IO.Sentence at 0x1104468e0>,
 <IO.Sentence at 0x1104930a0>,
 <IO.Sentence at 0x1104938b0>,
 <IO.Sentence at 0x110493c70>,
 <IO.Sentence at 0x11049e580>,
 <IO.Sentence at 0x11049e880>,
 <IO.Sentence at 0x11049eeb0>,
 <IO.Sentence at 0x1104aa2e0>,
 <IO.Sentence at 0x1104aac10>,
 <IO.Sentence at 0x1104b5370>,
 <IO.Sentence at 0x1104b56d0>,
 <IO.Sentence at 0x1104b5910>,
 <IO.Sentence at 0x1104c02e0>,
 <IO.Sentence at 0x1104c06d0>,
 <IO.Sentence at 0x1104c0940>,
 <IO.Sentence at 0x1104c0eb0>,
 <IO.Sentence at 0x1104ca310>,
 <IO.Sentence at 0x1104ca7c0>,
 <IO.Sentence at 0x1104caee0>,
 <IO.Sentence at 0x1105be7f0>,
 <IO.Sentence at 0x1105beca0>,
 <IO.Sentence at 0x1105c90d0>,
 <IO.Sentence at 0x1105c95b0>,
 <IO.Sen

In [None]:
#train_path_de = "Treebanks/german/train/tiger-2.2.train.only-projective.first-5k.conll06"
#reader = Reader(train_path_de)
#test_data = Data(reader.read_file())
#test_data.sentences

In [None]:
# Build the feature mapping from the 1k English training sentences.
# create_map() presumably fills `mapping_en_1k.map`, whose size is checked in
# the next cell — confirm against FeatureMapping.
mapping_en_1k = FeatureMapping(train_data_en_1k.sentences)
mapping_en_1k.create_map()

In [None]:
# Number of entries in the 1k feature map.
len(mapping_en_1k.map)

In [None]:
# Spot-check the reader: `form` of the token at index 2 in the first sentence
# (presumably the surface word form — confirm against IO.Token).
train_data_en_1k.sentences[0].tokens[2].form

In [None]:
# Create a model from the 1k English training data and its feature mapping.
model_en_1k = Model(train_data_en_1k, mapping_en_1k)

In [None]:
# Train the 1k model. The argument is presumably the number of epochs (the 5k
# run further down logs "Epoch: N") — confirm against Model.train.
model_en_1k.train(3)

In [None]:
# Inspect the trained model's instance attributes (vars() returns its __dict__).
# NOTE(review): looks like a debugging aid; the output may be large.
vars(model_en_1k)

In [6]:
# Build the feature mapping from the 5k English training sentences.
# create_map() prints a timing line every 1000 sentences (see the output below).
mapping_en_5k = FeatureMapping(train_data_en_5k.sentences)
mapping_en_5k.create_map()

Start time: 8.652391416
1000 sentences took 17.914218459 to map
2000 sentences took 39.267353459 to map
3000 sentences took 63.96708420900001 to map
4000 sentences took 98.810717709 to map
5000 sentences took 138.117167792 to map


In [7]:
# Create a model from the 5k English training data and its feature mapping.
model_en_5k = Model(train_data_en_5k, mapping_en_5k)

In [None]:
# Train the 5k model. The run logs a timing line every 100 sentences and a UAS
# score per epoch (see the output below); the argument is presumably the number
# of epochs — confirm against Model.train.
model_en_5k.train(5)

Start time: 2428.020590875
Epoch: 1
Time taken for past 100 sentences: 4.576695833000031
Time taken for past 200 sentences: 10.457128000000012
Time taken for past 300 sentences: 14.588591916000041
Time taken for past 400 sentences: 20.312308291000136
Time taken for past 500 sentences: 25.479420582999865
Time taken for past 600 sentences: 30.615048749999914
Time taken for past 700 sentences: 34.84838958299997
Time taken for past 800 sentences: 39.749225083000056
Time taken for past 900 sentences: 43.836054416000025
Time taken for past 1000 sentences: 50.680409832999885
Time taken for past 1100 sentences: 56.22226483300028
Time taken for past 1200 sentences: 61.28591795800003
Time taken for past 1300 sentences: 65.947939958
Time taken for past 1400 sentences: 70.62136054100029
Time taken for past 1500 sentences: 75.61333858299986
Time taken for past 1600 sentences: 81.3069412079999
Time taken for past 1700 sentences: 86.98382745800018
Time taken for past 1800 sentences: 92.549225708
Time

Time taken for past 4700 sentences: 195.5867106249998
Time taken for past 4800 sentences: 199.74595495799986
Time taken for past 4900 sentences: 203.58571574999996
Time taken for past 5000 sentences: 208.16937362499993
UAS score on 127143 tokens over 5000 sentences: 0.9576146543655569
Time taken for epoch: 208.32797737500005
0.0

Start time: 3117.696211541
Epoch: 4
Time taken for past 100 sentences: 3.7694867089999207
Time taken for past 200 sentences: 7.533061999999973
Time taken for past 300 sentences: 11.862632999999732
Time taken for past 400 sentences: 15.57107304200008
Time taken for past 500 sentences: 19.963328374999946
Time taken for past 600 sentences: 23.53218545899972
Time taken for past 700 sentences: 28.335005541999635
Time taken for past 800 sentences: 31.897865999999794
Time taken for past 900 sentences: 36.47107262500003
Time taken for past 1000 sentences: 40.809026958999766
Time taken for past 1100 sentences: 45.056185124999956
Time taken for past 1200 sentences: 49.8

In [None]:
# Load the English dev set (gold annotations, per the ".gold" file name) and
# wrap the parsed result in a Data container.
dev_gold_path_en = "Treebanks/english/dev/wsj_dev.conll06.gold"
reader_en_dev = Reader(dev_gold_path_en)
dev_gold_data = Data(reader_en_dev.read_file())
# Show the sentence count instead of the raw list: displaying `.sentences`
# directly prints one `<IO.Sentence at 0x...>` repr per sentence.
len(dev_gold_data.sentences)

In [None]:
# Run the 5k English model on the dev set. Predictions are apparently stored on
# each token's `x` attribute (inspected in the next cell) — confirm against
# Model.make_predictions.
model_en_5k.make_predictions(dev_gold_data)

In [None]:
# Spot-check: `x` of the token at index 5 in the first dev sentence
# (presumably the predicted head — confirm against Model.make_predictions).
dev_gold_data.sentences[0].tokens[5].x

In [None]:
# Score the dev-set predictions. Presumably compares each token's predicted `x`
# with its gold `head` (UAS, as logged during training) — confirm against
# Data.evaluate.
dev_gold_data.evaluate()

In [None]:
# Load the English test set and wrap the parsed result in a Data container.
# NOTE(review): the file is the ".blind" version, so despite the `gold` in the
# variable names it presumably carries no gold heads — confirm.
test_gold_path_en = "Treebanks/english/test/wsj_test.conll06.blind"
reader_en_test = Reader(test_gold_path_en)
test_gold_data = Data(reader_en_test.read_file())
# Show the sentence count instead of the raw list: displaying `.sentences`
# directly prints one `<IO.Sentence at 0x...>` repr per sentence.
len(test_gold_data.sentences)

In [None]:
# Run the 5k English model on the test set; the result is read back from each
# token's `x` attribute in the following cells.
model_en_5k.make_predictions(test_gold_data)

In [None]:
# Spot-check: `x` of the token at index 1 in the first test sentence
# (presumably the predicted head — confirm against Model.make_predictions).
test_gold_data.sentences[0].tokens[1].x

In [None]:
# Move each token's `x` value (apparently the predicted head written by
# make_predictions) into `head`, then reset `x` to "_", ready for writing.
# Tokens whose `x` is already "_" are skipped so that re-running this cell
# cannot overwrite the heads with the placeholder.
for gold_sentence in test_gold_data.sentences:
    for token in gold_sentence.tokens:
        if token.x == "_":
            continue
        token.head = token.x
        token.x = "_"

In [None]:
# Spot-check that `head` of the token at index 1 in the first test sentence now
# holds the value moved over from `x`.
test_gold_data.sentences[0].tokens[1].head

In [None]:
# Write the English test sentences out under the input file's base name
# (the directory part of the path is dropped).
test_gold_basename = test_gold_path_en.rsplit('/', 1)[-1]
test_gold_writer = Writer(test_gold_basename, test_gold_data.sentences)
test_gold_writer.write_file()

# German

In [None]:
# Load the first-5k projective TIGER training sentences (CoNLL-06 file, per its
# name) and wrap the parsed result in a Data container.
train_path_de_5k = "Treebanks/german/train/tiger-2.2.train.only-projective.first-5k.conll06"
reader_de_train_5k = Reader(train_path_de_5k)
train_data_de_5k = Data(reader_de_train_5k.read_file())
# Show the sentence count instead of the raw list: displaying `.sentences`
# directly prints one `<IO.Sentence at 0x...>` repr per sentence.
len(train_data_de_5k.sentences)

In [None]:
# Build the feature mapping from the 5k German training sentences.
mapping_de_5k = FeatureMapping(train_data_de_5k.sentences)
mapping_de_5k.create_map()

In [None]:
# Create a model from the 5k German training data and its feature mapping.
model_de_5k = Model(train_data_de_5k, mapping_de_5k)

In [None]:
# Train the 5k German model; the argument is presumably the number of epochs —
# confirm against Model.train.
model_de_5k.train(10)

In [None]:
# Load the full projective TIGER training set (CoNLL-06 file, per its name) and
# wrap the parsed result in a Data container.
train_path_de_full = "Treebanks/german/train/tiger-2.2.train.only-projective.conll06"
reader_de_train_full = Reader(train_path_de_full)
train_data_de_full = Data(reader_de_train_full.read_file())
# Preview only the first few sentences: displaying the whole list prints one
# `<IO.Sentence at 0x...>` repr per sentence. The next cell shows the count.
train_data_de_full.sentences[:5]

In [None]:
# Number of sentences in the full German training set.
len(train_data_de_full.sentences)

In [None]:
# Build the feature mapping from the full German training set.
# NOTE(review): the 5k English mapping above already took minutes according to
# its timing output, so expect this cell to run for a long time.
mapping_de_full = FeatureMapping(train_data_de_full.sentences)
mapping_de_full.create_map()

In [None]:
# Re-check the sentence count after building the feature map (same expression
# as two cells above).
len(train_data_de_full.sentences)

In [None]:
# Create a model from the full German training data and its feature mapping.
model_de_full = Model(train_data_de_full, mapping_de_full)

In [None]:
# Train the full-data German model; the argument is presumably the number of
# epochs — confirm against Model.train.
model_de_full.train(3)

In [None]:
# Load the German dev set in this cell: it is needed here but is otherwise only
# created further down, which fails with a NameError on a fresh top-to-bottom
# run. The later dev-set cell reloads the same file, so the 5k model still
# predicts on a fresh copy.
dev_path_de_5k = "Treebanks/german/dev/tiger-2.2.dev.conll06.gold"
reader_de_dev_5k = Reader(dev_path_de_5k)
dev_data_de_5k = Data(reader_de_dev_5k.read_file())
# Run the model trained on the full German training set on the dev set.
model_de_full.make_predictions(dev_data_de_5k)

In [None]:
# Load the German dev set (gold annotations, per the ".gold" file name) for
# evaluation.
# NOTE(review): the `_5k` suffix mirrors the 5k model's variable names; the
# path itself does not name a 5k subset.
dev_path_de_5k = "Treebanks/german/dev/tiger-2.2.dev.conll06.gold"
reader_de_dev_5k = Reader(dev_path_de_5k)
dev_data_de_5k = Data(reader_de_dev_5k.read_file())
# Sentence count as a quick check that the file loaded.
len(dev_data_de_5k.sentences)

In [None]:
# Run the 5k German model on the dev set loaded in the previous cell.
model_de_5k.make_predictions(dev_data_de_5k)

In [None]:
# Score the German dev-set predictions. Presumably compares each token's
# predicted `x` with its gold `head` — confirm against Data.evaluate.
# Must run before the next cell, which overwrites `head`.
dev_data_de_5k.evaluate()

In [None]:
# Move each token's `x` value (apparently the predicted head written by
# make_predictions) into `head`, then reset `x` to "_", ready for writing.
# This replaces the gold heads loaded from the dev file.
# Tokens whose `x` is already "_" are skipped so that re-running this cell
# cannot overwrite the heads with the placeholder.
for dev_sentence in dev_data_de_5k.sentences:
    for token in dev_sentence.tokens:
        if token.x == "_":
            continue
        token.head = token.x
        token.x = "_"

In [None]:
# Write the German dev sentences out under the input file's base name
# (the directory part of the path is dropped).
dev_basename_de = dev_path_de_5k.rsplit('/', 1)[-1]
dev_writer_de = Writer(dev_basename_de, dev_data_de_5k.sentences)
dev_writer_de.write_file()

In [None]:
# Load the German test set (".blind" file, so presumably without gold heads —
# confirm) and wrap the parsed result in a Data container.
test_path_de = "Treebanks/german/test/tiger-2.2.test.conll06.blind"
reader_de_test = Reader(test_path_de)
test_data_de = Data(reader_de_test.read_file())
# Sentence count as a quick check that the file loaded.
len(test_data_de.sentences)

In [None]:
# Run the 5k German model on the test set; the result is read back from each
# token's `x` attribute in the following cells.
model_de_5k.make_predictions(test_data_de)

In [None]:
# Spot-check: `x` of the token at index 1 in the first German test sentence
# (presumably the predicted head — confirm against Model.make_predictions).
test_data_de.sentences[0].tokens[1].x

In [None]:
# Move each token's `x` value (apparently the predicted head written by
# make_predictions) into `head`, then reset `x` to "_", ready for writing.
# Tokens whose `x` is already "_" are skipped so that re-running this cell
# cannot overwrite the heads with the placeholder.
for gold_sentence in test_data_de.sentences:
    for token in gold_sentence.tokens:
        if token.x == "_":
            continue
        token.head = token.x
        token.x = "_"

In [None]:
# Spot-check that `head` of the token at index 1 in the first German test
# sentence now holds the value moved over from `x`.
test_data_de.sentences[0].tokens[1].head

In [None]:
# Write the German test sentences out under the input file's base name
# (the directory part of the path is dropped).
test_basename_de = test_path_de.rsplit('/', 1)[-1]
test_writer_de = Writer(test_basename_de, test_data_de.sentences)
test_writer_de.write_file()

In [None]:
from difflib import Differ

# Line-by-line diff of a gold file against the predictions written for it.
# The German dev set is the one gold file this notebook also writes
# predictions for, so it is the pair compared here.
# NOTE(review): assumes Writer saves to "<file name>.pred" in the working
# directory — confirm against IO.Writer.
diff_gold_path = dev_path_de_5k
diff_pred_path = diff_gold_path.split('/')[-1] + '.pred'

with open(diff_gold_path) as gold_file, open(diff_pred_path) as pred_file:
    differ = Differ()

    # Differ yields lines that keep their trailing newline, so print them
    # without adding a second one.
    for line in differ.compare(gold_file.readlines(), pred_file.readlines()):
        print(line, end='')