In [1]:
import numpy as np
import timeit
import random
import pickle, gzip

In [2]:
from IO import Reader, Writer, Data, Sentence, Token
from feature import FeatureMapping
from eisner import Eisner
from model import Model

# Decoder (Eisner's Algorithm)

In [3]:
test_matrix = np.array([[-10000, 9, 10, 9], 
               [-10000, -10000, 20, 3], 
               [-10000, 30, -10000, 30],
               [-10000, 11, 0, -10000]])
test_matrix[0, 2]

10

In [4]:
test_decoder = Eisner()
test_decoder.decode(test_matrix)

{'1': '2', '2': '0', '3': '2'}

# Main

In [5]:
train_path_en_1k = "Treebanks/english/train/wsj_train.only-projective.first-1k.conll06"
reader_en_1k = Reader(train_path_en_1k)
train_data_en_1k = Data(reader_en_1k.read_file())
train_data_en_1k.sentences

[<IO.Sentence at 0x104394400>,
 <IO.Sentence at 0x1043947c0>,
 <IO.Sentence at 0x10438dcd0>,
 <IO.Sentence at 0x10438da00>,
 <IO.Sentence at 0x10438d400>,
 <IO.Sentence at 0x104300ac0>,
 <IO.Sentence at 0x104300eb0>,
 <IO.Sentence at 0x1043b7040>,
 <IO.Sentence at 0x1043b7400>,
 <IO.Sentence at 0x1043b7940>,
 <IO.Sentence at 0x1043c20a0>,
 <IO.Sentence at 0x1043c2820>,
 <IO.Sentence at 0x1043c2be0>,
 <IO.Sentence at 0x1043ce4f0>,
 <IO.Sentence at 0x1043ce7f0>,
 <IO.Sentence at 0x1043cee20>,
 <IO.Sentence at 0x1043d8250>,
 <IO.Sentence at 0x1043d8b80>,
 <IO.Sentence at 0x1043e32e0>,
 <IO.Sentence at 0x1043e3640>,
 <IO.Sentence at 0x1043e3880>,
 <IO.Sentence at 0x1043ee250>,
 <IO.Sentence at 0x1043ee640>,
 <IO.Sentence at 0x1043ee8b0>,
 <IO.Sentence at 0x1043eee20>,
 <IO.Sentence at 0x1043f8280>,
 <IO.Sentence at 0x1043f8730>,
 <IO.Sentence at 0x1043f8e50>,
 <IO.Sentence at 0x104403760>,
 <IO.Sentence at 0x104403c10>,
 <IO.Sentence at 0x10440f040>,
 <IO.Sentence at 0x10440f520>,
 <IO.Sen

In [6]:
train_path_en_5k = "Treebanks/english/train/wsj_train.only-projective.first-5k.conll06"
reader_en_5k = Reader(train_path_en_5k)
train_data_en_5k = Data(reader_en_5k.read_file())
train_data_en_5k.sentences

[<IO.Sentence at 0x1066eaa00>,
 <IO.Sentence at 0x1066eab80>,
 <IO.Sentence at 0x1066f6070>,
 <IO.Sentence at 0x1066f6340>,
 <IO.Sentence at 0x1066f6940>,
 <IO.Sentence at 0x1066f6ca0>,
 <IO.Sentence at 0x1066f6ee0>,
 <IO.Sentence at 0x106700130>,
 <IO.Sentence at 0x1067004f0>,
 <IO.Sentence at 0x106700a30>,
 <IO.Sentence at 0x10670b190>,
 <IO.Sentence at 0x10670b970>,
 <IO.Sentence at 0x10670bd30>,
 <IO.Sentence at 0x106717640>,
 <IO.Sentence at 0x106717940>,
 <IO.Sentence at 0x106717f70>,
 <IO.Sentence at 0x1067213a0>,
 <IO.Sentence at 0x106721cd0>,
 <IO.Sentence at 0x10672e430>,
 <IO.Sentence at 0x10672e790>,
 <IO.Sentence at 0x10672e9d0>,
 <IO.Sentence at 0x1067383a0>,
 <IO.Sentence at 0x106738790>,
 <IO.Sentence at 0x106738a00>,
 <IO.Sentence at 0x106738f70>,
 <IO.Sentence at 0x1067433d0>,
 <IO.Sentence at 0x106743880>,
 <IO.Sentence at 0x106743fa0>,
 <IO.Sentence at 0x10674d8b0>,
 <IO.Sentence at 0x10674dd60>,
 <IO.Sentence at 0x106759190>,
 <IO.Sentence at 0x106759670>,
 <IO.Sen

In [7]:
#train_path_de = "Treebanks/german/train/tiger-2.2.train.only-projective.first-5k.conll06"
#reader = Reader(train_path_de)
#test_data = Data(reader.read_file())
#test_data.sentences

In [8]:
mapping_en_1k = FeatureMapping(train_data_en_1k.sentences)
mapping_en_1k.create_map()

1000 sentences mapped


In [9]:
len(mapping_en_1k.map)

6063219

In [10]:
train_data_en_1k.sentences[0].tokens[2].form

'an'

In [11]:
model_en_1k = Model(train_data_en_1k, mapping_en_1k)

In [12]:
model_en_1k.train(3)

Start time: 16.85629275
Epoch: 1
Time taken for past 100 sentences: 2.7735615829999993
Time taken for past 200 sentences: 5.210685082999998
Time taken for past 300 sentences: 7.451746665999998
Time taken for past 400 sentences: 9.551509790999997
Time taken for past 500 sentences: 12.688865707999998
Time taken for past 600 sentences: 15.115399665999998
Time taken for past 700 sentences: 17.416576915999997
Time taken for past 800 sentences: 20.443068832999998
Time taken for past 900 sentences: 23.008012291
Time taken for past 1000 sentences: 25.273787999999996
UAS score on 26145 tokens over 1000 sentences: 0.6599732262382865
Time taken for epoch: 25.3120695
0.0

Start time: 42.604995041
Epoch: 2
Time taken for past 100 sentences: 2.5195868339999947
Time taken for past 200 sentences: 5.058483291999998
Time taken for past 300 sentences: 7.175467333999997
Time taken for past 400 sentences: 9.506574084
Time taken for past 500 sentences: 12.567336749999996
Time taken for past 600 sentences: 1

In [13]:
vars(model_en_1k)

{'data': <IO.Data at 0x1035443d0>,
 'feature_mapping': <feature.FeatureMapping at 0x127c43220>,
 'weight_vector': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)}

In [14]:
dev_gold_path_en = "Treebanks/english/dev/wsj_dev.conll06.gold"
reader_en_dev = Reader(dev_gold_path_en)
dev_gold_data = Data(reader_en_dev.read_file())
dev_gold_data.sentences

[<IO.Sentence at 0x127c43d90>,
 <IO.Sentence at 0x14b1ed130>,
 <IO.Sentence at 0x14b1ed8e0>,
 <IO.Sentence at 0x14b1edf40>,
 <IO.Sentence at 0x14b24a5e0>,
 <IO.Sentence at 0x14b24ab80>,
 <IO.Sentence at 0x14b24af10>,
 <IO.Sentence at 0x14b244460>,
 <IO.Sentence at 0x14b244a90>,
 <IO.Sentence at 0x14b244fa0>,
 <IO.Sentence at 0x14b280580>,
 <IO.Sentence at 0x14b280a90>,
 <IO.Sentence at 0x14b27f2b0>,
 <IO.Sentence at 0x14b27f9a0>,
 <IO.Sentence at 0x14b27ff40>,
 <IO.Sentence at 0x14b3913a0>,
 <IO.Sentence at 0x14b391a60>,
 <IO.Sentence at 0x14b391f70>,
 <IO.Sentence at 0x14b3a25e0>,
 <IO.Sentence at 0x14b3a2b20>,
 <IO.Sentence at 0x14b3a2fa0>,
 <IO.Sentence at 0x14b3833a0>,
 <IO.Sentence at 0x14b383910>,
 <IO.Sentence at 0x14b383a60>,
 <IO.Sentence at 0x14b395130>,
 <IO.Sentence at 0x14b3952b0>,
 <IO.Sentence at 0x14b3953a0>,
 <IO.Sentence at 0x14b395820>,
 <IO.Sentence at 0x14b395970>,
 <IO.Sentence at 0x14b388040>,
 <IO.Sentence at 0x14b388550>,
 <IO.Sentence at 0x14b3888b0>,
 <IO.Sen

In [15]:
model_en_1k.make_predictions(dev_gold_data)

In [16]:
dev_gold_data.sentences[0].tokens[5].x

'0'

In [17]:
dev_gold_data.evaluate()

UAS score on 27678 tokens over 1083 sentences: 0.726497579304863


0.726497579304863

In [20]:
train_data_en_5k

<IO.Data at 0x1034c3760>

In [21]:
mapping_en_5k = FeatureMapping(train_data_en_5k.sentences)
mapping_en_5k.create_map()

1000 sentences mapped
2000 sentences mapped
3000 sentences mapped
4000 sentences mapped
5000 sentences mapped


In [22]:
model_en_5k = Model(train_data_en_5k, mapping_en_5k)

In [23]:
model_en_5k.train(5)

Start time: 321.25666925
Epoch: 1
Time taken for past 100 sentences: 4.307021582999994
Time taken for past 200 sentences: 8.581340415999989
Time taken for past 300 sentences: 12.866729290999956
Time taken for past 400 sentences: 16.820144957999958
Time taken for past 500 sentences: 20.933719499999995
Time taken for past 600 sentences: 24.774229875000003
Time taken for past 700 sentences: 29.580057915999987
Time taken for past 800 sentences: 33.602590916
Time taken for past 900 sentences: 37.64904487499996
Time taken for past 1000 sentences: 41.21184058299997
Time taken for past 1100 sentences: 45.356080582999994
Time taken for past 1200 sentences: 49.25101720800001
Time taken for past 1300 sentences: 52.69243562499997
Time taken for past 1400 sentences: 56.355345166000006
Time taken for past 1500 sentences: 60.635780874999966
Time taken for past 1600 sentences: 64.61693287499997
Time taken for past 1700 sentences: 68.22340187499998
Time taken for past 1800 sentences: 71.778243583
Time 

Time taken for past 4900 sentences: 218.72947491700006
Time taken for past 5000 sentences: 224.237665042
UAS score on 127143 tokens over 5000 sentences: 0.9292843491187088
Time taken for epoch: 224.43126008299998
0.0

Start time: 932.960990291
Epoch: 4
Time taken for past 100 sentences: 4.81632562499999
Time taken for past 200 sentences: 9.152972792000014
Time taken for past 300 sentences: 13.009065083999985
Time taken for past 400 sentences: 16.927612875000023
Time taken for past 500 sentences: 21.356782291999934
Time taken for past 600 sentences: 26.469058625000002
Time taken for past 700 sentences: 30.996722834000025
Time taken for past 800 sentences: 35.34028095899998
Time taken for past 900 sentences: 40.41775070899996
Time taken for past 1000 sentences: 44.930019625
Time taken for past 1100 sentences: 49.300244583999984
Time taken for past 1200 sentences: 53.56587887499995
Time taken for past 1300 sentences: 57.187389209
Time taken for past 1400 sentences: 61.56952783399993
Time 

In [24]:
model_en_5k.make_predictions(dev_gold_data)

In [25]:
dev_gold_data.evaluate()

UAS score on 27678 tokens over 1083 sentences: 0.7910614928824337


0.7910614928824337

In [26]:
test_gold_path_en = "Treebanks/english/test/wsj_test.conll06.blind"
reader_en_test = Reader(test_gold_path_en)
test_gold_data = Data(reader_en_test.read_file())
test_gold_data.sentences

[<IO.Sentence at 0x127c3ab80>,
 <IO.Sentence at 0x1689eadf0>,
 <IO.Sentence at 0x168a06250>,
 <IO.Sentence at 0x168a06790>,
 <IO.Sentence at 0x168a06eb0>,
 <IO.Sentence at 0x128030280>,
 <IO.Sentence at 0x128030910>,
 <IO.Sentence at 0x128030c10>,
 <IO.Sentence at 0x128030f70>,
 <IO.Sentence at 0x1282dd1c0>,
 <IO.Sentence at 0x1282dd550>,
 <IO.Sentence at 0x1282ddac0>,
 <IO.Sentence at 0x1282ddeb0>,
 <IO.Sentence at 0x128110760>,
 <IO.Sentence at 0x128110f40>,
 <IO.Sentence at 0x128540880>,
 <IO.Sentence at 0x128540d90>,
 <IO.Sentence at 0x128409310>,
 <IO.Sentence at 0x128409940>,
 <IO.Sentence at 0x128409dc0>,
 <IO.Sentence at 0x1281c5250>,
 <IO.Sentence at 0x1281c5700>,
 <IO.Sentence at 0x1281c5a90>,
 <IO.Sentence at 0x1281c5fa0>,
 <IO.Sentence at 0x37e76c3a0>,
 <IO.Sentence at 0x37e76c760>,
 <IO.Sentence at 0x37e76cd90>,
 <IO.Sentence at 0x1283d6670>,
 <IO.Sentence at 0x1283d6910>,
 <IO.Sentence at 0x1283d6cd0>,
 <IO.Sentence at 0x12863c0d0>,
 <IO.Sentence at 0x12863c610>,
 <IO.Sen

In [41]:
model_en_5k.make_predictions(test_gold_data)

In [44]:
test_gold_data.sentences[0].tokens[1].x

'2'

In [45]:
for gold_sentence in test_gold_data.sentences:
    for token in gold_sentence.tokens:
        token.head = token.x
        token.x = "_"

In [47]:
test_gold_data.sentences[0].tokens[1].head

'2'

In [48]:
test_gold_writer = Writer(test_gold_path_en.split('/')[-1], test_gold_data.sentences)
test_gold_writer.write_file()

NameError: name 'test_gold_path' is not defined

In [None]:
writer = Writer(train_path_en.split('/')[-1], train_data_en_1k.sentences)
writer.write_file()

In [None]:
from difflib import Differ
 
with open(train_path_en) as file_1, open(train_path_en.split('/')[-1]+'.pred') as file_2:
    differ = Differ()
 
    for line in differ.compare(file_1.readlines(), file_2.readlines()):
        print(line)