# Final Project
## Computational Linguistics (Winter semester 2020/21, Prof. Dr. Alexander Koller)
## Siyu Tao

### Environment Setup

In [1]:
! pip install editdistance



In [1]:
# Put ./bin at the front of sys.path; this has to happen before the project
# imports below (presumably preprocessing, model1 and ed_model live in ./bin).
import sys
import os
sys.path.insert(0, os.path.abspath('./bin'))
# corpus preprocessing helper used in the "Preprocessing" section
from preprocessing import Preprocessor
# standard IBM Model 1 (baseline)
from model1 import Model1
# Model 1 extended with edit distance (heuristic initialization / modified decoding)
from ed_model import ED_Model1

## Preprocessing

### Download and Prepare Corpus

In [None]:
# Download the OpenEDGeS v1.0.0 archive, extract it into data/, and remove the
# zip afterwards (the rm only runs if unzip succeeded, because of the &&).
! wget http://demo.spraakdata.gu.se/gerlof/OpenEDGeS/OpenEDGeS_v1.0.0.zip
! unzip OpenEDGeS_v1.0.0.zip -d data/ && rm OpenEDGeS_v1.0.0.zip

### Code for Processing the Entire Corpus
Uncomment and run this code block in order to process **THE ENTIRE CORPUS**. This is not necessary for our experiment / evaluation.

In [None]:
# UNCOMMENT block below to process all data
# path_to_corpus = "data/OpenEDGeS/"
# path_to_processed_data = "data/processed/"
# preprocessor = Preprocessor(path_to_corpus, path_to_processed_data)
# path_to_sentence_alignments = "data/OpenEDGeS/Alignments/"
# for root, dirs, files in os.walk(path_to_sentence_alignments):
#     files = [ fi for fi in files if fi.endswith(".tsv")]
#     for file in files:
#         preprocessor.process(os.path.join(root,file), verbose = False)

### Processing the three pairs of texts for which we have gold alignment labels
- de_1781_Rosalino-de_1871_Elberfelder
- de_1871_Elberfelder-en_1890_Darby
- en_1611_KJV-en_1890_Darby

In [None]:
# Sentence-alignment files of the three text pairs processed for evaluation.
testset = [
    'data/OpenEDGeS/Alignments/de-de/de_1781_Rosalino-de_1871_Elberfelder.tsv',
    'data/OpenEDGeS/Alignments/de-en/de_1871_Elberfelder-en_1890_Darby.tsv',
    'data/OpenEDGeS/Alignments/en-en/en_1611_KJV-en_1890_Darby.tsv',
]

# Input (corpus) and output (processed data) locations handed to the preprocessor.
path_to_corpus = "data/OpenEDGeS/"
path_to_processed_data = "data/eval/"
preprocessor = Preprocessor(path_to_corpus, path_to_processed_data)

# Process one alignment file at a time.
for alignment_file in testset:
    preprocessor.process(alignment_file)

## Baseline: Standard IBM Model 1

### DE-DE

In [2]:
# Baseline for DE-DE: build the standard Model 1 aligner on the processed text
# and run five EM iterations.
aligner = Model1(
    d='data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text',
)
aligner.em_training(num_iter=5)

f_vocab_size: 10578
e_vocab_size: 10324
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


In [3]:
os.makedirs("./output/", exist_ok=True)
aligner.align(out = "./output/de-de_ibm1.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0  < ./output/de-de_ibm1.a
# Output
! echo "\nStandard IBM Model 1, DE-DE" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0 < ./output/de-de_ibm1.a >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' < ./output/de-de_ibm1.a > ./output/de-de_ibm1.log

Aligned 15 sentences.
Output to: ./output/de-de_ibm1.a
Precision = 0.541176
Recall = 0.613333
AER = 0.425000


### EN-EN

In [4]:
# Baseline for EN-EN: build the standard Model 1 aligner on the processed text
# and run five EM iterations.
aligner = Model1(
    d='data/eval/en-en/en_1611_KJV-en_1890_Darby/text',
)
aligner.em_training(num_iter=5)

f_vocab_size: 15351
e_vocab_size: 14143
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


In [5]:
aligner.align(out = "./output/en-en_ibm1.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0  < ./output/en-en_ibm1.a
# Output
! echo "\nStandard IBM Model 1, EN-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0 < ./output/en-en_ibm1.a >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' < ./output/en-en_ibm1.a > ./output/en-en_ibm1.log

Aligned 15 sentences.
Output to: ./output/en-en_ibm1.a
Precision = 0.541254
Recall = 0.567474
AER = 0.445946


### DE-EN

In [6]:
# Baseline for DE-EN: build the standard Model 1 aligner on the processed text
# and run five EM iterations.
aligner = Model1(
    d='data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text',
)
aligner.em_training(num_iter=5)

f_vocab_size: 24103
e_vocab_size: 14102
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


In [7]:
aligner.align(out = "./output/de-en_ibm1.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0  < ./output/de-en_ibm1.a
# Output
! echo "\nStandard IBM Model 1, DE-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0 < ./output/de-en_ibm1.a >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' < ./output/de-en_ibm1.a > ./output/de-en_ibm1.log

Aligned 15 sentences.
Output to: ./output/de-en_ibm1.a
Precision = 0.596958
Recall = 0.610895
AER = 0.396154


## Method 1: Heuristic Initialization

### DE-DE

In [8]:
# Method 1 for DE-DE: the edit-distance model with ed_heuristic switched on
# (heuristic initialization), followed by five EM iterations.
aligner = ED_Model1(
    d='data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text',
    ed_heuristic=True,
)
aligner.em_training(num_iter=5)

f_vocab_size: 10578
e_vocab_size: 10324
Using edit distance for heuristic initialization: (one dot represents 1 percent)
.....................................................................................................
Heuristic initialization completed.
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


In [9]:
aligner.align(out = "./output/de-de_ed1.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0  < ./output/de-de_ed1.a
# Output
! echo "\nModel 1 with ED Heuristic, DE-DE" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0 < ./output/de-de_ed1.a >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' < ./output/de-de_ed1.a > ./output/de-de_ed1.log

Aligned 15 sentences.
Output to: ./output/de-de_ed1.a
Precision = 0.579412
Recall = 0.656667
AER = 0.384375


### EN-EN

In [10]:
# Method 1 for EN-EN: the edit-distance model with ed_heuristic switched on
# (heuristic initialization), followed by five EM iterations.
aligner = ED_Model1(
    d='data/eval/en-en/en_1611_KJV-en_1890_Darby/text',
    ed_heuristic=True,
)
aligner.em_training(num_iter=5)

f_vocab_size: 15351
e_vocab_size: 14143
Using edit distance for heuristic initialization: (one dot represents 1 percent)
.....................................................................................................
Heuristic initialization completed.
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


In [11]:
aligner.align(out = "./output/en-en_ed1.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0  < ./output/en-en_ed1.a
# Output
! echo "\nModel 1 with ED Heuristic, EN-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0 < ./output/en-en_ed1.a >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' < ./output/en-en_ed1.a > ./output/en-en_ed1.log

Aligned 15 sentences.
Output to: ./output/en-en_ed1.a
Precision = 0.590759
Recall = 0.619377
AER = 0.395270


### DE-EN

In [12]:
# Method 1 for DE-EN: the edit-distance model with ed_heuristic switched on
# (heuristic initialization), followed by five EM iterations.
aligner = ED_Model1(
    d='data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text',
    ed_heuristic=True,
)
aligner.em_training(num_iter=5)

f_vocab_size: 24103
e_vocab_size: 14102
Using edit distance for heuristic initialization: (one dot represents 1 percent)
.....................................................................................................
Heuristic initialization completed.
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


In [13]:
aligner.align(out = "./output/de-en_ed1.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0  < ./output/de-en_ed1.a
# Output
! echo "\nModel 1 with ED Heuristic, DE-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0 < ./output/de-en_ed1.a >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' < ./output/de-en_ed1.a > ./output/de-en_ed1.log

Aligned 15 sentences.
Output to: ./output/de-en_ed1.a
Precision = 0.657795
Recall = 0.673152
AER = 0.334615


## Method 2: Modified Decoding

### DE-DE

In [14]:
# Method 2 for DE-DE: the edit-distance model with ed_decode switched on
# (presumably affects only the later align step), followed by five EM iterations.
aligner = ED_Model1(
    d='data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text',
    ed_decode=True,
)
aligner.em_training(num_iter=5)

f_vocab_size: 10578
e_vocab_size: 10324
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


In [15]:
aligner.align(out = "./output/de-de_ed2.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0  < ./output/de-de_ed2.a
# Output
! echo "\nModel 1 with ED Decoding, DE-DE" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0 < ./output/de-de_ed2.a >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' < ./output/de-de_ed2.a > ./output/de-de_ed2.log

Aligned 15 sentences.
Output to: ./output/de-de_ed2.a
Precision = 0.550000
Recall = 0.623333
AER = 0.415625


### EN-EN

In [16]:
# Method 2 for EN-EN: the edit-distance model with ed_decode switched on
# (presumably affects only the later align step), followed by five EM iterations.
aligner = ED_Model1(
    d='data/eval/en-en/en_1611_KJV-en_1890_Darby/text',
    ed_decode=True,
)
aligner.em_training(num_iter=5)

f_vocab_size: 15351
e_vocab_size: 14143
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


In [17]:
aligner.align(out = "./output/en-en_ed2.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0  < ./output/en-en_ed2.a
# Output
! echo "\nModel 1 with ED Decoding, EN-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0 < ./output/en-en_ed2.a >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' < ./output/en-en_ed2.a > ./output/en-en_ed2.log

Aligned 15 sentences.
Output to: ./output/en-en_ed2.a
Precision = 0.607261
Recall = 0.636678
AER = 0.378378


### DE-EN

In [18]:
# Method 2 for DE-EN: the edit-distance model with ed_decode switched on
# (presumably affects only the later align step), followed by five EM iterations.
aligner = ED_Model1(
    d='data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text',
    ed_decode=True,
)
aligner.em_training(num_iter=5)

f_vocab_size: 24103
e_vocab_size: 14102
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


In [19]:
aligner.align(out = "./output/de-en_ed2.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0  < ./output/de-en_ed2.a
# Output
! echo "\nModel 1 with ED Decoding, DE-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0 < ./output/de-en_ed2.a >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' < ./output/de-en_ed2.a > ./output/de-en_ed2.log

Aligned 15 sentences.
Output to: ./output/de-en_ed2.a
Precision = 0.646388
Recall = 0.661479
AER = 0.346154
