# Final Project
## Computational Linguistics (Winter semester 2020/21, Prof. Dr. Alexander Koller)
## Siyu Tao

### Environment Setup

In [1]:
# Make the project's helper modules in ./bin importable.
# `os` has to be imported before it is used to build the absolute path;
# with the import placed after the call, a fresh kernel raises NameError.
import os
import sys

sys.path.insert(0, os.path.abspath('./bin'))
from preprocessing import Preprocessor
# standard IBM model 1
from model1 import Model1
# models extended with edit-distance
from ed_models import ED_Model1

## Preprocessing

### Download and Prepare Corpus

In [1]:
# Fetch the OpenEDGeS v1.0.0 archive into the current working directory.
! wget http://demo.spraakdata.gu.se/gerlof/OpenEDGeS/OpenEDGeS_v1.0.0.zip
# Extract the archive into data/; the zip is removed only if unzip succeeds (`&&`).
! unzip OpenEDGeS_v1.0.0.zip -d data/ && rm OpenEDGeS_v1.0.0.zip

--2021-03-13 21:45:24--  http://demo.spraakdata.gu.se/gerlof/OpenEDGeS/OpenEDGeS_v1.0.0.zip
Resolving demo.spraakdata.gu.se (demo.spraakdata.gu.se)... 130.241.135.164
Connecting to demo.spraakdata.gu.se (demo.spraakdata.gu.se)|130.241.135.164|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 76434359 (73M) [application/zip]
Saving to: ‘OpenEDGeS_v1.0.0.zip’


2021-03-13 21:58:55 (92.1 KB/s) - ‘OpenEDGeS_v1.0.0.zip’ saved [76434359/76434359]



### Code for Processing the Entire Corpus
Uncomment and run this code block to process **THE ENTIRE CORPUS**. This is not necessary for our experiment or evaluation.

In [4]:
# UNCOMMENT block below to process all data
# path_to_corpus = "data/OpenEDGeS/"
# path_to_processed_data = "data/processed/"
# preprocessor = Preprocessor(path_to_corpus, path_to_processed_data)
# path_to_sentence_alignments = "data/OpenEDGeS/Alignments/"
# for root, dirs, files in os.walk(path_to_sentence_alignments):
#     files = [ fi for fi in files if fi.endswith(".tsv")]
#     for file in files:
#         preprocessor.process(os.path.join(root,file), verbose = False)

### Processing the three pairs of texts for which we have gold alignment labels
- de_1781_Rosalino-de_1871_Elberfelder
- de_1871_Elberfelder-en_1890_Darby
- en_1611_KJV-en_1890_Darby

In [4]:
# Corpus root (input) and the directory that receives the evaluation subset (output).
path_to_corpus = "data/OpenEDGeS/"
path_to_processed_data = "data/eval/"

# Sentence-alignment files of the three text pairs that have gold alignment labels.
testset = [
    'data/OpenEDGeS/Alignments/de-de/de_1781_Rosalino-de_1871_Elberfelder.tsv',
    'data/OpenEDGeS/Alignments/de-en/de_1871_Elberfelder-en_1890_Darby.tsv',
    'data/OpenEDGeS/Alignments/en-en/en_1611_KJV-en_1890_Darby.tsv',
]

# Create the preprocessor with its in- and output paths, then process each file in turn.
preprocessor = Preprocessor(path_to_corpus, path_to_processed_data)
for alignment_file in testset:
    preprocessor.process(alignment_file)

Initializing preprocessor...
Corpus directory: data/OpenEDGeS/
Output directory: data/eval/
Processing alignment file: data/OpenEDGeS/Alignments/de-de/de_1781_Rosalino-de_1871_Elberfelder.tsv
Directory to f text: data/OpenEDGeS/Texts/de/de_1781_Rosalino
Directory to e text: data/OpenEDGeS/Texts/de/de_1871_Elberfelder
Outputing .f file: data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text.f
Outputing .e file: data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text.e
Processing alignment file: data/OpenEDGeS/Alignments/de-en/de_1871_Elberfelder-en_1890_Darby.tsv
Directory to f text: data/OpenEDGeS/Texts/de/de_1871_Elberfelder
Directory to e text: data/OpenEDGeS/Texts/en/en_1890_Darby
Outputing .f file: data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text.f
Outputing .e file: data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text.e
Processing alignment file: data/OpenEDGeS/Alignments/en-en/en_1611_KJV-en_1890_Darby.tsv
Directory to f text: data/OpenEDGeS/Texts/en/en_1611_KJV
Dire

## Baseline: Standard IBM Model 1

### Training

In [4]:
# Baseline: standard IBM Model 1 on the German-German pair, trained with five EM iterations.
aligner = Model1(
    d="data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text",
)
aligner.em_training(num_iter=5)

f_vocab_size: 10578
e_vocab_size: 10324
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


### Evaluation

In [20]:
aligner.align(out = "./output/de-de_ibm1.a", n = 10)
! ./bin/score-alignments -d "data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text"  < ./output/de-de_ibm1.a

            *                                                                          | ,
 |                                                                                                            ( )       | und
 |                                                         ( )                                                          | denen
 |         ( )                                                                                                          | ,
 |            ( )                               *                                                                       | die
 |                              ( )                *                                                                    | berufen
 |                                                ( )                                                                   | sind
 |         ( )                                           *                                                              | ,
 |                      

In [46]:
# Append a heading for the baseline run to ./output/results.info (`>>` appends,
# so re-running this cell adds the heading and scores a second time).
! echo "Standard IBM Model 1, DE-DE" >> ./output/results.info
# Append the scorer output for the baseline alignments to the same file.
# NOTE(review): `-n 0` presumably suppresses the per-sentence alignment grids — confirm in score-alignments.
! ./bin/score-alignments -d "data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text" -n 0 < ./output/de-de_ibm1.a >> ./output/results.info

## Method 1: Heuristic Initialization

### Install `editdistance` package for a more efficient implementation of edit distance

In [8]:
! pip install editdistance



### Training

In [12]:
# Method 1: Model 1 with edit-distance heuristic initialization enabled,
# trained on the same German-German pair with five EM iterations.
aligner1 = ED_Model1(
    d="data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text",
    ed_heuristic=True,
)
aligner1.em_training(num_iter=5)

f_vocab_size: 10578
e_vocab_size: 10324
Using edit distance for heuristic initialization: (one dot represents 1 percent)
.....................................................................................................
Heuristic initialization completed.
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


### Evaluation

In [18]:
aligner1.align(out = "./output/de-de_ed1.a", n = 10)
! ./bin/score-alignments -d "data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text"  < ./output/de-de_ed1.a

            *                                                                          | ,
 |                                                                                                            ( )       | und
 |                                                ( )                                                                   | denen
 |         ( )                                                                                                          | ,
 |            ( )                               *                                                                       | die
 |                                                (*)                                                                   | berufen
 |                                                         ( )                                                          | sind
 |         ( )                                           *                                                              | ,
 |                      

In [47]:
! echo "\nModel 1 with ED Heuristic, DE-DE" >> ./output/results.info
! ./bin/score-alignments -d "data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text" -n 0 < ./output/de-de_ed1.a >> ./output/results.info

## Method 2: Modified Decoding Algorithm

### Training

In [10]:
# Method 2: Model 1 with the edit-distance decoding option enabled,
# trained on the same German-German pair with five EM iterations.
aligner2 = ED_Model1(
    d="data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text",
    ed_decode=True,
)
aligner2.em_training(num_iter=5)

f_vocab_size: 10578
e_vocab_size: 10324
EM Training Starting:
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...


### Evaluation

In [19]:
aligner2.align(out = "./output/de-de_ed2.a", n = 10)
! ./bin/score-alignments -d "data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text"  < ./output/de-de_ed2.a

            *                                                                          | ,
 |                                                                                                            ( )       | und
 |                                                         ( )                                                          | denen
 |         ( )                                                                                                          | ,
 |            ( )                               *                                                                       | die
 |                                                (*)                                                                   | berufen
 |                                                ( )                                                                   | sind
 |         ( )                                           *                                                              | ,
 |                      

In [48]:
! echo "\nModel 1 with ED Decoding, DE-DE" >> ./output/results.info
! ./bin/score-alignments -d "data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text" -n 0 < ./output/de-de_ed2.a >> ./output/results.info