# Final Project
## Computational Linguistics (Winter semester 2020/21, Prof. Dr. Alexander Koller)
## Siyu Tao

### Environment Setup

In [None]:
# Put ./bin (resolved against the current working directory) at the front of
# the module search path so the project modules imported below can be found
# (presumably they live in ./bin -- run the notebook from the project root).
import sys
import os
sys.path.insert(0, os.path.abspath('./bin'))
from preprocessing import Preprocessor
# standard IBM model 1
from model1 import Model1
# models extended with edit-distance
from ed_models import ED_Model1

## Preprocessing

### Download and Prepare Corpus

In [None]:
# Download the OpenEDGeS v1.0.0 archive into the current working directory
! wget http://demo.spraakdata.gu.se/gerlof/OpenEDGeS/OpenEDGeS_v1.0.0.zip
# Extract it into data/; the archive is removed only if unzip succeeds (&&)
! unzip OpenEDGeS_v1.0.0.zip -d data/ && rm OpenEDGeS_v1.0.0.zip

### Code for Processing the Entire Corpus
Uncomment and run this code block to process **THE ENTIRE CORPUS**. This is not necessary for our experiment / evaluation.

In [None]:
# UNCOMMENT the block below to process all data: it walks
# data/OpenEDGeS/Alignments/ and runs the preprocessor on every .tsv file found,
# with data/processed/ passed as the output path.
# path_to_corpus = "data/OpenEDGeS/"
# path_to_processed_data = "data/processed/"
# preprocessor = Preprocessor(path_to_corpus, path_to_processed_data)
# path_to_sentence_alignments = "data/OpenEDGeS/Alignments/"
# for root, dirs, files in os.walk(path_to_sentence_alignments):
#     files = [ fi for fi in files if fi.endswith(".tsv")]
#     for file in files:
#         preprocessor.process(os.path.join(root,file), verbose = False)

### Processing the three text pairs for which we have gold alignment labels
- de_1781_Rosalino-de_1871_Elberfelder
- de_1871_Elberfelder-en_1890_Darby
- en_1611_KJV-en_1890_Darby

In [None]:
# Input (corpus) and output (processed data) locations for the preprocessor
path_to_corpus = "data/OpenEDGeS/"
path_to_processed_data = "data/eval/"
preprocessor = Preprocessor(path_to_corpus, path_to_processed_data)

# Sentence-alignment files of the three text pairs used for evaluation
testset = [
    'data/OpenEDGeS/Alignments/de-de/de_1781_Rosalino-de_1871_Elberfelder.tsv',
    'data/OpenEDGeS/Alignments/de-en/de_1871_Elberfelder-en_1890_Darby.tsv',
    'data/OpenEDGeS/Alignments/en-en/en_1611_KJV-en_1890_Darby.tsv',
]

# Run the preprocessor on each alignment file in turn
for alignment_file in testset:
    preprocessor.process(alignment_file)

## Baseline: Standard IBM Model 1

### DE-DE

In [None]:
# Baseline (standard IBM Model 1) on the DE-DE pair, trained with num_iter = 5
aligner = Model1(
    d='data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text',
)
aligner.em_training(num_iter=5)

In [None]:
# Write the trained model's alignments to ./output/de-de_ibm1.a
aligner.align(out = "./output/de-de_ibm1.a", n = 15)
# Score the alignment file (fed on stdin); the result is shown in the notebook
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0  < ./output/de-de_ibm1.a
# Append a header line followed by the same scoring output to ./output/summary.info
! echo "\nStandard IBM Model 1, DE-DE" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0 < ./output/de-de_ibm1.a >> ./output/summary.info
# Run the scorer once more without `-n 0`; `>` overwrites ./output/de-de_ibm1.log
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' < ./output/de-de_ibm1.a > ./output/de-de_ibm1.log

### EN-EN

In [None]:
# Baseline (standard IBM Model 1) on the EN-EN pair, trained with num_iter = 5
aligner = Model1(
    d='data/eval/en-en/en_1611_KJV-en_1890_Darby/text',
)
aligner.em_training(num_iter=5)

In [None]:
aligner.align(out = "./output/en-en_ibm1.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0  < ./output/en-en_ibm1.a
# Output
! echo "\nStandard IBM Model 1, EN-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0 < ./output/en-en_ibm1.a >> ./output/summary.info
! python./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' < ./output/en-en_ibm1.a > ./output/en-en_ibm1.log

### DE-EN

In [None]:
# Baseline (standard IBM Model 1) on the DE-EN pair, trained with num_iter = 5
aligner = Model1(
    d='data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text',
)
aligner.em_training(num_iter=5)

In [None]:
# Write the trained model's alignments to ./output/de-en_ibm1.a
aligner.align(out = "./output/de-en_ibm1.a", n = 15)
# Score the alignment file (fed on stdin); the result is shown in the notebook
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0  < ./output/de-en_ibm1.a
# Append a header line followed by the same scoring output to ./output/summary.info
! echo "\nStandard IBM Model 1, DE-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0 < ./output/de-en_ibm1.a >> ./output/summary.info
# Run the scorer once more without `-n 0`; `>` overwrites ./output/de-en_ibm1.log
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' < ./output/de-en_ibm1.a > ./output/de-en_ibm1.log

## Method 1: Heuristic Initialization

### Install `editdistance` package for a more efficient implementation of edit distance

In [None]:
! pip install editdistance

### DE-DE

In [None]:
# Method 1 (ed_heuristic enabled) on the DE-DE pair, trained with num_iter = 5
aligner = ED_Model1(
    d="data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text",
    ed_heuristic=True,
)
aligner.em_training(num_iter=5)

In [None]:
# Write the trained model's alignments to ./output/de-de_ed1.a
aligner.align(out = "./output/de-de_ed1.a", n = 15)
# Score the alignment file (fed on stdin); the result is shown in the notebook
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0  < ./output/de-de_ed1.a
# Append a header line followed by the same scoring output to ./output/summary.info
! echo "\nModel 1 with ED Heuristic, DE-DE" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0 < ./output/de-de_ed1.a >> ./output/summary.info
# Run the scorer once more without `-n 0`; `>` overwrites ./output/de-de_ed1.log
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' < ./output/de-de_ed1.a > ./output/de-de_ed1.log

### EN-EN

In [None]:
# Method 1 (ed_heuristic enabled) on the EN-EN pair, trained with num_iter = 5
aligner = ED_Model1(
    d='data/eval/en-en/en_1611_KJV-en_1890_Darby/text',
    ed_heuristic=True,
)
aligner.em_training(num_iter=5)

In [None]:
aligner.align(out = "./output/en-en_ed1.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0  < ./output/en-en_ed1.a
# Output
! echo "\nModel 1 with ED Heuristic, EN-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0 < ./output/en-en_ed1.a >> ./output/summary.info
! python./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' < ./output/en-en_ed1.a > ./output/en-en_ed1.log

### DE-EN

In [None]:
# Method 1 (ed_heuristic enabled) on the DE-EN pair, trained with num_iter = 5
aligner = ED_Model1(
    d='data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text',
    ed_heuristic=True,
)
aligner.em_training(num_iter=5)

In [None]:
# Write the trained model's alignments to ./output/de-en_ed1.a
aligner.align(out = "./output/de-en_ed1.a", n = 15)
# Score the alignment file (fed on stdin); the result is shown in the notebook
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0  < ./output/de-en_ed1.a
# Append a header line followed by the same scoring output to ./output/summary.info
! echo "\nModel 1 with ED Heuristic, DE-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0 < ./output/de-en_ed1.a >> ./output/summary.info
# Run the scorer once more without `-n 0`; `>` overwrites ./output/de-en_ed1.log
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' < ./output/de-en_ed1.a > ./output/de-en_ed1.log

## Method 2: Modified Decoding

### DE-DE

In [None]:
# Method 2 (ed_decode enabled) on the DE-DE pair, trained with num_iter = 5
aligner = ED_Model1(
    d="data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text",
    ed_decode=True,
)
aligner.em_training(num_iter=5)

In [None]:
# Write the trained model's alignments to ./output/de-de_ed2.a
aligner.align(out = "./output/de-de_ed2.a", n = 15)
# Score the alignment file (fed on stdin); the result is shown in the notebook
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0  < ./output/de-de_ed2.a
# Append a header line followed by the same scoring output to ./output/summary.info
! echo "\nModel 1 with ED Decoding, DE-DE" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' -n 0 < ./output/de-de_ed2.a >> ./output/summary.info
# Run the scorer once more without `-n 0`; `>` overwrites ./output/de-de_ed2.log
! python ./bin/score-alignments -d 'data/eval/de-de/de_1781_Rosalino-de_1871_Elberfelder/text' < ./output/de-de_ed2.a > ./output/de-de_ed2.log

### EN-EN

In [None]:
# Method 2 (ed_decode enabled) on the EN-EN pair, trained with num_iter = 5
aligner = ED_Model1(
    d='data/eval/en-en/en_1611_KJV-en_1890_Darby/text',
    ed_decode=True,
)
aligner.em_training(num_iter=5)

In [None]:
aligner.align(out = "./output/en-en_ed2.a", n = 15)
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0  < ./output/en-en_ed2.a
# Output
! echo "\nModel 1 with ED Decoding, EN-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' -n 0 < ./output/en-en_ed2.a >> ./output/summary.info
! python./bin/score-alignments -d 'data/eval/en-en/en_1611_KJV-en_1890_Darby/text' < ./output/en-en_ed2.a > ./output/en-en_ed2.log

### DE-EN

In [None]:
# Method 2 (ed_decode enabled) on the DE-EN pair, trained with num_iter = 5
aligner = ED_Model1(
    d='data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text',
    ed_decode=True,
)
aligner.em_training(num_iter=5)

In [None]:
# Write the trained model's alignments to ./output/de-en_ed2.a
aligner.align(out = "./output/de-en_ed2.a", n = 15)
# Score the alignment file (fed on stdin); the result is shown in the notebook
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0  < ./output/de-en_ed2.a
# Append a header line followed by the same scoring output to ./output/summary.info
! echo "\nModel 1 with ED Decoding, DE-EN" >> ./output/summary.info
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' -n 0 < ./output/de-en_ed2.a >> ./output/summary.info
# Run the scorer once more without `-n 0`; `>` overwrites ./output/de-en_ed2.log
! python ./bin/score-alignments -d 'data/eval/de-en/de_1871_Elberfelder-en_1890_Darby/text' < ./output/de-en_ed2.a > ./output/de-en_ed2.log