## Modeling

### Setup

In [1]:
# Load the autoreload extension and use mode 2 so edited modules are re-imported automatically
%load_ext autoreload
%autoreload 2
# NOTE(review): two matplotlib backends are requested back to back; the later
# `notebook` call presumably replaces `inline` — confirm which one is intended.
%matplotlib inline
%matplotlib notebook

In [1]:
import ast
import json
import os
from itertools import islice
from pathlib import Path

import dill as pickle
import pandas as pd
from nltk.translate import AlignedSent
from nltk.translate.ibm1 import IBMModel1
from nltk.translate.ibm2 import IBMModel2
from nltk.translate.ibm3 import IBMModel3
from nltk.translate.ibm4 import IBMModel4
from nltk.translate.ibm5 import IBMModel5

from src.config import MODELS_DIR, PROCESSED_DIR

[CONFIG] Directories ensured and random seed set.


In [2]:
# Switch to root as working directory
if Path.cwd().name == "notebooks":
    os.chdir("..")

%pwd

'c:\\Users\\qu1r0ra\\Documents\\GitHub\\philippine-machine-translation'

In [3]:
# Read the preprocessed corpus CSV from the processed-data directory
preprocessed_csv = PROCESSED_DIR / "preprocessed.csv"
df = pd.read_csv(preprocessed_csv)

# Preview the first rows, then list every column name
print(df.head())
print(f"\nColumns: {df.columns.tolist()}")

                                           language1  \
0                          Si Adan, si Set, si Enos,   
1                  si Kenan, si Mahalalel, si Jared,   
2                    si Enoc, si Metusela, si Lamec,   
3                si Noe, si Sem, si Ham ug si Jafet.   
4  Ang mga anak nga lalaki ni Jafet: si Gomer, si...   

                                           language2  \
0                                   Adán, Set, Enós,   
1                         Cainán, Mahalaleel, Jared,   
2                            Enoc, Matusalén, Lamec,   
3                             Noé, Sem, Cam y Jafet.   
4  Los hijos de Jafet: Gomer, Magog, Madai, Javán...   

                                          src_tokens  \
0          ['si', 'adan', 'si', 'set', 'si', 'enos']   
1  ['si', 'kenan', 'si', 'mahalalel', 'si', 'jared']   
2    ['si', 'enoc', 'si', 'metusela', 'si', 'lamec']   
3  ['si', 'noe', 'si', 'sem', 'si', 'ham', 'ug', ...   
4  ['ang', 'mga', 'anak', 'nga', 'lalaki', 'ni

In [4]:
word2class_path = PROCESSED_DIR / "word_classes.json"

# Load the word -> class-id mapping. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's locale default (e.g. cp1252 on
# Windows), which could mis-decode any non-ASCII words stored in the file.
with open(word2class_path, encoding="utf-8") as f:
    word2class = json.load(f)

print(dict(islice(word2class.items(), 5)))  # print 5 key-value pairs

{'sa': 69, 'ug': 69, 'ang': 69, 'de': 36, 'nga': 69}


Next, we build the aligned sentence pairs that the IBM models are trained on, using NLTK's `AlignedSent` class. Note that `AlignedSent` takes the target-language tokens first and the source-language tokens second.

In [5]:
def _parse_tokens(cell):
    """Return the list of tokens stored in one ``*_tokens`` CSV cell.

    After the round trip through CSV, a token column holds the *string*
    representation of a Python list (e.g. ``"['si', 'adan']"``). Calling
    ``str.split()`` on that text yields tokens polluted with brackets, quotes
    and commas (``"['si',"``), which also never match the keys of
    ``word2class``. The list literal is therefore parsed with
    ``ast.literal_eval``.

    Args:
        cell: Either the string form of a list of tokens, a plain
            whitespace-separated string, or an already materialised sequence.

    Returns:
        A list of token strings.

    Raises:
        ValueError / SyntaxError: If a cell starting with ``[`` is not a valid
            Python literal.
        TypeError: If the cell is neither a string nor iterable (e.g. NaN).
    """
    if isinstance(cell, str):
        text = cell.strip()
        if text.startswith("["):
            return list(ast.literal_eval(text))
        # Fallback for columns stored as plain space-separated tokens.
        return text.split()
    return list(cell)


# AlignedSent takes (target words, source words) in that order.
aligned_sents = [
    AlignedSent(_parse_tokens(tgt_tokens), _parse_tokens(src_tokens))
    for src_tokens, tgt_tokens in zip(df["src_tokens"], df["tgt_tokens"], strict=True)
]

print(f"Created {len(aligned_sents):,} aligned sentence pairs.")

Created 31,105 aligned sentence pairs.


### Training

#### IBM Model 1

In [6]:
%%time

ibm1 = IBMModel1(aligned_sents, iterations=5)
print("\n[IBM1] Training complete.")

ibm1_path = MODELS_DIR / "ibm1_model.pkl"
with open(ibm1_path, "wb") as f:
    pickle.dump(ibm1, f)

print(f"\nIBM Model 1 saved to {ibm1_path}")


[IBM1] Training complete.

IBM Model 1 saved to C:\Users\qu1r0ra\Documents\GitHub\philippine-machine-translation\data\models\ibm1_model.pkl
CPU times: total: 2min 8s
Wall time: 2min 9s


#### IBM Model 2

In [7]:
%%time

ibm2 = IBMModel2(aligned_sents, iterations=5)
print("\n[IBM2] Training complete.")

ibm2_path = MODELS_DIR / "ibm2_model.pkl"
with open(ibm2_path, "wb") as f:
    pickle.dump(ibm2, f)

print(f"\nIBM Model 2 saved to {ibm2_path}")


[IBM2] Training complete.

IBM Model 2 saved to C:\Users\qu1r0ra\Documents\GitHub\philippine-machine-translation\data\models\ibm2_model.pkl
CPU times: total: 8min 51s
Wall time: 8min 54s


#### IBM Model 3

In [None]:
%%time

ibm3 = IBMModel3(aligned_sents, iterations=1)
print("\n[IBM3] Training complete.")

ibm3_path = MODELS_DIR / "ibm3_model.pkl"
with open(ibm3_path, "wb") as f:
    pickle.dump(ibm3, f)

print(f"\nIBM Model 3 saved to {ibm3_path}")

#### IBM Model 4

In [None]:
%%time

ibm4 = IBMModel4(
    aligned_sents,
    iterations=3,
    source_word_classes=word2class,
    target_word_classes=word2class,
)
print("\n[IBM4] Training complete.")

ibm4_path = MODELS_DIR / "ibm4_model.pkl"
with open(ibm4_path, "wb") as f:
    pickle.dump(ibm4, f)

print(f"\nIBM Model 4 saved to {ibm4_path}")

#### IBM Model 5

In [None]:
%%time

ibm5 = IBMModel5(
    aligned_sents,
    iterations=3,
    source_word_classes=word2class,
    target_word_classes=word2class,
)
print("\n[IBM5] Training complete.")

ibm5_path = MODELS_DIR / "ibm4_model.pkl"
with open(ibm5_path, "wb") as f:
    pickle.dump(ibm5, f)

print(f"\nIBM Model 5 saved to {ibm5_path}")