# Preprocessing

## Setup

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%matplotlib notebook

In [2]:
import os
from pathlib import Path

import pandas as pd

from src.augmentation import augment_dataset, mix_datasets
from src.config import RAW_DIR
from src.preprocessing_nmt import preprocess_corpus, split_and_export

[CONFIG] Directories ensured and random seed set.


In [3]:
if Path.cwd().name == "notebooks":
    os.chdir("..")

%pwd

'c:\\Users\\qu1r0ra\\Documents\\GitHub\\philippine-machine-translation'

## Loading Data

In [4]:
# Recursively gather every CSV below RAW_DIR and fail fast when there is none.
# NOTE: the order of the resulting list is whatever the filesystem yields.
csv_files = [csv_path for csv_path in RAW_DIR.glob("**/*.csv")]
if len(csv_files) == 0:
    raise FileNotFoundError("No CSV files found in RAW_DIR.")

print(f"Found {len(csv_files)} CSV file(s).")

Found 8 CSV file(s).


In [5]:
# Pick the Cebuano-Spanish corpus by file name instead of by list position:
# glob order is filesystem-dependent, so `csv_files[0]` can silently load a
# different language pair on another machine.
ceb_spa_path = next(
    (path for path in csv_files if path.name == "cebuano_spanish.csv"), None
)
if ceb_spa_path is None:
    raise FileNotFoundError("cebuano_spanish.csv not found in RAW_DIR.")

ceb_spa_df = pd.read_csv(ceb_spa_path)
print(f"[INFO] Loaded: {ceb_spa_path.name} ({len(ceb_spa_df):,} rows)")
ceb_spa_df.head()

[INFO] Loaded: cebuano_spanish.csv (31,105 rows)


Unnamed: 0,usfm,book,verse,chapter,language1,language2
0,1CH.1.1,1CH,1,1,"Si Adan, si Set, si Enos,","Adán, Set, Enós,"
1,1CH.1.2,1CH,2,1,"si Kenan, si Mahalalel, si Jared,","Cainán, Mahalaleel, Jared,"
2,1CH.1.3,1CH,3,1,"si Enoc, si Metusela, si Lamec,","Enoc, Matusalén, Lamec,"
3,1CH.1.4,1CH,4,1,"si Noe, si Sem, si Ham ug si Jafet.","Noé, Sem, Cam y Jafet."
4,1CH.1.5,1CH,5,1,"Ang mga anak nga lalaki ni Jafet: si Gomer, si...","Los hijos de Jafet: Gomer, Magog, Madai, Javán..."


In [6]:
# Pick the Chavacano-Spanish corpus by file name instead of by list position:
# glob order is filesystem-dependent, so `csv_files[2]` can silently load a
# different language pair on another machine.
cbk_spa_path = next(
    (path for path in csv_files if path.name == "chavacano_spanish.csv"), None
)
if cbk_spa_path is None:
    raise FileNotFoundError("chavacano_spanish.csv not found in RAW_DIR.")

cbk_spa_df = pd.read_csv(cbk_spa_path)
print(f"[INFO] Loaded: {cbk_spa_path.name} ({len(cbk_spa_df):,} rows)")
cbk_spa_df.head()

[INFO] Loaded: chavacano_spanish.csv (31,090 rows)


Unnamed: 0,usfm,book,verse,chapter,language1,language2
0,1CH.1.1,1CH,1,1,"""N/A""","Adán, Set, Enós,"
1,1CH.1.2,1CH,2,1,"""N/A""","Cainán, Mahalaleel, Jared,"
2,1CH.1.3,1CH,3,1,"""N/A""","Enoc, Matusalén, Lamec,"
3,1CH.1.4,1CH,4,1,"""N/A""","Noé, Sem, Cam y Jafet."
4,1CH.1.5,1CH,5,1,"""N/A""","Los hijos de Jafet: Gomer, Magog, Madai, Javán..."


## Preprocessing Data

In [7]:
# Clean and filter the raw Cebuano-Spanish sentence pairs. In the recorded run
# the result keeps `language1`/`language2` and gains `src_tokens`/`tgt_tokens`
# columns (which appear lower-cased with punctuation removed in the preview).
clean_ceb_spa_df = preprocess_corpus(ceb_spa_df)
clean_ceb_spa_df.head()


[Preprocessing] Cleaning and filtering 31,105 sentence pairs...
[Preprocessing] 30,904 valid sentence pairs remain after cleaning.


Unnamed: 0,language1,language2,src_tokens,tgt_tokens
0,"Si Adan, si Set, si Enos,","Adán, Set, Enós,",si adan si set si enos,adán set enós
1,"si Kenan, si Mahalalel, si Jared,","Cainán, Mahalaleel, Jared,",si kenan si mahalalel si jared,cainán mahalaleel jared
2,"si Enoc, si Metusela, si Lamec,","Enoc, Matusalén, Lamec,",si enoc si metusela si lamec,enoc matusalén lamec
3,"si Noe, si Sem, si Ham ug si Jafet.","Noé, Sem, Cam y Jafet.",si noe si sem si ham ug si jafet,noé sem cam y jafet
4,"Ang mga anak nga lalaki ni Jafet: si Gomer, si...","Los hijos de Jafet: Gomer, Magog, Madai, Javán...",ang mga anak nga lalaki ni jafet si gomer si m...,los hijos de jafet gomer magog madai javán tub...


In [8]:
# Augment the cleaned Cebuano-Spanish pairs. With the default settings the
# recorded run adds one extra copy per row (30,904 -> 61,808 rows) and the
# result only carries `src_tokens`/`tgt_tokens`.
aug_ceb_spa_df = augment_dataset(clean_ceb_spa_df)
aug_ceb_spa_df.head()

[INFO] Augmenting dataset: 30,904 rows with 1 copies each...
[INFO] Augmentation complete — total rows: 61,808 (2.0x increase)


Unnamed: 0,src_tokens,tgt_tokens
0,si adan si set si enos,adán set enós
1,si adan si set si enos,adán set enós
2,si kenan si mahalalel si jared,cainán mahalaleel jared
3,kenan si si mahalalel si si jared,cainán mahalaleel jared
4,si enoc si metusela si lamec,enoc matusalén lamec


In [9]:
# Clean the Chavacano-Spanish corpus, then mix part of it into the cleaned
# Cebuano-Spanish base set. In the recorded run only 7,924 of 31,090 Chavacano
# rows survive cleaning (the raw preview shows "N/A" placeholders in
# `language1`), and the mix adds 3,090 rows (10% mix ratio) to the 30,904 base.
# NOTE(review): despite the `aug_` prefix, this frame is built from the two
# *cleaned* frames via mix_datasets, not from `aug_ceb_spa_df` - confirm that
# this is intended before renaming.
clean_cbk_spa_df = preprocess_corpus(cbk_spa_df)
aug_ceb_cbk_spa_df = mix_datasets(clean_ceb_spa_df, clean_cbk_spa_df)
aug_ceb_cbk_spa_df.head()


[Preprocessing] Cleaning and filtering 31,090 sentence pairs...
[Preprocessing] 7,924 valid sentence pairs remain after cleaning.
[INFO] Mixed datasets: base=30,904, mix added=3,090, total=33,994 (10% mix ratio)


Unnamed: 0,src_tokens,tgt_tokens
0,sa dihang miabot siya ug nakita ang grasya sa ...,este cuando llegó y vio la gracia de dios se r...
1,nga gipanalanginan sa ginoo sa mga panon nga n...,porque jehová de los ejércitos los bendecirá d...
2,kay ang tawong patay na gipahigawas na gikan s...,porque el que ha muerto ha sido justificado de...
3,sila mahimong mga alagad sa akong balaang dapi...,servirán en mi santuario como porteros a las p...
4,human niana misulod ang mga levihanon aron sa ...,después de esto los levitas fueron para ejerce...


## Exporting Data

Each cell below exports one dataset. `split_and_export` always writes its output files (`train.src`, `train.tgt`, `valid.src`, `valid.tgt`) to the same path, so every export must be moved into its own folder within that directory before the next export runs; otherwise it is overwritten.

In [10]:
split_and_export(clean_ceb_spa_df)

# Every export writes the same four file names into the same directory (see the
# logged export path), so a later export would overwrite this one. Move the
# files into a dataset-specific subfolder right away so that Restart & Run All
# keeps all datasets.
# NOTE(review): "data/processed" mirrors the logged export path relative to the
# project root (the working directory set earlier); replace it with the config
# constant if src.config exposes one.
export_dir = Path("data") / "processed"
dataset_dir = export_dir / "clean_ceb_spa"
dataset_dir.mkdir(parents=True, exist_ok=True)
for split_file in ("train.src", "train.tgt", "valid.src", "valid.tgt"):
    # Path.replace overwrites an existing target, so re-running the cell is safe.
    (export_dir / split_file).replace(dataset_dir / split_file)

[Splitting] Train ratio = 0.9
[Exporting] Writing train split to C:\Users\qu1r0ra\Documents\GitHub\philippine-machine-translation\data\processed
[Export] train.src and train.tgt written.
[Exporting] Writing valid split to C:\Users\qu1r0ra\Documents\GitHub\philippine-machine-translation\data\processed
[Export] valid.src and valid.tgt written.
[Done] Train/valid splits exported successfully.


In [11]:
split_and_export(aug_ceb_spa_df)

# Every export writes the same four file names into the same directory (see the
# logged export path), so a later export would overwrite this one. Move the
# files into a dataset-specific subfolder right away so that Restart & Run All
# keeps all datasets.
# NOTE(review): "data/processed" mirrors the logged export path relative to the
# project root (the working directory set earlier); replace it with the config
# constant if src.config exposes one.
export_dir = Path("data") / "processed"
dataset_dir = export_dir / "aug_ceb_spa"
dataset_dir.mkdir(parents=True, exist_ok=True)
for split_file in ("train.src", "train.tgt", "valid.src", "valid.tgt"):
    # Path.replace overwrites an existing target, so re-running the cell is safe.
    (export_dir / split_file).replace(dataset_dir / split_file)

[Splitting] Train ratio = 0.9
[Exporting] Writing train split to C:\Users\qu1r0ra\Documents\GitHub\philippine-machine-translation\data\processed
[Export] train.src and train.tgt written.
[Exporting] Writing valid split to C:\Users\qu1r0ra\Documents\GitHub\philippine-machine-translation\data\processed
[Export] valid.src and valid.tgt written.
[Done] Train/valid splits exported successfully.


In [12]:
split_and_export(aug_ceb_cbk_spa_df)

# Every export writes the same four file names into the same directory (see the
# logged export path), so a later export or a re-run of another export cell
# would overwrite this one. Move the files into a dataset-specific subfolder
# right away so that Restart & Run All keeps all datasets.
# NOTE(review): "data/processed" mirrors the logged export path relative to the
# project root (the working directory set earlier); replace it with the config
# constant if src.config exposes one.
export_dir = Path("data") / "processed"
dataset_dir = export_dir / "aug_ceb_cbk_spa"
dataset_dir.mkdir(parents=True, exist_ok=True)
for split_file in ("train.src", "train.tgt", "valid.src", "valid.tgt"):
    # Path.replace overwrites an existing target, so re-running the cell is safe.
    (export_dir / split_file).replace(dataset_dir / split_file)

[Splitting] Train ratio = 0.9
[Exporting] Writing train split to C:\Users\qu1r0ra\Documents\GitHub\philippine-machine-translation\data\processed
[Export] train.src and train.tgt written.
[Exporting] Writing valid split to C:\Users\qu1r0ra\Documents\GitHub\philippine-machine-translation\data\processed
[Export] valid.src and valid.tgt written.
[Done] Train/valid splits exported successfully.
