# Preprocessing

## Setup

In [None]:
# Notebook setup: load the autoreload extension, switch it to mode 2, and
# render matplotlib output inline in the notebook.
# NOTE(review): presumably autoreload is enabled so edits to the local `src`
# modules are picked up without restarting the kernel — confirm.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import pandas as pd

from src.augmentation import augment_dataset, mix_datasets
from src.config import PROCESSED_DIR, RAW_DIR
from src.preprocessing_nmt import preprocess_corpus, split_and_export

## Loading Data

In [None]:
# Recursively collect every CSV under RAW_DIR.
# Path.glob does not guarantee any particular ordering, and later cells pick
# files by position in this list, so sort the paths to make those positions
# reproducible across machines and runs.
csv_files = sorted(RAW_DIR.glob("**/*.csv"))
if not csv_files:
    # Fail early with a clear message instead of an IndexError further down.
    raise FileNotFoundError("No CSV files found in RAW_DIR.")

print(f"Found {len(csv_files)} CSV file(s).")
# List each file next to its index so positional selections can be verified.
for file_index, csv_path in enumerate(csv_files):
    print(f"  [{file_index}] {csv_path}")

In [None]:
# Read the CSV at position 0 of csv_files into the ceb-spa frame and report
# the file name and row count.
# NOTE(review): this assumes csv_files[0] is the ceb-spa corpus — confirm
# against the order in which the files were discovered.
ceb_spa_df = pd.read_csv(csv_files[0])
print(f"[INFO] Loaded: {csv_files[0].name} ({len(ceb_spa_df):,} rows)")
# Last expression of the cell: display the first rows for a quick visual check.
ceb_spa_df.head()

In [None]:
# Read the CSV at position 2 of csv_files into the cbk-spa frame and report
# the file name and row count.
# NOTE(review): this assumes csv_files[2] is the cbk-spa corpus — confirm
# against the order in which the files were discovered.
# The discovery cell only guarantees that csv_files is non-empty, so check
# explicitly that position 2 exists rather than failing with a bare IndexError.
if len(csv_files) < 3:
    raise FileNotFoundError(
        f"Expected at least 3 CSV files in RAW_DIR, found {len(csv_files)}."
    )

cbk_spa_df = pd.read_csv(csv_files[2])
print(f"[INFO] Loaded: {csv_files[2].name} ({len(cbk_spa_df):,} rows)")
# Last expression of the cell: display the first rows for a quick visual check.
cbk_spa_df.head()

## Preprocessing Data

In [None]:
# Run the project's preprocess_corpus step on the raw ceb-spa frame; the result
# is reused by the augmentation, mixing, and "base" export cells.
clean_ceb_spa_df = preprocess_corpus(ceb_spa_df)
# Last expression of the cell: preview the first rows of the result.
clean_ceb_spa_df.head()

In [None]:
# Pass the cleaned ceb-spa frame through augment_dataset; the result is what
# gets exported under "aug-noise".
# NOTE(review): the export directory name suggests noise-based augmentation —
# confirm against src.augmentation.
aug_ceb_spa_df = augment_dataset(clean_ceb_spa_df)
# Last expression of the cell: preview the first rows of the result.
aug_ceb_spa_df.head()

In [None]:
# Apply the same preprocess_corpus step to the raw cbk-spa frame.
clean_cbk_spa_df = preprocess_corpus(cbk_spa_df)
# Combine the cleaned ceb-spa and cbk-spa frames via mix_datasets; the result
# is what gets exported under "aug-cbk".
# NOTE(review): how the two corpora are combined (ratio, ordering, dedup) is
# defined in src.augmentation and not visible here — confirm.
aug_ceb_cbk_spa_df = mix_datasets(clean_ceb_spa_df, clean_cbk_spa_df)
# Last expression of the cell: preview the first rows of the result.
aug_ceb_cbk_spa_df.head()

## Exporting Data

In [None]:
# Baseline export: run split_and_export on the cleaned (un-augmented) ceb-spa
# frame with PROCESSED_DIR / "base" as the output directory.
split_and_export(clean_ceb_spa_df, output_dir=PROCESSED_DIR / "base")

In [None]:
# Augmented export: run split_and_export on the augment_dataset output with
# PROCESSED_DIR / "aug-noise" as the output directory.
split_and_export(aug_ceb_spa_df, output_dir=PROCESSED_DIR / "aug-noise")

In [None]:
# Mixed export: run split_and_export on the mix_datasets output (ceb-spa plus
# cbk-spa) with PROCESSED_DIR / "aug-cbk" as the output directory.
split_and_export(aug_ceb_cbk_spa_df, output_dir=PROCESSED_DIR / "aug-cbk")