In [None]:
import sys
# Add the parent directory to the import search path so that the `src` package
# imported later in this notebook can be resolved (assumes the notebook is run
# from a sub-folder of the project root — TODO confirm).
sys.path.append('..')

In [None]:
import os

import pandas as pd
from src.training.augmentation import flip_onsets, swap_consonants

In [None]:
# Input location: one file per fold, named <fname_prefix>_fold_<k>.txt
fname_folder = '../data/trainset/named-entity/'
fname_prefix = 'train_ne'

# Output location: one file per fold, named <output_fname>_fold_<k>.txt
output_folder = '../data/trainset/augmented/named-entity/'
output_fname = 'aug_train_ne'

# First and last fold numbers to process (both inclusive).
k_min, k_max = 1, 5

## Load the train set

In [None]:
# Read every training fold into a DataFrame, keyed by its fold number.
data_train_folds = {}

for fold_id in range(k_min, k_max + 1):
    fold_path = '{}{}_fold_{}.txt'.format(fname_folder, fname_prefix, fold_id)
    # The files are tab-separated with no header row. na_filter=False turns off
    # missing-value detection, so tokens such as "NA" or empty fields are not
    # converted to NaN.
    fold_frame = pd.read_csv(
        fold_path,
        sep='\t',
        header=None,
        names=['word', 'syllables'],
        na_filter=False,
    )
    data_train_folds[fold_id] = fold_frame
    print('Fold {} ("{}") | Number of words: {}'.format(fold_id, fold_path, len(fold_frame)))

# Show a fixed-seed sample of the last fold that was loaded.
data_train_folds[fold_id].sample(n=5, random_state=0)

## Flip onsets

In [None]:
# Run the project's flip_onsets augmentation on each original training fold
# and keep the result under the same fold number.
flipped_folds = {}

for fold_id in data_train_folds:
    fold_flipped = flip_onsets(data_train_folds[fold_id])
    flipped_folds[fold_id] = fold_flipped
    print('Fold {} | Number of words: {}'.format(fold_id, len(fold_flipped)))

# Show a fixed-seed sample of the last fold that was processed.
flipped_folds[fold_id].sample(n=5, random_state=0)

## Swap consonants

In [None]:
# Run the project's swap_consonants augmentation on each original training
# fold and keep the result under the same fold number.
swapped_folds = {}

for fold_id in data_train_folds:
    fold_swapped = swap_consonants(data_train_folds[fold_id])
    swapped_folds[fold_id] = fold_swapped
    print('Fold {} | Number of words: {}'.format(fold_id, len(fold_swapped)))

# Show a fixed-seed sample of the last fold that was processed.
swapped_folds[fold_id].sample(n=5, random_state=0)

In [None]:
# Chain the two augmentations: run swap_consonants on the folds that were
# already produced by flip_onsets, keyed by the same fold number.
flipped_swapped_folds = {}

for fold_id in flipped_folds:
    fold_flipped_swapped = swap_consonants(flipped_folds[fold_id])
    flipped_swapped_folds[fold_id] = fold_flipped_swapped
    print('Fold {} | Number of words: {}'.format(fold_id, len(fold_flipped_swapped)))

# Show a fixed-seed sample of the last fold that was processed.
flipped_swapped_folds[fold_id].sample(n=5, random_state=0)

## Combine and save to file

In [None]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs(output_folder, exist_ok=True)

# For each fold, stack the three augmented variants (flipped, swapped,
# flipped-then-swapped) into one frame and write it as a headerless,
# tab-separated file. The original, un-augmented rows are not included.
for fold in data_train_folds:
    augmented_data = pd.concat(
        [flipped_folds[fold], swapped_folds[fold], flipped_swapped_folds[fold]],
        ignore_index=True
    )
    fname = '{}{}_fold_{}.txt'.format(output_folder, output_fname, fold)
    augmented_data.to_csv(fname, sep='\t', index=False, header=False)
    print('Fold {} | Number of words: {}, saved to "{}"'.format(fold, len(augmented_data), fname))
