In [None]:
import sys
sys.path.append('..')

In [None]:
import pandas as pd
import time
import src.ngram as ngram
from src.training.preprocess import tokenize, pad_tokens

In [None]:
# Original (non-augmented) training file for each fold, keyed by the
# 1-based fold number; list position determines the key.
data_train_folds_fnames = dict(enumerate(
    [
        '../data/trainset/train_fold_1.txt',
        '../data/trainset/train_fold_2.txt',
        '../data/trainset/train_fold_3.txt',
        '../data/trainset/train_fold_4.txt',
        '../data/trainset/train_fold_5.txt',
    ],
    start=1,
))

In [None]:
# Augmented training file for each fold, keyed by the 1-based fold number;
# list position determines the key.
aug_train_fold_fnames = dict(enumerate(
    [
        '../data/trainset/augmented/aug_train_fold_1.txt',
        '../data/trainset/augmented/aug_train_fold_2.txt',
        '../data/trainset/augmented/aug_train_fold_3.txt',
        '../data/trainset/augmented/aug_train_fold_4.txt',
        '../data/trainset/augmented/aug_train_fold_5.txt',
    ],
    start=1,
))

In [None]:
# Build one n-gram model per training fold and save each one as JSON.
#
# Configuration:
#   n_max          -- n-gram order; passed to pad_tokens() and ngram.NGram()
#                     and embedded in the output file name.
#   k_min, k_max   -- first and last fold number to process (both inclusive).
#   train_set_type -- which fold files to read: 'original' or 'augmented'.
n_max = 5
k_min = 1
k_max = 5
train_set_type = 'augmented' # Set to original / augmented

start_t = time.time()

print('Max n: ', n_max)
print('Train set: ', train_set_type)

# Select the input files and the output path for the chosen train set.
# NOTE: despite its name, output_fname_suffix is used as the leading part of
# the output path (the fold-specific ending is appended to it below).
if train_set_type == 'original':
    folds_fnames = data_train_folds_fnames
    output_fname_suffix = '../models/ngrams/{}_gram_'.format(n_max)
elif train_set_type == 'augmented':
    folds_fnames = aug_train_fold_fnames
    output_fname_suffix = '../models/ngrams/{}_gram_aug_'.format(n_max)
else:
    # Fail fast. Without this branch a mistyped value would either raise a
    # NameError further down or, in a kernel that already ran this cell,
    # silently reuse folds_fnames / output_fname_suffix from the previous run.
    raise ValueError(
        "Unknown train_set_type {!r}; expected 'original' or 'augmented'".format(
            train_set_type
        )
    )

for fold in range(k_min, k_max+1):
    print('\nFold {}/{}'.format(fold, k_max))

    # Load train set: tab-separated, no header row, two columns.
    fname = folds_fnames[fold]
    print('Train set: ', fname)
    data_train = pd.read_csv(
        fname,
        sep='\t',
        header=None,
        names=['word', 'syllables'],
        # Keep every field as a plain string: with NA detection off, entries
        # such as "NA" or "null" are not converted to NaN.
        na_filter=False
    )
    print('Number of words: ', len(data_train))

    # Build the n-gram
    # NOTE(review): tokenize / pad_tokens are project helpers; presumably
    # start_pad / end_marker add boundary tokens sized for n -- confirm there.
    print('Building n-gram')
    tokens = pad_tokens(tokenize(data_train), n=n_max, start_pad=True, end_marker=True)
    ngram_fold = ngram.NGram(tokens, n=n_max, verbose=True)

    # Save the n-gram to a file (fname is reused here for the output path)
    fname = output_fname_suffix + 'fold_{}.json'.format(fold)
    ngram.save(ngram_fold, fname)
    print('n-gram saved to "{}"'.format(fname))

print('\nAll n-grams generated in {:.2f} s'.format(time.time() - start_t))