In [None]:
import sys

# Make the repository root importable so that `import src....` works from this
# notebook's directory. Guarded so that re-running the cell does not keep
# appending duplicate '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [None]:
# Development helper: pick up on-disk edits to the project modules without
# restarting the kernel.
# `importlib` is imported here because this cell runs before the main import
# cell, and only modules that are already loaded are reloaded, so the cell is
# a harmless no-op on a fresh kernel (Restart -> Run All) instead of raising
# NameError / KeyError.
import importlib

for _module_name in (
    'src.testing.probability',
    'src.testing.tagger',
    'src.testing.syllabification',
    'src.utility',
):
    _module = sys.modules.get(_module_name)
    if _module is not None:
        importlib.reload(_module)

In [None]:
import importlib
import time
import pandas as pd
import src.utility as util
import src.testing.probability as probability
import src.testing.tagger as tagger
import src.testing.syllabification as syllabification
import src.ngram as ngram

In [None]:
def syllabify_folds(n, prob_args, state_elim=True, k_min=1, k_max=5, n_sample=None, sample_seed=0, fname_param='method'):
    """Run syllabification over test folds ``k_min``..``k_max`` and log the results.

    For each fold this loads the fold's test set and n-gram model (plus the
    augmented model when ``prob_args['with_aug']`` is true), builds the
    probability caches when ``prob_args['with_cache']`` is true, and calls
    ``syllabification.syllabify``. After the last fold the per-fold syllable
    error rates are averaged and the whole result dict is written to
    ``../logs/`` through ``util.save_dict_to_log``.

    Args:
        n: n-gram order. Passed as ``n_max`` to ``ngram.load`` and forwarded
            to the cache builders and to ``syllabification.syllabify``.
        prob_args: dict of probability settings. Must contain ``'method'``,
            ``'with_cache'`` and ``'with_aug'``, and ``'d_ceil'`` when caching
            is on and the method is ``'gkn'``. The caller's dict is NOT
            modified; the loaded models and caches live in a private copy.
        state_elim: recorded in the run metadata; not otherwise used by this
            function.
        k_min: first fold number (inclusive).
        k_max: last fold number (inclusive).
        n_sample: if not None, evaluate on a random sample of this many rows
            of each fold's test set instead of the full set.
        sample_seed: ``random_state`` used for that sampling.
        fname_param: key of ``prob_args`` whose name and value form the log
            file name, ``'<fname_param>=<value>.log'``.

    Returns:
        ``(results, results_data)`` where ``results`` holds the run
        ``'metadata'`` and, under ``'fold_results'``, each fold's
        ``result['metadata']``; ``results_data`` maps each fold number to that
        fold's ``result['data']``.

    Raises:
        ValueError: if ``k_max < k_min`` (no folds to evaluate).
        KeyError: if ``fname_param`` is not a key of ``prob_args``.
    """
    # Fail fast: both of these used to surface only after (or instead of) all
    # the per-fold work, as a ZeroDivisionError / late KeyError.
    if k_max < k_min:
        raise ValueError('k_max ({}) must be >= k_min ({})'.format(k_max, k_min))
    if fname_param not in prob_args:
        raise KeyError(fname_param)

    start_t = time.time()
    folds = range(k_min, k_max + 1)

    # Work on a private copy so the models and caches attached below never
    # leak into the caller's dict (and from there into a later run's metadata).
    run_args = prob_args.copy()

    results_data = {}
    results = {
        'metadata': {
            'n': n,
            'k_min': k_min,
            'k_max': k_max,
            'state_elim': state_elim,
            'n_sample': n_sample,
            'sample_seed': sample_seed,
            'prob_args': prob_args.copy()
        },
        'fold_results': {}
    }

    util.print_dict(results['metadata'])

    for fold in folds:
        data_test_fname = '../data/testset/test_fold_{}.txt'.format(fold)
        n_gram_fname = '../models/ngrams/8_gram_fold_{}.json'.format(fold)
        n_gram_aug_fname = '../models/ngrams/8_gram_aug_fold_{}.json'.format(fold)

        print('Fold        : {}'.format(fold))
        print('Data test   : "{}"'.format(data_test_fname))
        print('n-gram      : "{}"'.format(n_gram_fname))

        # Tab-separated "word<TAB>syllables" file without a header row;
        # na_filter=False keeps strings such as "nan" or "null" as text.
        data_test = pd.read_csv(
            data_test_fname,
            sep='\t',
            header=None,
            names=['word', 'syllables'],
            na_filter=False
        )

        print('Total words : {}'.format(len(data_test)))

        if n_sample is not None:
            data_test = data_test.sample(n=n_sample, random_state=sample_seed).reset_index(drop=True)

        # Drop the reference to the previous fold's model before loading the
        # next one (presumably to limit peak memory while loading).
        run_args['n_gram'] = None
        run_args['n_gram'] = ngram.load(n_gram_fname, n_max=n, load_follow_fdist=False, load_cont_fdist=False)

        if run_args['with_cache']:
            run_args['cache'] = probability.generate_prob_cache(n, run_args['method'])

            if run_args['method'] == 'gkn':
                run_args['d_cache'] = probability.generate_gkn_discount_cache(n, run_args['n_gram'], run_args['d_ceil'])

        if run_args['with_aug']:
            # Same sequence for the augmented model, stored under the
            # '*_aug' keys.
            run_args['n_gram_aug'] = None
            run_args['n_gram_aug'] = ngram.load(n_gram_aug_fname, n_max=n, load_follow_fdist=False, load_cont_fdist=False)

            if run_args['with_cache']:
                run_args['cache_aug'] = probability.generate_prob_cache(n, run_args['method'])

                if run_args['method'] == 'gkn':
                    run_args['d_cache_aug'] = probability.generate_gkn_discount_cache(n, run_args['n_gram_aug'], run_args['d_ceil'])

        result = syllabification.syllabify(data_test, n, run_args)
        results['fold_results'][fold] = result['metadata']
        results_data[fold] = result['data']

        print('\n')

    end_t = time.time()
    avg_ser = sum(results['fold_results'][fold]['syllable_error_rate'] for fold in folds) / len(folds)
    results['metadata']['average_ser'] = round(avg_ser, 5)
    results['metadata']['start_time'] = time.strftime('%Y/%m/%d - %H:%M:%S', time.localtime(start_t))
    results['metadata']['end_time'] = time.strftime('%Y/%m/%d - %H:%M:%S', time.localtime(end_t))
    results['metadata']['duration'] = round(end_t - start_t, 2)

    util.save_dict_to_log(results, '{}={}.log'.format(fname_param, run_args[fname_param]), '../logs/')

    print('Finished in {:.2f} s'.format(end_t - start_t))

    return results, results_data

In [None]:
# Evaluate the 'stupid_backoff' method with n=2 on folds 1-5, once per value
# of aug_w listed below. The log file is named after the 'method' setting.
# NOTE(review): 'with_aug' is False here, so 'aug_w' presumably has no effect
# on this run -- confirm against src.testing.syllabification.
for aug_w in (1,):
    prob_args = dict(
        method='stupid_backoff',
        alpha=0.4,
        with_cache=True,
        with_aug=False,
        aug_w=aug_w,
    )

    # Only the summary dict is kept; the per-fold data goes to `_`.
    results, _ = syllabify_folds(n=2, prob_args=prob_args, k_min=1, k_max=5, fname_param='method')

    print()