In [1]:
import sys

# Put the parent directory on the import path so the `src` package used by the
# cells below can be imported from this notebook's directory. The membership
# check keeps the cell idempotent: re-running it does not pile up duplicates.
if '..' not in sys.path:
    sys.path.append('..')

In [14]:
# Reload the project modules so that edits made to their source files are
# picked up without restarting the kernel.
#
# `importlib` is imported here because this cell runs before the main import
# cell, and only modules that are already loaded are reloaded, so the cell is
# a harmless no-op on a fresh kernel instead of raising NameError/KeyError.
import importlib

for _module_name in (
    'src.testing.probability',
    'src.testing.tagger',
    'src.testing.syllabification',
    'src.utility',
    'src.ngram',
):
    if _module_name in sys.modules:
        importlib.reload(sys.modules[_module_name])

<module 'src.ngram' from '..\\src\\ngram.py'>

In [15]:
import importlib
import time
import pandas as pd
import src.utility as util
import src.testing.probability as probability
import src.testing.tagger as tagger
import src.testing.syllabification as syllabification
import src.ngram as ngram

In [29]:
def syllabify_folds(n, prob_args, state_elim=True, k_min=1, k_max=5, n_sample=None, sample_seed=0, cache_preload=None, log_fname='', fname_param='method', save_log=True, save_result=False, save_cache=False):
    """Run syllabification on each test fold in ``k_min..k_max`` and aggregate the results.

    For every fold the function reads ``../data/testset/test_fold_<k>.txt``,
    loads ``../models/ngrams/5_gram_fold_<k>.json`` (plus the ``_aug_``
    variant when augmentation is enabled), prepares the caches requested by
    ``prob_args`` and calls ``syllabification.syllabify``.

    Args:
        n: n-gram order; passed as ``n_max`` to ``ngram.load``, to the cache
            generators and to ``syllabification.syllabify``.
        prob_args: dict of probability settings. ``'method'`` is required,
            ``'d_ceil'`` is required when the method is ``'gkn'``, and the key
            named by ``fname_param`` is required when anything is saved.
            ``'with_cache'`` and ``'with_aug'`` are optional and default to
            off. The dict is modified in place (``'n_gram'``, ``'cache'``,
            ``'d_cache'`` and their ``_aug`` counterparts are written into it).
        state_elim: recorded in the result metadata; not otherwise used here.
        k_min: first fold number (inclusive).
        k_max: last fold number (inclusive).
        n_sample: if not None, number of test rows to sample from each fold.
        sample_seed: ``random_state`` used for that sampling.
        cache_preload: optional dict whose ``'cache'`` / ``'cache_aug'``
            entries are base file names; ``<name>_fold_<k>.json`` is loaded
            from ``../data/cache/`` instead of generating an empty cache.
        log_fname: prefix used in the names of saved result/cache/log files.
        fname_param: key of ``prob_args`` whose value is embedded in file names.
        save_log: write the aggregated results to ``../logs/``.
        save_result: write each fold's result data to ``../data/results/``.
        save_cache: write each fold's probability cache(s) to ``../data/cache/``.

    Returns:
        Tuple ``(results, prob_args)``: ``results`` holds ``'metadata'`` (run
        settings, average SER, timing) and ``'fold_results'`` (per-fold
        metadata keyed by fold number); ``prob_args`` is the same dict that
        was passed in, with its n-gram entries reset to ``None``.

    Raises:
        ValueError: if ``k_max < k_min`` (there would be no fold to evaluate).
    """
    if k_max < k_min:
        raise ValueError('k_max ({}) must not be smaller than k_min ({})'.format(k_max, k_min))

    start_t = time.time()
    folds = range(k_min, k_max + 1)

    # Both flags are optional; a missing key means "disabled" rather than KeyError.
    with_aug = bool(prob_args.get('with_aug'))
    with_cache = bool(prob_args.get('with_cache'))
    cache_dir = '../data/cache/'

    results = {
        'metadata': {
            'n': n,
            'k_min': k_min,
            'k_max': k_max,
            'state_elim': state_elim,
            'n_sample': n_sample,
            'sample_seed': sample_seed,
            'prob_args': prob_args.copy()
        },
        'fold_results': {}
    }

    util.print_dict(results['metadata'])

    for fold in folds:
        data_test_fname = '../data/testset/test_fold_{}.txt'.format(fold)
        n_gram_fname = '../models/ngrams/5_gram_fold_{}.json'.format(fold)

        print('Fold        : {}'.format(fold))
        print('Data test   : "{}"'.format(data_test_fname))
        print('n-gram      : "{}"'.format(n_gram_fname))

        if with_aug:
            n_gram_aug_fname = '../models/ngrams/5_gram_aug_fold_{}.json'.format(fold)
            print('n-gram aug  : "{}"'.format(n_gram_aug_fname))

        data_test = pd.read_csv(
            data_test_fname,
            sep='\t',
            header=None,
            names=['word', 'syllables'],
            na_filter=False
        )

        print('Total words : {}'.format(len(data_test)))

        if n_sample is not None:
            data_test = data_test.sample(n=n_sample, random_state=sample_seed).reset_index(drop=True)

        prob_args['n_gram'] = ngram.load(n_gram_fname, n_max=n, load_follow_fdist=True, load_cont_fdist=True)

        if with_cache:
            if cache_preload is not None and 'cache' in cache_preload:
                prob_args['cache'] = probability.load_cache('{}_fold_{}.json'.format(cache_preload['cache'], fold), cache_dir)
            else:
                prob_args['cache'] = probability.generate_prob_cache(n)

        if prob_args['method'] == 'gkn':
            prob_args['d_cache'] = probability.generate_gkn_discount_cache(n, prob_args['n_gram'], prob_args['d_ceil'])

        if with_aug:
            prob_args['n_gram_aug'] = ngram.load(n_gram_aug_fname, n_max=n, load_follow_fdist=True, load_cont_fdist=True)

            if with_cache:
                if cache_preload is not None and 'cache_aug' in cache_preload:
                    prob_args['cache_aug'] = probability.load_cache('{}_fold_{}.json'.format(cache_preload['cache_aug'], fold), cache_dir)
                else:
                    prob_args['cache_aug'] = probability.generate_prob_cache(n)

            if prob_args['method'] == 'gkn':
                prob_args['d_cache_aug'] = probability.generate_gkn_discount_cache(n, prob_args['n_gram_aug'], prob_args['d_ceil'])

        result = syllabification.syllabify(data_test, n, prob_args)
        results['fold_results'][fold] = result['metadata']

        if save_result:
            util.save_result(result['data'], '{}_{}={}_fold_{}.txt'.format(log_fname, fname_param, prob_args[fname_param], fold), '../data/results/')

        if save_cache:
            if 'cache' in prob_args:
                probability.save_cache(prob_args['cache'], 'cache_prob_{}_{}={}_fold_{}.json'.format(log_fname, fname_param, prob_args[fname_param], fold), cache_dir)
            if 'cache_aug' in prob_args:
                probability.save_cache(prob_args['cache_aug'], 'cache_aug_prob_{}_{}={}_fold_{}.json'.format(log_fname, fname_param, prob_args[fname_param], fold), cache_dir)

        # Drop the references to this fold's n-gram models so they can be
        # freed before the next fold is loaded.
        prob_args['n_gram'] = None
        prob_args['n_gram_aug'] = None

        print('\n')

    end_t = time.time()
    avg_ser = sum(results['fold_results'][fold]['syllable_error_rate'] for fold in folds) / len(folds)
    results['metadata']['average_ser'] = round(avg_ser, 5)
    results['metadata']['start_time'] = time.strftime('%Y/%m/%d - %H:%M:%S', time.localtime(start_t))
    results['metadata']['end_time'] = time.strftime('%Y/%m/%d - %H:%M:%S', time.localtime(end_t))
    results['metadata']['duration'] = round(end_t - start_t, 2)

    if save_log:
        # The prefix carries its own separator, so the name has exactly one
        # underscore after a non-empty prefix and none when the prefix is empty.
        log_prefix = log_fname + '_' if log_fname != '' else ''
        util.save_dict_to_log(results, '{}{}={}.log'.format(log_prefix, fname_param, prob_args[fname_param]), '../logs/')

    print('Finished in {:.2f} s'.format(end_t - start_t))

    return results, prob_args

In [32]:
# Evaluate each augmentation weight over folds 1-5, using the 'gkn' method
# with the augmented n-gram model and the probability cache both enabled.
for aug_w in (0.4, 0.5, 0.7):
    prob_args = dict(
        method='gkn',
        d_ceil=4,
        with_cache=True,
        with_aug=True,
        aug_w=aug_w,
    )

    # Only a 'cache' preload name is supplied here; no 'cache_aug' entry is given.
    result, post_prob_args = syllabify_folds(
        5,
        prob_args,
        k_min=1,
        k_max=5,
        cache_preload=dict(cache='cache_prob_gkn_n=5_B=4_with_aug=False'),
        log_fname='gkn_n=5_B=4',
        fname_param='aug_w',
        save_log=True,
        save_result=True,
        save_cache=False,
    )

    print()

n: 5
k_min: 1
k_max: 5
state_elim: True
n_sample: None
sample_seed: 0
prob_args: {'method': 'gkn', 'd_ceil': 4, 'with_cache': True, 'with_aug': True, 'aug_w': 0.3}

Fold        : 1
Data test   : "../data/testset/test_fold_1.txt"
n-gram      : "../models/ngrams/5_gram_fold_1.json"
n-gram aug  : "../models/ngrams/5_gram_aug_fold_1.json"
Total words : 10113
Words tagged: 10113/10113 (100.00%) | SER: 0.891% | Running time: 40.52 s

Fold        : 2
Data test   : "../data/testset/test_fold_2.txt"
n-gram      : "../models/ngrams/5_gram_fold_2.json"
n-gram aug  : "../models/ngrams/5_gram_aug_fold_2.json"
Total words : 10114
Words tagged: 10114/10114 (100.00%) | SER: 0.943% | Running time: 40.62 s

Fold        : 3
Data test   : "../data/testset/test_fold_3.txt"
n-gram      : "../models/ngrams/5_gram_fold_3.json"
n-gram aug  : "../models/ngrams/5_gram_aug_fold_3.json"
Total words : 10114
Words tagged: 10114/10114 (100.00%) | SER: 0.911% | Running time: 40.58 s

Fold        : 4
Data test   : "../