In [6]:
import ModelManager as mm_mod
import config_defaults as cd
from importlib import reload
import numpy as np
import matplotlib.pyplot as plt
import time
import logging

from tqdm import tqdm_notebook as tqdm


%matplotlib inline
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 4.63 ms


### Initializing the Model and Data pipes

In [7]:
# Re-execute the project modules (ModelManager first, then config_defaults)
# so the kernel uses their current source.
for _module in (mm_mod, cd):
    reload(_module)

# Limit the '__main__' logger to WARNING and above.
logger = logging.getLogger('__main__')
logger.setLevel(logging.WARNING)

time: 1.27 ms


In [8]:
# Build a ModelManager without passing any hyperparameter overrides, then
# display the hyperparameter mapping it exposes as `hparams`.
mm = mm_mod.ModelManager()
mm.hparams
# Alternative call kept for reference (presumably equivalent to the no-argument
# form above -- confirm against ModelManager.__init__):
# mm = mm_mod.ModelManager(hparams=None)

{'LR': 0.01,
 'NEPOCH': 10,
 'BATCH_SIZE': 32,
 'NGRAM_SIZE': 2,
 'VOC_SIZE': 10000,
 'EMBEDDING_DIM': 100,
 'NGRAM_MODE': 'naive',
 'VAL_SIZE': 5000,
 'OPTIMIZER': torch.optim.adam.Adam,
 'VAL_FREQ': 4,
 'REMOVE_STOP_WORDS': True,
 'REMOVE_PUNC': True,
 'EARLY_STOP': True,
 'EARLY_STOP_LOOKBACK': 4,
 'EARLY_STOP_MIN_IMPROVE': 0.01}

time: 279 ms


In [9]:
# Disabled: both calls are commented out, so running this cell does nothing.
#mm.load_data()
#mm.data_to_pipe()

time: 297 µs


## Extracting the n-grams for n = 1, 2, 3, 4 in both `naive` and `spacy` modes

In [10]:
# Call load_data() once for every (n-gram size, tokenisation mode) pair.
n_list = (1, 2, 3, 4)
mode_list = ('naive', 'spacy')

# Same ordering as a nested loop: n is the outer index, mode the inner one.
ngram_configs = [(n, mode) for n in n_list for mode in mode_list]

for n, mode in ngram_configs:
    print("extracting n-grams for: n=%s, mode=%s" % (n, mode))
    # A fresh manager per configuration; only the n-gram settings are overridden.
    mm = mm_mod.ModelManager(hparams={'NGRAM_MODE': mode, 'NGRAM_SIZE': n})
    mm.load_data()
# After the loop, `mm` is the manager built for the last pair (n=4, 'spacy').

extracting n-grams for: n=1, mode=naive


HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))


extracting n-grams for: n=1, mode=spacy


HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))


extracting n-grams for: n=2, mode=naive


HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))


extracting n-grams for: n=2, mode=spacy


HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))


extracting n-grams for: n=3, mode=naive


HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))


extracting n-grams for: n=3, mode=spacy


HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))


extracting n-grams for: n=4, mode=naive


HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))


extracting n-grams for: n=4, mode=spacy


HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, description='NGRAMS', max=25000), HTML(value='')))


time: 1h 22min 45s


### Trying to find a good learning rate (LR)

In [None]:
# Candidate learning rates, one per decade from 1e-5 up to 1e2.
sub_unit_lrs = 1 / np.power(10, np.arange(1, 6))   # 1e-1, 1e-2, ..., 1e-5
unit_and_up_lrs = np.power(10, np.arange(0, 3))    # 1, 10, 100

# Merge both halves into a single ascending float array.
lr_list = np.sort(np.concatenate((sub_unit_lrs, unit_and_up_lrs)))
print(lr_list)

In [None]:
# One-epoch sweep: train once per candidate learning rate, then show the results.
# NOTE(review): this cell uses the `mm` and `lr_list` objects left bound by
# earlier cells; it does not create its own ModelManager.
mm.res_df = None  # reset the results dataframe
for cur_lr in lr_list:
    print("training for lr = %s" % cur_lr)
    # Per-run overrides: the learning rate under test, with early stopping off.
    mm.hparams.update({'LR': cur_lr, 'EARLY_STOP': False})
    mm.train(epoch_override=1, reload_data=False)
display(mm.res_df)

In [None]:
# Scatter of the `final_val_acc` column against log10(learning rate) for the
# one-epoch runs. The y-values are accuracies, so the title and axis labels say
# "accuracy" (the title used to read "Validation Error").
plt.scatter(np.log10(mm.res_df['LR']), mm.res_df['final_val_acc'])
plt.title('Validation Accuracy after 1 epoch')
plt.xlabel('log10(LR)')
plt.ylabel('Final validation accuracy')
plt.show()

In [None]:
# Save the learning-rate exploration results through the manager, using the
# result name 'lr_explore.p'.
mm.save_results(res_name='lr_explore.p')

### For each n-gram setting (n and mode), find the right vocabulary size

In [None]:
reload(mm_mod)
logger.setLevel(logging.WARNING)

# Search grid: vocabulary sizes 10k..80k for every n-gram size and mode.
voc_sizes = np.arange(1, 9) * 10000
n_list = (1, 2, 3, 4)
mode_list = ('naive', 'spacy')

# Flattened nested loop: n outermost, then mode, then vocabulary size.
search_grid = (
    (n, mode, voc_size)
    for n in n_list
    for mode in mode_list
    for voc_size in voc_sizes
)

for n, mode, voc_size in search_grid:
    start_time = time.time()
    print("training models for: n=%s, mode=%s, voc_size=%s" % (n, mode, voc_size))

    # A fresh manager per configuration; every run uses res_name 'vocab_explore.p'.
    mm = mm_mod.ModelManager(
        hparams={'NGRAM_MODE': mode, 'NGRAM_SIZE': n, 'VOC_SIZE': voc_size},
        res_name='vocab_explore.p',
    )
    mm.train()

    # The elapsed time is read before save_results(), so saving is not counted.
    final_val_acc = mm.validation_acc_history[-1]
    elapsed = time.time() - start_time
    print("Final Validation Acc = %s (train time: %.1fs)\n" % (final_val_acc, elapsed))

    mm.save_results()

In [None]:
# Preview the first rows of the results dataframe held by the current manager.
display(mm.res_df.head())

In [None]:
# Single-configuration run with INFO logging, calling each pipeline stage
# explicitly instead of going through mm.train().
reload(mm_mod)
logger.setLevel(logging.INFO)

param_overrides = dict(NGRAM_MODE='naive', NGRAM_SIZE=4, VOC_SIZE=10000)
mm = mm_mod.ModelManager(hparams=param_overrides, res_name='vocab_explore.p')

# Stages, in order: load data -> build data pipes -> init model -> training loop.
mm.load_data()
mm.data_to_pipe()
mm.model_init()  # make sure we force the model to re-init
err = mm.training_loop()

In [None]:
# Pull the 'vocab' entry out of the manager's data mapping for inspection.
vocab = mm.data['vocab']

In [None]:
# Pull the 'val' entry out of the manager's data mapping
# (presumably the validation split -- confirm against ModelManager.load_data).
val = mm.data['val']

First, gather every token index from the validation set and check them:

In [None]:
# Flatten the token indices of every item in `val` into one list,
# with a progress bar over the items.
all_tokens = []

for datum in tqdm(val):
    all_tokens.extend(datum.token_idx)

In [None]:
# Print every collected token that is not a built-in Python int.
# Note: NumPy integer scalars (e.g. np.int64) are not `int` instances,
# so they would be printed here as well.
for tk in all_tokens:
    if isinstance(tk, int):
        continue
    print(tk)

In [None]:
# Pack the collected token indices into a NumPy array and display it.
arr = np.array(all_tokens)
arr

In [None]:
# Largest value in `arr` (presumably to compare against the vocabulary size -- confirm).
arr.max()

In [None]:
# Smallest value in `arr` (presumably to spot negative or otherwise invalid indices -- confirm).
arr.min()