In [1]:
import ModelManager as mm_mod
import config_defaults as cd
from importlib import reload
import numpy as np
import matplotlib.pyplot as plt
import time
import logging
import torch

from tqdm import tqdm_notebook as tqdm


%matplotlib inline
%load_ext autotime

### Initializing the Model and Data pipes

In [2]:
# Re-import the project modules so edits made on disk are picked up without
# restarting the kernel.
reload(mm_mod)
reload(cd)
# Fetch the '__main__' logger and set its threshold to INFO; later cells
# change the threshold through this same `logger` name.
logger = logging.getLogger('__main__')
logger.setLevel(logging.INFO)

time: 1.91 ms


## Extracting the n-grams for n = 1, 2, 3, 4 with both the naive and spacy modes

In [None]:
# Disabled cell: the loop below is wrapped in a string literal, so nothing is
# executed. If re-enabled it would build a ModelManager and call load_data()
# for every (n, mode) combination.
"""
n_list = (1, 2, 3, 4)
mode_list = ('naive', 'spacy')

for n in n_list:
    for mode in mode_list:
        print("extracting n-grams for: n=%s, mode=%s" % (n, mode))
        param_overrides = {'NGRAM_MODE': mode,
                           'NGRAM_SIZE': n}
        mm = mm_mod.ModelManager(hparams=param_overrides)
        mm.load_data()
"""

### Testing the annealing of LR

In [None]:
# Single run for inspecting the LR annealing: default ModelManager with
# EARLY_STOP overridden to False, trained for 3 epochs (epoch_override=3).
# Data is loaded and piped once here, hence reload_data=False in train().
reload(mm_mod)
mm = mm_mod.ModelManager()
mm.load_data()
mm.data_to_pipe()
param_overrides = {'EARLY_STOP': False}
mm.hparams.update(param_overrides)
mm.train(epoch_override=3, reload_data=False)  

### Trying to find a good LR

In [None]:
# Candidate initial learning rates, one per decade from 1e-5 up to 1e2.
# Decades below 1 are built as reciprocals of positive powers of ten.
lr_list_exp_neg = np.arange(1, 6)
lr_list_neg = 1 / np.power(10, lr_list_exp_neg)

# Decades from 1 upwards.
lr_list_exp_pos = np.arange(0, 3)
lr_list_pos = np.power(10, lr_list_exp_pos)

# Merge both halves into one ascending array.
lr_list = np.sort(np.concatenate((lr_list_neg, lr_list_pos)))
print(lr_list)

In [None]:
# LR sweep: one ModelManager is reused for every value in lr_list, with the
# data loaded and piped once up front (train() is called with reload_data=False).
# training all of these through 1 epoch and seeing results
mm = mm_mod.ModelManager()
mm.load_data()
mm.data_to_pipe()

mm.res_df = None  # reset the results dataframe
for cur_lr in lr_list:
    # overriding some hyperparameters
    print("training for initial lr = %s" % cur_lr)
    param_overrides = {'LR': cur_lr,
                       'EARLY_STOP': False}
    # hparams is updated in place, so each iteration overwrites the previous LR
    mm.hparams.update(param_overrides)
    mm.train(epoch_override=1, reload_data=False)  
# Show the results frame once the whole sweep has finished.
display(mm.res_df)

In [None]:
# Final validation accuracy of each 1-epoch run against log10 of its initial LR.
# The plotted column is 'final_val_acc', so the figure is labelled as accuracy
# rather than error.
plt.scatter(np.log10(mm.res_df['LR']), mm.res_df['final_val_acc'])
plt.title('Validation Accuracy after 1 epoch')
plt.xlabel('log10(LR)')
plt.ylabel('final_val_acc')
plt.show()

In [None]:
# Persist the results held by the current `mm` via ModelManager.save_results().
mm.save_results()

### For each n-gram setting, find the right vocabulary size

In [None]:
# Vocabulary-size sweep: for every (n-gram size, n-gram mode, vocabulary size)
# combination, train a fresh ModelManager and save its results under
# res_name='vocab_explore.p'.
reload(mm_mod)
logger.setLevel(logging.WARNING)
voc_sizes = np.arange(1, 9) * 10000
n_list = (1, 2, 3, 4)
mode_list = ('naive', 'spacy')

for n in n_list:
    for mode in mode_list:
        for voc_size in voc_sizes:
            start_time = time.time()
            print("training models for: n=%s, mode=%s, voc_size=%s" % (n, mode, voc_size))
            param_overrides = {'NGRAM_MODE': mode,
                               'NGRAM_SIZE': n,
                               'VOC_SIZE': voc_size}
            mm = mm_mod.ModelManager(hparams=param_overrides, res_name='vocab_explore.p')
            mm.train()
            train_time = time.time() - start_time

            # Save before reporting so that a problem in the summary line
            # cannot discard a run that has already finished training.
            mm.save_results()

            # `val_acc_hist` is the attribute name used by the cells of this
            # notebook that have recorded output (optimizer sweep, final model).
            print("Final Validation Acc = %s (train time: %.1fs)\n" % (mm.val_acc_hist[-1],
                                                                  train_time))

### Extra vocabulary sizes: for spacy with n-gram size 4, the upper tail hasn't been fully explored

In [None]:
# Extend the vocabulary sweep to 90k-140k for the spacy / 4-gram setting;
# results are saved under res_name='voc_additional.p'.
voc_sizes = np.arange(9, 15) * 10000
for voc_size in voc_sizes:
    start_time = time.time()
    print("training models for: n=4, mode=spacy, voc_size=%s" % (voc_size))
    param_overrides = {'NGRAM_MODE': 'spacy',
                       'NGRAM_SIZE': 4,
                       'VOC_SIZE': voc_size}
    mm = mm_mod.ModelManager(hparams=param_overrides, res_name='voc_additional.p')
    mm.train()
    train_time = time.time() - start_time

    # Save before reporting so that a problem in the summary line cannot
    # discard a run that has already finished training.
    mm.save_results()

    # `val_acc_hist` is the attribute name used by the cells of this notebook
    # that have recorded output (optimizer sweep, final model).
    print("Final Validation Acc = %s (train time: %.1fs)\n" % (mm.val_acc_hist[-1],
                                                          train_time))

In [None]:
# Preview the larger vocabulary sizes: 300k to 1M in steps of 100k.
voc_sizes = np.arange(300000, 1000001, 100000)
voc_sizes

### What if we tried even larger vocabulary sizes?

In [None]:
# Much larger vocabularies (300k-1M) for the spacy / 4-gram setting; results
# are saved under res_name='voc_additional.p'.
voc_sizes = np.arange(3, 11) * 100000
for voc_size in voc_sizes:
    start_time = time.time()
    print("training models for: n=4, mode=spacy, voc_size=%s" % (voc_size))
    param_overrides = {'NGRAM_MODE': 'spacy',
                       'NGRAM_SIZE': 4,
                       'VOC_SIZE': voc_size}
    mm = mm_mod.ModelManager(hparams=param_overrides, res_name='voc_additional.p')
    mm.train()
    train_time = time.time() - start_time

    # Save before reporting so that a problem in the summary line cannot
    # discard a run that has already finished training.
    mm.save_results()

    # `val_acc_hist` is the attribute name used by the cells of this notebook
    # that have recorded output (optimizer sweep, final model).
    print("Final Validation Acc = %s (train time: %.1fs)\n" % (mm.val_acc_hist[-1],
                                                          train_time))

### Embedding size

In [None]:
# Preview a grid of embedding dimensions: 100 to 700 in steps of 50.
emb_dims = np.arange(100, 701, 50)
emb_dims

In [None]:
# Embedding-dimension sweep: for every (embedding dim, vocabulary size) pair,
# train a fresh ModelManager in 'spacy' mode and save its results under
# res_name='embdim.p'. NGRAM_SIZE is not overridden here.
reload(mm_mod)
logger.setLevel(logging.WARNING)
voc_sizes = np.arange(2, 13) * 10000
emb_dims = np.arange(1, 15) * 50

for emb_dim in emb_dims:
    for voc_size in voc_sizes:
        start_time = time.time()
        print("training models for: emb_dim=%s, voc_size=%s" % (emb_dim, voc_size))
        param_overrides = {'VOC_SIZE': voc_size,
                           'NGRAM_MODE': 'spacy',
                           'EMBEDDING_DIM': emb_dim}
        mm = mm_mod.ModelManager(hparams=param_overrides, res_name='embdim.p')
        mm.train()
        train_time = time.time() - start_time

        # Save before reporting so that a problem in the summary line cannot
        # discard a run that has already finished training.
        mm.save_results()

        # `val_acc_hist` is the attribute name used by the cells of this
        # notebook that have recorded output (optimizer sweep, final model).
        print("Final Validation Acc = %s (train time: %.1fs)\n" % (mm.val_acc_hist[-1],
                                                              train_time))

In [None]:
# Values of the 'final_val_acc' column of the results frame, sorted ascending,
# plotted against their position in that ordering.
plt.plot(mm.res_df['final_val_acc'].sort_values().values)
plt.title('Final validation accuracy (sorted ascending)')
plt.xlabel('rank')
plt.ylabel('final_val_acc')
plt.show()

In [None]:
# Short alias for the results frame of whichever ModelManager `mm` currently
# refers to.
df = mm.res_df

In [18]:
import torch

time: 413 µs


In [25]:
# Optimizer comparison: train a fresh ModelManager for 2 epochs
# (epoch_override=2) with each optimizer class and save the results under
# res_name='optim.p'.
opt_list = [torch.optim.RMSprop, torch.optim.Adagrad, torch.optim.Adam, torch.optim.SGD]

for opt in opt_list:
    start_time = time.time()
    print("training models for: optimizer = %s" % (str(opt)))
    # The optimizer class itself (not an instance) is passed as a hyperparameter.
    param_overrides = {'OPTIMIZER': opt}
    mm = mm_mod.ModelManager(hparams=param_overrides, res_name='optim.p')
    mm.train(epoch_override=2)
    print("Final Validation Acc = %s (train time: %.1fs)\n" % (mm.val_acc_hist[-1],
                                                          time.time() - start_time))

    mm.save_results()

training models for: optimizer = <class 'torch.optim.rmsprop.RMSprop'>
INFO     initialized model with hyperparametrs:
INFO     LR: 0.01
INFO     LR_DECAY_RATE: 0.95
INFO     NEPOCH: 10
INFO     BATCH_SIZE: 32
INFO     NGRAM_SIZE: 4
INFO     VOC_SIZE: 100000
INFO     EMBEDDING_DIM: 50
INFO     NGRAM_MODE: spacy
INFO     VAL_SIZE: 5000
INFO     OPTIMIZER: <class 'torch.optim.rmsprop.RMSprop'>
INFO     VAL_FREQ: 4
INFO     REMOVE_STOP_WORDS: False
INFO     REMOVE_PUNC: True
INFO     EARLY_STOP: True
INFO     EARLY_STOP_LOOKBACK: 8
INFO     EARLY_STOP_MIN_IMPROVE: 0.01
INFO     allow pickle loads: True, allow pickle saves: True
INFO     Starting Training on device: cuda:0
INFO     looking for the following file paths: ./data/pickles/trainval_spacy_4_True_False.p
./data/pickles/test_spacy_4_True_False.p
./data/pickles/idx_spacy_4_True_False_5000_100000.p
INFO     found pickle files in ./data/pickles/, loading them instead of rebuilding ... 
INFO     found pickle files for indexer in ./data

Final Validation Acc = 51.34 (train time: 71.9s)

INFO     results saved to ./results/optim.p
time: 4min 52s


### Trying different LR decay rates

In [None]:
# LR-decay sweep: decay rates from 0.60 to 1.00 in steps of 0.05, each trained
# with LR=0.001 and NEPOCH=30; results are saved under
# res_name='lr_decay_small_lr.p'.
lr_decays = 0.5 + np.arange(2, 11) * 0.05

for lr_decay in lr_decays:
    param_overrides = {'LR_DECAY_RATE': lr_decay,
                       'LR': 0.001,
                       'NEPOCH': 30}
    mm = mm_mod.ModelManager(hparams=param_overrides, res_name='lr_decay_small_lr.p')
    mm.train()
    mm.save_results()

    # Report every run, not just the last one. `val_acc_hist` is the attribute
    # name used by the cells of this notebook that have recorded output
    # (optimizer sweep, final model).
    print("lr_decay=%s: Final Validation Acc = %s" % (lr_decay, mm.val_acc_hist[-1]))

### The final massive model

In [None]:
# Final run: VOC_SIZE 1,000,000, NGRAM_SIZE 4 in 'spacy' mode, EMBEDDING_DIM 100,
# NEPOCH 50, LR 0.001 with LR_DECAY_RATE 0.95 and EARLY_STOP_LOOKBACK 32.
reload(mm_mod)
logger.setLevel(logging.INFO)  # INFO level: the recorded output shows per-step training logs
start_time = time.time()
param_overrides = {'NEPOCH': 50,
                   'LR': 0.001,
                   'LR_DECAY_RATE': 0.95,
                   'VOC_SIZE': 1000000,
                   'NGRAM_SIZE': 4,
                   'NGRAM_MODE':'spacy',
                   'EMBEDDING_DIM':100,
                   'EARLY_STOP_LOOKBACK': 32}
# NOTE(review): unlike the sweep cells, this cell never calls mm.save_results()
# even though res_name='experiment.p' is passed -- confirm whether that is intended.
mm = mm_mod.ModelManager(hparams=param_overrides, res_name='experiment.p')
mm.train()
print("Final Validation Acc = %s (train time: %.1fs)\n" % (mm.val_acc_hist[-1], 
                                                      time.time() - start_time))

INFO     initialized model with hyperparametrs:
INFO     LR: 0.001
INFO     LR_DECAY_RATE: 0.95
INFO     NEPOCH: 50
INFO     BATCH_SIZE: 32
INFO     NGRAM_SIZE: 4
INFO     VOC_SIZE: 1000000
INFO     EMBEDDING_DIM: 100
INFO     NGRAM_MODE: spacy
INFO     VAL_SIZE: 5000
INFO     OPTIMIZER: <class 'torch.optim.adam.Adam'>
INFO     VAL_FREQ: 4
INFO     REMOVE_STOP_WORDS: False
INFO     REMOVE_PUNC: True
INFO     EARLY_STOP: True
INFO     EARLY_STOP_LOOKBACK: 32
INFO     EARLY_STOP_MIN_IMPROVE: 0.01
INFO     allow pickle loads: True, allow pickle saves: True
INFO     Starting Training on device: cuda:0
INFO     looking for the following file paths: ./data/pickles/trainval_spacy_4_True_False.p
./data/pickles/test_spacy_4_True_False.p
./data/pickles/idx_spacy_4_True_False_5000_1000000.p
INFO     found pickle files in ./data/pickles/, loading them instead of rebuilding ... 
INFO     constructing ngram_indexer ...
INFO     indexer length 20000
INFO     final vocal size: 1000002
INFO     saving 

INFO     Ep: [18/50], Sp: [512/625], VAcc: 90.84, VLoss: 64.0, TAcc: 100.0, TLoss: 196.0, LR: 0.0004
INFO     Ep: [19/50], Sp: [128/625], VAcc: 90.78, VLoss: 63.9, TAcc: 100.0, TLoss: 196.0, LR: 0.0004
INFO     Ep: [19/50], Sp: [256/625], VAcc: 90.82, VLoss: 63.9, TAcc: 100.0, TLoss: 196.0, LR: 0.0004
INFO     Ep: [19/50], Sp: [384/625], VAcc: 90.78, VLoss: 63.9, TAcc: 100.0, TLoss: 196.0, LR: 0.0004
INFO     Ep: [19/50], Sp: [512/625], VAcc: 90.8, VLoss: 63.8, TAcc: 100.0, TLoss: 196.0, LR: 0.0004
INFO     Ep: [20/50], Sp: [128/625], VAcc: 90.82, VLoss: 63.8, TAcc: 100.0, TLoss: 196.0, LR: 0.0004
INFO     Ep: [20/50], Sp: [256/625], VAcc: 90.82, VLoss: 63.8, TAcc: 100.0, TLoss: 196.0, LR: 0.0004
INFO     Ep: [20/50], Sp: [384/625], VAcc: 90.82, VLoss: 63.8, TAcc: 100.0, TLoss: 196.0, LR: 0.0004
INFO     Ep: [20/50], Sp: [512/625], VAcc: 90.82, VLoss: 63.7, TAcc: 100.0, TLoss: 195.9, LR: 0.0004
INFO     Ep: [21/50], Sp: [128/625], VAcc: 90.8, VLoss: 63.7, TAcc: 100.0, TLoss: 195.9, LR:

In [None]:
# Serialize mm.model itself (the object, not a state_dict) to 'model_state.st'.
# The "Loading the model from state" cell reads this file back with torch.load.
torch.save(mm.model, r'model_state.st')

### Loading the model from state

In [17]:
# Read back the object written by torch.save(mm.model, 'model_state.st').
loaded_model = torch.load(r'model_state.st')

# Same overrides as the final-model training cell, used to rebuild the data
# pipeline that the restored model is evaluated on.
# NOTE(review): REMOVE_STOP_WORDS is not pinned in either cell. The recorded
# training log shows REMOVE_STOP_WORDS: False, while this cell's recorded log
# shows True and loads the *_True_True pickles, so the model may have been
# evaluated on differently preprocessed data than it was trained on -- confirm.
param_overrides = {'NEPOCH': 50,
                   'LR': 0.001,
                   'LR_DECAY_RATE': 0.95,
                   'VOC_SIZE': 1000000,
                   'NGRAM_SIZE': 4,
                   'NGRAM_MODE':'spacy',
                   'EMBEDDING_DIM':100,
                   'EARLY_STOP_LOOKBACK': 32}
mm_test = mm_mod.ModelManager(hparams=param_overrides)
mm_test.load_data()
mm_test.data_to_pipe()
# Swap the manager's model for the restored one before evaluating.
mm_test.model = loaded_model

# Display the first element of test_model()'s return value for the 'test'
# loader (87.2 in the recorded output).
mm_test.test_model(mm_test.loaders['test'])[0]

INFO     initialized model with hyperparametrs:
INFO     LR: 0.001
INFO     LR_DECAY_RATE: 0.95
INFO     NEPOCH: 50
INFO     BATCH_SIZE: 32
INFO     NGRAM_SIZE: 4
INFO     VOC_SIZE: 1000000
INFO     EMBEDDING_DIM: 100
INFO     NGRAM_MODE: spacy
INFO     VAL_SIZE: 5000
INFO     OPTIMIZER: <class 'torch.optim.adam.Adam'>
INFO     VAL_FREQ: 4
INFO     REMOVE_STOP_WORDS: True
INFO     REMOVE_PUNC: True
INFO     EARLY_STOP: True
INFO     EARLY_STOP_LOOKBACK: 32
INFO     EARLY_STOP_MIN_IMPROVE: 0.01
INFO     allow pickle loads: True, allow pickle saves: True
INFO     looking for the following file paths: ./data/pickles/trainval_spacy_4_True_True.p
./data/pickles/test_spacy_4_True_True.p
./data/pickles/idx_spacy_4_True_True_5000_1000000.p
INFO     found pickle files in ./data/pickles/, loading them instead of rebuilding ... 
INFO     found pickle files for indexer in ./data/pickles/, loading them ... 
INFO     setting each dataset's token indexes
INFO     setting each dataset's token indexes


87.2

time: 16.9 s


### recreating archives

In [None]:
import config_defaults as cd
import data_processor as dp
import ngrams
import pickle as pkl

In [14]:
# ModelManager with no hyperparameter overrides; its hparams drive the n-gram
# extraction and the pickle file name in the following cells.
mm = mm_mod.ModelManager()

INFO     initialized model with hyperparametrs:
INFO     LR: 0.01
INFO     LR_DECAY_RATE: 0.95
INFO     NEPOCH: 10
INFO     BATCH_SIZE: 32
INFO     NGRAM_SIZE: 2
INFO     VOC_SIZE: 100000
INFO     EMBEDDING_DIM: 50
INFO     NGRAM_MODE: spacy
INFO     VAL_SIZE: 5000
INFO     OPTIMIZER: <class 'torch.optim.adam.Adam'>
INFO     VAL_FREQ: 4
INFO     REMOVE_STOP_WORDS: True
INFO     REMOVE_PUNC: True
INFO     EARLY_STOP: True
INFO     EARLY_STOP_LOOKBACK: 8
INFO     EARLY_STOP_MIN_IMPROVE: 0.01
INFO     allow pickle loads: True, allow pickle saves: True
time: 23.7 ms


In [15]:
# Build the test dataset from cd.DIR_TEST; cd.TEST_SIZE is passed as the second argument.
test_set = dp.construct_dataset(cd.DIR_TEST, cd.TEST_SIZE)

time: 549 ms


In [None]:
# Extract n-grams from the test set using the current ModelManager's n-gram
# size, stop-word / punctuation flags and n-gram mode.
test_data = ngrams.extract_ngrams(test_set,
                                  mm.hparams['NGRAM_SIZE'],
                                  remove_stopwords=mm.hparams['REMOVE_STOP_WORDS'],
                                  remove_punc=mm.hparams['REMOVE_PUNC'],
                                  mode=mm.hparams['NGRAM_MODE'])

In [None]:
def hparam_to_str(hparams, req_params):
    """Build a pickle file name from selected hyperparameter values.

    Keys of ``hparams`` are visited in sorted order. For each key that is also
    in ``req_params``, the value is converted with ``str`` and has '.' replaced
    by 'p' and ':' replaced by '-'. The resulting pieces are joined with '_'
    and suffixed with '.p'. When no key matches, the result is just '.p'.
    """
    pieces = [
        str(hparams[name]).replace('.', 'p').replace(':', '-')
        for name in sorted(hparams)
        if name in req_params
    ]
    return '_'.join(pieces) + '.p'

# File name is 'test_' plus the data-related hyperparameters encoded by
# hparam_to_str, placed under cd.DIR_PICKLE.
pickle_path_test = cd.DIR_PICKLE + 'test_' + hparam_to_str(mm.hparams, cd.DATA_HPARAMS)
# Use a context manager so the file handle is flushed and closed as soon as
# the dump finishes, instead of being left open.
with open(pickle_path_test, "wb") as pickle_file:
    pkl.dump(test_data, pickle_file)

### Curves 