In [1]:
# Load IPython's autoreload extension and use mode 2, which re-imports all
# modules before each cell runs so edits to the project sources are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import sys

# Relative path (from the notebook's working directory) that is put on
# sys.path so the `src.*` imports used by the following cells resolve.
root_path = '../../../'

# Guarded so that re-running this cell does not keep appending duplicates.
if root_path not in sys.path:
    sys.path.append(root_path)

In [3]:
import logging
import os
import numpy as np
import torch
import torch.optim as optim
from src.tc import utils
from collections import OrderedDict
from src.ner.encoder import CharEncoder, ClassEncoder, WordEncoder
from src.tc.model.net import CNNTC
from src.tc.data_loader import DataLoader

In [4]:
# Dataset directory and experiment (model) directory, both located under root_path.
data_dir, model_dir = (
    os.path.join(root_path, relative_dir)
    for relative_dir in ('src/tc/data/sst_binary',
                         'src/tc/experiments/sst_binary_testjupyter')
)

In [5]:
# 1. Choose the device to train on: CUDA when torch reports it available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [6]:
# 2. Read the experiment parameters from <model_dir>/params.json
network_params = os.path.join(model_dir, 'params.json')
# Fail early, with the offending path in the message, if the file is missing.
assert os.path.isfile(network_params), \
    "No json configuration file found at {}".format(network_params)
params = utils.Params(network_params)
# Record on the params object whether CUDA is available.
params.cuda = torch.cuda.is_available()

In [7]:
# 3. Fix the random seeds (NumPy and torch; torch's CUDA generator too when
#    params.cuda is set) so that experiments are reproducible.
np.random.seed(0)
torch.manual_seed(230)
if params.cuda:
    torch.cuda.manual_seed(230)

In [8]:
# 4. Configure the logger through utils.set_logger, handing it the path
#    <model_dir>/train.log
train_log_path = os.path.join(model_dir, 'train.log')
utils.set_logger(train_log_path)

In [9]:
# 5. Specify the features: one encoder per input feature (characters, words)
#    and one for the class labels, each built from <data_dir>/feats.
logging.info('loading encoders')
feats_dir = os.path.join(data_dir, 'feats')

# Ordered mappings keyed by each encoder's FEATURE_NAME.
data_encoder = OrderedDict([
    (CharEncoder.FEATURE_NAME, CharEncoder(feats_dir)),
    (WordEncoder.FEATURE_NAME, WordEncoder(feats_dir)),
])
# Alternative word-feature directory, kept for reference:
# data_encoder[WordEncoder.FEATURE_NAME] = WordEncoder(os.path.join('data/sst_binary', 'feats_w2v'))
label_encoder = OrderedDict([
    (ClassEncoder.FEATURE_NAME, ClassEncoder(feats_dir)),
])

loading encoders


In [10]:
# Imported once here instead of on every pass of the split loop below.
from src.ner.evaluation import accuracy_score
from src.tc.train import train_and_evaluate

# Run configuration for this cell.
k_fold = None               # falsy: one run on data_dir; integer N: one run per split_1 .. split_N
combine_train_dev = False   # True: merge val into train and evaluate on test in the first run
train_on_dev = False        # True: after the first run, retrain from scratch on train + val


def _merge_val_into_train(train_data, val_data):
    """Fold ``val_data`` into ``train_data`` in place.

    List-valued entries of ``train_data`` are extended with the matching entry
    of ``val_data`` and int-valued entries are summed; entries of any other
    type are left untouched.
    """
    for key, value in train_data.items():
        if isinstance(value, list):
            value.extend(val_data[key])
        elif isinstance(value, int):
            train_data[key] = value + val_data[key]


def _build_model_and_optimizer(params, data_encoder, label_encoder, device):
    """Return a new CNNTC model moved to ``device`` and an Adadelta optimizer.

    The optimizer only receives the parameters that have ``requires_grad`` set.
    """
    model = CNNTC(params=params,
                  char_vocab_length=data_encoder[CharEncoder.FEATURE_NAME].vocab_length,
                  num_tags=label_encoder[ClassEncoder.FEATURE_NAME].num_tags,
                  pretrained_word_vecs=torch.from_numpy(data_encoder[WordEncoder.FEATURE_NAME].vectors),
                  dropout=params.dropout,
                  decoder_type=params.decoder,
                  bidirectional=params.rnn_bidirectional,
                  freeze_embeddings=params.freeze_wordembeddings).to(device).float()
    optimizer = optim.Adadelta(params=filter(lambda p: p.requires_grad, model.parameters()),
                               rho=0.95)
    return model, optimizer


# create a data loader
data_loader = DataLoader(params, data_dir, data_encoder, label_encoder)
if k_fold:
    # each split will have a separate directory
    logging.info('K-Fold turned on with folds: {}'.format(k_fold))
    splits_dir = [os.path.join(data_dir, 'split_' + str(split_num))
                  for split_num in range(1, k_fold + 1)]
else:
    splits_dir = [data_dir]

# 6.2 define metrics (identical for every split)
metrics = {'accuracy': accuracy_score}

# Step 8 temporarily lowers params.num_epochs; remember the configured value so
# it can be restored and later splits (or a re-run of this cell) are unaffected.
configured_num_epochs = params.num_epochs

for split_dir in splits_dir:
    logging.info('training for: {}'.format(split_dir))
    # split_dir is used directly below; the notebook-level data_dir is NOT
    # reassigned, so re-running this cell starts from the same base directory.
    if k_fold:
        # for each split, make a new model directory for that particular split
        split_model_dir = os.path.join(model_dir, os.path.basename(split_dir))
        if not os.path.exists(split_model_dir):
            os.makedirs(split_model_dir)
    else:
        split_model_dir = model_dir

    # load the respective datasets
    data = data_loader.load_data(['train', 'val', 'test'], split_dir)
    train_data = data['train']
    val_data = data['val']
    test_data = data['test']

    # combine train and val data
    if combine_train_dev:
        logging.info('combining train and dev sets')
        _merge_val_into_train(train_data, val_data)

    # Once val has been merged into train, the held-out set for this run is test.
    eval_data = test_data if combine_train_dev else val_data

    # 5.3 specify the train and val dataset sizes
    params.train_size = train_data['size']
    # Keep val_size in step with the dataset actually passed as val_data, the
    # same way step 8 does when it evaluates on the test set.
    # NOTE(review): assumes train_and_evaluate reads params.val_size for the
    # evaluation set -- confirm against src.tc.train.
    params.val_size = eval_data['size']
    params.test_size = test_data['size']
    logging.info("- done.")
    logging.info('train size: {}'.format(params.train_size))
    logging.info('val size: {}'.format(val_data['size']))
    logging.info('test size: {}'.format(params.test_size))

    # 6. Modeling
    # 6.1 Define the model and the optimizer
    model, optimizer = _build_model_and_optimizer(params, data_encoder, label_encoder, device)

    # 7. Train the model
    logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
    end_epoch = train_and_evaluate(model=model,
                                   train_data=train_data,
                                   val_data=eval_data,
                                   data_loader=data_loader,
                                   optimizer=optimizer,
                                   metrics=metrics,
                                   params=params,
                                   model_dir=split_model_dir,
                                   data_encoder=data_encoder,
                                   label_encoder=label_encoder,
                                   restore_file=None,
                                   save_model='val',
                                   eval=True)

    # 8. Train on dev set if required for n_epochs where n_epochs is returned from training on train set
    if train_on_dev:
        logging.info('training on train and dev for {} epochs'.format(end_epoch + 1))
        # 8.1. reload the datasets and combine train and dev
        data = data_loader.load_data(['train', 'val', 'test'], split_dir)
        train_data = data['train']
        val_data = data['val']
        test_data = data['test']
        logging.info('combining train and dev sets')
        _merge_val_into_train(train_data, val_data)
        params.train_size = train_data['size']
        params.val_size = test_data['size']
        logging.info("- done.")
        logging.info('train size: {}'.format(params.train_size))
        logging.info('test size: {}'.format(params.val_size))

        # 8.2. delete old model and optimizer
        del model
        del optimizer

        # 8.3. construct a new model and optimizer
        model, optimizer = _build_model_and_optimizer(params, data_encoder, label_encoder, device)

        # 8.4. retrain for the number of epochs reached in step 7, saving under <split_model_dir>/final
        # NOTE(review): the original step description said "train without
        # evaluating", yet eval=True is passed; the value is left unchanged --
        # confirm which one is intended.
        final_model_dir = os.path.join(split_model_dir, 'final')
        if not os.path.exists(final_model_dir):
            os.makedirs(final_model_dir)
        params.num_epochs = end_epoch + 1
        try:
            train_and_evaluate(model=model,
                               train_data=train_data,
                               val_data=test_data,
                               data_loader=data_loader,
                               optimizer=optimizer,
                               metrics=metrics,
                               params=params,
                               model_dir=final_model_dir,
                               data_encoder=data_encoder,
                               label_encoder=label_encoder,
                               restore_file=None,
                               save_model='train',
                               eval=True)
        finally:
            # Put the configured epoch budget back even if training is
            # interrupted, so the next split does not inherit the shortened run.
            params.num_epochs = configured_num_epochs

    # Drop both objects before the next split: the optimizer keeps references to
    # the model's parameters, so deleting the model alone would not release them.
    del model, optimizer


training for: ../../../src/tc/data/sst_binary
- done.
train size: 83881
val size: 872
test size: 1821
Starting training for 1000 epoch(s)
Epoch 1/1000
100%|██████████████████████████████████████████████████████████████████| 1677/1677 [00:25<00:00, 65.31it/s, loss=0.401]
- Train metrics: accuracy: 0.808 ; loss: 0.401
- Eval metrics : accuracy: 0.806 ; loss: 0.449


Checkpoint Directory exists! 


- Found new best accuracy
Epoch 2/1000
100%|██████████████████████████████████████████████████████████████████| 1677/1677 [00:35<00:00, 47.68it/s, loss=0.310]
- Train metrics: accuracy: 0.857 ; loss: 0.310
- Eval metrics : accuracy: 0.814 ; loss: 0.438


Checkpoint Directory exists! 


KeyboardInterrupt: 