In [60]:
%load_ext autoreload
%autoreload 2

from common import *
from config import *
import utils
from models.model import*
from sklearn.metrics import f1_score, roc_auc_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
config = DefaultConfigs()


# 1. Select the visible GPUs.
#    (This step used to be labelled "set random seed", but no RNG is seeded in this cell.)
# NOTE(review): CUDA_VISIBLE_DEVICES only takes effect if it is set before the
# process initialises CUDA -- confirm none of the imports above touch the GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpus
# The variable was assigned on the line above, so the lookup cannot fail and
# no fallback branch is needed.
print('\t\tos[\'CUDA_VISIBLE_DEVICES\']     =', os.environ['CUDA_VISIBLE_DEVICES'])
NUM_CUDA_DEVICES = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
warnings.filterwarnings('ignore')

# 2. Output directories. makedirs(exist_ok=True) also creates missing parent
#    directories and avoids the check-then-create race of exists() + mkdir().
os.makedirs('../results', exist_ok=True)
os.makedirs(config.logs, exist_ok=True)

# 3. Training log: opened in append mode, so every run adds a new header block.
log = utils.Logger()
log.open('{0}{1}_log_train.txt'.format(config.logs, config.model_name),mode="a")

log.write("\n-------------------- [START %s] %s\n\n" % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '-' * 51))
log.write('                          |------ Train ------|------ Valid ------|----Best Results---|------------|\n')
log.write('mode    iter   epoch    lr|  loss    f1_macro |  loss    f1_macro |  loss    f1_macro | time       |\n')
log.write('----------------------------------------------------------------------------------------------------\n')

		os['CUDA_VISIBLE_DEVICES']     = 0

-------------------- [START 2019-01-18 18:53:52] ---------------------------------------------------

                          |------ Train ------|------ Valid ------|----Best Results---|------------|
mode    iter   epoch    lr|  loss    f1_macro |  loss    f1_macro |  loss    f1_macro | time       |
----------------------------------------------------------------------------------------------------


In [3]:
 # 4.1 mkdirs
# Build the per-fold weights directory once instead of repeating the expression.
fold_weights_dir = config.weights + config.model_name + os.sep + 'fold_' + str(config.fold)

# makedirs(exist_ok=True) is idempotent and creates missing parents, so every
# output directory is handled the same way (the previous mix of mkdir/makedirs
# failed when config.best_models or config.logs had a missing parent).
for out_dir in (config.submit, fold_weights_dir, config.best_models, config.logs):
    os.makedirs(out_dir, exist_ok=True)

# Register the tqdm progress hooks on pandas objects.
tqdm.pandas()

# Load and preprocess the data, timing the whole step.
start_time = time.time()
train_X, test_X, train_y, word_index = utils.load_and_prec(config)
# embedding_matrix_1 = load_glove(word_index)
# embedding_matrix_2 = load_para(word_index)

total_time = (time.time() - start_time) / 60
print("Took {:.2f} minutes".format(total_time))

  7%|▋         | 88310/1306122 [00:00<00:03, 319194.72it/s]

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


100%|██████████| 1306122/1306122 [00:01<00:00, 670783.91it/s]
100%|██████████| 56370/56370 [00:00<00:00, 594071.96it/s]
100%|██████████| 1306122/1306122 [00:53<00:00, 24206.32it/s]
100%|██████████| 56370/56370 [00:02<00:00, 24599.86it/s]
100%|██████████| 1306122/1306122 [00:16<00:00, 77024.20it/s]
100%|██████████| 56370/56370 [00:00<00:00, 77525.62it/s]
100%|██████████| 1306122/1306122 [00:18<00:00, 72054.69it/s]
100%|██████████| 56370/56370 [00:00<00:00, 70781.54it/s]


Tokenizing.......
Tokenizing Done!
Took 2.82 minutes


In [47]:
def train(train_loader,model,loss_fn, optimizer,epoch,valid_loss,start):
    """Run one optimisation pass over ``train_loader``.

    For every batch the model is run forward, the loss is back-propagated and
    the optimizer takes one step. A single-line progress message is reprinted
    after each batch.

    Args:
        train_loader: iterable yielding ``(inputs, targets)`` batches; must support ``len()``.
        model: network to optimise; switched to training mode here.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        optimizer: optimizer that updates the model's parameters.
        epoch: index of the current epoch (used only in the progress message).
        valid_loss: most recent validation loss (used only in the progress message).
        start: reference time subtracted from ``timer()`` for the elapsed-time column.

    Returns:
        The running average kept by ``utils.AverageMeter`` after the last batch.
    """
    meter = utils.AverageMeter()
    model.train()

    for step, (inputs, targets) in enumerate(train_loader):
        # forward pass
        batch_loss = loss_fn(model(inputs), targets)

        # backward pass and parameter update
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        meter.update(batch_loss.item(), inputs.size(0))

        # carriage return first, so the new message overwrites the previous one
        print('\r', end='', flush=True)
        progress = '%s %5.1f %6.1f        |  %0.3f  |   %0.3f   | %s' % (
            "train",
            step / len(train_loader) + epoch,
            epoch,
            meter.avg,
            valid_loss,
            utils.time_to_str((timer() - start), 'min'),
        )
        print(progress, end='', flush=True)

    log.write("\n")
    return meter.avg


# 2. evaluate function
def evaluate(val_loader,model,loss_fn,epoch,train_loss,start_time):
    """Score ``model`` on ``val_loader`` and return the loss and probabilities.

    The raw model outputs of all batches are gathered and the loss is computed
    once over the whole validation set.

    Args:
        val_loader: iterable yielding ``(inputs, targets)`` batches.
        model: network to evaluate; moved to the GPU and put in eval mode here.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        epoch: index of the current epoch (used only in the progress message).
        train_loss: training loss of this epoch (used only in the progress message).
        start_time: reference time subtracted from ``timer()`` for the elapsed-time column.

    Returns:
        A ``(loss, probabilities)`` tuple: the value tracked by
        ``utils.AverageMeter`` and a 1-D numpy array holding the sigmoid of
        column 0 of the concatenated model outputs.

    Raises:
        ValueError: if ``val_loader`` yields no batches.
    """
    losses = utils.AverageMeter()
    # switch mode for evaluation
    model.cuda()
    model.eval()

    batch_outputs = []
    batch_targets = []
    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            batch_outputs.append(model(x_batch))
            batch_targets.append(y_batch)

        if not batch_outputs:
            raise ValueError("val_loader produced no batches; nothing to evaluate")

        # Concatenate once at the end rather than re-copying the growing tensor
        # on every batch.
        total_output = torch.cat(batch_outputs, 0)
        total_target = torch.cat(batch_targets, 0)

        # compute loss for the entire evaluation dataset
        # NOTE(review): with a sum-reduced loss this value grows with the size of
        # the validation set, so it is not on the same scale as the per-batch
        # training loss printed next to it -- confirm this is intended.
        val_loss = loss_fn(total_output, total_target)
        losses.update(val_loss.item(),total_target.shape[0])

        print('\r', end='', flush=True)
        message = '%s %5.1f %6.1f        |  %0.3f  |   %0.3f   | %s' % ( \
            "val", epoch, epoch,
            train_loss,
            losses.avg,
            utils.time_to_str((timer() - start_time), 'min'))
        print(message, end='', flush=True)

        log.write("\n")

        # torch.sigmoid keeps the function self-contained; it no longer depends
        # on a module-level `sigmoid` object defined in another cell.
        probabilities = torch.sigmoid(total_output).cpu().numpy()[:, 0]

    return losses.avg, probabilities

# 3. test model on public dataset and save the probability matrix
def test(test_loader,model):
    model.cuda()
    model.eval()
    predictions = []
    sigmoid = nn.Sigmoid()
    with torch.no_grad():
        for i, (x_batch,) in enumerate(test_loader):
            y_pred = model(x_batch)
            y_preds = sigmoid(y_pred).cpu().data.numpy()[:, 0]
            for y_pred in y_preds:
                predictions.append(y_pred)
    return np.array(predictions)


In [14]:

# -------------------------------------------------------
# training
# -------------------------------------------------------
# One slot per training row (filled from each fold's validation predictions)
# and one slot per test row (fold predictions are averaged into it).
train_preds = np.zeros((len(train_X)))
test_preds = np.zeros((len(test_X)))

# The test set is moved to the GPU once and reused by every fold.
x_test_cuda = torch.tensor(test_X, dtype=torch.long).cuda()
test_dataset = torch.utils.data.TensorDataset(x_test_cuda)
# Use the configured batch size: batch_size=1 meant one forward pass per test
# row, which made inference needlessly slow.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)

# Materialise the stratified folds so len(splits) can be used when averaging.
splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(train_X, train_y))

# Module-level sigmoid object; kept because code in other cells may read it.
sigmoid = nn.Sigmoid()
# Summed (not averaged) binary cross-entropy on raw model outputs.
loss_fn = torch.nn.BCEWithLogitsLoss(reduction="sum")

# k-fold: train a fresh model per fold, record its validation predictions and
# add its share of the test predictions.
for fold, (train_idx, valid_idx) in enumerate(splits):
    print(f'Fold {fold + 1}')

    # tflogger
    # NOTE(review): this path is relative to the working directory
    # ('results/TFlogs/...'), whereas the setup cell creates '../results' --
    # confirm which location is intended.
    tflogger = utils.TFLogger(os.path.join('results', 'TFlogs',
                                     config.model_name + "_fold{0}_{1}".format(config.fold, fold)))
    # initialize the early_stopping object
    early_stopping = utils.EarlyStopping(patience=7, verbose=True)

    # The fold's tensors live on the GPU for the whole fold.
    x_train_fold = torch.tensor(train_X[train_idx], dtype=torch.long).cuda()
    y_train_fold = torch.tensor(train_y[train_idx, np.newaxis], dtype=torch.float32).cuda()
    x_val_fold = torch.tensor(train_X[valid_idx], dtype=torch.long).cuda()
    y_val_fold = torch.tensor(train_y[valid_idx, np.newaxis], dtype=torch.float32).cuda()

    model = Baseline_Bidir_LSTM_GRU(config, word_index)
    model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

    train_dataset = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid_dataset = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=config.batch_size, shuffle=False)

    # No validation has run yet, so the first training epoch displays inf.
    valid_loss = np.inf
    start_time = timer()
    for epoch in range(config.epochs):
        # train
        lr = utils.get_learning_rate(optimizer)
        train_loss = train(train_loader=train_loader,model=model,loss_fn=loss_fn, optimizer=optimizer,
                           epoch=epoch,valid_loss=valid_loss,start=start_time)

        # validate
        valid_loss, valid_output = evaluate(val_loader=valid_loader, model=model, loss_fn=loss_fn, epoch=epoch,
                                            train_loss=train_loss, start_time=start_time)

        # save model
        utils.save_checkpoint({
            "epoch": epoch,
            "model_name": config.model_name,
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "fold": config.fold,
            # CV split index; this entry previously duplicated config.fold.
            "kfold": fold,
        },config.fold, fold, config)
        # print logs
        print('\r', end='', flush=True)

        log.write("\n")
        time.sleep(0.01)

        # ================================================================== #
        #                        Tensorboard Logging                         #
        # ================================================================== #

        # 1. Log scalar values (scalar summary)
        # NOTE(review): the 'Learnging_rate' tag is misspelled; it is left as is
        # so new runs keep the same tag as earlier logs.
        info = {'Train_loss': train_loss,
                'Valid_loss': valid_loss,
                'Learnging_rate': lr}

        for tag, value in info.items():
            tflogger.scalar_summary(tag, value, epoch)

        # 2. Log values and gradients of the parameters (histogram summary)
        for tag, value in model.named_parameters():
            tag = tag.replace('.', '/')
            tflogger.histo_summary(tag, value.data.cpu().numpy(), epoch)
            if value.grad is not None:
                tflogger.histo_summary(tag + '/grad', value.grad.data.cpu().numpy(), epoch)
        # -------------------------------------
        # end tflogger

        # ================================================================== #
        #                        Early stopping                              #
        # ================================================================== #
        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        early_stopping(valid_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

    # end looping all epochs
    # NOTE(review): valid_output and the test predictions below come from the
    # last epoch that ran, not necessarily the epoch with the lowest validation
    # loss -- confirm whether the early-stopping checkpoint should be reloaded.
    train_preds[valid_idx] = valid_output
    # test
    test_preds_fold = test(test_loader=test_loader, model=model)
    test_preds += test_preds_fold / len(splits)
    # end k-fold

Fold 1
Start loading embedding....................
Embedding matrix shape: (120000, 300)
train   1.0    0.0        |  290.292  |   inf   |  0 hr 02 minn
val   0.0    0.0        |  290.292  |   34084.180   |  0 hr 02 min

train   2.0    1.0        |  263.108  |   34084.180   |  0 hr 04 min
val   1.0    1.0        |  263.108  |   37739.531   |  0 hr 04 min

EarlyStopping counter: 1 out of 7
train   3.0    2.0        |  258.640  |   37739.531   |  0 hr 07 min
val   2.0    2.0        |  258.640  |   33526.477   |  0 hr 07 min

Validation loss decreased (inf --> 33526.476562) ...
train   4.0    3.0        |  255.557  |   33526.477   |  0 hr 09 min
val   3.0    3.0        |  255.557  |   33423.453   |  0 hr 09 min

Validation loss decreased (33526.476562 --> 33423.453125) ...
train   5.0    4.0        |  253.059  |   33423.453   |  0 hr 12 min
val   4.0    4.0        |  253.059  |   31765.967   |  0 hr 12 min

Validation loss decreased (33423.453125 --> 31765.966797) ...
train   6.0    5.0  

TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [48]:
# Rebuild the test loader with the configured batch size and run inference.
# NOTE(review): this relies on `model` and `test_dataset` left in the kernel by
# the training cell, and the result is not added to `test_preds` here.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=config.batch_size,
    shuffle=False,
)
test_preds_fold = test(test_loader, model)

In [56]:
def threshold_search(y_true, y_proba):
    """Grid-search the probability cut-off that gives the highest F1 score.

    Cut-offs 0.00, 0.01, ..., 0.99 are tried in increasing order; a sample is
    predicted positive when its probability is strictly greater than the
    cut-off. On ties the lowest cut-off wins.

    Args:
        y_true: ground-truth binary labels.
        y_proba: predicted probabilities; must support ``y_proba > float``.

    Returns:
        dict with keys ``'threshold'`` and ``'f1'`` (both 0 when no cut-off
        scores above zero).
    """
    candidates = [step * 0.01 for step in range(100)]
    search_result = {'threshold': 0, 'f1': 0}
    for cutoff in tqdm(candidates):
        candidate_f1 = f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        if candidate_f1 > search_result['f1']:
            search_result['threshold'] = cutoff
            search_result['f1'] = candidate_f1
    return search_result

In [61]:
# Tune the decision threshold on the validation predictions stored in train_preds.
search_result = threshold_search(y_true=train_y, y_proba=train_preds)
search_result

100%|██████████| 100/100 [00:22<00:00,  4.41it/s]


{'threshold': 0.02, 'f1': 0.25143919619551613}

In [None]:
# Binarise the averaged test probabilities with the tuned threshold and write
# the submission file.
sub = pd.read_csv('../input/sample_submission.csv')
# Cast to int so the CSV contains 0/1 rather than the strings True/False, and
# use item assignment so the column is set even if it is not already present.
sub['prediction'] = (test_preds > search_result['threshold']).astype(int)
sub.to_csv("submission.csv", index=False)