In [1]:
import sys
%load_ext autoreload
%autoreload 2
sys.path.append('..')
sys.path.append('../lib/')

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from lib import bert_model

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
%env CUDA_VISIBLE_DEVICES=1

# Number of target classes for each task, in task order.
TASKS_NUM_LABELS = [4, 358]

# Configuration dict handed to the model constructor.
params = dict(
    num_labels=TASKS_NUM_LABELS,
    # One list of stringified class indices ('0', '1', ...) per task.
    label_list=[
        list(map(str, range(n_classes)))
        for n_classes in TASKS_NUM_LABELS
    ],
    output_dir='../output',
    cache_dir='../model_cache',
    bert_model='bert-base-multilingual-uncased',
    max_seq_length=128,
    train_batch_size=32,
    eval_batch_size=8,
    learning_rate=2e-5,
    warmup_proportion=0.1,
    # NOTE(review): the model.fit call further down passes n_epochs=4,
    # which disagrees with this value -- confirm which one is intended.
    num_train_epochs=2,
    seed=1331,
    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

# Lower-casing is switched on exactly when the checkpoint name ends in 'uncased'.
params['lower_case'] = params['bert_model'].endswith('uncased')

env: CUDA_VISIBLE_DEVICES=1


## Preparing data

In [None]:
# Root folder of the datasets and the seed shared by both splits below.
DATA_DIR = '../datasets'
RANDOM_STATE = 3773

# Load the first task's data from a single CSV file.
first_df = pd.read_csv(
    os.path.join(DATA_DIR, 'data/df_first.csv'),
    encoding='utf-8',
)

# Step 1: hold out 15% of the rows as the test set.
# (Fixed: the original passed the undefined name `first_first_df`, which
# raises NameError on a fresh kernel.)
first_df_train_valid, first_df_test = train_test_split(
    first_df,
    test_size=0.15,
    random_state=RANDOM_STATE,
)
# Step 2: hold out 15% of the remaining rows as the validation set.
first_df_train, first_df_valid = train_test_split(
    first_df_train_valid,
    test_size=0.15,
    random_state=RANDOM_STATE,
)

In [None]:
# Load the second task's train / dev / test CSV files in one pass; the
# generator yields the frames in the same order as the names on the left.
second_df_train, second_df_valid, second_df_test = (
    pd.read_csv(os.path.join(DATA_DIR, relative_path), encoding='utf-8')
    for relative_path in (
        'data/second_train.csv',
        'data/second_dev.csv',
        'data/second_test.csv',
    )
)

In [5]:
def _per_task(column, first_frame, second_frame):
    """Return `[first_frame[column].values, second_frame[column].values]`."""
    return [first_frame[column].values, second_frame[column].values]


# Each X_* / y_* is a two-element list: first task, then second task.
X_train = _per_task('text', first_df_train, second_df_train)
y_train = _per_task('label_index', first_df_train, second_df_train)

X_valid = _per_task('text', first_df_valid, second_df_valid)
y_valid = _per_task('label_index', first_df_valid, second_df_valid)

X_test = _per_task('text', first_df_test, second_df_test)
y_test = _per_task('label_index', first_df_test, second_df_test)

# Report per-task and total sample counts for every split.
for heading, X_split in (
    ('Train sizes:', X_train),
    ('\nValid sizes:', X_valid),
    ('\nTest sizes:', X_test),
):
    split_sizes = [len(X_data) for X_data in X_split]
    print(heading)
    print(' '.join(map(str, split_sizes)))
    total_size = sum(split_sizes)
    print('Total size: {}\n'.format(total_size))

Train sizes:
4330 104724
Total size: 109054


Valid sizes:
765 26182
Total size: 26947


Test sizes:
900 32727
Total size: 33627



## Training BERT model

In [6]:
# Build the multi-task text classifier from the project's `bert_model` module,
# configured by the `params` dict defined above.
# NOTE(review): the recorded output suggests construction also downloads the
# BERT weights -- confirm against lib/bert_model.
model = bert_model.BertMultiTaskTextClassificationModel(params)

Downloading BERT...
Completed!


In [16]:
# Keyword settings for the training run, gathered in one place.
# NOTE(review): batch_size / n_epochs are hard-coded here; n_epochs=4 differs
# from params['num_train_epochs'] (2) -- confirm which value is intended.
fit_kwargs = dict(
    batch_size=32,
    n_epochs=4,
    validation_data=(X_valid, y_valid),
    best_model_output='model_multitask.pth',
)

# Train on both tasks; the bare `result` on the last line displays the
# returned summary as the cell output.
result = model.fit(X_train, y_train, **fit_kwargs)
result


Epoch: 1


Iteration: 100%|██████████| 3408/3408 [27:21<00:00,  2.08it/s]


***** Running evaluation *****


Predicting: 100%|██████████| 3369/3369 [02:19<00:00, 25.53it/s]


{'train_log_loss': 0.9166813825777439, 'eval_log_loss': [0.33383396507679863, 1.7398296592794598], 'eval_accuracy': [0.9098039215686274, 0.6221831792834772]}

Epoch: 2


Iteration: 100%|██████████| 3408/3408 [27:29<00:00,  2.05it/s]


***** Running evaluation *****


Predicting: 100%|██████████| 3369/3369 [02:10<00:00, 25.83it/s]


{'train_log_loss': 0.731878307538693, 'eval_log_loss': [0.369583423649021, 1.7333664538607485], 'eval_accuracy': [0.9006535947712418, 0.6234435871973111]}

Epoch: 3


Iteration: 100%|██████████| 3408/3408 [27:42<00:00,  2.07it/s]


***** Running evaluation *****


Predicting: 100%|██████████| 3369/3369 [02:11<00:00, 25.61it/s]


{'train_log_loss': 0.594994216180131, 'eval_log_loss': [0.411381732010122, 1.777980334077954], 'eval_accuracy': [0.8875816993464052, 0.6185547322588038]}

Epoch: 4


Iteration: 100%|██████████| 3408/3408 [27:03<00:00,  2.17it/s]


***** Running evaluation *****


Predicting: 100%|██████████| 3369/3369 [02:11<00:00, 25.66it/s]


{'train_log_loss': 0.4927876612735571, 'eval_log_loss': [0.4012502431896138, 1.8505008661895659], 'eval_accuracy': [0.8928104575163399, 0.6133985180658468]}


{'train_log_loss': 0.9166813825777439,
 'eval_log_loss': [0.33383396507679863, 1.7398296592794598],
 'eval_accuracy': [0.9098039215686274, 0.6221831792834772],
 'best_epoch': 1,
 'model_filepath': '../output/model_multitask.pth'}