In [1]:
from transformers import BertTokenizer, BertPreTrainedModel, AdamW, BertConfig, BertModel
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import time, random
import numpy as np
from transformers import get_linear_schedule_with_warmup, get_constant_schedule_with_warmup
import torch.nn as nn
import datetime

I1027 11:02:30.675512 140119431956288 file_utils.py:39] PyTorch version 1.4.0 available.
Using TensorFlow backend.


In [2]:
train_inputs = torch.load('/notebook/BERT/tensor_data/ipc_train_input256.pt')
train_masks = torch.load('/notebook/BERT/tensor_data/ipc_train_mask256.pt')
train_labels = torch.load('./tensor_data/train_label.pt')
test_inputs = torch.load('/notebook/BERT/tensor_data/valid_input.pt')
test_masks = torch.load('/notebook/BERT/tensor_data/valid_mask.pt')
test_labels = torch.load('./tensor_data/valid_labels.pt')

In [2]:
train_inputs = torch.load('/notebook/BERT/tensor_data/ipc_train_inputs512.pt')
train_masks = torch.load('/notebook/BERT/tensor_data/ipc_train_masks512.pt')
train_labels = torch.load('./tensor_data/train_label.pt')
test_inputs = torch.load('/notebook/BERT/tensor_data/valid_input.pt')
test_masks = torch.load('/notebook/BERT/tensor_data/valid_mask.pt')
test_labels = torch.load('./tensor_data/valid_labels.pt')

In [3]:
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_data_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_data_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [4]:
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-PCIE-32GB


In [5]:
label_emb=np.zeros((1383,768))

with open('/notebook/LAHA/label768_wv', 'r') as f:
    for index, i in enumerate(f.readlines()):
        if index==0:
            continue
        i = i.rstrip('\n')
        n = i.split(' ')[0]
        content = i.split(' ')[1:]
        label_emb[int(n)] = [float(value) for value in content]

label_emb = torch.from_numpy(label_emb).float()

In [6]:
class LabelAttention(nn.Module):
    def __init__(self, hidden_size, labels_num, label_emb):
        super(LabelAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_labels = labels_num
        self.label_emb = label_emb

        label_embedding = torch.FloatTensor(self.num_labels,self.hidden_size)

        if self.label_emb is None:
            nn.init.xavier_normal_(label_embedding)
        else:
            label_embedding.copy_(self.label_emb)
        
        self.label_embedding = nn.Parameter(label_embedding,requires_grad=False)
        
        self.key_layer = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        nn.init.xavier_uniform_(self.key_layer.weight)
        
        self.query_layer = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        nn.init.xavier_uniform_(self.query_layer.weight)
        
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, inputs, masks):
        
        attn_key = self.key_layer(inputs).transpose(1,2)
        
        label_emb = self.label_embedding.expand((attn_key.size(0),self.label_embedding.size(0),self.label_embedding.size(1)))
        attn_query = self.query_layer(label_emb)
        
        attention = torch.bmm(label_emb, attn_key).masked_fill(~masks, -np.inf) # label, seq
        attention = F.softmax(attention, -1)
        
        return self.dropout(torch.bmm(attention, inputs)) # label, hidden

In [7]:
class SelfAttention(nn.Module):
    def __init__(self, hidden_size, labels_num):
        super(SelfAttention, self).__init__()
        self.hidden_size = hidden_size
        self.labels_num = labels_num
        
        self.attention = nn.Linear(self.hidden_size, self.labels_num, bias=False)
        nn.init.xavier_uniform_(self.attention.weight)
        
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, inputs, masks):
        attention = self.attention(inputs).transpose(1,2).masked_fill(~masks, -np.inf)
        attention = F.softmax(attention, -1)
        
        return self.dropout(torch.bmm(attention, inputs))

In [8]:
class MLinear(nn.Module):
    def __init__(self, hidden_size):
        super(MLinear, self).__init__()
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(0.5)
        
        self.linear_weight1 = nn.Linear(self.hidden_size,1)
        nn.init.xavier_uniform_(self.linear_weight1.weight)
        
        self.linear_weight2 = nn.Linear(self.hidden_size,1)
        nn.init.xavier_uniform_(self.linear_weight2.weight)
        
        self.out_linear = nn.Linear(self.hidden_size, 1, bias=False)
        
    def forward(self, self_attn, label_attn):
        factor1 = torch.sigmoid(self.linear_weight1(self_attn))
        factor2 = torch.sigmoid(self.linear_weight2(label_attn))
        factor1 = factor1 / (factor1+factor2)
        factor2 = 1 - factor1
        
        out = factor1 * self_attn + factor2 * label_attn
        
        return torch.squeeze(self.out_linear(self.dropout(out)), -1)

In [9]:
class BertForMultiLabelSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, label_emb):
        super(BertForMultiLabelSequenceClassification, self).__init__(config, label_emb=None)
        self.num_labels = config.num_labels
        self.hidden_size = config.hidden_size
        self.label_emb = label_emb
        
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.1)
        
        self.self_attn = SelfAttention(self.hidden_size, self.num_labels)
        self.label_attn = LabelAttention(self.hidden_size, self.num_labels, self.label_emb)
        self.linear = MLinear(self.hidden_size)
        
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        sequence, _ = self.bert(input_ids, attention_mask)
        sequence = self.dropout(sequence) # [batch, sequence, hidden_size]
        
        masks = attention_mask  != 0 # [batch, sequence]
        masks = torch.unsqueeze(masks, 1) # [batch, 1, sequence]
        
        self_attn = self.self_attn(sequence, masks)
        label_attn = self.label_attn(sequence, masks)
        
        return self.linear(self_attn, label_attn)

    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True

In [11]:
model = BertForMultiLabelSequenceClassification.from_pretrained('./For_paper/bce_laha_7e_v6', cache_dir=None, num_labels=1383, label_emb=label_emb)

I1027 11:08:37.090913 140119431956288 configuration_utils.py:283] loading configuration file ./For_paper/bce_laha_7e_v6/config.json
I1027 11:08:37.099525 140119431956288 configuration_utils.py:321] Model config BertConfig {
  "architectures": [
    "BertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",

In [12]:
model = model.cuda()

In [12]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduce=True, smooth=0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce
        self.smooth = smooth

    def forward(self, inputs, targets):
        
        if self.smooth != 0:
            targets = (1-self.smooth) * targets + self.smooth / inputs.size(1)
            
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
        pt = torch.exp(-BCE_loss)
        focal_term = (1-pt).pow(self.gamma)
        normalize = 1/focal_term.mean()
        F_loss = normalize * self.alpha * focal_term * BCE_loss

        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

In [13]:
from apex import amp

EPOCHS = 8

optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False, eps=1e-8, weight_decay=0.01)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1") # AMP 적용을 위한 코드

total_step = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=total_step/10,
    num_training_steps=total_step
)

loss_fn = nn.BCEWithLogitsLoss().to(device)
#loss_fn = FocalLoss(alpha=0.25).to(device)
#loss_fn = DataParallelCriterion(loss_fn)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [14]:
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

def predict_step(model, batch, k:int, loss_fn):
    model.eval()
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, input_labels = batch
    with torch.no_grad():
        loss = model(input_ids, token_type_ids=None, attention_mask=input_mask)
        scores, labels = torch.topk(loss, 5)
        loss = loss_fn(loss, input_labels.float())
        return torch.sigmoid(scores).cpu(), labels.cpu(), loss.cpu()

def get_p_5(predict, target, top):
    prediction = []
    for index_list in predict:
        predicts = [0]*target.shape[1]
        for index in index_list[:top]:
            predicts[index] = 1
        prediction.append(predicts)
    prediction = np.array(prediction)
    target = np.array(target)
    return np.sum(np.multiply(prediction,target))/(top*target.shape[0])

In [15]:
import time, random
import numpy as np
import logzero
logzero.setup_default_logger(logfile='./For_paper/bce_laha_7e_v6.log')
from logzero import logger
import os
import tqdm
import torch.nn.functional as F

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


model.zero_grad()
best_p5, check_step, best_loss = 0, 0, 1.0
        
for epoch_i in range(0, EPOCHS):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')
    
    logger.info("")
    logger.info('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    logger.info('Training...')
    
    t0 = time.time()
    
    total_loss = 0

    for step, batch in enumerate(train_data_loader):    
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, input_labels = batch
        model.train()
        logit = model(input_ids, token_type_ids=None, attention_mask=input_mask)
        loss = loss_fn(logit, input_labels.float())
        optimizer.zero_grad()
        with amp.scale_loss(loss, optimizer) as scaled_loss: # AMP 적용을 위한 코드
            scaled_loss.backward()
            #loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()


        if step % 1000 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            check_val_loss, valid_step = 0, 0
            p1, p3, p5 = 0.0, 0.0, 0.0
            for batch in test_data_loader:
                _, labels, val_loss = predict_step(model, batch, 5, loss_fn)
                targets = batch[2].cpu()
                check_val_loss += val_loss.item()
                valid_step += 1

                p1 += get_p_5(labels, targets, 1)
                p3 += get_p_5(labels, targets, 3)
                p5 += get_p_5(labels, targets, 5)

            if  best_p5 < p5:
                path = './For_paper/bce_laha_7e_v6_512'
                if os.path.exists(path):
                    model.save_pretrained(path)
                else:
                    os.mkdir(path)
                    model.save_pretrained(path)
                best_p5 = p5
                check_step = 0
            else:
                check_step += 1
                if check_step >= 25:
                    break

            print("{:>2} in {:>6}     train loss: {:.5f}     valid loss: {:5f}     p1 : {:.5f}     p3 : {:.5f}     p5 : {:.5f}     check_step : {:>2}".format(epoch_i, train_data_loader.batch_size*step, loss.item(), check_val_loss/valid_step, p1/valid_step, p3/valid_step, p5/valid_step, check_step))
            logger.info("{:>2} in {:>6}     train loss: {:.5f}     valid loss: {:5f}     p1 : {:.5f}     p3 : {:.5f}     p5 : {:.5f}     check_step : {:>2}".format(epoch_i, train_data_loader.batch_size*step, loss.item(), check_val_loss/valid_step, p1/valid_step, p3/valid_step, p5/valid_step, check_step))


        del loss, batch, input_ids, input_mask, input_labels


    avg_train_loss = total_loss / len(train_data_loader)
    print("Average training loss: {0:.5f}".format(avg_train_loss))
    print("Training epcoh took: {:}".format(format_time(time.time() - t0)))
    logger.info("")
    logger.info("Average training loss: {0:.5f}".format(avg_train_loss))
    logger.info("Training epcoh took: {:}".format(format_time(time.time() - t0)))

    check_val_loss, valid_step = 0, 0
    p1, p3, p5 = 0.0, 0.0, 0.0
    for batch in test_data_loader:
        _, labels, val_loss = predict_step(model, batch, 5, loss_fn)
        targets = batch[2].cpu()
        check_val_loss += val_loss.item()
        valid_step += 1

        p1 += get_p_5(labels, targets, 1)
        p3 += get_p_5(labels, targets, 3)
        p5 += get_p_5(labels, targets, 5)


    avg_valid_loss = check_val_loss/valid_step

    if  best_p5 < p5:
        path = './For_paper/bce_laha_7e_v6_512'
        if os.path.exists(path):
            model.save_pretrained(path)
        else:
            os.mkdir(path)
            model.save_pretrained(path)
        best_loss = avg_valid_loss
        best_p5 = p5
        check_step = 0
    else:
        check_step += 1
        if check_step >= 25:
            break

    print("{:>2}     valid loss: {:5f}     p1 : {:.5f}     p3 : {:.5f}     p5 : {:.5f}     check_step : {:>2}".format(epoch_i, check_val_loss/valid_step, p1/valid_step, p3/valid_step, p5/valid_step, check_step))
    logger.info("")
    logger.info("{:>2}     valid loss: {:5f}     p1 : {:.5f}     p3 : {:.5f}     p5 : {:.5f}     check_step : {:>2}".format(epoch_i, check_val_loss/valid_step, p1/valid_step, p3/valid_step, p5/valid_step, check_step))

    t0 = time.time()

    del val_loss, check_val_loss, avg_valid_loss, avg_train_loss, total_loss

[I 201027 11:08:50 <ipython-input-15-e3bbe2c196e8>:25] 
[I 201027 11:08:50 <ipython-input-15-e3bbe2c196e8>:27] Training...



Training...


I1027 11:20:32.702209 140119431956288 configuration_utils.py:144] Configuration saved in ./For_paper/bce_laha_7e_v6_512/config.json
I1027 11:20:33.161027 140119431956288 modeling_utils.py:450] Model weights saved in ./For_paper/bce_laha_7e_v6_512/pytorch_model.bin
[I 201027 11:20:33 <ipython-input-15-e3bbe2c196e8>:79]  0 in  32000     train loss: 0.00149     valid loss: 0.002366     p1 : 0.72821     p3 : 0.36187     p5 : 0.23755     check_step :  0


 0 in  32000     train loss: 0.00149     valid loss: 0.002366     p1 : 0.72821     p3 : 0.36187     p5 : 0.23755     check_step :  0


I1027 11:32:14.339190 140119431956288 configuration_utils.py:144] Configuration saved in ./For_paper/bce_laha_7e_v6_512/config.json
I1027 11:32:16.851839 140119431956288 modeling_utils.py:450] Model weights saved in ./For_paper/bce_laha_7e_v6_512/pytorch_model.bin
[I 201027 11:32:16 <ipython-input-15-e3bbe2c196e8>:79]  0 in  64000     train loss: 0.00072     valid loss: 0.002396     p1 : 0.72925     p3 : 0.36230     p5 : 0.23770     check_step :  0


 0 in  64000     train loss: 0.00072     valid loss: 0.002396     p1 : 0.72925     p3 : 0.36230     p5 : 0.23770     check_step :  0


I1027 11:43:58.396429 140119431956288 configuration_utils.py:144] Configuration saved in ./For_paper/bce_laha_7e_v6_512/config.json
I1027 11:44:00.406079 140119431956288 modeling_utils.py:450] Model weights saved in ./For_paper/bce_laha_7e_v6_512/pytorch_model.bin
[I 201027 11:44:00 <ipython-input-15-e3bbe2c196e8>:79]  0 in  96000     train loss: 0.00132     valid loss: 0.002413     p1 : 0.73093     p3 : 0.36264     p5 : 0.23784     check_step :  0


 0 in  96000     train loss: 0.00132     valid loss: 0.002413     p1 : 0.73093     p3 : 0.36264     p5 : 0.23784     check_step :  0


I1027 11:55:41.130567 140119431956288 configuration_utils.py:144] Configuration saved in ./For_paper/bce_laha_7e_v6_512/config.json
I1027 11:55:43.598519 140119431956288 modeling_utils.py:450] Model weights saved in ./For_paper/bce_laha_7e_v6_512/pytorch_model.bin
[I 201027 11:55:43 <ipython-input-15-e3bbe2c196e8>:79]  0 in 128000     train loss: 0.00136     valid loss: 0.002421     p1 : 0.73175     p3 : 0.36280     p5 : 0.23808     check_step :  0


 0 in 128000     train loss: 0.00136     valid loss: 0.002421     p1 : 0.73175     p3 : 0.36280     p5 : 0.23808     check_step :  0


[I 201027 12:07:25 <ipython-input-15-e3bbe2c196e8>:79]  0 in 160000     train loss: 0.00105     valid loss: 0.002445     p1 : 0.73192     p3 : 0.36287     p5 : 0.23799     check_step :  1


 0 in 160000     train loss: 0.00105     valid loss: 0.002445     p1 : 0.73192     p3 : 0.36287     p5 : 0.23799     check_step :  1


I1027 12:19:06.740440 140119431956288 configuration_utils.py:144] Configuration saved in ./For_paper/bce_laha_7e_v6_512/config.json
I1027 12:19:08.755506 140119431956288 modeling_utils.py:450] Model weights saved in ./For_paper/bce_laha_7e_v6_512/pytorch_model.bin
[I 201027 12:19:08 <ipython-input-15-e3bbe2c196e8>:79]  0 in 192000     train loss: 0.00109     valid loss: 0.002439     p1 : 0.73256     p3 : 0.36324     p5 : 0.23821     check_step :  0


 0 in 192000     train loss: 0.00109     valid loss: 0.002439     p1 : 0.73256     p3 : 0.36324     p5 : 0.23821     check_step :  0


I1027 12:30:49.440091 140119431956288 configuration_utils.py:144] Configuration saved in ./For_paper/bce_laha_7e_v6_512/config.json
I1027 12:30:51.872960 140119431956288 modeling_utils.py:450] Model weights saved in ./For_paper/bce_laha_7e_v6_512/pytorch_model.bin
[I 201027 12:30:51 <ipython-input-15-e3bbe2c196e8>:79]  0 in 224000     train loss: 0.00101     valid loss: 0.002455     p1 : 0.73355     p3 : 0.36354     p5 : 0.23824     check_step :  0


 0 in 224000     train loss: 0.00101     valid loss: 0.002455     p1 : 0.73355     p3 : 0.36354     p5 : 0.23824     check_step :  0


[I 201027 12:42:34 <ipython-input-15-e3bbe2c196e8>:79]  0 in 256000     train loss: 0.00107     valid loss: 0.002481     p1 : 0.73320     p3 : 0.36328     p5 : 0.23810     check_step :  1


 0 in 256000     train loss: 0.00107     valid loss: 0.002481     p1 : 0.73320     p3 : 0.36328     p5 : 0.23810     check_step :  1


I1027 12:54:14.705235 140119431956288 configuration_utils.py:144] Configuration saved in ./For_paper/bce_laha_7e_v6_512/config.json
I1027 12:54:16.694710 140119431956288 modeling_utils.py:450] Model weights saved in ./For_paper/bce_laha_7e_v6_512/pytorch_model.bin
[I 201027 12:54:16 <ipython-input-15-e3bbe2c196e8>:79]  0 in 288000     train loss: 0.00112     valid loss: 0.002473     p1 : 0.73442     p3 : 0.36346     p5 : 0.23830     check_step :  0


 0 in 288000     train loss: 0.00112     valid loss: 0.002473     p1 : 0.73442     p3 : 0.36346     p5 : 0.23830     check_step :  0


[I 201027 13:05:57 <ipython-input-15-e3bbe2c196e8>:79]  0 in 320000     train loss: 0.00114     valid loss: 0.002484     p1 : 0.73514     p3 : 0.36332     p5 : 0.23816     check_step :  1


 0 in 320000     train loss: 0.00114     valid loss: 0.002484     p1 : 0.73514     p3 : 0.36332     p5 : 0.23816     check_step :  1


I1027 13:17:38.127850 140119431956288 configuration_utils.py:144] Configuration saved in ./For_paper/bce_laha_7e_v6_512/config.json
I1027 13:17:40.337542 140119431956288 modeling_utils.py:450] Model weights saved in ./For_paper/bce_laha_7e_v6_512/pytorch_model.bin
[I 201027 13:17:40 <ipython-input-15-e3bbe2c196e8>:79]  0 in 352000     train loss: 0.00086     valid loss: 0.002519     p1 : 0.73482     p3 : 0.36380     p5 : 0.23833     check_step :  0


 0 in 352000     train loss: 0.00086     valid loss: 0.002519     p1 : 0.73482     p3 : 0.36380     p5 : 0.23833     check_step :  0


I1027 13:29:21.628840 140119431956288 configuration_utils.py:144] Configuration saved in ./For_paper/bce_laha_7e_v6_512/config.json
I1027 13:29:23.536650 140119431956288 modeling_utils.py:450] Model weights saved in ./For_paper/bce_laha_7e_v6_512/pytorch_model.bin
[I 201027 13:29:23 <ipython-input-15-e3bbe2c196e8>:79]  0 in 384000     train loss: 0.00081     valid loss: 0.002513     p1 : 0.73541     p3 : 0.36391     p5 : 0.23844     check_step :  0


 0 in 384000     train loss: 0.00081     valid loss: 0.002513     p1 : 0.73541     p3 : 0.36391     p5 : 0.23844     check_step :  0


I1027 13:41:05.238841 140119431956288 configuration_utils.py:144] Configuration saved in ./For_paper/bce_laha_7e_v6_512/config.json
I1027 13:41:07.515519 140119431956288 modeling_utils.py:450] Model weights saved in ./For_paper/bce_laha_7e_v6_512/pytorch_model.bin
[I 201027 13:41:07 <ipython-input-15-e3bbe2c196e8>:79]  0 in 416000     train loss: 0.00073     valid loss: 0.002513     p1 : 0.73586     p3 : 0.36397     p5 : 0.23851     check_step :  0


 0 in 416000     train loss: 0.00073     valid loss: 0.002513     p1 : 0.73586     p3 : 0.36397     p5 : 0.23851     check_step :  0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2097152.0


[I 201027 13:52:48 <ipython-input-15-e3bbe2c196e8>:79]  0 in 448000     train loss: 0.00091     valid loss: 0.002517     p1 : 0.73530     p3 : 0.36362     p5 : 0.23833     check_step :  1


 0 in 448000     train loss: 0.00091     valid loss: 0.002517     p1 : 0.73530     p3 : 0.36362     p5 : 0.23833     check_step :  1


[I 201027 14:04:29 <ipython-input-15-e3bbe2c196e8>:79]  0 in 480000     train loss: 0.00084     valid loss: 0.002521     p1 : 0.73582     p3 : 0.36385     p5 : 0.23844     check_step :  2


 0 in 480000     train loss: 0.00084     valid loss: 0.002521     p1 : 0.73582     p3 : 0.36385     p5 : 0.23844     check_step :  2


[I 201027 14:16:10 <ipython-input-15-e3bbe2c196e8>:79]  0 in 512000     train loss: 0.00084     valid loss: 0.002535     p1 : 0.73595     p3 : 0.36367     p5 : 0.23842     check_step :  3


 0 in 512000     train loss: 0.00084     valid loss: 0.002535     p1 : 0.73595     p3 : 0.36367     p5 : 0.23842     check_step :  3


[I 201027 14:27:51 <ipython-input-15-e3bbe2c196e8>:79]  0 in 544000     train loss: 0.00097     valid loss: 0.002519     p1 : 0.73541     p3 : 0.36333     p5 : 0.23819     check_step :  4


 0 in 544000     train loss: 0.00097     valid loss: 0.002519     p1 : 0.73541     p3 : 0.36333     p5 : 0.23819     check_step :  4
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


[I 201027 14:39:32 <ipython-input-15-e3bbe2c196e8>:79]  0 in 576000     train loss: 0.00095     valid loss: 0.002533     p1 : 0.73651     p3 : 0.36396     p5 : 0.23844     check_step :  5


 0 in 576000     train loss: 0.00095     valid loss: 0.002533     p1 : 0.73651     p3 : 0.36396     p5 : 0.23844     check_step :  5


[I 201027 14:51:13 <ipython-input-15-e3bbe2c196e8>:79]  0 in 608000     train loss: 0.00122     valid loss: 0.002546     p1 : 0.73670     p3 : 0.36342     p5 : 0.23798     check_step :  6


 0 in 608000     train loss: 0.00122     valid loss: 0.002546     p1 : 0.73670     p3 : 0.36342     p5 : 0.23798     check_step :  6


[I 201027 15:02:54 <ipython-input-15-e3bbe2c196e8>:79]  0 in 640000     train loss: 0.00103     valid loss: 0.002549     p1 : 0.73640     p3 : 0.36322     p5 : 0.23807     check_step :  7


 0 in 640000     train loss: 0.00103     valid loss: 0.002549     p1 : 0.73640     p3 : 0.36322     p5 : 0.23807     check_step :  7
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4194304.0


[I 201027 15:14:35 <ipython-input-15-e3bbe2c196e8>:79]  0 in 672000     train loss: 0.00086     valid loss: 0.002544     p1 : 0.73588     p3 : 0.36356     p5 : 0.23819     check_step :  8


 0 in 672000     train loss: 0.00086     valid loss: 0.002544     p1 : 0.73588     p3 : 0.36356     p5 : 0.23819     check_step :  8


[I 201027 15:26:15 <ipython-input-15-e3bbe2c196e8>:79]  0 in 704000     train loss: 0.00097     valid loss: 0.002535     p1 : 0.73721     p3 : 0.36360     p5 : 0.23829     check_step :  9


 0 in 704000     train loss: 0.00097     valid loss: 0.002535     p1 : 0.73721     p3 : 0.36360     p5 : 0.23829     check_step :  9


[I 201027 15:37:57 <ipython-input-15-e3bbe2c196e8>:79]  0 in 736000     train loss: 0.00055     valid loss: 0.002565     p1 : 0.73737     p3 : 0.36350     p5 : 0.23827     check_step : 10


 0 in 736000     train loss: 0.00055     valid loss: 0.002565     p1 : 0.73737     p3 : 0.36350     p5 : 0.23827     check_step : 10


[I 201027 17:44:32 <ipython-input-15-e3bbe2c196e8>:79]  1 in 288000     train loss: 0.00077     valid loss: 0.002732     p1 : 0.73609     p3 : 0.36227     p5 : 0.23738     check_step : 21


 1 in 288000     train loss: 0.00077     valid loss: 0.002732     p1 : 0.73609     p3 : 0.36227     p5 : 0.23738     check_step : 21


[I 201027 17:56:14 <ipython-input-15-e3bbe2c196e8>:79]  1 in 320000     train loss: 0.00058     valid loss: 0.002698     p1 : 0.73675     p3 : 0.36284     p5 : 0.23761     check_step : 22


 1 in 320000     train loss: 0.00058     valid loss: 0.002698     p1 : 0.73675     p3 : 0.36284     p5 : 0.23761     check_step : 22


[I 201027 18:07:55 <ipython-input-15-e3bbe2c196e8>:79]  1 in 352000     train loss: 0.00051     valid loss: 0.002707     p1 : 0.73807     p3 : 0.36282     p5 : 0.23744     check_step : 23


 1 in 352000     train loss: 0.00051     valid loss: 0.002707     p1 : 0.73807     p3 : 0.36282     p5 : 0.23744     check_step : 23


[I 201027 18:19:38 <ipython-input-15-e3bbe2c196e8>:79]  1 in 384000     train loss: 0.00111     valid loss: 0.002720     p1 : 0.73746     p3 : 0.36265     p5 : 0.23760     check_step : 24


 1 in 384000     train loss: 0.00111     valid loss: 0.002720     p1 : 0.73746     p3 : 0.36265     p5 : 0.23760     check_step : 24


[I 201027 18:31:19 <ipython-input-15-e3bbe2c196e8>:88] 
[I 201027 18:31:19 <ipython-input-15-e3bbe2c196e8>:89] Average training loss: 0.00039
[I 201027 18:31:19 <ipython-input-15-e3bbe2c196e8>:90] Training epcoh took: 2:31:59


Average training loss: 0.00039
Training epcoh took: 2:31:59


In [16]:
path = './For_paper/bce_laha_v6'
os.mkdir(path)
model.save_pretrained(path)

I1027 11:01:43.965672 140497889576768 configuration_utils.py:144] Configuration saved in ./For_paper/bce_laha_v6/config.json
I1027 11:01:44.509656 140497889576768 modeling_utils.py:450] Model weights saved in ./For_paper/bce_laha_v6/pytorch_model.bin


In [16]:
model = BertForMultiLabelSequenceClassification.from_pretrained('./For_paper/bce_laha_7e_v6_512', cache_dir=None, num_labels=1383, label_emb=label_emb)

I1027 18:38:22.037728 140119431956288 configuration_utils.py:283] loading configuration file ./For_paper/bce_laha_7e_v6_512/config.json
I1027 18:38:22.046288 140119431956288 configuration_utils.py:321] Model config BertConfig {
  "architectures": [
    "BertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_

In [17]:
model = model.cuda()

In [19]:
from scipy import sparse

def get_ndcg_5(predict, target, top):

    target = sparse.csr_matrix(np.array(target))
    log = 1.0 / np.log2(np.arange(top) + 2)
    dcg = np.zeros((target.shape[0], 1))
    
    for i in range(top):
        prediction = []
        for index_list in predict:
            p = index_list[i: i+1]
            predicts = [0]*target.shape[1]
            predicts[p] = 1
            prediction.append(predicts)
        prediction = sparse.csr_matrix(np.array(prediction))
        dcg += prediction.multiply(target).sum(axis=-1) * log[i]
        
    return np.average(dcg / log.cumsum()[np.minimum(target.sum(axis=-1), top) - 1])

In [20]:
from tqdm import tqdm
batch_size = 1024

#loss_fn = FocalLoss(alpha=0.25).to(device)
loss_fn = nn.BCEWithLogitsLoss().to(device)
import torch.nn.functional as F

test_inputs = torch.load('/notebook/BERT/tensor_data/test_input.pt')
test_masks = torch.load('/notebook/BERT/tensor_data/test_mask.pt')
test_labels = torch.load('./tensor_data/test_labels.pt')

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
valid_data_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

p1, p3, p5, valid_step = 0.0, 0.0, 0.0, 0
for batch in tqdm(valid_data_loader):
        labels = predict_step(model, batch, 5, loss_fn)[1]
        targets = batch[2]

        p1 += get_ndcg_5(labels, targets, 1)
        p3 += get_ndcg_5(labels, targets, 3)
        p5 += get_ndcg_5(labels, targets, 5)
        valid_step += 1

print("p1 : {:.5f}     p3 : {:.5f}     p5 : {:.5f}".format(p1/valid_step, p3/valid_step, p5/valid_step))

 64%|██████▍   | 62/97 [03:08<01:46,  3.04s/it]


KeyboardInterrupt: 

In [None]:
from tqdm import tqdm

batch_size = 512

loss_fn = nn.BCEWithLogitsLoss().to(device)
import torch.nn.functional as F

test_inputs = torch.load('/notebook/BERT/tensor_data/test_input512.pt')
test_masks = torch.load('/notebook/BERT/tensor_data/test_mask512.pt')
test_labels = torch.load('./tensor_data/test_labels.pt')

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
valid_data_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

p1, p3, p5, valid_step = 0.0, 0.0, 0.0, 0
for batch in tqdm(valid_data_loader):
        labels = predict_step(model, batch, 5, loss_fn)[1]
        targets = batch[2]

        p1 += get_ndcg_5(labels, targets, 1)
        p3 += get_ndcg_5(labels, targets, 3)
        p5 += get_ndcg_5(labels, targets, 5)
        valid_step += 1

print("p1 : {:.5f}     p3 : {:.5f}     p5 : {:.5f}".format(p1/valid_step, p3/valid_step, p5/valid_step))

 66%|██████▌   | 128/194 [06:09<03:10,  2.89s/it]

In [9]:
batch_size = 512

loss_fn = FocalLoss(alpha=0.25).to(device)
import torch.nn.functional as F
import tqdm

test_inputs = torch.load('../../ipc_vs_non/ipc_test_inputs512.pt')
test_masks = torch.load('../../ipc_vs_non/ipc_test_masks512.pt')
test_labels = torch.load('../../test_label.pt')

test_data = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_data)
valid_data_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

scores1 = np.array([])
labels1 = np.array([])
#scores2 = torch.tensor([])
#labels2 = torch.tensor([])

def predict_step(model, batch, k:int):
    model.eval()
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask = batch
    with torch.no_grad():
        loss = model(input_ids, token_type_ids=None, attention_mask=input_mask)
        scores, labels = torch.topk(loss, k)
        return torch.sigmoid(scores).cpu(), labels.cpu()

for step, batch in enumerate(valid_data_loader):
    
    s1, l1 = predict_step(model, batch, 100)
    
    if step == 0:
        scores1 = s1.numpy()
        labels1 = l1.numpy()
    else:
        scores1 = np.concatenate((scores1, s1.numpy()))
        labels1 = np.concatenate((labels1, l1.numpy()))

In [10]:
np.save('/notebook/ensemble/512/label_labels_v2.npy', labels1)
np.save('/notebook/ensemble/512/label_scores_v2.npy', scores1)