In [None]:
import datetime
import time, random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertPreTrainedModel, AdamW, BertConfig, BertModel
from transformers import get_linear_schedule_with_warmup, get_constant_schedule_with_warmup

In [None]:
# Pre-tokenised training tensors (the "256" variant, judging by the file names —
# presumably the maximum sequence length; confirm) plus the validation split.
# NOTE: the following cell rebinds these same six names, so when both cells are
# run in order the values loaded here are replaced.
train_inputs, train_masks, train_labels = map(
    torch.load,
    (
        '../../ipc_vs_non/ipc_train_input256.pt',
        '../../ipc_vs_non/ipc_train_mask256.pt',
        '../../train_label.pt',
    ),
)
test_inputs, test_masks, test_labels = map(
    torch.load,
    (
        '../../ipc_vs_non/valid_input.pt',
        '../../ipc_vs_non/valid_mask.pt',
        '../../ipc_vs_non/valid_labels.pt',
    ),
)

In [None]:
# Pre-tokenised training tensors (the "512" variant, judging by the file names —
# presumably the maximum sequence length; confirm) plus the validation split.
# This cell binds the same names as the previous loading cell, so whichever of
# the two cells runs last decides which training tensors are used.
# NOTE(review): the file stems here are plural ("inputs512"/"masks512") while the
# 256 variant uses "input256"/"mask256" — verify the paths exist as written.
train_inputs, train_masks, train_labels = map(
    torch.load,
    (
        '../../ipc_vs_non/ipc_train_inputs512.pt',
        '../../ipc_vs_non/ipc_train_masks512.pt',
        '../../train_label.pt',
    ),
)
test_inputs, test_masks, test_labels = map(
    torch.load,
    (
        '../../ipc_vs_non/valid_input.pt',
        '../../ipc_vs_non/valid_mask.pt',
        '../../ipc_vs_non/valid_labels.pt',
    ),
)

In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 64

# Training set: (input ids, attention mask, labels) triples drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_data_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation set: shuffling brings nothing at evaluation time, and because the
# training loop averages precision per batch, a random batch composition makes
# the reported numbers vary from run to run. Iterate in a fixed order instead.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_data_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Index of the CUDA device used by the whole notebook.
GPU_NUM = 1

# Build the device handle and make it the current CUDA device, so that later
# bare `.cuda()` calls target the same GPU.
device = torch.device('cuda', GPU_NUM)
torch.cuda.set_device(device)

In [None]:
# Dense table with one 768-dimensional row per label id (1383 rows); rows whose
# id never appears in the file stay all-zero.
label_emb = np.zeros((1383, 768))

with open('../../label768_wv', 'r') as f:
    for line_no, line in enumerate(f):
        # The first line is skipped (presumably a word2vec-style header — confirm).
        if line_no == 0:
            continue
        # Remaining lines: "<label id> <v1> <v2> ..." separated by single spaces.
        label_id, *values = line.rstrip('\n').split(' ')
        label_emb[int(label_id)] = [float(v) for v in values]

# Hand the table to torch as float32.
label_emb = torch.from_numpy(label_emb).float()

In [None]:
class LabelAttention(nn.Module):
    """Attention pooling where each label's (frozen) embedding queries the sequence.

    Args:
        hidden_size: Width of the token representations and of the label embeddings.
        labels_num: Number of labels (rows of the label-embedding table).
        label_emb: Optional tensor copied into the label-embedding table. When
            ``None`` the table is Xavier-normal initialised instead.

    The label-embedding table is registered as a parameter with
    ``requires_grad=False``; only the key and query projections are trainable.
    """

    def __init__(self, hidden_size, labels_num, label_emb):
        super(LabelAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_labels = labels_num
        self.label_emb = label_emb

        # torch.empty makes it explicit that the buffer is filled right below.
        label_embedding = torch.empty(self.num_labels, self.hidden_size)

        if self.label_emb is None:
            nn.init.xavier_normal_(label_embedding)
        else:
            label_embedding.copy_(self.label_emb)

        self.label_embedding = nn.Parameter(label_embedding, requires_grad=False)

        self.key_layer = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        nn.init.xavier_uniform_(self.key_layer.weight)

        self.query_layer = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        nn.init.xavier_uniform_(self.query_layer.weight)

    def forward(self, inputs, masks):
        """Pool ``inputs`` once per label.

        Args:
            inputs: Token representations, ``[batch, sequence, hidden]``.
            masks: Boolean mask broadcastable to ``[batch, labels, sequence]``;
                ``False`` positions receive zero attention weight.

        Returns:
            Tensor ``[batch, labels, hidden]``: for every label, the
            attention-weighted sum of the (unprojected) token representations.
        """
        # Keys: projected tokens, transposed to [batch, hidden, sequence].
        attn_key = self.key_layer(inputs).transpose(1, 2)

        # Queries: projected label embeddings, [labels, hidden]. The projection
        # does not depend on the batch, so it is computed once and broadcast.
        attn_query = self.query_layer(self.label_embedding)

        # FIX: score with the *projected* query. Previously the projection was
        # computed and then discarded, so query_layer never affected the output.
        scores = torch.matmul(attn_query, attn_key)  # [batch, labels, sequence]
        scores = scores.masked_fill(~masks, -np.inf)
        attention = F.softmax(scores, -1)

        return torch.bmm(attention, inputs)

In [None]:
class SelfAttention(nn.Module):
    """Attention pooling with one learned, bias-free scoring vector per label.

    Args:
        hidden_size: Width of the token representations.
        labels_num: Number of labels, i.e. number of attention heads produced.
    """

    def __init__(self, hidden_size, labels_num):
        super().__init__()
        self.hidden_size = hidden_size
        self.labels_num = labels_num

        # Each output unit of this layer scores every token for one label.
        scorer = nn.Linear(hidden_size, labels_num, bias=False)
        nn.init.xavier_uniform_(scorer.weight)
        self.attention = scorer

    def forward(self, inputs, masks):
        """Return ``[batch, labels, hidden]`` label-wise weighted sums of ``inputs``.

        ``inputs`` is ``[batch, sequence, hidden]``; ``masks`` is a boolean tensor
        broadcastable to ``[batch, labels, sequence]`` whose ``False`` positions
        are excluded from the softmax.
        """
        scores = self.attention(inputs).transpose(1, 2)
        scores = scores.masked_fill(~masks, -np.inf)
        weights = F.softmax(scores, dim=-1)
        return torch.bmm(weights, inputs)

In [None]:
class MLinear(nn.Module):
    """Gated fusion of two label-wise representations followed by a per-label logit.

    Args:
        hidden_size: Width of each incoming representation.
        label_num: Number of labels; also the first normalised dimension of the
            LayerNorm, which normalises jointly over ``[label_num, hidden_size]``.
    """

    def __init__(self, hidden_size, label_num):
        super().__init__()
        self.hidden_size = hidden_size
        self.label_num = label_num

        # Scalar gates, one per input stream.
        self.linear_weight1 = self._xavier_linear(hidden_size, 1)
        self.linear_weight2 = self._xavier_linear(hidden_size, 1)

        # Projects the concatenated, gated streams back to hidden_size.
        self.fusion_linear = self._xavier_linear(hidden_size * 2, hidden_size)

        self.ln = nn.LayerNorm([label_num, hidden_size])
        self.dropout = nn.Dropout(0.5)

        # Final per-label scalar score.
        self.out_linear = self._xavier_linear(hidden_size, 1)

    @staticmethod
    def _xavier_linear(in_features, out_features):
        """Create an ``nn.Linear`` whose weight is Xavier-uniform initialised."""
        layer = nn.Linear(in_features, out_features)
        nn.init.xavier_uniform_(layer.weight)
        return layer

    def forward(self, self_attn, label_attn):
        """Fuse the two ``[batch, label, hidden]`` inputs into ``[batch, label]`` logits."""
        gate_self = torch.sigmoid(self.linear_weight1(self_attn))
        gate_label = torch.sigmoid(self.linear_weight2(label_attn))

        # Normalise the two gates so that they sum to one for every label.
        gate_self = gate_self / (gate_self + gate_label)
        gate_label = 1 - gate_self

        fused = torch.cat(
            (gate_self * self_attn, gate_label * label_attn),  # each [batch, label, hidden]
            dim=-1,
        )

        hidden = self.fusion_linear(fused)
        hidden = self.dropout(F.gelu(self.ln(hidden)))
        logits = self.out_linear(hidden)

        return torch.squeeze(logits, -1)

In [None]:
class BertForMultiLabelSequenceClassification(BertPreTrainedModel):
    """BERT encoder with label-wise attention heads for multi-label classification.

    The encoder's token representations are pooled once per label by two
    attention modules (``SelfAttention`` and ``LabelAttention``); ``MLinear``
    fuses both views into one logit per label.

    Args:
        config: BERT configuration; ``config.num_labels`` and
            ``config.hidden_size`` size the heads.
        label_emb: Optional tensor of label embeddings handed to
            ``LabelAttention``; ``None`` lets that module initialise its own table.
    """

    def __init__(self, config, label_emb=None):
        # label_emb is consumed by this class only; the base class has no use for it.
        super(BertForMultiLabelSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.hidden_size = config.hidden_size
        self.label_emb = label_emb

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.1)

        self.self_attn = SelfAttention(self.hidden_size, self.num_labels)
        self.label_attn = LabelAttention(self.hidden_size, self.num_labels, self.label_emb)
        self.linear = MLinear(self.hidden_size, self.num_labels)

        # NOTE(review): init_weights() presumably re-initialises the nn.Linear /
        # nn.LayerNorm sub-modules created above, overriding their Xavier
        # initialisation — confirm against the installed transformers version.
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Compute one logit per label.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
                inputs_embeds: Forwarded to the BERT encoder.
            labels: Accepted for signature compatibility; not used (the loss is
                computed by the caller).

        Returns:
            Tensor ``[batch, num_labels]`` of raw (pre-sigmoid) scores.
        """
        # Forward every encoder argument instead of silently dropping the
        # optional ones, and index the output rather than tuple-unpacking it so
        # both tuple and dict-style encoder outputs are handled.
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        sequence = self.dropout(encoder_outputs[0])  # [batch, sequence, hidden_size]

        if attention_mask is None:
            # No mask supplied: attend to every position.
            masks = torch.ones(sequence.shape[:2], dtype=torch.bool, device=sequence.device)
        else:
            masks = attention_mask != 0  # [batch, sequence]
        masks = torch.unsqueeze(masks, 1)  # [batch, 1, sequence]

        self_attn = self.self_attn(sequence, masks)
        label_attn = self.label_attn(sequence, masks)

        return self.linear(self_attn, label_attn)

    def freeze_bert_encoder(self):
        """Disable gradients for every parameter of the BERT encoder."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        """Re-enable gradients for every parameter of the BERT encoder."""
        for param in self.bert.parameters():
            param.requires_grad = True

In [None]:
# Start from the pretrained 'bert-base-uncased' checkpoint, requesting
# 1383 labels and handing the label-embedding table through to the model.
model = BertForMultiLabelSequenceClassification.from_pretrained(
    'bert-base-uncased',
    cache_dir=None,
    num_labels=1383,
    label_emb=label_emb,
)

In [None]:
# Move the model to the current CUDA device (selected earlier via torch.cuda.set_device).
model = model.cuda()

In [None]:
class NormalizedFocalLoss(nn.Module):
    """Focal loss on logits, rescaled by the inverse of the mean focal weight.

    Args:
        alpha: Constant multiplier applied to every element of the loss.
        gamma: Focusing exponent applied to ``1 - p_t``.
        reduce: When true return the mean over all elements, otherwise the
            element-wise loss tensor.
        smooth: Label-smoothing strength; ``0`` disables smoothing.
    """

    def __init__(self, alpha=1, gamma=2, reduce=True, smooth=0):
        super(NormalizedFocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce
        self.smooth = smooth

    def forward(self, inputs, targets):
        """Return the loss for ``inputs`` (logits) against ``targets`` (same shape)."""
        if self.smooth != 0:
            # Pull targets towards a uniform prior over the label dimension.
            targets = (1 - self.smooth) * targets + self.smooth / inputs.size(1)

        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
        pt = torch.exp(-BCE_loss)  # probability assigned to the target
        focal_term = (1 - pt).pow(self.gamma)

        # FIX: when every prediction is saturated-correct the focal weights are
        # all exactly 0, their mean is 0 and 1/mean is inf, which turned the
        # loss into NaN (inf * 0). Clamp the normaliser to the smallest positive
        # normal value of the dtype so that case yields a loss of 0 instead.
        mean_focal = focal_term.mean().clamp_min(torch.finfo(focal_term.dtype).tiny)
        normalize = 1 / mean_focal
        F_loss = normalize * self.alpha * focal_term * BCE_loss

        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

In [None]:
from apex import amp

# Number of passes over the training set.
EPOCHS = 20

# AdamW over every model parameter (weight decay is applied to all of them).
optimizer = AdamW(model.parameters(), lr=7e-5, correct_bias=False, eps=1e-8, weight_decay=0.01)

# Must run after the optimizer is built and before the scheduler wraps it.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1") # wrap model and optimizer for apex AMP (mixed precision)

# Total number of optimizer steps across all epochs.
total_step = len(train_data_loader) * EPOCHS

# Linear warm-up over the first tenth of the steps, then linear decay.
# NOTE(review): total_step/10 is a float; the scheduler is assumed to accept it — confirm.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=total_step/10,
    num_training_steps=total_step
)

# Alternative loss kept for reference:
#loss_fn = nn.BCEWithLogitsLoss().to(device)
loss_fn = NormalizedFocalLoss(alpha=0.25).to(device)

In [None]:
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def predict_step(model, batch, k:int, loss_fn):
    """Run one evaluation batch and return its top-``k`` predictions and loss.

    Args:
        model: Callable as ``model(input_ids, token_type_ids=None,
            attention_mask=...)`` returning per-label logits.
        batch: Iterable of three tensors ``(input_ids, attention_mask, labels)``.
        k: Number of highest-scoring labels to return per example.
        loss_fn: Callable ``loss_fn(logits, float_labels)``.

    Returns:
        Tuple ``(scores, labels, loss)``, all on the CPU: the sigmoid of the
        top-``k`` logits, the corresponding label indices, and the batch loss.

    Side effects:
        Puts ``model`` into eval mode and leaves it there; the caller is
        responsible for calling ``model.train()`` before resuming training.
    """
    model.eval()
    # `device` is the notebook-level CUDA device.
    input_ids, input_mask, input_labels = (t.to(device) for t in batch)
    with torch.no_grad():
        logits = model(input_ids, token_type_ids=None, attention_mask=input_mask)
        # FIX: honour the requested k instead of a hard-coded 5.
        scores, labels = torch.topk(logits, k)
        loss = loss_fn(logits, input_labels.float())
    return torch.sigmoid(scores).cpu(), labels.cpu(), loss.cpu()

def get_p_5(predict, target, top):
    """Precision at ``top`` for a batch of ranked label predictions.

    Args:
        predict: Per-example sequences of label indices, best first.
        target: 2-D multi-hot matrix of true labels, ``[examples, labels]``.
        top: How many leading predictions of each example to count.

    Returns:
        Number of leading predictions that are true labels, divided by
        ``top * number_of_examples``.
    """
    num_labels = target.shape[1]

    def mark_top(ranked):
        # Multi-hot row with a 1 at each of the first `top` predicted indices.
        row = [0] * num_labels
        for label_idx in ranked[:top]:
            row[label_idx] = 1
        return row

    hit_mask = np.array([mark_top(ranked) for ranked in predict])
    truth = np.array(target)
    matched = np.sum(np.multiply(hit_mask, truth))
    return matched / (top * truth.shape[0])

In [None]:
import time, random
import numpy as np
import logzero
logzero.setup_default_logger(logfile='./fusion.log')
from logzero import logger
import os
import tqdm
import torch.nn.functional as F

# Seed every RNG in use (python, numpy, torch CPU and all CUDA devices).
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


model.zero_grad()
# best_p5: best validation p@5 so far (kept as a sum over batches, see below);
# check_step: consecutive epochs without improvement;
# best_loss: validation loss at the best epoch (written but not read in this cell).
best_p5, check_step, best_loss = 0, 0, 1.0
        
for epoch_i in range(0, EPOCHS):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')
    
    logger.info("")
    logger.info('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    logger.info('Training...')
    
    t0 = time.time()
    
    total_loss = 0
        
    # This branch only advances the LR scheduler without training.
    # NOTE(review): with EPOCHS = 20 the condition can never be true; presumably
    # a leftover from resuming a longer run — confirm before relying on it.
    if epoch_i > 40:
        for step, batch in enumerate(train_data_loader):
            scheduler.step()
    else:
        for step, batch in enumerate(train_data_loader):    
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, input_labels = batch
            # predict_step() switches the model to eval mode, so train mode is
            # re-enabled on every step.
            model.train()
            logit = model(input_ids, token_type_ids=None, attention_mask=input_mask)
            loss = loss_fn(logit, input_labels.float())
            optimizer.zero_grad()
            with amp.scale_loss(loss, optimizer) as scaled_loss: # backward through the AMP-scaled loss
                scaled_loss.backward()
            #loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()


            # Intermediate validation every 1000 steps (step 0 excluded).
            if step % 1000 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)

                check_val_loss, valid_step = 0, 0
                p1, p3, p5 = 0.0, 0.0, 0.0
                # The loop variable rebinds `batch`; the training batch is no
                # longer needed at this point.
                for batch in test_data_loader:
                    _, labels, val_loss = predict_step(model, batch, 5, loss_fn)
                    targets = batch[2].cpu()
                    check_val_loss += val_loss.item()
                    valid_step += 1

                    p1 += get_p_5(labels, targets, 1)
                    p3 += get_p_5(labels, targets, 3)
                    p5 += get_p_5(labels, targets, 5)


                # Reports the loss of the current training batch and the
                # per-batch average of the precision metrics.
                print("{:>2} in {:>6}     train loss: {:.5f}     p1 : {:.5f}     p3 : {:.5f}     p5 : {:.5f}     check_step : {:>2}".format(epoch_i, train_data_loader.batch_size*step, loss.item(), p1/valid_step, p3/valid_step, p5/valid_step, check_step))
                logger.info("{:>2} in {:>6}     train loss: {:.5f}     p1 : {:.5f}     p3 : {:.5f}     p5 : {:.5f}     check_step : {:>2}".format(epoch_i, train_data_loader.batch_size*step, loss.item(), p1/valid_step, p3/valid_step, p5/valid_step, check_step))


            # Drop references to the step's tensors before the next iteration.
            del loss, batch, input_ids, input_mask, input_labels


        # ---- end of epoch: training summary ----
        avg_train_loss = total_loss / len(train_data_loader)
        print("Average training loss: {0:.5f}".format(avg_train_loss))
        print("Training epcoh took: {:}".format(format_time(time.time() - t0)))
        logger.info("")
        logger.info("Average training loss: {0:.5f}".format(avg_train_loss))
        logger.info("Training epcoh took: {:}".format(format_time(time.time() - t0)))

        # ---- end of epoch: full validation pass ----
        check_val_loss, valid_step = 0, 0
        p1, p3, p5 = 0.0, 0.0, 0.0
        for batch in test_data_loader:
            _, labels, val_loss = predict_step(model, batch, 5, loss_fn)
            targets = batch[2].cpu()
            check_val_loss += val_loss.item()
            valid_step += 1

            p1 += get_p_5(labels, targets, 1)
            p3 += get_p_5(labels, targets, 3)
            p5 += get_p_5(labels, targets, 5)


        avg_valid_loss = check_val_loss/valid_step

        # Checkpoint on improvement of p@5. p5 is the sum over batches, not the
        # average; best_p5 is stored the same way, so the comparison is consistent.
        if  best_p5 < p5:
            path = './fusion'
            if os.path.exists(path):
                model.save_pretrained(path)
            else:
                os.mkdir(path)
                model.save_pretrained(path)
            best_loss = avg_valid_loss
            best_p5 = p5
            check_step = 0
        else:
            # Early stopping after 25 epochs without improvement. Note that the
            # break skips the validation summary below for that epoch.
            check_step += 1
            if check_step >= 25:
                break

        print("{:>2}    valid loss: {:.5f}     p1 : {:.5f}     p3 : {:.5f}     p5 : {:.5f}     check_step : {:>2}".format(epoch_i, avg_valid_loss, p1/valid_step, p3/valid_step, p5/valid_step, check_step))
        logger.info("")
        logger.info("{:>2}    valid loss: {:.5f}     p1 : {:.5f}     p3 : {:.5f}     p5 : {:.5f}     check_step : {:>2}".format(epoch_i, avg_valid_loss, p1/valid_step, p3/valid_step, p5/valid_step, check_step))            

        t0 = time.time()

        # Release per-epoch bookkeeping before the next epoch.
        del val_loss, check_val_loss, avg_valid_loss, avg_train_loss, total_loss

In [None]:
# Reload the weights the training loop saved under './fusion' (written whenever
# validation p@5 improved) and place the model on the current CUDA device.
model = BertForMultiLabelSequenceClassification.from_pretrained(
    './fusion',
    cache_dir=None,
    num_labels=1383,
    label_emb=label_emb,
).cuda()

In [None]:
from tqdm import tqdm
import torch.nn.functional as F

# Batch size used for this evaluation pass only (no gradients are kept).
batch_size = 1024

# predict_step() requires a loss function; the loss itself is not reported here.
loss_fn = nn.BCEWithLogitsLoss().to(device)

test_inputs = torch.load('./test_input.pt')
test_masks = torch.load('./test_mask.pt')
test_labels = torch.load('./test_labels.pt')

test_data = TensorDataset(test_inputs, test_masks, test_labels)
# Evaluate in a fixed order so repeated runs produce the same batches.
test_sampler = SequentialSampler(test_data)
valid_data_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# get_p_5() returns the precision averaged over ONE batch. Averaging those
# values with equal weight over-weights a smaller final batch, so each batch is
# weighted by its size and the total is divided by the number of examples.
p1, p3, p5, valid_step = 0.0, 0.0, 0.0, 0
num_samples = 0
for batch in tqdm(valid_data_loader):
    labels = predict_step(model, batch, 5, loss_fn)[1]
    targets = batch[2]
    batch_len = targets.shape[0]

    p1 += get_p_5(labels, targets, 1) * batch_len
    p3 += get_p_5(labels, targets, 3) * batch_len
    p5 += get_p_5(labels, targets, 5) * batch_len
    valid_step += 1
    num_samples += batch_len

print("p1 : {:.5f}     p3 : {:.5f}     p5 : {:.5f}".format(p1/num_samples, p3/num_samples, p5/num_samples))