In [1]:
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F 

from transformers import BertModel


from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.preprocessing.text import Tokenizer


import pandas as pd
import numpy as np
import random
import time
import datetime
from tqdm import tqdm
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, KFold


# Data Preprocessing

In [2]:
# load preprocessed text data 
train_data = np.load('./storage/fintech_nlp/train_text_morphed.npy', allow_pickle = True) 
test_data = np.load('./storage/fintech_nlp/test_text_morphed.npy', allow_pickle = True)

In [3]:
train_df = pd.read_csv('./storage/fintech_nlp/lgbm_train_df.csv') 
y_train = train_df['info'] 
y_train = np.asarray(y_train)

In [4]:
train_data.shape, y_train.shape, test_data.shape

((118745,), (118745,), (142565,))

In [5]:
tokenizer = Tokenizer() 

In [6]:
# to train tokenizer on full data 
full_data = np.concatenate([train_data, test_data])

In [7]:
tokenizer.fit_on_texts(full_data)

In [8]:
MAX_LEN = 128 
input_sequences = tokenizer.texts_to_sequences(train_data)

In [9]:
input_sequences = pad_sequences(input_sequences, 
                                maxlen = MAX_LEN, dtype = 'long', 
                                truncating = 'post', padding = 'post') 

In [10]:
attention_masks = [] 
for seq in input_sequences: 
    seq_mask = [float(i > 0) for i in seq] 
    attention_masks.append(seq_mask) 

attention_masks = np.asarray(attention_masks)

In [11]:
test_input_sequences = tokenizer.texts_to_sequences(test_data)

In [12]:
test_input_sequences = pad_sequences(test_input_sequences, 
                                maxlen = MAX_LEN, dtype = 'long', 
                                truncating = 'post', padding = 'post') 

In [13]:
test_attention_masks = [] 
for seq in test_input_sequences: 
    seq_mask = [float(i > 0) for i in seq] 
    test_attention_masks.append(seq_mask) 

test_attention_masks = np.asarray(test_attention_masks)

## Train test split and prepare for training 

In [14]:
## simple train-validation split
train_id, val_id, train_y, val_y = train_test_split(input_sequences, y_train, random_state = 42, test_size = 0.1) 

train_mask, val_mask, _, _ = train_test_split(attention_masks, input_sequences, random_state = 42, test_size = 0.1)

In [15]:
train_id.shape, train_mask.shape, train_y.shape, val_id.shape, val_mask.shape, val_y.shape

((106870, 128), (106870, 128), (106870,), (11875, 128), (11875, 128), (11875,))

In [16]:
train_id = torch.tensor(train_id) 
train_y = torch.tensor(train_y) 
train_mask = torch.tensor(train_mask) 

val_id = torch.tensor(val_id) 
val_y = torch.tensor(val_y) 
val_mask = torch.tensor(val_mask)

In [17]:
train_id.shape, train_y.shape, train_mask.shape, val_id.shape, val_y.shape, val_mask.shape

(torch.Size([106870, 128]),
 torch.Size([106870]),
 torch.Size([106870, 128]),
 torch.Size([11875, 128]),
 torch.Size([11875]),
 torch.Size([11875, 128]))

In [18]:
BATCH_SIZE = 128 
train_set = TensorDataset(train_id, train_mask, train_y)
train_sampler = RandomSampler(train_set)
train_dataloader = DataLoader(train_set, sampler=train_sampler, batch_size=BATCH_SIZE)

In [19]:
val_set = TensorDataset(val_id, val_mask, val_y) 
val_sampler = SequentialSampler(val_set)
val_dataloader = DataLoader(val_set, sampler=val_sampler, batch_size=BATCH_SIZE)

# Model Training

In [15]:
class CustomBERTModel(nn.Module): 
    def __init__(self): 
        super(CustomBERTModel, self).__init__() 
        self.bert = BertModel.from_pretrained('HanBert-54kN-torch')
        self.dropout = nn.Dropout(0.2) 
        self.linear1 = nn.Linear(768,256)
        self.linear2 = nn.Linear(256,1) 
        self.output = nn.Sigmoid() 
        
    def forward(self, ids, mask):
        sequence_output, pooled_output = self.bert(ids, 
                                                   attention_mask=mask).values()
        #linear1_output = self.linear1(sequence_output[:,0,:].view(-1,768))  
        dropout = self.dropout(sequence_output)
        linear1_output = self.linear1(sequence_output[:,0,:].view(-1,768)) 
        linear2_output = self.linear2(linear1_output) 
        logit = self.output(linear2_output) 
        return logit   

In [21]:
model = CustomBERTModel() 
model.to(torch.device("cuda"))

CustomBERTModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(54000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [22]:
epochs = 20
criterion = nn.BCELoss()
#optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters())) 
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [23]:
device = torch.device("cuda")

In [24]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [25]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [26]:
# 재현을 위해 랜덤시드 고정
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [27]:
model.zero_grad()

for epoch_i in range(0,epochs):
    print("")
    print("======== Epoch {:} / {:} ========".format(epoch_i + 1, epochs))
    print("Training...")
    t0 = time.time()
    model.train() 
    running_loss = 0 
    train_steps = 0 
    train_accuracy = 0 
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print("  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.".format(step, len(train_dataloader), elapsed))
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        b_labels = b_labels.float()
        #optimizer.zero_grad() 
        outputs = model(b_input_ids, b_input_mask) 
        outputs = outputs.flatten()  
        
        loss = criterion(outputs, b_labels)
        if step % 100 == 0 and not step == 0: 
            print("  current loss = {}".format(loss.item())) # average loss across batch 
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step() 
        scheduler.step() 
        model.zero_grad()
        running_loss += loss.item()    
        
        outputs = outputs.detach().cpu().numpy() 
        b_labels = b_labels.to('cpu').numpy()  
        classes = np.where(outputs > 0.5, 1, 0)
        
        train_accuracy += np.sum(classes == b_labels)/len(b_labels)
        
        train_steps += 1 
        
    print("Average training loss = {}".format(running_loss / len(train_dataloader))) 
    print("Average training accuracy = {}".format(train_accuracy / train_steps))
    
    print("") 
    print("Running Validation ...")
    t0 = time.time()
    model.eval() 
    eval_loss, eval_accuracy = 0, 0
    eval_steps = 0 
    for batch in val_dataloader: 
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        b_labels = b_labels.float() 
        with torch.no_grad():     
            outputs = model(b_input_ids, b_input_mask)
        outputs = outputs.flatten() 
        loss = criterion(outputs, b_labels) 
        eval_loss += loss.item()   
        
        outputs = outputs.detach().cpu().numpy() 
        b_labels = b_labels.to('cpu').numpy()  
        classes = np.where(outputs > 0.5, 1, 0)
        
        eval_accuracy += np.sum(classes == b_labels)/len(b_labels) 
        
        eval_steps += 1
        
    avg_val_loss = eval_loss / len(val_dataloader)  
    avg_val_accuracy = eval_accuracy / eval_steps 
    print("Average validation loss = {}".format(avg_val_loss)) 
    print("Average validation accuracy = {}".format(avg_val_accuracy))
    
    savepath = './storage/hanbert_models/model'
    print("saving model...")
    torch.save({'state_dict':model.state_dict()}, savepath + str(epoch_i+1) + '.pth.tar') 
    
    


Training...
  Batch   100  of    835.    Elapsed: 0:02:20.
  current loss = 0.12086381763219833
  Batch   200  of    835.    Elapsed: 0:04:43.
  current loss = 0.06908093392848969
  Batch   300  of    835.    Elapsed: 0:07:06.
  current loss = 0.11486254632472992
  Batch   400  of    835.    Elapsed: 0:09:29.
  current loss = 0.061554186046123505
  Batch   500  of    835.    Elapsed: 0:11:52.
  current loss = 0.04220367968082428
  Batch   600  of    835.    Elapsed: 0:14:16.
  current loss = 0.031889986246824265
  Batch   700  of    835.    Elapsed: 0:16:39.
  current loss = 0.07420255988836288
  Batch   800  of    835.    Elapsed: 0:19:02.
  current loss = 0.05796804651618004
Average training loss = 0.08807991882563083
Average training accuracy = 0.9654635643966305

Running Validation ...
Average validation loss = 0.040440571752266695
Average validation accuracy = 0.9884666558053655
saving model...

Training...
  Batch   100  of    835.    Elapsed: 0:02:22.
  current loss = 0.0422361

KeyboardInterrupt: 

## Make Prediction

In [17]:
best_model = CustomBERTModel() 
checkpoint = torch.load('./storage/hanbert_models/model7.pth.tar') 
best_model.load_state_dict(checkpoint['state_dict']) 

<All keys matched successfully>

In [18]:
best_model.eval() 
print()




In [19]:
test_input_sequences = torch.tensor(test_input_sequences)
test_attention_masks = torch.tensor(test_attention_masks) 

In [20]:
BATCH_SIZE = 128
test_set = TensorDataset(test_input_sequences, test_attention_masks) 
test_sampler = SequentialSampler(test_set) 
test_dataloader = DataLoader(test_set, sampler = test_sampler, batch_size = BATCH_SIZE, shuffle = False)

In [21]:
# define device again 
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Quadro P6000


In [None]:
t0 = time.time() 
predictions = [] 
for step, batch in enumerate(test_dataloader):  
    if step % 100 == 0 and not step == 0: 
        elapsed = format_time(time.time() - t0) 
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))
    b_input_ids, b_input_mask = batch 
    with torch.no_grad():
        outputs = best_model(b_input_ids, b_input_mask) 
    outputs = outputs.flatten() 
    predictions.append(outputs) 
        
print("converting prediction array to numpy...")
predictions = np.asarray(predictions) 