## Import necessary libraries 
This example is carried out in pytorch 

In [1]:
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F 

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime
from tqdm import tqdm
from sklearn.metrics import log_loss


In [2]:
# load preprocessed text data 
train_data = np.load('./storage/fintech_nlp/train_text_morphed.npy', allow_pickle = True) 
test_data = np.load('./storage/fintech_nlp/test_text_morphed.npy', allow_pickle = True)

In [3]:
train_data.shape, test_data.shape

((118745,), (142565,))

In [4]:
train_df = pd.read_csv('./storage/fintech_nlp/lgbm_train_df.csv') 
y_train = train_df['info'] 
y_train = np.asarray(y_train)

In [5]:
y_train.shape

(118745,)

## Tokenize using the BertTokenizer

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case = False) 
#train_tokenized = [tokenizer.tokenize(s) for s in train_bert]  

## Pad tokenized sequences 

In [7]:
MAX_LEN = 128 
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in train_data] 
input_ids = pad_sequences(input_ids, maxlen = MAX_LEN, dtype = 'long', truncating = 'post', padding = 'post') 
input_ids[0]

array([  101,   164,   100,   166,   100,   100, 69015,  9547,   100,
         164,  9638,   100,   100,   166, 10208,   131, 10842, 26565,
         100,   100, 69015,  9547,   100,   102,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0]

## Create Attention Mask 

In [8]:
attention_masks = [] 
for seq in input_ids: 
    seq_mask = [float(i > 0) for i in seq] 
    attention_masks.append(seq_mask) 

print(attention_masks[0]) 

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


## Creating train-validation split

In [9]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, y_train,
                                                                                    random_state = 42, test_size = 0.1)

In [10]:
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                                       random_state = 42, test_size = 0.1)

In [11]:
## convert to pytorch tensor 
train_inputs = torch.tensor(train_inputs) 
train_labels = torch.tensor(train_labels) 
train_masks = torch.tensor(train_masks) 

validation_inputs = torch.tensor(validation_inputs) 
validation_labels = torch.tensor(validation_labels) 
validation_masks = torch.tensor(validation_masks)   

## Create Data Loader 

In [12]:
batch_size = 128
train_data = TensorDataset(train_inputs, train_masks, train_labels) 
train_sampler = RandomSampler(train_data) 
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

In [13]:
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels) 
validation_sampler = SequentialSampler(validation_data) 
validation_dataloader = DataLoader(validation_data, sampler = validation_sampler, batch_size = batch_size)

## Repeat the preprocessing steps for test data 

In [14]:
test_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in test_data] 
test_input_ids = pad_sequences(test_input_ids, maxlen = MAX_LEN, dtype = 'long',  truncating = 'post', padding = 'post') 

In [15]:
test_attention_masks = [] 
for seq in test_input_ids: 
    seq_mask = [float(i > 0) for i in seq] 
    test_attention_masks.append(seq_mask)     


In [16]:
test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks)

In [17]:
test_inputs.shape, test_masks.shape 

(torch.Size([142565, 128]), torch.Size([142565, 128]))

In [29]:
test_data = TensorDataset(test_inputs, test_masks) 
test_sampler = SequentialSampler(test_data) 
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size, shuffle = False) 

## Model Training

### training scheduler

In [19]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels = 2)
model.cuda()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [20]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)   
epochs = 30
total_steps = len(train_dataloader) * epochs 
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

## Training process 

In [40]:
def flat_accuracy(preds, labels): 
    pred_flat = np.argmax(preds, axis = 1).flatten() 
    labels_flat = labels.flatten()  
    return np.sum(pred_flat == labels_flat)/len(labels_flat) 

In [41]:
def format_time(elapsed):
    elapsed_rounded = int(round(elapsed)) 
    return str(datetime.timedelta(seconds = elapsed_rounded))

In [42]:
def lossFunc(inputs, targets):
    return F.binary_cross_entropy(inputs,targets)


In [24]:
seed_val = 42 
random.seed(seed_val) 
np.random.seed(seed_val) 
torch.manual_seed(seed_val) 
torch.cuda.manual_seed_all(seed_val)

In [25]:
device = torch.device('cuda') 
device

device(type='cuda')

In [26]:
model.zero_grad() # model gradient initialization 

In [27]:
PATH = './storage/bert_multilingual_test/'

In [None]:
for epoch_i in range(0, epochs): 
    print("")
    print("======== Epoch {:} / {:} ========".format(epoch_i + 1, epochs)) 
    print("training...") 
    t0 = time.time() 
    # loss initialization 
    total_loss = 0 
    # change model settings to train mode  
    model.train()  
    for step, batch in enumerate(train_dataloader): 
        # denote time information 
        if step % 100 == 0 and not step == 0: 
            elapsed = format_time(time.time() - t0) 
            print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) 
        
        # put batch inside cpu 
        batch = tuple(t.to(device) for t in batch) 
        
        # extract data from batch 
        b_input_ids, b_input_mask, b_labels = batch 
        
        # forward propagation 
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # compute loss 
        loss = outputs[0] 
        
        # compute total loss 
        total_loss += loss.item() 
        
        # backward propagation 
        loss.backward() 
        
        # gradient clipping 
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.0) 
        
        # weight parameter update 
        optimizer.step() 
        
        # reduce learning rate with scheduler 
        scheduler.step() 
        
        # gradient initialization 
        model.zero_grad() 
        
    avg_train_loss = total_loss / len(train_dataloader)  
    
    
    print("")
    print("  Average training loss: {0:.10f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))   
    
    print("")
    print("Runnning Validation...")
    t0 = time.time() 
    model.eval() # evaluation mode 
    total_val_loss = 0 
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    
    for batch in validation_dataloader:
        # insert batch inside GPU 
        batch = tuple(t.to(device) for t in batch)
        
        # extract data from batch 
        b_input_ids, b_input_mask, b_labels = batch
        
        # we are in eval mode and we do not calculate the gradient 
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # calculate loss 
        logits = outputs[0] 

        eval_loss += (lossFunc(torch.sigmoid(logits)[:,1], b_labels.float())) 

        
        # move data to CPU 
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy() 
        
        # compare logit and labels to derive important metrics 
        tmp_eval_accuracy = flat_accuracy(logits, label_ids) 
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_steps += 1

    avg_val_loss = eval_loss / len(validation_dataloader)  

    print("  Accuracy: {0:.8f}".format(eval_accuracy/nb_eval_steps))
    print(" average val loss = {0:.10f}".format(avg_val_loss)) 
    print("  Validation took: {:}".format(format_time(time.time() - t0))) 
    
    print("saving model")
    torch.save(model, PATH + 'model'  + str(epoch_i))

print("")
print("Training complete!")
    


training...
 Batch   100 of   835. Elapsed: 0:02:19.
 Batch   200 of   835. Elapsed: 0:04:39.
 Batch   300 of   835. Elapsed: 0:06:59.
 Batch   400 of   835. Elapsed: 0:09:19.
 Batch   500 of   835. Elapsed: 0:11:39.
 Batch   600 of   835. Elapsed: 0:13:59.
 Batch   700 of   835. Elapsed: 0:16:19.
 Batch   800 of   835. Elapsed: 0:18:39.

  Average training loss: 0.0746698816
  Training epcoh took: 0:19:27

Runnning Validation...
  Accuracy: 0.98768600
 average val loss = 0.0580594353
  Validation took: 0:00:44
saving model

training...
 Batch   100 of   835. Elapsed: 0:02:19.
 Batch   200 of   835. Elapsed: 0:04:39.
 Batch   300 of   835. Elapsed: 0:06:59.
 Batch   400 of   835. Elapsed: 0:09:19.
 Batch   500 of   835. Elapsed: 0:11:39.
 Batch   600 of   835. Elapsed: 0:13:59.
 Batch   700 of   835. Elapsed: 0:16:19.
 Batch   800 of   835. Elapsed: 0:18:38.

  Average training loss: 0.0318244666
  Training epcoh took: 0:19:27

Runnning Validation...
  Accuracy: 0.98871867
 average va

# make predictions

In [23]:
# use the model with the lowest validation l
best_model = torch.load('./storage/bert_multilingual_test/model14')

In [24]:
best_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [26]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
We will use the GPU: Quadro P6000


In [43]:
t0 = time.time() 
predictions = [] 
for step, batch in enumerate(test_dataloader): 
    if step % 100 == 0 and not step == 0: 
        elapsed = format_time(time.time() - t0) 
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))
    
    batch = tuple(t.to(device) for t in batch) 
    b_input_ids, b_input_mask = batch 

    
    with torch.no_grad():  
        outputs = best_model(b_input_ids, token_type_ids = None, attention_mask = b_input_mask) 
    
    logits = outputs[0] 
    probs = torch.sigmoid(logits)[:,1] 
    probs = probs.detach().cpu().numpy() 
    
    predictions.append(probs) 


  Batch   100  of  1,114.    Elapsed: 0:00:46.
  Batch   200  of  1,114.    Elapsed: 0:01:33.
  Batch   300  of  1,114.    Elapsed: 0:02:20.
  Batch   400  of  1,114.    Elapsed: 0:03:06.
  Batch   500  of  1,114.    Elapsed: 0:03:53.
  Batch   600  of  1,114.    Elapsed: 0:04:39.
  Batch   700  of  1,114.    Elapsed: 0:05:26.
  Batch   800  of  1,114.    Elapsed: 0:06:12.
  Batch   900  of  1,114.    Elapsed: 0:06:59.
  Batch 1,000  of  1,114.    Elapsed: 0:07:46.
  Batch 1,100  of  1,114.    Elapsed: 0:08:32.


In [44]:
predictions = np.asarray(predictions) 
predictions.shape 

  return array(a, dtype, copy=False, order=order)


(1114,)

In [49]:
final_pred = [] 
for i in range(predictions.shape[0]): 
    for p in predictions[i]: 
        final_pred.append(p) 

In [50]:
final_pred = np.asarray(final_pred)

In [51]:
final_pred.shape

(142565,)

In [53]:
class_pred = np.where(final_pred > 0.5, 1, 0).reshape(-1) 

In [54]:
class_pred

array([1, 0, 0, ..., 1, 1, 1])

In [55]:
submission = pd.read_csv('./storage/fintech_nlp/sample_submission.csv') 

In [None]:
sumbission.head()