In [None]:
# Install transformer Model from Hugging Face
!pip install transformers

In [2]:
# Import
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

In [None]:
# Clone data of movie reviews from naver
!git clone https://github.com/e9t/nsmc.git

In [None]:
# Organize data in to train and test
train = pd.read_csv("nsmc/ratings_train.txt", sep='\t')
test = pd.read_csv("nsmc/ratings_test.txt", sep='\t')

# Reorganize the text data in the form of BERT input data
sentences = train['document']
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]

# Reorganize label data
labels = train['label'].values
labels

# Tokenize the sentences (wordpiece statistical method)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

In [5]:
# Max length of input token
MAX_LEN = 128

# Transform the tokens into number indices
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Cut sentences according to MAX_LEN and fill the empty spots with 0
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [6]:
# Initialize attention mask
attention_masks = []

# Set as 1 if the value is not a padding and 0 if otherwise
# This imporves the speed
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [7]:
# Dividing data into train and test dataset
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids,
                                                                                    labels, 
                                                                                    random_state=2018, 
                                                                                    test_size=0.1)

# Divide attention mask into train and test dataset
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                       random_state=2018, 
                                                       test_size=0.1)

# Transform data into pytorch tensor
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

In [8]:
# Batch size
batch_size = 32

# Set data as input, mast and label using pytorch DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [9]:
# Same process for the test dataset
sentences = test['document']
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
labels = test['label'].values
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
MAX_LEN = 128
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)
batch_size = 32
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
#GPU
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

# Creating BERT model for classification
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.cuda()

In [None]:
# Set optimizer
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

# Set number of epochs
epochs = 2

# Total steps
total_steps = len(train_dataloader) * epochs

# Scheduler for regularization
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [17]:
# Accuracy calculation
def flat_accuracy(preds, labels):
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Time display
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [18]:
#Train Data

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
model.zero_grad()

for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0
    model.train()
        
    for step, batch in enumerate(train_dataloader):
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        model.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():     
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch   500  of  4,219.    Elapsed: 0:05:54.
  Batch 1,000  of  4,219.    Elapsed: 0:11:50.
  Batch 1,500  of  4,219.    Elapsed: 0:17:45.
  Batch 2,000  of  4,219.    Elapsed: 0:23:41.
  Batch 2,500  of  4,219.    Elapsed: 0:29:36.
  Batch 3,000  of  4,219.    Elapsed: 0:35:32.
  Batch 3,500  of  4,219.    Elapsed: 0:41:28.
  Batch 4,000  of  4,219.    Elapsed: 0:47:24.

  Average training loss: 0.35
  Training epcoh took: 0:49:59

Running Validation...
  Accuracy: 0.85
  Validation took: 0:01:59

Training...
  Batch   500  of  4,219.    Elapsed: 0:05:56.
  Batch 1,000  of  4,219.    Elapsed: 0:11:52.
  Batch 1,500  of  4,219.    Elapsed: 0:17:48.
  Batch 2,000  of  4,219.    Elapsed: 0:23:43.
  Batch 2,500  of  4,219.    Elapsed: 0:29:39.
  Batch 3,000  of  4,219.    Elapsed: 0:35:34.
  Batch 3,500  of  4,219.    Elapsed: 0:41:32.
  Batch 4,000  of  4,219.    Elapsed: 0:47:28.

  Average training loss: 0.27
  Training epcoh took: 0:50:04

Running Validation...
  Accura

In [19]:
# Evaluate using test dataset

t0 = time.time()
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for step, batch in enumerate(test_dataloader):
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():     
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("Test took: {:}".format(format_time(time.time() - t0)))

  Batch   100  of  1,563.    Elapsed: 0:00:24.
  Batch   200  of  1,563.    Elapsed: 0:00:49.
  Batch   300  of  1,563.    Elapsed: 0:01:13.
  Batch   400  of  1,563.    Elapsed: 0:01:37.
  Batch   500  of  1,563.    Elapsed: 0:02:02.
  Batch   600  of  1,563.    Elapsed: 0:02:26.
  Batch   700  of  1,563.    Elapsed: 0:02:51.
  Batch   800  of  1,563.    Elapsed: 0:03:16.
  Batch   900  of  1,563.    Elapsed: 0:03:42.
  Batch 1,000  of  1,563.    Elapsed: 0:04:07.
  Batch 1,100  of  1,563.    Elapsed: 0:04:32.
  Batch 1,200  of  1,563.    Elapsed: 0:04:57.
  Batch 1,300  of  1,563.    Elapsed: 0:05:23.
  Batch 1,400  of  1,563.    Elapsed: 0:05:48.
  Batch 1,500  of  1,563.    Elapsed: 0:06:14.

Accuracy: 0.87
Test took: 0:06:30


In [20]:
# Test new text input
def convert_input_data(sentences):

    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    MAX_LEN = 128
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

def test_sentences(sentences):

    model.eval()

    inputs, masks = convert_input_data(sentences)
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    with torch.no_grad():     
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    logits = outputs[0]
    logits = logits.detach().cpu().numpy()

    return logits


In [22]:
logits = test_sentences(['1분 1초도 아까운 망작이었다'])

print(logits)
print(np.argmax(logits))

[[ 3.1004376 -2.8408906]]
0


In [23]:
logits = test_sentences(['이게 나라냐'])

print(logits)
print(np.argmax(logits))

[[ 0.9570602  -0.99937314]]
0


In [25]:
logits = test_sentences(['오 미친 좋은 영화다'])

print(logits)
print(np.argmax(logits))

[[-1.7070786  1.6351452]]
1


In [26]:
logits = test_sentences(['이걸 돈주고 봤네'])

print(logits)
print(np.argmax(logits))

[[ 2.957375  -2.8033872]]
0


In [27]:
logits = test_sentences(['또 봐야지'])

print(logits)
print(np.argmax(logits))

[[-1.9120269  1.7566904]]
1


In [28]:
logits = test_sentences(['또 안봐야지'])

print(logits)
print(np.argmax(logits))

[[ 1.0317816 -1.0915736]]
0


In [29]:
logits = test_sentences(['또 봐야되나 고민되네'])

print(logits)
print(np.argmax(logits))

[[-0.38455206  0.39431792]]
1


In [30]:
logits = test_sentences(['또 봐야되나 고민이 안되네'])

print(logits)
print(np.argmax(logits))

[[ 0.6876189 -0.7478671]]
0
