# BERT Fine-Tuning Sentence Classification

> BERT Fine-Tuning Tutorial with PyTorch
> 
> ref: https://colab.research.google.com/drive/1Y4o3jh3ZH70tl6mCd76vz_IxX23biCPP

- toc: true 
- badges: true
- comments: true
- categories: [bert, jupyter]

# 1. Setup
## 1.1. Using Colab GPU for Training

In [35]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [36]:
import torch

# If there's a GPU available,
if torch.cuda.is_available():

    # Tell PyTorch to user the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will user the GPU:', torch.cuda.get_device_name(0))

# If not,
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


There are 1 GPU(s) available.
We will user the GPU: Tesla K80


## 1.2. Installing the Hugging Face Library

In [0]:
! pip install transformers -q

In [0]:
! pip install wget -q

In [39]:
import wget 
import os

print('Downloading dataset...')

# The URL for the dataset zip file.
url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

# Download the file (if we haven't already)
if not os.path.exists('./cola_public_1.1.zip'):
    wget.download(url, './cola_public_1.1.zip')

Downloading dataset...


In [0]:
# Unzip the dataset (if we haven't already)
if not os.path.exists('./cola_public/'):
    ! unzip cola_public_1.1.zip

In [41]:
! head cola_public/*/*

==> cola_public/raw/in_domain_dev.tsv <==
gj04	1		The sailors rode the breeze clear of the rocks.
gj04	1		The weights made the rope stretch over the pulley.
gj04	1		The mechanical doll wriggled itself loose.
cj99	1		If you had eaten more, you would want less.
cj99	0	*	As you eat the most, you want the least.
cj99	0	*	The more you would want, the less you would eat.
cj99	0	*	I demand that the more John eat, the more he pays.
cj99	1		Mary listens to the Grateful Dead, she gets depressed.
cj99	1		The angrier Mary got, the more she looked at pictures.
cj99	1		The higher the stakes, the lower his expectations are.

==> cola_public/raw/in_domain_train.tsv <==
gj04	1		Our friends won't buy this analysis, let alone the next one we propose.
gj04	1		One more pseudo generalization and I'm giving up.
gj04	1		One more pseudo generalization or I'm giving up.
gj04	1		The more we study verbs, the crazier they get.
gj04	1		Day by day the facts are getting murkier.
gj04	1		I'll fix you a drink.
gj04	1		

## 2.2. Parse

In [42]:
import pandas as pd

# Load the dataset into a pandas dataframe.
df = pd.read_csv("./cola_public/raw/in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display 10 random rows from the data
df.sample(10)

Number of training sentences: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
1337,r-67,1,,I want to peruse that contract before filing a...
1670,r-67,1,,This hat Tom said Al thought you wanted me to ...
6952,m_02,1,,Jane visits Emma.
449,bc01,0,??,Who is he reading a book that criticizes?
4988,ks08,1,,The fact that scientists have now established ...
1830,r-67,0,*,That informers they never use is claimed by th...
6508,d_98,0,*,"Every cat doesn't like mice, but Felix doesn't."
8329,ad03,1,,I asked did Medea poison Jason.
1358,r-67,0,*,They will give me a hat that I won't like whic...
5503,b_73,1,,Her mother wants Mary to be such an eminent wo...


In [43]:
df[df.label == 0].sample(5)[['sentence', 'label']]

Unnamed: 0,sentence,label
6544,"The table, I put Kim on which supported the book.",0
5993,Has Calvin a bowl?,0
379,How do you wonder who could solve this problem.,0
588,the branch dropped bare of its apple.,0
7194,"Your desk before, this girl in the red coat wi...",0


In [0]:
# Get the lists of sentences and their labels.
sentences = df.sentence.values
labels = df.label.values

# 3. Tokenization & Input Formatting

## 3.1. BERT Tokenizer

In [45]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [46]:
# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

 Original:  Our friends won't buy this analysis, let alone the next one we propose.
Tokenized:  ['our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
Token IDs:  [14008, 16119, 11441, 112, 162, 35172, 10372, 15559, 117, 12421, 19145, 10103, 12878, 10399, 11312, 25690, 119]


In [47]:
korean = "안녕하세요. 반갑습니다. 너는 이름이 뭐니? 오늘 날씨가 맑고 좋구나."
# Print the original sentence.
print(' Original: ', korean)

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(korean))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(korean)))

 Original:  안녕하세요. 반갑습니다. 너는 이름이 뭐니? 오늘 날씨가 맑고 좋구나.
Tokenized:  ['ᄋ', '##ᅡᆫ', '##녀', '##ᆼ', '##하', '##세', '##요', '.', 'ᄇ', '##ᅡᆫ', '##가', '##ᆸ', '##스', '##ᆸ니다', '.', 'ᄂ', '##ᅥ', '##는', '이', '##름이', 'ᄆ', '##ᅯ', '##니', '?', 'ᄋ', '##ᅩ', '##ᄂ', '##ᅳᆯ', '날', '##씨', '##가', 'ᄆ', '##ᅡ', '##ᆰ', '##고', 'ᄌ', '##ᅩ', '##ᇂ', '##구', '##나', '.']
Token IDs:  [1174, 26646, 49345, 13045, 35132, 25169, 47024, 119, 1170, 26646, 11376, 17360, 13212, 79427, 119, 1165, 33645, 11192, 12398, 89420, 1169, 97090, 25536, 136, 1174, 29347, 97071, 63277, 76818, 47928, 11376, 1169, 25539, 97098, 12300, 1175, 29347, 97109, 16336, 16801, 119]


When we actually convert all of our sentences, we'll use the `tokenize.encode` functio to handle both steps, rather than calling `tokenize` and `convert_tokens_to_ids` seperately.

## 3.2. Required Formatting

## 3.3. Tokenize Dataset

In [48]:
max_len = 0

# For every sentence,
for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence langth.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  48


In [67]:
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence,
for sent in sentences:
    # `encode_plus` will:
    # (1) Tokenize the sentence.
    # (2) Prepend the `[CLS]` token to the start.
    # (3) Append the `[SEP]` token to the end.
    # (4) Map tokens to their IDs.
    # (5) Pad or truncate the sentence to `max_length`
    # (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
        sent,                                       # Sentence to encode.
        add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
        max_length = 64,                           # Pad & truncate all sentences.
        pad_to_max_length = True,
        return_attention_mask = True,   # Construct attn. masks.
        return_Tensors = 'pt',                  # Return pytorch tensors.
    )
    
    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding.)
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.FloatTensor(input_ids)
attention_masks = torch.FloatTensor(attention_masks)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original:  ', sentences[0])
print('Token IDs: ', input_ids[0])

Original:   Our friends won't buy this analysis, let alone the next one we propose.
Token IDs:  tensor([  101., 14008., 16119., 11441.,   112.,   162., 35172., 10372., 15559.,
          117., 12421., 19145., 10103., 12878., 10399., 11312., 25690.,   119.,
          102.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.])




## 3.4. Training & Validation Split

In [68]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Devide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5} training samples'.format(train_size))
print('{:>5} validation samples'.format(val_size))

 7695 training samples
  856 validation samples


In [0]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
    train_dataset,   # The training samples.
    sampler = RandomSampler(train_dataset),   # Select batches randomly
    batch_size = batch_size   # Trains with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,   # The validation samples.
    sampler = SequentialSampler(val_dataset),   # Pull out batches sequentially.
    batch_size = batch_size   # Evaluate with this batch size.
)