In [28]:
import torch

from transformers import BertTokenizer

import statistics

from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,precision_score,recall_score

import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import time
import pickle

In [29]:
from google.colab import drive

In [30]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [53]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [54]:
data_train = pd.read_excel('/content/gdrive/MyDrive/Colab Notebooks/Latihan/Data Augmentation NLP/data_train_edari.xlsx')
data_test = pd.read_excel('/content/gdrive/MyDrive/Colab Notebooks/Latihan/Data Augmentation NLP/data_test.xlsx')

In [55]:
X_train = data_train['Kalimat_prep']
y_train = data_train['label'] - 1

X_test = data_test['Kalimat_prep']
y_test = data_test['label'] - 1

In [56]:
torch.cuda.empty_cache()

In [57]:
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1", do_lower_case=True)

Loading BERT tokenizer...


In [58]:
sentences = X_train.values
labels = y_train.astype(int).values

test_sentences = X_test.values

In [59]:
sent_length = []

# For every sentence...
for sent in sentences:
    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    token_ids = tokenizer.encode(sent, add_special_tokens=True)
    sent_length.append(len(token_ids))

print('Average length = ', sum(sent_length)/len(sent_length))
print('Median length = ', statistics.median(sent_length))

Average length =  25.759841842874334
Median length =  19


In [60]:
# Tokenize all of the sentences and map the tokens to their word IDs.
token_ids  = []
attention_masks = []

# For every sentence...
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 40,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    token_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
token_ids  = torch.cat(token_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
# labels = torch.nn.functional.one_hot(labels.to(torch.int64))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [61]:
# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', token_ids[0])
print('Attention Masks:', attention_masks[0])

Original:  esia yang terlupakan gak kayak smartfren sih yang udah suport jaringan tidak kalo perusahaan gak ikuti jaman ya begini lah jadinya
Token IDs: tensor([    2,  1660,   102,    34, 14195,  1489,  5788, 10905,  1966,    34,
         2137,   888,   869,  1799,   119,  1686,   742,  1489,  5576,  4881,
          286,  6838,  1389,  9802,     3,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
Attention Masks: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [62]:
# Tokenize all of the sentences and map the tokens to their word IDs.
test_token_ids  = []
test_attention_masks = []

# For every sentence...
for test_sent in test_sentences:
    test_encoded_dict = tokenizer.encode_plus(
                        test_sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 40,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    test_token_ids.append(test_encoded_dict['input_ids'])
    test_attention_masks.append(test_encoded_dict['attention_mask'])

# Convert the lists into tensors.
test_token_ids  = torch.cat(test_token_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)
test_labels = torch.tensor(y_test.astype(int).values)
# labels = torch.nn.functional.one_hot(labels.to(torch.int64))

In [63]:
# Combine the training inputs into a TensorDataset.
train_dataset = TensorDataset(token_ids, attention_masks, labels)
test_dataset = TensorDataset(test_token_ids, test_attention_masks, test_labels)


In [64]:
# For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For test the order doesn't matter, so we'll just read them sequentially.
test_dataloader = DataLoader(
            test_dataset, # The test samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [76]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels = 3,
    output_attentions = False, # return attentions weights
    output_hidden_states = False, # returns all hidden-states
)

model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [77]:
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 1e-5,
                  eps = 1e-8
                )

In [78]:
epochs = 2

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
print('Jumlah batch :', len(train_dataloader))
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Jumlah batch : 182


In [79]:
def acc_score(y_pred,y_test):
    acc_count = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
    return acc_count

In [80]:
EPOCHS = epochs

loss_values = []
y_true_test=[]
y_pred_test = []

total_step = len(train_dataloader)

for epoch in range(EPOCHS):
    start = time.time()
    model.train()
    torch.set_grad_enabled(True)

    total_train_loss = 0
    total_train_acc  = 0
    for batch_idx, (token_ids, attention_masks, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        b_input_ids = token_ids.to(device)
        b_input_mask = attention_masks.to(device)
        labels = labels.to(device)

        loss, prediction = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=labels).values()

#         print(batch_idx)
#         print(labels)
        acc = acc_score(prediction, labels)

        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        total_train_acc  += acc.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    train_acc  = total_train_acc/len(train_dataloader)
    train_loss = total_train_loss/len(train_dataloader)
    model.eval()
    torch.set_grad_enabled(False)

    total_test_acc  = 0
    total_test_loss = 0
    with torch.no_grad():
        for batch_idx, (token_ids, attention_masks, labels) in enumerate(test_dataloader):
            optimizer.zero_grad()

            b_input_ids = token_ids.to(device)
            b_input_mask = attention_masks.to(device)
            labels = labels.to(device)

            loss, prediction = model(b_input_ids,
                                  token_type_ids=None,
                                  attention_mask=b_input_mask,
                                  labels=labels).values()

            acc = acc_score(prediction, labels)

            logits = prediction
            logits = logits.detach().cpu().numpy()
            b_labels = labels.to('cpu').numpy()

            y_pred_test.append(logits)
            y_true_test.append(b_labels)

            total_test_loss += loss.item()
            total_test_acc  += acc.item()

    test_acc  = total_test_acc/len(test_dataloader)
    test_loss = total_test_loss/len(test_dataloader)
    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | test_loss: {test_loss:.4f} test_acc: {test_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

Epoch 1: train_loss: 0.4934 train_acc: 0.7992 | test_loss: 0.4660 test_acc: 0.8626
00:00:43.56
Epoch 2: train_loss: 0.0622 train_acc: 0.9839 | test_loss: 0.6192 test_acc: 0.8454
00:00:42.42


In [49]:
model.device# Prediction on validation set

# Put model in evaluation mode
model.eval()
torch.set_grad_enabled(False)

# Tracking variables
test_pred = []

# Predict
for batch in test_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask,_ = batch

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()

    # Store predictions and true labels
    test_pred.append(logits)

print('    DONE.')

    DONE.


In [50]:
test_pred = np.concatenate(test_pred, axis=0)
test_pred = test_pred.argmax(axis=1)

In [51]:
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       874
           1       0.06      0.40      0.10         5
           2       0.49      0.48      0.48        75

    accuracy                           0.89       954
   macro avg       0.50      0.60      0.51       954
weighted avg       0.92      0.89      0.90       954

