## - Import

In [1]:
!pip install datasets



In [2]:
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

2022-11-07 01:30:47.640881: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-07 01:30:47.736866: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-11-07 01:30:47.736884: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-07 01:30:47.754265: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022

## - Load Review Datasets

In [3]:
kr3 = load_dataset("Wittgensteinian/KR3", split='train')

Using custom data configuration Wittgensteinian--KR3-91924806189fc1d6
Found cached dataset parquet (/root/.cache/huggingface/datasets/Wittgensteinian___parquet/Wittgensteinian--KR3-91924806189fc1d6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [4]:
kr3 = kr3.remove_columns(['__index_level_0__'])

In [5]:
kr3

Dataset({
    features: ['Rating', 'Review'],
    num_rows: 641762
})

In [6]:
kr3.features

{'Rating': Value(dtype='int32', id=None),
 'Review': Value(dtype='string', id=None)}

- Remove ambiguous reviews whose rating is 2

In [7]:
kr3_binary = kr3.filter(lambda x: x['Rating'] != 2)

Loading cached processed dataset at /root/.cache/huggingface/datasets/Wittgensteinian___parquet/Wittgensteinian--KR3-91924806189fc1d6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-c212988a2a873f5e.arrow


In [8]:
kr3_binary

Dataset({
    features: ['Rating', 'Review'],
    num_rows: 459021
})

## - KoGPT2 Tokenizer & Model
    - GPT-2 trained on Korean corpus: https://github.com/SKT-AI/KoGPT2

In [9]:
# Load the KoGPT2 tokenizer and set '<pad>' as its padding token; the padding token is needed
# later, when the data collator pads the reviews of a batch to a common length.
tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2", pad_token='<pad>')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
model = AutoModelForCausalLM.from_pretrained("skt/kogpt2-base-v2")

- Vocab of tokenizer

In [11]:
len(tokenizer.vocab)

51201

## - Token Examples

In [12]:
idx = 0
# Index the row first and then pick the fields: `kr3_binary['Review'][idx]` would
# load the entire column just to read a single value.
example = kr3_binary[idx]
review = example['Review']
label = example['Rating']

In [13]:
review

'숙성 돼지고기 전문점입니다. 건물 모양 때문에 매장 모양도 좀 특이하지만 쾌적한 편이고 살짝 레트로 감성으로 분위기 잡아놨습니다. 모든 직원분들께서 전부 가능하다고 멘트 쳐주시며, 고기는 초반 커팅까지는 구워주십니다. 가격 저렴한 편 아니지만 맛은 준수합니다. 등심덧살이 인상 깊었는데 구이로 별로일 줄 알았는데 육향 짙고 얇게 저며 뻑뻑하지 않았습니다. 하이라이트는 된장찌개. 진짜 굿입니다. 버터 간장밥, 골뱅이 국수 등 나중에 더 맛봐야 할 것들은 남겨뒀습니다.'

In [14]:
tokenized_review = tokenizer(review, return_tensors='pt')

In [15]:
tokenized_review

{'input_ids': tensor([[44381, 26367,  6958, 10161,  8191, 21154, 10637,  9777,  9355, 13669,
          9777,  7235, 11732, 15846, 11686, 43752,  9266,  9466, 20387, 10286,
         11714,  9244, 12041, 33684, 13364,  7130, 16691,  9548, 18401,  7671,
          7285, 23916, 17483,  9826, 12524,   739, 18221, 13673,  8236,  7888,
          9061,  9065,  9446, 18622, 10114,  8614, 12109, 26089,  8236,  7895,
         12521, 11562, 29932,  9266, 22804, 32837, 22033, 37194,  9030,  7894,
          7216, 16912, 15464,  9958, 16693,  9073, 11434, 15126,  8149,  9566,
          9181, 31231,  9719,  8721, 14591,  6889, 25446,  9265,  7530,   739,
          7723,  7723,  9328, 10171, 16691,  9078,  9131, 51000,  9498,  8168,
          8326,  6841,   389, 23971, 15669, 21154,  9848,  8539, 49375,  7605,
           387, 10187,  7616,  8146,  9092,  7847,  9030, 13348,  9267, 11355,
          7661,  7991,  9337, 24860, 18525,  7268, 16691]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

- Feed the model with the tokenized review, and the model gives an output

In [16]:
# Forward pass: `input_ids` and `attention_mask` are unpacked as keyword arguments.
# NOTE(review): autograd is enabled here (the tensors printed below carry a grad_fn);
# wrapping the call in torch.no_grad() would avoid that for pure inference.
y = model(**tokenized_review)

- The output tensor holds the logits, i.e. the unnormalized (pre-softmax) scores, of every vocabulary token being the next token after the end of the review

In [17]:
# Logits of the first (and only) sequence in the batch, taken at its last position.
prediction = y.logits[0][-1]

In [18]:
prediction

tensor([-6.5961, -6.9506, -5.8478,  ..., -1.8356, -5.1506, -3.0455],
       grad_fn=<SelectBackward0>)

In [19]:
prediction.argmax()

tensor(9394)

In [20]:
tokenizer.decode(prediction.argmax())

'그리고'

## - Compare model's score of *positive* and *negative* tokens

- Select a pair of tokens to represent *positive* and *negative* respectively.

In [21]:
print(tokenizer.tokenize('최고입니다')) 
print(tokenizer.tokenize('별로입니다')) 

['▁최고', '입', '니', '다']
['▁별로', '입', '니', '다']


In [22]:
print(tokenizer.encode('최고'))
print(tokenizer.encode('별로'))

[10281]
[15126]


In [23]:
prediction[10281]

tensor(7.1346, grad_fn=<SelectBackward0>)

In [24]:
prediction[15126]

tensor(5.9310, grad_fn=<SelectBackward0>)

- label==1 is for positive reviews, and 0 is for negative reviews
- The score assigned to the token 'Best' (최고) is higher than the score assigned to the token 'Not good' (별로)
- We can predict whether the input review is *positive* or *negative* by comparing the model's scores for the 'Best' and 'Not good' tokens, as follows

In [25]:
# 10281 and 15126 are the token ids printed above for '최고' and '별로'.
# The comparison is True when the positive token scores higher; it is then matched against the label.
((prediction[10281] > prediction[15126]) == label).item()

True

## - Tokenization
- *truncation* is when you truncate(=cut) the input text because it's too long (i.e. it exceeds the max_length).
- *padding* is when you add extra tokens at the end of the input text so that all texts in a batch have the same length. We do not pad here. Instead, we set dynamic padding when we create the PyTorch DataLoader.

In [26]:
def tokenize_func(x):
    """Tokenize the `Review` field of a batch, truncating to at most 256 tokens.

    No padding is applied here; batches are padded later by the data collator.
    """
    reviews = x['Review']
    return tokenizer(reviews, truncation=True, max_length=256)

# batched=True hands `tokenize_func` many rows at once.
kr3_tokenized = kr3_binary.map(tokenize_func, batched=True)

  0%|          | 0/460 [00:00<?, ?ba/s]

Check the new features: `attention_mask` and `input_ids`. These are the parameters for the model(GPT-2).

In [27]:
kr3_tokenized

Dataset({
    features: ['Rating', 'Review', 'input_ids', 'attention_mask'],
    num_rows: 459021
})

Now that we do not need the feature `Review`, we remove it. Plus, we set the format of this dataset as 'torch', as we're going to use PyTorch.

In [28]:
# Drop the raw text column and have rows returned as torch tensors.
kr3_tokenized = kr3_tokenized.remove_columns(['Review']).with_format('torch')
kr3_tokenized

Dataset({
    features: ['Rating', 'input_ids', 'attention_mask'],
    num_rows: 459021
})

## - KoGPT2 Inference using PyTorch

- We make a PyTorch dataloader. See the example batch.  
    - `Rating[i]` represents the label (0 or 1) for (*i+1*)th review in the batch.
    - `attention_mask[i]` and `input_ids[i]` are the tokenized (*i+1*)th review in the batch.

- Dynamic Padding
> We set dynamic padding using `DataCollatorWithPadding` from `transformers`. Dynamic padding pads to the longest sequence in the batch, instead of padding to certain fixed length. In the example below, we can deduce that the longest sequence in the batch had a length of 117.

In [29]:
batch_size = 8

In [30]:
# The collator pads every batch to the length of its longest sequence (dynamic padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_loader = DataLoader(
    dataset=kr3_tokenized,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [31]:
# Pull the first batch and show the shape of each field.
batch = next(iter(data_loader))
print({name: tensor.size() for name, tensor in batch.items()})

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'Rating': torch.Size([8]), 'input_ids': torch.Size([8, 117]), 'attention_mask': torch.Size([8, 117])}


In [32]:
# Pick the device for inference. `device` is always defined: without the CPU
# fallback, the later `model.to(device)` raises NameError on machines with no GPU.
# (torch is already imported in the import cell at the top of the notebook.)
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU available')
else:
    device = torch.device('cpu')
    print('GPU not ready')

GPU available


In [33]:
model = model.to(device)

- We predict the sentiment of each review via inference of GPT-2. This loop will take some time.

> `input_lens` represents the length of each input text. This is used to obtain the token predicted right after the input text. 

> If you run out of GPU memory, try to reduce `max_length` (in tokenization) or `batch_size`.

In [34]:
# Token ids used as sentiment probes: '최고' (positive) and '별로' (negative).
POSITIVE_TOKEN_ID = 10281
NEGATIVE_TOKEN_ID = 15126

# confusion_matrix[true_label][predicted_label] counts the reviews.
confusion_matrix = [[0,0],[0,0]]

# Inference only: disable autograd so no computation graph is kept alive for the
# vocabulary-sized logits of every batch, which lowers GPU memory use.
with torch.no_grad():
    for batch in tqdm(data_loader):
        batch = {k:v.to(device) for k,v in batch.items()} # move the data to `device`
        y = model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask']) # forward
        input_lens = batch['attention_mask'].sum(axis=1) # number of real (non-padding) tokens per review

        for i in range(len(batch['Rating'])):
            # Logits at the last real token, i.e. the scores for the token that would follow the review.
            # NOTE(review): assumes padding is appended on the right — confirm the tokenizer's padding side.
            next_token_prediction = y.logits[i, input_lens[i]-1]

            # prediction result: positive (True -> index 1) when the positive token scores higher
            predicted_label = (next_token_prediction[POSITIVE_TOKEN_ID] > next_token_prediction[NEGATIVE_TOKEN_ID]).item()
            true_label = batch['Rating'][i].item()
            confusion_matrix[true_label][predicted_label] += 1

  0%|          | 0/57378 [00:00<?, ?it/s]

## - Confusion Matrix

In [35]:
import numpy as np
# Rows index the true label and columns the predicted label (0 = negative, 1 = positive).
confusion_np_matrix = np.array(confusion_matrix)
confusion_np_matrix

array([[ 33148,  37762],
       [ 76503, 311608]])

In [36]:
# Unpack the 2x2 matrix: rows are true labels, columns are predicted labels.
(true_neg, false_pos), (false_neg, true_pos) = confusion_np_matrix
total = confusion_np_matrix.sum()

print('Accuracy:', (true_neg + true_pos) / total)
print('Precision for positive:', true_pos / (false_pos + true_pos))
print('Precision for negative:', true_neg / (true_neg + false_neg))
print('Recall for positive:', true_pos / (false_neg + true_pos))
print('Recall for negative:', true_neg / (true_neg + false_pos))

Accuracy: 0.7510680339243738
Precision for positive: 0.8919140166585569
Precision for negative: 0.30230458454551257
Recall for positive: 0.8028837111032668
Recall for negative: 0.46746580172049074
