### BERT with sentiment analysis

In [1]:
## ref 
# english : https://skimai.com/fine-tuning-bert-for-sentiment-analysis/
# korean : https://github.com/monologg/KoBERT-nsmc

In [1]:
import os
import sys
import logging
import math
import numpy as np
from tqdm import tqdm, trange
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import random_split, TensorDataset, DataLoader, RandomSampler, SequentialSampler
import json, copy

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging.getLogger(__name__)

In [2]:
from transformers import BertConfig, BertForSequenceClassification 
from tokenization_kobert import KoBertTokenizer
pretrained_model_name='monologg/kobert'
tokenizer = KoBertTokenizer.from_pretrained(pretrained_model_name)
tokenizer.tokenize('무리뉴')

SyntaxError: invalid syntax (<ipython-input-2-9389b6ac3cb0>, line 2)

In [2]:
from transformers import BertConfig, BertForSequenceClassification 
from transformers import BertTokenizerFast
pretrained_model_name='kykim/bert-kor-base'
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name)
tokenizer.tokenize('맛있는 거 먹구 싶어')

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /kykim/bert-kor-base/resolve/main/vocab.txt HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /kykim/bert-kor-base/resolve/main/tokenizer.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /kykim/bert-kor-base/resolve/main/added_tokens.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /kykim/bert-kor-base/resolve/main/special_tokens_map.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /kykim/b

['맛있는', '거', '먹구', '싶어']

In [3]:
import random
import numpy as np
def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

In [129]:
# make training dataset
model_name=''
cache_dir='./cache'
model_dir='./model'
version='model_v1'
model_save_path=os.path.join(model_dir, version)
data_dir='./data'
train_file='ratings_train.txt'
test_file='ratings_test.txt'
max_seq_len=100
# no weight decay, gradient accumulation steps

train_batch_size=10
dev_batch_size=32
test_batch_size=32

num_train_epochs=5
learning_rate=5e-5
max_grad_norm=1.0
weight_decay=0.0
save_steps=2000
adam_epsilon=1e-8


In [5]:
# read files or read from cache and make train_dataset, test_dataset  and save cache

cached_train_filename='cached_{}_{}'.format(pretrained_model_name.replace('/','-'), train_file.split('.')[0])
cached_test_filename='cached_{}_{}'.format(pretrained_model_name.replace('/','-'), test_file.split('.')[0])
cached_train_file=os.path.join(cache_dir, cached_train_filename)
cached_test_file=os.path.join(cache_dir, cached_test_filename)
train_data_file=os.path.join(data_dir, train_file)
test_data_file=os.path.join(data_dir, test_file)

#### convert trainfile into features

In [6]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, attention_mask, token_type_ids, label_id):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.label_id = label_id

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

In [7]:
if os.path.exists(cached_train_file):
    logger.info("Loading features from cached file %s", cached_train_file)
    features=torch.load(cached_train_file)
else:
    logger.info("Creating features from dataset file at %s with tokenizer %s", data_dir, model_name)
    lines=[]
    with open(train_data_file, 'r', encoding='utf-8') as f:
        for line in f:
            lines.append(line.strip())
    examples=[]
    for i, line in enumerate(lines[1:]):
        splits=line.split('\t')
        assert len(splits)>2, 'wrong length at {}th line {}'.format(i, line)
        examples.append(('train_'+str(i), splits[1], int(splits[2])))
        if i%5000==0:
            logger.info(line)

    # features = convert_into_features(cached_train_file,max_seq_len, tokenizer)


    cls_token=tokenizer.cls_token
    sep_token=tokenizer.sep_token
    pad_token_id=tokenizer.pad_token_id

    features=[]
    for (ex_idx, example) in enumerate(examples):
        if ex_idx % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_idx, len(examples)))
        tokens=tokenizer.tokenize(example[1])

        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]


        tokens=[cls_token]+tokens+[sep_token]
        token_type_ids=[0]*len(tokens)

        input_ids=tokenizer.convert_tokens_to_ids(tokens)

        attention_mask=[1]*len(input_ids)

        padding_length=max_seq_len-len(input_ids)

        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        features.append(InputFeatures(input_ids, attention_mask, token_type_ids, example[2]))
        if ex_idx < 5:
            logger.info("*** Example ***")
            logger.info("idx: %s" % example[0])
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s" % example[2])
        
    logger.info("Saving features into cached train file %s", cached_train_file)
    torch.save(features, cached_train_file)

INFO:__main__:Loading features from cached file ./cache/cached_kykim-bert-kor-base_ratings_train


#### set seed for reproducibility

In [8]:
set_seed()

In [9]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

train_dev_dataset = TensorDataset(all_input_ids, all_attention_mask,
                        all_token_type_ids, all_label_ids)

sample_rate=0.01
dev_size = math.ceil(0.2*len(train_dev_dataset))
train_size=len(train_dev_dataset)-dev_size

train_dataset, dev_dataset = random_split(train_dev_dataset, [train_size, dev_size])
# dev_size=math.ceil(sample_rate*dev_size)
# train_size=math.ceil(sample_rate*train_size)
# rest_size=len(train_dev_dataset)-train_size-dev_size
# train_dataset, dev_dataset, rest_dataset = random_split(train_dev_dataset, [train_size, dev_size, rest_size])



In [33]:
all_input_ids.shape

torch.Size([50000, 100])

#### convert testfile into features

In [10]:
if os.path.exists(cached_test_file):
    logger.info("Loading features from cached test file %s", cached_test_file)
    features=torch.load(cached_test_file)
else: 
    logger.info("Creating features from dataset file at %s with tokenizer %s", data_dir, model_name)
    lines=[]
    with open(test_data_file, 'r', encoding='utf-8') as f:
        for line in f:
            lines.append(line.strip())
    examples=[]
    for i, line in enumerate(lines[1:]):
        splits=line.split('\t')
        assert len(splits)>2, 'wrong length at {}th line {}'.format(i, line)
        examples.append(('test_'+str(i), splits[1], int(splits[2])))
        if i%5000==0:
            logger.info(line)
    features=[]
    for (ex_idx, example) in enumerate(examples):
        if ex_idx % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_idx, len(examples)))
        tokens=tokenizer.tokenize(example[1])

        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]


        tokens=[cls_token]+tokens+[sep_token]
        token_type_ids=[0]*len(tokens)

        input_ids=tokenizer.convert_tokens_to_ids(tokens)

        attention_mask=[1]*len(input_ids)

        padding_length=max_seq_len-len(input_ids)

        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        features.append(InputFeatures(input_ids, attention_mask, token_type_ids, example[2]))
        
        if ex_idx < 5:
            logger.info("*** Example ***")
            logger.info("idx: %s" % example[0])
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s" % example[2])
        
    logger.info("Saving features into cached test file %s", cached_test_file)
    torch.save(features, cached_test_file)

INFO:__main__:Loading features from cached test file ./cache/cached_kykim-bert-kor-base_ratings_test


In [11]:
from torch.utils.data import TensorDataset
all_input_ids=torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

test_dataset = TensorDataset(all_input_ids, all_attention_mask,
                        all_token_type_ids, all_label_ids)

In [25]:
all_input_ids.shape

torch.Size([50000, 100])

#### model loading

In [12]:
label_list=[0,1]
id2label={str(i):label for i, label in enumerate(label_list)}
label2id={label:i for i, label in enumerate(label_list)}

In [13]:
model_config=BertConfig.from_pretrained(pretrained_model_name,
                                       num_labels=len(label_list),
                                       finetuning_task='nsmc',
                                       id2label=id2label,
                                       label2id=label2id)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /kykim/bert-kor-base/resolve/main/config.json HTTP/1.1" 200 0


In [14]:
model=BertForSequenceClassification.from_pretrained(pretrained_model_name, config=model_config)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /kykim/bert-kor-base/resolve/main/pytorch_model.bin HTTP/1.1" 302 0


Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

In [15]:
device="cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(42000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### Model Training

In [16]:
train_sampler=RandomSampler(train_dataset)
train_dataloader=DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)

dev_sampler=SequentialSampler(dev_dataset)
dev_dataloader=DataLoader(dev_dataset, sampler=dev_sampler, batch_size=dev_batch_size)


In [17]:
from transformers import AdamW, get_linear_schedule_with_warmup
enable_weight_decay=False
t_total=len(train_dataloader)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer=AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
#     optimizer=AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)
scheduler=get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=t_total)

In [18]:
classifier_param=[(n,p) for n, p in model.named_parameters() if 'classifier' in n]

In [19]:
classifier_param

[('classifier.weight',
  Parameter containing:
  tensor([[-0.0033, -0.0223,  0.0195,  ..., -0.0075,  0.0172,  0.0187],
          [ 0.0212, -0.0007,  0.0149,  ..., -0.0155, -0.0101, -0.0068]],
         device='cuda:0', requires_grad=True)),
 ('classifier.bias',
  Parameter containing:
  tensor([0., 0.], device='cuda:0', requires_grad=True))]

In [20]:
global_step=0
total_tr_loss=0.0
logging_steps=1000
import time
logger.info(f'num examples {len(train_dataset)}')
logger.info(f'train batch size {train_batch_size}')
logger.info(f'num of train epochs : {num_train_epochs}')
logger.info(f'num of train steps each epoch : {len(train_dataloader)}')

for epoch in range(num_train_epochs):
    logger.info('epoch : %s', epoch)

    # train
    logger.info('train mode')
    model.train()
    train_step=0
    tr_loss=0.0
    start_t=time.time()
    model.zero_grad()
    for step, (input_ids, attention_masks, token_type_ids, labels) in enumerate(train_dataloader):
        input_ids=input_ids.to(device)
        attention_masks=attention_masks.to(device)
        token_type_ids=token_type_ids.to(device)
        labels=labels.to(device)
        outputs=model(input_ids=input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids, labels=labels)

        loss=outputs[0]

        loss.backward()

        tr_loss += loss.item()

        # every step
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        global_step+=1
        train_step+=1
        if train_step % logging_steps ==0:
            logger.info('train_step : %s, elapsed time[%s]', train_step, time.time()-start_t)
    tr_loss=tr_loss/train_step
    logger.info(f'train_loss : {tr_loss}')
    # eval
    logger.info('eval mode')
    eval_loss=0.0
    eval_step=0
    out_label_ids=None
    preds=None
    model.eval()
    for batch in dev_dataloader:
        batch=tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            tmp_eval_loss, logits =model(**inputs)
            #logger.info(type(tmp_eval_loss))
            eval_loss+=tmp_eval_loss.mean().item() #??
        eval_step+=1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
    eval_loss=eval_loss/eval_step
    preds = np.argmax(preds, axis=1)
    acc=(preds==out_label_ids).mean()
    logger.info(f'eval loss : {eval_loss}, acc : {acc}')
    model_to_save=model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(model_save_path)
    logger.info(f'model saved to {model_save_path}')


INFO:__main__:num examples 120000
INFO:__main__:train batch size 10
INFO:__main__:num of train epochs : 5
INFO:__main__:num of train steps each epoch : 12000
INFO:__main__:epoch : 0
INFO:__main__:train mode
INFO:__main__:train_step : 1000, elapsed time[420.3530158996582]
INFO:__main__:train_step : 2000, elapsed time[843.7727801799774]
INFO:__main__:train_step : 3000, elapsed time[1267.1378231048584]
INFO:__main__:train_step : 4000, elapsed time[1690.947556734085]
INFO:__main__:train_step : 5000, elapsed time[2114.791724920273]
INFO:__main__:train_step : 6000, elapsed time[2538.8696088790894]
INFO:__main__:train_step : 7000, elapsed time[2963.0130043029785]
INFO:__main__:train_step : 8000, elapsed time[3386.9477636814117]
INFO:__main__:train_step : 9000, elapsed time[3810.881938457489]
INFO:__main__:train_step : 10000, elapsed time[4234.661637067795]
INFO:__main__:train_step : 11000, elapsed time[4658.297646045685]
INFO:__main__:train_step : 12000, elapsed time[5082.092885971069]
INFO:_

In [42]:
test_dataset[1][0]

tensor([    2,  2047,  8167, 30738, 16545, 28382, 19973, 23059, 37460, 28382,
        21422,     3,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])

In [37]:
test_sampler=SequentialSampler(test_dataset)
test_dataloader=DataLoader(test_dataset, sampler=test_sampler, batch_size=test_batch_size)

### Testset Evaluation

In [48]:
eval_loss=0.0
eval_step=0
out_label_ids=None
preds=None
model.eval()
for batch in test_dataloader:
    batch=tuple(t.to(device) for t in batch)
    with torch.no_grad():
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'token_type_ids': batch[2],
            'labels': batch[3]
        }
        tmp_eval_loss, logits =model(**inputs)
#         logger.info(tmp_eval_loss.mean())
        eval_loss+=tmp_eval_loss.mean().item() #??
    eval_step+=1
    if preds is None:
        preds = logits.detach().cpu().numpy()
        out_label_ids = inputs['labels'].detach().cpu().numpy()
    else:
        preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
        out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
eval_loss=eval_loss/eval_step
preds = np.argmax(preds, axis=1)
acc=(preds==out_label_ids).mean()
logger.info(f'test loss : {eval_loss}, acc : {acc}')

INFO:__main__:test loss : 0.305619674174071, acc : 0.883


In [81]:
eval_loss=eval_loss/eval_step
logger.info(preds.shape)
logger.info(out_label_ids.shape)
preds = np.argmax(preds, axis=1)
acc=(preds==out_label_ids).mean()
logger.info(f'eval loss : {eval_loss}, acc : {acc}')

        

INFO:__main__:(300, 2)
INFO:__main__:(300,)
INFO:__main__:eval loss : 0.5587934672832489, acc : 0.7333333333333333


### Test Query

In [126]:
test_sentence="이 영화 정말 별로다"

tokens=tokenizer.tokenize(test_sentence)
special_tokens_count = 2
if len(tokens) > max_seq_len - special_tokens_count:
    tokens = tokens[:(max_seq_len - special_tokens_count)]
tokens=[tokenizer.cls_token]+tokens+[tokenizer.sep_token]
token_type_ids=[0]*len(tokens)
input_ids=tokenizer.convert_tokens_to_ids(tokens)
attention_mask=[1]*len(input_ids)

padding_length=max_seq_len-len(input_ids)

input_ids= input_ids + ([tokenizer.pad_token_id]* padding_length)
attention_mask = attention_mask + ([0] * padding_length)
token_type_ids = token_type_ids + ([0] * padding_length)

all_input_ids=torch.tensor([input_ids], dtype=torch.long)
all_attention_mask=torch.tensor([attention_mask], dtype=torch.long)
all_token_type_ids=torch.tensor([token_type_ids], dtype=torch.long)
#evaluation
encoding={
    'input_ids': all_input_ids,
    'attention_mask': all_attention_mask,
    'token_type_ids': all_token_type_ids
}
model.eval()
with torch.no_grad():
    encoding_={key:t.to(device) for key,t in encoding.items()}
    outputs=model(**encoding_)
    logits=outputs[0]
    preds=logits.detach().cpu().numpy()
    
from scipy.special import softmax
preds=softmax(preds)
logger.info(preds)

INFO:__main__:[[0.96186066 0.0381393 ]]


In [None]:
tokens=tokenizer.tokenize(example[1])



tokens=[cls_token]+tokens+[sep_token]
token_type_ids=[0]*len(tokens)

input_ids=tokenizer.convert_tokens_to_ids(tokens)

attention_mask=[1]*len(input_ids)

padding_length=max_seq_len-len(input_ids)

input_ids = input_ids + ([tokenizer.pad_token_id] * padding_length)
attention_mask = attention_mask + ([0] * padding_length)
token_type_ids = token_type_ids + ([0] * padding_length)
features.append(InputFeatures(input_ids, attention_mask, token_type_ids, example[2]))
