# 네이버 영화 댓글 분석 및 긍정/부정 예측

### 총 200,000개의 리뷰로 구성된 데이터셋입니다.

### 해당 텍스트가 긍정인지, 부정인지를 학습하고 예측하기 위한 노트북입니다.

* 데이터 전처리에 대한 아이디어의 일부는 도서 "딥러닝을 통한 자연어 처리 입문" 에서 가져왔습니다.

* 데이터 출처 : https://github.com/e9t/nsmc/

In [1]:
!pip install mxnet
!pip install gluonnlp
!pip install sentencepiece
!pip install tensorflow
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-r54t2cxs/kobert-tokenizer_ce6054d882b74241aef4837a86f33803
  Running command git clone -q https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-r5

In [2]:
# 프로젝트에 필요한 모듈

import datetime
import time
import random

# 데이터 분석과 랭글링
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request
from tqdm import tqdm, tqdm_notebook

# machine learning
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from transformers import BertModel
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split
import gluonnlp as nlp

# BERT 한국어 성능향상 koBERT
from kobert_tokenizer import KoBERTTokenizer


ModuleNotFoundError: No module named 'kobert_tokenizer'

In [3]:
# 데이터셋을 불러옵니다
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

train = pd.read_table('ratings_train.txt')
test = pd.read_table('ratings_test.txt')
print(f'numbers of train data: {len(train)}')
print(f'numbers of test data: {len(test)}')

numbers of train data: 150000
numbers of test data: 50000


In [4]:
# 데이터 확인
print(train.head())

         id                                           document  label
0   9976970                                아 더빙.. 진짜 짜증나네요 목소리      0
1   3819312                  흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나      1
2  10265843                                  너무재밓었다그래서보는것을추천한다      0
3   9045019                      교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정      0
4   6483659  사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...      1


In [5]:
train['label'].value_counts(normalize=True)

0    0.501153
1    0.498847
Name: label, dtype: float64

In [6]:
# id column 은 사용하지 않습니다, 큰 의미가 없다고 판단

train.drop(['id'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

In [7]:
# 중복 데이터 확인

train.duplicated().sum(), test.duplicated().sum()

(3659, 794)

In [8]:
# 중복 데이터 제거

train.drop_duplicates(inplace=True)
test.drop_duplicates(inplace=True)
train.shape, test.shape

((146341, 2), (49206, 2))

In [9]:
train.isnull().sum()

document    2
label       0
dtype: int64

In [10]:
train = train.dropna(how = 'any')

In [11]:
# erase content of documents except the korean&space
train['document'] = train['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
train[:5]

  train['document'] = train['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")


Unnamed: 0,document,label
0,아 더빙 진짜 짜증나네요 목소리,0
1,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,너무재밓었다그래서보는것을추천한다,0
3,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0
4,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1


In [12]:
train['document'] = train['document'].str.replace('^ +', "") # white space 데이터를 empty value로 변경
train['document'].replace('', np.nan, inplace=True)
print(train.isnull().sum())

document    819
label         0
dtype: int64


  train['document'] = train['document'].str.replace('^ +', "") # white space 데이터를 empty value로 변경


In [13]:
# 데이터 다시 확인
train.shape, test.shape

((146339, 2), (49206, 2))

In [14]:
# CLS : classifier
# SEP : separator
document_bert = ["[CLS] " + str(s) + " [SEP]" for s in train['document']]

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(s) for s in document_bert]
print(tokenized_texts[0])

['[CLS]', '아', '더', '##빙', '진', '##짜', '짜', '##증', '##나', '##네', '##요', '목', '##소', '##리', '[SEP]']


In [16]:
MAX_LEN = max([len(s) for s in tokenized_texts]) + 1
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')
input_ids[0]

array([   101,   9519,   9074, 119005,   9708, 119235,   9715, 119230,
        16439,  77884,  48549,   9284,  22333,  12692,    102,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
      

In [17]:
attention_masks = []

for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)
    
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [18]:
train_inputs, validation_inputs, train_labels, validation_labels = \
train_test_split(input_ids, train['label'].values, random_state=42, test_size=0.1)

train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                       random_state=42, 
                                                       test_size=0.1)

In [19]:
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

In [20]:
BATCH_SIZE = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)
# colate_fn 추가

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

In [21]:
# 테스트 데이터 전처리

test['document'] = test['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
test['document'] = test['document'].str.replace('^ +', "") # white space 데이터를 empty value로 변경
test['document'].replace('', np.nan, inplace=True)
test = test.dropna(how = 'any')

sentences = test['document']
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
labels = test['label'].values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

  test['document'] = test['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
  test['document'] = test['document'].str.replace('^ +', "") # white space 데이터를 empty value로 변경


In [22]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM3-32GB


In [23]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.cuda()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [24]:
# 옵티마이저 설정
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

# 에폭수
epochs = 4

# 총 훈련 스텝
total_steps = len(train_dataloader) * epochs

# lr 조금씩 감소시키는 스케줄러
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)



In [25]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# 시간 표시 함수
def format_time(elapsed):
    # 반올림
    elapsed_rounded = int(round((elapsed)))
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [26]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 그래디언트 초기화
model.zero_grad()

# 에폭만큼 반복
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # Forward 수행                
        # **batch
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        
        # 그래디언트 계산 안함
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # 로스 구함
        logits = outputs[0]

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch   500  of  4,116.    Elapsed: 0:01:46.
  Batch 1,000  of  4,116.    Elapsed: 0:03:32.
  Batch 1,500  of  4,116.    Elapsed: 0:05:18.
  Batch 2,000  of  4,116.    Elapsed: 0:07:04.
  Batch 2,500  of  4,116.    Elapsed: 0:08:50.
  Batch 3,000  of  4,116.    Elapsed: 0:10:36.
  Batch 3,500  of  4,116.    Elapsed: 0:12:22.
  Batch 4,000  of  4,116.    Elapsed: 0:14:08.

  Average training loss: 0.41
  Training epcoh took: 0:14:32

Running Validation...
  Accuracy: 0.85
  Validation took: 0:00:29

Training...
  Batch   500  of  4,116.    Elapsed: 0:01:46.
  Batch 1,000  of  4,116.    Elapsed: 0:03:32.
  Batch 1,500  of  4,116.    Elapsed: 0:05:18.
  Batch 2,000  of  4,116.    Elapsed: 0:07:04.
  Batch 2,500  of  4,116.    Elapsed: 0:08:50.
  Batch 3,000  of  4,116.    Elapsed: 0:10:36.
  Batch 3,500  of  4,116.    Elapsed: 0:12:22.
  Batch 4,000  of  4,116.    Elapsed: 0:14:08.

  Average training loss: 0.31
  Training epcoh took: 0:14:33

Running Validation...
  Accura

In [27]:
t0 = time.time()

# 평가모드로 변경
model.eval()

# 변수 초기화
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# 데이터로더에서 배치만큼 반복하여 가져옴
for step, batch in enumerate(test_dataloader):
    # 경과 정보 표시
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_input_ids, b_input_mask, b_labels = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # 출력 로짓과 라벨을 비교하여 정확도 계산
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("Test took: {:}".format(format_time(time.time() - t0)))
bert_base_multilingual_cased_accuracy = eval_accuracy/nb_eval_steps

  Batch   100  of  1,528.    Elapsed: 0:00:06.
  Batch   200  of  1,528.    Elapsed: 0:00:13.
  Batch   300  of  1,528.    Elapsed: 0:00:19.
  Batch   400  of  1,528.    Elapsed: 0:00:26.
  Batch   500  of  1,528.    Elapsed: 0:00:32.
  Batch   600  of  1,528.    Elapsed: 0:00:39.
  Batch   700  of  1,528.    Elapsed: 0:00:45.
  Batch   800  of  1,528.    Elapsed: 0:00:51.
  Batch   900  of  1,528.    Elapsed: 0:00:58.
  Batch 1,000  of  1,528.    Elapsed: 0:01:04.
  Batch 1,100  of  1,528.    Elapsed: 0:01:11.
  Batch 1,200  of  1,528.    Elapsed: 0:01:17.
  Batch 1,300  of  1,528.    Elapsed: 0:01:23.
  Batch 1,400  of  1,528.    Elapsed: 0:01:30.
  Batch 1,500  of  1,528.    Elapsed: 0:01:36.

Accuracy: 0.86
Test took: 0:01:38


### koBERT

In [28]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [29]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))


In [30]:
# !wget https://www.dropbox.com/s/374ftkec978br3d/ratings_train.txt?dl=1
# !wget https://www.dropbox.com/s/977gbwh542gdy94/ratings_test.txt?dl=1

--2022-07-22 02:26:41--  https://www.dropbox.com/s/374ftkec978br3d/ratings_train.txt?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.80.18, 2620:100:6035:18::a27d:5512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.80.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/dl/374ftkec978br3d/ratings_train.txt [following]
--2022-07-22 02:26:42--  https://www.dropbox.com/s/dl/374ftkec978br3d/ratings_train.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uca0b167eceee67ed6c37e1bbc6a.dl.dropboxusercontent.com/cd/0/get/BpjfJ493TXSProoy0ywIZRkQkQfKA37UePWsG13c1erwPDmOMxB_44RFfVKBSRyToPTbMO0AFHQc5R93rszuWS7P7z7J-1OBglhPnoA7tNfFn4QYk8oits1cVxmuLSSeFEzU5Fq2ESvH8GcJ4tAQ_d3uEVIHARhddHL6yjPuLCbcGQ/file?dl=1# [following]
--2022-07-22 02:26:42--  https://uca0b167eceee67ed6c37e1bbc6a.dl.dropboxusercontent.com/cd/0/get/BpjfJ493TXSProoy0ywIZRkQkQfKA37UePWsG13c1erwPDmO

In [31]:
dataset_train = nlp.data.TSVDataset("ratings_train.txt", field_indices=[1,2], num_discard_samples=1)
dataset_test = nlp.data.TSVDataset("ratings_test.txt", field_indices=[1,2], num_discard_samples=1)


In [32]:
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [33]:
tok = tokenizer.tokenize

data_train = BERTDataset(dataset_train, 0, 1, tok, vocab, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, vocab, max_len, True, False)

In [34]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)



In [35]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [36]:
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [37]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [38]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [39]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [40]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    kobert_accuracy = test_acc / (batch_id+1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/2344 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.7344117760658264 train acc 0.515625
epoch 1 batch id 201 loss 0.48209065198898315 train acc 0.5621890547263682
epoch 1 batch id 401 loss 0.3977402448654175 train acc 0.6783821695760599
epoch 1 batch id 601 loss 0.38942602276802063 train acc 0.7293053244592346
epoch 1 batch id 801 loss 0.4626966118812561 train acc 0.7576857053682896
epoch 1 batch id 1001 loss 0.32754215598106384 train acc 0.7764423076923077
epoch 1 batch id 1201 loss 0.3939151167869568 train acc 0.7907863238967527
epoch 1 batch id 1401 loss 0.4163482189178467 train acc 0.8001985189150607
epoch 1 batch id 1601 loss 0.35314521193504333 train acc 0.8084790755777639
epoch 1 batch id 1801 loss 0.26871761679649353 train acc 0.8151374236535258
epoch 1 batch id 2001 loss 0.3085208833217621 train acc 0.8211909670164917
epoch 1 batch id 2201 loss 0.2953130602836609 train acc 0.8259810881417537
epoch 1 train acc 0.829478188993174


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/782 [00:00<?, ?it/s]

epoch 1 test acc 0.8865888746803069


  0%|          | 0/2344 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.4734460711479187 train acc 0.8125
epoch 2 batch id 201 loss 0.20949922502040863 train acc 0.8815298507462687
epoch 2 batch id 401 loss 0.25270670652389526 train acc 0.8816240648379052
epoch 2 batch id 601 loss 0.37719231843948364 train acc 0.8855553244592346
epoch 2 batch id 801 loss 0.3511367440223694 train acc 0.8884987515605494
epoch 2 batch id 1001 loss 0.31541603803634644 train acc 0.8906562187812188
epoch 2 batch id 1201 loss 0.20281028747558594 train acc 0.893161948376353
epoch 2 batch id 1401 loss 0.20634989440441132 train acc 0.8956437366167024
epoch 2 batch id 1601 loss 0.3473527133464813 train acc 0.8977787320424735
epoch 2 batch id 1801 loss 0.18530327081680298 train acc 0.8995176290949473
epoch 2 batch id 2001 loss 0.2726406753063202 train acc 0.901689780109945
epoch 2 batch id 2201 loss 0.23638248443603516 train acc 0.903410381644707
epoch 2 train acc 0.9049123649032993


  0%|          | 0/782 [00:00<?, ?it/s]

epoch 2 test acc 0.8952205882352942


  0%|          | 0/2344 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.4624304473400116 train acc 0.8125
epoch 3 batch id 201 loss 0.09981527924537659 train acc 0.9242848258706468
epoch 3 batch id 401 loss 0.18978601694107056 train acc 0.9256546134663342
epoch 3 batch id 601 loss 0.22931869328022003 train acc 0.9278026206322796
epoch 3 batch id 801 loss 0.2520446181297302 train acc 0.9303214731585518
epoch 3 batch id 1001 loss 0.2561132311820984 train acc 0.9318181818181818
epoch 3 batch id 1201 loss 0.17186960577964783 train acc 0.9338832223147377
epoch 3 batch id 1401 loss 0.09029442071914673 train acc 0.9350687009279086
epoch 3 batch id 1601 loss 0.19777844846248627 train acc 0.9368363522798251
epoch 3 batch id 1801 loss 0.13096904754638672 train acc 0.9385757912270961
epoch 3 batch id 2001 loss 0.15011174976825714 train acc 0.940279860069965
epoch 3 batch id 2201 loss 0.17940635979175568 train acc 0.9412482962289869
epoch 3 train acc 0.9423261518771331


  0%|          | 0/782 [00:00<?, ?it/s]

epoch 3 test acc 0.8972786125319693


  0%|          | 0/2344 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.5514039397239685 train acc 0.828125
epoch 4 batch id 201 loss 0.06348883360624313 train acc 0.956856343283582
epoch 4 batch id 401 loss 0.076243557035923 train acc 0.9580346009975063
epoch 4 batch id 601 loss 0.1895039826631546 train acc 0.9592866056572379
epoch 4 batch id 801 loss 0.20004978775978088 train acc 0.9599914169787765
epoch 4 batch id 1001 loss 0.06324191391468048 train acc 0.9612731018981019
epoch 4 batch id 1201 loss 0.06607352942228317 train acc 0.9625442339716903
epoch 4 batch id 1401 loss 0.08287683874368668 train acc 0.9634859029264811
epoch 4 batch id 1601 loss 0.11866708099842072 train acc 0.9642703778888195
epoch 4 batch id 1801 loss 0.0750623568892479 train acc 0.9652102998334259
epoch 4 batch id 2001 loss 0.1448424607515335 train acc 0.9661185032483758
epoch 4 batch id 2201 loss 0.08144963532686234 train acc 0.9664712062698774
epoch 4 train acc 0.967021295506257


  0%|          | 0/782 [00:00<?, ?it/s]

epoch 4 test acc 0.8970588235294118


  0%|          | 0/2344 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.28983157873153687 train acc 0.90625
epoch 5 batch id 201 loss 0.10696572065353394 train acc 0.9735696517412935
epoch 5 batch id 401 loss 0.057329241186380386 train acc 0.9750623441396509
epoch 5 batch id 601 loss 0.1353132724761963 train acc 0.9758735440931781
epoch 5 batch id 801 loss 0.12451830506324768 train acc 0.9761431023720349
epoch 5 batch id 1001 loss 0.010031292214989662 train acc 0.9768981018981019
epoch 5 batch id 1201 loss 0.11907986551523209 train acc 0.9774666944213156
epoch 5 batch id 1401 loss 0.017878200858831406 train acc 0.9776721984296931
epoch 5 batch id 1601 loss 0.04319653660058975 train acc 0.9781289038101186
epoch 5 batch id 1801 loss 0.09606559574604034 train acc 0.9786576901721266
epoch 5 batch id 2001 loss 0.03187907859683037 train acc 0.9790495377311345
epoch 5 batch id 2201 loss 0.03501778095960617 train acc 0.9790507155838255
epoch 5 train acc 0.9792288822525598


  0%|          | 0/782 [00:00<?, ?it/s]

epoch 5 test acc 0.8984574808184144
