<a href="https://colab.research.google.com/github/soopark0221/korean_sentiment_analysis/blob/main/Naver_sentiment_kobert_psy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load dataset

In [None]:
pip install transformers 

In [None]:
!git clone https://github.com/SKTBrain/KoBERT.git
%cd KoBERT
!pip install -r requirements.txt
!pip install .

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing import sequence

from transformers import BertModel, BertTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

from kobert.pytorch_kobert import get_pytorch_kobert_model
from kobert.utils import get_tokenizer
import gluonnlp as nlp
import sklearn.metrics

In [None]:
# Load the tab-separated ratings file from Google Drive.
# Rows whose review text is missing are dropped here: later cells call str()
# on every document, which would otherwise turn a missing value into the
# literal review text "nan" and train on it.
train_data = (
    pd.read_csv('/content/drive/MyDrive/BERT/ratings_train.txt', sep='\t')
    .dropna(subset=['document'])
    .reset_index(drop=True)
)
X = train_data.document
Y = train_data.label

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

print('train_data : {}. test_data : {}'.format(len(x_train), len(x_test)))

In [None]:
# Materialise the review text as lists of plain Python strings (str() coerces
# any non-string entry) and the labels as NumPy arrays.
x_train = list(map(str, x_train))
x_test = list(map(str, x_test))
y_train, y_test = np.array(y_train), np.array(y_test)

Check class imbalance

In [None]:
# Count how many reviews carry each label to see whether the classes are balanced.
count = train_data.label.value_counts()
print(count)

x = np.array(count.index)
y = np.array(count.values)
plt.figure(figsize=(8,5))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=x, y=y)
plt.xlabel('Emotion')
# Trailing semicolon suppresses the Text(...) repr in the cell output.
plt.ylabel('Number of sentences');

Check review length

In [None]:
# Character length of every review; str() lets non-string entries be measured too.
review_len = list(map(len, map(str, train_data.document)))

# Histogram first, then summary statistics as the cell's displayed value.
length_series = pd.Series(review_len)
length_series.hist()
plt.show()
length_series.describe()

# Preprocessing

In [None]:
# Length passed to the sentence transform below (its second positional argument).
MAX_LEN = 40

# Pretrained KoBERT encoder together with its vocabulary.
kobertmodel, vocab = get_pytorch_kobert_model()

# Tokenizer built from KoBERT's tokenizer and the vocabulary; lower-casing is disabled.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Single-sentence (pair=False) transform with padding enabled. Its first output
# is the token-id sequence consumed by Convert_to_id.
transform = nlp.data.BERTSentenceTransform(tok, MAX_LEN, pad=True, pair=False)

In [None]:
def Convert_to_id(data):
  """Turn raw sentences into padded token-id sequences and attention masks.

  Each sentence is passed through the module-level `transform`
  (a GluonNLP `BERTSentenceTransform`), which yields
  `(input_ids, valid_length, segment_ids)`.

  Args:
    data: iterable of sentences (strings).

  Returns:
    (ids, masks): two lists of equal length. `ids[k]` is the padded token-id
    sequence of the k-th sentence; `masks[k]` is a list of 0/1 ints with 1 on
    the first `valid_length` positions (real tokens) and 0 on padding.
  """
  ids = []
  masks = []
  for sentence in data:
    token_ids, valid_length, _segment_ids = transform([sentence])
    n_real = int(valid_length)
    # The mask comes from the reported valid length rather than from a
    # threshold on the id values. A test such as `id > 1` also zeroes any
    # genuine token whose id is <= 1 (in KoBERT's vocabulary that is
    # expected to include the unknown token).
    masks.append([int(pos < n_real) for pos in range(len(token_ids))])
    ids.append(token_ids)
  return ids, masks

In [None]:
def Get_dataloader(x, y, batchsize):
  """Tokenize `x` and wrap ids, labels and attention masks in a shuffled DataLoader.

  Args:
    x: iterable of sentences, forwarded to `Convert_to_id`.
    y: labels aligned with `x`.
    batchsize: number of samples per batch.

  Returns:
    A DataLoader yielding `(input_ids, labels, attention_mask)` batches in
    random order. All tensors are moved to the GPU up front, so CUDA must be
    available.
  """
  # Tokenize once and reuse both outputs; calling Convert_to_id per tensor
  # would run the tokenizer over the whole dataset twice.
  ids, masks = Convert_to_id(x)
  # Stack into one ndarray first: building a tensor from a list of separate
  # arrays is much slower.
  x_tensor = torch.tensor(np.array(ids), dtype=torch.long)
  a_tensor = torch.tensor(np.array(masks), dtype=torch.long)
  y_tensor = torch.tensor(y, dtype=torch.long)
  my_dataset = TensorDataset(x_tensor.cuda(), y_tensor.cuda(), a_tensor.cuda())
  my_dataloader = DataLoader(my_dataset, batch_size=batchsize, sampler=RandomSampler(my_dataset))
  return my_dataloader

In [None]:
# Mini-batch size shared by the training and held-out loaders.
batchsize = 16

# Both splits go through the same tokenization + DataLoader construction.
train_dataloader = Get_dataloader(x=x_train, y=y_train, batchsize=batchsize)
test_dataloader = Get_dataloader(x=x_test, y=y_test, batchsize=batchsize)

# Modeling

In [None]:
class koModel(nn.Module):
  """BERT encoder followed by a single linear classification layer.

  Args:
    kobertmodel: encoder module. It is called with `input_ids` and
      `attention_mask` keyword arguments, and element [1] of its output
      (the pooled output) is fed to the classifier.
    hidden_size: width of the pooled output (default 768).
    num_classes: number of logits to produce (default 2).
  """

  def __init__(self, kobertmodel, hidden_size=768, num_classes=2):
    super().__init__()
    # Attribute names are kept as-is so previously saved state_dicts still load.
    self.bert_model = kobertmodel
    self.linear = torch.nn.Linear(hidden_size, num_classes)

  def forward(self, input_tensor, attention_masks):
    """Return unnormalised class logits of shape (batch, num_classes).

    Args:
      input_tensor: token ids, forwarded as `input_ids`.
      attention_masks: attention mask, forwarded as `attention_mask`.
    """
    # The encoder output is (sequence_output, pooled_output, ...); take the pooled one.
    pooler = self.bert_model(input_ids=input_tensor, attention_mask=attention_masks)[1]
    logit = self.linear(pooler)
    return logit

In [None]:
# Wrap the pretrained encoder in the classifier and move all parameters to the GPU.
model = koModel(kobertmodel)
model.to('cuda')

# Training

In [None]:
from torch.optim.lr_scheduler import StepLR

# Initial training configuration. The following cell assigns `epochs`, `lr`,
# `criterion` and `optimizer` again, so those later values are the ones the
# training loop sees.
epochs = 4
lr = 1e-5
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Halve the learning rate of this cell's optimizer on every scheduler.step().
scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

In [None]:
# Training configuration actually used by the training loop below.
epochs = 4
lr = 0.000005

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr = lr)
# The scheduler must be rebuilt whenever the optimizer is: a scheduler created
# earlier keeps adjusting the optimizer it was constructed with, so its
# step() calls (and the LR it reports) would have no effect on this one.
scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

In [None]:
# One training pass followed by one evaluation pass on the held-out split per epoch.
for epoch in range(epochs):
  # ---- training ----
  model.train()
  train_loss, train_seen = 0, 0
  for x, y, a in train_dataloader:
    optimizer.zero_grad()  # clear gradients left over from the previous step
    out = model(x, a)
    loss = criterion(out, y)
    loss.backward()        # backprop: compute gradients
    optimizer.step()       # update weights
    # criterion returns the mean loss of the batch, so weight it by the batch
    # size before summing; dividing by the sample count then gives a true
    # per-sample average.
    train_loss += loss.item() * y.size(0)
    train_seen += y.size(0)
  avg_train_loss = train_loss/train_seen
  print('epoch : {}'.format(epoch+1))
  print('avg train loss : {}'.format(avg_train_loss))
  scheduler.step()
  print('LR:', scheduler.get_last_lr())

  # ---- evaluation ----
  model.eval()
  correct, total, test_loss = 0, 0, 0
  pred = []
  true = []
  with torch.no_grad():
    for x_t, y_t, a_t in test_dataloader:
      testout = model(x_t, a_t)
      testloss = criterion(testout, y_t)
      _, predicted = torch.max(testout, dim=1)  # torch.max returns (values, indices)
      pred += predicted.tolist()
      true += y_t.tolist()
      total += y_t.size(0)
      correct += (predicted == y_t).sum().item()
      test_loss += testloss.item() * y_t.size(0)
  accuracy = correct/total
  f1 = sklearn.metrics.f1_score(true, pred, average='binary')
  recall = sklearn.metrics.recall_score(true, pred, average='binary')
  avg_test_loss = test_loss/total
  print('avg test loss : {}'.format(avg_test_loss))
  print('accuracy: {}'.format(accuracy))
  print('f1: {}'.format(f1))
  print('recall: {}'.format(recall))
  print('----------------------------------------')

Save the trained model weights

In [None]:
# Persist only the parameters (state_dict), not the whole module object, to Google Drive.
model_path = '/content/drive/MyDrive/BERT/naverr_kobert_weights.pt'
weights = model.state_dict()
torch.save(weights, model_path)

# Test

In [None]:
import pandas as pd
test = pd.read_csv('/content/drive/MyDrive/BERT/ko_data_2.csv', engine='python')

pred_list =[]
# NOTE(review): the sentences are read from the `Id` column — confirm that this
# is really the text column of ko_data_2.csv.
# str() mirrors what is done to the training text, so a missing or non-string
# cell is tokenized instead of breaking the tokenizer.
sent_test = [str(sent) for sent in test.Id]
trans = [transform([i]) for i in sent_test]

In [None]:
# Predict a label for every test sentence.
# Ids and masks come from Convert_to_id so inference uses exactly the same
# preprocessing as training.
test_ids, test_masks = Convert_to_id(sent_test)
ids_tensor = torch.tensor(np.array(test_ids), dtype=torch.long)
mask_tensor = torch.tensor(np.array(test_masks), dtype=torch.long)

infer_batch = 64  # sentences per forward pass; only affects speed/memory

pred_list = []
model.eval()  # do not rely on whatever mode an earlier cell left the model in
with torch.no_grad():  # no gradients needed, so no autograd graph is kept
  for start in range(0, len(sent_test), infer_batch):
    batch_ids = ids_tensor[start:start + infer_batch].cuda()
    batch_mask = mask_tensor[start:start + infer_batch].cuda()
    output = model(batch_ids, batch_mask)
    _, predicted = torch.max(output, dim=1)  # index of the largest logit
    pred_list += predicted.tolist()

In [None]:
# Collect the predictions in a single-column frame and write it to Drive.
# The default row index is written as the first CSV column.
pred = pd.DataFrame({'Predicted': pred_list}, columns=['Predicted'])
pred.to_csv('/content/drive/MyDrive/BERT/sample_naver.csv')