In [None]:
! git clone https://github.com/simonjisu/nsmc_study.git

fatal: destination path 'nsmc_study' already exists and is not an empty directory.


In [None]:
!pip install -U torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn

import re
import random
import numpy as np
import pandas as pd

from torchtext.legacy import data
from sklearn.model_selection import train_test_split    
from copy import deepcopy

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
config = {
        "batch_size": 64,
        "gpu_id": 0,
        "rnn": True,
        "hidden_size": 256,
        "n_layers": 2,
        "cnn": True,
        "use_batch_norm": True,
        "window_size":[3, 4, 5],
        "n_filters":[100, 100, 100],
        "word_vector_size": 300,
        "dropout":0.3,
        "n_epochs":5,
        "verbose": 1,
        "model_fn": "./model.pth",
        "max_length":256
}

In [None]:
from konlpy.tag import Okt
tokenizer = Okt()

# utils

In [None]:
def preprocess_func(sentence):
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    sentence = sentence.strip()
    return sentence

In [None]:
def convert_dataset(input_data, text, label):
    list_of_example = [data.Example.fromlist(row.tolist(), fields=[('text', text), ('label', label)])  for _, row in input_data.iterrows()]
    dataset = data.Dataset(examples=list_of_example, fields=[('text', text), ('label', label)])
    return dataset

In [None]:
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=tokenizer.morphs, lower=False, batch_first=True, fix_length=50)
LABEL = data.LabelField(dtype = torch.float)
BATCH_SIZE = 64

train = pd.read_csv("/content/nsmc_study/data/ratings_train.txt", sep='\t')
test = pd.read_csv("/content/nsmc_study/data/ratings_test.txt", sep='\t')

train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

print(train.shape)
print(test.shape)

train_data = train.dropna() #말뭉치에서 nan 값을 제거함
test_data  = test.dropna()


train_data['document'] = train_data['document'].apply(preprocess_func)
test_data['document']  = test_data['document'].apply(preprocess_func)

# split data
train_data, valid_data = train_test_split(train_data, test_size=0.3, random_state=32)
print(len(train_data))
print(len(valid_data))
print(len(test_data))

print(train_data.shape)
print(valid_data.shape)  
print(test_data.shape)

train_data = convert_dataset(train_data, TEXT, LABEL)
valid_data = convert_dataset(valid_data, TEXT, LABEL)
test_data  = convert_dataset(test_data, TEXT, LABEL)


print(f'Number of training examples   : {len(train_data)}')
print(f'Number of validation examples : {len(valid_data)}')
print(f'Number of testing examples    : {len(test_data)}')


MAX_VOCAB_SIZE = 20000

# 단어 사전 생성
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

print(f"Unique tokens in TEXT vocabulary : {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

train_loader, valid_loader, test_loader = data.Iterator.splits(
  (train_data, valid_data, test_data),
  batch_size = BATCH_SIZE,
  sort = False,
  device = device)

print('Number of minibatch for training dataset   : {}'.format(len(train_loader)))
print('Number of minibatch for validation dataset : {}'.format(len(valid_loader)))
print('Number of minibatch for testing dataset    : {}'.format(len(test_loader)))

(150000, 2)
(50000, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['document'] = train_data['document'].apply(preprocess_func)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['document']  = test_data['document'].apply(preprocess_func)


104996
44999
49997
(104996, 2)
(44999, 2)
(49997, 2)
Number of training examples   : 104996
Number of validation examples : 44999
Number of testing examples    : 49997
Unique tokens in TEXT vocabulary : 20002
Unique tokens in LABEL vocabulary: 2
Number of minibatch for training dataset   : 1641
Number of minibatch for validation dataset : 704
Number of minibatch for testing dataset    : 782


#model

In [None]:
class RNNModel(nn.Module):
    
    def __init__(
        self,
        input_size,
        word_vector_size,
        hidden_size,
        n_classes,
        n_layers=4,
        dropout_p=0.4
    ):
        self.input_size = input_size
        self.word_vector_size = word_vector_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        
        super().__init__()
        
        self.emb = nn.Embedding(input_size, word_vector_size)
        self.rnn = nn.LSTM(
            input_size=self.word_vector_size,
            hidden_size=self.hidden_size,
            num_layers=self.n_layers,
            dropout=self.dropout_p,
            batch_first=True,
            bidirectional=True
        )
        
        self.linear_layer = nn.Linear(self.hidden_size*2, self.n_classes)
        self.softmax_layer = nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        # |x| = (batch_size, length)
        x = self.emb(x)
        # |x| = (batch_size, length, word_vector_size)
        x, _ = self.rnn(x)
        # |x| = (batch_size, length, hidden_size*2)
        x = self.linear_layer(x[:,-1])
        # |x| = (batch_size, n_classes)
        y = self.softmax_layer(x)
        # |x| = (batch_size, n_classes)
        
        return y
        

#trainer

In [None]:
class Trainer():

  def __init__(self, model, optimizer, crit):
      self.model = model
      self.optimizer = optimizer
      self.crit = crit

      super().__init__()

  def _train(self, train_loader, config):
      self.model.train()

      total_loss = 0

      for i, (x_i, y_i) in enumerate(train_loader):
          y_hat_i = self.model(x_i)
          loss_i = self.crit(y_hat_i, y_i.squeeze().long())

          # Initialize the gradients of the model.
          self.optimizer.zero_grad()
          loss_i.backward()

          self.optimizer.step()
          
          if config['verbose'] >= 2:
              print("Train Iteration(%d/%d): loss=%.4e" % (i + 1, len(train_loader), float(loss_i)))

          # Don't forget to detach to prevent memory leak.
          total_loss += float(loss_i)

      return total_loss / len(train_loader)

  def _validate(self, valid_loader, config):
      # Turn evaluation mode on.
      self.model.eval()

      # Turn on the no_grad mode to make more efficintly.
      with torch.no_grad():
          total_loss = 0

          for i, (x_i, y_i) in enumerate(valid_loader):
              y_hat_i = self.model(x_i)
              loss_i = self.crit(y_hat_i, y_i.squeeze().long())
              
              if config['verbose'] >= 2:
                  print("Valid Iteration(%d/%d): loss=%.4e" % (i + 1, len(valid_loader), float(loss_i)))

              total_loss += float(loss_i)

          return total_loss / len(valid_loader)

  def train(self, train_loader, valid_loader, config):
      lowest_loss = np.inf
      best_model = None

      for epoch_index in range(config['n_epochs']):
          train_loss = self._train(train_loader, config)
          valid_loss = self._validate(valid_loader, config)

          # You must use deep copy to take a snapshot of current best weights.
          if valid_loss <= lowest_loss:
              lowest_loss = valid_loss
              best_model = deepcopy(self.model.state_dict())

          print("Epoch(%d/%d): train_loss=%.4e  valid_loss=%.4e  lowest_loss=%.4e" % (
              epoch_index + 1,
              config['n_epochs'],
              train_loss,
              valid_loss,
              lowest_loss,
          ))

      # Restore to best model.
      self.model.load_state_dict(best_model)
      return self.model


In [None]:
def main(config):
    print(TEXT.vocab.freqs.most_common(20))
    print(TEXT.vocab.itos[:10])
    print(LABEL.vocab.stoi)
    
    if config["rnn"] is False and config["cnn"] is False:
        raise Exception("you should select the rnn model or cnn model or both")

    if config["rnn"]:
        model = RNNModel(
            input_size=len(TEXT.vocab),
            word_vector_size=256,
            hidden_size=128,
            n_classes=len(LABEL.vocab),
            n_layers=2,
            dropout_p=0.3

        )
        optimizer = optim.Adam(model.parameters())
        crit = nn.NLLLoss()
        print(model)
        
        if config["gpu_id"] >= 0:
            model.cuda(config["gpu_id"])
            crit.cuda(config["gpu_id"])
        rnn_trainer = Trainer(model, optimizer, crit)

        rnn_model = rnn_trainer.train(
            train_loader,
            valid_loader,
            config
        )

    torch.save(
        {
            'rnn': rnn_model.state_dict() if config["rnn"] else None,
            'config': config,
            'vocab': TEXT.vocab,
            'classes': LABEL.vocab
        }, config["model_fn"]
    )

In [None]:
main(config)

[('.', 169097), ('이', 39289), ('영화', 35555), ('!', 22218), ('의', 21676), ('가', 19298), ('에', 18786), ('을', 16184), (',', 15841), ('도', 14951), ('들', 13303), ('?', 12819), ('는', 12332), ('를', 11339), ('은', 11165), ('너무', 7775), ('한', 7643), ('다', 7209), ('정말', 6885), ('적', 6135)]
['<unk>', '<pad>', '.', '이', '영화', '!', '의', '가', '에', '을']
defaultdict(None, {0: 0, 1: 1})
RNNModel(
  (emb): Embedding(20002, 256)
  (rnn): LSTM(256, 128, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (linear_layer): Linear(in_features=256, out_features=2, bias=True)
  (softmax_layer): LogSoftmax(dim=-1)
)
Epoch(1/5): train_loss=5.4586e-01  valid_loss=4.0652e-01  lowest_loss=4.0652e-01
Epoch(2/5): train_loss=3.4921e-01  valid_loss=3.5634e-01  lowest_loss=3.5634e-01
Epoch(3/5): train_loss=2.7910e-01  valid_loss=3.5643e-01  lowest_loss=3.5634e-01
Epoch(4/5): train_loss=2.2752e-01  valid_loss=3.6463e-01  lowest_loss=3.5634e-01
Epoch(5/5): train_loss=1.8194e-01  valid_loss=4.2292e-01  lowest_

# test

In [None]:
model_fn = "./model.pth"
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def load(fn, device):
  d = torch.load(fn, map_location=device)
  
  return d['rnn'], d['config'], d['vocab'], d['classes']

In [None]:
def test(model, test_loader):
  model.eval()
  
  correct_cnt, total_cnt = 0, 0
  with torch.no_grad():
    for x, y in test_loader:
      x, y = x.to(device), y.to(device)
      y_hat = model(x)
      correct_cnt += (y.squeeze() == torch.argmax(y_hat, dim=-1)).sum()
      total_cnt += len(x)

    accuracy = correct_cnt / total_cnt
    print("Accuracy: %.4f" % accuracy)
    

In [None]:
model_dict, train_config, vocab, classes = load(model_fn, device)

model = RNNModel(
                input_size=len(vocab),
                word_vector_size=256,
                hidden_size=128,
                n_classes=len(classes),
                n_layers=2,
                dropout_p=0.3
        )
model.load_state_dict(model_dict)
print(model)

test(model.to(device), test_loader)

RNNModel(
  (emb): Embedding(20002, 256)
  (rnn): LSTM(256, 128, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (linear_layer): Linear(in_features=256, out_features=2, bias=True)
  (softmax_layer): LogSoftmax(dim=-1)
)
Accuracy: 0.8389


In [None]:
def predict(model,sentence):
    model.eval()
    with torch.no_grad():
        sent = tokenizer.morphs(sentence)
        sent = torch.tensor([TEXT.vocab.stoi[i] for i in sent])
        sent = F.pad(sent,pad = (1,50-len(sent)-1),value = 1)
        sent = sent.unsqueeze(dim = 0) #for batch
        output = torch.argmax(model(sent), dim=-1)
        print(torch.exp(model(sent)))
        
        return output.item()
        
examples = [
  "딥러닝 수업 오늘 하기 좋네요!",
  "독감 정말 싫어요",
  "영화가 정말 재밌네요",
  "영화가 재밌습니다",
  "최악의 영화"
]

model = model.to('cpu')
for idx in range(len(examples)) :

    sentence = examples[idx]
    pred = predict(model,sentence)
    print("\n",sentence)
    if pred == 1 :
        print(f">>>긍정 리뷰입니다. ({pred : .2f})")
    else:
        print(f">>>부정 리뷰입니다.({pred : .2f})")

tensor([[0.0223, 0.9777]])

 딥러닝 수업 오늘 하기 좋네요!
>>>긍정 리뷰입니다. ( 1.00)
tensor([[0.7653, 0.2347]])

 독감 정말 싫어요
>>>부정 리뷰입니다.( 0.00)
tensor([[0.0498, 0.9502]])

 영화가 정말 재밌네요
>>>긍정 리뷰입니다. ( 1.00)
tensor([[0.1119, 0.8881]])

 영화가 재밌습니다
>>>긍정 리뷰입니다. ( 1.00)
tensor([[0.9819, 0.0181]])

 최악의 영화
>>>부정 리뷰입니다.( 0.00)
