In [1]:
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (85/85), done.[K
remote: Total 91 (delta 43), reused 22 (delta 6), pack-reused 0[K
Unpacking objects: 100% (91/91), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 76.3MB/s 
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 43.8MB/s 
[?25hCollecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd5

In [11]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.legacy import data
import urllib.request
import pandas as pd
from konlpy.tag import Mecab
from torchtext.legacy.data import TabularDataset
from torchtext.legacy.data import Iterator

In [12]:
batch_size = 64
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [13]:
# Load Data
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

tokenizer = Mecab()

# Define Field 
ID = data.Field(sequential = False,
                use_vocab = False) 

TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=tokenizer.morphs, 
                  lower=True,
                  batch_first=True,
                  fix_length=20)
# ex) pytorch, Pytorch
LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   is_target=True)

# Make Dataset + Tokenizing
train_data, test_data = TabularDataset.splits(
        path='.', train='ratings_train.txt', test='ratings_test.txt', format='tsv',
        fields=[('id', ID), ('text', TEXT), ('label', LABEL)], skip_header=True)

# Make Vocab.
TEXT.build_vocab(train_data, min_freq=10)
LABEL.build_vocab(train_data)

# Make validation data (80% training data, 20% validation data)
train_data, val_data = train_data.split(split_ratio=0.8)

# Make Iterator (for Batch)
train_iter = Iterator(dataset=train_data, batch_size = batch_size, shuffle=True)
test_iter = Iterator(dataset=test_data, batch_size = batch_size, shuffle=True)
val_iter = Iterator(dataset=val_data, batch_size = batch_size, shuffle=True)

# Define number of words and number of lables in the 'word vocabuary'
vocab_size = len(TEXT.vocab)
n_classes = 2

print("[train]: %d [val]: %d [test]: %d [vocab]: %d [class] %d"
      % (len(train_data),len(val_data), len(test_data), vocab_size, n_classes))

[train]: 120000 [val]: 30000 [test]: 50000 [vocab]: 10070 [class] 2


In [14]:
class RNN(nn.Module):
  def __init__(self, embed_size, n_vocab, hidden_size, num_layers, num_classes, dropout_p=0.2):
    super(RNN, self).__init__()
   
    self.embed = nn.Embedding(n_vocab, embed_size)
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.dropout = nn.Dropout(dropout_p)
    self.lstm = nn.LSTM(embed_size, hidden_size, num_layers,
                        batch_first=True)
    self.fc = nn.Linear(hidden_size, num_classes)

  def forward(self, x):

    x = self.embed(x)
    
    # Initialize the hidden_state and cell_state
    h0 = torch.zeros(self.num_layers, x.size(0),
                     self.hidden_size).to(device)
    c0 = torch.zeros(self.num_layers, x.size(0),
                     self.hidden_size).to(device)

    out, _ = self.lstm(x, (h0, c0)) # [batch_size, sequence length, hidden_dim]
    ht = out[:,-1,:] # [batch_size, hidden_dim]
    self.dropout(ht) 

    logit = self.fc(ht) # [batch_size, hidden_dim] -> [batch_size, n_classes]
    
    return logit
                  

In [15]:
embed_size = 128
n_vocab = vocab_size
hidden_size = 256
num_layers = 2
num_classes = n_classes
dropout = 0.2
learning_rate = 0.001
EPOCHS = 20

model = RNN(embed_size, n_vocab, hidden_size, num_layers, num_classes, dropout).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [16]:
def train(model, optimizer, train_iter):
    model.train()
    for b, batch in enumerate(train_iter):
        x, y = batch.text.to(device), batch.label.to(device)
        optimizer.zero_grad()
        out = model(x)
        loss = F.cross_entropy(out, y)
        loss.backward()
        optimizer.step()

In [17]:
def evaluate(model, val_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    for batch in val_iter:
        x, y = batch.text.to(device), batch.label.to(device)
        out = model(x)
        loss = F.cross_entropy(out, y, reduction='sum')
        total_loss += loss.item()
        corrects += (out.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [18]:
best_val_loss = None
for e in range(1, EPOCHS+1):
    train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)

    print("[epoch: %d] val loss:%5.2f | val accu:%5.2f" % (e, val_loss, val_accuracy))
    
    # Save the model having the smallest validation loss
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("snapshot"):
            os.makedirs("snapshot")
        torch.save(model.state_dict(), './snapshot/txtclassification.pt')
        best_val_loss = val_loss


[epoch: 1] val loss: 0.38 | val accu:83.47
[epoch: 2] val loss: 0.34 | val accu:84.88
[epoch: 3] val loss: 0.36 | val accu:85.40
[epoch: 4] val loss: 0.37 | val accu:85.24
[epoch: 5] val loss: 0.43 | val accu:84.77
[epoch: 6] val loss: 0.55 | val accu:84.90
[epoch: 7] val loss: 0.59 | val accu:84.53
[epoch: 8] val loss: 0.63 | val accu:84.32
[epoch: 9] val loss: 0.73 | val accu:84.39
[epoch: 10] val loss: 0.72 | val accu:84.48
[epoch: 11] val loss: 0.82 | val accu:84.65
[epoch: 12] val loss: 0.85 | val accu:84.22
[epoch: 13] val loss: 0.85 | val accu:84.33
[epoch: 14] val loss: 0.86 | val accu:84.20
[epoch: 15] val loss: 0.85 | val accu:84.09
[epoch: 16] val loss: 0.89 | val accu:84.08
[epoch: 17] val loss: 0.88 | val accu:83.99
[epoch: 18] val loss: 0.95 | val accu:84.23
[epoch: 19] val loss: 0.92 | val accu:84.23
[epoch: 20] val loss: 0.91 | val accu:84.24


In [19]:
model.load_state_dict(torch.load('./snapshot/txtclassification.pt'))
test_loss, test_acc = evaluate(model, test_iter)
print('test loss: %5.2f | test accu: %5.2f' % (test_loss, test_acc))

test loss:  0.35 | test accu: 84.61
