<a href="https://colab.research.google.com/github/ssooni/sentiment_analysis/blob/main/sentiment_analysis(Korean).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Flush and unmount any existing Drive mount before mounting again.
drive.flush_and_unmount()
# Mount Google Drive at /dataset; force_remount=True is passed so an existing mount is replaced.
drive.mount('/dataset', force_remount=True)

Drive not mounted, so nothing to flush and unmount.
Mounted at /dataset


In [None]:
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.3MB/s 
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 14.1MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting tweepy>=3.7.0
  Downloading https://files.pythonhosted.org/packages/bb/7c/99d51f80f3b77b107ebae2634108717362c059a41384a1810d13e2429a81/tweepy-3.9.0-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/b7/21/9e2c0dbf9df856e6392a1aec1d18006c60b175aa4e

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext.vocab import Vectors
from torchtext.data import TabularDataset, Field
from torchtext import data, datasets

import random
import os
import numpy as np

from gensim.models import Word2Vec, KeyedVectors
from konlpy.tag import Komoran

# Hyperparameters and runtime configuration shared by the cells below.
BATCH_SIZE = 64
lr = 0.0005
EPOCHS = 10
SEED = 42

# Seed every RNG this notebook imports so that repeated runs are comparable
# (data split/shuffling and weight initialisation draw from these generators).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    torch.cuda.manual_seed_all(SEED)
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

# Morphological analyser; its `morphs` method is used as the tokenizer for the text field.
komoran = Komoran()

In [None]:
# One Field per TSV column: a raw identifier, the tokenized review text, and the class label.
id = Field(sequential=False, unk_token=None)
text = Field(
    sequential=True,
    use_vocab=True,
    tokenize=komoran.morphs,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
    batch_first=True,
)
label = Field(sequential=False, use_vocab=True, batch_first=True, unk_token=None)

# Read the train/test TSV files from the mounted Drive folder, skipping the header row.
train_data, test_data = TabularDataset.splits(
    path="/dataset/My Drive/movie",
    train='ratings_train.txt',
    test='ratings_test.txt',
    format='tsv',
    fields=[('id', id), ('document', text), ('label', label)],
    skip_header=True,
)

### 단어 사전 생성 
1. 2회 이상 나온 단어만 단어 사전에 수록
2. 형태소 분석기는 Komoran을 사용하였음

In [None]:
import pickle

# Build the vocabularies from the training data.
id.build_vocab(train_data)
text.build_vocab(train_data, min_freq=2)  # keep only tokens that occur at least twice
label.build_vocab(train_data)

vocab_size = len(text.vocab)
n_classes = len(label.vocab)

print('단어 집합의 크기 : {}'.format(vocab_size))
print('클래스 : {}'.format(n_classes))
# Preview only the most frequent tokens; printing the whole frequency counter
# floods the cell output with every token seen in training.
print('{}'.format(text.vocab.freqs.most_common(20)))

# Persist the text vocabulary; the `with` block closes the file even if saving fails.
with open('./vocab_list.pkl', 'wb') as output:
    torch.save(text.vocab, output)


단어 집합의 크기 : 24296
클래스 : 2


## GRU 


In [None]:
class GRU(nn.Module):
    """GRU-based sequence classifier: embedding -> GRU -> dropout -> linear.

    Args:
        n_layers: number of stacked GRU layers.
        hidden_dim: size of the GRU hidden state.
        n_vocab: number of rows in the embedding table.
        embed_dim: embedding vector size.
        n_classes: number of output logits.
        dropout_p: dropout probability applied to the final GRU output.
    """

    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim, num_layers=self.n_layers, batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        """Return class logits of shape (batch, n_classes) for index tensor `x` of shape (batch, seq)."""
        x = self.embed(x)
        h_0 = self._init_state(batch_size=x.size(0))
        x, _ = self.gru(x, h_0)
        # Use the GRU output at the last time step as the sequence summary.
        # NOTE(review): for padded batches this position may be a padding step - confirm this is intended.
        h_t = x[:, -1, :]
        # nn.Dropout is not in-place: its return value must be used, otherwise
        # dropout is silently never applied.
        h_t = self.dropout(h_t)
        logit = self.out(h_t)
        return logit

    def _init_state(self, batch_size=1):
        """Return a zero initial hidden state matching the model's dtype and device."""
        weight = next(self.parameters())
        return weight.new_zeros(self.n_layers, batch_size, self.hidden_dim)

def train(model, optimizer, train_iter):
    """Run one training epoch over `train_iter`.

    Each batch is expected to expose `document` (inputs) and `label` (targets);
    both are moved to DEVICE before the forward pass. The model is updated in
    place through `optimizer`; nothing is returned.
    """
    model.train()
    for batch in train_iter:
        x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
        optimizer.zero_grad()
        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()
    # The former trailing debug print of the last batch was removed: it dumped a
    # tensor every epoch and raised when the iterator yielded no batches.

def evaluate(model, val_iter):
    """Compute average loss and accuracy (in percent) of `model` over `val_iter`.

    Each batch is expected to expose `document` and `label`. Averages are taken
    over `len(val_iter.dataset)`.

    Returns:
        (avg_loss, avg_accuracy) as Python floats; (nan, nan) if the dataset is empty.
    """
    model.eval()
    corrects, total_loss = 0, 0.0

    # Evaluation needs no gradients; disabling autograd avoids building the graph.
    with torch.no_grad():
        for batch in val_iter:
            x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
            logit = model(x)
            # Sum (not mean) per batch so the final division by dataset size is exact.
            total_loss += F.cross_entropy(logit, y, reduction='sum').item()
            predicted = logit.argmax(dim=1).view(y.size())
            corrects += (predicted == y).sum().item()

    size = len(val_iter.dataset)
    if size == 0:
        # Nothing to average over; report NaN instead of dividing by zero.
        return float('nan'), float('nan')
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy




## TRAIN
1. Hidden Layer dimension 조정
2. GRU Layer 수를 조정
3. 검증 Loss(validation loss)가 제일 작은 모델을 저장

In [None]:
best_val_loss = None
n_layers = 1  # number of GRU layers used for every model in this sweep

# Split and build the iterators once, outside the sweep, so that every
# hidden_dim candidate is trained and scored on the same train/validation split.
train_set, val_set = train_data.split(split_ratio=0.8)
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_set, val_set, test_data),
    batch_size=BATCH_SIZE,
    shuffle=True,
    repeat=False,
    sort=False,
)

print('훈련 데이터의 미니 배치의 개수 : {}'.format(len(train_iter)))
print('테스트 데이터의 미니 배치의 개수 : {}'.format(len(test_iter)))
print('검증 데이터의 미니 배치의 개수 : {}'.format(len(val_iter)))

for hidden_dim in range(400, 450, 50):
    # n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p
    model = GRU(n_layers, hidden_dim, vocab_size, 200, n_classes, 0.4).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    loss_list = list()
    for e in range(1, EPOCHS + 1):
        train(model, optimizer, train_iter)
        val_loss, val_accuracy = evaluate(model, val_iter)

        print("[%d, Epoch: %d, %d] val loss : %5.2f | val accuracy : %5.2f" % (n_layers, e, hidden_dim, val_loss, val_accuracy))
        loss_list.append([e, n_layers, val_loss])
        # Save the model with the lowest validation loss seen so far.
        # Compare against None explicitly: a loss of exactly 0.0 is a valid best value.
        if best_val_loss is None or val_loss < best_val_loss:
            os.makedirs("snapshot", exist_ok=True)
            print("Current Best : ", hidden_dim)
            # Only the state_dict is stored, so it must be loaded into a model
            # built with the same hidden_dim.
            torch.save(model.state_dict(), './snapshot/txtclassification.pt')
            best_val_loss = val_loss

훈련 데이터의 미니 배치의 개수 : 1875
테스트 데이터의 미니 배치의 개수 : 782
검증 데이터의 미니 배치의 개수 : 469
tensor([[   2,  308,    3,  ...,    1,    1,    1],
        [   2, 1211,  106,  ...,    1,    1,    1],
        [   2, 7245, 4376,  ...,    1,    1,    1],
        ...,
        [   2,  108,   28,  ...,    1,    1,    1],
        [   2,  303,   23,  ...,    1,    1,    1],
        [   2,  143,  433,  ...,    1,    1,    1]])
[1, Epoch: 1, 400] val loss :  0.37 | val accuracy : 83.04
Current Best :  400
tensor([[    2, 12546,   670,  ...,     1,     1,     1],
        [    2,  9179,    24,  ...,     1,     1,     1],
        [    2,   619,   688,  ...,     1,     1,     1],
        ...,
        [    2,    94,    26,  ...,     1,     1,     1],
        [    2,   161,    13,  ...,     1,     1,     1],
        [    2,   202,   447,  ...,     1,     1,     1]])
[1, Epoch: 2, 400] val loss :  0.35 | val accuracy : 84.57
Current Best :  400
tensor([[   2,    0,  196,  ...,    1,    1,    1],
        [   2,   81,   27,  

In [None]:
# Restore the best checkpoint; map_location makes a checkpoint saved on GPU
# loadable on a CPU-only runtime as well.
model.load_state_dict(torch.load('./snapshot/txtclassification.pt', map_location=DEVICE))
test_loss, test_acc = evaluate(model, test_iter)
print('테스트 오차: %5.2f | 테스트 정확도: %5.2f' % (test_loss, test_acc))

tensor([0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1], device='cuda:0')
테스트 오차:  0.34 | 테스트 정확도: 85.68


In [None]:
import pandas as pd
# Read the MS949-encoded CSV and write a UTF-8 copy next to it (without the index column).
dataset_ = pd.read_csv(
    "/dataset/My Drive/movie/ko_data.csv",
    encoding="ms949",
    engine="python",
)
dataset_.to_csv(
    "/dataset/My Drive/movie/ko_data_utf8.csv",
    index=None,
    encoding="utf-8",
)
print(dataset_)

          Id                                           Sentence
0          0                                   정말 많이 울었던 영화입니다.
1          1                                           시간 낭비예요.
2          2             포스터를 저렇게밖에 만들지 못했던 제작자의 소심함에 침을 뱉고 싶다.
3          3               지금 봐도 재미있는 영화!!! 코믹과 감동!!! 그리고 요리!!!
4          4                          이걸 영화로 만드는 거야?얼마나 가는지 보자.
...      ...                                                ...
11182  11182  이 영화를 커플에게 추천합니다. 영화관에 가다보면 평생 잊지 못할 추억이 하나 생길...
11183  11183                                     심심__ 그냥 한효주 cf
11184  11184  공감해서 눈물나는 영화. 안 보신분들이 전부 제가 울었다고 하면 의아해하실텐데 보면...
11185  11185                                      오토바이 신은 최고네요.
11186  11186                                   개병헌 쓰면 엉망이 된다ㅋㅋㅋ

[11187 rows x 2 columns]


In [None]:
def predict(model, val_iter):
    """Predict a class index for every example yielded by `val_iter`.

    `val_iter` is iterated example by example; each example must expose
    `Sentence` (a sequence of tokens) and `Id`. Tokens are mapped to indices
    with the `text` field's vocabulary and fed to the model one example at a time.

    Returns:
        A list of `[Id, predicted_class_index]` pairs, in iteration order.
    """
    model.eval()
    print("predict")

    # The `text` field is configured with init/eos tokens, so wrap each sentence
    # in them here as well to keep inference inputs consistent with that setup.
    prefix = [text.init_token] if text.init_token is not None else []
    suffix = [text.eos_token] if text.eos_token is not None else []
    stoi = text.vocab.stoi

    predict_list = list()
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for example in val_iter:
            tokens = prefix + list(example.Sentence) + suffix
            if not tokens:
                # Guard against an empty sequence, which the model cannot index into.
                tokens = [text.pad_token]
            indexed = [stoi[t] for t in tokens]                           # tokens -> vocabulary indices
            tensor = torch.LongTensor(indexed).unsqueeze(0).to(DEVICE)    # shape (1, n_tokens)
            prediction = model(tensor)
            predict_list.append([example.Id, prediction.argmax(dim=1).item()])
    return predict_list


In [None]:
# Load the UTF-8 submission CSV, reusing the existing `id` and `text` fields for its two columns.
submit_data = TabularDataset(
    path="/dataset/My Drive/movie/ko_data_utf8.csv",
    format='csv',
    fields=[('Id', id), ('Sentence', text)],
    skip_header=True,
)

predict


In [None]:
a = predict(model, submit_data)
# Preview the first few predictions only; printing the full list floods the cell output.
print(a[:10])
# Write the [id, prediction] pairs as the submission file, without the DataFrame index.
pd.DataFrame(a, columns=["id", "Predicted"]).to_csv("submission.csv", index=False)

predict
[['0', 1], ['1', 0], ['2', 0], ['3', 1], ['4', 0], ['5', 1], ['6', 0], ['7', 1], ['8', 0], ['9', 1], ['10', 1], ['11', 0], ['12', 0], ['13', 0], ['14', 0], ['15', 1], ['16', 1], ['17', 1], ['18', 1], ['19', 1], ['20', 1], ['21', 1], ['22', 1], ['23', 1], ['24', 0], ['25', 0], ['26', 0], ['27', 0], ['28', 1], ['29', 0], ['30', 1], ['31', 1], ['32', 1], ['33', 0], ['34', 1], ['35', 0], ['36', 1], ['37', 0], ['38', 0], ['39', 0], ['40', 0], ['41', 0], ['42', 0], ['43', 1], ['44', 1], ['45', 0], ['46', 0], ['47', 1], ['48', 0], ['49', 1], ['50', 0], ['51', 1], ['52', 0], ['53', 0], ['54', 1], ['55', 1], ['56', 1], ['57', 1], ['58', 0], ['59', 0], ['60', 1], ['61', 1], ['62', 1], ['63', 0], ['64', 0], ['65', 0], ['66', 0], ['67', 1], ['68', 0], ['69', 1], ['70', 1], ['71', 1], ['72', 0], ['73', 1], ['74', 1], ['75', 1], ['76', 1], ['77', 1], ['78', 1], ['79', 0], ['80', 1], ['81', 1], ['82', 0], ['83', 1], ['84', 1], ['85', 0], ['86', 1], ['87', 0], ['88', 1], ['89', 0], ['90', 1], 

In [None]:
from google.colab import files
# Hand the generated submission file to Colab's download helper.
files.download('submission.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>