<a href="https://colab.research.google.com/github/submouse9903/uos-deepLearning/blob/main/U47768_CH05_RNN(%EA%B0%90%EC%84%B1%EB%B6%84%EC%84%9D_Modeling).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RNN을 이용한 텍스트 분류(Text Classification)

## 1) 데이터 전처리 

### - 데이터
: 이 실습에서는 https://github.com/lih0905/korean-pytorch-sentiment-analysis 에 있는 영화 평점 데이터를 사용한다.

In [None]:
!pip install konlpy
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
import torchtext
from konlpy.tag import Okt
import collections
import matplotlib.pyplot as plt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [None]:
# Create the Okt tagger imported from konlpy.tag; later cells call
# okt.morphs(text) on each review to split it into morphemes.
okt=Okt()

In [None]:
# Download the train and test CSV files into pandas DataFrames.
# The train frame is read below through its "text" and "label" columns.
train_data, test_data = (
    pd.read_csv(csv_url)
    for csv_url in (
        "http://ranking.uos.ac.kr/class/RB/rating_train.csv",
        "http://ranking.uos.ac.kr/class/RB/rating_test.csv",
    )
)

In [None]:
# Show the first rows of the training frame as a quick sanity check.
train_data.head()

Unnamed: 0,id,text,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [None]:
from tqdm import tqdm

# Count how often each morpheme occurs in the training texts.
# Missing keys start at 0 because the mapping is a defaultdict(int).
morph_counter = collections.defaultdict(int)
text_line = train_data["text"]
for text in tqdm(text_line):
    morphs = okt.morphs(text)
    for morph in morphs:
        morph_counter[morph] = morph_counter[morph] + 1

100%|██████████| 149995/149995 [06:11<00:00, 403.64it/s]


In [None]:
# Vocabulary: morpheme -> integer id. Ids 0 and 1 are reserved for the
# padding and unknown tokens; every counted morpheme then receives the next
# free id in the order it was first seen.
morph_to_id = {'[PAD]': 0, '[UNK]': 1}
for morph in tqdm(morph_counter):
    next_id = len(morph_to_id)
    morph_to_id[morph] = next_id
print(len(morph_to_id))

100%|██████████| 104677/104677 [00:00<00:00, 1097370.99it/s]

104679





In [None]:
# Reverse lookup table: integer id -> morpheme.
id_to_morph = {idx: morph for morph, idx in tqdm(morph_to_id.items())}
print(len(id_to_morph))

100%|██████████| 104679/104679 [00:00<00:00, 1892456.34it/s]

104679





In [None]:
# Pre-padding encoder: padding ids are placed in front of the real tokens.
def textEncoding(text_list, max_length, morph_to_id):
    """Encode a token sequence as a list of ids of length ``max_length``.

    Args:
        text_list: Sequence of tokens (morphemes) for one text.
        max_length: Length of the returned id list.
        morph_to_id: Mapping from token to integer id. Tokens that are not
            in the mapping are encoded as 1.

    Returns:
        A list of ints. Inputs shorter than ``max_length`` are left-padded
        with 0; longer inputs keep only their first ``max_length`` tokens.
    """
    # Clamp at zero so a negative max_length gives an empty result rather
    # than slicing tokens off the end of the input.
    kept = text_list[:max(max_length, 0)]
    ids = [morph_to_id.get(token, 1) for token in kept]
    pad_len = max_length - len(ids)
    return [0] * pad_len + ids

In [None]:
# Tokenize every training text and encode it as a list of max_length ids.
max_length = 10
X = [
    textEncoding(okt.morphs(text), max_length, morph_to_id)
    for text in tqdm(train_data["text"])
]

100%|██████████| 149995/149995 [05:56<00:00, 421.07it/s]


In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Wrap the encoded texts and their labels as int64 tensors and serve them
# in shuffled mini-batches of 5 samples.
X_t = torch.tensor(X, dtype=torch.long)
Y_t = torch.tensor(train_data["label"]).to(torch.long)
tr_data = TensorDataset(X_t, Y_t)
train_dataloader = DataLoader(dataset=tr_data, batch_size=5, shuffle=True)

In [None]:
# Display the label tensor built from train_data["label"].
Y_t

tensor([0, 1, 0,  ..., 0, 1, 0])

In [None]:
# Stand-alone embedding table used to inspect shapes: one 40-dimensional
# vector per vocabulary entry, with id 0 treated as padding.
embedding = nn.Embedding(
    num_embeddings=len(morph_to_id),
    embedding_dim=40,
    padding_idx=0,
)

In [None]:
# Fetch one mini-batch and show the shape produced by the embedding layer.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    x, y = first_batch
    z = embedding(x)
    print("shape of embedding vector:", z.shape)

shape of embedding vector: torch.Size([5, 10, 40])


In [None]:
# Display the labels of the mini-batch fetched in the previous cell.
y

tensor([0, 0, 1, 1, 0])

## 2) 모델링

### - 모형

In [None]:
# Choose the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Fix the random seed (also on every CUDA device when one is in use).
torch.manual_seed(1)
if device == 'cuda':
    torch.cuda.manual_seed_all(1)

In [None]:
# Define the RNN model
class myRNN(nn.Module):
    """Embedding -> RNN -> linear classifier producing 2-class logits.

    Args:
        size_vocab: Number of rows in the embedding table (vocabulary size;
            id 0 is the padding id).
        input_size: Feature size fed to the RNN at each time step. The
            embedding layer produces vectors of exactly this size.
        hidden_size: Size of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
    """

    def __init__(self, size_vocab, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # The embedding width has to match the RNN's input feature size;
        # otherwise the model only works when input_size == hidden_size.
        self.embedding = nn.Embedding(size_vocab, input_size, padding_idx=0)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        # Emit raw logits: nn.CrossEntropyLoss applies log-softmax itself, so
        # a Softmax layer here would normalise twice and flatten gradients.
        # Apply torch.softmax(out, dim=1) to the output if probabilities are
        # needed.
        self.fc = nn.Linear(hidden_size, 2)

    def forward(self, x):
        """Classify a batch of id sequences.

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len),
                located on the same device as the model parameters.

        Returns:
            Tensor of shape (batch, 2) holding unnormalised class scores.
        """
        embedded = self.embedding(x)
        # Build the initial hidden state on the same device/dtype as the
        # data instead of relying on a module-level ``device`` variable.
        h0 = torch.zeros(
            self.num_layers,
            x.size(0),
            self.hidden_size,
            device=embedded.device,
            dtype=embedded.dtype,
        )
        out, _ = self.rnn(embedded, h0)
        # Only the output of the last time step is used for classification.
        return self.fc(out[:, -1, :])

In [None]:
# Fetch one mini-batch and show the shape produced by the embedding layer.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    x, y = first_batch
    z = embedding(x)
    print("shape of embedding vector:", z.shape)

shape of embedding vector: torch.Size([5, 10, 40])


In [None]:
# Hyper-parameters handed to myRNN in the cells below.
hidden_size = 10  # size of the RNN hidden state
num_layer = 1     # number of stacked RNN layers
# input_size is forwarded to nn.RNN as the per-step feature size; here it is
# taken from max_length.
# NOTE(review): feature size and sequence length are independent quantities -
# confirm that coupling them is intended.
input_size = max_length
size_vocab = len(morph_to_id)  # number of rows in the embedding table

In [None]:
# Smoke test: run a single mini-batch through a freshly built model.
# The model and the batch are both moved to `device`, so the forward pass
# also works on a CUDA runtime (it failed there with a device mismatch
# when both were left on the CPU).
model = myRNN(size_vocab, input_size, hidden_size, num_layer).to(device)
for x, y in train_dataloader:
    # z is the model output for the batch (one row of 2 class scores per
    # sample), despite the wording of the printed label.
    z = model(x.to(device))
    print("shape of embedding vector:", z.shape)
    break

shape of embedding vector: torch.Size([5, 2])


In [None]:
# Training setup: a fresh model on the selected device, cross-entropy loss
# and an Adam optimizer over all model parameters.
learning_rate = 0.005
model = myRNN(size_vocab, input_size, hidden_size, num_layer)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
# Start from cleared gradients.
optimizer.zero_grad()


In [None]:
# Train for a fixed number of epochs and report the mean loss of each epoch.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # make sure the model is in training mode
    running_loss = 0.0
    num_batches = 0
    for inputs, labels in tqdm(train_dataloader):
        # Reuse the loop names instead of `input`, which would shadow the
        # Python builtin for the rest of the notebook session.
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # The loss of a single 5-sample batch is very noisy, so report the
    # average over the whole epoch rather than the last batch only.
    mean_loss = running_loss / max(num_batches, 1)
    print(f"epoch {epoch + 1}/{num_epochs} - mean loss: {mean_loss:.4f}")

 16%|█▌        | 4669/29999 [01:26<07:47, 54.19it/s]


KeyboardInterrupt: ignored