#1. 데이터 전처리
- 클래스 파악
- 원하는 컬럼만 남기기
- 최대 텍스트 길이만큼 자르기
- 중복 제거
- (길이 순서로 정렬)
- 셔플
- Train, Test 나누기
- 저장

## 1) 데이터 불러오기

In [1]:
max_length = 256  # sms 최대 길이

In [2]:
import pandas as pd

In [10]:
df = pd.read_csv('/content/drive/MyDrive/Module11/sms.tsv', sep='\t',)
print(df.columns)
print(df.shape)

Index(['label', 'sms'], dtype='object')
(5575, 2)


In [11]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,spam,"%^^×？×^×&#****,>,;//×&>>*(*^%=÷#~^&,****)"
4,ham,U dun say so early hor... U c already then say...


In [13]:
# 클래스 파악
classes = sorted(set(df['label']))
class_to_idx = {}

for i, c in enumerate(classes):  # 모든 클래스에 대해
    class_to_idx.update({c: i})

nclass = len(classes)

print("# of classes: %d" %nclass)
print(classes)
print(class_to_idx)

## 2) 새로운 DataFrame
### (1) label, sms만 남기기
### (2) 최대 텍스트 길이만큼 자르기 # pandas.Series.str.slice
- 성별, 가사만 남기려면?

In [16]:
new_df = pd.DataFrame({'label': df['label'],
                       'sms': df['sms'].str.slice(  # 최대 가사 텍스트만큼 자르기
                            start=0, stop=max_length
                       )})

### (3) 중복 제거

In [17]:
new_df = pd.DataFrame(new_df.drop_duplicates())

### (4) 셔플

In [18]:
df_shuffled = new_df.sample(frac=1).reset_index(drop= True)
df_shuffled.head()

Unnamed: 0,label,sms
0,ham,My stomach has been thru so much trauma I swea...
1,ham,Ok can...
2,ham,it's still not working. And this time i also t...
3,spam,Ur cash-balance is currently 500 pounds - to m...
4,ham,You have registered Sinco as Payee. Log in at ...


### (5) train, test 나누기


In [19]:
# train: test = 9:1
# train: test = 540: 60 -> train:valid:test = 432: 108: 60
train_ratio = 0.9

# train_dataset
s, e = 0, int(df_shuffled.shape[0]*train_ratio)  # # of rows
df_train = pd.DataFrame({'label': df_shuffled['label'][s:e],
                         'sms': df_shuffled['sms'][s:e]})
print('index for train: %d~%d'%(s,e))

# test_dataset
s, e = e, e+int(df_shuffled.shape[0] * (1.0 - train_ratio))
print('index for test: %d~%d'%(s,e))
df_test = pd.DataFrame({'label': df_shuffled['label'][s:e],
                        'sms': df_shuffled['sms'][s:e]})

index for train: 0~4654
index for test: 4654~5171


In [20]:
# column 수 확인
print(df_train.shape)
print(df_test.shape)

(4654, 2)
(517, 2)


## 4) 저장


In [21]:
# new_df.columns: ['성별', '가사']
df_train.to_csv('./sms.maxlen.uniq.shuf.train.tsv',
                header = False, index = False, sep = '\t')

df_test.to_csv('./sms.maxlen.uniq.shuf.test.tsv',
               header = False, index = False, sep='\t')

# 2. 데이터 로드
- data_loader.py

# 3. RNN + SMS 구현

## 0-1) 라이브러리 임포트

In [None]:
!pip install torchtext==0.4.0

In [4]:
import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np

## 0-2) 하이퍼파라미터 셋팅

In [5]:
batch_size = 128
num_epochs = 10

word_vec_size = 256
dropout_p = 0.3

hidden_size = 512
num_layers = 4

# yhk 추가
learning_rate = 0.001

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 1) SMS train, test dataset 가져오기

In [6]:
%cd drive/MyDrive/Module11
from data_loader import DataLoader

/content/drive/MyDrive/Module11


In [7]:
loaders = DataLoader(
    train_fn = './sms.maxlen.uniq.shuf.train.tsv',
    batch_size = batch_size, 
    valid_ratio = 0.2, 
    device = -1, 
    max_vocab = 999999,
    min_freq = 5,
)

In [8]:
test_loaders = DataLoader(
    train_fn = './sms.maxlen.uniq.shuf.test.tsv',
    batch_size = batch_size, 
    valid_ratio = 0.01,   # val 없이 모두 train
    device = -1, 
    max_vocab = 999999,
    min_freq = 5,
)

## 2) 대략적인 데이터 형태

In [9]:
print('[train]=', len(loaders.train_loader.dataset),
      '[valid]=', len(loaders.valid_loader.dataset))

vocab_size = len(loaders.text.vocab)
num_classes = len(loaders.label.vocab)
print('[vocab]=', vocab_size, '|classes|=', num_classes)

[train]= 3723 [valid]= 931
[vocab]= 1532 |classes|= 2


## 3) 데이터 로드함수 
학습시킬 때 batch_size 단위로 끊어서 로드하기 위함

In [13]:
n = 3  # 샘플로 출력할 데이터 개수
for i, data in enumerate(loaders.train_loader):
    labels = data.label
    texts = data.text
    
    if i>n:
        break
    print('[%d]'%i)
    print('한 번에 로드되는 데이터 크기: ', len(labels))

    # 출력
    for j in range(n):
        label = labels[j].numpy()  # tensor -> numpy로 변환
        text = texts[j].numpy()
        print('label: ', label)
        print('text: ', text.shape)

[0]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (13,)
label:  0
text:  (13,)
label:  1
text:  (13,)
[1]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (6,)
label:  0
text:  (6,)
label:  0
text:  (6,)
[2]
한 번에 로드되는 데이터 크기:  128
label:  1
text:  (18,)
label:  0
text:  (18,)
label:  0
text:  (18,)
[3]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (5,)
label:  0
text:  (5,)
label:  0
text:  (5,)


## 4) 모델 선언

In [11]:
class RNN(nn.Module):
    def __init__(self, input_size, word_vec_size, hidden_size, n_classes, num_layers=4, dropout_p=0.3):
        # input_size: vocab_size
        # word_vec_size: word embedding vector 차원
        # hidden_size: bidirectional LSTM의 hidden state & cell state의 size
        # num_layers: 쌓을 레이어 개수

        super(RNN, self).__init__()

        self.input_size = input_size
        self.word_vec_size = word_vec_size
        self.hidden_size = hidden_size
        self.n_classes = num_classes
        self.num_layers = num_layers
        self.dropout_p = dropout_p

        # 입력 차원(vocab_size), 출력 차원(word_vec_size)
        self.emb = nn.Embedding(input_size, word_vec_size) # 부터
        self.lstm = nn.LSTM(input_size = word_vec_size,
                            hidden_size = hidden_size,
                            num_layers= num_layers,
                            dropout= dropout_p,
                            batch_first= True,
                            bidirectional= True)
        self.fc = nn.Linear(hidden_size*2, n_classes)  
        
        # LogSoftmax + NLLLoss instead of Softmax + CrossEntropy
        self.activation = nn.LogSoftmax(dim= -1) # 마지막차원에 softmax 씌워줌

    def forward(self, x):
        # x: (batch_size, length)
        x = self.emb(x)

        # x: (batch_size, length, word_vec_size)
        x, _ = self.lstm(x)  # x: output, _ : 마지막 time step의 hidden state & cell state

        # x: (batch_size, length, word_vec_size)
        # x[:, -1]: (batch_size, 1, hidden_size*2)
        out = self.activation(self.fc(x[:, -1]))  # 마지막 time step
        # self.fc(x[:, -1]): (batch_size, num_classes)

        return out

In [14]:
# RNN class 객체 선언
model = RNN(input_size= vocab_size, 
            word_vec_size= word_vec_size,
            hidden_size= hidden_size,
            n_classes= num_classes,
            num_layers= num_layers,
            dropout_p= dropout_p)

In [16]:
def ComputeAccr(dloader, imodel):
    correct = 0
    total= 0

    model.eval()  # test mode
    for i, data in enumerate(dloader):  # batch_size 만큼
        texts = data.text.to(device)    # (batch_size, length)
        labels = data.label.to(device)  # (batch_size, num_classes)

        # Forward prop
        output = model(texts)  # (batch_size, num_classes)
        _, output_index = torch.max(output, 1)  # (batch_size, 1)

        total += labels.size(0)
        correct += (output_index == labels).sum().float()
    # print('Accuracy of Test Data:{}'.format(100*correct/total))

    model.train()
    return (100*correct/total).numpy()  # tensor -> numpy

print('Accuracy of Test Data: %.2f'%ComputeAccr(loaders.valid_loader, model))

Accuracy of Test Data: 80.56


## 5) loss, optimizer

In [17]:
loss_func = nn.NLLLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr= learning_rate)

## 6) 학습

In [None]:
# train the model
total_step = len(loaders.train_loader)
for epoch in range(num_epochs):
    for i, data in enumerate(loaders.train_loader):
        texts = data.text.to(device)  # (batch_size, length)
        labels = data.label.to(device)  # (batch_size, num_classes)

        print("[%d]"%i)

        # Forward prop
        outputs = model(texts) 
        loss = loss_func(outputs, labels)

        # Backward prop & optimizer
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1)%10 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss:{:.4f}, Accr:{:.2f}'
            .format(epoch+1, num_epochs, i+1, total_step, loss.item(), ComputeAccr(loaders.valid_loader, model)))


[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [1/10], Step [10/30], Loss:0.0538, Accr:87.43
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [1/10], Step [20/30], Loss:0.4190, Accr:87.43
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [1/10], Step [30/30], Loss:1.4332, Accr:87.43
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [2/10], Step [10/30], Loss:0.1872, Accr:87.43
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [2/10], Step [20/30], Loss:0.1481, Accr:87.43
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [2/10], Step [30/30], Loss:1.0850, Accr:87.43
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [3/10], Step [10/30], Loss:0.1916, Accr:87.43
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [3/10], Step [20/30], Loss:0.8252, Accr:87.43
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [3/10], Step [30/30], Loss:0.1390, Accr:87.43
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [4/10], Step [10/30], Loss:0.9365, Accr:87.43
[10]
[11]
[12]
[13]


## 7) 테스트

In [None]:
print("Accuracy of Valid Data: %.2f" %ComputeAccr(test_loaders.valid_loader, model))

## 8) 학습된 파라미터 저장

In [None]:
netname = './nets/rnn_weight.pkl'
torch.save(model, netname, )

## 9) 학습된 파라미터 로드
실무에서 학습된(pretrained) 파라미터 로드하고 싶다면 5,6,8 과정 생략한 채 실행

In [None]:
netname = './nets/rnn_weight.pkl'
model = torch.load(netname)

print("Accuracy of Valid Data: %.2f" %ComputeAccr(loaders.valid_loader, model))