In [1]:
max_length=256

### 01 데이터 불러오기

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('sms2.tsv', sep='\t')
print(df.columns)
print(df.shape)

Index([u'label', u'sms'], dtype='object')
(5572, 2)


In [4]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### 02 데이터 전처리

In [5]:
# 클래스 파악
classes=sorted(set(df['label']))
class_to_idx={}

# 모든 클래스에 대해
for i, c in enumerate(classes):
    class_to_idx.update({c:i})
    
nclass=len(classes)

print("# of claases: %d" %nclass)
print(classes)
print(class_to_idx)

# of claases: 2
['ham', 'spam']
{'ham': 0, 'spam': 1}


In [6]:
# label, sms만 남기기
# 최대 텍스트 길이만큼 자르기
new_df=pd.DataFrame({'label': df['label'],
                    'sms': df['sms'].str.slice(start=0, stop=max_length)})

In [7]:
# 중복 제거
len(new_df)

5572

In [8]:
new_df=pd.DataFrame(new_df.drop_duplicates())

In [9]:
len(new_df)

5169

In [10]:
# shuffle
df_shuffled=new_df.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,label,sms
0,ham,I wont get concentration dear you know you are...
1,ham,"Only just got this message, not ignoring you. ..."
2,spam,WIN a year supply of CDs 4 a store of ur choic...
3,spam,URGENT! Your Mobile number has been awarded wi...
4,ham,It should take about &lt;#&gt; min


In [11]:
# train, test split
train_ratio=0.9

s, e= 0, int(df_shuffled.shape[0] * train_ratio)
df_train=pd.DataFrame({'label': df_shuffled['label'][s:e],
                      'sms':df_shuffled['sms'][s:e]})
print("index for train: %d~%d" %(s,e))

s, e = e, e+int(df_shuffled.shape[0] * (1.0-train_ratio))
print("index for test: %d~%d" %(s,e))
df_test=pd.DataFrame({'label': df_shuffled['label'][s:e],
                     'sms': df_shuffled['sms'][s:e]})

index for train: 0~4652
index for test: 4652~5168


In [12]:
print(df_train.shape)
print(df_test.shape)

(4652, 2)
(516, 2)


In [13]:
# ↑이렇게 해도되지않았을까
train_ratio=0.9
index=int(df_shuffled.shape[0] * train_ratio)

df_train_=df_shuffled[0:index]
df_test_=df_shuffled[index:-1]

print(df_train_.shape)
print(df_test_.shape)

(4652, 2)
(516, 2)


In [14]:
# 저장
df_train.to_csv('./sms.maxlen.uniq.shuf.train.tsv',
               header=False, index=False, sep='\t')
df_test.to_csv('./sms.maxlen.uniq.shuf.test.tsv',
               header=False, index=False, sep='\t')

### 03 데이터 로더

In [15]:
import torch
print(torch.__version__)

1.4.0


In [16]:
!pip install torchtext==0.4.0

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support[0m
Collecting torchtext==0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/31/80/1cde2a940fe42d5572487e47533f4b08302a0dd2c64bbd04116731cd7109/torchtext-0.4.0.tar.gz (45kB)
[K     |████████████████████████████████| 51kB 18.7MB/s eta 0:00:01
Building wheels for collected packages: torchtext
  Building wheel for torchtext (setup.py) ... [?25ldone
[?25h  Created wheel for torchtext: filename=torchtext-0.4.0-cp27-none-any.whl size=52131 sha256=428313ab50756fd001218f991f2a8b3c424103cb3f4431829586251105bcfc6d
  Stored in directory: /home/ec2-user/.cache/pip/wheels/7f/0b/a7/53f554f01d205ac7039ef96028eb886f52e235cdfae5ecf7ef
Su

In [17]:
import torchtext
import numpy as np

data_loader.py 오픈소스 로드하기<br>
조건확인
- 두 개 필드로 이루어져있어야
- tab으로 분리되어 있어야 <br>

스펙
- train, valid 파일로 나눌 것임
- label, text 필두 두개 컬럼으로 이루어짐 <br>

-> train용 loader, valid용 loader 만들것임

# RNN + SMS 구현

### 01 라이브러리 임포트

In [19]:
from data_loader import DataLoader

import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np

### 02 하이퍼파라미터 셋팅

In [20]:
# hyper-parameters
batch_size=128
num_epochs=10

word_vec_size=256
dropout_p=0.3

hidden_size=512
num_layers=4

learning_rate=0.001

In [21]:
# device configuration
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 03 sms train, test dataset 가져오기

In [26]:
loaders=DataLoader(
    train_fn='sms.maxlen.uniq.shuf.train.tsv',
        batch_size=batch_size,
        valid_ratio=.2, # 8:2
        device=-1,
        max_vocab=999999,
        min_freq=5,)

In [27]:
test_loaders=DataLoader(
    train_fn='sms.maxlen.uniq.shuf.test.tsv',
        batch_size=batch_size,
        valid_ratio=.01, # 모두 train(0이 안되서 0.01로)
        device=-1,
        max_vocab=999999,
        min_freq=5,)

### 04 대략적인 데이터 형태

In [28]:
print("|train| = ", len(loaders.train_loader.dataset),
     '|valid| = ', len(loaders.valid_loader.dataset))

('|train| = ', 3722, '|valid| = ', 930)


In [30]:
vocab_size=len(loaders.text.vocab)
num_classes=len(loaders.label.vocab)

print("|vocab| = ", vocab_size, '|classes| = ', num_classes)

('|vocab| = ', 1527, '|classes| = ', 2)


### 05 데이터 로드함수 이해하기

In [32]:
n=3
for i, data in enumerate(loaders.train_loader):
    labels=data.label
    texts=data.text
    
    if i>n:
        break
    print("[%d]" %i)
    print("size of data loaded at once: ", len(labels))
    
    # 출력
    for j in range(n):
        label=labels[j].numpy()
        text=texts[j].numpy()
        print("label: ", label)
        print("text: ", text.shape)

[0]
('size of data loaded at once: ', 128)
('label: ', array(1))
('text: ', (30,))
('label: ', array(1))
('text: ', (30,))
('label: ', array(0))
('text: ', (30,))
[1]
('size of data loaded at once: ', 128)
('label: ', array(0))
('text: ', (24,))
('label: ', array(0))
('text: ', (24,))
('label: ', array(0))
('text: ', (24,))
[2]
('size of data loaded at once: ', 128)
('label: ', array(0))
('text: ', (15,))
('label: ', array(0))
('text: ', (15,))
('label: ', array(0))
('text: ', (15,))
[3]
('size of data loaded at once: ', 128)
('label: ', array(0))
('text: ', (8,))
('label: ', array(0))
('text: ', (8,))
('label: ', array(0))
('text: ', (8,))


### 06 모델 선언

In [36]:
# rnn with many-to-one
class RNN(nn.Module):
    def __init__(self,
                input_size,
                word_vec_size,
                hidden_size,
                n_classes,
                num_layers=4,
                dropout_p=0.3
                ):
        super(RNN, self).__init__()
        
        self.input_size=input_size
        self.word_vec_size=word_vec_size
        self.hidden_size=hidden_size
        self.n_classes=n_classes
        self.num_layers=num_layers
        self.dropout_p=dropout_p
        
        # 입력 차원(vocab_size), 출력 차원(word_vec_size)
        self.emb=nn.Embedding(input_size, word_vec_size) #부터
        
        self.lstm=nn.LSTM(input_size=word_vec_size,
                         hidden_size=hidden_size,
                         num_layers=num_layers,
                         dropout=dropout_p,
                         batch_first=True,
                         bidirectional=True)
        self.fc=nn.Linear(hidden_size*2, num_classes)
        
        self.activation=nn.LogSoftmax(dim=-1) # 마지막 차원에 softmax 씌워줌
    
    def forward(self, x):
        # x: (batch_size, length)
        x=self.emb(x)
        # x: (batch_size, length, word_vec_size)
        x, _ = self.lstm(x)
        
        # x: (bacth_Size, length, hidden_size*2)
        # x[:-1]: (batch_size, 1, hidden_size*@)
        out = self.activation(self.fc(x[:,-1]))
        # self.fc(x[:-1]): (batch_Size, num_classes)
        return out
        

In [37]:
model=RNN(input_size=vocab_size,
         word_vec_size=word_vec_size,
         hidden_size=hidden_size,
         n_classes=num_classes,
         num_layers=num_layers,
         dropout_p=dropout_p)

In [38]:
def ComputeAccr(dloader, imodel):
    correct=0
    total=0
    
    model.eval()
    for i, data in enumerate(dloader):
        texts=data.text.to(device)
        labels=data.label.to(device)
        
        output=model(texts)
        _, output_index=torch.max(output,1)
        
        total +=labels.size(0)
        correct += (output_index == labels).sum().float()
        
    model.train()
    return (100*correct/total).numpy()

In [39]:
print("Accuracy of Test Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Test Data: 87.74


### 07 loss, optimizer

In [42]:
loss_func=nn.NLLLoss()
optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate)

### 08 학습

In [47]:
total_step=len(loaders.train_loader)

for epoch in range(num_epochs):
    for i, data in enumerate(loaders.train_loader):
        texts=data.text.to(device)
        labels=data.label.to(device)
        
#         print("[%d]"%i)
        
        outputs=model(texts)
        loss=loss_func(outputs,labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if(i+1)%10==0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accr: {:.2f}'
                 .format(epoch+1, num_epochs, i+1, total_step,
                        loss.item(),
                        ComputeAccr(loaders.valid_loader, model)))

Epoch [1/10], Step [10/30], Loss: 0.5536, Accr: 88.82
Epoch [1/10], Step [20/30], Loss: 0.1615, Accr: 87.74
Epoch [1/10], Step [30/30], Loss: 0.1280, Accr: 91.72
Epoch [2/10], Step [10/30], Loss: 0.2028, Accr: 86.24
Epoch [2/10], Step [20/30], Loss: 0.2587, Accr: 87.74
Epoch [2/10], Step [30/30], Loss: 0.1699, Accr: 92.15
Epoch [3/10], Step [10/30], Loss: 0.1019, Accr: 94.73
Epoch [3/10], Step [20/30], Loss: 0.4707, Accr: 93.23
Epoch [3/10], Step [30/30], Loss: 0.1358, Accr: 95.05
Epoch [4/10], Step [10/30], Loss: 0.0218, Accr: 93.01
Epoch [4/10], Step [20/30], Loss: 0.2102, Accr: 95.81
Epoch [4/10], Step [30/30], Loss: 0.0125, Accr: 94.30
Epoch [5/10], Step [10/30], Loss: 0.0800, Accr: 95.16
Epoch [5/10], Step [20/30], Loss: 0.0015, Accr: 95.48
Epoch [5/10], Step [30/30], Loss: 0.1426, Accr: 95.70
Epoch [6/10], Step [10/30], Loss: 0.0015, Accr: 94.62
Epoch [6/10], Step [20/30], Loss: 0.0021, Accr: 96.24
Epoch [6/10], Step [30/30], Loss: 0.0022, Accr: 96.24
Epoch [7/10], Step [10/30], 

### 09 테스트

In [48]:
print("Accuracy of Valid Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Valid Data: 96.45


### 10 학습된 파라미터 저장

In [49]:
netname='./nets/rnn_weight_20201126.pkl'
torch.save(model, netname, )

  "type " + obj.__name__ + ". It won't be checked "


### 11 학습된 파라미터 로드

In [50]:
netname='./nets/rnn_weight_20201126.pkl'
model=torch.load(netname)

In [51]:
print("Accuracy of Valid Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Valid Data: 96.45
