# LSTM 모델을 이용한 NLP classification (스팸 메일 분류)

### 1.1 fc 복습

In [4]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable

class ANN(nn.Module):
    """Fully connected network with two hidden ReLU layers (review example).

    Args:
        num_output: Number of units in the final output layer.
        input_size: Number of input features per sample.
        hidden_size: Width of both hidden layers.
        device: Accepted by the constructor but not used by this class.
    """

    def __init__(self, num_output, input_size, hidden_size, device):
        super(ANN, self).__init__()

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        # The constructor parameter is `num_output`; the previous code
        # referenced an undefined name `output_size` here.
        self.out = nn.Linear(hidden_size, num_output)

    def forward(self, x):
        """Map ``x`` of shape ``[..., input_size]`` to ``[..., num_output]``."""
        h = self.fc1(x).relu()
        # The second layer consumes the first layer's activation, not the
        # raw input.
        h = self.fc2(h).relu()
        # The output layer is registered as `self.out` in __init__.
        predict = self.out(h)
        return predict

### 1.2 LSTM for NLP
- input layer : 정수 인코딩 된 결과
- embed layer : word2vec 역할 -> nn.Embedding(단어 갯수, embed_dim)
- LSTM  layer : nn.LSTM(embed_size, hidden_size, layer 수, dropout rate, bidirectional 여부)

In [25]:
class LSTM_net(nn.Module):
    """Embedding -> bidirectional LSTM -> two linear layers, for token-id input.

    Args:
        num_output: Number of output units (the notebook below passes 2,
            one logit per class).
        size_vocab: Number of rows in the embedding table; must be larger
            than the largest token id passed to ``forward``.
        dim_embed: Embedding dimension, also the LSTM input size.
        hidden_size: LSTM hidden size per direction.
        linear_size: Width of the intermediate linear layer.
        num_layers: Number of stacked LSTM layers.
        device: Stored on the instance as ``self.device``. The initial LSTM
            states are created on the device of the input batch instead, so
            the module works wherever its weights and inputs live.
    """

    def __init__(self, num_output, size_vocab, dim_embed, hidden_size, linear_size, num_layers, device):
        super(LSTM_net, self).__init__()
        self.device = device
        self.num_output = num_output
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Lookup table turning integer token ids into dense vectors.
        self.embed = nn.Embedding(size_vocab, dim_embed)

        # nn.LSTM applies dropout only between stacked layers and warns when
        # a non-zero rate is combined with a single layer, so only enable it
        # when there is more than one layer.
        lstm_dropout = 0.3 if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size=dim_embed, hidden_size=hidden_size,
                            num_layers=num_layers, dropout=lstm_dropout,
                            bidirectional=True)
        self.fclayer = nn.Linear(hidden_size, linear_size)
        self.outlayer = nn.Linear(linear_size, num_output)

    def forward(self, x):
        """Return class scores of shape ``[batch, num_output]``.

        Args:
            x: Integer token ids of shape ``[batch, seq_len]``.
        """
        num_directions = 2 if self.lstm.bidirectional else 1
        # [batch, seq_len] -> [batch, seq_len, dim_embed]
        emb = self.embed(x)
        # Zero initial hidden/cell states of shape
        # [num_layers * num_directions, batch, hidden]. new_zeros() creates
        # them on the same device and dtype as the embeddings, which keeps
        # the forward pass working when the model runs on a GPU.
        state_shape = (self.num_layers * num_directions, emb.size(0), self.hidden_size)
        h_state = emb.new_zeros(state_shape)
        c_state = emb.new_zeros(state_shape)
        # The LSTM is not batch_first, so feed [seq_len, batch, dim_embed].
        # Of (output, (h_n, c_n)) only the final hidden states h_n are used.
        _, (h, _) = self.lstm(emb.transpose(1, 0), (h_state, c_state))
        # Keep a single [batch, hidden] slice of the final hidden states.
        # NOTE(review): with bidirectional=True the last entry of h_n is the
        # last layer's reverse direction; the forward direction's final
        # state (h[-2]) is not used. Confirm this is intended.
        h = h[-1]
        h = self.fclayer(h).relu()
        predict = self.outlayer(h)

        return predict

### 1.3 Spam Mail Classification : 데이터 전처리

In [6]:
import pandas as pd
import urllib.request

# Download the spam CSV into the working directory, then load it.
# The file is read as latin1, matching the encoding used by the original cell.
SPAM_CSV_URL = "https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv"
SPAM_CSV_PATH = "spam.csv"
urllib.request.urlretrieve(SPAM_CSV_URL, filename=SPAM_CSV_PATH)
data = pd.read_csv(SPAM_CSV_PATH, encoding='latin1')
print('총 샘플의 수 :', len(data))
# data.info() prints its summary and returns None, so display() shows
# "None" followed by the first rows.
display(data.info(), data.head())

총 샘플의 수 : 5572
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


None

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
# Remove the three unnamed trailing columns in place, then preview the
# first five rows.
data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Encode the label column as integers: ham -> 0, spam -> 1.
# An explicit mapping plus astype('int64') guarantees an integer column.
# Series.replace on an object column only produced integers through implicit
# downcasting, which recent pandas versions deprecate; an object-dtype label
# column cannot be converted to a tensor later on.
# Any label other than 'ham'/'spam' becomes NaN and makes astype fail loudly.
data['v1'] = data['v1'].map({'ham': 0, 'spam': 1}).astype('int64')
# Put the message text first and the label second.
data = data[['v2', 'v1']]
data[:5]

Unnamed: 0,v2,v1
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
# Relabel the two columns positionally: message text first, 0/1 label second.
data = data.set_axis(['text', 'spam'], axis=1)
data.head()

Unnamed: 0,text,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


### 토큰화

In [10]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
# English stop words from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Drop rows containing missing values and renumber the index from 0.
data = data.dropna().reset_index(drop=True)
# Number of rows left after dropping.
len(data)

5572

In [11]:
# Tokenize every message (first column of `data`) and drop stop words.
# Iterating over the column itself handles however many rows `data` has,
# instead of relying on a hard-coded row count.
# The stop-word test is case-sensitive, so a token such as 'Go' is kept.
token_text = [
    [w for w in word_tokenize(text) if w not in stop_words]
    for text in data.iloc[:, 0]
]
print('after cleaning :', len(token_text))

after cleaning : 5572


In [12]:
# Preview the first three tokenized messages.
token_text[:3]

[['Go',
  'jurong',
  'point',
  ',',
  'crazy',
  '..',
  'Available',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  '...',
  'Cine',
  'got',
  'amore',
  'wat',
  '...'],
 ['Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...'],
 ['Free',
  'entry',
  '2',
  'wkly',
  'comp',
  'win',
  'FA',
  'Cup',
  'final',
  'tkts',
  '21st',
  'May',
  '2005',
  '.',
  'Text',
  'FA',
  '87121',
  'receive',
  'entry',
  'question',
  '(',
  'std',
  'txt',
  'rate',
  ')',
  'T',
  '&',
  'C',
  "'s",
  'apply',
  '08452810075over18',
  "'s"]]

### 정수 인코딩

In [13]:
# Integer-encode the corpus: frequent tokens get small ids.
from collections import Counter

# Token -> frequency over the whole corpus. Counter keeps tokens in order of
# first appearance, the same order the hand-written dict/list pair produced.
vocab = Counter(word for sentence in token_text for word in sentence)

# Frequencies in first-appearance order, kept under the original name in
# case a later cell reads it.
Bow = list(vocab.values())

# (token, count) pairs sorted by count, most frequent first. sorted() is
# stable, so ties stay in first-appearance order.
vocab_sort = sorted(vocab.items(), key=lambda item: item[1], reverse=True)

# Ids start at 1 so that 0 stays free for padding. The largest id therefore
# equals len(word2inx), and an embedding table indexed by these ids needs
# len(word2inx) + 1 rows.
word2inx = {word: index for index, (word, _) in enumerate(vocab_sort, start=1)}

# Replace every token with its id. token_text is overwritten in place, so
# this cell must run exactly once after tokenization.
for i, sentence in enumerate(token_text):
    token_text[i] = [word2inx[word] for word in sentence]


In [14]:
# Show the first message after integer encoding.
print(token_text[0])

[736, 5213, 847, 3, 848, 11, 2541, 1503, 49, 99, 413, 1318, 127, 3362, 6, 5214, 33, 5215, 122, 6]


### 학습을 위한 Label

In [15]:
# Labels for training: the second column of `data`, copied into a NumPy array.
text_label = data.iloc[:, 1].to_numpy(copy=True)

### Padding 및 데이터 자르기

In [16]:
# Report the sample counts. token_text is a ragged list (rows differ in
# length), so its size is taken with len(): np.shape() would first have to
# build an array from it, which warns on older NumPy and fails on newer ones.
print((len(token_text),))
print(np.shape(text_label))

# Keep at most the first 100 tokens of each message.
maxlen = 100

# Start from an all-zero integer matrix (0 is the padding id) and copy each
# row's tokens, truncated to maxlen, into its leading columns. Allocating
# the matrix up front keeps the dtype integer even for messages with no
# tokens left.
text_padded = np.zeros((len(token_text), maxlen), dtype=np.int64)
for row, tokens in enumerate(token_text):
    kept = tokens[:maxlen]
    if kept:
        text_padded[row, :len(kept)] = kept
print(np.shape(text_padded))

(5572,)
(5572,)
(5572, 100)


  result = asarray(a).shape


### 1.4 학습을 위한 Dataset 만들기 및 학습 과정

In [17]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch import LongTensor as LT
from torch import FloatTensor as FT

# Dataset wrapper that returns each sample as int64 tensors on `device`.
class Generate_Dataset(torch.utils.data.Dataset):
    """Index-based dataset over parallel feature and label containers.

    Args:
        xdata: Indexable container of integer features, one entry per sample.
        ydata: Indexable container of integer labels, aligned with ``xdata``.
        device: Device every returned tensor is placed on.
    """

    def __init__(self, xdata, ydata, device):
        self.x_data = xdata
        self.y_data = ydata
        self.device = device

    def __len__(self):
        """Return the number of samples, taken from ``x_data``."""
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for sample ``idx`` as ``torch.long`` tensors."""
        # torch.tensor always builds the tensor from the given *values* and
        # converts them to int64. The legacy LongTensor constructor used
        # before is shape-sensitive (a bare integer argument is a size, not
        # a value) and does not convert other integer dtypes.
        x = torch.tensor(self.x_data[idx], dtype=torch.long, device=self.device)
        y = torch.tensor(self.y_data[idx], dtype=torch.long, device=self.device)
        return x, y

### Generate Dataset

In [18]:
# Use the first 5000 padded messages; labels become a [5000, 1] column.
features = text_padded[:5000, :]
labels = text_label[:5000].reshape([-1, 1])
dataset = Generate_Dataset(features, labels, device)

# Random 4500 / 500 split into training and test subsets.
trainset, testset = random_split(dataset, [4500, 500])

# Training batches are shuffled; the test set is served as one unshuffled
# batch of 500.
train_loader = DataLoader(trainset, batch_size=256, shuffle=True)
test_loader = DataLoader(testset, batch_size=500, shuffle=False)

### Define Network and Optimizer

In [27]:
# Two output units: one logit per class (ham / spam) for cross-entropy.
# Token ids run from 1 to len(word2inx) and 0 is the padding id, so the
# embedding table needs len(word2inx) + 1 rows. With only len(word2inx)
# rows the largest id is out of range.
lstm_net = LSTM_net(num_output=2, size_vocab=len(word2inx) + 1, dim_embed=64,
                    hidden_size=64, linear_size=64, num_layers=1, device=device)
# The dataset moves every batch to `device`, so the weights must live there too.
lstm_net = lstm_net.to(device)

optimizer = torch.optim.Adam(lstm_net.parameters(), lr=0.01)

### Training Session
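- x : 정수 인코딩된 메일 단어 시퀀스 [batch, seq_len]
- predict size : [batch, 2] : 2는 클래스(ham/spam)별 logit 개수 (softmax는 cross_entropy 내부에서 적용됨)
- y : [batch, 1] -> cross_entropy에는 ravel()로 펼친 [batch] 형태의 클래스 인덱스를 전달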

In [28]:
# Train for 10 epochs with mini-batch cross-entropy.
for epoch in range(10):
    print('Epoch', epoch)
    for x, y in train_loader:
        # Clear gradients from the previous step before the new backward pass.
        optimizer.zero_grad()
        predict = lstm_net(x)
        # Labels arrive as [batch, 1]; cross_entropy expects class indices
        # of shape [batch].
        target = y.reshape(-1)
        loss = nn.functional.cross_entropy(predict, target)
        loss.backward()
        optimizer.step()
    # Reports the loss of the last mini-batch of the epoch only.
    print(loss)

Epoch 0
tensor(0.1590, grad_fn=<NllLossBackward0>)
Epoch 1
tensor(0.0245, grad_fn=<NllLossBackward0>)
Epoch 2
tensor(0.0486, grad_fn=<NllLossBackward0>)
Epoch 3
tensor(0.0107, grad_fn=<NllLossBackward0>)
Epoch 4
tensor(0.0075, grad_fn=<NllLossBackward0>)
Epoch 5
tensor(6.5922e-05, grad_fn=<NllLossBackward0>)
Epoch 6
tensor(1.6995e-05, grad_fn=<NllLossBackward0>)
Epoch 7
tensor(4.4514e-05, grad_fn=<NllLossBackward0>)
Epoch 8
tensor(4.1579e-05, grad_fn=<NllLossBackward0>)
Epoch 9
tensor(3.4756e-05, grad_fn=<NllLossBackward0>)


### Test the Performance

In [29]:
# Evaluate on the test set, accumulating over every batch so the result
# does not depend on the loader's batch size or on a fixed sample count.
was_training = lstm_net.training
lstm_net.eval()  # inference mode for layers such as dropout
score = 0
total = 0
with torch.no_grad():  # no gradients are needed for scoring
    for x, y in test_loader:
        # .cpu() makes the NumPy conversion work for tensors on a GPU too.
        predict = lstm_net(x).argmax(1).cpu().numpy()
        answer = y.ravel().cpu().numpy()
        score += int((predict == answer).sum())
        total += len(answer)
# Restore the previous mode so re-running the training cell behaves as before.
lstm_net.train(was_training)

accuracy = score / total * 100 if total else 0.0
print(f"{score} out of {total}, accuracy is {accuracy} %")

485 out of 500, accuracy is 97.0 %
