In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow.keras.preprocessing.text import Tokenizer
from torchtext import data
from torchtext.data import TabularDataset
from tqdm.notebook import tqdm

In [2]:
import torch
import torchtext
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 0. 초기 설정

## 1) seed 설정

In [3]:
import random

In [4]:
SEED = 5
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f265c05be70>

## 2) Hyperparameter 설정

In [5]:
# dataset
TRAIN_SIZE = 0.7
# model
BATCH_SIZE = 64
LEARNING_RATE = 0.001
EPOCHS = 30
DROP_RATE = 0.3

## 3) GPU

In [6]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu와 cuda 중 다음 기기로 학습함:", DEVICE)

cpu와 cuda 중 다음 기기로 학습함: cuda


------
------

# 1. EDA

## 0) get DataFrame

In [7]:
### 1> set several paths
PATH_TRAIN = '../input/nlp-getting-started/train.csv'
PATH_TEST = '../input/nlp-getting-started/test.csv'

### 2> read_csv
df_train = pd.read_csv(PATH_TRAIN)
df_test = pd.read_csv(PATH_TEST)

------
------

# 2. Data Preprocess (Cleaning) 


## 0) concat

In [8]:
print(df_train.shape)
print(df_test.shape)

(7613, 5)
(3263, 4)


In [9]:
df = df_train.append(df_test, sort=False)
df.shape

(10876, 5)

-------

## 1) Cleaning by Regular expression

In [10]:
import re
import string

In [11]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    
    return url.sub('', text)

def remove_html(text):
    html = re.compile(r'<.*?>')
    
    return html.sub('', text)
    
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', text)

def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    
    return text.translate(table)

In [12]:
df['text'] = df['text'].apply(lambda x: remove_URL(x))
df['text'] = df['text'].apply(lambda x: remove_html(x))
df['text'] = df['text'].apply(lambda x: remove_emoji(x))
df['text'] = df['text'].apply(lambda x: remove_punct(x))

-------

## 2) Lemmatization (표제어 추출)

In [13]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
keywords = df_train.keyword.unique()[1:]
keywords = list(map(lambda x: x.replace('%20', ' '), keywords))

wnl = WordNetLemmatizer()

def lemmatize_sentence(sentence):
    sentence_words = sentence.split(' ')
    new_sentence_words = list()
    
    for sentence_word in sentence_words:
        sentence_word = sentence_word.replace('#', '')
        new_sentence_word = wnl.lemmatize(sentence_word.lower(), wordnet.VERB)
        new_sentence_words.append(new_sentence_word)
        
    new_sentence = ' '.join(new_sentence_words)
    new_sentence = new_sentence.strip()
    
    return new_sentence

In [15]:
df['text'] = df['text'].apply(lambda x: lemmatize_sentence(x))

-------
# Pytorch
-------

# 3. Data Preprocess

## 1) prepare dataset

In [16]:
df_train = df.iloc[:len(df_train)]
df_test = df.iloc[len(df_train):]

In [17]:
df_train = df_train[['id','text','target']]
df_test = df_test[['id','text']]

In [18]:
import os

In [19]:
if not os.path.exists('preprocessed_train.csv'):
    df_train.to_csv('preprocessed_train.csv', index = False)
    
if not os.path.exists('preprocessed_test.csv'):
    df_test.to_csv('preprocessed_test.csv', index = False)

-------

## 2) Field 정의하기

### 1> Field란?

- datatype을 define: TEXT와 LABEL 객체를 정의해준다. (input과 output을 정의해준다.)
- 추후 어떤 전처리를 할 것인지 정의한다.

(data를 받을 객체를 미리 선언하는 것 같다.)

### 2> 내부 기능

- sequential: 해당 data가 sequential data인지 여부 (default는 True)
- use_vocab: 단어 집합을 만들 것인가 여부
    - 단어 집합: 중복을 제거한 텍스트의 총 단어의 집합(set)
- tokenize: 어떤 token화 함수를 사용할 것인지 지정 (default는 string.split)
- lower : 영어 데이터를 전부 소문자화한다. (default는 False)
    - 컴퓨터는 대문자와 소문자를 다르게 판단하므로 필요한 기능
- batch_first : 미니 배치 차원을 맨 앞으로 하여 데이터를 불러올 것인지 여부. (default는 False)
- is_target : 레이블 데이터 여부. (default는 False)
- fix_length : 최대 허용 길이. 이 길이에 맞춰서 패딩 작업(Padding)이 진행됩니다.

In [20]:
TEXT = torchtext.data.Field(sequential=True, 
                            tokenize='spacy', 
                            lower=True, 
                            include_lengths=True, 
                            batch_first=True, 
                            fix_length=25)
LABEL = torchtext.data.Field(use_vocab=True,
                           sequential=False,
                           dtype=torch.float16)
ID = torchtext.data.Field(use_vocab=False,
                         sequential=False,
                         dtype=torch.float16)

--------

## 3) CSV -> TabularDataset

### 1> TabularDataset

- 데이터를 불러오면서 
- 필드에서 정의했던 토큰화 방법으로 토큰화를 수행합니다 (이때, 소문자화 같은 기본적인 전처리도 함께 이루어집니다.)

### 2> 내부 기능

- path : 파일이 위치한 경로.
- format : 데이터의 포맷.
- fields : 위에서 정의한 필드를 지정. 첫번째 원소는 데이터 셋 내에서 해당 필드를 호칭할 이름, 두번째 원소는 지정할 필드.
<font color = 'red'> (DataFrame은 column 순서대로 field 지정해야 한다.) </font>
- skip_header : 데이터의 첫번째 줄은 무시. <font color = 'red'> 꼭 해야한다. 안 그러면 column이 data로 끼어든다. </font>

In [21]:
from torchtext.data import TabularDataset

In [22]:
trainset = TabularDataset(path='preprocessed_train.csv', format='csv', skip_header=True,
                            fields=[('id', ID), ('text', TEXT), ('target', LABEL)])
testset = TabularDataset(path='preprocessed_test.csv', format='csv', skip_header=True,
                            fields=[('id', ID), ('text', TEXT)])

------

## 4) 단어 집합(단어장, vocabulary) 생성

### 1> 단어 집합이란?

- 중복을 제거한 총 단어들의 집합
- 단어 집합으로 추후 어떤 기준을 가지고(몇 번 등장?, 등장 순위별) 정수 인코딩을 만듭니다.

### 2> 내부 기능

- min_freq: 단어 집합에 추가 시 단어의 최소 등장 빈도 조건을 추가
- max_size: 단어 집합의 최대 크기를 지정

In [23]:
from torchtext.vocab import Vectors, GloVe

In [24]:
TEXT.build_vocab(trainset, testset, 
                 max_size=20000, min_freq=10,
                 vectors=GloVe(name='6B', dim=300))  # We use it for getting vocabulary of words
LABEL.build_vocab(trainset)
ID.build_vocab(trainset, testset)

.vector_cache/glove.6B.zip: 862MB [06:26, 2.23MB/s]                           
100%|█████████▉| 399480/400000 [00:57<00:00, 7280.59it/s]

-------

## 5) split dataset

In [25]:
trainset, valset = trainset.split(split_ratio = TRAIN_SIZE, random_state=random.getstate(),
                                  strata_field = 'target', stratified=True)

----------

## 6) Build the Iterator(dataloader)

In [26]:
train_iter = torchtext.data.Iterator(dataset = trainset, batch_size = BATCH_SIZE, device = DEVICE,
                                     train=True, shuffle=True, repeat=False, sort = False)
val_iter = torchtext.data.Iterator(dataset = valset, batch_size = BATCH_SIZE, device = DEVICE,
                                  train=True, shuffle=True, repeat=False)
test_iter = torchtext.data.Iterator(dataset = testset, batch_size = BATCH_SIZE, device = DEVICE,
                                   train=False, shuffle=False, repeat=False)

-------

## 7) and so on

In [27]:
word_embeddings = TEXT.vocab.vectors
vocab_size = len(TEXT.vocab)
n_classes = 2

---------
---------

# 4. Build the LSTM

In [28]:
class LSTM_model(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embedding_dim, n_classes, dropout_p = DROP_RATE):
        super(LSTM_model, self).__init__()
        self.n_layers = n_layers
        self.embed = nn.Embedding(n_vocab, embedding_dim)
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout_p)
        self.lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers = self.n_layers, batch_first = True)
        self.out = nn.Linear(self.hidden_dim, n_classes)
        
    def forward(self, x):
        # x = [64, 27]
        x = self.embed(x)
        # x = [64, 27, 128]
        h_0 = self._init_state(batch_size = x.size(0))#첫 번째 은닉 벡터 정의
        # h_0 = [1, 64, 256]
        x, _ = self.lstm(x,(h_0,h_0))
        # x = [64, 27, 256]
        h_t = x[:,-1,:]
        # h_t = [64, 256]
        self.dropout(h_t)
        logit = self.out(h_t)
        # logit = [64, 2]
        return logit
    
    def _init_state(self, batch_size = 1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [29]:
def train(model, optimizer, train_iter):
    model.train()
    acc, total_loss = 0, 0
    for b,batch in enumerate(train_iter):
        x, y = batch.text[0], batch.target
        y.sub_(1)
        y = y.type(torch.LongTensor)
        x = x.to(DEVICE)
        y = y.data.to(DEVICE)
        optimizer.zero_grad()# 기울기 0으로 초기화
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction = 'mean')
        total_loss += loss.item()
        acc += (logit.max(1)[1].view(y.size()).data == y.data).sum()
        loss.backward()
        optimizer.step()
    size = len(train_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100. * acc / size
    return avg_loss, avg_accuracy

In [30]:
def evaluate(model, val_iter):
    model.eval()
    acc, total_loss = 0., 0.
    for batch in val_iter:
        x, y = batch.text[0], batch.target
        y.sub_(1)
        y = y.type(torch.LongTensor)
        x = x.to(DEVICE)
        y = y.data.to(DEVICE)
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction = 'sum')#오차의 합 구하고 total_loss에 더해줌
        total_loss += loss.item()
        acc += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100. * acc / size
    return avg_loss, avg_accuracy

In [31]:
model = LSTM_model(1, 256, vocab_size, 128, n_classes, 0.5).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [32]:
best_val_loss = None
for e in tqdm(range(1, EPOCHS + 1)):
    train_loss, train_accuracy = train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)

    print("<<e : %d>> <<train_loss : %f>> <<train_accuracy : %f>> <<val_loss : %f>> <<val_accuracy : %f>>"%(e, train_loss, train_accuracy, val_loss, val_accuracy))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

<<e : 1>> <<train_loss : 0.010890>> <<train_accuracy : 56.089321>> <<val_loss : 0.681105>> <<val_accuracy : 57.049034>>
<<e : 2>> <<train_loss : 0.010701>> <<train_accuracy : 57.027584>> <<val_loss : 0.680417>> <<val_accuracy : 57.049034>>
<<e : 3>> <<train_loss : 0.010541>> <<train_accuracy : 57.759426>> <<val_loss : 0.660413>> <<val_accuracy : 62.390541>>


100%|█████████▉| 399480/400000 [01:10<00:00, 7280.59it/s]

<<e : 4>> <<train_loss : 0.008972>> <<train_accuracy : 71.720772>> <<val_loss : 0.549946>> <<val_accuracy : 74.080559>>
<<e : 5>> <<train_loss : 0.006856>> <<train_accuracy : 80.709328>> <<val_loss : 0.508388>> <<val_accuracy : 76.882660>>
<<e : 6>> <<train_loss : 0.005719>> <<train_accuracy : 84.969032>> <<val_loss : 0.517359>> <<val_accuracy : 78.283714>>
<<e : 7>> <<train_loss : 0.004818>> <<train_accuracy : 88.046532>> <<val_loss : 0.594362>> <<val_accuracy : 77.101578>>
<<e : 8>> <<train_loss : 0.004057>> <<train_accuracy : 90.898849>> <<val_loss : 0.576452>> <<val_accuracy : 78.064796>>
<<e : 9>> <<train_loss : 0.003220>> <<train_accuracy : 92.944267>> <<val_loss : 0.750275>> <<val_accuracy : 73.992996>>
<<e : 10>> <<train_loss : 0.002612>> <<train_accuracy : 94.539307>> <<val_loss : 0.695445>> <<val_accuracy : 75.525391>>
<<e : 11>> <<train_loss : 0.002214>> <<train_accuracy : 94.970909>> <<val_loss : 0.850475>> <<val_accuracy : 76.576180>>
<<e : 12>> <<train_loss : 0.001730>> <