In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow.keras.preprocessing.text import Tokenizer
from torchtext import data
from torchtext.data import TabularDataset
from tqdm.notebook import tqdm

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
chart = train[['text','target']]
t_chart = test[['text']]

In [4]:
def remove(text):
    text = text.str.lower()
    text = text.str.replace('http\S+','')# http://와 https:// 두 가지 버전 있음
    text = text.str.replace('[0-9]','')# 사람 숫자, 날짜
    text = text.str.replace('@\S+','')# '@' 뒤에는 아이디 태그
    text = text.str.replace(',','')
    return text

In [5]:
chart["text"] = remove(chart["text"])
t_chart["text"] = remove(t_chart["text"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
chart.to_csv('train_r.csv',index = False)
t_chart.to_csv('test_r.csv',index = False)

In [6]:
pd.read_csv('train_r.csv').target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [7]:
len(pd.read_csv('train_r.csv'))

7613

# CSV -> trainset (= TabularDataset)

In [8]:
BATCH_SIZE = 64
lr = 0.001
epochs = 20
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

In [9]:
TEXT = data.Field(sequential = True, tokenize = str.split, batch_first = True)
LABEL = data.LabelField(dtype = torch.long)

In [10]:
trainset = TabularDataset(path = 'train_r.csv', format='csv', fields=[('text', TEXT), ('label', LABEL)])
testset = TabularDataset(path = 'test_r.csv', format='csv', fields=[('text', TEXT)])

In [11]:
trainset.examples[0].text, trainset.examples[0].label

(['text'], 'target')

In [12]:
trainset.examples.pop(0) # column을 제거해야 합니다.

<torchtext.data.example.Example at 0x23c850c7488>

In [13]:
trainset.examples[0].text, trainset.examples[0].label

(['our',
  'deeds',
  'are',
  'the',
  'reason',
  'of',
  'this',
  '#earthquake',
  'may',
  'allah',
  'forgive',
  'us',
  'all'],
 '1')

# trainset -> train_iter (= torchtext.data.Iterator)

In [14]:
TEXT.build_vocab(trainset, min_freq = 2)# 최소 2번 이상 등장한 단어만 사전에 담음
LABEL.build_vocab(trainset)

In [15]:
trainset, valset = trainset.split(split_ratio = 0.9)

In [16]:
train_iter = data.Iterator(dataset = trainset, batch_size = BATCH_SIZE, device = DEVICE)
val_iter = data.Iterator(dataset = valset, batch_size = BATCH_SIZE, device = DEVICE)
test_iter = data.Iterator(dataset = testset, batch_size = BATCH_SIZE, device = DEVICE)

In [17]:
vocab_size = len(TEXT.vocab)
n_classes = 2

# model

In [18]:
class LSTM_model(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embedding_dim, n_classes, dropout_p = 0.5):
        super(LSTM_model, self).__init__()
        self.n_layers = n_layers
        self.embed = nn.Embedding(n_vocab, embedding_dim)
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout_p)
        self.lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers = self.n_layers, batch_first = True)
        self.out = nn.Linear(self.hidden_dim, n_classes)
        
    def forward(self, x):
        # x = [64, 27]
        x = self.embed(x)
        # x = [64, 27, 128]
        h_0 = self._init_state(batch_size = x.size(0))#첫 번째 은닉 벡터 정의
        # h_0 = [1, 64, 256]
        x, _ = self.lstm(x,(h_0,h_0))
        # x = [64, 27, 256]
        h_t = x[:,-1,:]
        # h_t = [64, 256]
        self.dropout(h_t)
        logit = self.out(h_t)
        # logit = [64, 2]
        return logit
    
    def _init_state(self, batch_size = 1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

# Train model

In [47]:
def train(model, optimizer, train_iter):
    model.train()
    acc, total_loss = 0, 0
    for b,batch in tqdm(enumerate(train_iter)):
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        optimizer.zero_grad()# 기울기 0으로 초기화
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction = 'mean')
        total_loss += loss.item()
        acc += (logit.max(1)[1].view(y.size()).data == y.data).sum()
        loss.backward()
        optimizer.step()
    size = len(train_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100 * acc / size
    return avg_loss, avg_accuracy

# Evaluate model

In [48]:
def evaluate(model, val_iter):
    model.eval()
    acc, total_loss = 0, 0
    for batch in val_iter:
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction = 'sum')#오차의 합 구하고 total_loss에 더해줌
        total_loss += loss.item()
        acc += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100 * acc / size
    return avg_loss, avg_accuracy

In [49]:
from tqdm import tqdm

# Train

In [50]:
model = LSTM_model(1, 256, vocab_size, 128, n_classes, 0.5).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [51]:
best_val_loss = None
for e in tqdm(range(1, epochs + 1)):
    train_loss, train_accuracy = train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)

    print("<<e : %d>> <<train_loss : %5.2f>> <<train_accuracy : %5.2f>> <<val_loss : %5.2f>> <<val_accuracy : %5.2f>>"%(e, train_loss, train_accuracy, val_loss, val_accuracy))
    sleep(0.01)

  0%|                                                                                           | 0/20 [00:00<?, ?it/s]
0it [00:00, ?it/s]
1it [00:00,  4.75it/s]
2it [00:00,  4.60it/s]
3it [00:00,  4.63it/s]
4it [00:00,  4.68it/s]
5it [00:01,  4.59it/s]
6it [00:01,  4.76it/s]
7it [00:01,  4.78it/s]
8it [00:01,  4.55it/s]
9it [00:01,  4.72it/s]
10it [00:02,  4.83it/s]
11it [00:02,  4.93it/s]
12it [00:02,  4.76it/s]
13it [00:02,  4.75it/s]
14it [00:02,  4.57it/s]
15it [00:03,  4.49it/s]
16it [00:03,  4.58it/s]
17it [00:03,  4.84it/s]
18it [00:03,  4.82it/s]
19it [00:04,  4.87it/s]
20it [00:04,  4.77it/s]
21it [00:04,  4.74it/s]
22it [00:04,  4.83it/s]
23it [00:04,  4.80it/s]
24it [00:05,  4.52it/s]
25it [00:05,  4.39it/s]
26it [00:05,  4.53it/s]
27it [00:05,  4.62it/s]
28it [00:06,  3.58it/s]
29it [00:06,  3.88it/s]
30it [00:06,  4.08it/s]
31it [00:06,  4.28it/s]
32it [00:07,  4.48it/s]
33it [00:07,  4.45it/s]
34it [00:07,  4.55it/s]
35it [00:07,  4.57it/s]
36it [00:07,  4.53it/s]
37it [

<<e : 1>> <<train_loss :  0.01>> <<train_accuracy : 56.00>> <<val_loss :  0.68>> <<val_accuracy : 57.00>>


  5%|████▏                                                                              | 1/20 [00:23<07:35, 23.95s/it]
0it [00:00, ?it/s]
1it [00:00,  4.94it/s]
2it [00:00,  4.86it/s]
3it [00:00,  4.88it/s]
4it [00:00,  4.91it/s]
5it [00:01,  4.86it/s]
6it [00:01,  3.43it/s]
7it [00:01,  3.87it/s]
8it [00:01,  4.06it/s]
9it [00:02,  4.25it/s]
10it [00:02,  4.49it/s]
11it [00:02,  4.61it/s]
12it [00:02,  4.67it/s]
13it [00:02,  4.68it/s]
14it [00:03,  4.59it/s]
15it [00:03,  4.72it/s]
16it [00:03,  3.77it/s]
17it [00:03,  4.12it/s]
18it [00:04,  4.36it/s]
19it [00:04,  4.37it/s]
20it [00:04,  4.19it/s]
21it [00:04,  4.14it/s]
22it [00:05,  4.33it/s]
23it [00:05,  4.31it/s]
24it [00:05,  4.42it/s]
25it [00:05,  4.69it/s]
26it [00:05,  4.72it/s]
27it [00:06,  4.69it/s]
28it [00:06,  4.55it/s]
29it [00:06,  4.70it/s]
30it [00:06,  4.75it/s]
31it [00:07,  4.70it/s]
32it [00:07,  4.71it/s]
33it [00:07,  4.53it/s]
34it [00:07,  4.55it/s]
35it [00:07,  4.49it/s]
36it [00:08,  4.36it/s]
37it [

<<e : 2>> <<train_loss :  0.01>> <<train_accuracy : 57.00>> <<val_loss :  0.68>> <<val_accuracy : 57.00>>


 10%|████████▎                                                                          | 2/20 [00:48<07:11, 23.99s/it]
0it [00:00, ?it/s]
1it [00:00,  4.18it/s]
2it [00:00,  4.40it/s]
3it [00:00,  4.58it/s]
4it [00:00,  4.75it/s]
5it [00:01,  4.87it/s]
6it [00:01,  4.85it/s]
7it [00:01,  4.93it/s]
8it [00:01,  4.95it/s]
9it [00:01,  4.83it/s]
10it [00:02,  3.95it/s]
11it [00:02,  3.61it/s]
12it [00:02,  3.77it/s]
13it [00:03,  3.86it/s]
14it [00:03,  3.11it/s]
15it [00:03,  3.40it/s]
16it [00:03,  3.79it/s]
17it [00:04,  4.12it/s]
18it [00:04,  4.51it/s]
19it [00:04,  4.70it/s]
20it [00:04,  4.60it/s]
21it [00:04,  4.81it/s]
22it [00:05,  4.78it/s]
23it [00:05,  4.74it/s]
24it [00:05,  4.77it/s]
25it [00:05,  4.76it/s]
26it [00:05,  4.86it/s]
27it [00:06,  5.09it/s]
28it [00:06,  4.91it/s]
29it [00:06,  4.85it/s]
30it [00:06,  4.78it/s]
31it [00:06,  4.55it/s]
32it [00:07,  4.47it/s]
33it [00:07,  4.62it/s]
34it [00:07,  4.75it/s]
35it [00:07,  4.61it/s]
36it [00:08,  4.56it/s]
37it [

<<e : 3>> <<train_loss :  0.01>> <<train_accuracy : 61.00>> <<val_loss :  0.58>> <<val_accuracy : 73.00>>


 15%|████████████▍                                                                      | 3/20 [01:11<06:46, 23.92s/it]
0it [00:00, ?it/s]
1it [00:00,  4.30it/s]
2it [00:00,  4.12it/s]
3it [00:00,  4.29it/s]
4it [00:00,  4.30it/s]
5it [00:01,  4.50it/s]
6it [00:01,  4.62it/s]
7it [00:01,  4.78it/s]
8it [00:01,  4.97it/s]
9it [00:01,  4.82it/s]
10it [00:02,  4.78it/s]
11it [00:02,  4.60it/s]
12it [00:02,  4.33it/s]
13it [00:02,  4.30it/s]
14it [00:03,  4.55it/s]
15it [00:03,  4.68it/s]
16it [00:03,  4.71it/s]
17it [00:03,  4.74it/s]
18it [00:03,  4.65it/s]
19it [00:04,  4.78it/s]
20it [00:04,  4.80it/s]
21it [00:04,  4.50it/s]
22it [00:04,  4.63it/s]
23it [00:04,  4.76it/s]
24it [00:05,  4.85it/s]
25it [00:05,  4.81it/s]
26it [00:05,  4.70it/s]
27it [00:05,  4.47it/s]
28it [00:06,  4.68it/s]
29it [00:06,  4.74it/s]
30it [00:06,  4.83it/s]
31it [00:06,  4.49it/s]
32it [00:06,  4.51it/s]
33it [00:07,  4.59it/s]
34it [00:07,  4.58it/s]
35it [00:07,  4.37it/s]
36it [00:07,  4.30it/s]
37it [

<<e : 4>> <<train_loss :  0.01>> <<train_accuracy : 76.00>> <<val_loss :  0.56>> <<val_accuracy : 74.00>>


 20%|████████████████▌                                                                  | 4/20 [01:35<06:21, 23.85s/it]
0it [00:00, ?it/s]
1it [00:00,  5.04it/s]
2it [00:00,  5.12it/s]
3it [00:00,  4.91it/s]
4it [00:00,  5.02it/s]
5it [00:00,  5.24it/s]
6it [00:01,  5.20it/s]
7it [00:01,  5.13it/s]
8it [00:01,  4.94it/s]
9it [00:01,  4.83it/s]
10it [00:02,  4.82it/s]
11it [00:02,  4.58it/s]
12it [00:02,  4.77it/s]
13it [00:02,  4.64it/s]
14it [00:02,  4.57it/s]
15it [00:03,  4.73it/s]
16it [00:03,  4.67it/s]
17it [00:03,  4.79it/s]
18it [00:03,  4.55it/s]
19it [00:03,  4.61it/s]
20it [00:04,  4.61it/s]
21it [00:04,  4.82it/s]
22it [00:04,  5.02it/s]
23it [00:04,  4.95it/s]
24it [00:04,  4.90it/s]
25it [00:05,  4.93it/s]
26it [00:05,  4.83it/s]
27it [00:05,  4.86it/s]
28it [00:05,  4.96it/s]
29it [00:05,  5.04it/s]
30it [00:06,  4.95it/s]
31it [00:06,  4.98it/s]
32it [00:06,  4.83it/s]
33it [00:06,  4.68it/s]
34it [00:07,  4.51it/s]
35it [00:07,  4.58it/s]
36it [00:07,  4.61it/s]
37it [

KeyboardInterrupt: 