## Movie_Review_Sentiment_Classification
### (영화리뷰 감성분석)

***
#### 1. 데이터 준비와 확인

In [12]:
import random
import numpy as np
import pandas as pd
import torch
from torchtext import data
from torchtext import datasets
from tqdm import tqdm

from eunjeon import Mecab
mecab = Mecab()

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

문장의 길이가 필터 사이즈보다 작으면 에러가 나므로 다음과 같이 토크나이저 수정

In [2]:
FILTER_SIZES = [3,4,5]
def tokenizer(text):
    token = [t for t in mecab.morphs(text)]
    if len(token) < max(FILTER_SIZES):
        for i in range(0, max(FILTER_SIZES) - len(token)):
            token.append('<PAD>')
    return token

In [3]:
TEXT = data.Field(tokenize = tokenizer, batch_first = True)
LABEL = data.LabelField(dtype = torch.float)

In [4]:
fields = {'document': ('text',TEXT), 'label': ('label',LABEL)}
# dictionary 형식은 {csv컬럼명 : (데이터 컬럼명, Field이름)}

In [5]:
df1 = pd.read_csv("0112.csv", encoding="utf-8")
df2 = pd.read_csv("0113.csv", encoding="utf-8")
df3 = pd.read_csv("0114.csv", encoding="utf-8")
df4 = pd.read_csv("0115.csv", encoding="utf-8")
df5 = pd.read_csv("0116.csv", encoding="utf-8")
df6 = pd.read_csv("0117.csv", encoding="utf-8")
df7 = pd.read_csv("0118.csv", encoding="utf-8")
df8 = pd.read_csv("0119.csv", encoding="utf-8")
df9 = pd.read_csv("0120.csv", encoding="utf-8")
df10 = pd.read_csv("0121.csv", encoding="utf-8")
df11 = pd.read_csv("0122.csv", encoding="utf-8")
df12 = pd.read_csv("0123.csv", encoding="utf-8")
df13 = pd.read_csv("0124.csv", encoding="utf-8")
df14 = pd.read_csv("0125.csv", encoding="utf-8")
df15 = pd.read_csv("0126.csv", encoding="utf-8")
df16 = pd.read_csv("0127.csv", encoding="utf-8")
df17 = pd.read_csv("0128.csv", encoding="utf-8")
df18 = pd.read_csv("0129.csv", encoding="utf-8")
df19 = pd.read_csv("0130.csv", encoding="utf-8")
df20 = pd.read_csv("0131.csv", encoding="utf-8")
df21 = pd.read_csv("0201.csv", encoding="utf-8")
df22 = pd.read_csv("0202.csv", encoding="utf-8")
df23 = pd.read_csv("0203.csv", encoding="utf-8")
df24 = pd.read_csv("0204.csv", encoding="utf-8")
df25 = pd.read_csv("0205.csv", encoding="utf-8")
df26 = pd.read_csv("0206.csv", encoding="utf-8")
df27 = pd.read_csv("0207.csv", encoding="utf-8")
df28 = pd.read_csv("0208.csv", encoding="utf-8")
df29 = pd.read_csv("0209.csv", encoding="utf-8")
df30 = pd.read_csv("0210.csv", encoding="utf-8")
df31 = pd.read_csv("0211.csv", encoding="utf-8")
df32 = pd.read_csv("0212.csv", encoding="utf-8")
df33 = pd.read_csv("0213.csv", encoding="utf-8")

In [6]:
df = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12,df13,df14,df15,df16,df17,df18,df19,df20,
                df21,df22,df23,df24,df25,df26,df27,df28,df29,df30,df31,df32,df33], axis=0, ignore_index = True)
df = df.rename(columns = {'sentence':'document'})

def rating_to_label(score):
    if score <= 2:
        return 1
    if 3 <= score <= 4:
        return 2
    if 5 <= score <= 6:
        return 3
    if 7 <= score <= 8:
        return 4
    if score >= 9:
        return 5
    
df['label'] = df['score'].apply(lambda x: rating_to_label(x))
df = df.drop(columns = 'score')
df['document'] = df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # 정규 표현식 수행
df['document'] = df['document'].str.replace('^ +', "") # 공백은 empty 값으로 변경
df['document'].replace('', np.nan, inplace=True) # 공백은 Null 값으로 변경
df = df.dropna(axis = 0)
df.set_index('document',inplace=True)
df = df.astype('int')
df.reset_index(inplace=True)

  df['document'] = df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # 정규 표현식 수행
  df['document'] = df['document'].str.replace('^ +', "") # 공백은 empty 값으로 변경


In [7]:
df.label.value_counts()

5    43890
4    12477
1    10026
3     6496
2     3591
Name: label, dtype: int64

In [8]:
stopwords = ['은','는','이','가','의','들','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

def preprocess(text):
    word = [t for t in text if t not in stopwords]
    return word

In [9]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size=0.2, random_state=0)

train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

In [10]:
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

In [13]:
for sentence in tqdm(train_data['document']):
    tokenized_sentence = mecab.morphs(sentence) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence if not word in stopwords] # 불용어 제거
    train_data.append(stopwords_removed_sentence)

  train_data.append(stopwords_removed_sentence)
100%|███████████████████████████████████████████████████████████████████████████| 61184/61184 [05:03<00:00, 201.74it/s]


In [14]:
for sentence in tqdm(test_data['document']):
    tokenized_sentence = mecab.morphs(sentence) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence if not word in stopwords] # 불용어 제거
    test_data.append(stopwords_removed_sentence)

  test_data.append(stopwords_removed_sentence)
100%|███████████████████████████████████████████████████████████████████████████| 15296/15296 [00:37<00:00, 410.47it/s]


In [15]:
import random
train_data, test_data = data.TabularDataset.splits(
                            path = '.',
                            train = 'train_data.csv',
                            test = 'test_data.csv',
                            format = 'csv',
                            fields = fields
)
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [16]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data,
                max_size = MAX_VOCAB_SIZE,
                vectors = 'fasttext.simple.300d',
                unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

In [17]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True,
    device = device)

In [18]:
import torch.nn as nn
import torch.nn.functional as F

In [19]:
def print_shape(name, data):
    print(f'{name} has shape {data.shape}')

In [20]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1,
                                             out_channels=n_filters,
                                             kernel_size=(fs, embedding_dim))
                                   for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        #print_shape('text', text)
        # text = [batch_size, sent_len]
        
        embedded = self.embedding(text)
        #print_shape('embedded', embedded)
        # embedded = [batch_size, sent_len, emb_dim]
        
        embedded = embedded.unsqueeze(1)
        #print_shape('embedded', embedded)
        # embedded = [batch_size, 1, sent_len, emb_dim]
        
        #print_shape('self.convs[0](embedded)', self.convs[0](embedded))
        # self.convs[0](embedded) = [batch_size, n_filters, sent_len-filter_sizes[n]+1, 1 ]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        
        #print_shape('F.max_pool1d(conved[0], conved[0].shape[2])', F.max_pool1d(conved[0], conved[0].shape[2]))
        # F.max_pool1d(conved[0], conved[0].shape[2]) = [batch_size, n_filters, 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        cat = self.dropout(torch.cat(pooled, dim=1))
        #print_shape('cat', cat)
        # cat = [batch_size, n_filters * len(filter_size)]
        
        res = self.fc(cat)
        #print_shape('res', res)
        # res = [batch_size, output_dim]
        
        return self.fc(cat)

In [21]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [22]:
model = model.to(device)

In [23]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'모델의 파라미터 수는 {count_parameters(model):,} 개 입니다.')

모델의 파라미터 수는 7,861,201 개 입니다.


In [24]:
pretrained_weight = TEXT.vocab.vectors
print(pretrained_weight.shape, model.embedding.weight.data.shape)

torch.Size([25002, 300]) torch.Size([25002, 300])


In [25]:
model.embedding.weight.data.copy_(pretrained_weight)

tensor([[-0.1117, -0.4966,  0.1631,  ..., -1.4447,  0.8402, -0.8668],
        [ 0.1032, -1.6268,  0.5729,  ...,  0.3180, -0.1626, -0.0417],
        [-0.1020,  1.6282,  2.1635,  ..., -0.6009, -0.1467,  0.0285],
        ...,
        [-0.3387, -1.4635, -0.0609,  ...,  0.3799,  1.0763,  0.1709],
        [ 1.1313,  0.7882, -0.9301,  ..., -1.6988,  1.0547, -1.1780],
        [ 0.4711,  1.4491, -0.7185,  ..., -0.0707,  0.6123, -0.8139]])

In [26]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

## 모델 훈련

In [27]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [28]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [29]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds==y).float()
    acc = correct.sum() / len(correct)
    return acc

In [30]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1) # output_dim = 1
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [31]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [32]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [33]:
from tqdm import tqdm

N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in tqdm(range(N_EPOCHS)):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
        
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

 20%|████████████████▊                                                                   | 1/5 [01:20<05:20, 80.10s/it]

Epoch: 01 | Epoch Time: 1m 20s
	Train Loss: -81.552 | Train Acc: 31.15%
	 Val. Loss: -431.359 |  Val. Acc: 33.86%


 40%|█████████████████████████████████▌                                                  | 2/5 [02:55<04:28, 89.37s/it]

Epoch: 02 | Epoch Time: 1m 35s
	Train Loss: -2401.496 | Train Acc: 35.22%
	 Val. Loss: -5987.214 |  Val. Acc: 34.41%


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [04:35<03:07, 93.93s/it]

Epoch: 03 | Epoch Time: 1m 39s
	Train Loss: -14070.523 | Train Acc: 35.89%
	 Val. Loss: -24944.217 |  Val. Acc: 33.99%


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [06:19<01:38, 98.00s/it]

Epoch: 04 | Epoch Time: 1m 44s
	Train Loss: -43673.849 | Train Acc: 35.94%
	 Val. Loss: -65008.408 |  Val. Acc: 34.88%


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [08:03<00:00, 96.73s/it]

Epoch: 05 | Epoch Time: 1m 43s
	Train Loss: -98131.239 | Train Acc: 36.28%
	 Val. Loss: -131500.562 |  Val. Acc: 34.74%





In [34]:
model.load_state_dict(torch.load('tut4-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: -130895.431 | Test Acc: 35.37%


In [35]:
for epoch in tqdm(range(N_EPOCHS)):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
            
    print(f'Epoch: {epoch+6:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

 20%|████████████████▌                                                                  | 1/5 [01:44<06:56, 104.07s/it]

Epoch: 06 | Epoch Time: 1m 44s
	Train Loss: -183838.562 | Train Acc: 36.26%
	 Val. Loss: -229074.940 |  Val. Acc: 34.90%


 40%|█████████████████████████████████▏                                                 | 2/5 [03:25<05:07, 102.62s/it]

Epoch: 07 | Epoch Time: 1m 41s
	Train Loss: -302592.981 | Train Acc: 36.31%
	 Val. Loss: -361909.386 |  Val. Acc: 34.86%


 60%|█████████████████████████████████████████████████▊                                 | 3/5 [05:10<03:26, 103.46s/it]

Epoch: 08 | Epoch Time: 1m 44s
	Train Loss: -464330.699 | Train Acc: 36.41%
	 Val. Loss: -535287.599 |  Val. Acc: 34.68%


 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [06:53<01:43, 103.53s/it]

Epoch: 09 | Epoch Time: 1m 43s
	Train Loss: -667862.035 | Train Acc: 36.19%
	 Val. Loss: -751199.865 |  Val. Acc: 35.07%


100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [08:38<00:00, 103.75s/it]

Epoch: 10 | Epoch Time: 1m 44s
	Train Loss: -920719.567 | Train Acc: 36.65%
	 Val. Loss: -1015213.235 |  Val. Acc: 34.83%





In [36]:
model.load_state_dict(torch.load('tut4-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: -1011057.261 | Test Acc: 35.38%
