[reference](https://github.com/bentrevett/pytorch-sentiment-analysis)

In [0]:
# Importing Libraries
import os
import time
import torch
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import numpy as np
import pandas as pd
import numpy as np

In [2]:
# Colab-only: make Google Drive available under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Project folder on the mounted Drive; `path` is also passed to load_dataset and used for torch.save below
path = '/content/drive/My Drive/lt-module'
# Switch the working directory there so the local DataLoader / Model modules can be imported
# and relative file names (e.g. the checkpoints saved during training) resolve inside it
os.chdir(path)

In [0]:
# Custom Libraries
import DataLoader
import Model

## Define functions - to load dataset & model

In [0]:
# load dataset
def load_dataset(dataset, architecture, batch_size, device, path):
    """
    Build the data wrapper for the requested dataset / architecture pair.

    Arguments
    ---------
    dataset : Name of the dataset, "imdb" or "agnews"
    architecture : Model family; picks the IMDB loader variant ("cnn" or "lstm").
                   It is not consulted when dataset is "agnews".
    batch_size, device, path : Forwarded unchanged to the DataLoader class

    Returns
    -------
    The object built by DataLoader.IMDB_CNN, DataLoader.IMDB_LSTM or DataLoader.AGNEWS

    Raises
    ------
    ValueError : if the dataset is unknown, or if the architecture has no IMDB loader
    """
    if dataset == "imdb":
        if architecture == "cnn":
            return DataLoader.IMDB_CNN(batch_size, device, path)
        if architecture == "lstm":
            return DataLoader.IMDB_LSTM(batch_size, device, path)
        # Previously this fell through to `return data` with `data` never assigned.
        raise ValueError(f"{architecture} is not supported for the imdb dataset")

    if dataset == "agnews":
        return DataLoader.AGNEWS(batch_size, device, path)

    raise ValueError(f"{dataset} is not supported")

# load model and set hyperparameters
def load_model(architecture, data_choice, batch_size):
    """
    Instantiate the requested network with its preset hyperparameters.

    NOTE: this function reads the notebook-level variable `dataset` (the object
    returned by load_dataset) to obtain the vocabulary, so that cell must have
    been run first.

    Arguments
    ---------
    architecture : "cnn", "lstm" or "bert" ("bert" is a placeholder and yields None)
    data_choice : "imdb" (single output unit) or "agnews" (four output units).
                  For "lstm", any other value keeps the single-output default.
    batch_size : Accepted for interface compatibility; not used here

    Returns
    -------
    The constructed model with the <unk> and <pad> embedding rows zeroed,
    or None for "bert"

    Raises
    ------
    ValueError : if the architecture is unknown, or if "cnn" is requested with a
                 data_choice other than "imdb" / "agnews"
    """
    # temporary
    if architecture == "bert":
        return None

    if architecture not in ("cnn", "lstm"):
        raise ValueError(f"{architecture} is not supported")

    # The CNN has one class per dataset; without this check an unknown
    # data_choice left `model` unassigned and crashed further down.
    if architecture == "cnn" and data_choice not in ("imdb", "agnews"):
        raise ValueError(f"{data_choice} is not supported for the cnn architecture")

    # hyperparameters shared by both architectures
    text_field = dataset.TEXT
    vocab = text_field.vocab
    vocab_size = len(vocab)
    embedding_dim = 100
    dropout = 0.5
    pad_idx = vocab.stoi[text_field.pad_token]
    unk_idx = vocab.stoi[text_field.unk_token]

    if architecture == "cnn":
        n_filters = 100
        filter_sizes = [3, 4, 5]

        if data_choice == "imdb":
            # binary-class
            output_dim = 1
            model = Model.binaryCNN(vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)
        else:
            # multi-class (agnews)
            output_dim = 4
            model = Model.multiCNN(vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)

    else:  # lstm
        hidden_dim = 256
        n_layers = 2
        bidirectional = True
        # multi-class for agnews, otherwise the binary default
        output_dim = 4 if data_choice == "agnews" else 1
        model = Model.LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx)

    # Zero the embedding rows of the unknown and padding tokens
    model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
    model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

    return model

## Choose dataset & model

In [6]:
# Running on Colab, so for now settings are passed as plain variables rather than command-line args
data_choice = "agnews" # dataset selection
# load_dataset requires an architecture argument (it picks the loader variant for imdb);
# the model cells below reassign arch_choice before building a model
arch_choice = "cnn" # model selection
batch_size = 64 # batch size selection - 32 or 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using {device} device.')

# load dataset
dataset = load_dataset(data_choice, arch_choice, batch_size, device, path)

Using cuda device.


#### CNN

In [0]:
# Build the CNN variant; its hyperparameters are fixed inside load_model
arch_choice = "cnn"  # model selection
model = load_model(
    architecture=arch_choice,
    data_choice=data_choice,
    batch_size=batch_size,
)

In [8]:
def count_parameters(model):
    """Return the summed element count of all parameters that have requires_grad set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter total, formatted with thousands separators
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,621,704 trainable parameters


#### LSTM

In [0]:
# Build the LSTM variant; its hyperparameters are fixed inside load_model
arch_choice = "lstm"  # model selection
model = load_model(
    architecture=arch_choice,
    data_choice=data_choice,
    batch_size=batch_size,
)

In [23]:
def count_parameters(model):
    """Count the scalar entries across the model's parameters that require gradients."""
    trainable = filter(lambda p: p.requires_grad, model.parameters())
    return sum(p.numel() for p in trainable)

# Report the trainable-parameter total, formatted with thousands separators
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,812,396 trainable parameters


## Weight initialization

In [0]:
# weight initialization
def initialize_xavier_normal(m):
    """
    Initialize one layer in place; written to be passed to `model.apply`.

    - Conv2d: weight drawn from a xavier normal distribution, bias (if any) set to 0
    - GRU / LSTM / RNN: input-to-hidden weights xavier normal, hidden-to-hidden
      weights orthogonal, biases set to 0
    - any other layer type is left untouched

    The type checks are exact, so subclasses of these layers are not matched.

    Arguments
    ---------
    m : The layer of the neural network

    Returns
    -------
    None
    """
    if type(m) is torch.nn.Conv2d:
        torch.nn.init.xavier_normal_(m.weight)
        # A Conv2d built with bias=False has m.bias set to None
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)

    elif type(m) in (torch.nn.GRU, torch.nn.LSTM, torch.nn.RNN):
        # Recurrent layers name their parameters weight_ih_*, weight_hh_*, bias_*
        for name, param in m.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_normal_(param)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param)
            elif 'bias' in name:
                torch.nn.init.zeros_(param)

In [11]:
# Run the initializer over the model's submodules; the cell output below is the model's repr
model.apply(initialize_xavier_normal)

multiCNN(
  (embedding): Embedding(25002, 100)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (fc): Linear(in_features=300, out_features=4, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [12]:
# Inspect the convolution stack: dump every (name, parameter) pair it holds
module = model.convs
print(list(module.named_parameters()))

[('0.weight', Parameter containing:
tensor([[[[-1.6252e-03,  7.8407e-04, -9.1406e-05,  ...,  9.1451e-03,
           -9.8919e-03,  2.8691e-02],
          [ 7.5317e-03, -7.7893e-03,  6.3339e-03,  ..., -3.1627e-03,
           -1.1985e-04,  3.6551e-03],
          [ 6.0764e-03,  1.5388e-03,  1.0905e-02,  ...,  4.9358e-03,
            5.3245e-03, -3.4391e-04]]],


        [[[ 1.6880e-03,  1.1568e-02,  1.7059e-03,  ...,  6.7518e-03,
            4.1384e-03, -6.3355e-04],
          [ 9.5090e-03,  1.1349e-02, -9.4317e-03,  ...,  8.5466e-03,
            3.5060e-03,  8.8262e-03],
          [ 1.6474e-03,  6.0323e-03, -4.2947e-03,  ...,  5.9515e-03,
           -8.8256e-03,  1.0599e-02]]],


        [[[-8.2143e-04, -3.4910e-03,  6.3031e-04,  ...,  1.6716e-03,
           -1.8973e-03, -5.9115e-03],
          [ 8.4538e-04,  3.8626e-03, -1.6742e-02,  ...,  1.0080e-02,
           -6.1672e-03,  7.1661e-03],
          [ 4.8424e-03, -1.1446e-03, -3.5680e-03,  ..., -2.4350e-03,
           -5.5305e-03, -1.8163

In [13]:
# The model definition is complete; check its state

# Initialize the optimizer
optimizer = optim.Adam(model.parameters())

# Print the model's state_dict (fetched once rather than once per entry)
print("Model's state_dict:")
for param_tensor, tensor_value in model.state_dict().items():
    print(param_tensor, "\t", tensor_value.size())

# Print the optimizer's state_dict
print("Optimizer's state_dict:")
for var_name, var_value in optimizer.state_dict().items():
    print(var_name, "\t", var_value)

Model's state_dict:
embedding.weight 	 torch.Size([25002, 100])
convs.0.weight 	 torch.Size([100, 1, 3, 100])
convs.0.bias 	 torch.Size([100])
convs.1.weight 	 torch.Size([100, 1, 4, 100])
convs.1.bias 	 torch.Size([100])
convs.2.weight 	 torch.Size([100, 1, 5, 100])
convs.2.bias 	 torch.Size([100])
fc.weight 	 torch.Size([4, 300])
fc.bias 	 torch.Size([4])
Optimizer's state_dict:
state 	 {}
param_groups 	 [{'lr': 0.001, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'params': [140497839653440, 140497870453712, 140498183399896, 140497839653512, 140497839653368, 140497839653296, 140497839653224, 140497839653152, 140497839653080]}]


In [0]:
# Snapshot the current weights into the project folder (this cell sits before the training loop)
torch.save(model.state_dict(), os.path.join(path, "agnews-cnn.pt"))

#### LSTM

In [24]:
# Run the initializer over the model's submodules; the cell output below is the model's repr
model.apply(initialize_xavier_normal)

LSTM(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=4, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [25]:
# Inspect the recurrent layer: dump every (name, parameter) pair it holds
module = model.rnn
print(list(module.named_parameters()))

[('weight_ih_l0', Parameter containing:
tensor([[-0.0454,  0.0001, -0.0361,  ...,  0.0364, -0.0138,  0.0505],
        [ 0.0185, -0.0419,  0.0177,  ...,  0.0439,  0.0229, -0.0252],
        [ 0.0679,  0.0086,  0.0383,  ..., -0.0028,  0.0332,  0.0241],
        ...,
        [ 0.0392,  0.0362,  0.0553,  ...,  0.0006,  0.1057,  0.0176],
        [ 0.0020,  0.0335, -0.0056,  ...,  0.0065,  0.0083,  0.0660],
        [ 0.0230,  0.0063, -0.0556,  ...,  0.0530,  0.0616,  0.0520]],
       requires_grad=True)), ('weight_hh_l0', Parameter containing:
tensor([[-0.0111,  0.0356, -0.0574,  ..., -0.0291,  0.0126, -0.0135],
        [-0.0157, -0.0170, -0.0135,  ..., -0.0158, -0.0008, -0.0247],
        [-0.0324,  0.0252, -0.0129,  ..., -0.0233,  0.0037, -0.0224],
        ...,
        [ 0.0264, -0.0257, -0.0148,  ...,  0.0109,  0.0049, -0.0311],
        [ 0.0200,  0.0365,  0.0200,  ...,  0.0136, -0.0588, -0.0038],
        [ 0.0551, -0.0339,  0.0341,  ...,  0.0317,  0.0078,  0.0355]],
       requires_grad=Tru

In [26]:
# The model definition is complete; check its state

# Initialize the optimizer
optimizer = optim.Adam(model.parameters())

# Print the model's state_dict (fetched once rather than once per entry)
print("Model's state_dict:")
for param_tensor, tensor_value in model.state_dict().items():
    print(param_tensor, "\t", tensor_value.size())

# Print the optimizer's state_dict
print("Optimizer's state_dict:")
for var_name, var_value in optimizer.state_dict().items():
    print(var_name, "\t", var_value)

Model's state_dict:
embedding.weight 	 torch.Size([25002, 100])
rnn.weight_ih_l0 	 torch.Size([1024, 100])
rnn.weight_hh_l0 	 torch.Size([1024, 256])
rnn.bias_ih_l0 	 torch.Size([1024])
rnn.bias_hh_l0 	 torch.Size([1024])
rnn.weight_ih_l0_reverse 	 torch.Size([1024, 100])
rnn.weight_hh_l0_reverse 	 torch.Size([1024, 256])
rnn.bias_ih_l0_reverse 	 torch.Size([1024])
rnn.bias_hh_l0_reverse 	 torch.Size([1024])
rnn.weight_ih_l1 	 torch.Size([1024, 512])
rnn.weight_hh_l1 	 torch.Size([1024, 256])
rnn.bias_ih_l1 	 torch.Size([1024])
rnn.bias_hh_l1 	 torch.Size([1024])
rnn.weight_ih_l1_reverse 	 torch.Size([1024, 512])
rnn.weight_hh_l1_reverse 	 torch.Size([1024, 256])
rnn.bias_ih_l1_reverse 	 torch.Size([1024])
rnn.bias_hh_l1_reverse 	 torch.Size([1024])
fc.weight 	 torch.Size([4, 512])
fc.bias 	 torch.Size([4])
Optimizer's state_dict:
state 	 {}
param_groups 	 [{'lr': 0.001, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'params': [140497867597864, 14049786759800

In [0]:
# Snapshot the current weights into the project folder (this cell sits before the training loop)
torch.save(model.state_dict(), os.path.join(path, "agnews-lstm.pt"))

#### Both (shared by CNN and LSTM)

In [0]:
# Cross-entropy loss, placed on the same device as the network
criterion = torch.nn.CrossEntropyLoss().to(device)

# Move the network to the selected device
model = model.to(device)

In [0]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Arguments
    ---------
    preds : Per-class scores; the predicted class is the argmax along dim 1
    y : Target class indices, compared element-wise with the predicted classes

    Returns
    -------
    One-element float tensor holding the fraction of correct predictions,
    located on the same device as `y`
    """
    predicted_classes = preds.argmax(dim=1)  # index of the highest score per example
    n_correct = predicted_classes.eq(y).sum()
    # Build the denominator on y's device: the previous CPU-only FloatTensor mixed
    # devices when the inputs lived on the GPU, which recent PyTorch releases reject.
    batch_len = torch.tensor([y.shape[0]], dtype=torch.float, device=y.device)
    return n_correct / batch_len

In [0]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and whole seconds."""
    delta = end_time - start_time
    whole_minutes = int(delta / 60)
    # Seconds left once the whole minutes are removed, truncated to an int
    leftover_seconds = int(delta - whole_minutes * 60)
    return whole_minutes, leftover_seconds

### For CNN

In [0]:
def train(model, iterator, optimizer, criterion):
    """
    Run one optimisation pass over `iterator` (CNN variant).

    Returns a tuple (mean loss per batch, mean accuracy per batch); both are
    plain averages over the number of batches, not weighted by batch size.
    """
    model.train()

    loss_total, acc_total = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        # Element 0 of batch.text is what the CNN takes as input
        logits = model(batch.text[0])
        batch_loss = criterion(logits, batch.label)
        batch_acc = categorical_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

def evaluate(model, iterator, criterion):
    """
    Score the model on `iterator` without updating it (CNN variant).

    Returns a tuple (mean loss per batch, mean accuracy per batch); both are
    plain averages over the number of batches, not weighted by batch size.
    """
    model.eval()

    loss_total, acc_total = 0, 0

    # No gradients are needed for scoring
    with torch.no_grad():
        for batch in iterator:
            # Element 0 of batch.text is what the CNN takes as input
            logits = model(batch.text[0])

            loss_total += criterion(logits, batch.label).item()
            acc_total += categorical_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [20]:
# Number of passes over the training iterator
N_EPOCHS = 10

# Lowest validation loss seen so far; starting at inf means any finite first-epoch loss is an improvement
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One training pass followed by one validation pass
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves, so the file holds the best epoch's weights
    # (relative file name: written to the current working directory)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 16s
	Train Loss: 0.468 | Train Acc: 83.22%
	 Val. Loss: 0.338 |  Val. Acc: 88.12%
Epoch: 02 | Epoch Time: 0m 16s
	Train Loss: 0.321 | Train Acc: 88.99%
	 Val. Loss: 0.309 |  Val. Acc: 89.29%
Epoch: 03 | Epoch Time: 0m 16s
	Train Loss: 0.255 | Train Acc: 91.34%
	 Val. Loss: 0.295 |  Val. Acc: 90.00%
Epoch: 04 | Epoch Time: 0m 16s
	Train Loss: 0.212 | Train Acc: 92.69%
	 Val. Loss: 0.299 |  Val. Acc: 90.21%
Epoch: 05 | Epoch Time: 0m 16s
	Train Loss: 0.177 | Train Acc: 93.99%
	 Val. Loss: 0.321 |  Val. Acc: 90.11%
Epoch: 06 | Epoch Time: 0m 16s
	Train Loss: 0.148 | Train Acc: 94.90%
	 Val. Loss: 0.337 |  Val. Acc: 90.18%
Epoch: 07 | Epoch Time: 0m 16s
	Train Loss: 0.126 | Train Acc: 95.66%
	 Val. Loss: 0.370 |  Val. Acc: 89.99%
Epoch: 08 | Epoch Time: 0m 16s
	Train Loss: 0.107 | Train Acc: 96.34%
	 Val. Loss: 0.395 |  Val. Acc: 89.80%
Epoch: 09 | Epoch Time: 0m 16s
	Train Loss: 0.091 | Train Acc: 96.88%
	 Val. Loss: 0.436 |  Val. Acc: 89.88%
Epoch: 10 | Epoch T

In [21]:
# Restore the weights that the training loop checkpointed to 'tut3-model.pt'
model.load_state_dict(torch.load('tut3-model.pt'))

# Score the restored model on the test iterator
test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.310 | Test Acc: 89.52%


### For LSTM

In [0]:
def train(model, iterator, optimizer, criterion):
    """
    Run one optimisation pass over `iterator` (LSTM variant).

    Returns a tuple (mean loss per batch, mean accuracy per batch); both are
    plain averages over the number of batches, not weighted by batch size.
    """
    model.train()

    loss_total, acc_total = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        # batch.text unpacks into the two inputs the LSTM model is called with
        tokens, lengths = batch.text

        # squeeze(1) only removes dim 1 when that dim has size 1
        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = categorical_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

def evaluate(model, iterator, criterion):
    """
    Score the model on `iterator` without updating it (LSTM variant).

    Returns a tuple (mean loss per batch, mean accuracy per batch); both are
    plain averages over the number of batches, not weighted by batch size.
    """
    model.eval()

    loss_total, acc_total = 0, 0

    # No gradients are needed for scoring
    with torch.no_grad():
        for batch in iterator:
            # batch.text unpacks into the two inputs the LSTM model is called with
            tokens, lengths = batch.text

            # squeeze(1) only removes dim 1 when that dim has size 1
            logits = model(tokens, lengths).squeeze(1)

            loss_total += criterion(logits, batch.label).item()
            acc_total += categorical_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [30]:
# Number of passes over the training iterator
N_EPOCHS = 10

# Lowest validation loss seen so far; starting at inf means any finite first-epoch loss is an improvement
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One training pass followed by one validation pass
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves, so the file holds the best epoch's weights
    # (relative file name: written to the current working directory)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 57s
	Train Loss: 0.779 | Train Acc: 68.84%
	 Val. Loss: 0.470 |  Val. Acc: 84.10%
Epoch: 02 | Epoch Time: 0m 57s
	Train Loss: 0.461 | Train Acc: 83.32%
	 Val. Loss: 0.343 |  Val. Acc: 88.13%
Epoch: 03 | Epoch Time: 0m 57s
	Train Loss: 0.377 | Train Acc: 86.64%
	 Val. Loss: 0.322 |  Val. Acc: 89.12%
Epoch: 04 | Epoch Time: 0m 57s
	Train Loss: 0.327 | Train Acc: 88.47%
	 Val. Loss: 0.290 |  Val. Acc: 90.26%
Epoch: 05 | Epoch Time: 0m 57s
	Train Loss: 0.295 | Train Acc: 89.62%
	 Val. Loss: 0.273 |  Val. Acc: 90.83%
Epoch: 06 | Epoch Time: 0m 57s
	Train Loss: 0.270 | Train Acc: 90.54%
	 Val. Loss: 0.272 |  Val. Acc: 90.95%
Epoch: 07 | Epoch Time: 0m 57s
	Train Loss: 0.252 | Train Acc: 91.24%
	 Val. Loss: 0.259 |  Val. Acc: 91.24%
Epoch: 08 | Epoch Time: 0m 57s
	Train Loss: 0.231 | Train Acc: 91.84%
	 Val. Loss: 0.251 |  Val. Acc: 91.56%
Epoch: 09 | Epoch Time: 0m 57s
	Train Loss: 0.219 | Train Acc: 92.35%
	 Val. Loss: 0.262 |  Val. Acc: 91.54%
Epoch: 10 | Epoch T

In [31]:
# Restore the weights that the training loop checkpointed to 'tut4-model.pt'
model.load_state_dict(torch.load('tut4-model.pt'))

# Score the restored model on the test iterator
test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.263 | Test Acc: 91.17%
