In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import torch

In [3]:
# GPU Check

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print ('Available devices ', torch.cuda.device_count())
if device.type == 'cuda':
    # current_device() / get_device_name() raise on a runtime without CUDA,
    # so they are only queried when a GPU is actually present.
    print ('Current cuda device ', torch.cuda.current_device())
    print(torch.cuda.get_device_name(device))
else:
    print('CUDA is not available; running on CPU')

Available devices  1
Current cuda device  0
Tesla T4


In [0]:
# TPU Check

# Imported locally: this cell sits above the notebook's main import cell, so
# `os` would otherwise be undefined on a fresh kernel run from top to bottom.
import os

# Stop early with a readable message when COLAB_TPU_ADDR is not set.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

TPU address is grpc://10.101.190.90:8470


In [0]:
import os
import time
import torch
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import numpy as np

In [0]:
import random

SEED = 1234 # Random seed for reproducibility

# Seed Python's RNG together with PyTorch's so that reproducibility does not
# depend on a later cell seeding `random` as a side effect.
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Specify the tokenizer and preprocessing for the text field.
# include_lengths=True is why the training code reads the token tensor as batch.text[0].
TEXT = data.Field(sequential=True, tokenize='spacy', lower=True, include_lengths=True, batch_first=True, fix_length=200)
# TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

In [0]:
from torchtext.data import TabularDataset

# Directory that must contain train.csv and test.csv
DATA_PATH = '/content/drive/My Drive/lottery-ticket-hypothesis-for-text-classification/data/yelp-polarity/'


def _load_csv_split(file_name):
    """Read one CSV file under DATA_PATH as a (text, label) dataset; the first row is kept as data."""
    return data.TabularDataset(
        path=DATA_PATH + file_name,
        format='csv',
        fields=[('text', TEXT), ('label', LABEL)],
        skip_header=False,
    )


train = _load_csv_split('train.csv')
test = _load_csv_split('test.csv')

## IMDb

In [0]:
# Load the IMDb splits with the shared fields.
# Note: this rebinds `train` and `test`, replacing any dataset loaded earlier.
train, test = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

n_train, n_test = len(train), len(test)
print(f'Number of training examples: {n_train}')
print(f'Number of testing examples: {n_test}')

# Show the raw attributes of the first training example as a sanity check.
first_example = train.examples[0]
print(vars(first_example))

Number of training examples: 25000
Number of testing examples: 25000
{'text': ['please', 'see', 'also', 'my', 'comment', 'on', 'die', 'nibelungen', 'part', '1', ':', 'siegfried.<br', '/><br', '/>the', 'second', 'part', 'of', 'ufa', 'studio', "'s", 'gargantuan', 'production', 'of', 'the', 'nibelungen', 'saga', 'continues', 'in', 'the', 'stylised', ',', 'symphonic', 'and', 'emotionally', 'detached', 'manner', 'of', 'its', 'predecessor', '.', 'however', ',', 'whereas', 'part', 'one', 'was', 'a', 'passionless', 'portrayal', 'of', 'individual', 'acts', 'of', 'heroism', ',', 'part', 'two', 'is', 'a', 'chaotic', 'depiction', 'of', 'bloodletting', 'on', 'a', 'grand', 'scale.<br', '/><br', '/>as', 'in', 'part', 'one', ',', 'director', 'fritz', 'lang', 'maintains', 'a', 'continuous', 'dynamic', 'rhythm', ',', 'with', 'the', 'pace', 'of', 'the', 'action', 'and', 'the', 'complexity', 'of', 'the', 'shot', 'composition', 'rising', 'and', 'falling', 'smoothly', 'as', 'the', 'tone', 'of', 'each', 'sce

## Yelp-5

In [0]:
# make splits for data
# by default this splits 70:30

# random.seed() returns None, so passing its return value as random_state
# handed the splitter no state at all. Seed first, then pass the real RNG state.
random.seed(SEED)
# NOTE: this rebinds `train`; re-running the cell splits the already reduced set again.
train, valid = train.split(random_state = random.getstate())

In [11]:
# Report the dataset fields and how many examples each split holds.
n_train, n_valid, n_test = len(train), len(valid), len(test)

print('train.fields', train.fields)
print(f'Number of training examples: {n_train}')
print(f'Number of validation examples: {n_valid}')
print(f'Number of testing examples: {n_test}')

train.fields {'text': <torchtext.data.field.Field object at 0x7f5bd3f946d8>, 'label': <torchtext.data.field.LabelField object at 0x7f5b6b904160>}
Number of training examples: 392000
Number of validation examples: 168000
Number of testing examples: 38000


In [12]:
# Build both vocabularies from the training split; the text vocabulary is
# paired with 300-dimensional GloVe 6B vectors.
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# Embedding matrix attached to the text vocabulary, reused by every model below.
word_embeddings = TEXT.vocab.vectors

text_vocab_len = len(TEXT.vocab)
label_vocab_len = len(LABEL.vocab)
print ("Length of Text Vocabulary: " + str(text_vocab_len))
print ("Vector size of Text Vocabulary: ", word_embeddings.size())
print ("Label Length: " + str(label_vocab_len))

.vector_cache/glove.6B.zip: 862MB [06:33, 2.19MB/s]                           
100%|█████████▉| 399331/400000 [00:50<00:00, 11237.82it/s]

Length of Text Vocabulary: 527135
Vector size of Text Vocabulary:  torch.Size([527135, 300])
Label Length: 2


In [0]:
BATCH_SIZE = 32

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One iterator per split, returned in the same order as the datasets tuple.
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test),
    device=device,
    sort=False,
    batch_size=BATCH_SIZE)

vocab_size = len(TEXT.vocab)

In [0]:
# TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_data.load_dataset()

def clip_gradient(model, clip_value):
    """Clamp every existing gradient of `model` element-wise into [-clip_value, clip_value], in place."""
    for param in model.parameters():
        # Parameters that never received a gradient are left untouched.
        if param.grad is None:
            continue
        param.grad.data.clamp_(-clip_value, clip_value)
    
def train_model(model, train_iter, epoch, batch_size=32):
    """Train `model` for one pass over `train_iter`.

    Parameters
    ----------
    model : network called as ``model(text)``; its output is handed to the
        module-level ``loss_fn`` together with the integer targets.
    train_iter : iterable of batches exposing ``batch.text`` (a sequence whose
        first element is the token-id tensor) and ``batch.label``.
    epoch : zero-based epoch index, used only in the progress message.
    batch_size : batches whose first dimension differs from this value are
        skipped. Defaults to 32, the value that used to be hard-coded.

    Returns
    -------
    ``(mean_loss, mean_accuracy_percent)`` averaged over the batches that were
    actually trained on, or ``(0.0, 0.0)`` when every batch was skipped.

    Notes
    -----
    A new Adam optimizer with default hyper-parameters is created on every
    call, so optimizer state does not carry over from one epoch to the next.
    """
    # Keep model and data on the same device; fall back to CPU without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))

    total_epoch_loss = 0.0
    total_epoch_acc = 0.0
    steps = 0  # number of batches actually trained on
    model.train()
    for idx, batch in enumerate(train_iter):
        text = batch.text[0].to(device)
        target = batch.label.long().to(device)
        # One of the batches returned by the iterator can be shorter than the
        # rest; compare by value (not identity) and skip it.
        if text.size(0) != batch_size:
            continue
        optimizer.zero_grad()
        prediction = model(text)
        loss = loss_fn(prediction, target)
        predicted_class = prediction.argmax(dim=1).view(target.size())
        num_corrects = (predicted_class == target).float().sum()
        acc = 100.0 * num_corrects / text.size(0)
        loss.backward()
        clip_gradient(model, 1e-1)
        optimizer.step()
        steps += 1

        if steps % 1000 == 0:
            print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')

        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()

    if steps == 0:
        return 0.0, 0.0
    # Average over processed batches only, so skipped batches do not drag the
    # reported metrics towards zero.
    return total_epoch_loss / steps, total_epoch_acc / steps

def eval_model(model, val_iter, batch_size=32):
    """Evaluate `model` on `val_iter` without computing gradients.

    Parameters
    ----------
    model : network called as ``model(text)``; its output is handed to the
        module-level ``loss_fn`` together with the integer targets.
    val_iter : iterable of batches exposing ``batch.text`` (a sequence whose
        first element is the token-id tensor) and ``batch.label``.
    batch_size : batches whose first dimension differs from this value are
        skipped. Defaults to 32, the value that used to be hard-coded.

    Returns
    -------
    ``(mean_loss, mean_accuracy_percent)`` averaged over the batches that were
    actually evaluated, or ``(0.0, 0.0)`` when every batch was skipped.
    """
    # Feed the data to whatever device the model already lives on.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    total_epoch_loss = 0.0
    total_epoch_acc = 0.0
    evaluated = 0  # number of batches that contributed to the totals
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text = batch.text[0]
            # Compare by value (not identity) and skip odd-sized batches.
            if text.size(0) != batch_size:
                continue
            text = text.to(device)
            target = batch.label.long().to(device)
            prediction = model(text)
            loss = loss_fn(prediction, target)
            predicted_class = prediction.argmax(dim=1).view(target.size())
            num_corrects = (predicted_class == target).float().sum()
            acc = 100.0 * num_corrects / text.size(0)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()
            evaluated += 1

    if evaluated == 0:
        return 0.0, 0.0
    # Average over evaluated batches only, so skipped batches do not deflate
    # the reported metrics.
    return total_epoch_loss / evaluated, total_epoch_acc / evaluated

In [0]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

In [0]:
# CNN

class CNN(nn.Module):
    """Convolutional text classifier over frozen pre-trained word embeddings.

    Three convolutions with different kernel heights (each spanning the full
    embedding width) are applied to the embedded sentence, max-pooled over the
    sequence, concatenated, passed through dropout and mapped to logits.
    """

    def __init__(self, batch_size, output_size, in_channels, out_channels, kernel_heights, stride, padding, keep_probab, vocab_size, embedding_length,weights):
        """
        Arguments
        ---------
        batch_size : Size of each batch which is same as the batch_size of the data returned by the TorchText BucketIterator
        output_size : 2 = (pos, neg)
        in_channels : Number of input channels. Here it is 1 as the input data has dimension = (batch_size, num_seq, embedding_length)
        out_channels : Number of output channels after convolution operation performed on the input matrix
        kernel_heights : A list consisting of 3 different kernel_heights. Convolution will be performed 3 times and finally results from each kernel_height will be concatenated.
        stride : Stride passed to every convolution
        padding : Padding passed to every convolution
        keep_probab : Probability of retaining an activation node during dropout operation
        vocab_size : Size of the vocabulary containing unique words
        embedding_length : Embedding dimension of GloVe word embeddings
        weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table

        Raises
        ------
        ValueError : if kernel_heights does not contain exactly 3 entries.
        """
        super(CNN, self).__init__()

        # Exactly three convolutions are built below while the linear layer is
        # sized from len(kernel_heights); reject any other length up front
        # instead of failing later with a shape mismatch.
        if len(kernel_heights) != 3:
            raise ValueError('kernel_heights must contain exactly 3 values, got %d' % len(kernel_heights))

        self.batch_size = batch_size
        self.output_size = output_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_heights = kernel_heights
        self.stride = stride
        self.padding = padding
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length

        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
        # Replace the random table with the pre-trained vectors and freeze them.
        self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
        self.conv1 = nn.Conv2d(in_channels, out_channels, (kernel_heights[0], embedding_length), stride, padding)
        self.conv2 = nn.Conv2d(in_channels, out_channels, (kernel_heights[1], embedding_length), stride, padding)
        self.conv3 = nn.Conv2d(in_channels, out_channels, (kernel_heights[2], embedding_length), stride, padding)
        # nn.Dropout takes the probability of ZEROING an element, whereas
        # keep_probab is the probability of KEEPING one, hence the complement.
        self.dropout = nn.Dropout(1.0 - keep_probab)
        self.label = nn.Linear(len(kernel_heights)*out_channels, output_size)

    def conv_block(self, input, conv_layer):
        """Apply one convolution, ReLU and max-pooling over the sequence axis.

        Returns a tensor of shape (batch_size, out_channels).
        """
        conv_out = conv_layer(input) # conv_out.size() = (batch_size, out_channels, dim, 1)
        activation = F.relu(conv_out.squeeze(3)) # activation.size() = (batch_size, out_channels, dim1)
        max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2) # max_out.size() = (batch_size, out_channels)

        return max_out

    def forward(self, input_sentences, batch_size=None):
        """
        The idea of the Convolutional Neural Network for Text Classification is very simple. We perform convolution operation on the embedding matrix
        whose shape for each batch is (num_seq, embedding_length) with kernel of varying height but constant width which is same as the embedding_length.
        We will be using ReLU activation after the convolution operation and then for each kernel height, we will use max_pool operation on each tensor
        and will filter all the maximum activation for every channel and then we will concatenate the resulting tensors. This output is then fully connected
        to the output layer, which gives us the logits for the classes.

        Parameters
        ----------
        input_sentences: input_sentences of shape = (batch_size, num_sequences)
        batch_size : default = None. Accepted for interface compatibility with the other models; not used by this model.

        Returns
        -------
        Output of the linear layer containing the class logits.
        logits.size() = (batch_size, output_size)
        """
        embedded = self.word_embeddings(input_sentences)
        # embedded.size() = (batch_size, num_seq, embedding_length)
        embedded = embedded.unsqueeze(1)
        # embedded.size() = (batch_size, 1, num_seq, embedding_length)
        max_out1 = self.conv_block(embedded, self.conv1)
        max_out2 = self.conv_block(embedded, self.conv2)
        max_out3 = self.conv_block(embedded, self.conv3)

        all_out = torch.cat((max_out1, max_out2, max_out3), 1)
        # all_out.size() = (batch_size, num_kernels*out_channels)
        fc_in = self.dropout(all_out)
        # fc_in.size() = (batch_size, num_kernels*out_channels)
        logits = self.label(fc_in)

        return logits

In [0]:
# RNN

class RNN(nn.Module):
    """Two-layer bidirectional RNN classifier over frozen pre-trained word embeddings."""

    def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
        """
        Arguments
        ---------
        batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
        output_size : 2 = (pos, neg)
        hidden_size : Size of the hidden_state of the RNN
        vocab_size : Size of the vocabulary containing unique words
        embedding_length : Embedding dimension of GloVe word embeddings
        weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table
        """
        super(RNN, self).__init__()

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length

        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
        # Replace the random table with the pre-trained vectors and freeze them.
        self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
        self.rnn = nn.RNN(embedding_length, hidden_size, num_layers=2, bidirectional=True)
        self.label = nn.Linear(4*hidden_size, output_size)

    def forward(self, input_sentences, batch_size=None):
        """
        Parameters
        ----------
        input_sentences: input_sentences of shape = (batch_size, num_sequences)
        batch_size : default = None, in which case the batch size is read from the input itself.
            May still be passed explicitly (e.g. batch_size = 1 for a single sentence).

        Returns
        -------
        Output of the linear layer containing the class logits, which receives its input as the final hidden state of the RNN.
        logits.size() = (batch_size, output_size)
        """
        embedded = self.word_embeddings(input_sentences)
        embedded = embedded.permute(1, 0, 2) # (num_sequences, batch_size, embedding_length)
        if batch_size is None:
            # Use the real batch size so batches of any size work.
            batch_size = embedded.size(1)
        # Create the initial state on the same device/dtype as the input
        # instead of hard-coding CUDA, so the model also runs on CPU.
        h_0 = torch.zeros(4, batch_size, self.hidden_size, device=embedded.device, dtype=embedded.dtype) # 4 = num_layers*num_directions
        output, h_n = self.rnn(embedded, h_0)
        # h_n.size() = (4, batch_size, hidden_size)
        h_n = h_n.permute(1, 0, 2) # h_n.size() = (batch_size, 4, hidden_size)
        h_n = h_n.contiguous().view(h_n.size()[0], h_n.size()[1]*h_n.size()[2])
        # h_n.size() = (batch_size, 4*hidden_size)
        logits = self.label(h_n) # logits.size() = (batch_size, output_size)

        return logits

In [0]:
# LSTM

class LSTMClassifier(nn.Module):
    """Single-layer LSTM classifier over frozen pre-trained word embeddings."""

    def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
        """
        Arguments
        ---------
        batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
        output_size : 2 = (pos, neg)
        hidden_size : Size of the hidden_state of the LSTM
        vocab_size : Size of the vocabulary containing unique words
        embedding_length : Embedding dimension of GloVe word embeddings
        weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table
        """
        super(LSTMClassifier, self).__init__()

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length

        self.word_embeddings = nn.Embedding(vocab_size, embedding_length) # Initializing the look-up table.
        self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the pre-trained GloVe word embeddings to the look-up table (frozen).
        self.lstm = nn.LSTM(embedding_length, hidden_size)
        self.label = nn.Linear(hidden_size, output_size)

    def forward(self, input_sentence, batch_size=None):
        """
        Parameters
        ----------
        input_sentence: input_sentence of shape = (batch_size, num_sequences)
        batch_size : default = None, in which case the batch size is read from the input itself.
            May still be passed explicitly (e.g. batch_size = 1 for a single sentence).

        Returns
        -------
        Output of the linear layer containing the class logits, which receives its input as the final hidden state of the LSTM.
        final_output.shape = (batch_size, output_size)
        """
        # Map every index in the input sequence to its pre-trained word vector.
        embedded = self.word_embeddings(input_sentence) # (batch_size, num_sequences, embedding_length)
        embedded = embedded.permute(1, 0, 2) # (num_sequences, batch_size, embedding_length)
        if batch_size is None:
            # Use the real batch size so batches of any size work.
            batch_size = embedded.size(1)
        # Create the initial states on the same device/dtype as the input
        # instead of hard-coding CUDA, so the model also runs on CPU.
        h_0 = torch.zeros(1, batch_size, self.hidden_size, device=embedded.device, dtype=embedded.dtype) # Initial hidden state of the LSTM
        c_0 = torch.zeros(1, batch_size, self.hidden_size, device=embedded.device, dtype=embedded.dtype) # Initial cell state of the LSTM
        output, (final_hidden_state, final_cell_state) = self.lstm(embedded, (h_0, c_0))
        final_output = self.label(final_hidden_state[-1]) # final_hidden_state.size() = (1, batch_size, hidden_size) & final_output.size() = (batch_size, output_size)

        return final_output

In [29]:
# Hyper-parameters for the CNN run
learning_rate = 2e-5  # NOTE(review): never passed on; train_model builds Adam without an explicit learning rate
batch_size = 32
output_size = 2
in_channels = 1
out_channels = 100
kernel_heights = [3, 4, 5]
stride = 1
padding = 0
keep_probab = 0.5
embedding_length = 300

model = CNN(batch_size, output_size, in_channels, out_channels, kernel_heights, stride, padding, keep_probab, vocab_size, embedding_length, word_embeddings)
loss_fn = F.cross_entropy  # read as a global by train_model / eval_model

for epoch in range(5):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc = eval_model(model, valid_iter)

    # Val. Loss used ':3f' (minimum width 3) where '.3f' (3 decimals) was meant, as for the other losses.
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%')

test_loss, test_acc = eval_model(model, test_iter)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

Epoch: 1, Idx: 1000, Training Loss: 0.2584, Training Accuracy:  87.50%
Epoch: 1, Idx: 2000, Training Loss: 0.3634, Training Accuracy:  81.25%
Epoch: 1, Idx: 3000, Training Loss: 0.1975, Training Accuracy:  90.62%
Epoch: 1, Idx: 4000, Training Loss: 0.2251, Training Accuracy:  93.75%
Epoch: 1, Idx: 5000, Training Loss: 0.4394, Training Accuracy:  87.50%
Epoch: 1, Idx: 6000, Training Loss: 0.4562, Training Accuracy:  75.00%
Epoch: 1, Idx: 7000, Training Loss: 0.3440, Training Accuracy:  84.38%
Epoch: 1, Idx: 8000, Training Loss: 0.2116, Training Accuracy:  93.75%
Epoch: 1, Idx: 9000, Training Loss: 0.1748, Training Accuracy:  90.62%
Epoch: 1, Idx: 10000, Training Loss: 0.2624, Training Accuracy:  87.50%
Epoch: 1, Idx: 11000, Training Loss: 0.1595, Training Accuracy:  96.88%
Epoch: 1, Idx: 12000, Training Loss: 0.2180, Training Accuracy:  87.50%
Epoch: 01, Train Loss: 0.274, Train Acc: 88.92%, Val. Loss: 0.200780, Val. Acc: 92.05%
Epoch: 2, Idx: 1000, Training Loss: 0.4217, Training Accur

In [26]:
# Hyper-parameters for the RNN run
learning_rate = 2e-5  # NOTE(review): never passed on; train_model builds Adam without an explicit learning rate
batch_size = 32
output_size = 2
hidden_size = 256
embedding_length = 300

model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
loss_fn = F.cross_entropy  # read as a global by train_model / eval_model

for epoch in range(5):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc = eval_model(model, valid_iter)

    # Val. Loss used ':3f' (minimum width 3) where '.3f' (3 decimals) was meant, as for the other losses.
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%')

test_loss, test_acc = eval_model(model, test_iter)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

Epoch: 1, Idx: 1000, Training Loss: 0.5441, Training Accuracy:  71.88%
Epoch: 1, Idx: 2000, Training Loss: 0.4804, Training Accuracy:  75.00%
Epoch: 1, Idx: 3000, Training Loss: 0.5657, Training Accuracy:  65.62%
Epoch: 1, Idx: 4000, Training Loss: 0.5875, Training Accuracy:  68.75%
Epoch: 1, Idx: 5000, Training Loss: 0.4619, Training Accuracy:  75.00%
Epoch: 1, Idx: 6000, Training Loss: 0.5773, Training Accuracy:  71.88%
Epoch: 1, Idx: 7000, Training Loss: 0.4915, Training Accuracy:  71.88%
Epoch: 1, Idx: 8000, Training Loss: 0.5334, Training Accuracy:  78.12%
Epoch: 1, Idx: 9000, Training Loss: 0.4688, Training Accuracy:  75.00%
Epoch: 1, Idx: 10000, Training Loss: 0.5572, Training Accuracy:  75.00%
Epoch: 1, Idx: 11000, Training Loss: 0.5522, Training Accuracy:  68.75%
Epoch: 1, Idx: 12000, Training Loss: 0.5203, Training Accuracy:  75.00%
Epoch: 01, Train Loss: 0.577, Train Acc: 70.94%, Val. Loss: 0.568177, Val. Acc: 73.10%
Epoch: 2, Idx: 1000, Training Loss: 0.3764, Training Accur

In [28]:
# Hyper-parameters for the LSTM run
learning_rate = 2e-5  # NOTE(review): never passed on; train_model builds Adam without an explicit learning rate
batch_size = 32
output_size = 2
hidden_size = 256
embedding_length = 300

model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
loss_fn = F.cross_entropy  # read as a global by train_model / eval_model

for epoch in range(5):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc = eval_model(model, valid_iter)

    # Val. Loss used ':3f' (minimum width 3) where '.3f' (3 decimals) was meant, as for the other losses.
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%')

test_loss, test_acc = eval_model(model, test_iter)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

Epoch: 1, Idx: 1000, Training Loss: 0.6223, Training Accuracy:  71.88%
Epoch: 1, Idx: 2000, Training Loss: 0.2179, Training Accuracy:  87.50%
Epoch: 1, Idx: 3000, Training Loss: 0.2041, Training Accuracy:  87.50%
Epoch: 1, Idx: 4000, Training Loss: 0.2600, Training Accuracy:  90.62%
Epoch: 1, Idx: 5000, Training Loss: 0.4507, Training Accuracy:  87.50%
Epoch: 1, Idx: 6000, Training Loss: 0.1523, Training Accuracy:  93.75%
Epoch: 1, Idx: 7000, Training Loss: 0.2009, Training Accuracy:  87.50%
Epoch: 1, Idx: 8000, Training Loss: 0.3012, Training Accuracy:  90.62%
Epoch: 1, Idx: 9000, Training Loss: 0.1698, Training Accuracy:  90.62%
Epoch: 1, Idx: 10000, Training Loss: 0.1412, Training Accuracy:  90.62%
Epoch: 1, Idx: 11000, Training Loss: 0.2526, Training Accuracy:  90.62%
Epoch: 1, Idx: 12000, Training Loss: 0.1630, Training Accuracy:  93.75%
Epoch: 01, Train Loss: 0.257, Train Acc: 88.21%, Val. Loss: 0.166088, Val. Acc: 93.38%
Epoch: 2, Idx: 1000, Training Loss: 0.2180, Training Accur