In [2]:
# load dataset
import torch

from torchtext import data
from datasets import load_dataset
dataset = load_dataset("glue", "sst2")
dataset

Reusing dataset glue (/home/wzm289/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

## data processing

In [3]:
# preprocessing and tokenizer
from collections import Counter
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer("basic_english")

def get_alphabet(corpuses):
	"""
	obtain the dict
			:param corpuses: 
	"""
	word_counter = Counter()

	for corpus in corpuses:
		for item in corpus:
			tokens = tokenizer(item['sentence'])
			for token in tokens:
				word_counter[token] += 1
	print("there are {} words in dict".format(len(word_counter)))
	# logging.info("there are {} words in dict".format(len(word_counter)))
	word_dict = {word: e + 2 for e, word in enumerate(list(word_counter))}
	word_dict['UNK'] = 1
	word_dict['<PAD>'] = 0

	return word_dict

vocab = get_alphabet([dataset['train'],dataset['validation']])

there are 15696 words in dict


In [4]:
# get embedding
import numpy as np 
def get_embedding(alphabet, filename="", embedding_size=100):
	embedding = np.random.rand(len(alphabet), embedding_size)
	if filename is None:
		return embedding
	with open(filename, encoding='utf-8') as f:
		i = 0
		for line in f:
			i += 1
			if i % 100000 == 0:
				print('epch %d' % i)
			items = line.strip().split(' ')
			if len(items) == 2:
				vocab_size, embedding_size = items[0], items[1]
				print((vocab_size, embedding_size))
			else:
				word = items[0]
				if word in alphabet:
					embedding[alphabet[word]] = items[1:]

	print('done')
	return embedding
embedding = get_embedding(vocab, filename="../embedding/glove.6B.300d.txt",embedding_size = 300)

epch 100000
epch 200000
epch 300000
epch 400000
done


In [5]:
embedding.shape

(15698, 300)

In [6]:
# convert to index

def convert_to_word_ids(sentence,alphabet,max_len = 40):
	"""
	docstring here
		:param sentence: 
		:param alphabet: 
		:param max_len=40: 
	"""
	indices = []
	tokens = tokenizer(sentence)
	
	for word in tokens:
		if word in alphabet:
			indices.append(alphabet[word])
		else:
			continue
	result = indices + [alphabet['<PAD>']] * (max_len - len(indices))

	return result[:max_len], min(len(tokens),max_len)

test_enc, length = convert_to_word_ids("hello, how are you", vocab, 10)
print(test_enc)
print(length)

[12, 111, 78, 470, 0, 0, 0, 0, 0, 0]
5


In [65]:
# generate data batch and iterator
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 64
class DataMaper(Dataset):
    def __init__(self,dataset,vocab):
        self.x = dataset['sentence']
        self.y = dataset['label']
        self.max_length = 20
        self.vocab = vocab

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        sentence = self.x[idx]
        label = self.y[idx]

        enc_sentence,lengths = convert_to_word_ids(sentence, self.vocab, max_len = self.max_length)
        t_sentence = torch.tensor(enc_sentence).to(device)
        t_label = torch.tensor(label).to(device)
        t_length = torch.tensor(lengths).to(device)
        return t_sentence,t_label,t_length

train = DataMaper(dataset['train'],vocab)
validation = DataMaper(dataset['validation'],vocab)
test = DataMaper(dataset['test'], vocab)

loader_train = DataLoader(train, batch_size=batch_size, shuffle=True)
loader_validation = DataLoader(validation, batch_size = batch_size)
loader_test = DataLoader(test,batch_size = batch_size)

In [8]:
# for batch in loader_train:
#     print(batch)

## Training process

## 

In [82]:
def cal_accuracy(probs, target):
    predictions = probs.argmax(dim=1)
    corrects = (predictions == target)
    accuracy = corrects.sum().float() / float(target.size(0))
    return accuracy

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    for batch in iterator:
        optimizer.zero_grad()
        
        text, label,lengths = batch
        predictions = model(text,lengths)
       
        loss = criterion(predictions, label.squeeze())
        acc = cal_accuracy(predictions, label)        
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, label,lengths = batch
            predictions = model(text,lengths).squeeze(1)
            loss = criterion(predictions, label)
            acc = cal_accuracy(predictions, label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [83]:
def run_train(epochs, model, train_iterator, valid_iterator, optimizer, criterion, model_type):
    best_valid_loss = float('inf')
    
    for epoch in range(epochs):

        # train the model
        print("train epoch:{}".format(epoch))
        model.train()
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

        # evaluate the model
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        # save the best model
        # if valid_loss < best_valid_loss:
        #     best_valid_loss = valid_loss
        #     torch.save(model.state_dict(), 'saved_weights'+'_'+model_type+'.pt')

        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')
    

## RNN model


In [84]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable

class LSTM(nn.Module):

    # define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim , num_classes, lstm_layers,
                 bidirectional, dropout, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_index)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=lstm_layers,
                            bidirectional=bidirectional,
                            batch_first=True)
        num_directions = 2 if bidirectional else 1
        self.fc1 = nn.Linear(hidden_dim * num_directions, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.lstm_layers = lstm_layers
        self.num_directions = num_directions
        self.hidden_dim = hidden_dim


    def init_hidden(self, batch_size):
        h, c = (Variable(torch.zeros(self.lstm_layers * self.num_directions, batch_size, self.hidden_dim)),
                Variable(torch.zeros(self.lstm_layers * self.num_directions, batch_size, self.hidden_dim)))
        return h.to(device), c.to(device)

    def forward(self, text, text_lengths):
        batch_size = text.shape[0]
        h_0, c_0 = self.init_hidden(batch_size)

        embedded = self.embedding(text)
        packed_embedded = pack_padded_sequence(embedded, text_lengths.cpu().numpy(), batch_first=True,enforce_sorted=False)
        output, (h_n, c_n) = self.lstm(packed_embedded, (h_0, c_0))
        # output_unpacked, output_lengths = pad_packed_sequence(output, batch_first=True)
        # if it is bi directional LSTM, we should concat the two f
        out = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        # out = h_n[-1]
        # print(h_n.shape)
        # out = output_unpacked[:, -1, :]
        preds = self.fc1(out)
        return preds

In [85]:
# epochs = 10
# lr = 1e-2

# rnn_model = LSTM(vocab_size = len(vocab),embedding_dim=300,hidden_dim = 100,
# num_classes = 2, lstm_layers = 2, bidirectional = True,dropout=0.5,pad_index = 0)
# rnn_model.to(device)
# optimizer = torch.optim.Adam(rnn_model.parameters(),lr = lr)
# loss_func = nn.CrossEntropyLoss()
# run_train(epochs,rnn_model,loader_train,loader_validation,optimizer,loss_func,model_type = "cls")

## multiplicative RNN

In [94]:
import math
from torch.nn import functional as F
class RNN(nn.Module):

    # you can also accept arguments in your model constructor

    #  we don't use the output in this implemention
    def __init__(self, embed_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        input_size = embed_size + hidden_size
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # self.Wih = nn.Parameter(torch.FloatTensor(embed_size,hidden_size))
        # self.Whh = nn.Parameter(torch.FloatTensor(hidden_size,hidden_size))
        self.Wih = nn.Linear(embed_size, hidden_size)
        self.Whh = nn.Linear(hidden_size, hidden_size)
        # w_im = torch.Tensor(embed_size,  hidden_size)
        # w_hm = torch.Tensor(hidden_size, hidden_size)
        # b_im = torch.Tensor(hidden_size)
        # b_hm = torch.Tensor(hidden_size)
        # self.w_im = nn.Parameter(w_im)
        # self.b_im = nn.Parameter(b_im)
        # self.w_hm = nn.Parameter(w_hm)
        # self.b_hm = nn.Parameter(b_hm)
        self.w_im = nn.Linear(embed_size, hidden_size)
        self.w_hm = nn.Linear(hidden_size, hidden_size)
        
        # self.i2h = nn.Linear(input_size, hidden_size)
        self.h2o = nn.Linear(input_size, output_size)
        self.reset_parameters()
        
    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)
        

    def forward(self, data, last_hidden):
        input = torch.cat((data, last_hidden), 1)
    
        mx = self.w_im(data) * self.w_hm(last_hidden)

        wi = self.Wih(data)
        wh = self.Whh(mx)
        # wi = torch.mm(data,self.Wih)
        # wh = torch.mm(mx,self.Whh)

        hidden = torch.relu(wi + wh)

        output = self.h2o(input)
        return output, hidden
    def initHidden(self,batch_size):
        # return torch.zeros(batch_size,self.hidden_size).to(self.device)
        return nn.init.kaiming_uniform_(torch.empty(batch_size, self.hidden_size)).to(self.device)
class RNN_layer(nn.Module):
    def __init__(self,vocab_size,embed_size,hidden_dim,output_size):
        super(RNN_layer,self).__init__()
        self.rnn = RNN(embed_size,hidden_dim,output_size)
        self.embedding = nn.Embedding(vocab_size, embed_size,padding_idx=0)
        # self.embedding.weight.requires_grad = False
        self.dropout = nn.Dropout(0.2)
    def forward(self,x):
        batch_size = x.size(0)
        seq_len = x.size(1)

        x = self.dropout(self.embedding(x))

        hidden = self.rnn.initHidden(batch_size)
        hiddens = []
        # recurrent rnn
        for i in range(seq_len):
            output, hidden = self.rnn(x[:,i,:], hidden)
            hiddens.append(hidden.unsqueeze(1))
        final_hidden = hidden
        hidden_tensor = torch.cat(hiddens,1)
        return hidden_tensor,final_hidden,output
        

class RNN_Model_for_classfication(nn.Module):
    def __init__(self,vocab_size,embed_size,hidden_dim,output_size):
        super(RNN_Model_for_classfication,self).__init__()

        self.hidden_dim = hidden_dim
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.vocab_size = vocab_size 
        # define the layer
        # self.rnn = nn.RNN(embed_size,hidden_dim,num_layers = 1,batch_first= True)
        self.rnn = RNN_layer(self.vocab_size,embed_size,hidden_dim,output_size)
        self.fc = nn.Linear(hidden_dim,output_size)
        self.softmax = nn.Softmax(dim=1)
        self.log_softmax = nn.LogSoftmax(dim=1)
    def forward(self,x,lens):
    
        hidden_tensor, final_hidden , output = self.rnn(x)

        out = output
        return out

In [95]:
epochs = 10
lr = 1e-3

rnn_model = RNN_Model_for_classfication(vocab_size = len(vocab),embed_size = 300,hidden_dim = 256,
output_size = 2)

rnn_model.to(device)
optimizer = torch.optim.Adam(rnn_model.parameters(),lr = lr)
loss_func = nn.CrossEntropyLoss()
run_train(epochs,rnn_model,loader_train,loader_validation,optimizer,loss_func,model_type = "cls")

train epoch:0
	Train Loss: 0.693 | Train Acc: 56.05%
	 Val. Loss: 0.704 |  Val. Acc: 51.70%
train epoch:1
	Train Loss: 0.680 | Train Acc: 57.63%
	 Val. Loss: 0.726 |  Val. Acc: 52.32%
train epoch:2
	Train Loss: 0.669 | Train Acc: 58.64%
	 Val. Loss: 0.773 |  Val. Acc: 52.23%
train epoch:3
	Train Loss: 0.659 | Train Acc: 59.40%
	 Val. Loss: 0.763 |  Val. Acc: 53.19%
train epoch:4
	Train Loss: 823598.447 | Train Acc: 57.19%
	 Val. Loss: 2.446 |  Val. Acc: 50.36%
train epoch:5
	Train Loss: 19.433 | Train Acc: 57.28%
	 Val. Loss: 2.199 |  Val. Acc: 51.58%
train epoch:6
	Train Loss: 9.246 | Train Acc: 57.56%
	 Val. Loss: 3.100 |  Val. Acc: 51.09%
train epoch:7
	Train Loss: 2.250 | Train Acc: 57.87%
	 Val. Loss: 2.201 |  Val. Acc: 50.87%
train epoch:8
	Train Loss: 1.516 | Train Acc: 58.00%
	 Val. Loss: 1.563 |  Val. Acc: 50.31%
train epoch:9
	Train Loss: 1.222 | Train Acc: 58.12%
	 Val. Loss: 1.364 |  Val. Acc: 50.83%
