In [1]:
# load dataset

from datasets import load_dataset
import torch
task_data = "sst2"
if task_data == "sst2":
    dataset = load_dataset("glue", "sst2")
else:
    dataset = load_dataset("glue", "mrpc")



Reusing dataset glue (/home/wzm289/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
max(map(lambda x:len(x.split()),dataset['train']['sentence']))
# print(max(map(lambda x:len(x.split()),dataset['train']['sentence1'])))
# print(max(map(lambda x:len(x.split()),dataset['train']['sentence2'])))

52

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

## data processing

In [4]:
# preprocessing and tokenizer
from collections import Counter
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer("basic_english")

def get_alphabet(corpuses):
	"""
	obtain the dict
			:param corpuses: 
	"""
	word_counter = Counter()

	for corpus in corpuses:
		for item in corpus:
			if task_data == "mrpc":
				tokens = tokenizer(item['sentence1'] + ' ' + item['sentence1'])
			else:
				tokens = tokenizer(item['sentence'])
			for token in tokens:
				word_counter[token] += 1
	print("there are {} words in dict".format(len(word_counter)))
	# logging.info("there are {} words in dict".format(len(word_counter)))
	word_dict = {word: e + 2 for e, word in enumerate(list(word_counter))}
	word_dict['UNK'] = 1
	word_dict['<PAD>'] = 0

	return word_dict

vocab = get_alphabet([dataset['train'],dataset['validation']])

there are 15696 words in dict


In [5]:
# get embedding
import numpy as np 
def get_embedding(alphabet, filename="", embedding_size=100):
	embedding = np.random.rand(len(alphabet), embedding_size)
	if filename is None:
		return embedding
	with open(filename, encoding='utf-8') as f:
		i = 0
		for line in f:
			i += 1
			if i % 100000 == 0:
				print('epch %d' % i)
			items = line.strip().split(' ')
			if len(items) == 2:
				vocab_size, embedding_size = items[0], items[1]
				print((vocab_size, embedding_size))
			else:
				word = items[0]
				if word in alphabet:
					embedding[alphabet[word]] = items[1:]

	print('done')
	return embedding
embedding = get_embedding(vocab, filename="../embedding/glove.6B.300d.txt",embedding_size = 300)

epch 100000
epch 200000
epch 300000
epch 400000
done


In [6]:
embedding.shape

(15698, 300)

In [7]:
# convert to index

def convert_to_word_ids(sentence,alphabet,max_len = 40):
	"""
	docstring here
		:param sentence: 
		:param alphabet: 
		:param max_len=40: 
	"""
	indices = []
	tokens = tokenizer(sentence)
	
	for word in tokens:
		if word in alphabet:
			indices.append(alphabet[word])
		else:
			continue
	result = indices + [alphabet['<PAD>']] * (max_len - len(indices))

	return result[:max_len]

test_enc = convert_to_word_ids("hello, how are you", vocab, 10)
test_enc

[12, 111, 78, 470, 0, 0, 0, 0, 0, 0]

In [8]:
# generate data batch and iterator
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 64
max_document_length = 200
class DataMaper(Dataset):
    def __init__(self,dataset,vocab,max_document_length):
        if task_data == 'sst2':
            self.x = dataset['sentence']
        else:
            self.x1 = dataset['sentence1'] 
            self.x2 = dataset['sentence2'] 
        self.y = dataset['label']
        self.max_length = max_document_length
        self.vocab = vocab

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        
        if task_data == 'mrpc':
            sentence = self.x1[idx] + ' ' + self.x2[idx]
        else:
            sentence = self.x[idx]
        label = self.y[idx]

        enc_sentence = convert_to_word_ids(sentence, self.vocab, max_len = self.max_length)
        t_sentence = torch.tensor(enc_sentence).to(device)
        t_label = torch.tensor(label).to(device)
        return t_sentence,t_label

train = DataMaper(dataset['train'],vocab,max_document_length = 200)
validation = DataMaper(dataset['validation'],vocab,max_document_length = 200)
test = DataMaper(dataset['test'], vocab,max_document_length = 200)

loader_train = DataLoader(train, batch_size=batch_size, shuffle=True)
loader_validation = DataLoader(validation, batch_size = batch_size)
loader_test = DataLoader(test,batch_size = batch_size)

In [9]:
# for batch in loader_train:
#     print(batch)

## loading a model

In [10]:
import torch.nn as nn 

class Linear(nn.Module):
    def __init__(self,input_size,hidden_size, num_class):
        super(Linear, self).__init__()
        self.fc1 = nn.Linear(input_size,hidden_size, bias = True)
        self.fc2 = nn.Linear(hidden_size, num_class,bias = True)

    def forward(self, text):
        text = text.float()
        x = self.fc1(text)
        preds = self.fc2(x)
        return preds
        


## Training process

## 

In [11]:
def cal_accuracy(probs, target):
    predictions = probs.argmax(dim=1)
    corrects = (predictions == target)
    accuracy = corrects.sum().float() / float(target.size(0))
    return accuracy

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    for batch in iterator:
        optimizer.zero_grad()
        
        text, label = batch
        predictions = model(text)
        loss = criterion(predictions, label.squeeze())
        acc = cal_accuracy(predictions, label)        
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, label = batch
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, label)
            acc = cal_accuracy(predictions, label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [12]:
def run_train(epochs, model, train_iterator, valid_iterator, optimizer, criterion, model_type):
    best_valid_loss = float('inf')
    
    for epoch in range(epochs):

        # train the model
        model.train()
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

        # evaluate the model
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        # save the best model
        # if valid_loss < best_valid_loss:
        #     best_valid_loss = valid_loss
        #     torch.save(model.state_dict(), 'saved_weights'+'_'+model_type+'.pt')

        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')
    

In [13]:
epochs = 30
hidden_size = 50
num_classes = 2
lr = 1e-4
linear_model = Linear(max_document_length,hidden_size,num_classes)
linear_model.to(device)
optimizer = torch.optim.Adam(linear_model.parameters(),lr = lr)
loss_func = nn.CrossEntropyLoss()
run_train(epochs,linear_model,loader_train,loader_validation,optimizer,loss_func,model_type = "cls")

	Train Loss: 7.086 | Train Acc: 51.23%
	 Val. Loss: 3.785 |  Val. Acc: 51.29%
	Train Loss: 1.444 | Train Acc: 52.41%
	 Val. Loss: 3.833 |  Val. Acc: 49.42%
	Train Loss: 1.384 | Train Acc: 52.46%
	 Val. Loss: 3.418 |  Val. Acc: 49.26%
	Train Loss: 1.405 | Train Acc: 53.06%
	 Val. Loss: 3.631 |  Val. Acc: 48.93%
	Train Loss: 1.450 | Train Acc: 53.02%
	 Val. Loss: 3.695 |  Val. Acc: 48.86%
	Train Loss: 1.367 | Train Acc: 52.72%
	 Val. Loss: 3.616 |  Val. Acc: 50.09%
	Train Loss: 1.365 | Train Acc: 53.19%
	 Val. Loss: 4.431 |  Val. Acc: 51.99%
	Train Loss: 1.351 | Train Acc: 53.25%
	 Val. Loss: 2.863 |  Val. Acc: 50.16%
	Train Loss: 1.356 | Train Acc: 53.20%
	 Val. Loss: 3.500 |  Val. Acc: 50.16%
	Train Loss: 1.332 | Train Acc: 53.28%
	 Val. Loss: 3.518 |  Val. Acc: 48.62%
	Train Loss: 1.400 | Train Acc: 53.21%
	 Val. Loss: 3.239 |  Val. Acc: 50.25%
	Train Loss: 1.305 | Train Acc: 53.23%
	 Val. Loss: 3.696 |  Val. Acc: 49.78%
	Train Loss: 1.325 | Train Acc: 53.61%
	 Val. Loss: 3.966 |  Val

## CNN model

In [14]:
class CNN(nn.Module):
    def __init__(self, vocab_dim, e_dim, h_dim, o_dim):
        super(CNN, self).__init__()
        self.emb = nn.Embedding(vocab_dim, e_dim, padding_idx=0)
        # self.emb.load_state_dict({"weight":torch.tensor(embedding)})
        non_trainable = True
        if non_trainable:
            self.emb.weight.requires_grad = False
        self.dropout = nn.Dropout(0.2)
        self.conv1 = nn.Conv2d(1, h_dim, (3, e_dim))
        self.conv2 = nn.Conv2d(1, h_dim, (4, e_dim))
        self.conv3 = nn.Conv2d(1, h_dim, (5, e_dim))
        self.fc = nn.Linear(h_dim * 3, o_dim)
        # self.softmax = nn.Softmax(dim=1)
        # self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        embed = self.dropout(self.emb(x)).unsqueeze(1)
        c1 = torch.relu(self.conv1(embed).squeeze(3))
        p1 = torch.max_pool1d(c1, c1.size()[2]).squeeze(2)
        c2 = torch.relu(self.conv2(embed).squeeze(3))
        p2 = torch.max_pool1d(c2, c2.size()[2]).squeeze(2)
        c3 = torch.relu(self.conv3(embed).squeeze(3))
        p3 = torch.max_pool1d(c3, c3.size()[2]).squeeze(2)
        pool = self.dropout(torch.cat((p1, p2, p3), 1))
        hidden = self.fc(pool)
        # return self.softmax(hidden), self.log_softmax(hidden)
        return hidden 

        



In [15]:
epochs = 10
max_document_length = 40
hidden_size = 50
num_classes = 2
lr = 1e-4

cnn_model = CNN(len(vocab),e_dim = 300,h_dim = 64, o_dim = 2)
cnn_model.to(device)
optimizer = torch.optim.Adam(cnn_model.parameters(),lr = lr)
loss_func = nn.CrossEntropyLoss()
run_train(epochs,cnn_model,loader_train,loader_validation,optimizer,loss_func,model_type = "cls")

	Train Loss: 0.626 | Train Acc: 63.79%
	 Val. Loss: 0.578 |  Val. Acc: 71.41%
	Train Loss: 0.521 | Train Acc: 74.27%
	 Val. Loss: 0.532 |  Val. Acc: 73.64%
	Train Loss: 0.445 | Train Acc: 79.56%
	 Val. Loss: 0.510 |  Val. Acc: 74.46%
	Train Loss: 0.388 | Train Acc: 82.86%
	 Val. Loss: 0.498 |  Val. Acc: 75.25%
	Train Loss: 0.346 | Train Acc: 85.10%
	 Val. Loss: 0.493 |  Val. Acc: 75.56%
	Train Loss: 0.316 | Train Acc: 86.60%
	 Val. Loss: 0.496 |  Val. Acc: 75.67%
	Train Loss: 0.292 | Train Acc: 87.84%
	 Val. Loss: 0.500 |  Val. Acc: 75.49%
	Train Loss: 0.275 | Train Acc: 88.43%
	 Val. Loss: 0.496 |  Val. Acc: 76.21%
	Train Loss: 0.260 | Train Acc: 89.29%
	 Val. Loss: 0.508 |  Val. Acc: 76.38%
	Train Loss: 0.248 | Train Acc: 89.86%
	 Val. Loss: 0.502 |  Val. Acc: 76.50%


## RNN model


In [16]:
epochs = 10

hidden_size = 50
num_classes = 2
lr = 1e-3
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RNN(nn.Module):
    def __init__(self, vocab_dim, e_dim, h_dim, o_dim):
        super(RNN, self).__init__()
        self.h_dim = h_dim
        self.dropout = nn.Dropout(0.2)
        self.emb = nn.Embedding(vocab_dim, e_dim, padding_idx=0)
        # self.emb.load_state_dict({"weight":torch.tensor(embedding)})
        # non_trainable = True
        # if non_trainable:
        #     self.emb.weight.requires_grad = False
        self.lstm = nn.RNN(e_dim, h_dim, bidirectional=False, batch_first=True)
        self.fc = nn.Linear(h_dim, o_dim)

    def forward(self, x):
        embed = self.dropout(self.emb(x))
        print(embed)        
        out, _ = self.lstm(embed)
        hidden = self.fc(out[:, -1, :])
        return hidden

rnn_model = RNN(len(vocab),e_dim = 300,h_dim = 64, o_dim = 2)
rnn_model.to(device)
optimizer = torch.optim.Adam(rnn_model.parameters(),lr = lr)
loss_func = nn.CrossEntropyLoss()
run_train(epochs,rnn_model,loader_train,loader_validation,optimizer,loss_func,model_type = "cls")

tensor([[[ 1.0260, -0.0000,  0.8843,  ..., -0.2663, -0.6355,  1.4080],
         [ 0.0094,  0.0000,  0.2679,  ...,  1.8438,  0.4206,  1.2296],
         [ 0.2029, -0.8962,  2.1734,  ...,  0.3732, -0.0762,  0.9980],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.9881,  0.0310,  1.4007,  ..., -0.0000,  0.0000, -1.4308],
         [-1.5299,  0.0000, -0.8905,  ..., -0.5306, -2.1391,  0.1005],
         [ 2.4643,  1.2164,  0.0000,  ...,  0.5484,  1.5321,  0.7308],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.2813, -1.8355, -1.4775,  ..., -0.0000,  0.0000,  1.2485],
         [-0.4552,  0.9911, -1.6375,  ..., -0

NameError: name 'text_lengths' is not defined

: 