In [1]:
# load dataset

from datasets import load_dataset
# Load the SST-2 task of the GLUE benchmark (served from the local cache when present).
dataset = load_dataset("glue", "sst2")
# Show the first ten training examples; each has 'sentence', 'label' and 'idx' fields.
dataset['train'][:10]

Reusing dataset glue (/home/wzm289/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

{'sentence': ['hide new secretions from the parental units ',
  'contains no wit , only labored gags ',
  'that loves its characters and communicates something rather beautiful about human nature ',
  'remains utterly satisfied to remain the same throughout ',
  'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ',
  "that 's far too tragic to merit such superficial treatment ",
  'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ',
  'of saucy ',
  "a depressed fifteen-year-old 's suicidal poetry ",
  "are more deeply thought through than in most ` right-thinking ' films "],
 'label': [0, 0, 1, 0, 0, 0, 1, 1, 0, 1],
 'idx': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}

In [2]:
import torch
import torch
import torch.nn as nn
from torchtext import data



## Data processing

In [3]:
# preprocessing and tokenizer
from collections import Counter
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer("basic_english")

def get_alphabet(corpuses):
	"""
	Build a token -> integer id vocabulary from one or more corpora.

	Each item's 'sentence' field is split with the module-level `tokenizer`.
	Ids are assigned from 2 upwards in order of first appearance; id 0 is
	reserved for '<PAD>' and id 1 for 'UNK'. The number of distinct tokens
	(excluding the two special entries) is printed.
		:param corpuses: iterable of corpora, each an iterable of items
			that can be indexed with 'sentence'
		:return: dict mapping token (str) to id (int)
	"""
	frequencies = Counter()
	for split in corpuses:
		for example in split:
			frequencies.update(tokenizer(example['sentence']))
	print("there are {} words in dict".format(len(frequencies)))

	# Ids 0 and 1 are left free for the special symbols registered below.
	token_to_id = dict(zip(frequencies, range(2, len(frequencies) + 2)))
	token_to_id['UNK'] = 1
	token_to_id['<PAD>'] = 0

	return token_to_id

# Build the vocabulary from the training and validation splits (the test split is not included).
vocab = get_alphabet([dataset['train'],dataset['validation']])

there are 15696 words in dict


In [4]:
# get embedding
import numpy as np 
def get_embedding(alphabet, filename="", embedding_size=100):
	"""
	Build the initial embedding matrix for a vocabulary.

	Every row starts as uniform random values from np.random.rand. When a
	pretrained vector file is given, the row of each word that appears both
	in the file and in `alphabet` is overwritten with the vector from the file.
		:param alphabet: dict mapping word -> row index in the matrix
		:param filename: path to a UTF-8 text file with one space-separated
			"word v1 v2 ..." entry per line; None or "" skips loading and
			returns the random matrix
		:param embedding_size: number of columns of the matrix
		:return: np.ndarray of shape (len(alphabet), embedding_size)
		:raises ValueError: raised by numpy when a vector cannot be parsed
			as floats or does not fit a row of the matrix
	"""
	embedding = np.random.rand(len(alphabet), embedding_size)
	# The default is "", which the former `is None` test let through to open().
	if not filename:
		return embedding
	with open(filename, encoding='utf-8') as f:
		for i, line in enumerate(f, start=1):
			if i % 100000 == 0:
				print('epch %d' % i)
			items = line.strip().split(' ')
			if len(items) == 2:
				# A two-field line is treated as a "<vocab_size> <dim>" header:
				# it is echoed only and no longer overwrites `embedding_size`.
				print((items[0], items[1]))
				continue
			word = items[0]
			if word in alphabet:
				# Convert explicitly instead of relying on implicit str -> float casting.
				embedding[alphabet[word]] = np.asarray(items[1:], dtype=float)

	print('done')
	return embedding
# Fill a (len(vocab), 300) matrix from the GloVe 6B 300d text file for words found in `vocab`.
# NOTE(review): in the code visible in this notebook the matrix is never copied into
# either model's nn.Embedding layer - confirm whether that is intended.
embedding = get_embedding(vocab, filename="../embedding/glove.6B.300d.txt",embedding_size = 300)

epch 100000
epch 200000
epch 300000
epch 400000
done


In [5]:
embedding.shape

(15698, 300)

In [6]:
# convert to index

def convert_to_word_ids(sentence,alphabet,max_len = 40):
	"""
	Encode a sentence as a fixed-length list of vocabulary ids.

	The sentence is split with the module-level `tokenizer`. Tokens that are
	not keys of `alphabet` are dropped (they are not mapped to 'UNK'). The id
	list is right-padded with alphabet['<PAD>'] and then cut to `max_len`.
		:param sentence: text to encode
		:param alphabet: dict mapping token -> id; must contain '<PAD>'
		:param max_len=40: length of the returned id list
		:return: tuple (ids, length) where `ids` has exactly `max_len`
			entries and `length` is the number of known tokens, capped
			at `max_len`
	"""
	known_ids = [alphabet[token] for token in tokenizer(sentence) if token in alphabet]
	n_known = len(known_ids)
	# A negative repeat count yields an empty list, so over-long inputs get no padding.
	padding = [alphabet['<PAD>']] * (max_len - n_known)

	return (known_ids + padding)[:max_len], min(n_known, max_len)

# Smoke test: encode a short sentence into 10 ids and show the ids and the unpadded length.
test_enc, length = convert_to_word_ids("hello, how are you", vocab, 10)
print(test_enc)
print(length)

[12, 111, 78, 470, 0, 0, 0, 0, 0, 0]
4


In [7]:
# generate data batch and iterator
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of examples per batch for all three DataLoaders built below.
batch_size = 1
class DataMaper(Dataset):
    """Map-style dataset that turns (sentence, label) pairs into tensors.

    Sentences are encoded with `convert_to_word_ids` to a fixed length of
    `max_length` (120) ids, and every returned tensor is moved to the
    notebook-level `device`.
    """

    def __init__(self,dataset,vocab):
        """Store the 'sentence' and 'label' columns of `dataset` and the vocabulary."""
        self.vocab = vocab
        self.max_length = 120
        self.x = dataset['sentence']
        self.y = dataset['label']

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return (token ids, label, unpadded length) for example `idx` as tensors."""
        token_ids, n_tokens = convert_to_word_ids(
            self.x[idx], self.vocab, max_len = self.max_length
        )
        fields = (token_ids, self.y[idx], n_tokens)
        t_sentence, t_label, t_length = (torch.tensor(field).to(device) for field in fields)
        return t_sentence,t_label,t_length

# Wrap each SST-2 split so that it yields encoded tensors.
# NOTE(review): the name `train` is rebound by `def train(...)` in a later cell;
# `loader_train` keeps its own reference to this dataset, so it still works.
train = DataMaper(dataset['train'],vocab)
validation = DataMaper(dataset['validation'],vocab)
test = DataMaper(dataset['test'], vocab)

# Only the training loader shuffles; validation and test keep the dataset order.
loader_train = DataLoader(train, batch_size=batch_size, shuffle=True)
loader_validation = DataLoader(validation, batch_size = batch_size)
loader_test = DataLoader(test,batch_size = batch_size)

In [27]:
# for batch in loader_train:
#     print(batch)

## Training process

## 

In [8]:
def cal_accuracy(probs, target):
    """Fraction of rows whose highest-scoring class matches the target.

    Args:
        probs: tensor of per-class scores; the arg-max is taken over dim 1.
        target: tensor of class indices compared element-wise with the
            arg-max predictions; its first dimension is the example count.

    Returns:
        0-dim float tensor: number of matches divided by ``target.size(0)``.
    """
    n_examples = float(target.size(0))
    n_correct = (probs.argmax(dim=1) == target).sum().float()
    return n_correct / n_examples

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Each batch is unpacked as ``(text, label, lengths)``; the model is called
    as ``model(text, lengths)``. This function does not call ``model.train()``,
    so the caller is responsible for the module's mode.

    Returns:
        Tuple ``(mean loss, mean accuracy)``, both averaged over the number of
        batches (``len(iterator)``), not over the number of examples.
    """
    running_loss, running_acc = 0, 0
    for batch in iterator:
        optimizer.zero_grad()

        text, label, lengths = batch
        logits = model(text, lengths)
        batch_loss = criterion(logits, label)
        batch_acc = cal_accuracy(logits, label)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    The model is switched to eval mode (and left in it) and all batches are
    processed under ``torch.no_grad()``. Batches are unpacked as
    ``(text, label, lengths)``.

    Returns:
        Tuple ``(mean loss, mean accuracy)``, both averaged over the number of
        batches (``len(iterator)``).
    """
    model.eval()
    loss_total = 0
    acc_total = 0
    with torch.no_grad():
        for text, label, lengths in iterator:
            logits = model(text, lengths)
            loss_total += criterion(logits, label).item()
            acc_total += cal_accuracy(logits, label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [9]:
def run_train(epochs, model, train_iterator, valid_iterator, optimizer, criterion, model_type):
    """Train for ``epochs`` epochs, printing train/validation metrics after each.

    Args:
        epochs: number of passes over ``train_iterator``.
        model: module called by ``train`` / ``evaluate``.
        train_iterator: batches used for optimisation.
        valid_iterator: batches used for validation.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss function applied to (predictions, label).
        model_type: tag that only appears in the disabled checkpoint file name.

    Returns:
        None. Results are printed only.
    """
    # Only the disabled checkpointing code below reads this value.
    best_valid_loss = float('inf')

    for _ in range(epochs):
        # evaluate() leaves the model in eval mode, so re-enable training mode every epoch.
        model.train()
        train_metrics = train(model, train_iterator, optimizer, criterion)
        valid_metrics = evaluate(model, valid_iterator, criterion)

        train_loss, train_acc = train_metrics
        valid_loss, valid_acc = valid_metrics

        # Checkpointing on the best validation loss is currently disabled:
        # if valid_loss < best_valid_loss:
        #     best_valid_loss = valid_loss
        #     torch.save(model.state_dict(), 'saved_weights'+'_'+model_type+'.pt')

        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')
    

## Custom RNN model

In [10]:
from torch import nn
class RNN(nn.Module):
    """Single-step recurrent cell.

    The current input and the previous hidden state are concatenated along
    dim 1 and fed to two linear maps: ``i2h`` (followed by a sigmoid) yields
    the next hidden state and ``h2o`` yields the output for this step.

    Args:
        embed_size: width of the per-step input vector.
        hidden_size: width of the hidden state.
        output_size: width of the per-step output.
    """

    def __init__(self, embed_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        input_size = embed_size + hidden_size
        # Kept as a public attribute for backward compatibility; initHidden()
        # no longer relies on it because it can disagree with the device the
        # parameters actually live on.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2o = nn.Linear(input_size, output_size)

    def forward(self, data, last_hidden):
        """Advance the cell by one step.

        Args:
            data: input for this step; concatenated with ``last_hidden`` on dim 1.
            last_hidden: hidden state from the previous step.

        Returns:
            Tuple ``(output, hidden)``. Note that ``output`` is computed from
            the concatenated input, not from the new hidden state.
        """
        combined = torch.cat((data, last_hidden), 1)
        hidden = torch.sigmoid(self.i2h(combined))
        output = self.h2o(combined)
        return output, hidden

    def initHidden(self,batch_size):
        """Return a fresh ``(batch_size, hidden_size)`` initial hidden state.

        The state is drawn with Kaiming-uniform initialisation on every call,
        so repeated forward passes start from different states (also in eval
        mode). The tensor is placed on the device of this module's parameters.
        """
        hidden = torch.empty(batch_size, self.hidden_size)
        nn.init.kaiming_uniform_(hidden)
        # Follow the parameters so the state always matches the module's device.
        return hidden.to(self.i2h.weight.device)
class RNN_layer(nn.Module):
    """Embeds a batch of id sequences and unrolls the ``RNN`` cell over time.

    Positions at or beyond a sequence's length keep the previous hidden state,
    so padding does not change the state.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_size: embedding width (input width of the cell).
        hidden_dim: hidden-state width of the cell.
        output_size: output width of the cell.
    """

    def __init__(self,vocab_size,embed_size,hidden_dim,output_size):
        super(RNN_layer,self).__init__()
        self.rnn = RNN(embed_size,hidden_dim,output_size)
        self.embedding = nn.Embedding(vocab_size, embed_size,padding_idx=0)
        # self.embedding.weight.requires_grad = False
        self.dropout = nn.Dropout(0.2)

    def forward(self,x, text_lens):
        """Run the cell over every time step of ``x``.

        Args:
            x: integer id tensor indexed as (batch, time).
            text_lens: per-example number of valid (non-padding) positions.

        Returns:
            Tuple ``(hidden_tensor, final_hidden, output)``: the masked hidden
            states of all steps concatenated on dim 1, the hidden state after
            the last step, and the cell output produced at the last time step
            (the output itself is not masked).
        """
        batch_size = x.size(0)
        seq_len = x.size(1)

        x = self.dropout(self.embedding(x))

        # Keep everything on the device of the embedded input instead of the
        # notebook-global `device`, so the layer does not depend on hidden state.
        hidden = self.rnn.initHidden(batch_size).to(x.device)
        text_lens = text_lens.to(x.device)  # hoisted: moved once, not per step
        hiddens = []
        # recurrent rnn
        for i in range(seq_len):
            output, hidden_next = self.rnn(x[:,i,:], hidden)
            # 1.0 where step i is inside the sequence, 0.0 on padding.
            mask = (i < text_lens).float().unsqueeze(1).expand_as(hidden_next)
            # Padded rows carry the previous state forward unchanged.
            hidden = hidden_next * mask + hidden * (1 - mask)
            hiddens.append(hidden.unsqueeze(1))
        final_hidden = hidden
        hidden_tensor = torch.cat(hiddens,1)
        return hidden_tensor,final_hidden,output
        

class RNN_Model_for_classfication(nn.Module):
    """Sentence classifier built on ``RNN_layer``.

    The prediction is the third value returned by the recurrent layer (the
    cell output of the last time step). ``fc``, ``softmax`` and
    ``log_softmax`` are created but not applied in ``forward``.
    """

    def __init__(self,vocab_size,embed_size,hidden_dim,output_size):
        super().__init__()

        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Sub-modules are created in this order: recurrent layer first, then the linear head.
        # self.rnn = nn.RNN(embed_size,hidden_dim,num_layers = 1,batch_first= True)
        self.rnn_layer = RNN_layer(vocab_size, embed_size, hidden_dim, output_size)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.softmax = nn.Softmax(dim=1)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self,x,lens):
        """Return the recurrent layer's last-step output for ids ``x`` with lengths ``lens``."""
        _, _, last_step_output = self.rnn_layer(x, lens)
        return last_step_output

In [11]:
# Hyper-parameters for the RNN run.
epochs = 10
lr = 1e-3

# Two output classes; embed_size matches the 300-d GloVe matrix built above,
# although that matrix is not passed to the model here.
rnn_model = RNN_Model_for_classfication(vocab_size = len(vocab),embed_size = 300,hidden_dim = 256,
output_size = 2)
rnn_model.to(device)
optimizer = torch.optim.Adam(rnn_model.parameters(),lr = lr)
loss_func = nn.CrossEntropyLoss()
# `model_type` is only referenced by the commented-out checkpointing code in run_train.
run_train(epochs,rnn_model,loader_train,loader_validation,optimizer,loss_func,model_type = "cls")

KeyboardInterrupt: 

## Tensor network (TN) model

In [30]:
import torch
import torch.nn as nn

class TN(nn.Module):
    """Tensor network unit.

    Each step reshapes the incoming vector into a ``rank x rank`` matrix,
    multiplies the running state by it, applies tanh, and then passes the
    result through ``i2h`` (next state) and ``h2o`` (step output).

    Args:
        rank: side length of the per-token matrix and width of the state.
        output_size: width of the step output.
    """

    def __init__(self, rank, output_size):
        super(TN, self).__init__()

        self.rank = rank
        self.output_size = output_size
        # Kept as a public attribute for backward compatibility; the init_*
        # helpers no longer rely on it because it can disagree with the device
        # the parameters actually live on.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.i2h = nn.Linear(self.rank, self.rank)
        # Not used by forward(); kept so the parameter set stays unchanged.
        self.x2w = nn.Linear(1, self.rank)
        self.h2o = nn.Linear(self.rank, output_size)

    def forward(self, data, m):
        """Advance the unit by one token.

        Args:
            data: tensor that can be viewed as (-1, rank, rank).
            m: running state, batch-multiplied with the token matrix
               ("bij,bjk->bik").

        Returns:
            Tuple ``(hidden, output)`` where ``hidden = i2h(tanh(m @ unit))``
            and ``output = h2o(hidden)``.
        """
        unit = data.contiguous().view(-1,self.rank,self.rank)
        # Functional tanh avoids building a new nn.Tanh module on every call.
        m = torch.tanh(torch.einsum("bij,bjk->bik", m, unit))
        hidden = self.i2h(m)
        output = self.h2o(hidden)
        return hidden, output

    def init_m1(self,batch_size):
        """Return an all-ones initial state of shape (batch_size, 1, rank)."""
        return torch.ones(batch_size,1,self.rank, device=self.i2h.weight.device)

    def init_m2(self):
        """Return a new, unregistered ``nn.Linear(rank, output_size)``."""
        return nn.Linear(self.rank, self.output_size)

    def init_hidden(self,batch_size):
        """Return an all-zeros tensor of shape (batch_size, rank)."""
        return torch.zeros(batch_size,self.rank, device=self.i2h.weight.device)

class TN_layer(nn.Module):
    """Embeds ids as ``rank x rank`` matrices and unrolls the ``TN`` unit over time.

    Padding positions are skipped: the embedding row for ``padding_idx`` is a
    zero matrix, and multiplying the state by it would reset the state to a
    constant, making every padded sentence produce the same output. The state
    is therefore carried over unchanged at padded positions.

    Args:
        rank: state width; each token is embedded into ``rank * rank`` values.
        vocab_size: number of rows of the embedding table.
        output_size: width of the unit's output.
    """

    def __init__(self,rank,vocab_size,output_size):
        super(TN_layer,self).__init__()
        self.tn = TN(rank,output_size)
        self.rank = rank
        self.embedding = nn.Embedding(vocab_size,self.rank * self.rank,padding_idx=0)

        # self.embedding.weight.requires_grad = False
        # Created but not applied in forward().
        self.dropout = nn.Dropout(0.2)

    def forward(self,x, text_lens=None):
        """Run the unit over every time step of ``x``.

        Args:
            x: integer id tensor indexed as (batch, time).
            text_lens: optional per-example count of valid positions. When
                omitted, positions whose id equals the embedding's
                ``padding_idx`` are treated as padding.

        Returns:
            Tuple ``(hidden_tensor, final_hidden, output)``: the states of all
            steps concatenated on dim 1, the state after the last valid
            position, and ``h2o`` applied to that final state.
        """
        batch_size = x.size(0)
        seq_len = x.size(1)

        encoding = self.embedding(x)

        if text_lens is None:
            valid = x != self.embedding.padding_idx
        else:
            positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
            valid = positions < text_lens.to(x.device).unsqueeze(1)

        m = self.tn.init_m1(batch_size).to(encoding.device)
        hiddens = []
        # recurrent tn
        for i in range(seq_len):
            m_next, _ = self.tn(encoding[:,i,:], m)
            # Keep the previous state wherever step i is padding.
            keep = valid[:, i].view(-1, 1, 1)
            m = torch.where(keep, m_next, m)
            hiddens.append(m)
        final_hidden = m
        hidden_tensor = torch.cat(hiddens,1)
        # Same projection the unit applies to its new state, taken from the
        # last valid state instead of from a padded step.
        output = self.tn.h2o(final_hidden)
        return hidden_tensor,final_hidden,output
        

class TN_model_for_classfication(nn.Module):
    """Sentence classifier built on ``TN_layer``.

    The prediction is the third value returned by the TN layer with its
    singleton dim 1 squeezed away. ``fc``, ``softmax`` and ``log_softmax``
    are created but not applied in ``forward``.

    Args:
        rank: state width of the tensor network.
        vocab_size: number of rows of the embedding table.
        output_size: number of classes.
    """

    def __init__(self,rank,vocab_size, output_size):
        super(TN_model_for_classfication,self).__init__()

        self.rank = rank
        self.output_size = output_size
        self.vocab_size = vocab_size

        self.tn = TN_layer(self.rank, self.vocab_size, output_size)
        self.fc = nn.Linear(self.rank,output_size)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.softmax = nn.Softmax(dim=1)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self,x, lens):
        """Classify the id sequences in ``x``.

        Args:
            x: integer id tensor indexed as (batch, time).
            lens: accepted so the model matches the ``model(text, lengths)``
                call made by the training loop; it is not used here.

        Returns:
            The TN layer's output with dim 1 squeezed.
        """
        seq_output, hidden, output = self.tn(x)
        # out = out.contiguous().view(-1,self.rank)
        # output = self.fc(hidden.squeeze(1))
        output = output.squeeze(1)

        return output

    def init_hidden(self,batch_size):
        """Return an all-zeros state of shape (1, batch_size, rank).

        The previous body read ``self.n_layers`` and ``self.hidden_dim``,
        which this class never defines, so every call raised AttributeError.
        The model has a single recurrent layer of width ``rank``; the tensor
        is placed on the device of this module's parameters.
        """
        hidden = torch.zeros(1, batch_size, self.rank, device=self.fc.weight.device)
        return hidden

In [31]:
# Hyper-parameters for the tensor-network run.
epochs = 10
lr = 1e-3

# NOTE(review): TN_layer embeds every token into rank * rank = 65,536 values, so with the
# 15,698-entry vocabulary the embedding table alone holds about 1.03e9 parameters.
tn_model = TN_model_for_classfication(rank = 256, vocab_size = len(vocab),output_size = 2)
tn_model.to(device)
optimizer = torch.optim.Adam(tn_model.parameters(),lr = lr)
loss_func = nn.CrossEntropyLoss()
# `model_type` is only referenced by the commented-out checkpointing code in run_train.
run_train(epochs,tn_model,loader_train,loader_validation,optimizer,loss_func,model_type = "cls")

KeyboardInterrupt: 

: 