## Using pytorch lightning to train the model

## get dataset

In [1]:
from datasets import load_dataset
import torch
# preprocessing and tokenizer
from collections import Counter
from torchtext.data.utils import get_tokenizer
import wandb

dataset = load_dataset("glue", "sst2")
tokenizer = get_tokenizer("basic_english")

def get_alphabet(corpuses):
	"""
	obtain the dict
			:param corpuses: 
	"""
	word_counter = Counter()

	for corpus in corpuses:
		for item in corpus:
			tokens = tokenizer(item['sentence'])
			for token in tokens:
				word_counter[token] += 1
	print("there are {} words in dict".format(len(word_counter)))
	# logging.info("there are {} words in dict".format(len(word_counter)))
	word_dict = {word: e + 2 for e, word in enumerate(list(word_counter))}
	word_dict['UNK'] = 1
	word_dict['<PAD>'] = 0

	return word_dict

vocab = get_alphabet([dataset['train'],dataset['validation']])


Reusing dataset glue (/home/wzm289/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

there are 15696 words in dict


In [2]:
# get embedding
import numpy as np 
def get_embedding(alphabet, filename="", embedding_size=100):
	embedding = np.random.rand(len(alphabet), embedding_size)
	if filename is None:
		return embedding
	with open(filename, encoding='utf-8') as f:
		i = 0
		for line in f:
			i += 1
			if i % 100000 == 0:
				print('epch %d' % i)
			items = line.strip().split(' ')
			if len(items) == 2:
				vocab_size, embedding_size = items[0], items[1]
				print((vocab_size, embedding_size))
			else:
				word = items[0]
				if word in alphabet:
					embedding[alphabet[word]] = items[1:]

	print('done')
	return embedding
embedding = get_embedding(vocab, filename="../embedding/glove.6B.300d.txt",embedding_size = 300)

epch 100000
epch 200000
epch 300000
epch 400000
done


In [3]:
# convert to index

def convert_to_word_ids(sentence,alphabet,max_len = 40):
	"""
	docstring here
		:param sentence: 
		:param alphabet: 
		:param max_len=40: 
	"""
	indices = []
	tokens = tokenizer(sentence)
	
	for word in tokens:
		if word in alphabet:
			indices.append(alphabet[word])
		else:
			continue
	result = indices + [alphabet['<PAD>']] * (max_len - len(indices))

	return result[:max_len], min(len(tokens),max_len)

test_enc, length = convert_to_word_ids("hello, how are you", vocab, 10)
print(test_enc)
print(length)

[12, 111, 78, 470, 0, 0, 0, 0, 0, 0]
5


In [4]:
# generate data batch and iterator
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from pytorch_lightning.loggers import WandbLogger



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"
batch_size = 64
class DataMaper(Dataset):
    def __init__(self,dataset,vocab):
        self.x = dataset['sentence']
        self.y = dataset['label']
        self.max_length = 40
        self.vocab = vocab

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        sentence = self.x[idx]
        label = self.y[idx]

        enc_sentence,lengths = convert_to_word_ids(sentence, self.vocab, max_len = self.max_length)
        t_sentence = torch.tensor(enc_sentence).to(device)
        t_label = torch.tensor(label).to(device)
        t_length = torch.tensor(lengths).to(device)
        return t_sentence,t_label,t_length

train = DataMaper(dataset['train'],vocab)
validation = DataMaper(dataset['validation'],vocab)
test = DataMaper(dataset['test'], vocab)

loader_train = DataLoader(train, batch_size=batch_size, shuffle=True)
loader_validation = DataLoader(validation, batch_size = batch_size)
loader_test = DataLoader(test,batch_size = batch_size)

In [6]:
import torch
from torch import nn
import pytorch_lightning as pl

def cal_accuracy(probs, target):
    predictions = probs.argmax(dim=1)
    corrects = (predictions == target)
    accuracy = corrects.sum().float() / float(target.size(0))
    return accuracy

In [6]:
# change the model to pytorch_lightning

class CNN(nn.Module):
    def __init__(self, vocab_dim, e_dim, h_dim, o_dim):
        super(CNN, self).__init__()
        self.emb = nn.Embedding(vocab_dim, e_dim, padding_idx=0)
        self.emb.load_state_dict({"weight":torch.tensor(embedding)})
        non_trainable = True
        if non_trainable:
            self.emb.weight.requires_grad = False
        self.dropout = nn.Dropout(0.2)
        self.conv1 = nn.Conv2d(1, h_dim, (3, e_dim))
        self.conv2 = nn.Conv2d(1, h_dim, (4, e_dim))
        self.conv3 = nn.Conv2d(1, h_dim, (5, e_dim))
        self.fc = nn.Linear(h_dim * 3, o_dim)
        # self.softmax = nn.Softmax(dim=1)
        # self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        embed = self.dropout(self.emb(x)).unsqueeze(1)
        c1 = torch.relu(self.conv1(embed).squeeze(3))
        p1 = torch.max_pool1d(c1, c1.size()[2]).squeeze(2)
        c2 = torch.relu(self.conv2(embed).squeeze(3))
        p2 = torch.max_pool1d(c2, c2.size()[2]).squeeze(2)
        c3 = torch.relu(self.conv3(embed).squeeze(3))
        p3 = torch.max_pool1d(c3, c3.size()[2]).squeeze(2)
        pool = self.dropout(torch.cat((p1, p2, p3), 1))
        hidden = self.fc(pool)
        # return self.softmax(hidden), self.log_softmax(hidden)
        return hidden 

class litCNN(pl.LightningModule):
    def __init__(self, vocab_dim, e_dim, h_dim, o_dim):
        super().__init__()
        self.model = CNN(vocab_dim,e_dim,h_dim,o_dim)

        # find the batch_size
        self.save_hyperparameters()

    def forward(self, x):
        encode = self.model(x)
        return encode 
    
    # optimizers go into configure_optimizer

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(),lr = 1e-3)
        return optimizer
    
    # train and validation
    def training_step(self,train_batch, batch_idx):
        text, label,lengths = train_batch
        predictions = self.model(text)
        criterion = nn.CrossEntropyLoss()

        loss = criterion(predictions, label)
        acc = cal_accuracy(predictions, label)
        self.log("train_loss",loss)
        self.log("acc", acc)
        return loss

    def validation_step(self,val_batch,batch_idx):
        text, label,lengths = val_batch
        predictions = self.model(text)
        criterion = nn.CrossEntropyLoss()

        loss = criterion(predictions, label)
        acc = cal_accuracy(predictions, label)
        self.log("val_loss",loss)
        self.log("val_acc", acc)
        return acc
    def validation_epoch_end(self, validation_step_outputs):
        all_acc= torch.stack(validation_step_outputs)
        print(torch.mean(all_acc))
        self.log("val_epoch_acc",torch.mean(all_acc))

## train the model

In [7]:
import wandb
wandb_logger = WandbLogger(name = "cnn_word2vec",project="text_classification")
model = litCNN(len(vocab),e_dim = 300,h_dim = 64, o_dim = 2)
# wandb_logger.watch(model, log="all")
trainer = pl.Trainer(logger=wandb_logger,max_epochs = 10)
trainer.fit(model,loader_train,loader_validation)

wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzhansu[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.19 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name  | Type | Params
-------------------------------
0 | model | CNN  | 4.9 M 
-------------------------------
230 K     Trainable params
4.7 M     Non-trainable params
4.9 M     Total params
19.762    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

tensor(0.5312)


  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

## RNN pytorch_lightning training


In [7]:
wandb_logger = WandbLogger(name = "rnn_word2vec",project="text_classification")


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzhansu[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.19 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [8]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable

class LSTM(nn.Module):

    # define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim , num_classes, lstm_layers,
                 bidirectional, dropout, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_index)
        self.embedding.load_state_dict({"weight":torch.tensor(embedding)})
        non_trainable = True
        if non_trainable:
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=lstm_layers,
                            bidirectional=bidirectional,
                            batch_first=True)
        num_directions = 2 if bidirectional else 1
        self.fc1 = nn.Linear(hidden_dim * num_directions, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.lstm_layers = lstm_layers
        self.num_directions = num_directions
        self.hidden_dim = hidden_dim


    def init_hidden(self, batch_size):
        h, c = (Variable(torch.zeros(self.lstm_layers * self.num_directions, batch_size, self.hidden_dim)),
                Variable(torch.zeros(self.lstm_layers * self.num_directions, batch_size, self.hidden_dim)))
        return h.to(device), c.to(device)

    def forward(self, text, text_lengths):
        batch_size = text.shape[0]
        h_0, c_0 = self.init_hidden(batch_size)

        embedded = self.embedding(text)
        packed_embedded = pack_padded_sequence(embedded, text_lengths.cpu().numpy(), batch_first=True,enforce_sorted=False)
        output, (h_n, c_n) = self.lstm(packed_embedded, (h_0, c_0))
        # output_unpacked, output_lengths = pad_packed_sequence(output, batch_first=True)
        # if it is bi directional LSTM, we should concat the two f
        out = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        # out = h_n[-1]
        # print(h_n.shape)
        # out = output_unpacked[:, -1, :]
        preds = self.fc1(out)
        return preds
class litRNN(pl.LightningModule):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes,lstm_layers, bidirectional, dropout, pad_index):
        super().__init__()
        self.model = LSTM(vocab_size = vocab_size,embedding_dim=embedding_dim,hidden_dim = hidden_dim,
            num_classes = num_classes, lstm_layers = lstm_layers, bidirectional = bidirectional,dropout=dropout,pad_index = pad_index)

        # find the batch_size
        self.save_hyperparameters()

    def forward(self, x, lengths):
        encode = self.model(x, lengths)
        return encode 
    
    # optimizers go into configure_optimizer

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(),lr = 1e-3)
        return optimizer
    
    # train and validation
    def training_step(self,train_batch, batch_idx):
        text, label, lengths = train_batch
        predictions = self.model(text,lengths)
        criterion = nn.CrossEntropyLoss()

        loss = criterion(predictions, label)
        acc = cal_accuracy(predictions, label)
        self.log("train_loss",loss)
        self.log("acc", acc)
        return loss

    def validation_step(self,val_batch,batch_idx):
        text, label,lengths = val_batch
        predictions = self.model(text,lengths)
        criterion = nn.CrossEntropyLoss()

        loss = criterion(predictions, label)
        acc = cal_accuracy(predictions, label)
        self.log("val_loss",loss)
        self.log("val_acc", acc)
        return acc
    def validation_epoch_end(self, validation_step_outputs):
        all_acc= torch.stack(validation_step_outputs)
        print(torch.mean(all_acc))
        self.log("val_epoch_acc",torch.mean(all_acc))

In [9]:

model = litRNN(vocab_size = len(vocab),embedding_dim=300,hidden_dim = 100,
num_classes = 2, lstm_layers = 2, bidirectional = True,dropout=0.5,pad_index = 0)
# wandb_logger.watch(model, log="all")
trainer = pl.Trainer(logger=wandb_logger,max_epochs = 10)
trainer.fit(model,loader_train,loader_validation)
wandb.finish()

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name  | Type | Params
-------------------------------
0 | model | LSTM | 5.3 M 
-------------------------------
563 K     Trainable params
4.7 M     Non-trainable params
5.3 M     Total params
21.092    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


tensor(0.5234)


  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor(0.8190)


Validation: 0it [00:00, ?it/s]

tensor(0.8277)


Validation: 0it [00:00, ?it/s]

tensor(0.8471)


Validation: 0it [00:00, ?it/s]

tensor(0.8442)


Validation: 0it [00:00, ?it/s]

tensor(0.8397)


Validation: 0it [00:00, ?it/s]

tensor(0.8319)


Validation: 0it [00:00, ?it/s]

tensor(0.8304)


Validation: 0it [00:00, ?it/s]

tensor(0.8420)


Validation: 0it [00:00, ?it/s]

tensor(0.8393)


Validation: 0it [00:00, ?it/s]

tensor(0.8444)


NameError: name 'wandb' is not defined