In [25]:
from torchtext.datasets import IMDB
import torch

In [26]:
import tarfile

tar_path = "../Chapter_15/aclImdb_v1.tar.gz"
extract_path = "../Chapter_15/aclImdb_v1/"
def tar_extract(tar_path, extract_path):
	with tarfile.open(tar_path, "r:gz") as tar:
	    tar.extractall(path=extract_path)
tar_extract(tar_path, extract_path)

In [27]:
import os
from torch.utils.data import Dataset

class IMDBDataset(Dataset):
	def __init__(self, data_dir, split="train", transform = None):
		self.data_dir = os.path.join(data_dir, split)
		self.transform = transform

		# store the file paths and labels
		self.texts = []
		self.labels = []

		# load positive reviews
		pos_dir = os.path.join(self.data_dir, "pos")
		for file in os.listdir(pos_dir):
			with open(os.path.join(pos_dir, file),'r', encoding="utf-8") as f:
				self.texts.append(f.read())
				self.labels.append(1)  # positive label

		neg_dir = os.path.join(self.data_dir, "neg")
		for file in os.listdir(neg_dir):
			with open(os.path.join(neg_dir, file), 'r', encoding="utf-8") as f:
				self.texts.append(f.read())
				self.labels.append(0) # Negative label
	

	def __len__(self):
		return len(self.texts)
	
	def __getitem__(self, index) :
		sample = self.texts[index]
		label = self.labels[index]

		if self.transform:
			sample = self.transform(sample)

		return sample, label

In [28]:
from torch.utils.data import DataLoader, random_split

data_dir = "../Chapter_15/aclImdb_v1/aclImdb"
train_dataset = IMDBDataset(data_dir, split="train")
print(len(train_dataset))
train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])
print(len(train_dataset))
print(len(valid_dataset))

test_dataset = IMDBDataset(data_dir, split="test")
print(len(test_dataset))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

25000
20000
5000
25000


In [29]:
import re
from collections import Counter, defaultdict

def tokenizer(text):
	text = re.sub("<[^>]*>", "", text)
	emoticons = re.findall(
		r"(?::|;|=)(?:-)?(?:\)|\(|D|P)", text.lower()
	)
	text = re.sub(r"[\W]+", " ",text.lower()) + " ".join(emoticons).replace("-", '')
	tokenized = text.split()
	return tokenized


token_counts = Counter()
for label, line in train_dataset:
	# print(label)
	# print(line)
	tokens = tokenizer(label)
	token_counts.update(tokens)

print("Vocab-Size: ", len(token_counts))

Vocab-Size:  69105


In [30]:
# Step 3: Encoding each unique token into integers
from torchtext.vocab import vocab
from collections import OrderedDict
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>",1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an','example']])


[11, 7, 36, 456]


In [31]:
# Step 3-A Define the functions for transformation.
text_pipeline = lambda x:[vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x:1 if x == 'pos' else 0

In [32]:
# Step 3-B : Wrap the encode and transformation function
from torch import nn

# Reload train_iter to avoid exhaustion
train_dataset, test_dataset = IMDB(split=('train', 'test'))
train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])



# Collate function for batching
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []

    for label, text in batch:
        label_list.append(label_pipeline(label))
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))

    # Pad sequences to the max length in the batch
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)

    return padded_text_list, torch.tensor(label_list), torch.tensor(lengths)

# Create DataLoader
batch_size = 8
train_dataloader = DataLoader(list(train_dataset), batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(list(test_dataset), batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

# Example batch
text_batch, label_batch, length_batch = next(iter(train_dataloader))
print(text_batch)
print(label_batch)
print(length_batch)




tensor([[ 10, 558, 708,  ...,   0,   0,   0],
        [ 31,  50, 101,  ...,   0,   0,   0],
        [ 10, 102,  12,  ...,   0,   0,   0],
        ...,
        [ 11,   7,  61,  ...,   0,   0,   0],
        [ 10,  84, 215,  ...,   0,   0,   0],
        [418,  15,   2,  ...,   0,   0,   0]])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([232, 305,  72, 764, 525, 197,  68, 304])


In [33]:
print(text_batch.shape)

torch.Size([8, 764])


In [34]:
# Let's divide all three datasets into dataloaders with the batch size of 32
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_dl = DataLoader(list(test_dataset), batch_size=batch_size, shuffle=True, collate_fn=collate_batch)


In [35]:
# Embeddig layers for Sentence encoding
embedding = nn.Embedding(
	num_embeddings=10,
	embedding_dim=3,
	padding_idx=0
)

# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[-0.6445,  1.6313,  1.5884],
         [-0.1480,  0.8178,  1.7236],
         [ 0.3459,  0.5839,  0.3513],
         [-0.4800, -0.9664,  1.3269]],

        [[ 0.3459,  0.5839,  0.3513],
         [-0.1600,  0.2704,  1.6945],
         [-0.1480,  0.8178,  1.7236],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


In [36]:
# Building an RNN model
class RNN(nn.Module):
	def __init__(self, input_size, hidden_size) -> None:
		super().__init__()
		self.rnn = nn.RNN(input_size, hidden_size, num_layers=2, batch_first=True)
		self.fc= nn.Linear(hidden_size, 1)

	def forward(self, x):
		_, hidden = self.rnn(x)
		out = hidden[-1, :,:] # we use the final hidden state from the last hidden layer as the input to the fully connected layer
		out = self.fc(out)
		return out
	
model = RNN(64, 32)
print(model)


RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


In [37]:
model(torch.randn(5,3,64))


tensor([[ 0.2653],
        [ 0.1856],
        [-0.2016],
        [-0.0917],
        [ 0.0908]], grad_fn=<AddmmBackward0>)

In [38]:
# Building an RNN model for the sentiment analysis task.
class RNN(nn.Module):
	def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
		super().__init__()
		self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
		self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
		self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
		self.relu = nn.ReLU()
		self.fc2 = nn.Linear(fc_hidden_size, 1)
		self.sigmoid = nn.Sigmoid()

	def forward(self, text, lengths):
		out = self.embedding(text)
		out = nn.utils.rnn.pack_padded_sequence(
			out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True
		)
		out, (hidden, cell) = self.rnn(out)
		out = hidden[-1,:,:]
		out = self.fc1(out)
		out = self.relu(out)
		out = self.fc2(out)
		out = self.sigmoid(out)
		return out




In [39]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
torch.manual_seed(1)
model = RNN(vocab_size,embed_dim, rnn_hidden_size, fc_hidden_size)

In [40]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [48]:
# Training of the model on dataset
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter


loss_fn = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=0.0001)
writer = SummaryWriter('runs/IMDB_Movie_Review/',)


def train(dataloader, epoch):
	model.train()
	total_acc , total_loss = 0,0
	for text_batch, label_batch, lengths in dataloader:
		text_batch, label_batch, lengths = text_batch.to(device), label_batch.to(device), lengths.to(device)
		optimizer.zero_grad()
		pred = model(text_batch, lengths)[:,0]
		loss = loss_fn(pred, label_batch.float())
		loss.backward()
		optimizer.step()
		total_acc += (
			(pred >= 0.5).float() == label_batch
		).float().sum().item()
		total_loss += loss.item()*label_batch.size(0)
	avg_acc = total_acc/len(dataloader.dataset)
	avg_loss = total_loss/len(dataloader.dataset)

	writer.add_scalar("Training Accuracy", avg_acc, epoch)
	writer.add_scalar("Training Loss", avg_loss, epoch)


	return avg_acc, avg_loss



In [49]:
# Evaluate function to measure the model performance 
def evaluate(dataloader, epoch):
	model.eval()
	total_acc, total_loss = 0,0
	with torch.no_grad():
		for text_batch, label_batch, lengths in dataloader:
			text_batch, label_batch, lengths = text_batch.to(device), label_batch.to(device), lengths.to(device)
			pred = model(text_batch, lengths)[:,0]
			loss = loss_fn(pred, label_batch.float())
			total_acc += (
				(pred >= 0.5).float() == label_batch
			).float().sum().item()
			total_loss += loss.item()*label_batch.size(0)
	avg_acc = total_acc/len(dataloader.dataset)
	avg_loss = total_loss/len(dataloader.dataset)

	writer.add_scalar("Training Accuracy", avg_acc, epoch)
	writer.add_scalar("Training Loss", avg_loss, epoch)
	return avg_acc, avg_loss

In [50]:
num_epochs = 3
torch.manual_seed(1)
for epoch in range(num_epochs):
	train_acc , train_loss = train(train_dl, epoch)
	valid_acc, valid_loss = evaluate(valid_dl, epoch)
	print(f"Epoch {epoch} Train Accuracy: {train_acc:.4f} Train Loss: {train_loss:.4f}\
	   Validation Accuracy: {valid_acc:.4f} Validation Loss: {valid_loss:.4f}")

Epoch 0 Train Accuracy: 0.9401 Train Loss: 0.1408	   Validation Accuracy: 1.0000 Validation Loss: 0.0005
Epoch 1 Train Accuracy: 1.0000 Train Loss: 0.0002	   Validation Accuracy: 1.0000 Validation Loss: 0.0001
Epoch 2 Train Accuracy: 1.0000 Train Loss: 0.0001	   Validation Accuracy: 1.0000 Validation Loss: 0.0001


In [51]:
test_acc, _ = evaluate(test_dl, epoch=1)
print(f"test_accuracy :{test_acc:.4f}")

test_accuracy :1.0000


In [45]:
# More on the Bidirectional RNN
""" Here, we will set the bidirectional configuration of the LSTM to True , which will
make the recurrent layer pass through the input sequences from both directions, start to end, as well as in the reverse direction."""

class BidirectionRNN(nn.Module):
	def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size ):
		super().__init__()
		self.embedding = nn.Embedding(
			vocab_size, embed_dim, padding_idx=0
		)
		self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True, bidirectional=True)
		self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
		self.relu = nn.ReLU()
		self.fc2 = nn.Linear(fc_hidden_size, 1)
		self.sigmoid = nn.Sigmoid()

	def forward(self, text, lengths):
		out = self.embedding(text)
		out = nn.utils.rnn.pack_padded_sequence(
			out, lengths.cpu().numpy(),batch_first=True,enforce_sorted=False
		)
		_, (hidden, cell) = self.rnn(out)
		out = torch.cat((hidden[-2,:,:],
				   hidden[-1,:,:]), dim=1)
		out = self.fc1(out)
		out = self.relu(out)
		out = self.fc2(out)
		out = self.sigmoid(out)
		return out


In [47]:
model

BidirectionRNN(
  (embedding): Embedding(69107, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [52]:
torch.manual_seed(1)
model = BidirectionRNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size).to(device)

epochs = 5
for epoch in range(epochs):
	train_acc , train_loss = train(train_dl, epoch)
	valid_acc, valid_loss = evaluate(valid_dl, epoch)
	print(f"Epoch {epoch} Train Accuracy: {train_acc:.4f} Train Loss: {train_loss:.4f}\
	   Validation Accuracy: {valid_acc:.4f} Validation Loss: {valid_loss:.4f}")
	
test_acc, _ = evaluate(test_dl, epoch=1)
print(f"test_accuracy :{test_acc:.4f}")


Epoch 0 Train Accuracy: 0.0001 Train Loss: 0.7352	   Validation Accuracy: 0.0000 Validation Loss: 0.7352
Epoch 1 Train Accuracy: 0.0001 Train Loss: 0.7352	   Validation Accuracy: 0.0000 Validation Loss: 0.7352
Epoch 2 Train Accuracy: 0.0001 Train Loss: 0.7352	   Validation Accuracy: 0.0000 Validation Loss: 0.7352
Epoch 3 Train Accuracy: 0.0001 Train Loss: 0.7352	   Validation Accuracy: 0.0000 Validation Loss: 0.7352
Epoch 4 Train Accuracy: 0.0001 Train Loss: 0.7352	   Validation Accuracy: 0.0000 Validation Loss: 0.7352
test_accuracy :0.0001
