In [22]:
import torch
from torch import nn
import torchtext
torchtext.disable_torchtext_deprecation_warning()
from tqdm import tqdm

In [2]:
import pandas as pd

splits = {'train': 'plain_text/train-00000-of-00001.parquet', 'test': 'plain_text/test-00000-of-00001.parquet', 'unsupervised': 'plain_text/unsupervised-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["train"])

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
test_dataset = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["test"])

In [4]:
from sklearn.model_selection import train_test_split
train_dataset, valid_dataset = train_test_split(df, train_size=20000, test_size=5000, random_state=1)

In [5]:
import re
from collections import Counter, OrderedDict

token_counts = Counter()

def tokenizer(text):
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized


for i, (text, label) in train_dataset.iterrows():
    tokens = tokenizer(text)
    token_counts.update(tokens)
 
    
print('Vocab-size:', len(token_counts))

Vocab-size: 69353


In [6]:
from torchtext.vocab import vocab
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 467]


In [7]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
# label_pipeine = lambda x: 1. if x == 'pos' else 0.
label_pipeine = lambda x: float(x)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _text, _label in batch:
        label_list.append(label_pipeine(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [9]:
from torch.utils.data import Dataset, DataLoader

class TabularDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
    
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        return self.dataframe.iloc[idx]['text'], self.dataframe.iloc[idx]['label']
    
train_set = TabularDataset(train_dataset)
dataloader = DataLoader(train_set, batch_size=4, shuffle=False, collate_fn=collate_batch)

In [10]:
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)

tensor([[   55,    81,    19,  ...,  2470, 10363,   126],
        [   48,     4,    81,  ...,     0,     0,     0],
        [   10,    14,  1321,  ...,     0,     0,     0],
        [    4,   950, 15303,  ...,     0,     0,     0]], device='cuda:0')
tensor([0., 0., 1., 0.], device='cuda:0')
tensor([455, 116, 149, 112], device='cuda:0')
torch.Size([4, 455])


In [11]:
batch_size = 32
train_set = TabularDataset(train_dataset)
valid_set = TabularDataset(valid_dataset)
test_set = TabularDataset(test_dataset)

train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_set, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [12]:
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=0)
text_encoded_input = torch.LongTensor([[1, 2, 3, 4], [4, 3, 2, 0]])
print(embedding(text_encoded_input))

tensor([[[ 1.6328, -0.3470,  0.3809],
         [ 1.0121,  0.6342,  0.7798],
         [-0.1830, -1.5790, -0.2701],
         [-1.6945, -0.0384, -0.4393]],

        [[-1.6945, -0.0384, -0.4393],
         [-0.1830, -1.5790, -0.2701],
         [ 1.0121,  0.6342,  0.7798],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


In [13]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=2, batch_first=True)

        self.fc = nn.Linear(hidden_size, 1)
        

    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :]
        out = self.fc(out)
        return out
    
model = RNN(64, 32)
print(model)

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


In [14]:
model(torch.randn(5, 3, 64))

tensor([[ 0.2903],
        [ 0.2039],
        [-0.2263],
        [-0.0708],
        [ 0.1707]], grad_fn=<AddmmBackward0>)

In [15]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [26]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size).to(device)
model

RNN(
  (embedding): Embedding(69355, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [24]:
def train(dataloader, optimizer, loss_fn):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in tqdm(dataloader):
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)
    
    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

def evaluate(dataloader, loss_fn):
    model.eval()
    total_acc, total_loss = 0, 0

    with torch.no_grad():
        for text_batch, label_batch, lengths in tqdm(dataloader):
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item() * label_batch.size(0)

    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

In [27]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
torch.manual_seed(1)
print(f'Train model on device {torch.cuda.get_device_name(device)}')
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}:')
    acc_train, loss_train = train(train_dl, optimizer, loss_fn)
    acc_valid, loss_valid = evaluate(valid_dl, loss_fn)
    print(f'accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Train model on device NVIDIA GeForce GTX 1050
Epoch 1:


100%|██████████| 625/625 [00:27<00:00, 22.41it/s]
100%|██████████| 157/157 [00:03<00:00, 46.20it/s]


accuracy: 0.5989 val_accuracy: 0.6636
Epoch 2:


100%|██████████| 625/625 [00:27<00:00, 22.91it/s]
100%|██████████| 157/157 [00:03<00:00, 45.49it/s]


accuracy: 0.7168 val_accuracy: 0.7418
Epoch 3:


100%|██████████| 625/625 [00:27<00:00, 22.77it/s]
100%|██████████| 157/157 [00:03<00:00, 45.72it/s]


accuracy: 0.7565 val_accuracy: 0.6828
Epoch 4:


100%|██████████| 625/625 [00:27<00:00, 22.71it/s]
100%|██████████| 157/157 [00:03<00:00, 45.46it/s]


accuracy: 0.8029 val_accuracy: 0.7792
Epoch 5:


100%|██████████| 625/625 [00:27<00:00, 22.76it/s]
100%|██████████| 157/157 [00:03<00:00, 45.04it/s]


accuracy: 0.8593 val_accuracy: 0.8194
Epoch 6:


100%|██████████| 625/625 [00:27<00:00, 22.82it/s]
100%|██████████| 157/157 [00:03<00:00, 42.61it/s]


accuracy: 0.8825 val_accuracy: 0.8260
Epoch 7:


100%|██████████| 625/625 [00:28<00:00, 21.69it/s]
100%|██████████| 157/157 [00:03<00:00, 44.23it/s]


accuracy: 0.8872 val_accuracy: 0.8436
Epoch 8:


100%|██████████| 625/625 [00:27<00:00, 22.42it/s]
100%|██████████| 157/157 [00:03<00:00, 40.25it/s]


accuracy: 0.9218 val_accuracy: 0.8438
Epoch 9:


100%|██████████| 625/625 [00:32<00:00, 19.28it/s]
100%|██████████| 157/157 [00:03<00:00, 47.67it/s]


accuracy: 0.9391 val_accuracy: 0.8592
Epoch 10:


100%|██████████| 625/625 [00:28<00:00, 21.84it/s]
100%|██████████| 157/157 [00:03<00:00, 46.93it/s]

accuracy: 0.9488 val_accuracy: 0.8466





In [29]:
acc_test, _ = evaluate(test_dl, loss_fn)
print(f'test_accuracy: {acc_test:.4f}')

100%|██████████| 782/782 [00:17<00:00, 45.98it/s]

test_accuracy: 0.8443





In [31]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        _, (hidden, cell) = self.rnn(out)
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out   

In [32]:
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size).to(device)
model

RNN(
  (embedding): Embedding(69355, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [33]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
torch.manual_seed(1)
print(f'Training model on {torch.cuda.get_device_name(device)}')
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}:')
    acc_train, loss_train = train(train_dl, optimizer, loss_fn)
    acc_valid, loss_valid = evaluate(valid_dl, loss_fn)
    print(f'accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Training model on NVIDIA GeForce GTX 1050
Epoch 1:


100%|██████████| 625/625 [00:37<00:00, 16.87it/s]
100%|██████████| 157/157 [00:04<00:00, 35.99it/s]


accuracy: 0.6070 val_accuracy: 0.6972
Epoch 2:


100%|██████████| 625/625 [00:36<00:00, 16.91it/s]
100%|██████████| 157/157 [00:04<00:00, 36.35it/s]


accuracy: 0.6862 val_accuracy: 0.7014
Epoch 3:


100%|██████████| 625/625 [00:36<00:00, 16.96it/s]
100%|██████████| 157/157 [00:04<00:00, 36.34it/s]


accuracy: 0.7685 val_accuracy: 0.7002
Epoch 4:


100%|██████████| 625/625 [00:36<00:00, 16.92it/s]
100%|██████████| 157/157 [00:04<00:00, 36.29it/s]


accuracy: 0.8232 val_accuracy: 0.8316
Epoch 5:


100%|██████████| 625/625 [00:37<00:00, 16.84it/s]
100%|██████████| 157/157 [00:04<00:00, 35.47it/s]


accuracy: 0.8803 val_accuracy: 0.8324
Epoch 6:


100%|██████████| 625/625 [00:37<00:00, 16.79it/s]
100%|██████████| 157/157 [00:04<00:00, 35.82it/s]


accuracy: 0.9072 val_accuracy: 0.8658
Epoch 7:


100%|██████████| 625/625 [00:37<00:00, 16.83it/s]
100%|██████████| 157/157 [00:04<00:00, 35.84it/s]


accuracy: 0.9331 val_accuracy: 0.8668
Epoch 8:


100%|██████████| 625/625 [00:36<00:00, 16.93it/s]
100%|██████████| 157/157 [00:04<00:00, 36.59it/s]


accuracy: 0.9520 val_accuracy: 0.8662
Epoch 9:


100%|██████████| 625/625 [00:37<00:00, 16.72it/s]
100%|██████████| 157/157 [00:04<00:00, 36.22it/s]


accuracy: 0.9654 val_accuracy: 0.8640
Epoch 10:


100%|██████████| 625/625 [00:36<00:00, 16.95it/s]
100%|██████████| 157/157 [00:04<00:00, 36.43it/s]

accuracy: 0.9748 val_accuracy: 0.8612





In [35]:
acc_test, _ = evaluate(test_dl, loss_fn)
print(f'test_accuracy: {acc_test:.4f}') 

100%|██████████| 782/782 [00:21<00:00, 36.21it/s]

test_accuracy: 0.8515



