In [1]:
import torch
import torch.nn as nn
from data_download import load_imdb

In [2]:
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = load_imdb(final=False)

In [3]:
print([i2w[w] for w in x_train[141]])

['possibly', 'the', 'best', 'movie', 'ever', 'created', 'in', 'the', 'history', 'of', 'jeffrey', 'combs', 'career', 'and', 'one', 'that', 'should', 'be', 'looked', 'upon', 'by', 'all', 'talent', 'in', 'hollywood', 'for', 'his', 'versatility', 'charisma', 'and', 'uniqueness', 'he', 'brings', 'through', 'his', 'characters', 'and', 'his', 'knowledge', 'of', 'acting']


In [4]:
max_token = max([len(x) for x in x_train])
max_token

2514

In [38]:
batched_x = []
batched_y = []
curr_batch_x = []
curr_batch_y = []
curr_batch_size = 0
for i in reversed(range(len(x_train))):
    curr = x_train[i]
    curr_y = y_train[i]
    if curr_batch_size + len(curr) > max_token:
        batch_max = max([len(x) for x in curr_batch_x])
        curr_batch_x = [instance + [w2i[".pad"]] * (batch_max - len(instance)) for instance in curr_batch_x]
        batched_x.append(curr_batch_x)
        batched_y.append(curr_batch_y)
        curr_batch_x = [curr]
        curr_batch_y = [curr_y]
        curr_batch_size = len(curr)
    elif curr_batch_size + len(curr) == max_token:
        curr_batch_x.append(curr)
        curr_batch_y.append(curr_y)
        batch_max = max([len(x) for x in curr_batch_x])
        curr_batch_x = [instance + [w2i[".pad"]] * (batch_max - len(instance)) for instance in curr_batch_x]
        batched_x.append(curr_batch_x)
        batched_y.append(curr_batch_y)
        curr_batch_x = []
        curr_batch_y = []
        curr_batch_size = 0
    else:
        curr_batch_x.append(curr)
        curr_batch_y.append(curr_y)
        curr_batch_size += len(curr)

if curr_batch_x:
    batch_max = max(len(x) for x in curr_batch_x)
    padded_x = [x + [w2i[".pad"]] * (batch_max - len(x)) for x in curr_batch_x]
    batched_x.append(padded_x)
    batched_y.append(curr_batch_y)

In [39]:
for i in range(len(batched_x)):
    print(f"Batch size: {len(batched_x[i])}")
    print(f"Max length of the batch: {max([len(x) for x in batched_x[i]])}")
    print(f"Token per instance: {[len(batched_x[i][x]) for x in range(len(batched_x[i]))]}")

Batch size: 1
Max length of the batch: 2514
Token per instance: [2514]
Batch size: 1
Max length of the batch: 1853
Token per instance: [1853]
Batch size: 1
Max length of the batch: 1773
Token per instance: [1773]
Batch size: 1
Max length of the batch: 1646
Token per instance: [1646]
Batch size: 1
Max length of the batch: 1587
Token per instance: [1587]
Batch size: 1
Max length of the batch: 1568
Token per instance: [1568]
Batch size: 1
Max length of the batch: 1555
Token per instance: [1555]
Batch size: 1
Max length of the batch: 1427
Token per instance: [1427]
Batch size: 1
Max length of the batch: 1422
Token per instance: [1422]
Batch size: 1
Max length of the batch: 1392
Token per instance: [1392]
Batch size: 1
Max length of the batch: 1341
Token per instance: [1341]
Batch size: 1
Max length of the batch: 1316
Token per instance: [1316]
Batch size: 1
Max length of the batch: 1279
Token per instance: [1279]
Batch size: 2
Max length of the batch: 1240
Token per instance: [1240, 1240]


In [50]:
tensor_x = [torch.tensor(batch, dtype=torch.long) for batch in batched_x]
tensor_y = [torch.tensor(batch, dtype=torch.long) for batch in batched_y]

In [57]:
class BaseLineClassifier(nn.Module):
    def __init__(self, vocab_size, k):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, k)
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(k, 2)
    def forward(self, x):
        x = self.emb(x)
        x = x.transpose(1, 2)
        x = self.global_pool(x)
        x = x.squeeze(2)
        x = self.fc(x)
        return x

In [58]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BaseLineClassifier(len(w2i), 100)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [62]:
def accuracy(pred, y):
    pred = torch.argmax(pred, dim=1)
    correct = (pred == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [63]:
for epoch in range(100):
    for i, (x, y) in enumerate(zip(tensor_x, tensor_y)):
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}, accuracy: {accuracy(output, y).item()}")

Epoch 1, Loss: 0.44603610038757324, accuracy: 0.8235294222831726
Epoch 2, Loss: 0.35875800251960754, accuracy: 0.8529411554336548
Epoch 3, Loss: 0.29186657071113586, accuracy: 0.8823529481887817
Epoch 4, Loss: 0.2415034919977188, accuracy: 0.9117646813392639
Epoch 5, Loss: 0.20938074588775635, accuracy: 0.9411764740943909


KeyboardInterrupt: 