In [3]:
from datasets import load_dataset

# Load IMDb dataset from HuggingFace
dataset = load_dataset("imdb")


In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # we just use the tokenizer, not BERT

MAX_LEN = 200

def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=MAX_LEN)

encoded_dataset = dataset.map(tokenize, batched=True)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
# Show a sample input review
sample_text = dataset["train"][0]["text"]
print("Sample review text:")
print(sample_text)

# Tokenize the sample text
sample_tokens = tokenizer(sample_text, padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt")
print("\nTokenized input_ids shape:", sample_tokens["input_ids"].shape)
print("First 10 token IDs:", sample_tokens["input_ids"][0][:10])


Sample review text:
I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few 

In [7]:
# Use the tokenizer to encode text
encoded = tokenizer("This movie was a total waste of time.", 
                    padding="max_length", truncation=True, max_length=20, return_tensors="pt")

# Extract input IDs
input_ids = encoded["input_ids"][0]

# Convert to tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids)

print("Token IDs:", input_ids.tolist())
print("Tokens:   ", tokens)


Token IDs: [101, 2023, 3185, 2001, 1037, 2561, 5949, 1997, 2051, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Tokens:    ['[CLS]', 'this', 'movie', 'was', 'a', 'total', 'waste', 'of', 'time', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [8]:
decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True)
print("Decoded back to text:", decoded_text)


Decoded back to text: this movie was a total waste of time.


In [16]:
train_encodings = {
    "input_ids": encoded_dataset["train"]["input_ids"],
    "attention_mask": encoded_dataset["train"]["attention_mask"]
}
train_labels = encoded_dataset["train"]["label"]

test_encodings = {
    "input_ids": encoded_dataset["test"]["input_ids"],
    "attention_mask": encoded_dataset["test"]["attention_mask"]
}
test_labels = encoded_dataset["test"]["label"]


In [17]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [18]:
train_dataset = IMDbDataset(train_encodings, train_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [14]:
import torch.nn as nn

class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMSentiment, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        _, (hidden, _) = self.lstm(embedded)
        out = self.fc(hidden[-1])
        return out


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

VOCAB_SIZE = tokenizer.vocab_size
model = LSTMSentiment(VOCAB_SIZE, 128, 256, 2).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1, Loss: 0.6940
Epoch 2, Loss: 0.6903
Epoch 3, Loss: 0.5630


In [20]:
from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {acc:.4f}")


Test Accuracy: 0.7946


In [22]:
# Show predictions on 5 test samples
print("\nSample predictions:")
label_map = {0: "negative", 1: "positive"}

for i in range(15):
    text = dataset["test"][i]["text"]
    true_label = dataset["test"][i]["label"]

    # Tokenize single example
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt").to(device)
    outputs = model(inputs["input_ids"])
    pred = torch.argmax(outputs, dim=1).item()

    print(f"\nReview: {text[:200]}...")
    print(f"True label: {label_map[true_label]}")
    print(f"Predicted label: {label_map[pred]}")



Sample predictions:

Review: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Bab...
True label: negative
Predicted label: positive

Review: Worth the entertainment value of a rental, especially if you like action movies. This one features the usual car chases, fights with the great Van Damme kick style, shooting battles with the 40 shell ...
True label: negative
Predicted label: negative

Review: its a totally average film with a few semi-alright action sequences that make the plot seem a little better and remind the viewer of the classic van dam films. parts of the plot don't make sense and s...
True label: negative
Predicted label: negative

Review: STAR RATING: ***** Saturday Night **** Friday Night *** Friday Morning ** Sunday Night * Monday Morning <br /><br />Former New Orleans homicide cop Jack Robideaux (Jean Claude Van Damme) is 