In [3]:
# For auto-loading backend
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Imports

In [4]:
# ======================================
# Imports
# ======================================

from backend import (
    load_data,
    tokenize,
    build_vocab,
    prepare_datasets,
    build_model,
    train_model,
    evaluate,
    predict_sentiment,
)

  from .autonotebook import tqdm as notebook_tqdm


### Load data and build vocabulary

In [5]:
# ======================================
# Load data and build vocabulary
# ======================================

# Load the raw train and test splits via the backend and print a summary of
# each. `train_raw` and `test_raw` are reused by the cells below.
train_raw, test_raw = load_data()
print(train_raw, test_raw)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
}) Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


In [6]:
# Tokenize the training split only; this copy exists just to build the
# vocabulary. `batched=False` asks `map` to apply `tokenize` per example
# rather than per batch.
tokenized_train_for_vocab = train_raw.map(tokenize, batched=False)
print(tokenized_train_for_vocab)

Map: 100%|██████████| 25000/25000 [00:02<00:00, 8437.16 examples/s]

Dataset({
    features: ['text', 'label', 'tokens'],
    num_rows: 25000
})





In [7]:
# Build the vocabulary from the tokenized training split. `build_vocab`
# returns the vocabulary together with its special-token entries, both of
# which are passed to the encoding and prediction cells below.
vocab, special_tokens = build_vocab(tokenized_train_for_vocab)

# Quick sanity check of what was built.
print(f"Vocab size: {len(vocab)}")
print("Special tokens:", special_tokens)


Vocab size: 80343
Special tokens: {'pad': 0, 'unk': 1}


### Prepare encoded datasets

In [8]:
# ======================================
# Prepare encoded datasets
# ======================================

# Bundle the raw splits as a (train, test) pair.
raw_datasets = (train_raw, test_raw)

# Encode both splits using the vocabulary and special tokens built above;
# the results are unpacked in the same (train, test) order.
train_dataset, test_dataset = prepare_datasets(
    raw_datasets=raw_datasets,
    vocab=vocab,
    special_tokens=special_tokens,
)

# Print one summary per line, train split first.
print(train_dataset, test_dataset, sep="\n")


Map: 100%|██████████| 25000/25000 [00:02<00:00, 8821.54 examples/s]
Map: 100%|██████████| 25000/25000 [00:04<00:00, 5372.29 examples/s]
Map: 100%|██████████| 25000/25000 [00:04<00:00, 5301.57 examples/s]

Dataset({
    features: ['text', 'label', 'tokens', 'input_ids'],
    num_rows: 25000
})
Dataset({
    features: ['text', 'label', 'tokens', 'input_ids'],
    num_rows: 25000
})





### Build and train the model

In [9]:
# ======================================
# Build and train the model
# ======================================

# Create the model, passing the size of the vocabulary built above.
model = build_model(vocab_size=len(vocab))

In [10]:
# Train on the encoded training split and rebind `model` to the result.
# NOTE(review): `model` is both the input and the output of this cell, so
# re-running only this cell may continue training an already-trained model
# (presumably `train_model` updates and returns the same object; TODO confirm).
# Re-run the `build_model` cell first to start from an untrained model.
model = train_model(
    model=model,
    train_dataset=train_dataset,
)

Epoch 1/5, Loss: 0.6742
Epoch 2/5, Loss: 0.5719
Epoch 3/5, Loss: 0.3542
Epoch 4/5, Loss: 0.2445
Epoch 5/5, Loss: 0.1602


### Evaluate accuracy on the test set

In [11]:
# ======================================
# Evaluate on the test set
# ======================================

# Score the trained model on the encoded test split.
test_accuracy = evaluate(
    model=model,
    test_dataset=test_dataset,
)

# Print the accuracy between `***` marker lines so it stands out in the output.
print("\n***")
print("Final test accuracy:", test_accuracy)
print("***")



***
Final test accuracy: 0.83136
***


### Predict sentiment for custom review text

In [12]:
# ======================================
# Predict sentiment for custom review text
# ======================================

# Hand-written reviews for a qualitative spot check. No labels are attached,
# so the printed predictions are meant to be read, not scored.
examples = [
    "This movie was absolutely wonderful, I loved every minute of it.",
    "The film was boring and a complete waste of time.",
    "Not bad, but the plot was a bit weak in the middle.",
    "Good movie.",
    "I enjoyed it.",
    "It was terrible.",
    "Acting was breathtaking.",
]

# Classify each review and print the predicted label next to its text.
for text in examples:
    pred = predict_sentiment(
        model=model,
        text=text,
        vocab=vocab,
        special_tokens=special_tokens,
    )
    # `:9s` pads the label to 9 characters so the `<--|` markers line up.
    print(f"{pred:9s} <--| {text}")

positive  <--| This movie was absolutely wonderful, I loved every minute of it.
negative  <--| The film was boring and a complete waste of time.
negative  <--| Not bad, but the plot was a bit weak in the middle.
positive  <--| Good movie.
positive  <--| I enjoyed it.
negative  <--| It was terrible.
positive  <--| Acting was breathtaking.
