# Sentiment Analysis with Transformers: IMDB Movie Reviews

# 1. BERT

In [22]:
import tensorflow as tf
import tensorflow_datasets as tfds
from transformers import BertTokenizerFast, TFBertForSequenceClassification, create_optimizer

In [23]:
# Hyperparameters read by the tokenization, batching and training cells
EPOCHS = 1        # passed to model.fit and used to size the optimizer schedule
MAX_LENGTH = 64   # tokenizer max_length (inputs are truncated/padded to this)
BATCH_SIZE = 8    # batch size applied to both datasets

In [24]:
# Fetch the IMDB reviews splits from TFDS together with the dataset metadata
dataset, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)

In [25]:
# Keep only the first 5000 training examples and the first 1000 test examples
train_dataset = dataset['train'].take(5000)
test_dataset = dataset['test'].take(1000)

In [26]:
# Load the fast tokenizer that belongs to the "bert-base-uncased" checkpoint
tokenizer = BertTokenizerFast.from_pretrained(
    "bert-base-uncased",
)

In [27]:
# Eager tokenization of a single (text, label) example
def encode_example(text, label):
    """Tokenize one example with the module-level ``tokenizer``.

    Args:
        text: Object whose ``.numpy()`` returns the UTF-8 encoded bytes of the text.
        label: Returned unchanged as the last element of the result.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, label)``; the first
        three are taken from the tokenizer output, which is truncated and padded
        to ``MAX_LENGTH``.
    """
    review = text.numpy().decode('utf-8')
    encoding = tokenizer(
        review,
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
    )
    return (
        encoding['input_ids'],
        encoding['attention_mask'],
        encoding['token_type_ids'],
        label,
    )

In [28]:
# Graph-compatible wrapper: runs `encode_example` through `tf.py_function`
def encode_fn(text, label):
    """Encode one (text, label) pair via ``encode_example`` inside ``tf.py_function``.

    Args:
        text: Text tensor forwarded to ``encode_example``.
        label: Label tensor forwarded to ``encode_example``.

    Returns:
        ``(features, label)`` where ``features`` is a dict with the keys
        ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'`` (each
        declared as int32 with static shape ``[MAX_LENGTH]``) and ``label`` is
        declared as an int64 scalar.
    """
    ids, mask, type_ids, encoded_label = tf.py_function(
        func=encode_example,
        inp=[text, label],
        Tout=[tf.int32, tf.int32, tf.int32, tf.int64],
    )

    # Declare the static shapes explicitly on the py_function outputs.
    for sequence_tensor in (ids, mask, type_ids):
        sequence_tensor.set_shape([MAX_LENGTH])
    encoded_label.set_shape([])

    features = {
        'input_ids': ids,
        'attention_mask': mask,
        'token_type_ids': type_ids,
    }
    return features, encoded_label

In [29]:
# Apply the encoding function to both datasets.
# `tf.data.AUTOTUNE` is the current name of the deprecated
# `tf.data.experimental.AUTOTUNE` alias used previously.
train_dataset = train_dataset.map(encode_fn, num_parallel_calls=tf.data.AUTOTUNE)
test_dataset = test_dataset.map(encode_fn, num_parallel_calls=tf.data.AUTOTUNE)

In [30]:
# Prepare the datasets for training: only the training data is shuffled; both
# datasets are batched and prefetched.
# `tf.data.AUTOTUNE` is the current name of the deprecated
# `tf.data.experimental.AUTOTUNE` alias used previously.
train_dataset = (
    train_dataset
    .shuffle(10000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [31]:
# Instantiate BERT with a two-label sequence-classification head
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Build the optimizer and its learning-rate schedule, sized to the full run
steps_per_epoch = len(train_dataset)  # len() of the batched training dataset
num_train_steps = EPOCHS * steps_per_epoch
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
)

In [33]:
# Compile with sparse categorical cross-entropy on logits and track accuracy
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
    optimizer=optimizer,
)

In [34]:
# Fine-tune for EPOCHS epochs, using test_dataset as the validation data
history = model.fit(train_dataset, epochs=EPOCHS, validation_data=test_dataset)



In [35]:
# Score the fine-tuned model on the test set and report loss and accuracy
results = model.evaluate(test_dataset)
test_loss = results[0]
test_accuracy = results[1]
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

Test Loss: 0.41463029384613037, Test Accuracy: 0.8059999942779541


# 2. DistilBERT

In [41]:
import tensorflow as tf
import tensorflow_datasets as tfds
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification, create_optimizer

In [42]:
# Hyperparameters read by the tokenization, batching and training cells
EPOCHS = 1        # passed to model.fit and used to size the optimizer schedule
MAX_LENGTH = 64   # tokenizer max_length (inputs are truncated/padded to this)
BATCH_SIZE = 8    # batch size applied to both datasets

In [43]:
# Fetch the IMDB reviews splits from TFDS together with the dataset metadata
dataset, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)

In [44]:
# Keep only the first 1000 test examples and the first 5000 training examples
test_dataset = dataset['test'].take(1000)
train_dataset = dataset['train'].take(5000)

In [45]:
# Load the fast tokenizer that belongs to the "distilbert-base-uncased" checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased",
)

In [46]:
# Eager tokenization of a single (text, label) example
def encode_example(text, label):
    """Tokenize one example with the module-level ``tokenizer``.

    Args:
        text: Object whose ``.numpy()`` returns the UTF-8 encoded bytes of the text.
        label: Returned unchanged as the last element of the result.

    Returns:
        Tuple ``(input_ids, attention_mask, label)``; the first two are taken
        from the tokenizer output, which is truncated and padded to
        ``MAX_LENGTH``.
    """
    review = text.numpy().decode('utf-8')
    encoding = tokenizer(
        review,
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
    )
    return (
        encoding['input_ids'],
        encoding['attention_mask'],
        label,
    )

In [47]:
# Graph-compatible wrapper: runs `encode_example` through `tf.py_function`
def encode_fn(text, label):
    """Encode one (text, label) pair via ``encode_example`` inside ``tf.py_function``.

    Args:
        text: Text tensor forwarded to ``encode_example``.
        label: Label tensor forwarded to ``encode_example``.

    Returns:
        ``(features, label)`` where ``features`` is a dict with the keys
        ``'input_ids'`` and ``'attention_mask'`` (each declared as int32 with
        static shape ``[MAX_LENGTH]``) and ``label`` is declared as an int64
        scalar.
    """
    ids, mask, encoded_label = tf.py_function(
        func=encode_example,
        inp=[text, label],
        Tout=[tf.int32, tf.int32, tf.int64],
    )

    # Declare the static shapes explicitly on the py_function outputs.
    for sequence_tensor in (ids, mask):
        sequence_tensor.set_shape([MAX_LENGTH])
    encoded_label.set_shape([])

    features = {
        'input_ids': ids,
        'attention_mask': mask,
    }
    return features, encoded_label

In [48]:
# Apply the encoding function to both datasets.
# `tf.data.AUTOTUNE` is the current name of the deprecated
# `tf.data.experimental.AUTOTUNE` alias used previously.
train_dataset = train_dataset.map(encode_fn, num_parallel_calls=tf.data.AUTOTUNE)
test_dataset = test_dataset.map(encode_fn, num_parallel_calls=tf.data.AUTOTUNE)

In [49]:
# Prepare the datasets for training: only the training data is shuffled; both
# datasets are batched and prefetched.
# `tf.data.AUTOTUNE` is the current name of the deprecated
# `tf.data.experimental.AUTOTUNE` alias used previously.
train_dataset = (
    train_dataset
    .shuffle(10000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [50]:
# Instantiate DistilBERT with a two-label sequence-classification head
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [51]:
# Build the optimizer and its learning-rate schedule, sized to the full run
steps_per_epoch = len(train_dataset)  # len() of the batched training dataset
num_train_steps = EPOCHS * steps_per_epoch
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
)

In [52]:
# Compile with sparse categorical cross-entropy on logits and track accuracy
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
    optimizer=optimizer,
)

In [53]:
# Fine-tune for EPOCHS epochs, using test_dataset as the validation data
history = model.fit(train_dataset, epochs=EPOCHS, validation_data=test_dataset)



In [54]:
# Score the fine-tuned model on the test set and report loss and accuracy
results = model.evaluate(test_dataset)
test_loss = results[0]
test_accuracy = results[1]
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

Test Loss: 0.44413572549819946, Test Accuracy: 0.781000018119812


#### BERT accuracy is around 80% and DistilBERT accuracy is around 78%. BERT is the better model here because of its bidirectional understanding of context. DistilBERT offers similar performance to BERT but with reduced computational cost.