In [1]:
# Install required packages if not already installed. Besides the first three,
# later cells also rely on PyTorch (BERT fine-tuning) and matplotlib (plotting).
# %pip install datasets transformers tensorflow torch matplotlib

from datasets import load_dataset
import tensorflow as tf
import numpy as np

# Load the IMDB dataset. Later cells read the "text" and "label" columns of
# its "train" and "test" splits.
dataset = load_dataset("imdb")


In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 20000  # vocabulary size handed to the Keras tokenizer
max_len = 200      # every review is padded / cut to this many tokens

# The vocabulary is learned from the training reviews only.
tokenizer_keras = Tokenizer(num_words=max_words)
tokenizer_keras.fit_on_texts(dataset["train"]["text"])


def _encode_split(split_name):
    """Turn one split of ``dataset`` into model-ready arrays.

    Args:
        split_name: Key of the split inside ``dataset`` ("train" or "test").

    Returns:
        A ``(features, labels)`` pair: the reviews converted to integer
        sequences by ``tokenizer_keras`` and passed through ``pad_sequences``
        with ``maxlen=max_len``, plus the labels as a NumPy array.
    """
    split = dataset[split_name]
    token_ids = tokenizer_keras.texts_to_sequences(split["text"])
    features = pad_sequences(token_ids, maxlen=max_len)
    labels = np.array(split["label"])
    return features, labels


X_train, y_train = _encode_split("train")
X_test, y_test = _encode_split("test")


In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense

# Baseline: embedding -> 1D convolution -> max pooling -> LSTM -> sigmoid unit.
# Layers are stacked one at a time, in the order the data flows through them.
baseline_model = Sequential()
baseline_model.add(Embedding(max_words, 128, input_length=max_len))
baseline_model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
baseline_model.add(MaxPooling1D(pool_size=4))
baseline_model.add(LSTM(64))
baseline_model.add(Dense(1, activation='sigmoid'))  # single probability output

baseline_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
baseline_model.summary()

# Fit the baseline. NOTE: the test split is passed as validation data, so the
# recorded 'val_accuracy' is the per-epoch accuracy on the test set.
history_baseline = baseline_model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32,
)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          2560000   
                                                                 
 conv1d (Conv1D)             (None, 196, 64)           41024     
                                                                 
 max_pooling1d (MaxPooling1  (None, 49, 64)            0         
 D)                                                              
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2634113 (10.05 MB)
Trainable params: 2634113 (10.05 MB)
Non-trainable params: 0 (0.00 Byte)
________________

In [None]:
from datasets import DatasetDict
from transformers import BertTokenizer

# Load BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Token budget per review for BERT (shorter reviews are padded, longer ones cut).
bert_max_len = 128


def tokenize_function(examples):
    """Tokenize a batch of reviews with the BERT tokenizer.

    Args:
        examples: Batch passed in by ``map(..., batched=True)``; its "text"
            entry holds the raw review strings.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to ``bert_max_len`` tokens.
    """
    return bert_tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=bert_max_len,
    )


# Only the "train" and "test" splits are used afterwards. Mapping the whole
# DatasetDict would also tokenize the remaining 50,000-example split (the
# unlabeled "unsupervised" one), doubling the work for data nobody reads.
labelled_splits = DatasetDict({name: dataset[name] for name in ("train", "test")})

# Tokenize the datasets (batched processing)
tokenized_datasets = labelled_splits.map(tokenize_function, batched=True)
# Remove the original text column to speed up training
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
# Set the format to PyTorch tensors
tokenized_datasets.set_format("torch")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Pull the two tokenized splits out of the dict under shorter names.
train_dataset, test_dataset = (
    tokenized_datasets[split] for split in ("train", "test")
)


In [None]:
import inspect

from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained BERT for sequence classification (binary classification: num_labels=2)
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


def compute_metrics(eval_pred):
    """Compute classification accuracy for one Trainer evaluation pass.

    Without this hook the Trainer reports only the evaluation loss, which
    cannot be compared with the accuracy tracked for the Keras baseline.

    Args:
        eval_pred: Object supplied by the Trainer, exposing ``predictions``
            (the model logits, or a tuple whose first item is the logits)
            and ``label_ids`` (the reference labels).

    Returns:
        ``{"accuracy": <fraction of examples whose arg-max class equals the label>}``.
    """
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predicted == eval_pred.label_ids))}


# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old keyword; pass whichever name this install accepts.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_results",
    num_train_epochs=2,               # adjust epochs as needed
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    logging_dir='./bert_logs',
    **{eval_strategy_arg: "epoch"},   # evaluate at the end of every epoch
)

# Define the Trainer for BERT
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the BERT model (initially, you could freeze BERT layers here if desired)
trainer.train()


In [None]:
# Full fine-tuning: flag every parameter of the BERT encoder as trainable.
# (Nothing in the cells shown here freezes them, so this acts as a safeguard.)
bert_model.bert.requires_grad_(True)

# Run the same Trainer again for a further round of epochs.
training_args.num_train_epochs = 2  # Additional epochs for fine-tuning
trainer.train()


In [None]:
# Run an evaluation pass with the Trainer (no dataset is passed here, so it
# uses the eval dataset the Trainer was constructed with) and print the
# returned metrics.
results = trainer.evaluate()
print(results)


In [None]:
import matplotlib.pyplot as plt

# Per-epoch accuracy curves recorded while fitting the baseline model.
fig, ax = plt.subplots()
for history_key, curve_label in (
    ('accuracy', 'train accuracy'),
    ('val_accuracy', 'val accuracy'),
):
    ax.plot(history_baseline.history[history_key], label=curve_label)

ax.set_title('Baseline Model Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()
