In [None]:
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt

# TensorFlow/Keras imports
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense

# PyTorch & Hugging Face imports
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


: 

In [2]:
# Sanity check: report whether each framework can see a GPU.
# A non-empty device list from TensorFlow means at least one GPU was found.
print("TensorFlow recognizes GPU:", bool(tf.config.list_physical_devices('GPU')))
print("PyTorch recognizes GPU:", torch.cuda.is_available())

TensorFlow recognizes GPU: False
PyTorch recognizes GPU: True


In [None]:
# Load the "imdb" dataset through Hugging Face `datasets`. Later cells read its
# "train" and "test" splits and use the "text" and "label" columns.
dataset = load_dataset("imdb")

In [None]:
max_words = 20000  # maximum number of words to consider
max_len = 200      # maximum review length (in tokens)

# Read the training texts once; they are needed both to fit the vocabulary
# and to build the training sequences.
train_texts = dataset["train"]["text"]

# Fit the tokenizer on training data only, so the vocabulary is not
# influenced by the test split.
tokenizer_keras = Tokenizer(num_words=max_words)
tokenizer_keras.fit_on_texts(train_texts)

# Tokenize and pad training and test texts to a fixed length of `max_len`
X_train = tokenizer_keras.texts_to_sequences(train_texts)
X_train = pad_sequences(X_train, maxlen=max_len)
y_train = np.array(dataset["train"]["label"])

X_test = tokenizer_keras.texts_to_sequences(dataset["test"]["text"])
X_test = pad_sequences(X_test, maxlen=max_len)
y_test = np.array(dataset["test"]["label"])

# Build the baseline model: embedding -> 1D convolution -> max pooling -> LSTM
# -> single sigmoid unit for the binary label.
baseline_model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

baseline_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
baseline_model.summary()

# Request the GPU only when TensorFlow actually detects one; otherwise name the
# CPU explicitly instead of pinning to a '/GPU:0' device that does not exist
# and relying on TensorFlow to fall back silently.
train_device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

# NOTE(review): the test split doubles as validation data here, so
# `val_accuracy` is not an independent estimate if it is used for tuning.
with tf.device(train_device):
    history_baseline = baseline_model.fit(
        X_train, y_train,
        batch_size=32,
        epochs=5,
        validation_data=(X_test, y_test)
    )

In [None]:
# Pre-trained tokenizer matching the 'bert-base-uncased' checkpoint
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the BERT tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens.

    Args:
        examples: mapping with a "text" entry (a batch when used with
            ``dataset.map(..., batched=True)``).

    Returns:
        Whatever ``bert_tokenizer`` returns for that text.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return bert_tokenizer(examples["text"], **encode_options)

# Apply the tokenizer to the whole `dataset` object in batches, then drop the
# original "text" column, which is no longer needed for training.
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(["text"])
# Have the datasets return PyTorch tensors
tokenized_datasets.set_format("torch")

# Splits handed to the Trainer
train_dataset, test_dataset = tokenized_datasets["train"], tokenized_datasets["test"]

# Load pre-trained BERT for sequence classification (binary classification: num_labels=2)
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


def compute_accuracy(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Without a metric function the Trainer reports only the evaluation loss,
    which cannot be compared with the accuracy tracked for the Keras baseline.

    Args:
        eval_pred: object supplied by ``Trainer`` that unpacks into
            ``(predictions, label_ids)``.

    Returns:
        dict: ``{"accuracy": fraction of examples whose highest-scoring class
        equals the label}``.
    """
    logits, labels = eval_pred
    # Some models return several prediction arrays; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)
    return {"accuracy": float((predicted == labels).mean())}


# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword is rejected by recent ones. Pick whichever
# name the installed version declares so the cell runs on both.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

# Define training arguments.
# No device flag is passed: the Trainer uses the GPU automatically when PyTorch
# can see one (the previous `no_cuda=False` was the default and is deprecated).
training_args = TrainingArguments(
    output_dir="./bert_results",
    num_train_epochs=2,               # adjust epochs as needed
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    logging_dir='./bert_logs',
    **{eval_strategy_key: "epoch"},   # evaluate at the end of every epoch
)

# Define the Trainer for BERT
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_accuracy,
)

# Train the BERT model. Nothing is frozen in this cell, so every weight
# (encoder and classification head) is updated.
trainer.train()


In [None]:
# Mark every parameter of the BERT encoder as trainable.
# NOTE(review): no cell visible here freezes these weights beforehand, so this
# looks like a no-op unless they were frozen elsewhere; confirm.
bert_model.bert.requires_grad_(True)

# Run a further training pass with the same Trainer and model
training_args.num_train_epochs = 2  # Additional epochs
trainer.train()

# Score the model on the Trainer's evaluation dataset and show the metrics
results = trainer.evaluate()
print("Evaluation results:", results)

In [None]:
# Training vs. validation accuracy of the Keras baseline, one point per epoch.
fig, ax = plt.subplots()
for history_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Val Accuracy'),
):
    ax.plot(history_baseline.history[history_key], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Baseline Model Accuracy')
ax.legend()
plt.show()