In [5]:
%pip install tensorflow_datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow_datasets
  Downloading tensorflow_datasets-4.9.3-py3-none-any.whl.metadata (9.3 kB)
Collecting array-record (from tensorflow_datasets)
  Downloading array_record-0.4.1-py39-none-any.whl.metadata (503 bytes)
Collecting dm-tree (from tensorflow_datasets)
  Downloading dm_tree-0.1.8-cp39-cp39-macosx_11_0_arm64.whl.metadata (1.9 kB)
Collecting etils>=0.9.0 (from etils[enp,epath,etree]>=0.9.0->tensorflow_datasets)
  Downloading etils-1.5.2-py3-none-any.whl.metadata (6.3 kB)
Collecting promise (from tensorflow_datasets)
  Downloading promise-2.3.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting tensorflow-metadata (from tensorflow_datasets)
  Downloading tensorflow_metadata-1.15.0-py3-none-any.whl.metadata (2.4 kB)
Collecting toml (from tensorflow_datasets)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting protobuf>=3.20 (from tensorflow_dataset

In [8]:
import tensorflow as tf
import torch
import tensorflow_datasets as tfds
import numpy as np
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [9]:
# Load IMDB dataset from TensorFlow Datasets
imdb_dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
train_data, test_data = imdb_dataset['train'], imdb_dataset['test']

# Prepare data
texts_train = []
labels_train = []
for text, label in train_data:
    texts_train.append(text.numpy().decode('utf-8'))
    labels_train.append(label.numpy())

texts_test = []
labels_test = []
for text, label in test_data:
    texts_test.append(text.numpy().decode('utf-8'))
    labels_test.append(label.numpy())

[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /Users/sofarooq/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Size...: 103 MiB [03:10,  1.85s/ MiB]url]                    
Dl Completed...: 2 url [03:10, 95.44s/ url] 
                                                                        

Dl Completed...: 100%|██████████| 1/1 [00:50<00:00, 50.02s/ url]

[1mDataset imdb_reviews downloaded and prepared to /Users/sofarooq/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


2024-04-25 08:46:18.741093: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-04-25 08:46:20.376874: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [10]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_len = 128  # maximum length of input sequence
input_ids_train = []
attention_masks_train = []

for text in tqdm(texts_train):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = max_len,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )
    input_ids_train.append(encoded_dict['input_ids'])
    attention_masks_train.append(encoded_dict['attention_mask'])

input_ids_train = torch.cat(input_ids_train, dim=0)
attention_masks_train = torch.cat(attention_masks_train, dim=0)
labels_train = torch.tensor(labels_train)

input_ids_test = []
attention_masks_test = []

for text in tqdm(texts_test):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = max_len,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )
    input_ids_test.append(encoded_dict['input_ids'])
    attention_masks_test.append(encoded_dict['attention_mask'])

input_ids_test = torch.cat(input_ids_test, dim=0)
attention_masks_test = torch.cat(attention_masks_test, dim=0)
labels_test = torch.tensor(labels_test)

tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 28.2kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 326kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:01<00:00, 431kB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 197kB/s]
  0%|          | 0/25000 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 25000/25000 [01:05<00:00, 382.75it/s]
100%|██████████| 25000/25000 [18:32<00:00, 22.47it/s]   


In [11]:
# Split data into train and validation sets
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids_train, labels_train, 
                                                            random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks_train, labels_train,
                                             random_state=42, test_size=0.1)

# Create DataLoader for efficient batching
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
epochs = 3

model.safetensors:  98%|█████████▊| 430M/440M [11:27<00:11, 895kB/s] Error while downloading from https://cdn-lfs.huggingface.co/bert-base-uncased/68d45e234eb4a928074dfd868cead0219ab85354cc53d20e772753c6bb9169d3?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1714276081&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxNDI3NjA4MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9iZXJ0LWJhc2UtdW5jYXNlZC82OGQ0NWUyMzRlYjRhOTI4MDc0ZGZkODY4Y2VhZDAyMTlhYjg1MzU0Y2M1M2QyMGU3NzI3NTNjNmJiOTE2OWQzP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=KLbcS8TmBT4EjOvNUp58upDWggUpDGioTrddIibYRUoBucIdixWcZCTe5uVdD1ThuJ5to3UT9YfIYXxE7-rQvcPxhMV7xppx%7Ea8BVGVk39lgCJSttzIQ8LqKdZ60hWxphm828EUXdZZvoCQdUlj8Vv4Z0LFAb1ZifzlDCIxUtLGHb3jBpDCQhrSI-qv4ekXVFFqQJ1DwpE6C5BoWhvFDtQrFr34aebk4hvqe-18opJoXg%7E5CtLfXxq9VtKjQ6NYwHKSfT5W4ftNLb1mGLP9s-c6SjIAhNAm6TA18555%7E4LdMZkwxFalz

In [12]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        model.zero_grad()        
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)            

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():        
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        tmp_eval_accuracy = np.sum(np.argmax(logits, axis=1) == label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Epoch {} - Train loss: {:.4f}, Validation Accuracy: {:.2f}%".format(epoch+1, avg_train_loss, 
                                                                               eval_accuracy/nb_eval_steps * 100))


KeyboardInterrupt: 