In [1]:
# Install the notebook's dependencies. Versions are not pinned; the captured
# output below shows this run resolved transformers 4.27.3.
!pip install transformers pandas torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [3]:
# Load the tokenizer for the same 'bert-large-uncased' checkpoint name that the
# classification model is later loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

In [4]:
# Download the IMDB review archive into the working directory and unpack it.
# The loader defined below reads from ./aclImdb, so the archive is presumably
# expected to extract to that folder.
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

import os

# Define a function to load the IMDB dataset from disk
def _read_imdb_split(split_dir):
    """Read every ``.txt`` review under ``split_dir/neg`` and ``split_dir/pos``.

    Args:
        split_dir: Directory containing the ``neg`` and ``pos`` sub-folders.

    Returns:
        A ``(texts, labels)`` pair of parallel lists. ``texts`` holds the file
        contents, ``labels`` holds 0 for files from ``neg`` and 1 for ``pos``.
        All ``neg`` reviews come first, each folder in sorted filename order.
    """
    texts = []
    labels = []
    for label, label_type in enumerate(('neg', 'pos')):
        dir_name = os.path.join(split_dir, label_type)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # resulting lists (and any seeded shuffle/split applied to them later)
        # reproducible across machines and filesystems.
        for fname in sorted(os.listdir(dir_name)):
            if fname.endswith('.txt'):
                with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                    texts.append(f.read())
                labels.append(label)
    return texts, labels


def load_imdb_dataset(imdb_dir='aclImdb'):
    """Load the IMDB reviews from an extracted dataset folder.

    Args:
        imdb_dir: Root of the extracted dataset; must contain ``train`` and
            ``test`` folders, each with ``neg`` and ``pos`` sub-folders.
            Defaults to ``'aclImdb'`` in the current working directory.

    Returns:
        ``(train_texts, train_labels, test_texts, test_labels)`` where the
        text lists contain raw review strings and the label lists contain
        0 (negative) or 1 (positive).

    Raises:
        FileNotFoundError: If one of the expected folders does not exist.
    """
    train_texts, train_labels = _read_imdb_split(os.path.join(imdb_dir, 'train'))
    test_texts, test_labels = _read_imdb_split(os.path.join(imdb_dir, 'test'))
    return train_texts, train_labels, test_texts, test_labels

# Load the IMDB dataset from disk: parallel lists of raw review texts and
# integer labels (0 = neg, 1 = pos) for the train and test folders.
train_texts, train_labels, test_texts, test_labels = load_imdb_dataset()


--2023-03-23 21:12:54--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2023-03-23 21:12:58 (22.9 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [5]:
from sklearn.model_selection import train_test_split

# Combine the training and testing texts and labels
# (the dataset's own train/test folder split is discarded here).
texts = train_texts + test_texts
labels = train_labels + test_labels

# Split the data into training and testing sets: 20% is held out for testing,
# with a fixed random_state so the split is repeatable for a given input order.
# NOTE: this rebinds train_texts/test_texts/train_labels/test_labels, which are
# also this cell's inputs, so re-running the cell on its own re-splits data
# that has already been split.
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [6]:
# Define a function to tokenize the input data
def tokenize_data(texts, labels, tokenizer, max_len):
    """Encode texts into fixed-length id/mask tensors and tensorize the labels.

    Args:
        texts: Sequence of strings to encode.
        labels: Sequence of integer class labels, parallel to ``texts``.
        tokenizer: A callable HuggingFace-style tokenizer.
        max_len: Length every encoded sequence is padded or truncated to.

    Returns:
        ``(input_ids, attention_masks, labels)``. The first two are whatever
        the tokenizer returns under ``'input_ids'`` and ``'attention_mask'``
        for the whole batch (one row per text); ``labels`` is
        ``torch.tensor(labels)``.
    """
    # One batched call replaces the per-text encode_plus + torch.cat loop.
    # padding='max_length' is the supported spelling of the deprecated
    # pad_to_max_length=True, and truncation=True makes explicit the
    # truncation that the tokenizer previously applied with a warning.
    encoded = tokenizer(
        list(texts),
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels

# Tokenize the training and testing data (max_len=512 is forwarded to the
# tokenizer as max_length).
# NOTE: train_labels/test_labels are rebound here from Python lists to tensors.
train_input_ids, train_attention_masks, train_labels = tokenize_data(train_texts, train_labels, tokenizer, max_len=512)
test_input_ids, test_attention_masks, test_labels = tokenize_data(test_texts, test_labels, tokenizer, max_len=512)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
# Load the pre-trained BERT model and adjust the number of labels
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)

# Set the optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay is
# set to 0.0 explicitly because torch's default (0.01) differs from the 0.0
# default of the transformers implementation used previously.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

# The training loop calls scheduler.step() once per batch, so the schedule
# length must be (batches per epoch) * (epochs), not (samples) * 10.
# Keep these two values in sync with the batch size (16) and num_epochs (3)
# that train_model is actually run with.
BATCH_SIZE = 16
NUM_EPOCHS = 3
steps_per_epoch = -(-len(train_input_ids) // BATCH_SIZE)  # ceiling division: a partial last batch still counts
total_steps = steps_per_epoch * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [8]:
# !pip install GPUtil

# import torch
# from GPUtil import showUtilization as gpu_usage
# from numba import cuda

# def free_gpu_cache():
#     print("Initial GPU Usage")
#     gpu_usage()                             

#     torch.cuda.empty_cache()

#     cuda.select_device(0)
#     cuda.close()
#     cuda.select_device(0)

#     print("GPU Usage after emptying the cache")
#     gpu_usage()

# free_gpu_cache()       

In [9]:

# Define the loss function
# NOTE: train_model below creates its own CrossEntropyLoss internally, so this
# module-level loss_fn is not the object used by the training loop.
loss_fn = torch.nn.CrossEntropyLoss()

# Define a function to train the model
def _run_epoch(model, loader, loss_fn, device, optimizer=None, scheduler=None):
    """Run one full pass over ``loader`` and return ``(mean_batch_loss, accuracy)``.

    When ``optimizer`` is given the model is put in train mode and updated
    after every batch (followed by ``scheduler.step()`` if a scheduler is
    given). Otherwise the model is put in eval mode and gradients are disabled.

    ``mean_batch_loss`` is the sum of per-batch losses divided by the number of
    batches; ``accuracy`` is correct predictions divided by ``len(loader.dataset)``.
    """
    training = optimizer is not None
    model.train(training)

    total_loss, n_correct = 0.0, 0
    for input_ids, attention_masks, labels in loader:
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)

        if training:
            optimizer.zero_grad()

        # Gradients are only tracked while training; evaluation runs grad-free.
        with torch.set_grad_enabled(training):
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)
            logits = outputs.logits
            loss = loss_fn(logits, labels)

        if training:
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                # The learning-rate schedule advances once per batch.
                scheduler.step()

        total_loss += loss.item()
        n_correct += (logits.argmax(dim=1) == labels).sum().item()

    return total_loss / len(loader), n_correct / len(loader.dataset)


def train_model(model, train_input_ids, train_attention_masks, train_labels, test_input_ids, test_attention_masks, test_labels, optimizer, scheduler, num_epochs=3, batch_size=16):
    """Fine-tune ``model`` and print per-epoch training and validation metrics.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=None,
            attention_mask=...)`` whose output exposes ``.logits``.
        train_input_ids, train_attention_masks, train_labels: Training tensors
            with one row/entry per example.
        test_input_ids, test_attention_masks, test_labels: Tensors evaluated
            after every epoch and reported as "Validation".
        optimizer: Optimizer over the model's parameters; stepped once per batch.
        scheduler: Object with a ``step()`` method; stepped once per batch.
        num_epochs: Number of passes over the training data.
        batch_size: Batch size for both data loaders (default 16, the value
            that was previously hard-coded).

    Returns:
        None. The model is updated in place, moved to CUDA when available,
        and left in eval mode.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    train_data = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, train_labels)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

    test_data = torch.utils.data.TensorDataset(test_input_ids, test_attention_masks, test_labels)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

    loss_fn = torch.nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        train_loss, train_acc = _run_epoch(model, train_loader, loss_fn, device, optimizer, scheduler)
        eval_loss, eval_acc = _run_epoch(model, test_loader, loss_fn, device)

        print(f'Epoch {epoch+1}:')
        print(f'Training Loss: {train_loss:.3f} | Training Accuracy: {train_acc:.3f}')
        print(f'Validation Loss: {eval_loss:.3f} | Validation Accuracy: {eval_acc:.3f}\n')


In [10]:
# Fine-tune for 3 epochs. The held-out test tensors are passed as the evaluation
# set, so the "Validation" metrics printed below are measured on the test split.
train_model(model, train_input_ids, train_attention_masks, train_labels, test_input_ids, test_attention_masks, test_labels, optimizer, scheduler, num_epochs=3)

Epoch 1:
Training Loss: 0.216 | Training Accuracy: 0.912
Validation Loss: 0.182 | Validation Accuracy: 0.937

Epoch 2:
Training Loss: 0.112 | Training Accuracy: 0.961
Validation Loss: 0.145 | Validation Accuracy: 0.948

Epoch 3:
Training Loss: 0.066 | Training Accuracy: 0.978
Validation Loss: 0.177 | Validation Accuracy: 0.946



In [11]:
# Final evaluation: score the fine-tuned model on the held-out test tensors.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

test_data = torch.utils.data.TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_loader = torch.utils.data.DataLoader(dataset=test_data, batch_size=8, shuffle=False)

model.eval()
test_acc = 0.0
# No gradients are needed for scoring, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in test_loader:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        batch_input_ids, batch_attention_masks, batch_labels = (t.to(device) for t in batch)

        outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_attention_masks)
        logits = outputs.logits

        # Predicted class = index of the largest logit; count the matches.
        preds = logits.argmax(dim=1)
        test_acc += (preds == batch_labels).sum().item()

# Convert the running count of correct predictions into a fraction.
test_acc /= len(test_loader.dataset)

print(f'Test Accuracy: {test_acc:.3f}')


Test Accuracy: 0.946
