In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

In [2]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NUM_EPOCHS = 1

In [3]:
url = ("https://github.com/rasbt/machine-learning-book/raw/main/ch08/movie_data.csv.gz")
filename = url.split("/")[-1]

with open(filename, "wb") as f:
    r = requests.get(url)
    f.write(r.content)

with gzip.open('movie_data.csv.gz', 'rb') as f_in:
    with open('movie_data.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [4]:
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [5]:
small_train_texts = df.iloc[:5000]['review'].values
small_train_labels = df.iloc[:5000]['sentiment'].values

small_valid_texts = df.iloc[5000:6000]['review'].values
small_valid_labels = df.iloc[5000:6000]['sentiment'].values

small_test_texts = df.iloc[6000:7000]['review'].values
small_test_labels = df.iloc[6000:7000]['sentiment'].values

In [6]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
index = 5
encoding = tokenizer(small_train_texts[index])
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [7]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __getitem__(self, index):
        return self.texts[index], self.labels[index]

    def __len__(self):
        return len(self.labels)

In [8]:
import time
t = time.time()
def collate_batch(batch):
    _texts, _labels = zip(*batch)
    encoding = tokenizer(_texts, truncation=True, padding=True)
    _texts, _masks = torch.tensor(encoding['input_ids']), torch.tensor(encoding['attention_mask'])
    return _texts, _masks, torch.tensor(_labels)

small_trainset = IMDbDataset(small_train_texts, small_train_labels)
batch = [small_trainset[i] for i in range(5, 9)]
texts, masks, labels = collate_batch(batch)
time.time() - t

0.03586316108703613

In [9]:
small_train_set = IMDbDataset(small_train_texts, small_train_labels)
small_valid_set = IMDbDataset(small_valid_texts, small_valid_labels)
small_test_set = IMDbDataset(small_test_texts, small_test_labels)

batch_size = 8
train_dl = torch.utils.data.DataLoader(small_train_set, batch_size=batch_size, shuffle=True, num_workers=2, collate_fn=collate_batch)
valid_dl = torch.utils.data.DataLoader(small_valid_set, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=collate_batch)
test_dl = torch.utils.data.DataLoader(small_test_set, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=collate_batch)

In [None]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train()

optim = torch.optim.Adam(model.parameters(), lr=1e-4)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_accuracy(model, data_loader, device):
    with torch.no_grad():
        correct_pred, num_examples = 0, 0

        for batch_idx, (input_ids, attention_mask, labels) in enumerate(data_loader):

        ### Prepare data
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()

        return correct_pred.float()/num_examples * 100

In [None]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):

    model.train()

    for batch_idx, (input_ids, attention_mask, labels) in enumerate(train_dl):

        ### Prepare data
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        labels = labels.to(DEVICE)

        ### Forward
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs['loss'], outputs['logits']

        ### Backward
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging
        if not batch_idx % 250:
            print (f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                   f'Batch {batch_idx:04d}/{len(train_dl):04d} | '
                   f'Loss: {loss:.4f}')

    model.eval()

    with torch.set_grad_enabled(False):
        print(f'Training accuracy: '
              f'{compute_accuracy(model, train_dl, DEVICE):.2f}%'
              f'\nValid accuracy: '
              f'{compute_accuracy(model, valid_dl, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_dl, DEVICE):.2f}%')

Epoch: 0001/0001 | Batch 0000/0625 | Loss: 0.6770
Epoch: 0001/0001 | Batch 0250/0625 | Loss: 0.4107
Epoch: 0001/0001 | Batch 0500/0625 | Loss: 0.0604
Training accuracy: 91.14%
Valid accuracy: 87.00%
Time elapsed: 5.30 min
Total Training Time: 5.30 min
Test accuracy: 87.00%


In [None]:
model.save_pretrained('./distilbert-imdb')
tokenizer.save_pretrained('./distilbert-imdb')

('./distilbert-imdb/tokenizer_config.json',
 './distilbert-imdb/special_tokens_map.json',
 './distilbert-imdb/vocab.txt',
 './distilbert-imdb/added_tokens.json',
 './distilbert-imdb/tokenizer.json')

In [None]:
tokenizer(small_train_set[0], truncated=0)

In [None]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __getitem__(self, index):
        encoding = tokenizer(self.texts)
        return self.texts[index], self.labels[index]

    def __len__(self):
        return len(self.labels)

In [13]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model = model.to(DEVICE)
model

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [21]:
def collate_batch_2(batch):
    _texts, _labels = zip(*batch)
    encoding = tokenizer(_texts, truncation=True, padding=True)
    item = {'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask']),
            'labels': torch.tensor(_labels)}
    return item

In [22]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=100,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_set,
    optimizers=(optimizer, scheduler),
    data_collator=collate_batch_2,
)

trainer.args._n_gpu=1

In [23]:
t = time.time()
trainer.train()
print(f'Total training time: {(time.time() - t)/60:.2f} min')

Step,Training Loss
100,0.5573
200,0.5037
300,0.506
400,0.5074
500,0.4984
600,0.5035


Total training time: 4.17 min


In [24]:
model.save_pretrained('./distilbert-imdb')
tokenizer.save_pretrained('./distilbert-imdb')

('./distilbert-imdb/tokenizer_config.json',
 './distilbert-imdb/special_tokens_map.json',
 './distilbert-imdb/vocab.txt',
 './distilbert-imdb/added_tokens.json',
 './distilbert-imdb/tokenizer.json')

In [28]:
valid_dl = torch.utils.data.DataLoader(small_valid_set, batch_size=16, shuffle=False, num_workers=2, collate_fn=collate_batch_2)
validation_result = trainer.evaluation_loop(valid_dl, "validation")

In [43]:
import numpy as np
accuracy = ((np.argmax(validation_result.predictions, axis=1) == validation_result.label_ids).sum()/ validation_result.num_samples).item() * 100

84.8

In [45]:
!zip -r /content/distilbert-imdb.zip /content/distilbert-imdb/

  adding: content/distilbert-imdb/ (stored 0%)
  adding: content/distilbert-imdb/tokenizer_config.json (deflated 75%)
  adding: content/distilbert-imdb/config.json (deflated 45%)
  adding: content/distilbert-imdb/vocab.txt (deflated 53%)
  adding: content/distilbert-imdb/tokenizer.json (deflated 71%)
  adding: content/distilbert-imdb/special_tokens_map.json (deflated 42%)
  adding: content/distilbert-imdb/model.safetensors (deflated 8%)


In [46]:
!zip -r /content/results.zip /content/results/

  adding: content/results/ (stored 0%)
  adding: content/results/checkpoint-500/ (stored 0%)
  adding: content/results/checkpoint-500/config.json (deflated 45%)
  adding: content/results/checkpoint-500/training_args.bin (deflated 52%)
  adding: content/results/checkpoint-500/scheduler.pt (deflated 54%)
  adding: content/results/checkpoint-500/optimizer.pt (deflated 18%)
  adding: content/results/checkpoint-500/trainer_state.json (deflated 63%)
  adding: content/results/checkpoint-500/rng_state.pth (deflated 25%)
  adding: content/results/checkpoint-500/model.safetensors (deflated 8%)
  adding: content/results/checkpoint-625/ (stored 0%)
  adding: content/results/checkpoint-625/config.json (deflated 45%)
  adding: content/results/checkpoint-625/training_args.bin (deflated 52%)
  adding: content/results/checkpoint-625/scheduler.pt (deflated 54%)
  adding: content/results/checkpoint-625/optimizer.pt (deflated 17%)
  adding: content/results/checkpoint-625/trainer_state.json (deflated 64%)
