<a href="https://colab.research.google.com/github/rahiakela/deep-learning-research-and-practice/blob/main/deep-learning-fundamentals/unit08-NLP/02-large-language-model/1_distilbert-feature-extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLM as Feature Extractor

In this feature-based approach, we use the embeddings from a pretrained transformer to train a random forest and a logistic regression model in scikit-learn:

<img src="https://github.com/rasbt/blog-finetuning-llama-adapters/blob/main/three-conventional-methods/figures/1_feature-based.png?raw=1" width=500>

**Reference**

[Understanding Parameter-Efficient Finetuning of Large Language Models](https://lightning.ai/pages/community/article/understanding-llama-adapters/)

# Setup

In [None]:
!pip install transformers datasets
!pip install torch torchvision torchaudio
!pip install lightning
!pip install torchmetrics
!pip install mlxtend==0.21.0

In [None]:
# Fetch local_dataset_utilities.py, the helper module this notebook imports
# (download_dataset, load_dataset_into_to_dataframe, partition_dataset, IMDBDataset).
!wget https://github.com/rasbt/blog-finetuning-llama-adapters/raw/main/three-conventional-methods/local_dataset_utilities.py

In [5]:
import os.path as op

from datasets import load_dataset

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

from transformers import AutoTokenizer
from transformers import AutoModel

import numpy as np
import pandas as pd
import torch
import time

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset

In [6]:
# Use the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


# 1 Dataset

In [7]:
# Fetch the raw dataset using the helper from local_dataset_utilities.
download_dataset()

# Load the downloaded data into a DataFrame, then partition it.
# NOTE(review): the train.csv / val.csv / test.csv files read by later cells are
# presumably written by partition_dataset -- confirm in local_dataset_utilities.py.
df = load_dataset_into_to_dataframe()
partition_dataset(df)

91% | 73.27 MB | 3.56 MB/s | 20.56 sec elapsed

100%|██████████| 50000/50000 [02:01<00:00, 413.18it/s]


Class distribution:


In [8]:
# Read each partition's CSV file into its own DataFrame.
df_train, df_val, df_test = map(pd.read_csv, ("train.csv", "val.csv", "test.csv"))

# 2 Tokenization

In [10]:
# Map each split name to the CSV file that holds it.
split_files = {
    "train": "train.csv",
    "validation": "val.csv",
    "test": "test.csv",
}
imdb_dataset = load_dataset("csv", data_files=split_files)

print(imdb_dataset)



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10000
    })
})


In [None]:
# Load the tokenizer of the "distilbert-base-uncased" checkpoint -- the same checkpoint
# name the feature-extractor model is loaded from later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [12]:
# Report the tokenizer's maximum input length and vocabulary size.
for caption, value in (
    ("Tokenizer input max length:", tokenizer.model_max_length),
    ("Tokenizer vocabulary size:", tokenizer.vocab_size),
):
    print(caption, value)

Tokenizer input max length: 512
Tokenizer vocabulary size: 30522


In [13]:
def tokenize_text(batch):
    """Tokenize the "text" entries of ``batch`` with the notebook-level ``tokenizer``.

    Sequences are truncated (``truncation=True``) and padded (``padding=True``);
    the tokenizer's output is returned unchanged.
    """
    texts = batch["text"]
    encoding = tokenizer(texts, truncation=True, padding=True)
    return encoding

In [14]:
# Apply tokenize_text to every split. batched=True hands the function batches of rows;
# with batch_size=None each split should arrive as one single batch (per the datasets
# `map` documentation), so padding=True pads all rows of a split to a common length.
imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [15]:
# Drop the un-tokenized dataset; the cells below only use imdb_tokenized.
# NOTE(review): once this has run, re-running the tokenization cell (or this cell)
# raises a NameError until the dataset is loaded again.
del imdb_dataset

# 3 Feature Extractor

In [None]:
# Load the pretrained checkpoint and place it on the selected device in one step
# (Module.to returns the module itself, so `model` is the moved model).
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

In [17]:
# Have indexing into the dataset return the listed columns as PyTorch tensors,
# which is what lets the cells below call .to(device) on them.
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [18]:
# Pull the first three training rows once and move the model inputs to the target device.
first_rows = imdb_tokenized["train"][:3]
test_batch = {
  name: first_rows[name].to(device)
  for name in ("attention_mask", "input_ids")
}

# Forward pass without autograd bookkeeping.
with torch.inference_mode():
  test_output = model(**test_batch)

test_output.last_hidden_state.shape

torch.Size([3, 512, 768])

In [19]:
# Keep only the hidden state at sequence position 0 of every example
# (presumably the [CLS] token, as the variable name suggests).
cls_token_output = test_output.last_hidden_state[:, 0]
cls_token_output.shape

torch.Size([3, 768])

In [20]:
@torch.inference_mode()
def get_output_embeddings(batch):
    """Run the model on ``batch`` and return the position-0 hidden state of each sequence.

    The "input_ids" and "attention_mask" entries are moved to ``device`` before the
    forward pass; the result is returned as a NumPy array on the CPU under the
    "features" key.
    """
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    hidden_states = model(input_ids, attention_mask=attention_mask).last_hidden_state
    first_token = hidden_states[:, 0]
    return {"features": first_token.cpu().numpy()}

In [None]:
%%time

# Compute the embeddings for every split, ten rows per forward pass; the function's
# output is stored in a new "features" column that the cells below read.
imdb_features = imdb_tokenized.map(get_output_embeddings, batched=True, batch_size=10)

In [None]:
# Display the structure of the dataset that now carries the extracted features.
imdb_features

In [None]:
# Convert the extracted features and the labels of each split to NumPy arrays.
train_split = imdb_features["train"]
X_train, y_train = np.array(train_split["features"]), np.array(train_split["label"])

val_split = imdb_features["validation"]
X_val, y_val = np.array(val_split["features"]), np.array(val_split["label"])

test_split = imdb_features["test"]
X_test, y_test = np.array(test_split["features"]), np.array(test_split["label"])

# 4 Train Model

In [None]:
%%time

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

print("Training accuracy", clf.score(X_train, y_train))
print("Validation accuracy", clf.score(X_val, y_val))
print("test accuracy", clf.score(X_test, y_test))

Training accuracy 0.8866285714285714
Validation accuracy 0.883
test accuracy 0.8795
Time elapsed 3.28 min


In [None]:
%%time

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

print("Training accuracy", clf.score(X_train, y_train))
print("Validation accuracy", clf.score(X_val, y_val))
print("test accuracy", clf.score(X_test, y_test))

Training accuracy 1.0
Validation accuracy 0.8408
test accuracy 0.8324
