# Text classification

Things to do:
do you want to use collate_fn in data_loader

### imports and globals

In [1]:
T5_SMALL = "t5-small"
GPT = "gpt2"  # 117M parameters as per https://huggingface.co/transformers/v3.3.1/pretrained_models.html # "openai-gpt"
DISTILBERT = "distilbert-base-uncased"

In [2]:
# opt
from collections import defaultdict

# mandatory imports
from pathlib import Path
from datasets import load_dataset

import torch
from transformers import AutoTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
from torch.utils.data import random_split
import collections
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
import numpy as np
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding


## Globals

In [3]:
VALID_SIZE = 0.1
BATCH_SIZE = 32
SEED = 42
METRIC_NAME = "f1"
N_EPOCHS = 10

### load dataset

In [4]:
#raw_dataset = load_dataset('super_glue', 'cb', cache_dir="./datasets/.cache/huggingface_datasets")
raw_dataset = load_dataset('ag_news', cache_dir="./datasets/.cache/huggingface_datasets")

Found cached dataset ag_news (/home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
id2label = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

label2id = {v:k for k,v in id2label.items()}

### explore dataset

In [10]:
{k: len(raw_dataset[k]) for k in raw_dataset}

{'train': 120000, 'test': 7600}

In [11]:
raw_dataset['test'][0] 

{'text': "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.",
 'label': 2}

## Preprocess


In [12]:
tokenizer = AutoTokenizer.from_pretrained(GPT)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'  # 'left'

# Use `DataCollatorWithPadding` as it is more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding to max length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
def preprocess_data(examples):
    # take a batch of texts
    text = examples["text"]  # this has n_rows which = batch_size
    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors='pt')
    # add labels
    labels_batch = torch.tensor(examples['label'])
    #torch.transpose(labels_batch, 0, 1)
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = torch.nn.functional.one_hot(labels_batch)
    labels_matrix = labels_matrix.float()  # without converting int to float, you get an error later
    # print(labels_matrix)
    encoding["label"] = labels_matrix#.tolist()
  
    return encoding


#### encode dataset

In [14]:
# tokenize 
#encoded_dataset = raw_dataset.map(preprocess_data, batched=True)

In [15]:
def prepare_dataset(ds, tokenizer, create_validation_split=False, validation_prop=0.1):
    encoded_dataset = ds.map(preprocess_data, batched=True)
    # is this needed?
    encoded_dataset.set_format("torch")
    
    # we need to create tratin/valid sets
    print(f"creating validation split: {str(create_validation_split)}")
    if create_validation_split:
        train_dataset, validation_dataset= encoded_dataset['train'].train_test_split(test_size=validation_prop).values()
    return {'train': train_dataset, 'valid': validation_dataset, 'test': encoded_dataset['test']}


In [16]:
def describe_label_distr_from_data_loader(dl):
    label_list = []
    for idx, item in enumerate(dl):
        batch_labels_onehot = item['label']#.tolist()
        batch_labels = [torch.argmax(label_onehot).item() for label_onehot in batch_labels_onehot]
        label_list.extend(batch_labels)
    print(f'distribution of labels: {collections.Counter(label_list)}')



In [17]:
encoded_dataset = prepare_dataset(raw_dataset, tokenizer, create_validation_split=True)
valid_loader = DataLoader(encoded_dataset['valid'], batch_size=BATCH_SIZE, shuffle=True)#, collate_fn=data_collator)
train_loader = DataLoader(encoded_dataset['train'], batch_size=BATCH_SIZE, shuffle=True)#, collate_fn=data_collator)

describe_label_distr_from_data_loader(valid_loader)
describe_label_distr_from_data_loader(train_loader)

Loading cached processed dataset at /home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-c4f7b776196c8f42.arrow
Loading cached processed dataset at /home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-43f61cb9fd51c4df.arrow


creating validation split: True
distribution of labels: Counter({0: 3067, 2: 3028, 1: 2963, 3: 2942})
distribution of labels: Counter({3: 27058, 1: 27037, 2: 26972, 0: 26933})


### tokenise, training loop

In [18]:
MODEL_DIR = Path(f"data/models_20230606")
MODEL_DIR.mkdir(exist_ok=True, parents=True)

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(GPT, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(label2id),
                                                           id2label=id2label,
                                                           label2id=label2id)

model.config.pad_token_id = model.config.eos_token_id


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

import numpy as np

# consider std text clf that assumes more of a softmax process
def compute_metrics_std(p: EvalPrediction):
    predictions = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    labels = p.label_ids
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

NameError: name 'EvalPrediction' is not defined

In [22]:
VALID_SIZE = 0.1
BATCH_SIZE = 32
SEED = 42
METRIC_NAME = "f1"
N_EPOCHS = 50

In [23]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


True

In [24]:
args = TrainingArguments(
    output_dir=MODEL_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=0.1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=METRIC_NAME,
    logging_dir='./logs',            # directory for storing logs*
    logging_steps=2000,
    report_to='wandb',
    save_total_limit = 5,
)
    

In [25]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['valid'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [26]:
model.device

device(type='cuda', index=0)

In [33]:
trainer.train()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669471766666295, max=1.0…

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
0,No log,0.139214,0.899705,0.930361,0.880583


TrainOutput(global_step=338, training_loss=0.23731443867880916, metrics={'train_runtime': 153.8877, 'train_samples_per_second': 70.181, 'train_steps_per_second': 2.196, 'total_flos': 706559172673536.0, 'train_loss': 0.23731443867880916, 'epoch': 0.1})

### evaluate

In [34]:
trainer.evaluate()

{'eval_loss': 0.13921377062797546,
 'eval_f1': 0.8997052631578947,
 'eval_roc_auc': 0.9303611111111111,
 'eval_accuracy': 0.8805833333333334,
 'eval_runtime': 31.6581,
 'eval_samples_per_second': 379.05,
 'eval_steps_per_second': 11.845,
 'epoch': 0.1}

In [101]:
trainer.eval_dataset=encoded_dataset['test']
trainer.evaluate()

{'eval_loss': 0.08296191692352295,
 'eval_f1': 0.9416106497957031,
 'eval_roc_auc': 0.9605701754385965,
 'eval_accuracy': 0.9361842105263158}

test how the accuracy improves with batches of training data

In [36]:
vars(train_loader)

{'dataset': Dataset({
     features: ['text', 'label', 'input_ids', 'attention_mask'],
     num_rows: 108000
 }),
 'num_workers': 0,
 'prefetch_factor': 2,
 'pin_memory': False,
 'pin_memory_device': '',
 'timeout': 0,
 'worker_init_fn': None,
 '_DataLoader__multiprocessing_context': None,
 '_dataset_kind': 0,
 'batch_size': 32,
 'drop_last': False,
 'sampler': <torch.utils.data.sampler.RandomSampler at 0x7fbba41b1550>,
 'batch_sampler': <torch.utils.data.sampler.BatchSampler at 0x7fbba41b15e0>,
 'generator': None,
 'collate_fn': <function torch.utils.data._utils.collate.default_collate(batch)>,
 'persistent_workers': False,
 '_DataLoader__initialized': True,
 '_IterableDataset_len_called': None,
 '_iterator': None}

### to delete batch sampling custom

https://www.scottcondron.com/jupyter/visualisation/audio/2020/12/02/dataloaders-samplers-collate.html#Samplers

In [27]:
import math
import random
from torch.utils.data.sampler import Sampler

In [33]:
BASE = 10
BATCH_SIZE = 32
a1 = [np.power(BASE, i) for i in range(5)]
math.log(1200, BASE)
np.power(1, 10)

1

In [47]:
def chunk(indices, chunk_size=-1):
    if chunk_size<1:
        chunk_size = len(indices)
    return torch.split(torch.tensor(indices), chunk_size)

class SequentialTrainingBatchSampler(Sampler):
    def __init__(self, dataset, batch_size, shuffle=False, base=10):
        max_segments = int(math.log(len(dataset), base))
        self.log_index_markers = [np.power(base, sgmt) for sgmt in range(1, max_segments+1)]
        if max_segments < math.log(len(dataset)):
            self.log_index_markers.append(len(dataset))
        self.indices_lists = [list(range(marker)) for marker in self.log_index_markers]
        self.shuffle = shuffle
        self.batch_size = batch_size
    
    def __iter__(self):
        if self.shuffle:
            for indices_list in self.indices_lists:
                random.shuffle(indices_list)
       ## print(self.indices_lists)
        segment_batches  = [chunk(segment_indices_list, self.batch_size) for segment_indices_list in self.indices_lists]

        #combined = [[batch.tolist() for batch in segment] for segment in segment_batches]
        combined = [batch.tolist() for segment in segment_batches for batch in segment]

        if self.shuffle:
            random.shuffle(combined)
        return iter(combined)
    
    def __len__(self):
        return  len(self.indices_lists)#sum([len(segment_indices) for segment_indices in self.indices_lists]) #// self.batch_size

### experiment - sequential supervision

In [48]:
from torch.utils.data import BatchSampler, SequentialSampler, DataLoader
BASE=10
custom_sequential_sampler = SequentialTrainingBatchSampler(encoded_dataset['train'], batch_size=-1, base=BASE)
for i, batch in enumerate(custom_sequential_sampler):
    print(f"Batch number #{i}:  {batch}")
    if i>=1:
        break

Batch number #0:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Batch number #1:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [55]:
model.device, len(custom_sequential_sampler), MODEL_DIR

(device(type='cuda', index=0), 6, PosixPath('data/models_20230606'))

In [50]:
# do for 10, 50, 100, 1000, 20000, 100000
from torch.utils.data import Subset
total_subset_idx = []
sequential_supervision_val_scores = []
sequential_supervision_test_scores = []
for idx, idx_batch in enumerate(custom_sequential_sampler):
    #if idx<4:
    #total_subset_idx.extend(idx_batch)
    batch_dataset = Subset(encoded_dataset['train'], idx_batch)
    print(f"Number of training data points: {len(idx_batch)}")
    #batch_loader = DataLoader(batch_dataset)
    args = TrainingArguments(
        output_dir=MODEL_DIR,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=0.1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model=METRIC_NAME,
        logging_dir='./logs',            # directory for storing logs*
        logging_steps=2000,
        report_to='wandb',
        save_total_limit = 5,
    )
    
    trainer = Trainer(
                model,
                args,
                train_dataset=batch_dataset,
                eval_dataset=encoded_dataset['valid'],
                tokenizer=tokenizer,
                compute_metrics=compute_metrics,
                #data_collator=data_collator
            )
    trainer.train()
    
    print(f"evaluating on validation set")
    trainer.eval_dataset=encoded_dataset['valid']
    val_scores = trainer.evaluate()        
    sequential_supervision_val_scores.append(val_scores)
    
    print(f"evaluating on test set")
    trainer.eval_dataset=encoded_dataset['test']
    test_scores = trainer.evaluate()
    sequential_supervision_test_scores.append(test_scores)



    #trainer.save()
batch_dataset                               

Number of training data points: 10




VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669560833334648, max=1.0…

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,1.529575,0.342382,0.492472,0.000583


Number of training data points: 100




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
0,No log,1.358408,0.348142,0.500597,0.001083


Number of training data points: 1000




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
0,No log,1.066795,0.35939,0.520014,0.016583


Number of training data points: 10000




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
0,No log,0.487509,0.389117,0.609722,0.260667


Number of training data points: 100000




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
0,No log,0.133472,0.899786,0.931403,0.877583


Number of training data points: 108000




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
0,No log,0.116661,0.913438,0.941431,0.8985


<torch.utils.data.dataset.Subset at 0x7f79e7ce98b0>

In [58]:
print([score['eval_accuracy'] for score in sequential_supervision_val_scores])
import json
with open(MODEL_DIR.joinpath("val_scores.json"), "w+") as f:
    json.dump(sequential_supervision_val_scores, f)

[0.0005833333333333334, 0.0010833333333333333, 0.016583333333333332, 0.26066666666666666, 0.8775833333333334, 0.8985]


##### push model to hf hub

(https://colab.research.google.com/drive/1U7SX7jNYsNQG5BY1xEQQHu48Pn6Vgnyt?usp=sharing#scrollTo=H5j5YJE2hK58)

In [17]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
trainer.push_to_hub("your-username/model-name")