# Text classification

Things to do:
do you want to use collate_fn in data_loader

### imports and globals

In [1]:
T5_SMALL = "t5-small"
GPT = "gpt2"  # 117M parameters as per https://huggingface.co/transformers/v3.3.1/pretrained_models.html # "openai-gpt"
DISTILBERT = "distilbert-base-uncased"

In [7]:
# opt
from collections import defaultdict

# mandatory imports
from pathlib import Path
from datasets import load_dataset

import torch
from transformers import AutoTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
from torch.utils.data import random_split
import collections
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
import numpy as np
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding


In [8]:
VALID_SIZE = 0.1
BATCH_SIZE = 32
SEED = 42
METRIC_NAME = "f1"
N_EPOCHS = 10

### load dataset

In [9]:
#raw_dataset = load_dataset('super_glue', 'cb', cache_dir="./datasets/.cache/huggingface_datasets")
raw_dataset = load_dataset('ag_news', cache_dir="./datasets/.cache/huggingface_datasets")

Found cached dataset ag_news (/home/jovyan/llm_peft_exploration/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
id2label = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

label2id = {v:k for k,v in id2label.items()}

### explore dataset

In [11]:
{k: len(raw_dataset[k]) for k in raw_dataset}

{'train': 120000, 'test': 7600}

In [12]:
raw_dataset['test'][0] 

{'text': "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.",
 'label': 2}

## Preprocess


In [18]:
tokenizer = AutoTokenizer.from_pretrained(GPT)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'  # 'left'

# Use `DataCollatorWithPadding` as it is more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding to max length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
def preprocess_data(examples):
    # take a batch of texts
    text = examples["text"]  # this has n_rows which = batch_size
    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors='pt')
    # add labels
    labels_batch = torch.tensor(examples['label'])
    #torch.transpose(labels_batch, 0, 1)
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = torch.nn.functional.one_hot(labels_batch)
    labels_matrix = labels_matrix.float()  # without converting int to float, you get an error later
    # print(labels_matrix)
    encoding["label"] = labels_matrix#.tolist()
  
    return encoding


#### encode dataset

In [15]:
# tokenize 
#encoded_dataset = raw_dataset.map(preprocess_data, batched=True)

In [24]:
def prepare_dataset(ds, tokenizer, create_validation_split=False, validation_prop=0.1):
    encoded_dataset = ds.map(preprocess_data, batched=True)
    # is this needed?
    encoded_dataset.set_format("torch")
    
    # we need to create tratin/valid sets
    print(f"creating validation split: {str(create_validation_split)}")
    if create_validation_split:
        train_dataset, validation_dataset= encoded_dataset['train'].train_test_split(test_size=validation_prop).values()
    return {'train': train_dataset, 'valid': validation_dataset, 'test': ds['test']}


In [13]:
def describe_label_distr_from_data_loader(dl):
    label_list = []
    for idx, item in enumerate(dl):
        batch_labels_onehot = item['label']#.tolist()
        batch_labels = [torch.argmax(label_onehot).item() for label_onehot in batch_labels_onehot]
        label_list.extend(batch_labels)
    print(f'distribution of labels: {collections.Counter(label_list)}')



In [25]:
encoded_dataset = prepare_dataset(raw_dataset, tokenizer, create_validation_split=True)
valid_loader = DataLoader(encoded_dataset['valid'], batch_size=BATCH_SIZE, shuffle=True)#, collate_fn=data_collator)
train_loader = DataLoader(encoded_dataset['train'], batch_size=BATCH_SIZE, shuffle=True)#, collate_fn=data_collator)

describe_label_distr_from_data_loader(valid_loader)
describe_label_distr_from_data_loader(train_loader)

Loading cached processed dataset at /home/jovyan/llm_peft_exploration/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-7e91ac84989644cd.arrow
Loading cached processed dataset at /home/jovyan/llm_peft_exploration/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-9439a82d3c544e5b.arrow


creating validation split: True
distribution of labels: Counter({2: 3028, 3: 3001, 0: 2998, 1: 2973})
distribution of labels: Counter({1: 27027, 0: 27002, 3: 26999, 2: 26972})


### tokenise, training loop

In [47]:
MODEL_DIR = Path(f"data/models_20230606")
MODEL_DIR.mkdir(exist_ok=True, parents=True)

In [48]:
model = AutoModelForSequenceClassification.from_pretrained(GPT, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(label2id),
                                                           id2label=id2label,
                                                           label2id=label2id)

model.config.pad_token_id = model.config.eos_token_id


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [None]:
import wandb
wandb.login()

In [51]:
args = TrainingArguments(
    output_dir=MODEL_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=0.1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=METRIC_NAME,
    logging_dir='./logs',            # directory for storing logs*
    logging_steps=2000,
    report_to='wandb'
)
    

In [54]:
trainer = Trainer(
    model,
    args,
    train_dataset=batch_dataset,
    eval_dataset=encoded_dataset['valid'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [35]:
model.device

device(type='cpu')

In [55]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mpmitra01[0m ([33mdiscoverylab[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss



KeyboardInterrupt



### evaluate

In [100]:
trainer.evaluate()

{'eval_loss': 0.0778539851307869,
 'eval_f1': 0.9458218549127639,
 'eval_roc_auc': 0.9633611111111112,
 'eval_accuracy': 0.9409166666666666}

In [101]:
trainer.eval_dataset=encoded_dataset['test']
trainer.evaluate()

{'eval_loss': 0.08296191692352295,
 'eval_f1': 0.9416106497957031,
 'eval_roc_auc': 0.9605701754385965,
 'eval_accuracy': 0.9361842105263158}

In [51]:
import torch

torch.save(prompt_model, 'data/models/tensor.pt')

test how the accuracy improves with batches of training data

### experiment - sequential supervision

In [34]:
from torch.utils.data import BatchSampler, SequentialSampler, DataLoader
list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False))

[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

In [52]:
# do for 10, 50, 100, 1000, 20000, 100000
from torch.utils.data import Subset
total_subset_idx = []
sequential_supervision_val_scores = []
sequential_supervision_test_scores = []
for idx, idx_batch in enumerate(BatchSampler(SequentialSampler(range(len(encoded_dataset['train']))), batch_size=3, drop_last=False)):
    if idx<4:
        total_subset_idx.extend(idx_batch)
        batch_dataset = Subset(encoded_dataset['train'], total_subset_idx)
        #batch_loader = DataLoader(batch_dataset)

        #print(batch_dataset)
        #print(batch_dataset.indices)
        
        trainer = Trainer(
                    model,
                    args,
                    train_dataset=batch_dataset,
                    eval_dataset=encoded_dataset['valid'],
                    tokenizer=tokenizer,
                    compute_metrics=compute_metrics,
                    #data_collator=data_collator
                )
        trainer.train()
        val_scores = trainer.evaluate()        
        sequential_supervision_val_scores.append(val_scores)
        
        trainer.eval_dataset=encoded_dataset['test']
        test_scores = trainer.evaluate()
        sequential_supervision_val_scores.append(test_scores)

        
        
        #trainer.save()
batch_dataset                               

{'dataset': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 108000
}), 'indices': [0, 1, 2]}
{'dataset': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 108000
}), 'indices': [0, 1, 2, 3, 4, 5]}
{'dataset': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 108000
}), 'indices': [0, 1, 2, 3, 4, 5, 6, 7, 8]}
{'dataset': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 108000
}), 'indices': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}


<torch.utils.data.dataset.Subset at 0x7fdc905c29d0>

##### push model to hf hub

(https://colab.research.google.com/drive/1U7SX7jNYsNQG5BY1xEQQHu48Pn6Vgnyt?usp=sharing#scrollTo=H5j5YJE2hK58)

In [17]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
trainer.push_to_hub("your-username/model-name")