# Text classification

Things to do:
do you want to use collate_fn in data_loader

### imports and globals

In [7]:
T5_SMALL = "t5-small"
GPT = "gpt2"  # 117M parameters as per https://huggingface.co/transformers/v3.3.1/pretrained_models.html # "openai-gpt"
DISTILBERT = "distilbert-base-uncased"

In [None]:
# opt
from collections import defaultdict

# mandatory imports
from pathlib import Path
from datasets import load_dataset

import torch
from transformers import AutoTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
from torch.utils.data import random_split
import collections
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
import numpy as np
from transformers import TrainingArguments, Trainer


In [None]:
VALID_SIZE = 0.1
BATCH_SIZE = 32
SEED = 42
METRIC_NAME = "f1"
N_EPOCHS = 10

### load dataset

In [8]:
#raw_dataset = load_dataset('super_glue', 'cb', cache_dir="./datasets/.cache/huggingface_datasets")
raw_dataset = load_dataset('ag_news', cache_dir="./datasets/.cache/huggingface_datasets")

Found cached dataset ag_news (/home/jovyan/llm_peft_exploration/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
id2label = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

label2id = {v:k for k,v in id2label.items()}

## Preprocess


In [10]:
tokenizer = AutoTokenizer.from_pretrained(GPT)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'  # 'left'

# Use `DataCollatorWithPadding` as it is more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding to max length
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### explore dataset

In [11]:
{k: len(raw_dataset[k]) for k in raw_dataset}

{'train': 120000, 'test': 7600}

In [12]:
raw_dataset['test'][0] 

{'text': "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.",
 'label': 2}

In [13]:
target = torch.randint(0, 10, (8, ))
one_hot = torch.nn.functional.one_hot(target)
target.shape
one_hot

tensor([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

### apply preprocessing

In [14]:
def preprocess_data(examples):
    # take a batch of texts
    text = examples["text"]  # this has n_rows which = batch_size
    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors='pt')
    # add labels
    labels_batch = torch.tensor(examples['label'])
    #torch.transpose(labels_batch, 0, 1)
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = torch.nn.functional.one_hot(labels_batch)
    labels_matrix = labels_matrix.float()  # without converting int to float, you get an error later
    # print(labels_matrix)
    encoding["label"] = labels_matrix#.tolist()
  
    return encoding


#### encode dataset

In [15]:
# tokenize 
encoded_dataset = raw_dataset.map(preprocess_data, batched=True)
encoded_dataset['test'][1]['label']

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

[0.0, 0.0, 0.0, 1.0]

##### tests

In [None]:
#a, b = raw_dataset['test'].train_test_split(test_size=0.001).values()
#del a
#b_ = b.map(preprocess_data, batched=True)
#b_['label']

### carve out a valid split
different ways to create valid-test splits from train:
1. random sampling: simply use random_split on torch.utils.data.Dataset object. It returns a torch.utils.data.Subset (subset of indices from parent dataset)
2. use test_train_split from sklearn.model_selection()
3. can also use weightedRandomSampler() https://towardsdatascience.com/demystifying-pytorchs-weightedrandomsampler-by-example-a68aceccb452

In [16]:
def describe_label_distr_from_data_loader(dl):
    label_list = []
    for idx, item in enumerate(dl):
        batch_labels_onehot = item['label']#.tolist()
        batch_labels = [torch.argmax(label_onehot).item() for label_onehot in batch_labels_onehot]
        label_list.extend(batch_labels)
    print(f'distribution of labels: {collections.Counter(label_list)}')

# trial
labels = [torch.tensor(instance['label']) for instance in encoded_dataset['test']]
print(labels[:2])
labels_ = [torch.argmax(label_onehot).item() for label_onehot in labels]
collections.Counter(labels_)

[tensor([0., 0., 1., 0.]), tensor([0., 0., 0., 1.])]


Counter({2: 1900, 3: 1900, 1: 1900, 0: 1900})

In [None]:
# is this needed?
encoded_dataset.set_format("torch")

In [None]:
# we need to create tratin/valid sets
train_dataset, validation_dataset= encoded_dataset['train'].train_test_split(test_size=0.1).values()
valid_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True)#, collate_fn=data_collator)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)#, collate_fn=data_collator)

In [None]:
describe_label_distr_from_data_loader(valid_loader)
describe_label_distr_from_data_loader(train_loader)

3. simply from splitting the train dataset using pytorch internal wrapper on 
train_dataset, validation_dataset= encoded_dataset['train'].train_test_split(test_size=0.1).values()


### tokenise, training loop

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(GPT, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(label2id),
                                                           id2label=id2label,
                                                           label2id=label2id)

model.config.pad_token_id = model.config.eos_token_id


In [None]:
MODEL_DIR = Path(f"data/models_20230606")
MODEL_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [None]:
import wandb
wandb.login()

In [None]:
args = TrainingArguments(
    output_dir=MODEL_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=METRIC_NAME,
    logging_dir='./logs',            # directory for storing logs*
    logging_steps=20,
    report_to='wandb'
)
    

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [35]:
model.device

device(type='cpu')

In [None]:
trainer.train()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


### evaluate

In [100]:
trainer.evaluate()

{'eval_loss': 0.0778539851307869,
 'eval_f1': 0.9458218549127639,
 'eval_roc_auc': 0.9633611111111112,
 'eval_accuracy': 0.9409166666666666}

In [101]:
trainer.eval_dataset=encoded_dataset['test']
trainer.evaluate()

{'eval_loss': 0.08296191692352295,
 'eval_f1': 0.9416106497957031,
 'eval_roc_auc': 0.9605701754385965,
 'eval_accuracy': 0.9361842105263158}

In [51]:
import torch

torch.save(prompt_model, 'data/models/tensor.pt')

test how the accuracy improves with batches of training data

### experiment - sequential supervision

In [None]:
for x in range(0, 1, 0.1)

##### push model to hf hub

(https://colab.research.google.com/drive/1U7SX7jNYsNQG5BY1xEQQHu48Pn6Vgnyt?usp=sharing#scrollTo=H5j5YJE2hK58)

In [17]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
trainer.push_to_hub("your-username/model-name")