# Text classification

### imports and globals

In [1]:
T5_SMALL = "t5-small"
GPT = "gpt2"  # 117M parameters as per https://huggingface.co/transformers/v3.3.1/pretrained_models.html # "openai-gpt"

In [2]:
# opt
from collections import defaultdict

# mandatory imports
from pathlib import Path
from datasets import load_dataset

import torch
from transformers import AutoTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
from torch.utils.data import random_split
import collections
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
import numpy as np
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding


## Globals

In [3]:
VALID_SIZE = 0.1
BATCH_SIZE = 8
SEED = 42
METRIC_NAME = "f1"
N_EPOCHS = 10

### load dataset

In [4]:
#raw_dataset = load_dataset('super_glue', 'cb', cache_dir="./datasets/.cache/huggingface_datasets")
raw_dataset = load_dataset('ag_news', cache_dir="./datasets/.cache/huggingface_datasets")

Found cached dataset ag_news (/home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
id2label = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

label2id = {v:k for k,v in id2label.items()}

### explore dataset

In [6]:
{k: len(raw_dataset[k]) for k in raw_dataset}

{'train': 120000, 'test': 7600}

In [7]:
raw_dataset['test'][0] 

{'text': "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.",
 'label': 2}

## Preprocess


In [8]:
tokenizer = AutoTokenizer.from_pretrained(GPT)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'  # 'left'


In [9]:
# max_length = 384
stride = 128

max_input_length = 364
max_target_length = 32


prefix1 = "Given news article: "
prefix2 = "The news article topic is: "


def preprocess_data(examples):
    context = [prefix1 + doc for doc in examples["text"] ]
    #topics = [prefix2 + q.strip() for q in examples["label"]]
    topic_labels = [id2label[q] for q in examples["label"]]
    prepped_topics = [prefix2+ topic for topic in topic_labels]
    
    inputs = tokenizer(
        context,
        prepped_topics,
        max_length=max_input_length,
        truncation="only_first",
        #stride=stride,
        #return_overflowing_tokens=True,
        #return_offsets_mapping=True,
        padding="max_length",
    )
    inputs["labels"] = inputs["input_ids"].copy()

    return inputs

def preprocess_test_data(examples):
    context = [prefix1 + doc + prefix2 for doc in examples["text"] ]
    topic_labels = [id2label[q] for q in examples["label"]]
    
    inputs = tokenizer(
        context,
    )
    inputs["labels"] = inputs["input_ids"].copy()
    inputs["answer"] = topic_labels

    return inputs

In [9]:
test_dataset = raw_dataset['test'].map(preprocess_test_data, batched=True, batch_size=BATCH_SIZE, remove_columns="label")

Loading cached processed dataset at /home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-7250863f9cba77a0.arrow


In [10]:
tokenizer.decode(test_dataset[0]['labels'], skip_special_tokens=False)

"Given news article: Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.The news article topic is: "

#### encode dataset

In [11]:
def prepare_dataset(ds, tokenizer, create_validation_split=False, validation_prop=0.1, remove_columns=[]):
    train_dataset = ds['train'].map(preprocess_data, batched=True, remove_columns=remove_columns)
    test_dataset = ds['test'].map(preprocess_test_data, batched=True, remove_columns=remove_columns)

    train_dataset.set_format("torch")
    validation_dataset = None
    # carve out validation set from train
    print(f"creating validation split: {str(create_validation_split)}")
    if create_validation_split:
        # had forgotten to add random split earlier during the seq training exercise
        train_dataset, validation_dataset= train_dataset.train_test_split(test_size=validation_prop).values()
    return {'train': train_dataset, 'valid': validation_dataset, 'test': test_dataset}


In [12]:
# tokenize 
encoded_dataset = prepare_dataset(raw_dataset, tokenizer, create_validation_split=True,  remove_columns=["label"])


Loading cached processed dataset at /home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-6a1730659d4cc655.arrow
Loading cached processed dataset at /home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-b2b255dbd88aa3a1.arrow


creating validation split: True


In [13]:
print(tokenizer.decode(encoded_dataset['train'][2]['input_ids'], skip_special_tokens=True), '\n')

# we have intentionally not padded test sentences
print(tokenizer.decode(encoded_dataset['test'][2]['input_ids'], skip_special_tokens=True), '\n')

Given news article: Sri Lanka beat Pakistan in Faisalabad Test (AFP) AFP - Sri Lanka defeated Pakistan by 201 runs in the first cricket Test here to take a 1-0 lead in the two-match series.The news article topic is: World 

Given news article: Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.The news article topic is:  

Given news article: Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.The news article topic is: 


### tokenise, training loop

In [14]:
MODEL_DIR = Path(f"data/models_20230806")
MODEL_DIR.mkdir(exist_ok=True, parents=True)

In [22]:
from transformers import AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
model = AutoModelForCausalLM.from_pretrained(GPT, 
problem_type="multi_label_classification_with_generative_lm")

model.config.pad_token_id = model.config.eos_token_id


In [None]:
VALID_SIZE = 0.1
BATCH_SIZE = 8
SEED = 42
METRIC_NAME = "f1"
N_EPOCHS = 100

In [25]:
args = TrainingArguments(
    output_dir=MODEL_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir='./logs',            # directory for storing logs*
    logging_steps=2000,
    report_to='wandb',
    save_total_limit = 5,
)
    

In [93]:

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['valid'],
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics,
    data_collator=data_collator,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [98]:
from transformers import GenerationConfig
generation_config = GenerationConfig(
    max_new_tokens=3, do_sample=True, top_k=5, eos_token_id=model.config.eos_token_id
)
out = model.generate(**inputs, generation_config=generation_config)
out

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[11505, 18121, 12914,  3517, 15069, 40136, 24087,  1883,   287, 20357,
           383,  3517,  2732,   329,  7868,   290, 20389,   357,    35,    69,
          1546,     8,  2904,  5611,   257,   366, 22648, 36757,    78,     1,
          1923,    11,   351,   262, 23619, 27339,  6778,   286, 36267,   262,
          1306,  5270,   286,  3517, 17245,    13,  8989,    11,   484,   635,
         31230,   510,   351,   262,  2647,  2831,   357,  3620,    40,    11,
           290,  2972,  7912,     8,   284,   787,   428,  2968,    13,   412,
          8895,   468,  5729, 22368,   511,   886,   880,    11,   523,   326,
          1751,   287,   674,  4266,   481,   783,   307, 41201,  3898,   546,
           262,  4416,  1483,   286, 22023,  2647,    13,   464, 17010,   290,
          2709,  4355,   286,   428,  1392,   284,   502,   257,  1310,    11,
           523,   314,  2630,   281,  1280,  3850,   284,   262,   360,    69,
          1546,   546,   340,    13,  8989,    11,  

### define trained models

In [3]:
num_data_pts = 100000


In [77]:
classes = ['World', 'Sports', 'Business', 'Sci/Tech']
id2label_dict = {idx:k for idx, k in enumerate(classes)}
label2id_dict = {v:k for k,v in id2label_dict.items()}

### Experiment: Sequential training

test how the accuracy improves with batches of training data

https://www.scottcondron.com/jupyter/visualisation/audio/2020/12/02/dataloaders-samplers-collate.html#Samplers

In [31]:
import math
import random
from torch.utils.data.sampler import Sampler

In [32]:
BASE = 10
BATCH_SIZE = 8

In [35]:
from training_utils import SequentialTrainingBatchSampler
BASE=10

custom_sequential_sampler = SequentialTrainingBatchSampler(encoded_dataset['train'], batch_size=-1, base=BASE)
for i, batch in enumerate(custom_sequential_sampler):
    print(f"Batch number #{i}:  {batch}")
    if i>=1:
        break

### check cell below

In [41]:
# do for 10, 50, 100, 1000, 20000, 100000
MODEL_DIR = Path(f"data/models_20230807")

In [None]:

def train_model_series_with_sequential_sampler(batch_size=BATCH_SIZE, n_epochs=5,
                                              compute_metrics: Callable = None, output_model_dir=MODEL_DIR_NEW, 
                                               learning_rate=LEARNING_RATE, metric_name=METRIC_NAME):
    total_subset_idx = []
    sequential_supervision_val_scores = []
    sequential_supervision_test_scores = []
    for idx, idx_batch in enumerate(custom_sequential_sampler):
        print(idx)
        if idx>4:
            break
        sequence_n_output_dir = output_model_dir.joinpath(f"_{idx}")
        batch_dataset = Subset(encoded_dataset['train'], idx_batch)
        print(f"Number of training data points: {len(idx_batch)}")
        args = TrainingArguments(
            output_dir=sequence_n_output_dir,
            evaluation_strategy="steps", #epoch",
            eval_steps = 500,
            save_strategy="steps", #epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=n_epochs,
            weight_decay=0.01,
            metric_for_best_model=metric_name,
            logging_dir='./logs',            # directory for storing logs*
            logging_steps=2000,
            # report_to='wandb',
            load_best_model_at_end = True,
            save_total_limit = 2,
        )

        trainer = Trainer(
                    model,
                    args,
                    train_dataset=batch_dataset,
                    eval_dataset=encoded_dataset['valid'],
                    tokenizer=tokenizer,
                    compute_metrics=compute_metrics,
            #callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
                    #data_collator=data_collator
                )
        trainer.train()


        print(f"evaluating on test set")
        trainer.eval_dataset=encoded_dataset['test']
        test_scores = trainer.evaluate()
        sequential_supervision_test_scores.append(test_scores)
        #trainer.save()


        print(f"evaluating on validation set")
        trainer.eval_dataset=encoded_dataset['valid']
        val_scores = trainer.evaluate()        
        sequential_supervision_val_scores.append(val_scores)
    
    return sequential_supervision_val_scores, sequential_supervision_test_scores



In [41]:
sequential_supervision_val_scores, sequential_supervision_test_scores = train_model_series_with_sequential_sampler(compute_metrics=compute_metrics)
print([score['eval_accuracy'] for score in sequential_supervision_val_scores])
print([score['eval_accuracy'] for score in sequential_supervision_test_scores])

PyTorch: setting up devices
***** Running training *****
  Num examples = 10
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


Number of training data points: 10


Epoch,Training Loss,Validation Loss
1,No log,4.253873
2,No log,4.159801
3,No log,4.10235
4,No log,4.068579
5,No log,4.054615


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_2023080710/checkpoint-2
Configuration saved in data/models_2023080710/checkpoint-2/config.json
Model weights saved in data/models_2023080710/checkpoint-2/pytorch_model.bin
tokenizer config file saved in data/models_2023080710/checkpoint-2/tokenizer_config.json
Special tokens file saved in data/models_2023080710/checkpoint-2/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving mo

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 100
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 65
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 100


Epoch,Training Loss,Validation Loss
1,No log,3.527111
2,No log,3.39496
3,No log,3.33216
4,No log,3.308984
5,No log,3.302174


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_20230807100/checkpoint-13
Configuration saved in data/models_20230807100/checkpoint-13/config.json
Model weights saved in data/models_20230807100/checkpoint-13/pytorch_model.bin
tokenizer config file saved in data/models_20230807100/checkpoint-13/tokenizer_config.json
Special tokens file saved in data/models_20230807100/checkpoint-13/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 1000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 625
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 1000


Epoch,Training Loss,Validation Loss
1,No log,3.018964
2,No log,2.971426
3,No log,2.962867
4,No log,2.962022
5,No log,2.961788


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_202308071000/checkpoint-125
Configuration saved in data/models_202308071000/checkpoint-125/config.json
Model weights saved in data/models_202308071000/checkpoint-125/pytorch_model.bin
tokenizer config file saved in data/models_202308071000/checkpoint-125/tokenizer_config.json
Special tokens file saved in data/models_202308071000/checkpoint-125/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batc

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 10000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6250
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 10000


Epoch,Training Loss,Validation Loss
1,No log,2.771644
2,2.884500,2.733512
3,2.884500,2.721585
4,2.649800,2.717716
5,2.543100,2.719046


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_2023080710000/checkpoint-1250
Configuration saved in data/models_2023080710000/checkpoint-1250/config.json
Model weights saved in data/models_2023080710000/checkpoint-1250/pytorch_model.bin
tokenizer config file saved in data/models_2023080710000/checkpoint-1250/tokenizer_config.json
Special tokens file saved in data/models_2023080710000/checkpoint-1250/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 100000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 62500
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 100000


Epoch,Training Loss,Validation Loss
1,2.6163,2.512237
2,2.4888,2.451821
3,2.3927,2.422847
4,2.3525,2.407537
5,2.3159,2.402459


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_20230807100000/checkpoint-12500
Configuration saved in data/models_20230807100000/checkpoint-12500/config.json
Model weights saved in data/models_20230807100000/checkpoint-12500/pytorch_model.bin
tokenizer config file saved in data/models_20230807100000/checkpoint-12500/tokenizer_config.json
Special tokens file saved in data/models_20230807100000/checkpoint-12500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num exa

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 108000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 67500
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 108000


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [44]:
import json
with open('sequential_trainerer_val_agnews_generative.json', 'w+') as f:
    json.dump(sequential_supervision_val_scores, f)
with open(MODEL_DIR_NEW.joinpath("sequential_trainerer_test_agnews_generative.json"), "w+") as f:
    json.dump(sequential_supervision_test_scores, f)

### inference

In [15]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, AutoConfig

MODEL_DIR_STUB = Path(f"data/models_20230807")
def compose_model_dir_path(model_dir_stub=None, num_data_points=None, checkpoint_num=None):
    dir_path = f"{str(model_dir_stub)}{num_data_points}/checkpoint-{checkpoint_num}"
    print(dir_path)
    return dir_path

model_dir_paths = {10: compose_model_dir_path(MODEL_DIR_STUB, 10, 10),
                   100: compose_model_dir_path(MODEL_DIR_STUB, 100, 65),
                   1000: compose_model_dir_path(MODEL_DIR_STUB, 1000, 625),
                   10000: compose_model_dir_path(MODEL_DIR_STUB, 10000, 6250),
                   100000: compose_model_dir_path(MODEL_DIR_STUB, 100000, 62500)
                  } 


data/models_2023080710/checkpoint-10
data/models_20230807100/checkpoint-65
data/models_202308071000/checkpoint-625
data/models_2023080710000/checkpoint-6250
data/models_20230807100000/checkpoint-62500


num_data_pts = 1000
model_dir = model_dir_paths[num_data_pts]
model = AutoModelForCausalLM.from_pretrained(model_dir, 
problem_type="multi_label_classification")

#config = AutoConfig.from_pretrained(model_dir)
model.config.pad_token_id = model.config.eos_token_id
tokenizer

In [15]:
sampler_idx = 39
sample = encoded_dataset['test'][sampler_idx]
sample_input = tokenizer.decode(sample['input_ids'])
sample_input, sample['answer']

("Given news article: Afghan Army Dispatched to Calm Violence KABUL, Afghanistan - Government troops intervened in Afghanistan's latest outbreak of deadly fighting between warlords, flying from the capital to the far west on U.S. and NATO airplanes to retake an air base contested in the violence, officials said Sunday...The news article topic is: ",
 'World')

In [16]:
from transformers import pipeline
max_new_tokens = 10

prefix3 = f"Classify the following news article as one amongst the following topics 'World', 'Sports', 'Business' or 'Sci/Tech'."
prompt = prefix3 + sample_input

In [17]:
inputs = tokenizer(prompt, return_tensors='pt')['input_ids']
sample_gen_output = model.generate(inputs=inputs, max_new_tokens=max_new_tokens)
print(sample_gen_output.shape)

gen_label_start_index = inputs.shape[1]
gen_label_end_index = gen_label_start_index+max_new_tokens
gen_label_indices = torch.tensor(list(range(gen_label_start_index, gen_label_end_index)))
gen_label = torch.index_select(sample_gen_output, 1, gen_label_indices)
gen_label

NameError: name 'model' is not defined

In [None]:
print(tokenizer.batch_decode(gen_label))
tokenizer.batch_decode(sample_gen_output)

In [18]:
from tqdm import tqdm
import json

In [19]:
max_new_tokens = 10
prefix_test = f"Classify the following news article as one amongst the following topics 'World', 'Sports', 'Business' or 'Sci/Tech'."
preds = []


In [16]:
def generate_preds_for_model_trained_on_n_data_points(num_data_pts):
    model_dir = model_dir_paths[num_data_pts]
    print(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir, 
    problem_type="multi_label_classification")
    model.config.pad_token_id = model.config.eos_token_id    
    
    preds = []
    for i, sample in enumerate(tqdm(encoded_dataset['test'])):
        sample = encoded_dataset['test'][i]
        sample_input = tokenizer.decode(sample['input_ids'])
        prompt = prefix_test + sample_input
        inputs = tokenizer(prompt, return_tensors='pt')['input_ids']
        sample_gen_output = model.generate(inputs=inputs, max_new_tokens=max_new_tokens)

        gen_label_start_index = inputs.shape[1]
        gen_label_end_index = gen_label_start_index+max_new_tokens
        gen_label_indices = torch.tensor(list(range(gen_label_start_index, gen_label_end_index)))
        try: 
            gen_label = torch.index_select(sample_gen_output, 1, gen_label_indices)
        except:
            print(i, gen_label_indices) 

        gen_label_text = tokenizer.batch_decode(gen_label)
        preds.append((i, gen_label_text, sample['answer']))
        
    with open(f'data/predictions/genai_agnews_clf_pred_labels_numdatapts_{num_data_pts}.json', 'w+') as f:
        json.dump(preds, f)
    return preds

In [32]:
num_data_pts = 100
preds2 = generate_preds_for_model_trained_on_n_data_points(num_data_pts)
len(encoded_dataset['test'])

data/models_20230807100/checkpoint-65


  1%|          | 94/7600 [00:44<56:20,  2.22it/s]  

93 tensor([80, 81, 82, 83, 84, 85, 86, 87, 88, 89])


  3%|▎         | 215/7600 [01:41<55:09,  2.23it/s]  

214 tensor([104, 105, 106, 107, 108, 109, 110, 111, 112, 113])


  7%|▋         | 539/7600 [04:12<57:57,  2.03it/s]  

538 tensor([88, 89, 90, 91, 92, 93, 94, 95, 96, 97])


  8%|▊         | 639/7600 [05:00<53:50,  2.15it/s]  

638 tensor([78, 79, 80, 81, 82, 83, 84, 85, 86, 87])


 12%|█▏        | 944/7600 [07:23<47:07,  2.35it/s]  

943 tensor([78, 79, 80, 81, 82, 83, 84, 85, 86, 87])


 13%|█▎        | 1020/7600 [07:58<45:48,  2.39it/s]

1019 tensor([ 91,  92,  93,  94,  95,  96,  97,  98,  99, 100])


 15%|█▌        | 1172/7600 [09:09<54:45,  1.96it/s]

1171 tensor([79, 80, 81, 82, 83, 84, 85, 86, 87, 88])


 16%|█▌        | 1193/7600 [09:19<51:52,  2.06it/s]

1192 tensor([90, 91, 92, 93, 94, 95, 96, 97, 98, 99])


 17%|█▋        | 1277/7600 [09:58<48:03,  2.19it/s]

1276 tensor([85, 86, 87, 88, 89, 90, 91, 92, 93, 94])


 20%|██        | 1541/7600 [12:02<52:00,  1.94it/s]

1540 tensor([81, 82, 83, 84, 85, 86, 87, 88, 89, 90])


 21%|██        | 1593/7600 [12:27<44:02,  2.27it/s]

1592 tensor([78, 79, 80, 81, 82, 83, 84, 85, 86, 87])


 21%|██        | 1597/7600 [12:28<44:56,  2.23it/s]

1596 tensor([80, 81, 82, 83, 84, 85, 86, 87, 88, 89])


 26%|██▌       | 1993/7600 [15:32<40:54,  2.28it/s]

1992 tensor([ 92,  93,  94,  95,  96,  97,  98,  99, 100, 101])


 28%|██▊       | 2109/7600 [16:27<45:13,  2.02it/s]

2108 tensor([72, 73, 74, 75, 76, 77, 78, 79, 80, 81])


 30%|███       | 2298/7600 [17:55<38:38,  2.29it/s]

2297 tensor([79, 80, 81, 82, 83, 84, 85, 86, 87, 88])


 32%|███▏      | 2412/7600 [18:49<37:02,  2.33it/s]

2411 tensor([81, 82, 83, 84, 85, 86, 87, 88, 89, 90])


 33%|███▎      | 2511/7600 [19:36<36:04,  2.35it/s]

2510 tensor([69, 70, 71, 72, 73, 74, 75, 76, 77, 78])


 33%|███▎      | 2541/7600 [19:49<31:57,  2.64it/s]

2540 tensor([82, 83, 84, 85, 86, 87, 88, 89, 90, 91])


 34%|███▍      | 2576/7600 [20:06<36:48,  2.27it/s]

2575 tensor([85, 86, 87, 88, 89, 90, 91, 92, 93, 94])


 35%|███▍      | 2642/7600 [20:36<37:04,  2.23it/s]

2641 tensor([72, 73, 74, 75, 76, 77, 78, 79, 80, 81])


 40%|███▉      | 3007/7600 [23:26<33:55,  2.26it/s]

3006 tensor([87, 88, 89, 90, 91, 92, 93, 94, 95, 96])


 42%|████▏     | 3182/7600 [24:48<31:22,  2.35it/s]

3181 tensor([ 93,  94,  95,  96,  97,  98,  99, 100, 101, 102])


 48%|████▊     | 3678/7600 [28:40<32:06,  2.04it/s]

3677 tensor([84, 85, 86, 87, 88, 89, 90, 91, 92, 93])


 49%|████▉     | 3734/7600 [29:06<28:11,  2.28it/s]

3733 tensor([83, 84, 85, 86, 87, 88, 89, 90, 91, 92])


 50%|█████     | 3831/7600 [29:52<26:39,  2.36it/s]

3830 tensor([86, 87, 88, 89, 90, 91, 92, 93, 94, 95])


 51%|█████     | 3849/7600 [30:00<27:36,  2.26it/s]

3848 tensor([80, 81, 82, 83, 84, 85, 86, 87, 88, 89])


 51%|█████▏    | 3897/7600 [30:22<26:58,  2.29it/s]

3896 tensor([89, 90, 91, 92, 93, 94, 95, 96, 97, 98])


 53%|█████▎    | 4038/7600 [31:29<25:12,  2.35it/s]

4037 tensor([86, 87, 88, 89, 90, 91, 92, 93, 94, 95])


 54%|█████▍    | 4093/7600 [31:54<24:35,  2.38it/s]

4092 tensor([82, 83, 84, 85, 86, 87, 88, 89, 90, 91])


 55%|█████▌    | 4191/7600 [32:41<25:14,  2.25it/s]

4190 tensor([101, 102, 103, 104, 105, 106, 107, 108, 109, 110])


 55%|█████▌    | 4208/7600 [32:49<24:39,  2.29it/s]

4207 tensor([101, 102, 103, 104, 105, 106, 107, 108, 109, 110])


 56%|█████▌    | 4221/7600 [32:54<25:23,  2.22it/s]

4220 tensor([90, 91, 92, 93, 94, 95, 96, 97, 98, 99])


 56%|█████▌    | 4247/7600 [33:06<23:25,  2.39it/s]

4246 tensor([ 92,  93,  94,  95,  96,  97,  98,  99, 100, 101])


 57%|█████▋    | 4299/7600 [33:31<24:11,  2.27it/s]

4298 tensor([87, 88, 89, 90, 91, 92, 93, 94, 95, 96])


 59%|█████▊    | 4447/7600 [34:41<22:56,  2.29it/s]

4446 tensor([ 95,  96,  97,  98,  99, 100, 101, 102, 103, 104])


 61%|██████    | 4650/7600 [36:16<21:26,  2.29it/s]

4649 tensor([85, 86, 87, 88, 89, 90, 91, 92, 93, 94])


 62%|██████▏   | 4724/7600 [36:50<22:23,  2.14it/s]

4723 tensor([ 93,  94,  95,  96,  97,  98,  99, 100, 101, 102])


 63%|██████▎   | 4778/7600 [37:16<25:04,  1.88it/s]

4777 tensor([ 95,  96,  97,  98,  99, 100, 101, 102, 103, 104])


 64%|██████▎   | 4831/7600 [37:41<20:43,  2.23it/s]

4830 tensor([81, 82, 83, 84, 85, 86, 87, 88, 89, 90])


 65%|██████▌   | 4947/7600 [38:34<19:15,  2.30it/s]

4946 tensor([90, 91, 92, 93, 94, 95, 96, 97, 98, 99])


 67%|██████▋   | 5062/7600 [39:27<18:21,  2.31it/s]

5061 tensor([80, 81, 82, 83, 84, 85, 86, 87, 88, 89])


 73%|███████▎  | 5528/7600 [43:07<15:35,  2.22it/s]

5527 tensor([84, 85, 86, 87, 88, 89, 90, 91, 92, 93])


 73%|███████▎  | 5553/7600 [43:18<14:52,  2.29it/s]

5552 tensor([88, 89, 90, 91, 92, 93, 94, 95, 96, 97])


 75%|███████▌  | 5732/7600 [44:42<13:54,  2.24it/s]

5731 tensor([83, 84, 85, 86, 87, 88, 89, 90, 91, 92])


 76%|███████▌  | 5791/7600 [45:09<12:57,  2.33it/s]

5790 tensor([86, 87, 88, 89, 90, 91, 92, 93, 94, 95])


 80%|███████▉  | 6043/7600 [47:08<10:05,  2.57it/s]

6042 tensor([79, 80, 81, 82, 83, 84, 85, 86, 87, 88])


 80%|████████  | 6090/7600 [47:30<10:28,  2.40it/s]

6089 tensor([66, 67, 68, 69, 70, 71, 72, 73, 74, 75])


 82%|████████▏ | 6207/7600 [48:24<09:34,  2.43it/s]

6206 tensor([83, 84, 85, 86, 87, 88, 89, 90, 91, 92])


 84%|████████▍ | 6377/7600 [49:44<09:14,  2.21it/s]

6376 tensor([90, 91, 92, 93, 94, 95, 96, 97, 98, 99])


 84%|████████▍ | 6387/7600 [49:49<09:08,  2.21it/s]

6386 tensor([90, 91, 92, 93, 94, 95, 96, 97, 98, 99])


 84%|████████▍ | 6388/7600 [49:49<08:43,  2.31it/s]

6387 tensor([73, 74, 75, 76, 77, 78, 79, 80, 81, 82])


 91%|█████████ | 6934/7600 [54:04<04:51,  2.28it/s]

6933 tensor([72, 73, 74, 75, 76, 77, 78, 79, 80, 81])


 92%|█████████▏| 6964/7600 [54:18<05:15,  2.02it/s]

6963 tensor([82, 83, 84, 85, 86, 87, 88, 89, 90, 91])


 92%|█████████▏| 6989/7600 [54:31<04:49,  2.11it/s]

6988 tensor([78, 79, 80, 81, 82, 83, 84, 85, 86, 87])


 92%|█████████▏| 6996/7600 [54:34<04:45,  2.11it/s]

6995 tensor([75, 76, 77, 78, 79, 80, 81, 82, 83, 84])


 95%|█████████▍| 7202/7600 [56:11<02:46,  2.39it/s]

7201 tensor([73, 74, 75, 76, 77, 78, 79, 80, 81, 82])


 95%|█████████▌| 7229/7600 [56:23<02:25,  2.55it/s]

7228 tensor([86, 87, 88, 89, 90, 91, 92, 93, 94, 95])


 97%|█████████▋| 7403/7600 [57:44<01:30,  2.19it/s]

7402 tensor([87, 88, 89, 90, 91, 92, 93, 94, 95, 96])


100%|██████████| 7600/7600 [59:16<00:00,  2.14it/s]


7600

In [17]:
from evaluate import load, EvaluationModule
bertscore = load("bertscore")
rouge = load("rouge")
classes = ['World', 'Sports', 'Business', 'Sci/Tech']

def score_and_pred_label(gen_text, true_label, scorer:EvaluationModule=rouge, reference_classes=classes):
    #print(f'Using metric: {scorer.name}')
    #scorer = load(metric)
    scores_across_classes = []
    for ref in reference_classes:
        assert isinstance(gen_text, list)
        if scorer.name=="bert_score":
            scores_across_classes.append(scorer.compute(predictions=gen_text, references=[ref], lang="en"))
            precision_scores = [score['precision'] for score in scores_across_classes]

        else:
            scores_across_classes.append(scorer.compute(predictions=gen_text, references=[ref]))
            precision_scores = [score['rougeL'] for score in scores_across_classes]
    pred_label = np.array(precision_scores).argmax()
    #print(f"for generated label {gen_text}, highest scoring label is {reference_classes[pred_label]}, true label: {true_label} \n")
    return pred_label, label2id.get(true_label)

In [18]:
def score_preds(num_data_pts, scorer=bertscore):
    with open(f'data/predictions/genai_agnews_clf_pred_labels_numdatapts_{num_data_pts}.json', 'r') as f:
        preds = json.load(f)
    pred_labels_by_scorer = []
    true_labels = []
    tp = 0
    for idx, item in tqdm(enumerate(preds)):
        _, gen_text, true_label = item
        pred_label, true_label = score_and_pred_label(gen_text=gen_text, true_label=true_label, scorer=scorer)
        if pred_label==true_label:
            tp+=1
        pred_labels_by_scorer.append(pred_label)
        true_labels.append(true_label)
        
    acc = tp/len(preds)
    print(scorer.name, num_data_pts, acc)
    return acc

    

In [19]:
num_data_pts = 100
with open(f'data/predictions/genai_agnews_clf_pred_labels_numdatapts_{num_data_pts}.json', 'r') as f:
    preds = json.load(f)
    

NameError: name 'json' is not defined

In [21]:
import json
from tqdm import tqdm
num_data_pts = 100
acc = score_preds(num_data_pts, scorer=rouge)
acc

7600it [1:34:49,  1.34it/s]

rouge 100 0.3325





0.3325

In [25]:

num_data_pts = 1000
acc = score_preds(num_data_pts, scorer=rouge)
acc

7600it [1:33:57,  1.35it/s]

rouge 1000 0.4955263157894737





0.4955263157894737

In [38]:
acc_scores = []
for num_data_pts in [10000, 100000]:
    for scorer in [rouge]:
        acc = score_preds(num_data_pts, scorer=scorer)
        acc_scores.append((num_data_pts, acc))

3365it [57:22,  1.02s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

4337it [1:13:55,  1.04s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [39]:
acc_scores

[(10000, 0.6228947368421053), (100000, 0.7094736842105264)]

In [46]:
import json
with open(f'data/predictions/genai_agnews_clf_pred_labels_numdatapts_{num_data_pts}.json', 'w+') as f:
    json.dump(preds, f)

In [49]:
pred_labels_bertscore = []
true_labels = []
tp = 0
for idx, item in enumerate(preds):
    _, gen_text, true_label = item
    pred_label, true_label = score_and_pred_label(gen_text=gen_text, true_label=true_label, scorer=rouge)
    if pred_label==true_label:
        tp+=1
    pred_labels_bertscore.append(pred_label)
    true_labels.append(true_label)

print(num_data_pts)
acc = tp/len(preds)
acc

1000


0.6228947368421053

In [41]:
num_data_pts

1000

In [47]:
import json
num_data_pts
with open(f'data/predictions/genai_agnews_clf_pred_labels_numdatapts_{num_data_pts}.json', 'r') as f:
    preds = json.load(f)


In [48]:
preds[42]

[42, [' World, Business, Sci/Tech.The news'], 'World']

10000

In [25]:
bertscore = load("bertscore")
rouge = load("rouge")
# results = bertscore.compute(predictions=predictions, references=references, lang="en")

label2id

{'World': 0, 'Sports': 1, 'Business': 2, 'Sci/Tech': 3}

In [23]:
indices, predictions, true_labels = zip(*preds)
#bertscore.compute()

In [35]:
from evaluate import load, EvaluationModule
bertscore = load("bertscore")
rouge = load("rouge")
def score_and_pred_label(gen_text, true_label, scorer:EvaluationModule=rouge, reference_classes=references):
    #print(f'Using metric: {scorer.name}')
    #scorer = load(metric)
    scores_across_classes = []
    for ref in reference_classes:
        assert isinstance(gen_text, list)
        if scorer.name=="bert_score":
            scores_across_classes.append(scorer.compute(predictions=gen_text, references=[ref], lang="en"))
            precision_scores = [score['precision'] for score in scores_across_classes]

        else:
            scores_across_classes.append(scorer.compute(predictions=gen_text, references=[ref]))
            precision_scores = [score['rougeL'] for score in scores_across_classes]
    pred_label = np.array(precision_scores).argmax()
    #print(f"for generated label {gen_text}, highest scoring label is {reference_classes[pred_label]}, true label: {true_label} \n")
    return pred_label, label2id.get(true_label)

In [None]:
import json

with open(f'data/predictions/genai_agnews_clf_pred_labels_numdatapts_{num_data_pts}.json', 'r') as f:
    preds = json.load(f)


In [30]:
idx = 21
_, gen_text, true_label = preds[idx]
score_and_pred_label(gen_text=gen_text, true_label=true_label,  scorer=bertscore)
score_and_pred_label(gen_text=gen_text, true_label=true_label, scorer=rouge)
     

(3, 3)

In [36]:
pred_labels_bertscore = []
true_labels = []
tp = 0
for idx, item in enumerate(preds):
    _, gen_text, true_label = item
    pred_label, true_label = score_and_pred_label(gen_text=gen_text, true_label=true_label, scorer=rouge)
    if pred_label==true_label:
        tp+=1
    pred_labels_bertscore.append(pred_label)
    true_labels.append(true_label)


In [39]:
num_data_pts


10000

In [37]:

acc = tp/len(preds)
acc

0.6228947368421053

In [36]:
acc = tp/len(preds)
acc

0.7094736842105264