# Text classification

### imports and globals

In [1]:
T5_SMALL = "t5-small"
GPT = "gpt2"  # 117M parameters as per https://huggingface.co/transformers/v3.3.1/pretrained_models.html # "openai-gpt"

In [2]:
# opt
from collections import defaultdict

# mandatory imports
from pathlib import Path
from datasets import load_dataset

import torch
from transformers import AutoTokenizer, AutoConfig
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
from torch.utils.data import random_split
import collections
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from transformers import EvalPrediction
import numpy as np
from transformers import TrainingArguments, Trainer

from torch.utils.data import Subset
from typing import Callable
from transformers import EarlyStoppingCallback
from training_utils import SequentialTrainingBatchSampler
from transformers import AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling


#from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import pipeline
from tqdm import tqdm
import json

## Globals

In [3]:
### Dataset
DATASET_AGNEWS = 'ag_news'


### load dataset

In [4]:
raw_dataset = load_dataset(DATASET_AGNEWS, cache_dir="./datasets/.cache/huggingface_datasets")

Found cached dataset ag_news (/home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Topic names in label-id order: the name at position i belongs to label id i.
classes = ['World', 'Sports', 'Business', 'Sci/Tech']

# Forward lookup (label id -> topic name), derived from the list above.
id2label = dict(enumerate(classes))

# Reverse lookup (topic name -> label id).
label2id = {name: label_id for label_id, name in id2label.items()}

#### explore dataset

In [6]:
{k: len(raw_dataset[k]) for k in raw_dataset}

{'train': 120000, 'test': 7600}

In [7]:
raw_dataset['test'][0] 

{'text': "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.",
 'label': 2}

## Preprocess


In [8]:
### Data preprocessing params
# Default `max_length` used by preprocess_data: training rows are padded/truncated to this many tokens.
MAX_INPUT_LENGTH = 364
# Only referenced by a commented-out tokenizer argument in preprocess_data.
stride = 128

### Tokenizer config
tokenizer = AutoTokenizer.from_pretrained(GPT)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Right padding while building training data; the inference section later switches this to "left".
tokenizer.padding_side = 'right'  # 'left'


In [9]:
prefix1 = "Given news article: "
prefix2 = "The news article topic is: "


def prepare_dataset(ds, tokenizer, create_validation_split=False, validation_prop=0.1, remove_columns=None, seed=None):
    """Tokenize the train/test splits and optionally carve a validation set out of train.

    Args:
        ds: mapping with 'train' and 'test' entries that expose `.map(...)`.
        tokenizer: accepted for interface compatibility; the preprocessing
            functions use the notebook-level `tokenizer` instead.
        create_validation_split: when True, split the processed train set into
            train/validation parts.
        validation_prop: fraction of the train set moved to validation.
        remove_columns: columns to drop while mapping. Defaults to none.
        seed: forwarded to `train_test_split` so the split can be reproduced;
            None keeps the previous (unseeded) behaviour.

    Returns:
        dict with keys 'train', 'valid' (None when no split was requested) and 'test'.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    columns_to_drop = [] if remove_columns is None else list(remove_columns)

    train_dataset = ds['train'].map(preprocess_data, batched=True, remove_columns=columns_to_drop)
    test_dataset = ds['test'].map(preprocess_test_data, batched=True, remove_columns=columns_to_drop)

    # Only the training data is switched to torch format; test rows stay as plain lists.
    train_dataset.set_format("torch")
    validation_dataset = None
    print(f"creating validation split: {str(create_validation_split)}")
    if create_validation_split:
        # Carve the validation set out of train; pick the parts by key rather
        # than relying on the ordering of `.values()`.
        split = train_dataset.train_test_split(test_size=validation_prop, seed=seed)
        train_dataset, validation_dataset = split['train'], split['test']
    return {'train': train_dataset, 'valid': validation_dataset, 'test': test_dataset}


def preprocess_data(examples, max_input_length = MAX_INPUT_LENGTH):
    """Tokenize a batch of training rows as (article prompt, topic prompt) pairs.

    Each article is prefixed with `prefix1` and paired with `prefix2` followed by
    the topic name looked up in `id2label`. Every row is padded to
    `max_input_length`; when a pair is too long only the article side is
    truncated. `labels` is a copy of `input_ids`.

    Args:
        examples: batch with "text" and "label" columns.
        max_input_length: token length every encoded row is padded/truncated to.

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    # https://huggingface.co/learn/nlp-course/chapter7/6?fw=tf#preparing-the-dataset
    article_prompts = [prefix1 + article for article in examples["text"]]
    topic_prompts = [prefix2 + id2label[label_id] for label_id in examples["label"]]

    tokenizer_options = {
        "max_length": max_input_length,
        "truncation": "only_first",  # shorten the article, never the topic suffix
        "padding": "max_length",
    }
    encoded = tokenizer(article_prompts, topic_prompts, **tokenizer_options)

    # Labels are the unshifted input ids, padding positions included.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded


def preprocess_test_data(examples):
    """Tokenize a batch of test rows as prompts that stop right before the topic.

    The prompt is `prefix1 + article + prefix2`; no padding or truncation is
    requested. The expected topic name is stored under "answer".

    Args:
        examples: batch with "text" and "label" columns.

    Returns:
        The tokenizer output with added "labels" (copy of `input_ids`) and
        "answer" (topic names) entries.
    """
    prompts = []
    for article in examples["text"]:
        prompts.append(prefix1 + article + prefix2)
    expected_topics = [id2label[label_id] for label_id in examples["label"]]

    encoded = tokenizer(prompts)
    encoded["labels"] = encoded["input_ids"].copy()
    encoded["answer"] = expected_topics
    return encoded


#### encode dataset

In [10]:
# tokenize 
encoded_dataset = prepare_dataset(raw_dataset, tokenizer, create_validation_split=True,  remove_columns=["label"])


Loading cached processed dataset at /home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-916a9b28d4526ffb.arrow
Loading cached processed dataset at /home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-b2b255dbd88aa3a1.arrow


creating validation split: True


In [11]:
print('training data sample: ', tokenizer.decode(encoded_dataset['train'][2]['input_ids'], skip_special_tokens=True), '\n', tokenizer.decode(encoded_dataset['train'][2]['labels'], skip_special_tokens=True))
# we have intentionally not padded test sentences
print('test data sample: ', tokenizer.decode(encoded_dataset['test'][2]['input_ids'], skip_special_tokens=True), '\n')

training data sample:  Given news article: Israel Kills 5 in Attempt to Assassinate Hamas Man  GAZA (Reuters) - A senior Hamas leader survived an Israeli  assassination attempt on Wednesday but at least five other  Palestinians were killed in the night-time explosion that tore  through his Gaza home.The news article topic is: World 
 Given news article: Israel Kills 5 in Attempt to Assassinate Hamas Man  GAZA (Reuters) - A senior Hamas leader survived an Israeli  assassination attempt on Wednesday but at least five other  Palestinians were killed in the night-time explosion that tore  through his Gaza home.The news article topic is: World
test data sample:  Given news article: Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.The news article topic is:  



### Set training config

In [12]:
# NOTE(review): not referenced by the visible cells; prepare_dataset is called with its own default validation_prop.
VALID_SIZE = 0.1
# Default per-device train/eval batch size of the training function; also the map batch size for the inference data.
BATCH_SIZE = 8
# NOTE(review): not referenced by the visible cells — the validation split is currently unseeded.
SEED = 42
# Default `metric_for_best_model` passed to TrainingArguments.
METRIC_NAME = "f1"
# NOTE(review): not referenced by the visible cells; the training function defaults to n_epochs=5.
N_EPOCHS = 100
# Passed as `base` to SequentialTrainingBatchSampler; presumably the growth factor of the subset sizes — confirm in training_utils.
BASE = 10
# Default `output_model_dir` of the training function.
MODEL_DIR = Path(f"data/models_20230807")
# Default learning rate of the training function.
LEARNING_RATE = 2e-5

In [13]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

#model.config.pad_token_id = model.config.eos_token_id

### Experiment: Sequential training

Test how accuracy improves as the model is trained on progressively larger subsets of the training data.

In [14]:
custom_sequential_sampler = SequentialTrainingBatchSampler(encoded_dataset['train'], batch_size=-1, base=BASE)

In [15]:
def train_model_series_with_sequential_sampler(batch_size=BATCH_SIZE, n_epochs=5,
                                              compute_metrics: Callable = None, output_model_dir=MODEL_DIR,
                                              learning_rate=LEARNING_RATE, metric_name=METRIC_NAME,
                                              custom_sequential_sampler=custom_sequential_sampler,
                                              max_subsets=5):
    """Train on each index subset yielded by the sampler and evaluate after every round.

    For every subset a new `Trainer` is built and trained, then evaluated on the
    test split and on the validation split of `encoded_dataset`.

    NOTE: `model` is read from the notebook's global namespace and is not
    re-created here, so each round continues from the weights left by the
    previous round.

    Args:
        batch_size: per-device train and eval batch size.
        n_epochs: number of epochs per subset.
        compute_metrics: optional metrics callback handed to `Trainer`.
        output_model_dir: path stub; checkpoints for a subset of N rows are
            written under f"{output_model_dir}{N}", the layout that
            `compose_model_dir_path` reads back at inference time.
        learning_rate: optimizer learning rate.
        metric_name: metric used to pick the best checkpoint when
            `compute_metrics` is given.
        custom_sequential_sampler: iterable yielding index collections into
            `encoded_dataset['train']`.
        max_subsets: number of subsets to train on before stopping.

    Returns:
        (validation_scores, test_scores): two lists with one `Trainer.evaluate()`
        result per trained subset.
    """
    sequential_supervision_val_scores = []
    sequential_supervision_test_scores = []

    # Without a metrics callback only the loss is reported, so selecting the
    # best checkpoint by `metric_name` would fail; fall back to the loss.
    best_model_metric = metric_name if compute_metrics is not None else "loss"

    for idx, idx_batch in enumerate(custom_sequential_sampler):
        print(idx)
        if idx >= max_subsets:
            break
        # Key the directory on the subset size so the inference cells can locate it.
        sequence_n_output_dir = f"{output_model_dir}{len(idx_batch)}"
        batch_dataset = Subset(encoded_dataset['train'], idx_batch)
        print(f"Number of training data points: {len(idx_batch)}")
        args = TrainingArguments(
            output_dir=sequence_n_output_dir,
            evaluation_strategy="steps", #epoch",
            eval_steps = 500,
            save_strategy="steps", #epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=n_epochs,
            weight_decay=0.01,
            metric_for_best_model=best_model_metric,
            logging_dir='./logs',            # directory for storing logs*
            logging_steps=2000,
            # report_to='wandb',
            load_best_model_at_end = True,
            save_total_limit = 2,
        )

        trainer = Trainer(
            model,
            args,
            train_dataset=batch_dataset,
            eval_dataset=encoded_dataset['valid'],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            #callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
            #data_collator=data_collator
        )
        trainer.train()

        print("evaluating on test set")
        test_scores = trainer.evaluate(eval_dataset=encoded_dataset['test'])
        sequential_supervision_test_scores.append(test_scores)

        print("evaluating on validation set")
        val_scores = trainer.evaluate(eval_dataset=encoded_dataset['valid'])
        sequential_supervision_val_scores.append(val_scores)

    return sequential_supervision_val_scores, sequential_supervision_test_scores



In [41]:
sequential_supervision_val_scores, sequential_supervision_test_scores = train_model_series_with_sequential_sampler(compute_metrics=compute_metrics)
print([score['eval_accuracy'] for score in sequential_supervision_val_scores])
print([score['eval_accuracy'] for score in sequential_supervision_test_scores])

PyTorch: setting up devices
***** Running training *****
  Num examples = 10
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


Number of training data points: 10


Epoch,Training Loss,Validation Loss
1,No log,4.253873
2,No log,4.159801
3,No log,4.10235
4,No log,4.068579
5,No log,4.054615


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_2023080710/checkpoint-2
Configuration saved in data/models_2023080710/checkpoint-2/config.json
Model weights saved in data/models_2023080710/checkpoint-2/pytorch_model.bin
tokenizer config file saved in data/models_2023080710/checkpoint-2/tokenizer_config.json
Special tokens file saved in data/models_2023080710/checkpoint-2/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving mo

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 100
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 65
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 100


Epoch,Training Loss,Validation Loss
1,No log,3.527111
2,No log,3.39496
3,No log,3.33216
4,No log,3.308984
5,No log,3.302174


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_20230807100/checkpoint-13
Configuration saved in data/models_20230807100/checkpoint-13/config.json
Model weights saved in data/models_20230807100/checkpoint-13/pytorch_model.bin
tokenizer config file saved in data/models_20230807100/checkpoint-13/tokenizer_config.json
Special tokens file saved in data/models_20230807100/checkpoint-13/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 1000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 625
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 1000


Epoch,Training Loss,Validation Loss
1,No log,3.018964
2,No log,2.971426
3,No log,2.962867
4,No log,2.962022
5,No log,2.961788


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_202308071000/checkpoint-125
Configuration saved in data/models_202308071000/checkpoint-125/config.json
Model weights saved in data/models_202308071000/checkpoint-125/pytorch_model.bin
tokenizer config file saved in data/models_202308071000/checkpoint-125/tokenizer_config.json
Special tokens file saved in data/models_202308071000/checkpoint-125/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batc

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 10000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6250
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 10000


Epoch,Training Loss,Validation Loss
1,No log,2.771644
2,2.884500,2.733512
3,2.884500,2.721585
4,2.649800,2.717716
5,2.543100,2.719046


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_2023080710000/checkpoint-1250
Configuration saved in data/models_2023080710000/checkpoint-1250/config.json
Model weights saved in data/models_2023080710000/checkpoint-1250/pytorch_model.bin
tokenizer config file saved in data/models_2023080710000/checkpoint-1250/tokenizer_config.json
Special tokens file saved in data/models_2023080710000/checkpoint-1250/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 100000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 62500
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 100000


Epoch,Training Loss,Validation Loss
1,2.6163,2.512237
2,2.4888,2.451821
3,2.3927,2.422847
4,2.3525,2.407537
5,2.3159,2.402459


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_20230807100000/checkpoint-12500
Configuration saved in data/models_20230807100000/checkpoint-12500/config.json
Model weights saved in data/models_20230807100000/checkpoint-12500/pytorch_model.bin
tokenizer config file saved in data/models_20230807100000/checkpoint-12500/tokenizer_config.json
Special tokens file saved in data/models_20230807100000/checkpoint-12500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num exa

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 108000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 67500
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 108000


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [44]:
# Persist the evaluation metrics collected by the sequential-training run.
# The validation scores go to the working directory, the test scores next to
# the model checkpoints. `MODEL_DIR_NEW` was never defined, so use `MODEL_DIR`.
MODEL_DIR.mkdir(parents=True, exist_ok=True)  # the directory may not exist yet
with open('sequential_trainerer_val_agnews_generative.json', 'w') as f:
    json.dump(sequential_supervision_val_scores, f)
with open(MODEL_DIR.joinpath("sequential_trainerer_test_agnews_generative.json"), "w") as f:
    json.dump(sequential_supervision_test_scores, f)

### inference

In [16]:
# Path stub that the training-set size is appended to (no separator).
MODEL_DIR_STUB = Path("data/models_20230807")


def compose_model_dir_path(model_dir_stub=None, num_data_points=None, checkpoint_num=None):
    """Build, print and return the checkpoint directory for one training run.

    The run directory is the stub immediately followed by the number of
    training points; the checkpoint folder inside it is
    ``checkpoint-<checkpoint_num>``.
    """
    run_dir = f"{str(model_dir_stub)}{num_data_points}"
    checkpoint_dir = f"{run_dir}/checkpoint-{checkpoint_num}"
    print(checkpoint_dir)
    return checkpoint_dir

# Checkpoint directory per training-set size; each pair is
# (number of training points, checkpoint number to load).
model_dir_paths = {
    n_points: compose_model_dir_path(MODEL_DIR_STUB, n_points, checkpoint_num)
    for n_points, checkpoint_num in (
        (10, 10),
        (100, 65),
        (1000, 625),
        (10000, 6250),
        (100000, 62500),
    )
}


data/models_2023080710/checkpoint-10
data/models_20230807100/checkpoint-65
data/models_202308071000/checkpoint-625
data/models_2023080710000/checkpoint-6250
data/models_20230807100000/checkpoint-62500


define utils

In [17]:
# Number of tokens generated per prompt; also the default of generate_preds_for_model_trained_on_n_data_points.
max_new_tokens=3
# Switches the shared tokenizer to left padding from here on (it was 'right' while the training data was built).
tokenizer.padding_side = "left" 
tokenizer.pad_token = tokenizer.eos_token # to avoid an error
# Instruction placed in front of every inference prompt by preprocess_inference_data.
prefix_test = f"Classify the following news article as one amongst the following topics 'World', 'Sports', 'Business' or 'Sci/Tech'."
# Use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [19]:
def preprocess_inference_data(examples):
    """Build instruction-style prompts for a batch of rows and tokenize them.

    Each prompt is `prefix_test + prefix1 + article + prefix2`. The batch is
    padded to its longest row, capped at 128 tokens, and moved to `device`.

    NOTE(review): truncation at 128 tokens presumably cuts from the end of the
    prompt, which would drop the `prefix2` suffix for long articles — confirm.

    Args:
        examples: batch with "text" and "label" columns.

    Returns:
        The tokenizer output plus "text" (the full prompts) and "answer"
        (the expected topic names).
    """
    prompts = []
    for article in examples["text"]:
        prompts.append(prefix_test + prefix1 + article + prefix2)
    expected_topics = [id2label[label_id] for label_id in examples["label"]]

    tokenizer_options = {
        "padding": True,  # or =do_not_pad, and set as 'left' padding before preprocessing
        "max_length": 128,
        "truncation": True,
        "return_tensors": 'pt',
    }
    encoded = tokenizer(prompts, **tokenizer_options).to(device)
    encoded["text"] = prompts
    encoded["answer"] = expected_topics
    return encoded


def load_model_trained_on_n_data_points(num_data_pts, device=device):
    """Load the causal LM checkpoint registered for a given training-set size.

    Args:
        num_data_pts: key into `model_dir_paths` (number of training points).
        device: device the loaded model is moved to.

    Returns:
        The model loaded from the registered checkpoint directory, on `device`.

    Raises:
        KeyError: if no checkpoint is registered for `num_data_pts`.
    """
    try:
        model_dir = model_dir_paths[num_data_pts]
    except KeyError:
        raise KeyError(
            f"no checkpoint registered for {num_data_pts} training points; "
            f"known sizes: {sorted(model_dir_paths)}"
        ) from None
    print(model_dir)
    # `problem_type` belongs to sequence-classification heads and has no effect
    # on a causal LM, so it is no longer passed.
    model = AutoModelForCausalLM.from_pretrained(model_dir).to(device)
    return model


def generate_preds_for_model_trained_on_n_data_points(num_data_pts, test_dataset=encoded_dataset['test'], max_new_tokens=max_new_tokens):
    """Generate a short completion for every test prompt and save (prediction, answer) pairs.

    The "text" column of `test_dataset` is tokenized as-is and fed to the model
    trained on `num_data_pts` points; only the newly generated tokens are
    decoded.

    NOTE(review): the default dataset keeps the raw article in "text" (only
    "label" is removed during preprocessing), so no prompt prefixes are added
    for it. Pass a dataset whose "text" already holds the full prompt if that
    is what the model should see.

    Args:
        num_data_pts: training-set size selecting the checkpoint to load.
        test_dataset: dataset whose rows provide "text" and "answer".
        max_new_tokens: maximum number of tokens generated per prompt.

    Returns:
        List of (generated_text, answer) pairs in dataset order. The same list
        is written as JSON to
        data/predictions/genai_agnews_clf_pred_labels_numdatapts_<num_data_pts>.json.
    """
    model = load_model_trained_on_n_data_points(num_data_pts)
    model.config.pad_token_id = model.config.eos_token_id

    def _collate_text_and_answer(rows):
        # Keep only the columns used below; the default collate cannot stack
        # token-id lists of different lengths.
        return {
            "text": [row["text"] for row in rows],
            "answer": [row["answer"] for row in rows],
        }

    data_loader = DataLoader(test_dataset, batch_size=8, collate_fn=_collate_text_and_answer)

    preds_flattened = []
    for batch in tqdm(data_loader):
        encoding = tokenizer(batch['text'], padding=True, return_tensors='pt').to(device)
        with torch.no_grad():
            # NOTE(review): `top_k` presumably has no effect unless sampling is enabled — confirm.
            generated_ids = model.generate(top_k=5, eos_token_id=model.config.eos_token_id, max_new_tokens=max_new_tokens, **encoding)
        # The output starts with the padded prompt. Slice by prompt length so an
        # early stop (fewer than max_new_tokens generated) never pulls prompt
        # tokens into the prediction.
        prompt_length = encoding['input_ids'].shape[1]
        generated_texts = tokenizer.batch_decode(generated_ids[:, prompt_length:], skip_special_tokens=True)
        preds_flattened.extend(zip(generated_texts, batch['answer']))

    output_path = Path(f'data/predictions/genai_agnews_clf_pred_labels_numdatapts_{num_data_pts}.json')
    # Create the folder up front so a long inference run is not lost at write time.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(preds_flattened, f)
    return preds_flattened

In [20]:
test_data = raw_dataset['test'].map(preprocess_inference_data, batched=True, batch_size=BATCH_SIZE, remove_columns=["label"])
test_data

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'answer'],
    num_rows: 7600
})

#### try it out on a sample data point

In [21]:
sampler_idx = 2
sample = test_data[sampler_idx]
sample_input = tokenizer.decode(sample['input_ids'])
len(sample['input_ids']), sample_input, sample['answer'], sample['text']

(128,
 "<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Classify the following news article as one amongst the following topics 'World', 'Sports', 'Business' or 'Sci/Tech'.Given news article: Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.The news article topic is: ",
 'Sci/Tech',
 "Classify the following news article as one amongst the following topics 'World', 'Sports', 'Busi

In [22]:
texts = test_data['text'][:2]
encoding = tokenizer(texts, padding=True, return_tensors='pt').to(device)
model = load_model_trained_on_n_data_points(1000).to(device)

with torch.no_grad():
    generated_ids = model.generate(**encoding, max_new_tokens=max_new_tokens)
generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
generated_texts

data/models_202308071000/checkpoint-625


["Classify the following news article as one amongst the following topics 'World', 'Sports', 'Business' or 'Sci/Tech'.Given news article: Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.The news article topic is:  World, Business",
 "Classify the following news article as one amongst the following topics 'World', 'Sports', 'Business' or 'Sci/Tech'.Given news article: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\\privately funded suborbital space flight, has officially announced the first\\launch date for its manned rocket.The news article topic is:  Sci/Tech"]

#### run on full test set

In [19]:
for num_data_pts in [100, 1000, 10000, 100000]:
    preds = generate_preds_for_model_trained_on_n_data_points(num_data_pts)


data/models_20230807100/checkpoint-65



KeyboardInterrupt



### Scoring

In [23]:
from evaluate import load, EvaluationModule
import numpy
import json 
from tqdm import tqdm
# Import the module, not the function: the timing code below calls `time.time()`,
# which fails if `time` is bound to the function.
import time
# Scorers used to map generated text onto one of the reference class names.
bertscore = load("bertscore")
rouge = load("rouge")
classes = ['World', 'Sports', 'Business', 'Sci/Tech']
from typing import Tuple
def score_gen_text_and_assign_label(gen_text, true_label, scorer:EvaluationModule=rouge, reference_classes=classes, verbose=False):
    """Assign the generated text to the reference class it scores highest against.

    The text is scored against each class name separately: BERTScore precision
    when `scorer.name == "bert_score"`, otherwise the scorer's `rougeL` value.
    Ties resolve to the first class in `reference_classes`.

    Args:
        gen_text: text produced by the model.
        true_label: expected class name.
        scorer: metric object exposing `.name` and `.compute(...)`.
        reference_classes: candidate class names, in label-id order.
        verbose: print the chosen and the true label.

    Returns:
        (predicted_index, true_label_id): index into `reference_classes` of the
        best-scoring class, and `label2id.get(true_label)` (None if unknown).

    Raises:
        ValueError: if `reference_classes` is empty.
    """
    if not reference_classes:
        raise ValueError("reference_classes must contain at least one class name")

    use_bertscore = scorer.name == "bert_score"
    class_scores = []
    for ref in reference_classes:
        if use_bertscore:
            result = scorer.compute(predictions=[gen_text], references=[ref], lang="en")
            class_scores.append(result['precision'])
        else:
            result = scorer.compute(predictions=[gen_text], references=[ref])
            class_scores.append(result['rougeL'])

    pred_label = numpy.array(class_scores).argmax()
    if verbose:
        print(f"for generated label {gen_text}, highest scoring label is {reference_classes[pred_label]}, true label: {true_label} \n")
    return pred_label, label2id.get(true_label)

def score_preds(num_data_pts, scorer=bertscore):
    """Compute accuracy for the saved predictions of one training-set size.

    Reads data/predictions/genai_agnews_clf_pred_labels_numdatapts_<num_data_pts>.json,
    maps every generated text to a class with `score_gen_text_and_assign_label`
    and counts matches with the true label.

    Args:
        num_data_pts: training-set size identifying the prediction file.
        scorer: metric object used to map generated text to a class.

    Returns:
        Fraction of predictions whose assigned class equals the true class.

    Raises:
        ValueError: if the prediction file contains no entries.
    """
    predictions_path = f'data/predictions/genai_agnews_clf_pred_labels_numdatapts_{num_data_pts}.json'
    with open(predictions_path, 'r') as f:
        preds = json.load(f)
    if not preds:
        # Accuracy is undefined here; fail with a clear message instead of a ZeroDivisionError.
        raise ValueError(f"no predictions found in {predictions_path}")

    n_correct = 0
    for gen_text, true_label in tqdm(preds):
        pred_label_id, true_label_id = score_gen_text_and_assign_label(gen_text=gen_text, true_label=true_label, scorer=scorer)
        if pred_label_id == true_label_id:
            n_correct += 1

    acc = n_correct / len(preds)
    print(scorer.name, num_data_pts, acc)
    return acc

    

In [None]:
num_data_pts = 100000
acc = score_preds(num_data_pts, scorer=rouge)
acc

In [26]:
# Score the saved predictions for every training-set size and report the
# wall-clock time spent per size.
acc_scores = []
for num_data_pts in (100, 1000, 10000, 100000):
    start_time = time.time()

    for scorer in (rouge,):
        acc = score_preds(num_data_pts, scorer=scorer)
        acc_scores += [(num_data_pts, acc)]
    elapsed_seconds = time.time()-start_time
    print(f"Completed in {elapsed_seconds} s")

100%|██████████| 7600/7600 [2:08:39<00:00,  1.02s/it]s]


rouge 100 0.3215789473684211
Completed in 7719.489288806915 s


100%|██████████| 7600/7600 [2:08:46<00:00,  1.02s/it]t]


rouge 10000 0.6372368421052632
Completed in 7726.810903787613 s


100%|██████████| 7600/7600 [2:08:37<00:00,  1.02s/it]  

rouge 100000 0.5827631578947369
Completed in 7717.722070217133 s





In [27]:
import time
print(time.time())

1699139047.2175796


In [28]:
acc_scores

[(100, 0.3215789473684211),
 (1000, 0.6001315789473685),
 (10000, 0.6372368421052632),
 (100000, 0.5827631578947369)]