# Text classification

Things to do:
do you want to use collate_fn in data_loader

### imports and globals

In [2]:
T5_SMALL = "t5-small"
GPT = "gpt2"  # 117M parameters as per https://huggingface.co/transformers/v3.3.1/pretrained_models.html # "openai-gpt"
DISTILBERT = "distilbert-base-uncased"

In [3]:
# opt
from collections import defaultdict

# mandatory imports
from pathlib import Path
from datasets import load_dataset

import torch
from transformers import AutoTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
from torch.utils.data import random_split
import collections
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
import numpy as np
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding


## Globals

In [4]:
VALID_SIZE = 0.1
BATCH_SIZE = 8
SEED = 42
METRIC_NAME = "f1"
N_EPOCHS = 10

### load dataset

In [5]:
#raw_dataset = load_dataset('super_glue', 'cb', cache_dir="./datasets/.cache/huggingface_datasets")
raw_dataset = load_dataset('ag_news', cache_dir="./datasets/.cache/huggingface_datasets")

Found cached dataset ag_news (/home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
id2label = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

label2id = {v:k for k,v in id2label.items()}

### explore dataset

In [7]:
{k: len(raw_dataset[k]) for k in raw_dataset}

{'train': 120000, 'test': 7600}

In [8]:
raw_dataset['test'][0] 

{'text': "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.",
 'label': 2}

## Preprocess


In [9]:
tokenizer = AutoTokenizer.from_pretrained(GPT)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'  # 'left'

# Use `DataCollatorWithPadding` as it is more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding to max length
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [10]:
# max_length = 384
stride = 128

max_input_length = 364
max_target_length = 32


prefix1 = "Given news article: "
prefix2 = "The news article topic is: "


def preprocess_data(examples):
    context = [prefix1 + doc for doc in examples["text"] ]
    #topics = [prefix2 + q.strip() for q in examples["label"]]
    topic_labels = [id2label[q] for q in examples["label"]]
    prepped_topics = [prefix2+ topic for topic in topic_labels]
    
    inputs = tokenizer(
        context,
        prepped_topics,
        max_length=max_input_length,
        truncation="only_first",
        #stride=stride,
        #return_overflowing_tokens=True,
        #return_offsets_mapping=True,
        padding="max_length",
    )
    inputs["labels"] = inputs["input_ids"].copy()

    return inputs

def preprocess_test_data(examples):
    context = [prefix1 + doc + prefix2 for doc in examples["text"] ]
    #topics = [prefix2 + q.strip() for q in examples["label"]]
    topic_labels = [id2label[q] for q in examples["label"]]
    #prepped_topics = [prefix2+ topic for topic in topic_labels]
    
    inputs = tokenizer(
        context,
    )
    inputs["labels"] = inputs["input_ids"].copy()
    inputs["answer"] = topic_labels

    return inputs

test_dataset = raw_dataset['test'].map(preprocess_test_data, batched=True, batch_size=BATCH_SIZE, remove_columns="label")



Loading cached processed dataset at /home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-a96aeb96b81ccdbb.arrow


In [11]:
tokenizer.decode(test_dataset[0]['labels'], skip_special_tokens=False)

"Given news article: Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.The news article topic is: "

#### encode dataset

In [12]:
def prepare_dataset(ds, tokenizer, create_validation_split=False, validation_prop=0.1, remove_columns=[]):
    train_dataset = ds['train'].map(preprocess_data, batched=True, remove_columns=remove_columns)
    test_dataset = ds['test'].map(preprocess_test_data, batched=True, remove_columns=remove_columns)

    train_dataset.set_format("torch")
    validation_dataset = None
    # carve out validation set from train
    print(f"creating validation split: {str(create_validation_split)}")
    if create_validation_split:
        # had forgotten to add random split earlier during the seq training exercise
        train_dataset, validation_dataset= train_dataset.train_test_split(test_size=validation_prop).values()
    return {'train': train_dataset, 'valid': validation_dataset, 'test': test_dataset}


In [13]:
# tokenize 
#encoded_dataset = raw_dataset.map(preprocess_data, batched=True, batch_size=BATCH_SIZE, remove_columns="label")
encoded_dataset = prepare_dataset(raw_dataset, tokenizer, create_validation_split=True,  remove_columns=["label"])


Loading cached processed dataset at /home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-6a1730659d4cc655.arrow
Loading cached processed dataset at /home/jovyan/llm_peft_exploration/notebooks/datasets/.cache/huggingface_datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-51aa51635d5b0458.arrow


creating validation split: True


In [14]:
print(tokenizer.decode(encoded_dataset['train'][2]['input_ids'], skip_special_tokens=True), '\n')

# we have intentionally not padded test sentences
print(tokenizer.decode(encoded_dataset['test'][2]['input_ids'], skip_special_tokens=True), '\n')
print(tokenizer.decode(encoded_dataset['test'][2]['input_ids'], skip_special_tokens=False))


Given news article: Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.The news article topic is:  

Given news article: Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.The news article topic is: 


### tokenise, training loop

In [21]:
MODEL_DIR = Path(f"data/models_20230806")
MODEL_DIR.mkdir(exist_ok=True, parents=True)

In [22]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(GPT, 
problem_type="multi_label_classification_with_generative_lm")

model.config.pad_token_id = model.config.eos_token_id


In [None]:
VALID_SIZE = 0.1
BATCH_SIZE = 8
SEED = 42
METRIC_NAME = "f1"
N_EPOCHS = 100

In [24]:
import wandb
wandb.login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


True

In [25]:
args = TrainingArguments(
    output_dir=MODEL_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    #metric_for_best_model=METRIC_NAME,
    logging_dir='./logs',            # directory for storing logs*
    logging_steps=2000,
    report_to='wandb',
    save_total_limit = 5,
)
    

In [93]:

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['valid'],
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics,
    data_collator=data_collator,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [94]:
#torch.cuda.empty_cache()
#model.device

In [95]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mpmitra01[0m ([33mdiscoverylab[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669316933333296, max=1.0…

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss



KeyboardInterrupt



In [98]:
from transformers import GenerationConfig
generation_config = GenerationConfig(
    max_new_tokens=3, do_sample=True, top_k=5, eos_token_id=model.config.eos_token_id
)
out = model.generate(**inputs, generation_config=generation_config)
out

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[11505, 18121, 12914,  3517, 15069, 40136, 24087,  1883,   287, 20357,
           383,  3517,  2732,   329,  7868,   290, 20389,   357,    35,    69,
          1546,     8,  2904,  5611,   257,   366, 22648, 36757,    78,     1,
          1923,    11,   351,   262, 23619, 27339,  6778,   286, 36267,   262,
          1306,  5270,   286,  3517, 17245,    13,  8989,    11,   484,   635,
         31230,   510,   351,   262,  2647,  2831,   357,  3620,    40,    11,
           290,  2972,  7912,     8,   284,   787,   428,  2968,    13,   412,
          8895,   468,  5729, 22368,   511,   886,   880,    11,   523,   326,
          1751,   287,   674,  4266,   481,   783,   307, 41201,  3898,   546,
           262,  4416,  1483,   286, 22023,  2647,    13,   464, 17010,   290,
          2709,  4355,   286,   428,  1392,   284,   502,   257,  1310,    11,
           523,   314,  2630,   281,  1280,  3850,   284,   262,   360,    69,
          1546,   546,   340,    13,  8989,    11,  

### attempt at openprompt

In [17]:
from openprompt.plms import load_plm
MODEL_DIR_STUB = Path(f"data/models_20230807")
def compose_model_dir_path(model_dir_stub=None, num_data_points=None, checkpoint_num=None):
    dir_path = f"{str(model_dir_stub)}{num_data_points}/checkpoint-{checkpoint_num}"
    print(dir_path)
    return dir_path

model_dir_paths = {10: compose_model_dir_path(MODEL_DIR_STUB, 10, 10),
                   100: compose_model_dir_path(MODEL_DIR_STUB, 100, 65),
                   1000: compose_model_dir_path(MODEL_DIR_STUB, 1000, 625),
                   10000: compose_model_dir_path(MODEL_DIR_STUB, 10000, 6250),
                   100000: compose_model_dir_path(MODEL_DIR_STUB, 100000, 62500)
                  } 


data/models_2023080710/checkpoint-10
data/models_20230807100/checkpoint-65
data/models_202308071000/checkpoint-625
data/models_2023080710000/checkpoint-6250
data/models_20230807100000/checkpoint-62500


In [18]:
num_data_pts = 100000
plm, tokenizer, model_config, WrapperClass = load_plm('gpt', model_dir_paths.get(num_data_pts))

You are using a model of type gpt2 to instantiate a model of type openai-gpt. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at data/models_20230807100000/checkpoint-62500 were not used when initializing OpenAIGPTLMHeadModel: ['transformer.h.1.attn.masked_bias', 'transformer.wpe.weight', 'transformer.wte.weight', 'transformer.ln_f.bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.ln_f.weight', 'transformer.h.3.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.0.attn.masked_bias']
- This IS expected if you are initializing OpenAIGPTLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a

In [46]:
plm.to('cpu')
print(plm.device)
type(plm), 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'

cpu


(transformers.models.openai.modeling_openai.OpenAIGPTLMHeadModel,
 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel')

In [77]:
classes = ['World', 'Sports', 'Business', 'Sci/Tech']
id2label_dict = {idx:k for idx, k in enumerate(classes)}
label2id_dict = {v:k for k,v in id2label_dict.items()}

In [27]:
raw_dataset['train'][0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

define verbalizer and promptmodel

In [79]:
# ## Define the verbalizer
# In classification, you need to define your verbalizer, which is a mapping from logits on the vocabulary to the final label probability. Let's have a look at the verbalizer details:
from openprompt.prompts import ManualVerbalizer, GenerationVerbalizer, SoftVerbalizer
import torch
# for example the verbalizer contains multiple label words in each class
# myverbalizer = SoftVerbalizer(tokenizer, plm, num_classes=4,
#          label_words=["politics", "sports", "business", "technology"])
# or without label words\
#classes = ["politics", "sports", "business", "technology"]

#myverbalizer = SoftVerbalizer(tokenizer, plm, num_classes=4,  label_words=classes)
   #                            label_words=classes)
#'''
myverbalizer = GenerationVerbalizer(
    tokenizer,
    num_classes=4,
    classes=classes,
    label_words={
        "World": ["World", "world"],
        "Sports": ["Sports", "sports", "sport"],
        "Business": ["Business", "business"],
        "Sci/Tech": ["Sci/Tech", "sci/tech", "science",  "technology", "sci tech"],
    }
)
#'''
print('verbalizer tokens', myverbalizer)

verbalizer tokens GenerationVerbalizer()


In [80]:
# Constructing Template
# A template can be constructed from the yaml config, but it can also be constructed by directly passing arguments.
from openprompt.prompts import ManualTemplate, SoftTemplate
template_text = '{"placeholder":"text_a"}. The topic of this news article is {"mask"}'
mytemplate = ManualTemplate(tokenizer=tokenizer, text=template_text)


In [84]:
from openprompt import PromptForClassification, PromptForGeneration

use_cuda = False
#prompt_model = PromptForClassification(plm=plm,template=mytemplate, verbalizer=myverbalizer, freeze_plm=True)
prompt_model = PromptForGeneration(plm=plm,template=mytemplate, freeze_plm=True, plm_eval_mode=True)

if use_cuda:
    
    prompt_model=  prompt_model.cuda()

#logits = torch.randn(2,len(tokenizer)) # creating a pseudo output from the plm, and
#label_logits = myverbalizer.process_logits(logits)
#print('verbalizer', label_logits) # see what the verbalizer does
#torch.argmax(label_logits, -1)

define dataloader

In [85]:
from openprompt.data_utils import InputExample
dataset = {}
for split in ['test']:
    dataset[split] = []
    for idx, data in enumerate(raw_dataset[split]):
        input_example = InputExample(text_a = data['text'], label=int(data['label']), guid=idx)
        dataset[split].append(input_example)
print(dataset['test'][0])

{
  "guid": 0,
  "label": 2,
  "meta": {},
  "text_a": "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.",
  "text_b": "",
  "tgt_text": null
}



In [86]:
# To better understand how does the template wrap the example, we visualize one instance.
wrapped_example = mytemplate.wrap_one_example(dataset['test'][10])
print(wrapped_example)

[[{'text': 'Group to Propose New High-Speed Wireless Format  LOS ANGELES (Reuters) - A group of technology companies  including Texas Instruments Inc. &lt;TXN.N&gt;, STMicroelectronics  &lt;STM.PA&gt; and Broadcom Corp. &lt;BRCM.O&gt;, on Thursday said they  will propose a new wireless networking standard up to 10 times  the speed of the current generation.', 'loss_ids': 0, 'shortenable_ids': 1}, {'text': '. The topic of this news article is', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}], {'guid': 10, 'label': 3}]


In [88]:
from openprompt import PromptDataLoader
BATCH_SIZE=32
test_dataloader = PromptDataLoader(dataset=dataset["test"], template=mytemplate, verbalizer=myverbalizer, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=BATCH_SIZE,shuffle=False, teacher_forcing=False, predict_eos_token=True, #False
    truncate_method="head")



The example already has tgt_text Business, and will be filled with new label words, is this intended?
The example already has tgt_text Sci/Tech, and will be filled with new label words, is this intended?
The example already has tgt_text Sci/Tech, and will be filled with new label words, is this intended?
The example already has tgt_text Sci/Tech, and will be filled with new label words, is this intended?
The example already has tgt_text Sci/Tech, and will be filled with new label words, is this intended?
The example already has tgt_text Sci/Tech, and will be filled with new label words, is this intended?
The example already has tgt_text Sci/Tech, and will be filled with new label words, is this intended?
The example already has tgt_text Sci/Tech, and will be filled with new label words, is this intended?
The example already has tgt_text Sci/Tech, and will be filled with new label words, is this intended?
The example already has tgt_text Sci/Tech, and will be filled with new label words

AssertionError: You cannot use ``already_has_special_tokens=False`` with this tokenizer. Please use a slow (full python) tokenizer to activate this argument. Or set `return_special_tokens_mask=True` when calling the encoding method to get the special tokens mask in any tokenizer. 

In [90]:
for i, batch in enumerate(test_dataloader):
    if i>1:
        break
    print(batch)

{"input_ids": [[69, 451, 50256, 6513, 50256, 50256, 50256, 79, 641, 952, 50256, 1878, 660, 50256, 16620, 50256, 24592, 50256, 15603, 259, 50256, 28816, 50256, 64, 50256, 15344, 68, 50256, 3605, 282, 50256, 11400, 50256, 1169, 50256, 283, 50256, 50256, 6381, 1324, 1563, 68, 50256, 50256, 1878, 660, 50256, 16620, 50256, 39289, 50256, 301, 1173, 365, 50256, 11730, 50256, 69, 343, 50256, 69, 5702, 64, 50256, 76, 519, 84, 50256, 50256, 50256, 400, 50256, 4852, 72, 50256, 78, 50256, 400, 72, 50256, 3605, 50256, 433, 291, 75, 50256, 72, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50

evaluate

In [56]:

from tqdm import tqdm
use_cuda = True
def evaluate_on_test(test_dataloader, prompt_model):
    allpreds = []
    alllabels = []
    prompt_model.eval()
    for step, inputs in tqdm(enumerate(test_dataloader)):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        alllabels.extend(labels.cpu().tolist())
        allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
        #print(alllabels)
        #print(allpreds)
    acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
    print("test:", acc)  # roughly ~0.85
    return acc, alllabels, allpreds

_ = evaluate_on_test(test_dataloader, prompt_model)

238it [00:48,  4.89it/s]

test: 0.25





In [59]:
from sklearn.metrics import accuracy_score
accuracy_score(_[1], _[2])
_[1][:3], _[2][:3]

([2, 3, 3], [1, 1, 1])

In [41]:
wrapped_example = mytemplate.wrap_one_example(dataset['test'][0])
print(wrapped_example)

[[{'text': "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.", 'loss_ids': 0, 'shortenable_ids': 1}, {'text': '. The topic of this news article is', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}], {'guid': 0, 'label': 2}]


### inference

In [15]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, AutoConfig

MODEL_DIR_STUB = Path(f"data/models_20230807")
def compose_model_dir_path(model_dir_stub=None, num_data_points=None, checkpoint_num=None):
    dir_path = f"{str(model_dir_stub)}{num_data_points}/checkpoint-{checkpoint_num}"
    print(dir_path)
    return dir_path

model_dir_paths = {10: compose_model_dir_path(MODEL_DIR_STUB, 10, 10),
                   100: compose_model_dir_path(MODEL_DIR_STUB, 100, 65),
                   1000: compose_model_dir_path(MODEL_DIR_STUB, 1000, 625),
                   10000: compose_model_dir_path(MODEL_DIR_STUB, 10000, 6250),
                   100000: compose_model_dir_path(MODEL_DIR_STUB, 100000, 62500)
                  } 


data/models_2023080710/checkpoint-10
data/models_20230807100/checkpoint-65
data/models_202308071000/checkpoint-625
data/models_2023080710000/checkpoint-6250
data/models_20230807100000/checkpoint-62500


In [16]:
model_dir = model_dir_paths[100000]
model = AutoModelForCausalLM.from_pretrained(model_dir, 
problem_type="multi_label_classification")

#config = AutoConfig.from_pretrained(model_dir)
model.config.pad_token_id = model.config.eos_token_id
tokenizer

PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'})

In [22]:
sampler_idx = 39
sample = encoded_dataset['test'][sampler_idx]
sample_input = tokenizer.decode(sample['input_ids'])
sample_input, sample['answer']

("Given news article: Afghan Army Dispatched to Calm Violence KABUL, Afghanistan - Government troops intervened in Afghanistan's latest outbreak of deadly fighting between warlords, flying from the capital to the far west on U.S. and NATO airplanes to retake an air base contested in the violence, officials said Sunday...The news article topic is: ",
 'World')

In [205]:
from transformers import pipeline
max_new_tokens = 10

prefix3 = f"Classify the following news article as one amongst the following topics 'World', 'Sports', 'Business' or 'Sci/Tech'."
prompt = prefix3 + sample_input

In [206]:
inputs = tokenizer(prompt, return_tensors='pt')['input_ids']
sample_gen_output = model.generate(inputs=inputs, max_new_tokens=max_new_tokens)
print(sample_gen_output.shape)

gen_label_start_index = inputs.shape[1]
gen_label_end_index = gen_label_start_index+max_new_tokens
gen_label_indices = torch.tensor(list(range(gen_label_start_index, gen_label_end_index)))
gen_label = torch.index_select(sample_gen_output, 1, gen_label_indices)
gen_label

torch.Size([1, 108])


tensor([[23611,    26, 10603,  3000,    11,  7320,    11,  7092,    11,  7092]])

In [207]:
print(tokenizer.batch_decode(gen_label))
tokenizer.batch_decode(sample_gen_output)

[' quot;World News, Business, Sports, Sports']


["Classify the following news article as one amongst the following topics 'World', 'Sports', 'Business' or 'Sci/Tech'.Given news article: Afghan Army Dispatched to Calm Violence KABUL, Afghanistan - Government troops intervened in Afghanistan's latest outbreak of deadly fighting between warlords, flying from the capital to the far west on U.S. and NATO airplanes to retake an air base contested in the violence, officials said Sunday...The news article topic is:  quot;World News, Business, Sports, Sports"]

In [211]:
for i, sample in tqdm(enumerate(encoded_dataset['test'])):
    next

0it [00:00, ?it/s]

In [215]:
max_new_tokens = 10
prefix_test = f"Classify the following news article as one amongst the following topics 'World', 'Sports', 'Business' or 'Sci/Tech'."
preds = []
for i, sample in enumerate(tqdm(encoded_dataset['test'])):
    sample = encoded_dataset['test'][i]
    sample_input = tokenizer.decode(sample['input_ids'])
    prompt = prefix_test + sample_input
    inputs = tokenizer(prompt, return_tensors='pt')['input_ids']
    sample_gen_output = model.generate(inputs=inputs, max_new_tokens=max_new_tokens)

    gen_label_start_index = inputs.shape[1]
    gen_label_end_index = gen_label_start_index+max_new_tokens
    gen_label_indices = torch.tensor(list(range(gen_label_start_index, gen_label_end_index)))
    gen_label = torch.index_select(sample_gen_output, 1, gen_label_indices)

    gen_label_text = tokenizer.batch_decode(gen_label)
    preds.append((i, gen_label_text, sample['answer']))
    #print(preds[i])
    
    

  0%|          | 0/7600 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [217]:
import json
with open('genai_agnews_clf_pred_labels.json', 'w+') as f:
    json.dump(preds, f)

In [24]:
import json

with open('genai_agnews_clf_pred_labels.json', 'r') as f:
    preds = json.load(f)


In [25]:
preds[42]

[42, [' #151; World Politics.The news article topic'], 'World']

In [27]:
from evaluate import load, EvaluationModule
bertscore = load("bertscore")
predictions = ["sc-fi news ", "sc-fi news", "sc-fi news", "sc-fi news"]
references = ['World', 'Sports', 'Business', 'Sci/Tech']# science technology']
results = bertscore.compute(predictions=predictions, references=references, lang="en")
#np.array(results['recall'])
results

{'precision': [0.8486124277114868,
  0.8483623266220093,
  0.8456303477287292,
  0.871155858039856],
 'recall': [0.8729932308197021,
  0.8720740079879761,
  0.867979109287262,
  0.8111729025840759],
 'f1': [0.860630214214325,
  0.8600547313690186,
  0.8566589951515198,
  0.840095043182373],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.19.0)'}

In [28]:
bertscore = load("bertscore")
rouge = load("rouge")
label2id

{'World': 0, 'Sports': 1, 'Business': 2, 'Sci/Tech': 3}

In [29]:
from evaluate import load, EvaluationModule

def score_and_pred_label(gen_text, true_label, scorer:EvaluationModule=rouge, reference_classes=references):
    #print(f'Using metric: {scorer.name}')
    #scorer = load(metric)
    scores_across_classes = []
    for ref in reference_classes:
        assert isinstance(gen_text, list)
        if scorer.name=="bert_score":
            scores_across_classes.append(scorer.compute(predictions=gen_text, references=[ref], lang="en"))
            precision_scores = [score['precision'] for score in scores_across_classes]

        else:
            scores_across_classes.append(scorer.compute(predictions=gen_text, references=[ref]))
            precision_scores = [score['rougeL'] for score in scores_across_classes]
    #print(precision_scores)
    pred_label = np.array(precision_scores).argmax()
    #print(f"for generated label {gen_text}, highest scoring label is {reference_classes[pred_label]}, true label: {true_label} \n")
    return pred_label, label2id.get(true_label)

In [30]:
idx = 21
_, gen_text, true_label = preds[idx]
score_and_pred_label(gen_text=gen_text, true_label=true_label,  scorer=bertscore)
score_and_pred_label(gen_text=gen_text, true_label=true_label, scorer=rouge)
     

(3, 3)

In [None]:
pred_labels_bertscore = []
true_labels = []
tp = 0
for idx, item in enumerate(preds):
    _, gen_text, true_label = item
    pred_label, true_label = score_and_pred_label(gen_text=gen_text, true_label=true_label, scorer=rouge)
    if pred_label==true_label:
        tp+=1
    pred_labels_bertscore.append(pred_label)
    true_labels.append(true_label)


In [None]:
acc = tp/len(preds)
acc

### langchain

In [None]:
class CustomLLM(LLM):
    n: int

    @property
    def _llm_type(self) -> str:
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")
        return prompt[: self.n]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"n": self.n}

In [182]:
llm = CustomLLM(n=10, model=model_dir)



### Experiment: Sequential training

test how the accuracy improves with batches of training data

https://www.scottcondron.com/jupyter/visualisation/audio/2020/12/02/dataloaders-samplers-collate.html#Samplers

In [31]:
import math
import random
from torch.utils.data.sampler import Sampler

In [32]:
BASE = 10
BATCH_SIZE = 8

In [35]:
def chunk(indices, chunk_size=-1):
    if chunk_size<1:
        chunk_size = len(indices)
    return torch.split(torch.tensor(indices), chunk_size)

class SequentialTrainingBatchSampler(Sampler):
    def __init__(self, dataset, batch_size, shuffle=False, base=10):
        max_segments = int(math.log(len(dataset), base))
        self.log_index_markers = [np.power(base, sgmt) for sgmt in range(1, max_segments+1)]
        if max_segments < math.log(len(dataset)):
            self.log_index_markers.append(len(dataset))
        self.indices_lists = [list(range(marker)) for marker in self.log_index_markers]
        self.shuffle = shuffle
        self.batch_size = batch_size
    
    def __iter__(self):
        if self.shuffle:
            for indices_list in self.indices_lists:
                random.shuffle(indices_list)
       ## print(self.indices_lists)
        segment_batches  = [chunk(segment_indices_list, self.batch_size) for segment_indices_list in self.indices_lists]

        #combined = [[batch.tolist() for batch in segment] for segment in segment_batches]
        combined = [batch.tolist() for segment in segment_batches for batch in segment]

        if self.shuffle:
            random.shuffle(combined)
        return iter(combined)
    
    def __len__(self):
        return  len(self.indices_lists)#sum([len(segment_indices) for segment_indices in self.indices_lists]) #// self.batch_size

In [36]:
from torch.utils.data import BatchSampler, SequentialSampler, DataLoader
BASE=10
custom_sequential_sampler = SequentialTrainingBatchSampler(encoded_dataset['train'], batch_size=-1, base=BASE)
for i, batch in enumerate(custom_sequential_sampler):
    print(f"Batch number #{i}:  {batch}")
    if i>=1:
        break

Batch number #0:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Batch number #1:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [41]:
# do for 10, 50, 100, 1000, 20000, 100000
MODEL_DIR = Path(f"data/models_20230807")

from torch.utils.data import Subset
total_subset_idx = []
sequential_supervision_val_scores = []
sequential_supervision_test_scores = []
for idx, idx_batch in enumerate(custom_sequential_sampler):
    #if idx<4:
    batch_dataset = Subset(encoded_dataset['train'], idx_batch)
    print(f"Number of training data points: {len(idx_batch)}")
    args = TrainingArguments(
        output_dir=str(MODEL_DIR)+f"{len(idx_batch)}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    #metric_for_best_model=METRIC_NAME,
    logging_dir='./logs',            # directory for storing logs*
    logging_steps=2000,
    report_to='wandb',
    save_total_limit = 2,
    )

    trainer = Trainer(
                model,
                args,
                train_dataset=batch_dataset,
                eval_dataset=encoded_dataset['valid'],
                tokenizer=tokenizer,
                data_collator=data_collator
            )
    trainer.train()
    
    print(f"evaluating on validation set")
    trainer.eval_dataset=encoded_dataset['valid']
    val_scores = trainer.evaluate()        
    sequential_supervision_val_scores.append(val_scores)
    
    print(f"evaluating on test set")
    #prep


    #trainer.save()

PyTorch: setting up devices
***** Running training *****
  Num examples = 10
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


Number of training data points: 10


Epoch,Training Loss,Validation Loss
1,No log,4.253873
2,No log,4.159801
3,No log,4.10235
4,No log,4.068579
5,No log,4.054615


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_2023080710/checkpoint-2
Configuration saved in data/models_2023080710/checkpoint-2/config.json
Model weights saved in data/models_2023080710/checkpoint-2/pytorch_model.bin
tokenizer config file saved in data/models_2023080710/checkpoint-2/tokenizer_config.json
Special tokens file saved in data/models_2023080710/checkpoint-2/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving mo

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 100
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 65
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 100


Epoch,Training Loss,Validation Loss
1,No log,3.527111
2,No log,3.39496
3,No log,3.33216
4,No log,3.308984
5,No log,3.302174


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_20230807100/checkpoint-13
Configuration saved in data/models_20230807100/checkpoint-13/config.json
Model weights saved in data/models_20230807100/checkpoint-13/pytorch_model.bin
tokenizer config file saved in data/models_20230807100/checkpoint-13/tokenizer_config.json
Special tokens file saved in data/models_20230807100/checkpoint-13/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 1000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 625
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 1000


Epoch,Training Loss,Validation Loss
1,No log,3.018964
2,No log,2.971426
3,No log,2.962867
4,No log,2.962022
5,No log,2.961788


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_202308071000/checkpoint-125
Configuration saved in data/models_202308071000/checkpoint-125/config.json
Model weights saved in data/models_202308071000/checkpoint-125/pytorch_model.bin
tokenizer config file saved in data/models_202308071000/checkpoint-125/tokenizer_config.json
Special tokens file saved in data/models_202308071000/checkpoint-125/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batc

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 10000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6250
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 10000


Epoch,Training Loss,Validation Loss
1,No log,2.771644
2,2.884500,2.733512
3,2.884500,2.721585
4,2.649800,2.717716
5,2.543100,2.719046


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_2023080710000/checkpoint-1250
Configuration saved in data/models_2023080710000/checkpoint-1250/config.json
Model weights saved in data/models_2023080710000/checkpoint-1250/pytorch_model.bin
tokenizer config file saved in data/models_2023080710000/checkpoint-1250/tokenizer_config.json
Special tokens file saved in data/models_2023080710000/checkpoint-1250/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 100000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 62500
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 100000


Epoch,Training Loss,Validation Loss
1,2.6163,2.512237
2,2.4888,2.451821
3,2.3927,2.422847
4,2.3525,2.407537
5,2.3159,2.402459


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8
Saving model checkpoint to data/models_20230807100000/checkpoint-12500
Configuration saved in data/models_20230807100000/checkpoint-12500/config.json
Model weights saved in data/models_20230807100000/checkpoint-12500/pytorch_model.bin
tokenizer config file saved in data/models_20230807100000/checkpoint-12500/tokenizer_config.json
Special tokens file saved in data/models_20230807100000/checkpoint-12500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num exa

evaluating on validation set


PyTorch: setting up devices
***** Running training *****
  Num examples = 108000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 67500
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.


evaluating on test set
Number of training data points: 108000


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [44]:
import json
with open('sequential_trainerer_val_agnews_generative.json', 'w+') as f:
    json.dump(sequential_supervision_val_scores, f)

In [19]:
# Constructing Template
# A template can be constructed from the yaml config, but it can also be constructed by directly passing arguments.
from openprompt.prompts import ManualTemplate, SoftTemplate
template_text = '{"placeholder":"text_a"}. The topic of this news article is {"mask"}.'
mytemplate = ManualTemplate(tokenizer=tokenizer, text=template_text)


In [15]:
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm('gpt', str(MODEL_DIRb)+"/checkpoint-1350")

You are using a model of type gpt2 to instantiate a model of type openai-gpt. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at data/models_20230806/checkpoint-1350 were not used when initializing OpenAIGPTLMHeadModel: ['transformer.h.4.attn.masked_bias', 'transformer.ln_f.weight', 'transformer.h.8.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.wte.weight', 'transformer.ln_f.bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.wpe.weight', 'transformer.h.1.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.9.attn.masked_bias']
- This IS expected if you are initializing OpenAIGPTLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertFo

In [16]:
classes = ['World', 'Sports', 'Business', 'Sci/Tech']
id2label_dict = {idx:k for idx, k in enumerate(classes)}
label2id_dict = {v:k for k,v in id2label_dict.items()}


myverbalizer2 = GenerationVerbalizer(
    tokenizer,
    num_classes=4,
    classes=classes,
    label_words={
        "World": ["World", "world"],
        "Sports": ["Sports", "sports", "sport"],
        "Business": ["Business", "business"],
        "Sci/Tech": ["Sci/Tech", "sci/tech", "science",  "technology", "sci tech"],
    }
)

In [19]:
# Constructing Template
# A template can be constructed from the yaml config, but it can also be constructed by directly passing arguments.
from openprompt.prompts import ManualTemplate
template_text = '{"placeholder":"text_a"}. The topic of this news article is {"mask"}.'
mytemplate = ManualTemplate(tokenizer=tokenizer, text=template_text)


define verbalizer and promptmodel

In [20]:
# ## Define the verbalizer
# In classification, you need to define your verbalizer, which is a mapping from logits on the vocabulary to the final label probability. Let's have a look at the verbalizer details:
from openprompt.prompts import SoftVerbalizer
import torch

# for example the verbalizer contains multiple label words in each class
# myverbalizer = SoftVerbalizer(tokenizer, plm, num_classes=4,
#          label_words=["politics", "sports", "business", "technology"])
# or without label words
myverbalizer = SoftVerbalizer(tokenizer, plm, num_classes=4,
                             label_words=["politics", "sports", "business", "technology"])


from openprompt import PromptForClassification

use_cuda = True
prompt_model = PromptForClassification(plm=plm,template=mytemplate, verbalizer=myverbalizer, freeze_plm=True)
if use_cuda:
    prompt_model=  prompt_model.cuda()


define dataloader

In [18]:
from openprompt.data_utils import InputExample
dataset = {}
for split in ['test']:
    dataset[split] = []
    for idx, data in enumerate(raw_dataset[split]):
        input_example = InputExample(text_a = data['text'], label=int(data['label']), guid=idx)
        dataset[split].append(input_example)
print(dataset['test'][0])

{
  "guid": 0,
  "label": 2,
  "meta": {},
  "text_a": "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.",
  "text_b": "",
  "tgt_text": null
}



In [23]:
from openprompt import PromptDataLoader
BATCH_SIZE=8
test_dataloader = PromptDataLoader(dataset=dataset["test"], template=mytemplate, verbalizer=myverbalizer, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=BATCH_SIZE,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")



tokenizing: 7600it [00:13, 562.90it/s]


evaluate

In [28]:

from tqdm import tqdm
def evaluate_on_test(test_dataloader, prompt_model):
    allpreds = []
    alllabels = []
    for step, inputs in tqdm(enumerate(test_dataloader)):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        alllabels.extend(labels.cpu().tolist())
        allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
        #print(alllabels)
        #print(allpreds)
    acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
    print("test:", acc)  # roughly ~0.85
    return acc, alllabels, allpreds

_ = evaluate_on_test(test_dataloader, prompt_model)

950it [06:32,  2.42it/s]

test: 0.25





In [None]:
MODEL_DIR = Path(f"data/models_20230806")


##### push model to hf hub

(https://colab.research.google.com/drive/1U7SX7jNYsNQG5BY1xEQQHu48Pn6Vgnyt?usp=sharing#scrollTo=H5j5YJE2hK58)