Works with:
* bert-base-cased
* bert-large-cased
* roberta-base
* roberta-large

In [1]:
import fire
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

from tqdm.auto import tqdm
from transformers import AdamW, get_scheduler
from transformers import Trainer, TrainingArguments
import evaluate

from peft import LoraConfig, TaskType, get_peft_model

import os
from datetime import date

# Class index <-> NLI label name. The reverse mapping is derived from the
# forward one so the two tables cannot drift apart.
id2label = dict(enumerate(('entailment', 'neutral', 'contradiction')))
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(label2id)

def convertlabels2ids(example):
    """Swap the string label of one example for its integer class id.

    The example is modified in place (its ``'label'`` entry is overwritten
    with the value looked up in the module-level ``label2id`` table) and
    then returned. A label that is missing from ``label2id`` raises
    ``KeyError``.
    """
    label_name = example['label']
    example['label'] = label2id[label_name]
    return example

def build_dataset(tokenizer, num_proc):
    """Load SNLI and return tokenized ``(train, validation)`` splits.

    Rows whose ``label`` is not one of ``0 .. num_labels - 1`` are dropped.
    The remaining premise/hypothesis pairs are tokenized as sentence pairs,
    the ``label`` column is renamed to ``labels`` and the raw text columns
    are removed.

    Args:
        tokenizer: Callable applied to batches of (premise, hypothesis).
        num_proc: Number of worker processes used by ``filter`` and ``map``.

    Returns:
        Tuple ``(train_dataset, validation_dataset)``.
    """
    # Built once instead of once per sample inside the filter predicate.
    valid_labels = set(range(num_labels))

    def tokenize_function(examples):
        return tokenizer(examples['premise'], examples['hypothesis'])

    def has_valid_label(sample):
        return sample['label'] in valid_labels

    dataset = load_dataset("snli")
    # Filter first so rows that would be discarded are never tokenized.
    dataset = dataset.filter(has_valid_label, num_proc=num_proc)
    tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=num_proc)
    tokenized_datasets = tokenized_datasets.rename_column('label', 'labels').remove_columns(['premise', 'hypothesis'])
    train_dataset = tokenized_datasets["train"]
    validation_dataset = tokenized_datasets["validation"]
    return train_dataset, validation_dataset

def compute_metrics(eval_pred):
    """Compute classification accuracy for a ``Trainer`` evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            the logits array or a tuple whose first element is the logits.

    Returns:
        The result of the ``accuracy`` metric's ``compute`` call.
    """
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # Some models hand back several outputs (a tuple) rather than a bare
    # logits array; presumably the logits come first - argmax over the raw
    # tuple would fail or give meaningless class ids.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

def evaluate_test_data(tokenizer, trainer, model_name, num_proc):
    """Evaluate ``trainer`` on each held-out NLI benchmark.

    For every dataset in the table below the configured split is loaded,
    string labels are converted to ids where needed, the sentence pairs are
    tokenized, rows with a label outside ``0 .. num_labels - 1`` are removed
    and ``trainer.evaluate`` is run on the result.

    Returns:
        A list of ``[model_name, dataset_name, eval_accuracy]`` rows, one
        per dataset, in the order of the table.
    """
    # Dataset name -> split to evaluate on.
    split_for_dataset = {
        'snli': "test",
        'multi_nli': "validation_mismatched",
        'sagnikrayc/snli-bt': "test",
        'sagnikrayc/snli-cf-kaushik': "test",
    }
    # These datasets go through convertlabels2ids before filtering.
    string_label_datasets = ('sagnikrayc/snli-bt', 'sagnikrayc/snli-cf-kaushik')

    def encode_pairs(batch):
        return tokenizer(batch['premise'], batch['hypothesis'])

    def label_is_known(row):
        return row['label'] in range(num_labels)

    scores = []
    for name, split in split_for_dataset.items():
        data = load_dataset(name, split=split)
        if name in string_label_datasets:
            data = data.map(convertlabels2ids)
        data = data.map(encode_pairs, batched=True, num_proc=num_proc)
        data = data.filter(label_is_known)
        metrics = trainer.evaluate(data)
        scores.append([model_name, name, metrics['eval_accuracy']])
    return scores

def log_and_save_results(res,
    results_dir = '../res',
    outfile_name = 'snli_model_performances.csv'
):
    """Append accuracy rows to a semicolon-separated results file and print them.

    Args:
        res: Iterable of ``(model_name, dataset, accuracy)`` triples.
        results_dir: Directory holding the results file; created (including
            missing parent directories) when it does not exist.
        outfile_name: Name of the results file inside ``results_dir``.

    A header line is written only when the file does not exist yet; every
    row is stamped with today's date.
    """
    outfile_path = os.path.join(results_dir, outfile_name)

    # makedirs handles nested paths and an already-existing directory;
    # an empty results_dir means "current directory", nothing to create.
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    write_header = not os.path.exists(outfile_path)
    today = date.today()

    # Open the file once for the header and all rows.
    with open(outfile_path, 'a', newline='\n') as f:
        if write_header:
            f.write("date; model_name; dataset; accuracy\n")
        for model_name, dataset_str, accuracy in res:
            f.write(f"{today};{model_name}; {dataset_str}; {accuracy}\n")
            print(f"Accuracy of {model_name} on {dataset_str} dataset: {accuracy}")
    
def main(
    model_checkpoint,
    seed: int=42,
    batch_size: int=128,
    num_train_epochs: int=3,
    num_proc: int=4,
    output_dir: str="../res",
    use_peft: bool = False,
    do_train: bool = True,
    do_eval: bool=True,
    do_log: bool=True,
    mini_train: bool=False,
    save_path: str="/nfs/turbo/umms-vgvinodv/models/finetuned-checkpoints/nlp-gen" 
):
    """Fine-tune a sequence-classification checkpoint on SNLI and evaluate it.

    Args:
        model_checkpoint: Model id or path passed to ``from_pretrained``.
        seed: Seed for the torch CPU and CUDA random generators.
        batch_size: Per-device batch size for training and evaluation.
        num_train_epochs: Number of training epochs.
        num_proc: Worker processes used for dataset preprocessing.
        output_dir: Directory that receives the results file.
        use_peft: Wrap the model in a LoRA adapter before training.
        do_train: Run ``trainer.train()``.
        do_eval: Evaluate on the held-out benchmarks afterwards.
        do_log: Append evaluation results to the results file; when false
            they are printed instead.
        mini_train: Train on the first 10% of the training split only.
        save_path: Parent directory for checkpoints; the run writes to
            ``<save_path>/<model name>-snli``.
    """
    torch.cuda.manual_seed(seed)
    torch.manual_seed(seed)

    # Last path component of the checkpoint id names the run.
    model_name = model_checkpoint.split("/")[-1]
    save_path = f"{save_path}/{model_name}-snli"

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels,
    )

    # Data
    train_dataset, validation_dataset = build_dataset(tokenizer, num_proc)

    if mini_train:
        # Debugging aid: keep only the first tenth of the training split.
        num_samples = int(0.1*len(train_dataset))
        print(f'only train on {num_samples} samples')
        train_dataset = train_dataset.select(range(num_samples))

    # Optional LoRA wrapping
    if use_peft:
        model = get_peft_model(
            model,
            LoraConfig(
                task_type=TaskType.SEQ_CLS,
                r=16,
                lora_alpha=32,
                lora_dropout=0.05,
            ),
        )
        model.print_trainable_parameters()
        model_name = f"PEFT/{model_name}"

    # Trainer setup: evaluate and checkpoint every epoch, keep one
    # checkpoint, and reload the best model (by accuracy) at the end.
    training_args = TrainingArguments(
        output_dir=save_path,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        overwrite_output_dir=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    if do_train:
        trainer.train()

    if not do_eval:
        return

    # Held-out evaluation, then either persist or print the rows.
    res = evaluate_test_data(tokenizer, trainer, model_name, num_proc)
    if do_log:
        log_and_save_results(res, results_dir = output_dir, outfile_name = 'snli_model_performances.csv')
    else:
        print(res)

In [2]:
# Full fine-tuning run (use_peft keeps its default of False) of
# facebook/opt-1.3b with a per-device batch size of 64.
if __name__ == "__main__":
    main(model_checkpoint="facebook/opt-1.3b", batch_size=64)

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-1.3b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## PEFT

In [None]:
# LoRA run (use_peft=True) for bert-base-cased; all other arguments of
# main() keep their defaults.
if __name__ == "__main__":
    main(model_checkpoint="bert-base-cased", use_peft=True)

In [None]:
# LoRA run (use_peft=True) for bert-large-cased; all other arguments of
# main() keep their defaults.
if __name__ == "__main__":
    main(model_checkpoint="bert-large-cased", use_peft=True)

In [None]:
# LoRA run (use_peft=True) for roberta-base; all other arguments of
# main() keep their defaults.
if __name__ == "__main__":
    main(model_checkpoint="roberta-base", use_peft=True)

In [None]:
# LoRA run (use_peft=True) for roberta-large; all other arguments of
# main() keep their defaults.
if __name__ == "__main__":
    main(model_checkpoint="roberta-large", use_peft=True)

In [None]:
# LoRA run (use_peft=True) for facebook/bart-large; all other arguments of
# main() keep their defaults.
if __name__ == "__main__":
    main(model_checkpoint="facebook/bart-large", use_peft=True)

In [None]:
!python run_seqcls_nli.py --model_checkpoint "facebook/bart-base" --batch_size 32

# Testing

In [None]:
import fire
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

from tqdm.auto import tqdm
from transformers import AdamW, get_scheduler
from transformers import Trainer, TrainingArguments
import evaluate

from peft import LoraConfig, TaskType, get_peft_model

import os
from datetime import date

# Class index <-> NLI label name. The reverse mapping is derived from the
# forward one so the two tables cannot drift apart.
id2label = dict(enumerate(('entailment', 'neutral', 'contradiction')))
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(label2id)

def convertlabels2ids(example):
    """Swap the string label of one example for its integer class id.

    The example is modified in place (its ``'label'`` entry is overwritten
    with the value looked up in the module-level ``label2id`` table) and
    then returned. A label that is missing from ``label2id`` raises
    ``KeyError``.
    """
    label_name = example['label']
    example['label'] = label2id[label_name]
    return example

def compute_metrics(eval_pred):
    """Compute classification accuracy for a ``Trainer`` evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            the logits array or a tuple whose first element is the logits.

    Returns:
        The result of the ``accuracy`` metric's ``compute`` call.
    """
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # Some models hand back several outputs (a tuple) rather than a bare
    # logits array; presumably the logits come first - argmax over the raw
    # tuple would fail or give meaningless class ids.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
    

def log_and_save_results(res,
    results_dir = '../res',
    outfile_name = 'snli_model_performances.csv'
):
    """Append accuracy rows to a semicolon-separated results file and print them.

    Args:
        res: Iterable of ``(model_name, dataset, accuracy)`` triples.
        results_dir: Directory holding the results file; created (including
            missing parent directories) when it does not exist.
        outfile_name: Name of the results file inside ``results_dir``.

    A header line is written only when the file does not exist yet; every
    row is stamped with today's date.
    """
    outfile_path = os.path.join(results_dir, outfile_name)

    # makedirs handles nested paths and an already-existing directory;
    # an empty results_dir means "current directory", nothing to create.
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    write_header = not os.path.exists(outfile_path)
    today = date.today()

    # Open the file once for the header and all rows.
    with open(outfile_path, 'a', newline='\n') as f:
        if write_header:
            f.write("date; model_name; dataset; accuracy\n")
        for model_name, dataset_str, accuracy in res:
            f.write(f"{today};{model_name}; {dataset_str}; {accuracy}\n")
            print(f"Accuracy of {model_name} on {dataset_str} dataset: {accuracy}")
    

def main(
    model_checkpoint,
    seed: int=42,
    batch_size: int=128,
    num_train_epochs: int=3,
    num_proc: int=4,
    output_dir: str="../res",
    use_peft: bool = False,
    do_train: bool = True,
    do_eval: bool=True,
    do_log: bool=True,
    mini_train: bool=False,
    save_path: str="/nfs/turbo/umms-vgvinodv/models/finetuned-checkpoints/nlp-gen" 
):
    """Fine-tune a sequence-classification checkpoint on SNLI and evaluate it.

    Args:
        model_checkpoint: Model id or path passed to ``from_pretrained``.
        seed: Seed for the torch CPU and CUDA random generators.
        batch_size: Per-device batch size for training and evaluation.
        num_train_epochs: Number of training epochs.
        num_proc: Worker processes used when tokenizing.
        output_dir: Directory that receives the results file.
        use_peft: Wrap the model in a LoRA adapter before training.
        do_train: Run ``trainer.train()``; skipped when false.
        do_eval: Evaluate on the held-out benchmarks; skipped when false.
        do_log: Append evaluation results to the results file; when false
            they are printed instead.
        mini_train: Train on the first 10% of the training split only.
        save_path: Parent directory for checkpoints; the run writes to
            ``<save_path>/<model name>-snli``.
    """
    torch.cuda.manual_seed(seed)
    torch.manual_seed(seed)
    
    checkpoint = model_checkpoint
    metric_name = "accuracy"
    model_name = checkpoint.split("/")[-1]
    save_path = f"{save_path}/{model_name}-snli"
    
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
        
    # BUILD DATASET
    # Built once instead of once per sample inside the filter predicate.
    valid_labels = set(range(num_labels))

    def tokenize_function(examples):
        return tokenizer(examples['premise'],examples['hypothesis'])

    def has_valid_label(sample):
        return sample['label'] in valid_labels

    dataset = load_dataset("snli")    
    tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=num_proc).filter(has_valid_label)
    
    # Train on smaller data for debugging purposes
    train_dataset = tokenized_datasets["train"]
    if mini_train:
        train_dataset = train_dataset.select(range(int(0.1*len(train_dataset))))
    
    # LOAD PEFT MODEL
    if use_peft:
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=16,
            lora_alpha=32,
            lora_dropout=0.05
        )
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()
        model_name = f"PEFT/{model_name}"
    
    # PREPARE FOR TRAINING
    args = TrainingArguments(
        save_path,
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        save_total_limit=1,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model=metric_name,
        overwrite_output_dir=True,
    )
    
    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    
    # Honour the do_train flag (it was previously accepted but ignored).
    if do_train:
        trainer.train()  
    
    # EVAL PERFORMANCE
    def evaluate_test_data():
        """Return [model_name, dataset, eval_accuracy] rows for each benchmark."""
        test_datasets = ['snli','multi_nli','sagnikrayc/snli-bt','sagnikrayc/snli-cf-kaushik']
        dataset2split = {'snli':"test", 'multi_nli':"validation_mismatched", 'sagnikrayc/snli-bt':"test", 'sagnikrayc/snli-cf-kaushik':"test"}
        res = []

        for dataset_str in test_datasets:
            target_split = dataset2split[dataset_str]
            dataset = load_dataset(dataset_str, split=target_split)
            
            # These two datasets go through convertlabels2ids before filtering.
            if dataset_str in ['sagnikrayc/snli-bt','sagnikrayc/snli-cf-kaushik']:
                dataset = dataset.map(convertlabels2ids) 
            
            tokenized_test_dataset = dataset.map(tokenize_function, batched=True, num_proc=num_proc).filter(has_valid_label)
            
            results = trainer.evaluate(tokenized_test_dataset)
            res.append([model_name, dataset_str,results['eval_accuracy']])
        return res
    
    # Honour the do_eval flag (it was previously accepted but ignored).
    if do_eval:
        res = evaluate_test_data()
    
        # LOG RESULT METRICS
        if do_log:
            log_and_save_results(res, results_dir = output_dir, outfile_name = 'snli_model_performances.csv')
        else:
            # Do not silently discard the results when logging is disabled.
            print(res)

In [None]:
# Quick smoke run: a single epoch of bert-base-cased with mini_train=True,
# i.e. on the first 10% of the training split.
if __name__ == "__main__":
    main(model_checkpoint="bert-base-cased", num_train_epochs=1, mini_train=True)