In [None]:
import torch
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel
)
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm  # For progress bar

# Fetch the validation split of the SST-2 configuration of the GLUE benchmark.
dataset = load_dataset(path="glue", name="sst2", split="validation")

# Registry of the checkpoints to run. Each row is
# (short name, Hugging Face checkpoint id, model type); rows that are
# commented out are currently disabled.
_MODEL_ROWS = [
    ("flan-t5", "google/flan-t5-base", "seq2seq"),
    ("bloom", "bigscience/bloom-560m", "causal"),
    # ("opt", "facebook/opt-1.3b", "causal"),
    # ("llama", "meta-llama/Llama-2-7b-hf", "causal"),
    ("bart", "facebook/bart-base", "seq2seq"),
    # ("blenderbot", "facebook/blenderbot-400M-distill", "seq2seq"),
    ("distilbert", "distilbert-base-uncased", "embedding"),
    ("minilm", "microsoft/MiniLM-L12-H384-uncased", "embedding"),
    ("electra", "google/electra-small-discriminator", "embedding"),
]

# Expand the rows into the dict records the rest of the script reads.
models_info = [
    {"name": short_name, "model_name": checkpoint, "model_type": family}
    for short_name, checkpoint, family in _MODEL_ROWS
]

# Load every configured checkpoint together with its tokenizer.
# The target device is chosen once: GPU when available, otherwise CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Generation-capable model types map to their dedicated Auto class; any other
# type (the embedding models: DistilBERT, MiniLM, Electra) uses AutoModel.
loader_by_type = {
    "seq2seq": AutoModelForSeq2SeqLM,
    "causal": AutoModelForCausalLM,
}

models = []
for info in models_info:
    checkpoint = info["model_name"]
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    loader = loader_by_type.get(info["model_type"], AutoModel)
    model = loader.from_pretrained(checkpoint).to(device, non_blocking=True)

    # Each entry is (tokenizer, model, short name, model type).
    models.append((tokenizer, model, info["name"], info["model_type"]))

# Number of sentences sent through each model per forward/generate call.
BATCH_SIZE = 64

# Tokenizers that do not know their model's input limit report a very large
# placeholder for ``model_max_length``; values at or above this bound are
# treated as "no limit known".
_MAX_LENGTH_SANITY_BOUND = 10 ** 6


def _resolve_max_length(tokenizer, model):
    """Return a usable truncation length for the pair, or None if unknown."""
    limit = getattr(tokenizer, "model_max_length", None)
    if isinstance(limit, int) and 0 < limit < _MAX_LENGTH_SANITY_BOUND:
        return limit
    # Fall back to the model config when the tokenizer has no real limit.
    config = getattr(model, "config", None)
    limit = getattr(config, "max_position_embeddings", None)
    if isinstance(limit, int) and limit > 0:
        return limit
    return None


# Function to generate output from a model in batch
def generate_output_batch(input_texts, tokenizer, model, model_type,
                          max_new_tokens=50, max_length=None):
    """Run one batch of texts through ``model`` and return one result per text.

    Args:
        input_texts: List of input strings forming the batch.
        tokenizer: Tokenizer matching ``model``.
        model: Loaded model; inputs are moved to ``model.device``.
        model_type: ``"seq2seq"`` or ``"causal"`` to generate text; any other
            value is treated as an embedding model.
        max_new_tokens: Upper bound on the number of generated tokens
            (generation models only).
        max_length: Maximum tokenized input length. When None it is taken
            from the tokenizer / model config; if no limit can be determined,
            truncation is disabled instead of being requested without a
            length (which only produces a warning and truncates nothing).

    Returns:
        For ``"seq2seq"`` and ``"causal"``: a list of decoded strings.
        Otherwise: a NumPy array with the hidden state at position 0 (the CLS
        token) for each input, one row per text.
        An empty ``input_texts`` yields an empty list for every model type.

    Note:
        For ``"causal"`` models the tokenizer's pad token is set to its EOS
        token when it has none, since padding a batch requires a pad token.
        NOTE(review): decoder-only ``generate`` returns prompt + continuation,
        so the decoded text presumably still starts with the input sentence;
        confirm whether that is intended.
    """
    if not input_texts:
        return []

    if max_length is None:
        max_length = _resolve_max_length(tokenizer, model)

    is_causal = model_type == "causal"
    if is_causal:
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # Decoder-only models continue from the last position, so padding
        # must go on the left for batched generation. The caller's setting is
        # restored right after tokenization.
        previous_side = tokenizer.padding_side
        tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(
            input_texts,
            return_tensors="pt",
            padding=True,
            truncation=max_length is not None,
            max_length=max_length,
        )
    finally:
        if is_causal:
            tokenizer.padding_side = previous_side
    inputs = inputs.to(model.device)

    with torch.no_grad():
        if model_type in ("seq2seq", "causal"):
            generate_kwargs = {
                "input_ids": inputs["input_ids"],
                # The batch is padded, so the mask is required to keep the
                # model from attending to pad tokens.
                "attention_mask": inputs["attention_mask"],
                # max_new_tokens bounds the output size independently of the
                # input length.
                "max_new_tokens": max_new_tokens,
            }
            if is_causal:
                generate_kwargs["pad_token_id"] = tokenizer.eos_token_id
            outputs = model.generate(**generate_kwargs)
            return tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Embedding-based models
        outputs = model(**inputs)

    # CLS token embedding (position 0) for classification tasks.
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()

# One record per (sentence, model) pair is accumulated here.
output_data = []

# Walk the dataset in consecutive windows of BATCH_SIZE rows.
total_rows = len(dataset)
for start in tqdm(range(0, total_rows, BATCH_SIZE)):
    stop = min(start + BATCH_SIZE, total_rows)
    input_texts = [row["sentence"] for row in dataset.select(range(start, stop))]

    # Run the same batch through every loaded model.
    for tokenizer, model, model_name, model_type in models:
        results = generate_output_batch(input_texts, tokenizer, model, model_type)

        # Embedding vectors are stored as the string form of a Python list;
        # generated text is stored unchanged.
        is_embedding = model_type == "embedding"
        output_data.extend(
            {
                "input_text": sentence,
                "generated_text": str(result.tolist()) if is_embedding else result,
                "model_name": model_name,
            }
            for sentence, result in zip(input_texts, results)
        )

# Collect all records into a table and write it out as CSV.
output_df = pd.DataFrame(output_data)
output_df.to_csv("valid_df.csv", index=False)

print("New dataset saved as 'valid_df.csv'.")
print(output_df.head())


  0%|          | 0/28 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
 43%|████▎     | 12/28 [10:06<13:38, 51.16s/it]