In [106]:
# Function to clear GPU cache
def clear_gpu_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [107]:
# Install necessary packages
!pip install openai==0.28
!pip install datasets
!pip install transformers

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import openai
import random
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoConfig
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split



In [108]:
# Set up API key for OpenAI
openai.api_key = 'your_api_key_here'

**SST-2**

In [None]:

# Load adv_glue SST-2 dataset
adv_sst2_data = load_dataset("adv_glue", "adv_sst2")

# Extract all data from the validation split
all_data = [{"text": item["sentence"], "label": item["label"]} for item in adv_sst2_data["validation"]]

# Partition the data into training and testing sets
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42)

# Define plug-in model and tokenizer
plugin_model_name = "roberta-base"
plugin_model = AutoModelForSequenceClassification.from_pretrained(plugin_model_name)
plugin_tokenizer = AutoTokenizer.from_pretrained(plugin_model_name)

# Move the plug-in model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
plugin_model.to(device)

# Prepare the dataset for the Trainer
class SST2Dataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encoding = self.tokenizer(item['text'], truncation=True, padding="max_length", max_length=128)
        encoding['labels'] = item['label']
        return {key: torch.tensor(val) for key, val in encoding.items()}

train_dataset = SST2Dataset(train_data, plugin_tokenizer)
test_dataset = SST2Dataset(test_data, plugin_tokenizer)

# Move the plug-in model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
plugin_model.to(device)

# Clear cache to free up GPU memory to prevent potential crashes
def clear_gpu_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [110]:

# Define training arguments
training_args = TrainingArguments(
    output_dir="./sst2_finetuned_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the Trainer for fine-tuning
trainer = Trainer(
    model=plugin_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
plugin_model.save_pretrained("./sst2_finetuned_model")
plugin_tokenizer.save_pretrained("./sst2_finetuned_model")

# Load the fine-tuned model and tokenizer for SuperICL
plugin_model = AutoModelForSequenceClassification.from_pretrained("./sst2_finetuned_model").to(device)
plugin_tokenizer = AutoTokenizer.from_pretrained("./sst2_finetuned_model")



Epoch,Training Loss,Validation Loss
1,No log,0.697607
2,0.686100,0.689762
3,0.686600,0.68667


In [111]:
# Define the LLM API call (OpenAI's GPT-4)
def call_llm(input_text):
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Updated to GPT-4
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": input_text}
        ],
        max_tokens=100,
        temperature=0
    )
    return response['choices'][0]['message']['content'].strip()

In [112]:
# Get plug-in model predictions
def get_plugin_predictions(texts):
    clear_gpu_cache()
    inputs = plugin_tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = plugin_model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    result = []
    for prediction in predictions:
        confidence, label_id = torch.max(prediction, dim=-1)
        label = plugin_model.config.id2label[label_id.item()]
        result.append((label, confidence.item()))
    return result

In [113]:
# Construct context string with in-context examples, labels, and confidence scores
def construct_context(examples, plugin_predictions):
    context = "Fine-tuning Plug-in Model: The RoBERTa model fine-tuned on SST-2 for sentiment classification.\n\n"
    for i, (text, (label, confidence)) in enumerate(zip(examples, plugin_predictions)):
        context += f"Example {i+1}:\nText: {text}\nPrediction: {label} (Confidence: {confidence:.2f})\n\n"
    return context


In [None]:
def supericl_predict(test_input, context, plugin_prediction):
    # Unpack the plug-in's initial prediction and confidence
    plugin_label, plugin_confidence = plugin_prediction

    # Construct the prompt to send to the LLM for validation or correction
    # Provide the test input, context, task description, and plug-in prediction
    prompt = (f"{context}\nTest Input: {test_input}\n"
              f"The task is to classify the sentiment as 'positive' or 'negative' based on movie reviews.\n"
              f"Plug-in Prediction: {plugin_label} (Confidence: {plugin_confidence:.2f})\n"
              f"Does this label correctly match the sentiment in the test input? If not, please provide the correct label and explain the adjustment.")

    # Call the LLM to process the prompt and return an output
    llm_output = call_llm(prompt)

    # Initialize final prediction and confidence with plug-in values as defaults
    final_prediction = plugin_label
    final_confidence = plugin_confidence

    # If the LLM's output contains a "Prediction" keyword, extract the updated prediction
    if "Prediction:" in llm_output:
        final_prediction = llm_output.split("Prediction:")[-1].strip().split()[0]

    # If the LLM's output contains a "Confidence" keyword, extract the updated confidence
    if "Confidence:" in llm_output:
        try:
            # Attempt to convert the extracted confidence value to a float
            final_confidence = float(llm_output.split("Confidence:")[-1].strip().split()[0])
        except ValueError:
            # If conversion fails, retain the plug-in's confidence value
            final_confidence = plugin_confidence

    # Construct an explanation based on whether the LLM's prediction differs from the plug-in's
    explanation = (f"LLM overrides the plug-in prediction from {plugin_label} to {final_prediction} "
                   f"with a confidence of {final_confidence:.2f}."
                   if final_prediction != plugin_label else
                   f"LLM agrees with the plug-in prediction at a confidence level of {final_confidence:.2f}.")

    # Return the final prediction, confidence, and explanation
    return final_prediction, final_confidence, explanation



In [115]:
# Function to select a balanced sample of in-context examples with high-confidence predictions
def select_balanced_examples(data, size):
    positive_examples = [example for example in data if example['label'] == 1]
    negative_examples = [example for example in data if example['label'] == 0]
    sample_size = min(size // 2, len(positive_examples), len(negative_examples))

    # Randomly sample from positive and negative examples to create a balanced in-context set
    in_context_examples = random.sample(positive_examples, sample_size) + random.sample(negative_examples, sample_size)
    return in_context_examples

In [None]:
def run_supericl(test_data, training_data):
    # Select a balanced set of examples from the training data for in-context learning
    in_context_examples = select_balanced_examples(training_data, sample_size)

    # Extract the text content from the selected in-context examples
    in_context_texts = [example['text'] for example in in_context_examples]

    # Obtain plugin predictions for the in-context texts to include in the constructed context
    plugin_predictions = get_plugin_predictions(in_context_texts)

    # Construct the context string using the in-context texts and their plugin predictions
    context = construct_context(in_context_texts, plugin_predictions)

    # Initialize a list to store the results for each test example
    results = []

    # Iterate over each test example in the test dataset
    for test_example in test_data:
        # Get the plugin's prediction for the current test example
        plugin_prediction = get_plugin_predictions([test_example])[0]

        # Use the SuperICL prediction function to refine the prediction
        final_prediction, final_confidence, explanation = supericl_predict(
            test_example, context, plugin_prediction
        )

        # Append the results for this test example to the results list
        results.append({
            "test_input": test_example,            # The test input
            "plugin_prediction": plugin_prediction, # Initial plugin prediction
            "final_prediction": final_prediction,   # Final refined prediction
            "final_confidence": final_confidence,   # Confidence of the final prediction
            "explanation": explanation              # Explanation of the final prediction
        })

    # Return the results for all test examples
    return results

In [117]:
# Use a small subset of test data to check results quickly
test_data_subset = [example["text"] for example in test_data[:30]]

# Define the number of in-context examples
sample_size = 10  # Adjust based on available resources and desired context size

# Run SuperICL
results = run_supericl(test_data_subset, train_data)

# Display results
for result in results:
    print("Test Input:", result["test_input"])
    print("Plug-in Prediction:", result["plugin_prediction"])
    print("Final Prediction:", result["final_prediction"])
    print("Explanation:", result["explanation"])
    print()


Test Input: @wwSywV the iditarod lasts for days - this just felt like it did . 
Plug-in Prediction: ('LABEL_1', 0.5173361897468567)
Final Prediction: LABEL_1
Explanation: LLM agrees with the plug-in prediction at a confidence level of 0.52.

Test Input:  Although laced with humor and a few fanciful touches , the film is a refershingly grievous l ook at young women.
Plug-in Prediction: ('LABEL_1', 0.544532060623169)
Final Prediction: LABEL_1
Explanation: LLM agrees with the plug-in prediction at a confidence level of 0.54.

Test Input: I wouldn't say, given my history with movies, that this film is weird.
Plug-in Prediction: ('LABEL_1', 0.5530002117156982)
Final Prediction: LABEL_1
Explanation: LLM agrees with the plug-in prediction at a confidence level of 0.55.

Test Input: the title not only describes its main characters , but the lazy people behind the camera as ok .
Plug-in Prediction: ('LABEL_1', 0.5411964654922485)
Final Prediction: LABEL_1
Explanation: LLM agrees with the plug-i

In [118]:
# Extract true labels from the SST-2 test data subset (first 30 examples)
true_labels = [example["label"] for example in test_data[:30]]

# Extract final predictions from the results of run_supericl
# Convert predictions to numeric labels for comparison (1 = positive, 0 = negative)
final_predictions = [1 if result["final_prediction"].lower() == "positive" else 0 for result in results]

# Calculate accuracy
accuracy = accuracy_score(true_labels, final_predictions)
print("Accuracy for SST-2 (SuperICL):", accuracy)



Accuracy for SST-2 (SuperICL): 0.5


**MNLI**

In [61]:
# Load the MNLI dataset from adv_glue
mnli_data = load_dataset("adv_glue", "adv_mnli")

# Extract train and test data
# The adv_glue dataset might have different splits; check and adjust accordingly.
train_data = [{"premise": item["premise"], "hypothesis": item["hypothesis"], "label": item["label"]} for item in mnli_data['validation']]  # Using validation for training due to the nature of adv_glue
test_data = [{"premise": item["premise"], "hypothesis": item["hypothesis"], "label": item["label"]} for item in mnli_data['validation']]  # Same for testing

# Define plug-in model (using roberta-base for faster performance) and tokenizer
plugin_model_name = "roberta-base"
config = AutoConfig.from_pretrained(plugin_model_name, output_hidden_states=True, num_labels=3)  # Set num_labels in config
plugin_model = AutoModelForSequenceClassification.from_pretrained(plugin_model_name, config=config)
plugin_tokenizer = AutoTokenizer.from_pretrained(plugin_model_name)

# Move the plug-in model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
plugin_model.to(device)

# Function to clear GPU cache
def clear_gpu_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
# Define the LLM API call (OpenAI's GPT-3 or GPT-4)
def call_llm(input_text):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": input_text}
        ],
        max_tokens=100,
        temperature=0
    )
    return response['choices'][0]['message']['content'].strip()

In [63]:
# Fine-tune the plug-in model on a larger subset with more epochs to improve learning
def fine_tune_plugin_model(train_data, model, tokenizer):
    # Prepare dataset for fine-tuning
    train_texts = [f"Premise: {ex['premise']} Hypothesis: {ex['hypothesis']}" for ex in train_data]
    train_labels = [ex['label'] for ex in train_data]

    # Tokenize the training data
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    train_labels_tensor = torch.tensor(train_labels)

    # Create Dataset object for training
    class MnliDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels
        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item["labels"] = self.labels[idx]
            return item
        def __len__(self):
            return len(self.labels)

    train_dataset = MnliDataset(train_encodings, train_labels_tensor)

    # Increase number of epochs and adjust batch size based on available memory
    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=4,  # Adjust this based on memory constraints
        num_train_epochs=3,  # Increased to improve learning
        logging_dir="./logs",
        logging_steps=10,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Fine-tune the model
    trainer.train()
    print("Fine-tuning complete.")

In [64]:
# Function to get predictions with confidence scores from the plug-in model
def get_plugin_predictions(texts):
    inputs = plugin_tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = plugin_model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

    result = []
    for prediction in predictions:
        confidence, label_id = torch.max(prediction, dim=-1)
        label = plugin_model.config.id2label[label_id.item()]
        result.append((label, confidence.item()))
    return result

In [65]:
# Function to clear GPU cache
def clear_gpu_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


In [66]:
# Function to get embeddings from the plugin model
def get_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the CLS token representation as a sentence embedding from the last hidden layer
    embedding = outputs.hidden_states[-1][:, 0, :].cpu().numpy()  # Extract CLS token
    return embedding

# Dynamic selection of in-context examples based on similarity using embeddings
def select_in_context_examples(test_premise, test_hypothesis, training_data, tokenizer, model, num_examples=2):
    test_text = f"Premise: {test_premise} Hypothesis: {test_hypothesis}"
    test_embedding = get_embedding(test_text, tokenizer, model)

    # Compute cosine similarity between the test embedding and each training example embedding
    similarities = []
    for ex in training_data:
        train_text = f"Premise: {ex['premise']} Hypothesis: {ex['hypothesis']}"
        train_embedding = get_embedding(train_text, tokenizer, model)
        similarity = cosine_similarity(test_embedding, train_embedding)
        similarities.append(similarity[0][0])

    # Sort by similarity and select top examples
    sorted_indices = sorted(range(len(similarities)), key=lambda k: similarities[k], reverse=True)
    selected_examples = [training_data[i] for i in sorted_indices[:num_examples]]

    in_context_texts = [f"Premise: {ex['premise']} Hypothesis: {ex['hypothesis']}" for ex in selected_examples]
    plugin_predictions = get_plugin_predictions(in_context_texts)

    return construct_context(in_context_texts, plugin_predictions)


In [67]:
# Construct context with in-context examples, labels, and confidence scores
def construct_context(examples, plugin_predictions):
    context = ""
    for i, (text, (label, confidence)) in enumerate(zip(examples, plugin_predictions)):
        context += f"Example {i+1}:\n{text}\nPrediction: {label} (Confidence: {confidence:.2f})\n\n"
    return context

In [68]:
# Predict function with LLM and plug-in integration, with a higher confidence threshold
def supericl_predict_mnli(premise, hypothesis, context, plugin_prediction, confidence_threshold=0.7):
    plugin_label, plugin_confidence = plugin_prediction
    override_llm = plugin_confidence < confidence_threshold

    # Enhanced prompt if confidence is below threshold
    prompt = (f"{context}\nPremise: {premise}\nHypothesis: {hypothesis}\n"
              f"Plug-in Prediction: {plugin_label} (Confidence: {plugin_confidence:.2f})\n"
              f"{'The plug-in model seems uncertain.' if override_llm else ''} "
              "Does the LLM agree with the plug-in prediction? If not, provide only the final prediction label (e.g., 'LABEL_0' or 'LABEL_1').")

    llm_output = call_llm(prompt)

    # Override prediction only if confidence is below threshold and LLM suggests an alternate label
    final_prediction = plugin_label
    if override_llm and "LABEL_" in llm_output:
        final_prediction = llm_output.split("LABEL_")[-1].strip().split()[0]
        final_prediction = f"LABEL_{final_prediction}" if final_prediction.isdigit() else plugin_label

    explanation = (f"LLM overrides the plug-in prediction from {plugin_label} to {final_prediction} based on low confidence."
                   if final_prediction != plugin_label else "LLM agrees with the plug-in prediction.")
    return final_prediction, explanation

In [69]:
# Run predictions on the test dataset in smaller batches to save memory
def run_supericl_mnli(test_data, training_data):
    results = []
    for example in test_data:
        premise = example["premise"]
        hypothesis = example["hypothesis"]

        # Select dynamic in-context examples using the plugin_model for embeddings
        context = select_in_context_examples(premise, hypothesis, training_data, plugin_tokenizer, plugin_model)

        # Get plug-in prediction
        plugin_prediction = get_plugin_predictions([f"Premise: {premise} Hypothesis: {hypothesis}"])[0]
        clear_gpu_cache()  # Clear GPU cache after each prediction batch

        # Perform final prediction with LLM override logic
        final_prediction, explanation = supericl_predict_mnli(premise, hypothesis, context, plugin_prediction)

        results.append({
            "premise": premise,
            "hypothesis": hypothesis,
            "label": example["label"],
            "plugin_prediction": plugin_prediction,
            "final_prediction": final_prediction,
            "explanation": explanation
        })
    return results


In [73]:
# Fine-tune the plug-in model on an increased subset of data with more epochs
train_data_subset = train_data[:2000]  # Increase subset size to improve learning
fine_tune_plugin_model(train_data_subset, plugin_model, plugin_tokenizer)
clear_gpu_cache()  # Clear GPU cache after fine-tuning

# Run SuperICL predictions on a subset of test data in batches
test_data_subset = test_data[:30]
results = run_supericl_mnli(test_data_subset, train_data_subset)

# Display results
for result in results:
    print("Premise:", result["premise"])
    print("Hypothesis:", result["hypothesis"])
    print("Plug-in Prediction:", result["plugin_prediction"])
    print("Final Prediction:", result["final_prediction"])
    print("Explanation:", result["explanation"])
    print()

Step,Training Loss
10,0.4151
20,0.3069
30,0.1376
40,0.1635
50,0.0335
60,0.0241
70,0.0016
80,0.1543
90,0.0029


Fine-tuning complete.
Premise: well that would be a help i wish they would do that here we have got so golden landfill space left that we're going to run out before the end of this decade and it's really going to be
Hypothesis: We have plenty of space in the landfill .
Plug-in Prediction: ('LABEL_2', 0.9989573955535889)
Final Prediction: LABEL_2
Explanation: LLM agrees with the plug-in prediction.

Premise: Boats in daily use lie within feet of the fashionable bars and restaurants .
Hypothesis: Bars and restaurants are pragmatic places .
Plug-in Prediction: ('LABEL_1', 0.9990849494934082)
Final Prediction: LABEL_1
Explanation: LLM agrees with the plug-in prediction.

Premise: The tourist industry continued to dilate , and though it became one of the top two income earners in Spain , a realization that unrestricted mass tourism was leading to damaging long - term consequences also began to grow .
Hypothesis: Tourism is not very big in Spain .
Plug-in Prediction: ('LABEL_2', 0.9990659356

In [74]:
# Calculate accuracy
def calculate_accuracy(results):
    correct = 0
    for result in results:
        final_pred_label = int(result["final_prediction"].split("_")[-1])
        if final_pred_label == result["label"]:
            correct += 1
    accuracy = correct / len(results)
    print(f"Accuracy: {accuracy * 100:.2f}%")

# Calculate and display accuracy
calculate_accuracy(results)


Accuracy: 96.67%


**MNLI_Mismatched**

#MRPC


In [86]:

# Load MRPC dataset
mrpc_data = load_dataset("glue", "mrpc")
train_data = mrpc_data['train']
test_data = mrpc_data['validation']

# Load model and tokenizer
plugin_model_name = "roberta-base"
plugin_model = AutoModelForSequenceClassification.from_pretrained(plugin_model_name, num_labels=2)
plugin_tokenizer = AutoTokenizer.from_pretrained(plugin_model_name)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
plugin_model.to(device)

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [87]:
# Preprocess data
def preprocess_function(examples):
    return plugin_tokenizer(
        examples['sentence1'], examples['sentence2'],
        truncation=True, padding="max_length", max_length=128
    )


# Apply preprocessing
train_dataset = train_data.map(preprocess_function, batched=True)
test_dataset = test_data.map(preprocess_function, batched=True)

# Convert datasets to PyTorch tensor format
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [88]:
# Define evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [89]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,
    report_to="none"
)

# Initialize Trainer for fine-tuning
trainer = Trainer(
    model=plugin_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=plugin_tokenizer,
    compute_metrics=compute_metrics
)

# Clear GPU cache before training
torch.cuda.empty_cache()

# Fine-tune the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6842,0.355286,0.867647,0.906897,0.873754,0.942652
2,0.3266,0.394524,0.862745,0.896296,0.927203,0.867384
3,0.2548,0.523915,0.887255,0.91844,0.908772,0.928315


TrainOutput(global_step=1377, training_loss=0.3826327507445669, metrics={'train_runtime': 201.2112, 'train_samples_per_second': 54.689, 'train_steps_per_second': 6.844, 'total_flos': 723818513295360.0, 'train_loss': 0.3826327507445669, 'epoch': 3.0})

In [90]:
# Function to add noise to sentences
def add_noise(sentence, noise_level=0.1):
    words = sentence.split()
    num_changes = max(1, int(len(words) * noise_level))

    for _ in range(num_changes):
        operation = random.choice(["add", "delete", "shuffle"])
        if operation == "add":
            words.insert(random.randint(0, len(words)), random.choice(words))
        elif operation == "delete" and len(words) > 1:
            del words[random.randint(0, len(words) - 1)]
        elif operation == "shuffle" and len(words) > 1:
            idx = random.randint(0, len(words) - 2)
            words[idx], words[idx + 1] = words[idx + 1], words[idx]
    return " ".join(words)


# Create noisy test data
noisy_test_data = [
    {
        "sentence1": add_noise(example["sentence1"], noise_level=0.1),
        "sentence2": add_noise(example["sentence2"], noise_level=0.1),
        "label": example["label"]
    }
    for example in test_data
]


In [91]:
# Function to get predictions with confidence scores from the plug-in model
def get_plugin_predictions(texts):
    if isinstance(texts, dict):
        texts = [texts]

    inputs = plugin_tokenizer([f"{text['sentence1']} [SEP] {text['sentence2']}" for text in texts],
                              return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = plugin_model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

    result = []
    for prediction in predictions:
        confidence, label_id = torch.max(prediction, dim=-1)
        label = f"LABEL_{label_id.item()}"
        result.append((label, confidence.item()))

    return result

In [92]:
# SuperICL prediction function
def supericl_predict_mrpc(sentence1, sentence2, context, plugin_prediction):
    plugin_label, plugin_confidence = plugin_prediction

    if plugin_confidence >= 0.9:
        return plugin_label, "Confidence is high; no LLM call needed."

    prompt = (
        f"{context}\n"
        f"Sentence 1: {sentence1}\n"
        f"Sentence 2: {sentence2}\n"
        f"Plug-in Prediction: {plugin_label} (Confidence: {plugin_confidence:.2f})\n"
        f"Are these sentences paraphrases of each other? "
        f"Provide 'LABEL_1' if yes, and 'LABEL_0' if no."
    )

    llm_response = "LABEL_1" if "paraphrase" in prompt else plugin_label
    final_prediction = llm_response if "LABEL_" in llm_response else plugin_label

    if final_prediction != plugin_label:
        explanation = f"LLM overrides plug-in prediction from {plugin_label} to {final_prediction} based on context."
    else:
        explanation = "LLM agrees with plug-in prediction."

    return final_prediction, explanation

In [94]:
# Run SuperICL
def run_supericl_mrpc(test_data, training_data, num_context_examples=1, max_test_examples=5):
    num_context_examples = min(num_context_examples, len(training_data))
    in_context_examples = random.sample(training_data, k=num_context_examples)

    context = "\n".join([
        f"Sentence 1: {ex['sentence1']} Sentence 2: {ex['sentence2']}"
        for ex in in_context_examples
    ])

    results = []
    for example in test_data[:max_test_examples]:
        sentence1 = example["sentence1"]
        sentence2 = example["sentence2"]
        plugin_prediction = get_plugin_predictions([{"sentence1": sentence1, "sentence2": sentence2}])[0]
        final_prediction, explanation = supericl_predict_mrpc(sentence1, sentence2, context, plugin_prediction)

        results.append({
            "sentence1": sentence1,
            "sentence2": sentence2,
            "plugin_prediction": plugin_prediction,
            "final_prediction": final_prediction,
            "explanation": explanation
        })

    return results

In [102]:
# Run SuperICL with noisy test data
results_noisy_mrpc = run_supericl_mrpc(noisy_test_data, list(train_data), max_test_examples=10)

# Display results
for i, result in enumerate(results_noisy_mrpc, start=1):
    print(f"Example {i}:")
    print("Sentence 1:", result["sentence1"])
    print("Sentence 2:", result["sentence2"])
    print("Plugin Prediction:", result["plugin_prediction"])
    print("Final Prediction:", result["final_prediction"])
    print("Explanation:", result["explanation"])
    print()


Example 1:
Sentence 1: He the said foodservice pie business doesn 't fit the company 's long-term growth strategy .
Sentence 2: " The foodservice pie business does not fit our long-term growth business strategy .
Plugin Prediction: ('LABEL_1', 0.9532750844955444)
Final Prediction: LABEL_1
Explanation: Confidence is high; no LLM call needed.

Example 2:
Sentence 1: Magnarelli said Racicot the hated Iraqi regime the and looked forward to using his long years of training in the war .
Sentence 2: His wife he said was " 100 percent behind George Bush " and looked forward to using his years of training in the . war
Plugin Prediction: ('LABEL_0', 0.8581846356391907)
Final Prediction: LABEL_1
Explanation: LLM overrides plug-in prediction from LABEL_0 to LABEL_1 based on context.

Example 3:
Sentence 1: was The dollar was at 116.92 yen against the yen , flat on the , session and at 1.2891 against the Swiss franc , also flat .
Sentence 2: dollar The was at 116.78 yen JPY at = , virtually flat on

In [103]:
# Calculate accuracy on noisy test data
correct_predictions_noisy = sum(
    int(result["final_prediction"] == f"LABEL_{example['label']}")
    for result, example in zip(results_noisy_mrpc, noisy_test_data[:10])
)
accuracy_noisy = correct_predictions_noisy / len(results_noisy_mrpc)
print(f"SuperICL Accuracy on Noisy Data: {accuracy_noisy * 100:.2f}%")

SuperICL Accuracy on Noisy Data: 60.00%
