In [1]:
%%capture
!pip install evaluate rouge_score bert_score bleuscore sacrebleu meteor bitsandbytes

In [2]:
import os
os.environ['HF_TOKEN'] = 'HF_TOKEN'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

In [3]:
import evaluate
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from tqdm import tqdm
import numpy as np
import pandas as pd

import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

MODEL_MAP = {
    'qwen': 'Qwen/Qwen2.5-1.5B-Instruct',
    'opt': 'facebook/opt-iml-1.3b',
    'llama': 'meta-llama/Llama-3.2-1B-Instruct'
}

dataset_map = {
    'summarization': ('cnn_dailymail', '3.0.0'),
    'qa': ('squad', None),
    'paraphrase': ('quora', None)
}

def format_prompt(model_key, task, item):
    if task == 'summarization':
        input_text = item['article']
        instruction = "Summarize the following article. Only provide the highlights from the given article."
    elif task == 'qa':
        input_text = f"Context: {item['context']}\nQuestion: {item['question']}"
        instruction = "Answer the question based on the context."
    elif task == 'paraphrase':
        input_text = item['questions']['text'][0]
        instruction = "Paraphrase the following sentence. Only output a similar sentence or question."
    else:
        raise ValueError("Unknown task")

    if model_key == 'qwen':
        return (
            "<|im_start|>system\n"
            f"{instruction}<|im_end|>\n"
            "<|im_start|>user\n"
            f"{input_text}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
    elif model_key == 'llama':
        return (
            "<|start_header_id|>system<|end_header_id|>\n"
            f"{instruction}\n"
            "<|start_header_id|>user<|end_header_id|>\n"
            f"{input_text}\n"
            "<|start_header_id|>assistant<|end_header_id|>\n"
        )
    elif model_key == 'opt':
        return (
            f"Instruction: {instruction}\n"
            f"Input: {input_text}\n"
            "Output: "
        )
    else:
        raise ValueError(f"Unknown model key: {model_key}")

def load_model(model_id, device='cuda'):
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    # model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map=device,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True)
    )
    return tokenizer, model

def generate_output(tokenizer, model, prompt, max_length=256):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=False)

def evaluate_summarization(preds, refs):
    rouge = evaluate.load("rouge")
    results = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    return results['rougeL']

def evaluate_qa(preds, refs):
    rouge = evaluate.load("rouge")
    bertscore = evaluate.load("bertscore")
    rouge_score = rouge.compute(predictions=preds, references=refs)['rougeL']
    bert_score = bertscore.compute(predictions=preds, references=refs, lang="en")['f1']
    return (rouge_score + sum(bert_score)/len(bert_score)) / 2

def evaluate_paraphrase(preds, refs):
    bleu = evaluate.load("sacrebleu")
    meteor = evaluate.load("meteor")
    bleu_score = bleu.compute(predictions=preds, references=[[r] for r in refs])['score']
    meteor_score = meteor.compute(predictions=preds, references=refs)['meteor']
    return (bleu_score + meteor_score) / 2

def run_predictions(model_key, task, n_samples=100, split='validation', finetuned=False):
    dataset_name, config = dataset_map[task]
    dataset = load_dataset(dataset_name, config)[split].select(range(n_samples))
    if not finetuned:
        tokenizer, model = load_model(MODEL_MAP[model_key])
    else:
        model_path = f"{model_key}_{task}"
        tokenizer, model = load_model(model_path)
        print(model_path, 'loaded')
        
    predictions, references = [], []
    for item in tqdm(dataset):
        prompt = format_prompt(model_key, task, item)

        if task == 'summarization':
            reference = item['highlights']
        elif task == 'qa':
            reference = item['answers']['text'][0] if item['answers']['text'] else "No Answer"
        elif task == 'paraphrase':
            reference = item['questions']['text'][1]
        else:
            raise ValueError("Unknown task")

        output = generate_output(tokenizer, model, prompt)
        predictions.append(output.strip())
        references.append(reference.strip())

    return predictions, references

def jaccard_similarity(str1, str2):
    set1, set2 = set(str1.split()), set(str2.split())
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union != 0 else 0.0

def compute_agreement_matrix(task, n_samples=100, split='validation', finetuned=False):
    model_keys = list(MODEL_MAP.keys())
    all_preds = {}

    for key in model_keys:
        model_path = f"{key}_{task}" if finetuned else MODEL_MAP[key]
        preds, _ = run_predictions(key, task, n_samples, split, finetuned)
        all_preds[key] = preds

    matrix = pd.DataFrame(index=model_keys, columns=model_keys, dtype=float)
    for i, m1 in enumerate(model_keys):
        for j, m2 in enumerate(model_keys):
            if i == j:
                matrix.loc[m1, m2] = 1.0
            elif pd.isna(matrix.loc[m1, m2]):
                similarities = [jaccard_similarity(p1, p2) for p1, p2 in zip(all_preds[m1], all_preds[m2])]
                score = round(np.mean(similarities), 4)
                matrix.loc[m1, m2] = score
                matrix.loc[m2, m1] = score
    return matrix

def compute_baseline_scores(task, n_samples=100, split='validation'):
    model_keys = list(MODEL_MAP.keys())
    scores = {}

    for key in model_keys:
        preds, refs = run_predictions(key, task, n_samples, split)
        if task == 'summarization':
            score = evaluate_summarization(preds, refs)
        elif task == 'qa':
            score = evaluate_qa(preds, refs)
        elif task == 'paraphrase':
            score = evaluate_paraphrase(preds, refs)
        else:
            raise ValueError("Unknown task")
        scores[key] = round(score, 4)

    return pd.Series(scores, name=f'{task}_baseline_scores')

def finetune_model_on_eval_split(model_key, task, n_samples=100, output_dir="finetuned_model", split='validation'):
    dataset_name, config = dataset_map[task]
    dataset = load_dataset(dataset_name, config)[split].select(range(n_samples))
    tokenizer, base_model = AutoTokenizer.from_pretrained(MODEL_MAP[model_key], use_fast=True), MODEL_MAP[model_key]
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto"
    )
    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)

    def preprocess(example):
        prompt = format_prompt(model_key, task, example)

        if task == 'summarization':
            reference = example['highlights']
        elif task == 'qa':
            reference = example['answers']['text'][0] if example['answers']['text'] else "No Answer"
        elif task == 'paraphrase':
            reference = example['questions']['text'][1]
        else:
            raise ValueError("Unknown task")

        full_input = prompt + "\n" + reference
        tokenized = tokenizer(full_input, truncation=True, padding="max_length", max_length=512)
        tokenized["labels"] = tokenized["input_ids"]
        return tokenized

    tokenized_dataset = dataset.map(preprocess, batched=False)
    split = tokenized_dataset.train_test_split(test_size=0.2)

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=2,
        num_train_epochs=3,
        eval_strategy="epoch",
        logging_dir=f"{output_dir}/logs",
        logging_steps=1,
        save_strategy="epoch",
        report_to='none',
        fp16=True,
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split['train'],
        eval_dataset=split['test'],
        tokenizer=tokenizer,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
        # compute_metrics=compute_metric
    )

    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

def compute_finetuned_scores(task, n_samples=100, split='validation'):
    model_keys = list(MODEL_MAP.keys())
    scores = {}

    for key in model_keys:
        model_path = f"{key}_{task}"
        if not os.path.exists(model_path):
            print(f"Finetuned model not found at {model_path}, skipping...")
            continue
        print(model_path, 'loaded')

        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            quantization_config=BitsAndBytesConfig(load_in_8bit=True)
        )

        dataset_name, config = dataset_map[task]
        dataset = load_dataset(dataset_name, config)[split].select(range(n_samples))

        predictions, references = [], []

        for item in tqdm(dataset, desc=f"Evaluating {key}_{task}"):
            prompt = format_prompt(key, task, item)

            if task == 'summarization':
                reference = item['highlights']
            elif task == 'qa':
                reference = item['answers']['text'][0] if item['answers']['text'] else "No Answer"
            elif task == 'paraphrase':
                reference = item['questions']['text'][1]
            else:
                raise ValueError("Unknown task")

            output = generate_output(tokenizer, model, prompt)
            predictions.append(output.strip())
            references.append(reference.strip())

        if task == 'summarization':
            score = evaluate_summarization(predictions, references)
        elif task == 'qa':
            score = evaluate_qa(predictions, references)
        elif task == 'paraphrase':
            score = evaluate_paraphrase(predictions, references)
        else:
            raise ValueError("Unknown task")

        scores[key] = round(score, 4)

    return pd.Series(scores, name=f'{task}_finetuned_scores')

def compute_inference_time_per_query(task, n_samples=10, finetuned=True, split='validation'):
    model_keys = list(MODEL_MAP.keys())
    times = {}

    for key in model_keys:
        model_path = f"{key}_{task}" if finetuned else MODEL_MAP[key]
        if finetuned and not os.path.exists(model_path):
            print(f"Finetuned model not found at {model_path}, skipping...")
            continue
        print(model_path, 'loaded')

        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            quantization_config=BitsAndBytesConfig(load_in_8bit=True)
        )

        dataset_name, config = dataset_map[task]
        dataset = load_dataset(dataset_name, config)[split].select(range(n_samples))

        total_time = 0

        for item in tqdm(dataset, desc=f"Measuring inference time: {key}_{task}"):
            prompt = format_prompt(model_key, task, item)

            if task == 'summarization':
                reference = item['highlights']
            elif task == 'qa':
                reference = item['answers']['text'][0] if item['answers']['text'] else "No Answer"
            elif task == 'paraphrase':
                reference = item['questions']['text'][1]
            else:
                raise ValueError("Unknown task")

            start_time = time.perf_counter()
            _ = generate_output(tokenizer, model, prompt)
            end_time = time.perf_counter()

            total_time += (end_time - start_time)

        avg_time = total_time / n_samples
        times[key] = round(avg_time, 4)
        logger.info(f"Avg inference time for {key} ({'finetuned' if finetuned else 'base'}) on {task}: {avg_time:.4f} seconds")

    return pd.Series(times, name=f"{task}_{'finetuned' if finetuned else 'base'}_inference_time")

2025-05-09 20:01:32.402920: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746820892.627656      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746820892.697649      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [15]:
print(compute_baseline_scores('summarization', 100))
print(compute_baseline_scores('qa', 100))
print(compute_baseline_scores('paraphrase', 10, 'train'))

  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 10%|█         | 1/10 [00:12<01:48, 12.05s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 20%|██        | 2/10 [00:18<01:08,  8.60s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 30%|███       | 3/10 [00:30<01:10, 10.11s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 40%|████      | 4/10 [00:42<01:05, 10.85s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 50%|█████     | 5/10 [00:43<00:37,  7.56s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 60%|██████    | 6/10 [00:46<00:24,  6.02s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 70%|███████   | 7/10 [00:47<00:12,  4.23s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 80%|████████  | 8/10 [00:51<00:08,  4.02s/it]Setting `p

qwen     0.8503
opt      1.6967
llama    0.7213
Name: paraphrase_baseline_scores, dtype: float64


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [27]:
finetune_model_on_eval_split('qwen', 'summarization', 100, 'qwen_summarization')
finetune_model_on_eval_split('qwen', 'qa', 100, 'qwen_qa')
finetune_model_on_eval_split('qwen', 'paraphrase', 100, 'qwen_paraphrase', 'train')

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,2.5846,2.609855
2,2.5248,2.577319
3,2.5217,2.568787


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,1.501,1.659271
2,1.6033,1.536844
3,1.6417,1.493082


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,3.3059,3.630843
2,3.0992,3.217761
3,2.8034,3.067207


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [28]:
finetune_model_on_eval_split('opt', 'summarization', 100, 'opt_summarization')
finetune_model_on_eval_split('opt', 'qa', 100, 'opt_qa')
finetune_model_on_eval_split('opt', 'paraphrase', 100, 'opt_paraphrase', 'train')

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,2.6159,2.633111
2,2.4695,2.601025
3,2.5675,2.59109


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,1.9604,1.971917
2,1.9734,1.787838
3,1.8787,1.72098


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,3.185,3.427264
2,3.0873,2.964802
3,2.6147,2.773673


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [29]:
finetune_model_on_eval_split('llama', 'summarization', 100, 'llama_summarization')
finetune_model_on_eval_split('llama', 'qa', 100, 'llama_qa')
finetune_model_on_eval_split('llama', 'paraphrase', 100, 'llama_paraphrase', 'train')

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,3.0193,3.068075
2,2.9016,2.981763
3,2.8678,2.955802


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,2.2516,2.41356
2,2.1931,2.144872
3,2.1659,2.050608


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,4.2432,4.67109
2,3.9037,4.103427
3,3.4986,3.921953


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [None]:
!rm models.zip && zip models.zip -r .

In [33]:
print(compute_finetuned_scores('summarization', 10))
print(compute_finetuned_scores('qa', 10))
print(compute_finetuned_scores('paraphrase', 10, 'train'))

qwen_summarization loaded


Evaluating qwen_summarization: 100%|██████████| 10/10 [01:05<00:00,  6.57s/it]


opt_summarization loaded


Evaluating opt_summarization: 100%|██████████| 10/10 [01:34<00:00,  9.46s/it]


llama_summarization loaded


Evaluating llama_summarization:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_summarization:  10%|█         | 1/10 [00:11<01:45, 11.76s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_summarization:  20%|██        | 2/10 [00:30<02:06, 15.79s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_summarization:  30%|███       | 3/10 [00:42<01:38, 14.07s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_summarization:  40%|████      | 4/10 [00:53<01:16, 12.75s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_summarization:  50%|█████     | 5/10 [01:04<01:01, 12.26s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_summarization:  60%|██████    | 6/10 [01:15<00:47, 11.93s/it]Setting `pad_token_id` t

qwen     0.0628
opt      0.0603
llama    0.0542
Name: summarization_finetuned_scores, dtype: float64
qwen_qa loaded


Evaluating qwen_qa: 100%|██████████| 10/10 [00:08<00:00,  1.15it/s]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


opt_qa loaded


Evaluating opt_qa: 100%|██████████| 10/10 [00:05<00:00,  1.83it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


llama_qa loaded


Evaluating llama_qa:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_qa:  10%|█         | 1/10 [00:00<00:02,  3.21it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_qa:  20%|██        | 2/10 [00:00<00:02,  2.86it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_qa:  30%|███       | 3/10 [00:01<00:03,  1.81it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_qa:  40%|████      | 4/10 [00:01<00:02,  2.31it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_qa:  50%|█████     | 5/10 [00:03<00:03,  1.34it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_qa:  60%|██████    | 6/10 [00:05<00:05,  1.34s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_qa:  70%|██

qwen     0.4154
opt      0.4154
llama    0.4137
Name: qa_finetuned_scores, dtype: float64
qwen_paraphrase loaded


Evaluating qwen_paraphrase: 100%|██████████| 10/10 [00:21<00:00,  2.18s/it]


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


opt_paraphrase loaded


Evaluating opt_paraphrase: 100%|██████████| 10/10 [00:38<00:00,  3.89s/it]
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


llama_paraphrase loaded


Evaluating llama_paraphrase:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_paraphrase:  10%|█         | 1/10 [00:01<00:15,  1.71s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_paraphrase:  20%|██        | 2/10 [00:02<00:10,  1.31s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_paraphrase:  30%|███       | 3/10 [00:03<00:08,  1.18s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_paraphrase:  40%|████      | 4/10 [00:04<00:05,  1.03it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_paraphrase:  50%|█████     | 5/10 [00:05<00:04,  1.12it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating llama_paraphrase:  60%|██████    | 6/10 [00:06<00:03,  1.01it/s]Setting `pad_token_id` to `eos_token_id`:1280

qwen     3.7943
opt      2.9533
llama    3.1807
Name: paraphrase_finetuned_scores, dtype: float64


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [15]:
import time
print(compute_inference_time_per_query('summarization', n_samples=10, finetuned=False))
print(compute_inference_time_per_query('qa', n_samples=10, finetuned=False))
print(compute_inference_time_per_query('paraphrase', n_samples=10, finetuned=False, split='train'))

Qwen/Qwen2.5-1.5B loaded


Measuring inference time: qwen_summarization:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_summarization:  10%|█         | 1/10 [00:00<00:03,  2.63it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_summarization:  20%|██        | 2/10 [00:02<00:13,  1.63s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_summarization:  30%|███       | 3/10 [00:12<00:35,  5.05s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_summarization:  40%|████      | 4/10 [00:12<00:18,  3.15s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_summarization:  50%|█████     | 5/10 [00:47<01:14, 14.87s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inferenc

facebook/opt-1.3b loaded





tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Measuring inference time: opt_summarization: 100%|██████████| 10/10 [00:45<00:00,  4.51s/it]

meta-llama/Llama-3.2-1B-Instruct loaded





tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Measuring inference time: llama_summarization:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_summarization:  10%|█         | 1/10 [00:05<00:49,  5.53s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_summarization:  20%|██        | 2/10 [00:05<00:19,  2.46s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_summarization:  30%|███       | 3/10 [00:05<00:09,  1.39s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_summarization:  40%|████      | 4/10 [00:21<00:42,  7.13s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_summarization:  50%|█████     | 5/10 [00:22<00:24,  4.84s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring in

qwen     15.5970
opt       4.5124
llama     4.5618
Name: summarization_base_inference_time, dtype: float64
Qwen/Qwen2.5-1.5B loaded


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Measuring inference time: qwen_qa:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_qa:  10%|█         | 1/10 [00:00<00:04,  2.06it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_qa:  20%|██        | 2/10 [00:00<00:03,  2.06it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_qa:  30%|███       | 3/10 [00:01<00:03,  1.83it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_qa:  40%|████      | 4/10 [00:02<00:03,  1.88it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_qa:  50%|█████     | 5/10 [00:02<00:02,  2.14it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_qa:  60%|██████    | 6/10 [00:26<00:34,  8.55s/it]Set

facebook/opt-1.3b loaded


Measuring inference time: opt_qa: 100%|██████████| 10/10 [03:20<00:00, 20.08s/it]


meta-llama/Llama-3.2-1B-Instruct loaded


Measuring inference time: llama_qa:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_qa:  10%|█         | 1/10 [00:00<00:03,  2.78it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_qa:  20%|██        | 2/10 [00:00<00:02,  2.83it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_qa:  30%|███       | 3/10 [00:01<00:04,  1.51it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_qa:  40%|████      | 4/10 [00:02<00:03,  1.86it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_qa:  50%|█████     | 5/10 [00:05<00:07,  1.46s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_qa:  60%|██████    | 6/10 [00:06<00:05,  1.48s

qwen      5.3988
opt      20.0835
llama     2.7093
Name: qa_base_inference_time, dtype: float64
Qwen/Qwen2.5-1.5B loaded


Measuring inference time: qwen_paraphrase:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_paraphrase:  10%|█         | 1/10 [00:36<05:26, 36.25s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_paraphrase:  20%|██        | 2/10 [00:58<03:45, 28.21s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_paraphrase:  30%|███       | 3/10 [01:34<03:42, 31.78s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_paraphrase:  40%|████      | 4/10 [02:10<03:19, 33.30s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_paraphrase:  50%|█████     | 5/10 [02:45<02:50, 34.02s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Measuring inference time: qwen_parap

facebook/opt-1.3b loaded


Measuring inference time: opt_paraphrase: 100%|██████████| 10/10 [02:45<00:00, 16.55s/it]


meta-llama/Llama-3.2-1B-Instruct loaded


Measuring inference time: llama_paraphrase:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_paraphrase:  10%|█         | 1/10 [00:16<02:27, 16.34s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_paraphrase:  20%|██        | 2/10 [00:32<02:10, 16.31s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_paraphrase:  30%|███       | 3/10 [00:48<01:52, 16.02s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_paraphrase:  40%|████      | 4/10 [01:04<01:36, 16.04s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llama_paraphrase:  50%|█████     | 5/10 [01:07<00:56, 11.30s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Measuring inference time: llam

qwen     23.2030
opt      16.5447
llama    12.6517
Name: paraphrase_base_inference_time, dtype: float64





In [53]:
compute_agreement_matrix('summarization', 10, 'validation', True)

qwen_summarization loaded


100%|██████████| 10/10 [00:52<00:00,  5.28s/it]


opt_summarization loaded


100%|██████████| 10/10 [00:22<00:00,  2.28s/it]


llama_summarization loaded


  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 10%|█         | 1/10 [00:06<00:59,  6.63s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 20%|██        | 2/10 [00:15<01:03,  7.91s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 30%|███       | 3/10 [00:22<00:51,  7.40s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 40%|████      | 4/10 [00:28<00:40,  6.78s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 50%|█████     | 5/10 [00:33<00:31,  6.36s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 60%|██████    | 6/10 [00:41<00:26,  6.72s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 70%|███████   | 7/10 [00:49<00:22,  7.40s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 80%|████████  | 8/10 [00:53<00:12,  6.18s/it]Setting `p

Unnamed: 0,qwen,opt,llama
qwen,1.0,0.9083,0.884
opt,0.9083,1.0,0.8956
llama,0.884,0.8956,1.0


In [54]:
compute_agreement_matrix('qa', 10, 'validation', True)

qwen_qa loaded


100%|██████████| 10/10 [00:04<00:00,  2.39it/s]


opt_qa loaded


100%|██████████| 10/10 [00:02<00:00,  4.43it/s]


llama_qa loaded


  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 10%|█         | 1/10 [00:00<00:01,  6.32it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 20%|██        | 2/10 [00:00<00:01,  5.66it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 30%|███       | 3/10 [00:00<00:02,  2.72it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 40%|████      | 4/10 [00:01<00:02,  2.76it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 50%|█████     | 5/10 [00:01<00:02,  2.18it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 60%|██████    | 6/10 [00:03<00:03,  1.09it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 70%|███████   | 7/10 [00:04<00:02,  1.41it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 80%|████████  | 8/10 [00:04<00:01,  1.70it/s]Setting `p

Unnamed: 0,qwen,opt,llama
qwen,1.0,0.9322,0.9902
opt,0.9322,1.0,0.9269
llama,0.9902,0.9269,1.0


In [5]:
compute_agreement_matrix('paraphrase', 10, 'train')

README.md:   0%|          | 0.00/5.69k [00:00<?, ?B/s]

quora.py:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

The repository for quora contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/quora.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

100%|██████████| 10/10 [00:22<00:00,  2.30s/it]


tokenizer_config.json:   0%|          | 0.00/682 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.63G [00:00<?, ?B/s]


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:01<00:10,  1.14s/it][A
 20%|██        | 2/10 [00:02<00:11,  1.39s/it][A
 30%|███       | 3/10 [00:05<00:14,  2.08s/it][A
 40%|████      | 4/10 [00:06<00:10,  1.68s/it][A
 50%|█████     | 5/10 [00:09<00:10,  2.05s/it][A
 60%|██████    | 6/10 [00:11<00:07,  1.94s/it][A
 70%|███████   | 7/10 [00:12<00:05,  1.92s/it][A
 80%|████████  | 8/10 [00:13<00:03,  1.60s/it][A
 90%|█████████ | 9/10 [00:15<00:01,  1.64s/it][A
100%|██████████| 10/10 [00:16<00:00,  1.68s/it][A


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 10%|█         | 1/10 [00:15<02:23, 15.98s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 20%|██        | 2/10 [00:17<00:59,  7.41s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 30%|███       | 3/10 [00:17<00:30,  4.30s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 40%|████      | 4/10 [00:19<00:19,  3.28s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 50%|█████     | 5/10 [00:20<00:11,  2.32s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 60%|██████    | 6/10 [00:21<00:07,  1.88s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 70%|███████   | 7/10 [00:21<00:04,  1.43s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 80%|████████  | 8/10 [00:22<00:02,  1.29s/it]Setting `p

Unnamed: 0,qwen,opt,llama
qwen,1.0,0.5547,0.5729
opt,0.5547,1.0,0.4916
llama,0.5729,0.4916,1.0


In [32]:
for v in dir():
    exec('del '+ v)
    del v

In [9]:
def collaborative_inference(task, model_idx1, model_idx2, n_samples=100, finetuned=True, split='validation'):
    keys = list(MODEL_MAP.keys())
    m1, m2 = keys[model_idx1], keys[model_idx2]
    path1 = f"{m1}_{task}" if finetuned else MODEL_MAP[m1]
    path2 = f"{m2}_{task}" if finetuned else MODEL_MAP[m2]

    if finetuned and (not os.path.exists(path1) or not os.path.exists(path2)):
        logger.warning("One of the finetuned models is missing.")
        return

    print(path1, path2)

    tokenizer1 = AutoTokenizer.from_pretrained(path1)
    tokenizer1.pad_token = tokenizer1.eos_token
    model1 = AutoModelForCausalLM.from_pretrained(
        path1,
        device_map='auto',
        quantization_config=BitsAndBytesConfig(load_in_8bit=True)
    )

    tokenizer2 = AutoTokenizer.from_pretrained(path2)
    tokenizer2.pad_token = tokenizer2.eos_token
    model2 = AutoModelForCausalLM.from_pretrained(
        path2,
        device_map='auto',
        quantization_config=BitsAndBytesConfig(load_in_8bit=True)
    )

    dataset_name, config = dataset_map[task]
    dataset = load_dataset(dataset_name, config)[split].select(range(n_samples))

    predictions, references = [], []

    for item in tqdm(dataset, desc=f"Collaborative inference: {m1} → {m2} on {task}"):
        prompt = format_prompt(m1, task, item)

        if task == 'summarization':
            reference = item['highlights']
        elif task == 'qa':
            reference = item['answers']['text'][0] if item['answers']['text'] else "No Answer"
        elif task == 'paraphrase':
            reference = item['questions']['text'][1]
        else:
            raise ValueError("Unknown task")

        # Step 1: First model generates output
        initial_output = generate_output(tokenizer1, model1, prompt).strip()
        initial_output[len(prompt):]
        # Step 2: Second model refines based on initial output
        # Format the second prompt using model2's formatting style
        if task == 'summarization':
            collab_item = {'article': f"{item['article']}\nInitial Summary: {initial_output}"}
        elif task == 'qa':
            collab_item = {'context': item['context'], 'question': item['question'] + f"\nInitial Answer: {initial_output}"}
        elif task == 'paraphrase':
            collab_item = {'questions': {'text': [item['questions']['text'][0] + f"\nInitial Paraphrase: {initial_output}", ""]}}

        collaboration_prompt = format_prompt(m2, task, collab_item)
        final_output = generate_output(tokenizer2, model2, collaboration_prompt).strip()
        final_output = final_output[len(collaboration_prompt):]
        print(final_output)
        predictions.append(initial_output.strip())
        references.append(reference.strip())

    # Evaluate
    if task == 'summarization':
        score = evaluate_summarization(predictions, references)
    elif task == 'qa':
        score = evaluate_qa(predictions, references)
    elif task == 'paraphrase':
        score = evaluate_paraphrase(predictions, references)
    else:
        raise ValueError("Unknown task")

    print(f"Collaborative inference score for {m1} → {m2} on {task}: {round(score, 4)}")
    return round(score, 4)

In [25]:
score = collaborative_inference('summarization', model_idx1=1, model_idx2=2, n_samples=10, finetuned=True)
print("Score:", score)

opt_summarization llama_summarization


Collaborative inference: opt → llama on summarization:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  10%|█         | 1/10 [00:27<04:05, 27.27s/it]

|end_header_id|>
Here are the highlights from the article:

* Zully Broussard selflessly gave one of her kidneys to a stranger, and it resulted in six patients receiving transplants.
* The California Pacific Medical Center is using a computer program to match donors and recipients, taking it from a simple swapping principle to a much higher level.
* The chain of surgeries is taking five surgeons, a covey of physician assistants, nurses, and anesthesiologists, and more than 40 support staff to perform the surgeries.
* The chain of surgeries is to be wrapped up on Friday, with the last donor giving a kidney to someone who has been biding time on a deceased donor list to complete the chain.
* The process of matching donors and recipients is taking about three to four months, compared to the three weeks it took in the past.
* The computer program, created by David Jacobs, has the potential to open up possibilities for pairing compatible donors and recipients.
* The significance of the altr

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  20%|██        | 2/10 [00:55<03:40, 27.57s/it]

|end_header_id|>
Here are the highlights from the article:

* The MLS is set to mark the beginning of its 20th season on Saturday.
* The league has grown from 10 teams in 1996 to 20 in 2015.
* The league is set to add four new teams in 2020.
* The new season is the first of a new domestic TV and media rights deal with FOX, ESPN, and Univision worth $700 million over eight years.
* The salary cap restricts the amount teams can spend on playing squads, with each team having a number of spaces that can be allocated to "off budget" signings.
* The league has seen significant growth in attendance and player development, with average attendances increasing from 31,683 in 1996 to 60,000 in 2019.
* The league has attracted a large following in the US, with World Cup winners Kaka and David Villa representing the league.
* The league has seen a significant increase in revenue, with the new season's domestic TV and media rights deal worth $700 million over eight years.
* The league has faced crit

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  30%|███       | 3/10 [01:10<02:35, 22.19s/it]

|end_header_id|>
Here are the highlights from the article:

* French striker Bafetimbi Gomis collapsed during Swansea's 3-2 loss at Tottenham in the Premier League.
* He was taken to hospital after collapsing in the first half at White Hart Lane.
* He was wearing an oxygen mask during treatment.
* Swansea tweeted that Gomis was "fine" after the match, with manager Garry Monk saying he was "feeling well".
* Gomis had similar fainting spells in France, which prompted his former club Lyon's president to warn of the risks.
* He has scored two league goals for Swansea this season, mostly in a backup role.
* He became the Welsh side's top striker when Wilfried Bony signed with Manchester City in January.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  40%|████      | 4/10 [01:41<02:32, 25.46s/it]

|end_header_id|>
Here are the highlights from the article:

* Rory McIlroy's second shot on the eighth hole of the WGC Cadillac Championship into a lake was a rare moment of frustration.
* McIlroy pulled his second shot into a lake using a 3-iron, which he joked was a 60-70 yard shot.
* He composed himself to finish the round with a second round of 70, leaving him one-under for the tournament and eight shots off the pace set by leader JB Holmes.
* McIlroy's frustration with elements of his game was still clear, as he said "I think every golfer feels it because I don't hit shots like the one I hit on 8 on the range."
* McIlroy's performance was an improvement on last week's performance at the Honda Classic event, where he failed to make the cut.
* Ryan Holmes scored a two-under-par 71 to remain in second position overall, two shots behind Holmes.
* Former world No 1. Adam Scott carded an impressive 68 to finish the day three shots off the pace at six-under.
* Bubba Watson and Henrik Ste

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  50%|█████     | 5/10 [02:00<01:56, 23.36s/it]

nd_header_id|>
Here are the highlights from the article:

* An 8th-grade student, Cayman Naib, has gone missing in Pennsylvania.
* He was last seen wearing a gray down winter jacket, black ski pants, and hiking boots.
* His parents, Farid and Becky Naib, are searching for him and have set up a Facebook group, "Find Cayman".
* Hundreds of volunteers have helped search for Cayman, including passing out fliers and canvassing areas.
* Weather has limited search efforts, with rain and snow hindering efforts on Wednesday and Thursday.
* Cayman's phone was out of power when he left school, and his friends have not been able to reach him.
* The Naib family has posted on social media, saying they are worried about Cayman's safety and are saying "Cayman, if you read this please know that you are forgiven for everything, and I mean everything, you have the ultimate free pass. Just come home, we are so worried about you".
* The search efforts will continue, with advanced tracking software and the 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  60%|██████    | 6/10 [02:46<02:03, 30.80s/it]

nd_header_id|>
My vote for Father of the Year goes to Curt Schilling.
The former Major League Baseball pitcher recently fired off a series of fastballs and mowed down a group of Twitter trolls who made the mistake of tweeting vulgar and sexually-explicit comments about Schilling's teenage daughter.
The drama started, innocently enough, on February 25, when Schilling played the role of a proud father.
He sent a tweet congratulating his daughter, Gabby, on being accepted to Salve Regina University, where she'll play softball.
It read: "Congrats to Gabby Schilling who will pitch for the Salve Regina Seahawks next year!! — Curt Schilling (@gehrig38)"
Almost immediately, responses came in from young men, complete strangers who apparently followed Schilling on Twitter.
The tweets quickly went from immature, to creepy, to repugnant.
Threats of rape were common.
The tweets were deleted, and the accounts were closed after this story went viral.
But not before Schilling captured some of the imag

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  70%|███████   | 7/10 [03:09<01:25, 28.47s/it]

|end_header_id|>
Here are the highlights of the article:

* Two American women, aged 21 and 25, have been arrested for carving their initials into a wall at the Colosseum in Rome.
* The women, from California, were spotted by fellow tourists, who then told security about the act.
* The two letters "J" and "N" were scratched on a brick wall at the historic Roman amphitheater.
* The women may face a fine for "aggravated damage" on a building of historical and artistic interest.
* If one Russian's experience is anything to go by, the price won't be cheap.
* The incident is not the first time that tourists have been caught carving graffiti on Rome's Colosseum.
* Last November, a Russian tourist was fined and given a four-month suspended sentence for carving his name into the landmark.
* The incident is also not the first time that tourists have been caught carving graffiti on other World Heritage Sites, including Machu Picchu in Peru and Angkor Archeological Park in Cambodia.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  80%|████████  | 8/10 [03:30<00:52, 26.00s/it]

nd_header_id|>
Here are the highlights from the article:

* Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the US for the first time.
* The tour will feature Prince, 3rdEyeGirl drummer Hannah Welton, and will be held in Louisville, Kentucky.
* Tickets will go on sale on Monday, March 9 at 10 a.m. local time.
* The tour will be the first time Prince has toured in the US since 2014.
* The concert venues will be revealed via Twitter prior to each show.
* A portion of the ticket sales will be donated to various Louisville charities.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  90%|█████████ | 9/10 [03:53<00:25, 25.01s/it]

|end_header_id|>
Here are the highlights of the article:

* A shooting at a bar in Mali killed 5 people, including 1 French and 1 Belgian citizen, and injured 8 others.
* Authorities called the shooting a "criminal and terrorist act" and attributed it to al-Murabitun, a North African jihadist group.
* The group claimed responsibility for the attack in an audio message, stating it was in retaliation for the killing of one of its leaders.
* The Malian government said it is committed to seeking peace and will not be intimidated by extremist groups.
* A power struggle in northern Mali led to the takeover of the region by Tuareg fighters, who later turned to Islamist radicals.
* Malian forces have battled various rebel factions, mostly in the northern region, with the help of French and African forces.
* The attack is the latest in a series of violent attacks in Mali, which plunged the country into chaos after soldiers staged a coup three years ago.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization: 100%|██████████| 10/10 [04:15<00:00, 25.53s/it]

d_header_id|>
Here are the highlights from the article:

* Manchester United defender Jonny Evans and Newcastle United striker Papiss Cisse have been charged by the Football Association for allegedly spitting during an altercation in a Premier League game.
* The incident occurred in the 38th minute of the game at St James' Park.
* The players have until 6pm GMT on Friday to respond to the charge.
* The charge is related to an alleged breach of FA Rule E1[a], which states that players must not spit at each other.
* Both Evans and Cisse released statements after the incident, with Evans saying he did not spit at Cisse and Cisse saying he reacted to something unpleasant.
* Former Liverpool midfielder Dietmar Hamann described the incident as "disgusting" and former Manchester United midfielder Paul Scholes said Jonny Evans is not a spitting player.
* Ex-Liverpool player Steve McManaman said Cisse stands up and spits at Evans' neck, which he finds disgusting.
* The incident has raised conce




Collaborative inference score for opt → llama on summarization: 0.06
Score: 0.06


In [10]:
score = collaborative_inference('summarization', model_idx1=1, model_idx2=0, n_samples=10, finetuned=True)
print("Score:", score)

opt_summarization qwen_summarization


Collaborative inference: opt → qwen on summarization:  10%|█         | 1/10 [00:22<03:26, 22.93s/it]

Zully Broussard donated one of her kidneys to a stranger, resulting in six other people receiving transplants through a process involving matched donor pairs or chains. This is possible because of the use of genetic profiles from donor-recipient pairs and the creation of a program called MatchGrid developed by a computer programmer named David Jacobs. The success of this process was due to the generosity of Broussard as well as the matchmaking abilities of the computer program.<|im_end|>


Collaborative inference: opt → qwen on summarization:  20%|██        | 2/10 [00:31<01:57, 14.73s/it]

MLS marks 20-year anniversary<|im_end|>


Collaborative inference: opt → qwen on summarization:  30%|███       | 3/10 [00:56<02:15, 19.30s/it]

Highlights:

• French striker Bafetimbi Gomis collapsed during Swansea's 3-2 loss at Tottenham in the Premier League.

• He spent the night in hospital as a precaution.

• Gomis had similar fainting spells in France.

• He was wearing an oxygen mask during the incident.

• Gomis has scored two league goals for Swansea this season.

• He became the Welsh side's top striker when Wilfried Bony signed with Manchester City in January.

• Gomis was taken to the hospital almost exactly three years ago at White Hart Lane, where Fabrice Muamba collapsed after suffering a cardiac arrest.<|im_end|>


Collaborative inference: opt → qwen on summarization:  40%|████      | 4/10 [01:43<03:00, 30.03s/it]

Highlights:
• Rory McIlroy pulled a second shot into a lake during the WGC Cadillac Championship
• The shot was considered a rare moment of frustration by the world's reigning No. 1 player
• McIlroy played the offending shot with a 3-iron instead of a longer club
• He jokingly stated that the club "must have gone a good 60, 70 yards"
• McIlroy finished the round with a second-round score of 70, one under par, placing him one under par for the tournament
• His frustration with certain aspects of his game was evident throughout the tournament
• McIlroy expressed concern about hitting shots that feel different on the course compared to practice sessions
• Ryan Holmes remained in second place after scoring a two-under-par 71
• Adam Scott finished the day three shots off the pace at six-under, finishing with an impressive 68
• Bubba Watson and Henrik Stenson tied for fourth place on four-under, with both players playing strong rounds<|im_end|>


Collaborative inference: opt → qwen on summarization:  50%|█████     | 5/10 [02:04<02:14, 26.90s/it]

Highlights:
•	An 13-year-old boy named Cayman Naib disappeared from a school in Pennsylvania.
•	The boy wore winter clothes when he was last seen but didn’t wear waterproof gear or take his backpack.
•	Parents have set up a Facebook group to help find him.
•	Several hundred people have volunteered to help search for Cayman.
•	Cayman’s school says he was upset about something sent home from school.
•	No one knows what happened to him yet.
•	Authorities are using advanced technology to try to locate him.<|im_end|>


Collaborative inference: opt → qwen on summarization:  60%|██████    | 6/10 [03:11<02:42, 40.52s/it]

wasn't just mean and ugly. It was threatening and scary. As a parent, it's the kind of thing that makes you rethink your opposition to public caning as a logical punishment for such transgressions. These misogynistic cowards may have thought they could hide in the darkness of anonymity, the sort that many have come to expect from social media sites, where you feel free to be a despicable human being because, you think, no one will ever find out who you really are and hold you accountable for your words. If so, they thought wrong. They couldn't hide. They were found out, and they got the throttling they so richly deserved. Thanks to dad. According to Schilling, who made it his mission to track down these cretins and make sure those they associate with know who they really are, two people have already paid a price due to their tweets. One was a student disc jockey at a community college in New Jersey, who was suspended, and the other was a part-time ticket seller for the New York Yankees

Collaborative inference: opt → qwen on summarization:  70%|███████   | 7/10 [03:42<01:52, 37.53s/it]

Highlights:
- Two American women were arrested for carving their initials into a wall with a coin inside Rome's Colosseum.
- The women, aged 21 and 25, were spotted carrying out the act by fellow tourists.
- The two letters -- J and N -- were about eight inches in length and scratched on a brick wall.
- The women, both from California, reportedly snapped a selfie of themselves with their initials before they were arrested.
- The incident comes as a reminder that no world landmark is safe from the salacious urges of tourists -- no matter how sacred it might be to the locals.
- The incident involves other instances of inappropriate tourist behavior, including carvings on famous landmarks around the world.<|im_end|>


Collaborative inference: opt → qwen on summarization:  80%|████████  | 8/10 [04:11<01:09, 34.58s/it]

Highlights:

- Prince and 3rdEyeGirl are bringing their Hit & Run Tour to the US for the first time.

- The first scheduled show is in Louisville, Kentucky, where 3rdEyeGirl drummer Hannah Welton is from.

- Tickets will go on sale on Wednesday, March 9 at 10 am local time.

- Concert venues have not been announced yet.

- The U.K. tour took place in 2014.

- Ticket proceeds will be donated to various charities in Louisville.<|im_end|>


Collaborative inference: opt → qwen on summarization:  90%|█████████ | 9/10 [04:54<00:37, 37.45s/it]

Summary:

On June 25, 2016, a shooting occurred at a bar frequented by expatriates in Mali’s capital city, Bamako. This incident resulted in the deaths of five individuals, including two French nationals and three Malian residents. Additionally, eight others were injured during the attack. 

The police stated that the perpetrator used a semi-automatic rifle and a shotgun to carry out the violence. The attack was identified as a "criminal and terrorist act" by the authorities.

In response to the event, the Malian government emphasized their commitment to maintaining peace and stability within the country. Meanwhile, a North African extremist group known as Al-Murabitun claimed responsibility for the attack via an audiotape, stating it was retaliatory against the death of their leader.

The United States recognized the threat posed by Al-Murabitun, describing them as a newly-formed militant organization operating in northern Mali. 

Additionally, French officials condemned the attack an

Collaborative inference: opt → qwen on summarization: 100%|██████████| 10/10 [05:19<00:00, 31.97s/it]

Highlights:

- Manchester United defender Jonny Evans and Newcastle United striker Papiss Cisse charged by the Football Association.
- Players had spat at each other during Wednesday night's Premier League game at St James' Park.
- Charges involve breaking FA Rule E1[a], which states spitting at another player is not acceptable.
- Players will have until 6 pm GMT on Friday to respond to charges.
- Six-game bans possible if found guilty.
- Evans says he didn't spit at Cisse, while Cisse reacts negatively afterward.
- Former pundits Dietmar Hamann and Paul Scholes express disapproval of the incident.<|im_end|>





Collaborative inference score for opt → qwen on summarization: 0.06
Score: 0.06


In [16]:
score = collaborative_inference('summarization', model_idx1=2, model_idx2=0, n_samples=10, finetuned=True)
print("Score:", score)

llama_summarization qwen_summarization


Collaborative inference: llama → qwen on summarization:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: llama → qwen on summarization:  10%|█         | 1/10 [00:27<04:06, 27.42s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Zully Broussard donated her kidney to a stranger, leading to a chain reaction of six transplants. Her generosity combined with big data processing, using genetic profiles to match donor pairs or chains quickly. This process allowed for more potential matches than previously possible, significantly expanding access to transplant opportunities. The chain of surgeries involves multiple doctors, including five surgeons, a team of support staff, and over 40 individuals working together.<|im_end|>


Collaborative inference: llama → qwen on summarization:  20%|██        | 2/10 [00:49<03:12, 24.11s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


MLS is now considered one of the major sports in America.<|im_end|>


Collaborative inference: llama → qwen on summarization:  30%|███       | 3/10 [01:00<02:08, 18.39s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


User provided summary is already present in the text.<|im_end|>


Collaborative inference: llama → qwen on summarization:  40%|████      | 4/10 [01:17<01:46, 17.73s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Rory McIlroy pulls second shot into lake during WGC Cadillac Championship, leaves 8th hole with one-shot deficit to leader.<|im_end|>


Collaborative inference: llama → qwen on summarization:  50%|█████     | 5/10 [01:39<01:36, 19.36s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The highlight summary of the article can be as follows:

An 8th-grade student named Cayman Naib has gone missing in Pennsylvania. His parents have created a Facebook group to help find him. Volunteers have been helping with searches, while advanced technology is being used to locate him. The parents have asked Cayman to return home and apologize for his actions.<|im_end|>


Collaborative inference: llama → qwen on summarization:  60%|██████    | 6/10 [02:12<01:36, 24.04s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The article discusses the case of Curt Schilling, a former Major League Baseball pitcher, whose daughter Gabby was accepted to Salve Regina University. After Gabby was congratulated on her acceptance, several Twitter users began making derogatory comments about her. Despite the initial deletion of the tweets, Schilling managed to capture some of the images and posted them online. This led to widespread backlash and resulted in the suspension of a student DJ and the firing of a ticket seller. The incident highlights the dangers of cyberbullying and the importance of parental responsibility. Schilling's actions demonstrate his commitment to protecting his daughter and promoting online accountability.<|im_end|>


Collaborative inference: llama → qwen on summarization:  70%|███████   | 7/10 [02:40<01:16, 25.35s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The article discusses various instances of tourists engaging in inappropriate behavior while visiting famous landmarks around the world. It mentions several cases such as the American women carving their initials into a wall at the Colosseum in Rome, a Russian tourist being fined for vandalizing the Colosseum, a Russian porn flick being shot near the Pyramids of Giza and the Sphinx, and nude photos taken at a sacred site in Cambodia.<|im_end|>


Collaborative inference: llama → qwen on summarization:  80%|████████  | 8/10 [03:07<00:51, 25.80s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prince is coming to town! The Rock King (or should we say Princess?) is returning to the United States with his band, 3rdEyeGirl. This marks the first time he'll be touring the country. Their first performance together will take place in Louisville, Kentucky, home of 3rdEyeGirl's drummer, Hannah Welton. Tickets go on sale this Wednesday morning at 10 AM local time. Prince will also release his new album "Rise" this spring, which includes two hit songs. The proceeds from ticket sales will benefit several local charities in Louisville. Check out the official website for more information.<|im_end|>


Collaborative inference: llama → qwen on summarization:  90%|█████████ | 9/10 [03:39<00:27, 27.80s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Summary: In Bamako, Mali, a shooting occurred at a popular bar frequented by expatriates. Five people were killed, including three Malians and two foreigners. The attackers claimed responsibility for the attack, saying they did so as retribution against one of their own members. Al-Murabitun, a newly-formed militant group, is suspected of being behind the attack. The incident took place during a period of political instability in Mali where military coups led to civil wars involving Tuareg groups and Islamist radicals. The attack highlighted ongoing conflicts within the country.<|im_end|>


Collaborative inference: llama → qwen on summarization: 100%|██████████| 10/10 [03:54<00:00, 23.42s/it]

To generate more accurate results, please refine your question.<|im_end|>





Collaborative inference score for llama → qwen on summarization: 0.0537
Score: 0.0537


In [15]:
score = collaborative_inference('summarization', model_idx1=1, model_idx2=2, n_samples=10, finetuned=True)
print("Score:", score)

opt_summarization llama_summarization


Collaborative inference: opt → llama on summarization:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  10%|█         | 1/10 [00:24<03:40, 24.49s/it]

|end_header_id|>
Here are the highlights from the article:

* Zully Broussard selflessly gave one of her kidneys to a stranger, and it paired up with big data to result in six patients receiving transplants.
* Her kidney was removed on Thursday, and went to a recipient who was paired with a donor.
* The chain of surgeries is to be wrapped up on Friday, and includes five surgeons, a covey of physician assistants, nurses, and anesthesiologists, and over 40 support staff.
* The chain of surgeries is taking place because of a computer program called MatchGrid, which matches donors and recipients based on genetic profiles.
* The program has been used to match donors and recipients in previous cases, but the current chain is the largest and most complex yet.
* Broussard's generosity is being recognized as a significant factor in the success of the transplant, and she has been praised for her altruism.
* The medical center plans to hold a reception for all 12 patients on Friday to celebrate t

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  20%|██        | 2/10 [00:52<03:33, 26.70s/it]

|end_header_id|>
Here are the highlights of the article:

* The first ever Major League Soccer (MLS) match was played on April 6, 1996, between San Jose Clash and DC United.
* The match was attended by 31,683 fans at Spartan Stadium in San Jose, California.
* The game was described by ESPN commentator Ty Keough as a "momentous 'birth of a new era for American soccer'".
* The MLS has made significant progress since its inception, with attendance increasing to higher-than-ever levels and the number of teams growing to 20.
* The league has also expanded its domestic TV and media rights deal with FOX, ESPN, and Univision worth $700 million over eight years.
* The new season marks the first of a new domestic TV and media rights deal with FOX, ESPN, and Univision worth $700 million over eight years.
* The league is expected to grow further, with the addition of four new teams in 2020.
* The MLS has also made progress in attracting and retaining top players, with teams like Orlando City Socce

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  30%|███       | 3/10 [01:13<02:47, 23.99s/it]

|end_header_id|>
Here are the highlights from the article:

* French striker Bafetimbi Gomis collapsed during Swansea's 3-2 loss at Tottenham in the Premier League on Wednesday.
* He left the pitch conscious after about five minutes of treatment, wearing an oxygen mask.
* Swansea tweeted that Gomis was "fine", with manager Garry Monk using the same word to describe his condition.
* Gomis spent the night in hospital as a precaution, Swansea said on its website.
* He was treated for low blood pressure, which causes fainting spells.
* Gomis had similar fainting spells in France, prompting his former club Lyon to express worry about his health.
* Swansea ran tests on Gomis before signing him on a free transfer last July.
* Gomis has scored two league goals for Swansea this season, mostly in a backup role.
* He became the Welsh side's top striker when Wilfried Bony signed with Manchester City in January.
* Other footballers, including Fabrice Muamba and Marc-Vivien Foe, did not survive afte

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  40%|████      | 4/10 [01:45<02:42, 27.02s/it]

|end_header_id|>
Here are the highlights from the article:

* Rory McIlroy's second shot into a lake on the eighth hole of the WGC Cadillac Championship was a rare moment of frustration.
* McIlroy felt the need to drop the ball and hit another shot, which he later joked was a "3-iron for the rest of the round".
* He finished the round with a second round of 70, one-under for the tournament and eight shots off the pace set by leader JB Holmes.
* McIlroy's frustration with his game was evident, with him saying "I think every golfer feels it because I don't hit shots like the one I hit on 8 on the range".
* He also mentioned that he gets out on the course and hits shots that he's not seeing when he's in a more relaxed environment.
* McIlroy's performance was an improvement on his last round at the Honda Classic, where he failed to make the cut.
* Ryan Holmes scored a two-under-par 71 to remain in second position overall, two shots behind Holmes.
* Former world No 1. Adam Scott carded an i

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  50%|█████     | 5/10 [02:04<02:00, 24.09s/it]

nd_header_id|>
Here are the highlights from the article:

* An 8th-grade student, Cayman Naib, has gone missing in Pennsylvania.
* He was last seen wearing a gray down winter jacket, black ski pants, and hiking boots.
* His parents, Farid and Becky Naib, are searching for him in the Radnor-Wayne area, 20 miles from Philadelphia.
* They believe Cayman left school upset after receiving an email from school about overdue home work.
* His phone was out of power at the time he left school.
* The parents have posted on a Facebook group called "Find Cayman" and have received help from hundreds of volunteers, including those who have passed out fliers and canvassed areas.
* Weather has limited search efforts, with rain and snow hindering the search on Wednesday and Thursday.
* The search will continue with the use of advanced, geo-spacial tracking software and the deployment of the Civil Air Patrol on Sunday.
* The families have appealed to Cayman to return home, saying they are "so worried ab

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  60%|██████    | 6/10 [02:49<02:04, 31.24s/it]

nd_header_id|>
My vote for Father of the Year goes to Curt Schilling. The former Major League Baseball pitcher recently fired off a series of fastballs and mowed down a group of Twitter trolls who made the mistake of tweeting vulgar and sexually-explicit comments about Schilling's teenage daughter. The drama started, innocently enough, on February 25, when Schilling played the role of a proud father. He sent a tweet congratulating his daughter, Gabby, on being accepted to Salve Regina University, where she'll play softball. It read: "Congrats to Gabby Schilling who will pitch for the Salve Regina Seahawks next year!! — Curt Schilling (@gehrig38)" Almost immediately, responses came in from young men, complete strangers who apparently followed Schilling on Twitter. The tweets quickly went from immature, to creepy, to repugnant. Threats of rape were common. The tweets were deleted, and the accounts were closed after this story went viral. But not before Schilling captured some of the imag

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  70%|███████   | 7/10 [03:15<01:28, 29.63s/it]

|end_header_id|>
Here are the highlights from the article:

* Two American women, aged 21 and 25, were arrested for carving their initials into a wall at Rome's Colosseum.
* The women, from California, were spotted by fellow tourists who told security about the act.
* The two letters, J and N, were about 8 inches in length and scratched on a brick wall.
* The women were arrested and may face a fine for "aggravated damage" on a historical and artistic building.
* This is not the first time tourists have been caught carving graffiti at Rome's Colosseum.
* The incident comes after a 2014 incident where a Russian tourist was caught carving a letter "K" in a section of brickwork.
* The women may also face a fine for "aggravated damage" at Egypt's Pyramids of Giza and the Sphinx.
* The incident is also related to a Russian porn flick that was shot next to the Pyramids of Giza and the Sphinx.
* Cambodia's Angkor Archeological Park has also experienced nudity-related incidents, including a 201

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  80%|████████  | 8/10 [03:35<00:53, 26.59s/it]

nd_header_id|>
Here are the highlights from the article:

* Prince and 3rdEyeGirl are touring the US for the first time.
* The first show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton.
* Tickets will go on sale Monday, March 9 at 10 a.m. local time.
* The show will be a dual-venue event, with Prince crowning dual rock charts.
* A venue has not been announced yet.
* Portions of the ticket sales will be donated to various Louisville charities.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization:  90%|█████████ | 9/10 [03:59<00:25, 25.87s/it]

|end_header_id|>
Here are the highlights from the article:

* A shooting at a bar popular with expatriates in Mali on Saturday killed 5 people, including 1 French and 1 Belgian citizen.
* 1 French citizen, 1 Belgian citizen, and 3 Malians were killed in the attack in Bamako.
* 8 people were wounded in the attack.
* Authorities called the shooting a "criminal and terrorist act".
* The government said Mali remains committed to seeking peace and will not be intimidated by extremist groups.
* A North African jihadist group, al-Murabitun, claimed responsibility for the attack.
* Al-Murabitun is considered a regional competitor to al-Qaeda in the Islamic Maghreb (AQIM).
* The U.S. State Department said al-Murabitun is a "newly-formed" militant group.
* French Foreign Minister Laurent Fabius said the victim from France was 31 years old.
* French President Francois Hollande condemned the attack and U.S. Secretary of State John Kerry expressed condolences to the victims' families.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on summarization: 100%|██████████| 10/10 [04:20<00:00, 26.06s/it]

d_header_id|>
Here are the highlights from the article:

* Manchester United defender Jonny Evans and Newcastle United striker Papiss Cisse have been charged by the Football Association for allegedly spitting during an altercation in a Premier League game.
* The incident occurred in the 38th minute of the game, with both players spitting at each other.
* The players have until 6pm GMT on Friday to respond to the charge.
* If found guilty, both players could face six-game bans.
* The charge is related to an alleged breach of FA Rule E1[a], which deals with spitting at another player.
* Former Liverpool midfielder Dietmar Hamann described the incident as "disgusting" and said that the behaviour towards each other and the referee is deteriorating on a weekly basis.
* Ex-Manchester United midfielder Paul Scholes said he did not believe Evans had deliberately spat at Cisse.
* Former Liverpool player Steve McManaman described the incident as "absolutely disgusting" and said that Cisse stands




Collaborative inference score for opt → llama on summarization: 0.06
Score: 0.06


In [12]:
score = collaborative_inference('qa', model_idx1=0, model_idx2=2, n_samples=10, finetuned=True)
print("Score:", score)

qwen_qa llama_qa


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Collaborative inference: qwen → llama on qa:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on qa:  10%|█         | 1/10 [00:01<00:13,  1.54s/it]

|end_header_id|>
The Denver Broncos represented the AFC at Super Bowl 50.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on qa:  20%|██        | 2/10 [00:03<00:12,  1.60s/it]

|end_header_id|>
Carolina Panthers represented the NFC at Super Bowl 50.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on qa:  30%|███       | 3/10 [00:04<00:10,  1.43s/it]

|end_header_id|>
Levi's Stadium<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on qa:  40%|████      | 4/10 [00:05<00:08,  1.37s/it]

|end_header_id|>
The Denver Broncos won Super Bowl 50.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on qa:  50%|█████     | 5/10 [00:07<00:07,  1.48s/it]

|end_header_id|>
Gold was used to emphasize the 50th anniversary of the Super Bowl.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on qa:  60%|██████    | 6/10 [00:09<00:06,  1.61s/it]

|end_header_id|>
The theme of Super Bowl 50 was the "Golden Anniversary".<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on qa:  70%|███████   | 7/10 [00:11<00:05,  1.82s/it]

|end_header_id|>
February 7, 2016<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on qa:  80%|████████  | 8/10 [00:12<00:03,  1.68s/it]

|end_header_id|>
AFC stands for American Football Conference.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on qa:  90%|█████████ | 9/10 [00:13<00:01,  1.46s/it]

|end_header_id|>
The golden anniversary<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on qa: 100%|██████████| 10/10 [00:15<00:00,  1.51s/it]

|end_header_id|>
AFC stands for American Football Conference.<|eot_id|>





Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collaborative inference score for qwen → llama on qa: 0.4077
Score: 0.4077


In [13]:
score = collaborative_inference('qa', model_idx1=2, model_idx2=0, n_samples=10, finetuned=True)
print("Score:", score)

llama_qa qwen_qa


Collaborative inference: llama → qwen on qa:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: llama → qwen on qa:  10%|█         | 1/10 [00:01<00:13,  1.52s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Denver Broncos<|im_end|>


Collaborative inference: llama → qwen on qa:  20%|██        | 2/10 [00:04<00:17,  2.25s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The Carolina Panthers represented the NFC at Super Bowl 50.<|im_end|>


Collaborative inference: llama → qwen on qa:  30%|███       | 3/10 [00:08<00:21,  3.03s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Levi's Stadium in the San Francisco Bay Area at Santa Clara, California.<|im_end|>


Collaborative inference: llama → qwen on qa:  40%|████      | 4/10 [00:09<00:14,  2.35s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Denver Broncos<|im_end|>


Collaborative inference: llama → qwen on qa:  50%|█████     | 5/10 [00:10<00:09,  1.87s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The gold color<|im_end|>


Collaborative inference: llama → qwen on qa:  60%|██████    | 6/10 [00:20<00:18,  4.51s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Based on the provided context, the theme of Super Bowl 50 was a "golden anniversary". This refers to the significance and importance placed on celebrating the 50th edition of the Super Bowl due to its special status and historical significance in NFL history.<|im_end|>


Collaborative inference: llama → qwen on qa:  70%|███████   | 7/10 [00:22<00:11,  3.78s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


February 7, 2016<|im_end|>


Collaborative inference: llama → qwen on qa:  80%|████████  | 8/10 [00:23<00:05,  2.94s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


American Football Conference<|im_end|>


Collaborative inference: llama → qwen on qa:  90%|█████████ | 9/10 [00:28<00:03,  3.51s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Based on the given context, the theme of Super Bowl 50 was the "golden anniversary".<|im_end|>


Collaborative inference: llama → qwen on qa: 100%|██████████| 10/10 [00:31<00:00,  3.11s/it]

The answer is AFC, which stands for American Football Conference.<|im_end|>



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collaborative inference score for llama → qwen on qa: 0.4027
Score: 0.4027


In [24]:
score = collaborative_inference('qa', model_idx1=1, model_idx2=0, n_samples=10, finetuned=True)
print("Score:", score)

opt_qa qwen_qa


Collaborative inference: opt → qwen on qa:  10%|█         | 1/10 [00:02<00:23,  2.66s/it]

The Denver Broncos represented the AFC at Super Bowl 50.<|im_end|>


Collaborative inference: opt → qwen on qa:  20%|██        | 2/10 [00:04<00:15,  1.98s/it]

Carolina Panthers<|im_end|>


Collaborative inference: opt → qwen on qa:  30%|███       | 3/10 [00:09<00:25,  3.63s/it]

Based on the provided context, Super Bowl 50 took place at Levi's Stadium in the San Francisco Bay Area, specifically at Santa Clara, California.<|im_end|>


Collaborative inference: opt → qwen on qa:  40%|████      | 4/10 [00:11<00:18,  3.05s/it]

The Denver Broncos won Super Bowl 50.<|im_end|>


Collaborative inference: opt → qwen on qa:  50%|█████     | 5/10 [00:12<00:11,  2.29s/it]

gold<|im_end|>


Collaborative inference: opt → qwen on qa:  60%|██████    | 6/10 [00:15<00:10,  2.51s/it]

The theme of Super Bowl 50 was the "golden anniversary."<|im_end|>


Collaborative inference: opt → qwen on qa:  70%|███████   | 7/10 [00:18<00:07,  2.41s/it]

February 7, 2016<|im_end|>


Collaborative inference: opt → qwen on qa:  80%|████████  | 8/10 [00:38<00:16,  8.26s/it]

The AFC stands for American Football Conference.

To arrive at this answer:

1. I first identified the relevant information from the given context. It mentions "American Football Conference" and "National Football Conference."

2. These are the two main divisions or conferences in American football leagues.

3. The question asks specifically about the AFC, which matches perfectly with "American Football Conference."

4. Therefore, I concluded that AFC stands for American Football Conference based on the provided context.

This approach involves identifying key terms within the text and matching them to their corresponding definitions or abbreviations. In this case, "AFC" is directly defined by its description in the text.<|im_end|>


Collaborative inference: opt → qwen on qa:  90%|█████████ | 9/10 [00:41<00:06,  6.61s/it]

The theme of Super Bowl 50 was the "golden anniversary."<|im_end|>


Collaborative inference: opt → qwen on qa: 100%|██████████| 10/10 [00:58<00:00,  5.85s/it]

The acronym AFC stands for American Football Conference. This refers to one of two conferences in Major League Soccer and the NFL. AFC champions are from the American Football Conference.

In the context provided, it is mentioned that Super Bowl 50 was a championship game of the American Football Conference (AFC). The AFC champions were the Denver Broncos, who won against the NFC champion Carolina Panthers with a score of 24-10.

So, when referring to the American Football Conference, you can use the abbreviation AFC.<|im_end|>



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collaborative inference score for opt → qwen on qa: 0.4172
Score: 0.4172


In [14]:
score = collaborative_inference('qa', model_idx1=1, model_idx2=2, n_samples=10, finetuned=True)
print("Score:", score)

opt_qa llama_qa


Collaborative inference: opt → llama on qa:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on qa:  10%|█         | 1/10 [00:01<00:17,  1.97s/it]

|end_header_id|>
The American Football Conference (AFC) champion Denver Broncos represented the AFC at Super Bowl 50.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on qa:  20%|██        | 2/10 [00:03<00:13,  1.69s/it]

|end_header_id|>
The Carolina Panthers were the NFC champions.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on qa:  30%|███       | 3/10 [00:04<00:09,  1.41s/it]

|end_header_id|>
At Santa Clara, California<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on qa:  40%|████      | 4/10 [00:05<00:07,  1.32s/it]

|end_header_id|>
The Denver Broncos won Super Bowl 50.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on qa:  50%|█████     | 5/10 [00:07<00:07,  1.51s/it]

|end_header_id|>
The color used to emphasize the 50th anniversary of the Super Bowl was gold.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on qa:  60%|██████    | 6/10 [00:09<00:06,  1.55s/it]

|end_header_id|>
The theme of Super Bowl 50 was the "golden anniversary".<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on qa:  70%|███████   | 7/10 [00:10<00:04,  1.46s/it]

|end_header_id|>
February 7, 2016<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on qa:  80%|████████  | 8/10 [00:11<00:02,  1.34s/it]

|end_header_id|>
The AFC stands for American Football Conference.<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on qa:  90%|█████████ | 9/10 [00:13<00:01,  1.43s/it]

|end_header_id|>
The theme of Super Bowl 50 was the "golden anniversary".<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on qa: 100%|██████████| 10/10 [00:14<00:00,  1.44s/it]

|end_header_id|>
AFC stands for American Football Conference.<|eot_id|>



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Collaborative inference score for opt → llama on qa: 0.4172
Score: 0.4172


In [19]:
score = collaborative_inference('paraphrase', model_idx1=0, model_idx2=2, n_samples=10, finetuned=True, split='train')
print("Score:", score)

qwen_paraphrase llama_paraphrase


Collaborative inference: qwen → llama on paraphrase:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on paraphrase:  10%|█         | 1/10 [00:03<00:30,  3.41s/it]

|end_header_id|>
<|im_start|>How to start investing in shares of Indian companies?<|im_end|><|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on paraphrase:  20%|██        | 2/10 [00:06<00:24,  3.00s/it]

|end_header_id|>
What is the story of the Kohinoor diamond?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on paraphrase:  30%|███       | 3/10 [00:10<00:25,  3.66s/it]

|end_header_id|>
<|im_start|>How does a VPN impact my internet speed?<|im_end|><|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on paraphrase:  40%|████      | 4/10 [00:15<00:24,  4.01s/it]

|end_header_id|>
How do you feel about yourself? Is there anything that could be changed to make you happier?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on paraphrase:  50%|█████     | 5/10 [00:19<00:20,  4.03s/it]

|end_header_id|>
Which one dissolves quickly in water among sugar, salt, methane and carbon dioxide?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on paraphrase:  60%|██████    | 6/10 [00:26<00:21,  5.26s/it]

|end_header_id|>
I'm a Capricorn with my sun in Capricorn, moon in Virgo, and rising in Capricorn. What does this mean for me?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on paraphrase:  70%|███████   | 7/10 [00:28<00:12,  4.07s/it]

|end_header_id|>
Should I buy Tiago?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on paraphrase:  80%|████████  | 8/10 [00:32<00:07,  3.98s/it]

|end_header_id|>
<|im_start|>What specific steps can I take to develop my skills and knowledge in geology?<|im_end|><|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on paraphrase:  90%|█████████ | 9/10 [00:35<00:03,  3.72s/it]

|end_header_id|>
When should I use シ or し in Japanese?<|im_start|><|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: qwen → llama on paraphrase: 100%|██████████| 10/10 [00:38<00:00,  3.83s/it]

|end_header_id|>
Can I access my Charter Motorola DCX3400?<|eot_id|>





Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


Collaborative inference score for qwen → llama on paraphrase: 1.9313
Score: 1.9313


In [20]:
score = collaborative_inference('paraphrase', model_idx1=2, model_idx2=0, n_samples=10, finetuned=True, split='train')
print("Score:", score)

llama_paraphrase qwen_paraphrase


Collaborative inference: llama → qwen on paraphrase:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: llama → qwen on paraphrase:  10%|█         | 1/10 [00:32<04:54, 32.74s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


across different sectors and companies to reduce risk.

10. **Stay updated with news and events**: Stay informed about economic developments, regulatory changes, and other relevant information affecting the stock market.
11. **Consider using financial advisors**: If necessary, consult a professional financial advisor who can provide guidance and advice tailored to your specific needs.

Remember that investing involves risks, including loss of principal. It's important to do thorough research and seek expert advice before making any investment decisions.<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  20%|██        | 2/10 [00:35<02:01, 15.14s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What is the history of the Kohinoor diamond?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  30%|███       | 3/10 [00:38<01:07,  9.64s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Is it possible to use a VPN and still have fast internet speeds?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  40%|████      | 4/10 [00:40<00:39,  6.66s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


How do you feel about being lonely?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  50%|█████     | 5/10 [00:42<00:24,  4.94s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Which substances dissolve in water quickly?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  60%|██████    | 6/10 [00:48<00:21,  5.40s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Astrology: I'm a Capricorn with a Moon in Cancer and an Ascendant in Virgo... what does this tell you about me?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  70%|███████   | 7/10 [00:50<00:12,  4.30s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Do I need to get tiago?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  80%|████████  | 8/10 [00:53<00:07,  3.84s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What are some ways to become a successful geologist?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  90%|█████████ | 9/10 [00:58<00:04,  4.04s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


When should I use シ instead of し?<|im_end|>


Collaborative inference: llama → qwen on paraphrase: 100%|██████████| 10/10 [01:01<00:00,  6.17s/it]

Can I hack my Charter Motorolla DCX3400?<|im_end|>



[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Collaborative inference score for llama → qwen on paraphrase: 0.984
Score: 0.984


In [21]:
score = collaborative_inference('paraphrase', model_idx1=1, model_idx2=0, n_samples=10, finetuned=True, split='train')
print("Score:", score)

opt_paraphrase qwen_paraphrase


Collaborative inference: opt → qwen on paraphrase:  10%|█         | 1/10 [00:03<00:31,  3.49s/it]

How do I start investing in the stock market?<|im_end|>


Collaborative inference: opt → qwen on paraphrase:  20%|██        | 2/10 [00:07<00:30,  3.78s/it]

What is the history behind the Kohinoor diamond?<|im_end|>


Collaborative inference: opt → qwen on paraphrase:  30%|███       | 3/10 [00:12<00:30,  4.30s/it]

Can you help me improve the speed of my internet connection when using a virtual private network (VPN)?<|im_end|>


Collaborative inference: opt → qwen on paraphrase:  40%|████      | 4/10 [00:16<00:24,  4.05s/it]

How do you feel about yourself? How can I help you?<|im_end|>


Collaborative inference: opt → qwen on paraphrase:  50%|█████     | 5/10 [00:21<00:21,  4.39s/it]

What dissolves quickly in water? Salt, Sugar, Carbon Dioxide, Methane.<|im_end|>


Collaborative inference: opt → qwen on paraphrase:  60%|██████    | 6/10 [00:26<00:19,  4.90s/it]

I am a Capricorn Sun, Capricorn Moon, and Capricorn Rising... what does that mean for me?<|im_end|>


Collaborative inference: opt → qwen on paraphrase:  70%|███████   | 7/10 [00:28<00:11,  3.93s/it]

Would you recommend Tiago?<|im_end|>


Collaborative inference: opt → qwen on paraphrase:  80%|████████  | 8/10 [00:31<00:07,  3.63s/it]

What should I do to become a good geologist?<|im_end|>


Collaborative inference: opt → qwen on paraphrase:  90%|█████████ | 9/10 [00:58<00:10, 10.76s/it]

When do you use シ instead of し?<|im_end|>


Collaborative inference: opt → qwen on paraphrase: 100%|██████████| 10/10 [01:02<00:00,  6.28s/it]

Is it possible to hack my Charter Motorolla DCX3400?<|im_end|>





Collaborative inference score for opt → qwen on paraphrase: 2.534
Score: 2.534


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:
score = collaborative_inference('paraphrase', model_idx1=1, model_idx2=2, n_samples=10, finetuned=True, split='train')
print("Score:", score)

opt_paraphrase llama_paraphrase


Collaborative inference: opt → llama on paraphrase:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on paraphrase:  10%|█         | 1/10 [00:02<00:26,  2.89s/it]

|end_header_id|>
What is the best way to start investing in the Indian stock market?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on paraphrase:  20%|██        | 2/10 [00:06<00:25,  3.16s/it]

|end_header_id|>
What is the story of Kohinoor (Koh-i-Noor) Diamond?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on paraphrase:  30%|███       | 3/10 [00:09<00:21,  3.11s/it]

|end_header_id|>
What can I do to boost the speed of my internet connection while I'm using a VPN?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on paraphrase:  40%|████      | 4/10 [00:11<00:16,  2.71s/it]

|end_header_id|>
Why am I feeling very lonely?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on paraphrase:  50%|█████     | 5/10 [00:14<00:13,  2.71s/it]

|end_header_id|>
Which one dissolves in water quickly?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on paraphrase:  60%|██████    | 6/10 [00:16<00:10,  2.66s/it]

|end_header_id|>
What does the sign of Capricorn say about me?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on paraphrase:  70%|███████   | 7/10 [00:17<00:06,  2.23s/it]

|end_header_id|>
Should I buy tiago?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on paraphrase:  80%|████████  | 8/10 [00:20<00:04,  2.19s/it]

|end_header_id|>
What are the key skills and qualities of a good geologist?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on paraphrase:  90%|█████████ | 9/10 [00:45<00:09,  9.37s/it]

|end_header_id|>
When do you use シ instead of し?<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: opt → llama on paraphrase: 100%|██████████| 10/10 [00:48<00:00,  4.81s/it]

|end_header_id|>
Can I hack my Charter Motorolla DCX3400?<|eot_id|>





Collaborative inference score for opt → llama on paraphrase: 2.534
Score: 2.534


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [23]:
score = collaborative_inference('paraphrase', model_idx1=2, model_idx2=0, n_samples=10, finetuned=True, split='train')
print("Score:", score)

llama_paraphrase qwen_paraphrase


Collaborative inference: llama → qwen on paraphrase:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Collaborative inference: llama → qwen on paraphrase:  10%|█         | 1/10 [00:03<00:31,  3.54s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


How do I start investing in the Indian stock market?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  20%|██        | 2/10 [00:07<00:30,  3.80s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Tell me about the history of Kohinoor (Koh-i-Noor) Diamond?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  30%|███       | 3/10 [00:11<00:26,  3.76s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Is there any way to boost my internet speed when using a virtual private network?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  40%|████      | 4/10 [00:13<00:19,  3.26s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


How do I feel about my mental health?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  50%|█████     | 5/10 [00:15<00:14,  2.83s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Which substance dissolves in water most readily?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  60%|██████    | 6/10 [00:18<00:11,  2.78s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What does this astrology tell you about yourself?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  70%|███████   | 7/10 [00:20<00:07,  2.53s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Is it wise to purchase Tiago?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  80%|████████  | 8/10 [00:23<00:05,  2.53s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


How do I become a good geologist?<|im_end|>


Collaborative inference: llama → qwen on paraphrase:  90%|█████████ | 9/10 [00:26<00:02,  2.80s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


How can I say "I am going to buy" in Japanese?<|im_end|>


Collaborative inference: llama → qwen on paraphrase: 100%|██████████| 10/10 [00:29<00:00,  2.99s/it]

Can you hack your Motorola Charter DCX3400?<|im_end|>





Collaborative inference score for llama → qwen on paraphrase: 1.2639
Score: 1.2639


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
import time

def average_inference_time(task, model_idx1, model_idx2, n_samples=100, finetuned=True, split='validation'):
    keys = list(MODEL_MAP.keys())
    m1, m2 = keys[model_idx1], keys[model_idx2]
    path1 = f"{m1}_{task}" if finetuned else MODEL_MAP[m1]
    path2 = f"{m2}_{task}" if finetuned else MODEL_MAP[m2]

    if finetuned and (not os.path.exists(path1) or not os.path.exists(path2)):
        logger.warning("One of the finetuned models is missing.")
        return

    tokenizer1 = AutoTokenizer.from_pretrained(path1)
    tokenizer1.pad_token = tokenizer1.eos_token
    model1 = AutoModelForCausalLM.from_pretrained(
        path1,
        device_map='auto',
        quantization_config=BitsAndBytesConfig(load_in_8bit=True)
    )

    tokenizer2 = AutoTokenizer.from_pretrained(path2)
    tokenizer2.pad_token = tokenizer2.eos_token
    model2 = AutoModelForCausalLM.from_pretrained(
        path2,
        device_map='auto',
        quantization_config=BitsAndBytesConfig(load_in_8bit=True)
    )

    dataset_name, config = dataset_map[task]
    dataset = load_dataset(dataset_name, config)[split].select(range(n_samples))
    
    times = []

    for item in dataset:
        prompt = format_prompt(m1, task, item)

        if task == 'summarization':
            reference = item['highlights']
        elif task == 'qa':
            reference = item['answers']['text'][0] if item['answers']['text'] else "No Answer"
        elif task == 'paraphrase':
            reference = item['questions']['text'][1]
        else:
            raise ValueError("Unknown task")
            
        inputs = tokenizer1(prompt, return_tensors="pt").to(model1.device)

        start = time.time()
        outputs1 = model1.generate(**inputs, max_new_tokens=50)
        output_text1 = tokenizer1.decode(outputs1[0], skip_special_tokens=True)

        inputs2 = tokenizer2(output_text1, return_tensors="pt").to(model2.device)
        outputs2 = model2.generate(**inputs2, max_new_tokens=50)
        end = time.time()

        times.append(end - start)

    avg_time = np.mean(times)
    logger.info(f"Average inference time per query (task={task}, models={m1}+{m2}): {avg_time:.4f} seconds")
    return avg_time

In [8]:
average_inference_time('summarization', 0, 2, 10)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


10.037115716934204

In [9]:
average_inference_time('summarization', 2, 0, 10)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


11.462097859382629

In [10]:
average_inference_time('qa', 0, 2, 10)

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


2.794150233268738

In [11]:
average_inference_time('qa', 2, 0, 10)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


8.057641077041627

In [13]:
average_inference_time('paraphrase', 0, 2, 10, True, 'train')

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


2.9082411766052245

In [12]:
average_inference_time('paraphrase', 2, 0, 10, True, 'train')

README.md:   0%|          | 0.00/5.69k [00:00<?, ?B/s]

quora.py:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

The repository for quora contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/quora.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


5.968151974678039