In [None]:
import pickle
from sentence_transformers import SentenceTransformer, util
import fasttext
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import re
import pandas as pd

In [None]:
from datasets import load_dataset

# Inputs for the three tasks. Cached model outputs, similarity scores and these
# rows are all matched purely by row position, so the dataset ids and splits
# here must be the ones the inference cells of this notebook index.
wmt14_dataset = load_dataset('wmt14', 'de-en', split='validation')
cnn_dailymail_dataset = load_dataset('abisee/cnn_dailymail', '2.0.0', split='validation')
# The GSM8K inference cell reads split='train'; loading 'test' here would pair
# every cached output / similarity score with an unrelated question.
gsm8k_dataset = load_dataset('openai/gsm8k', 'main', split='train')

In [None]:
def train_classifier(labelled_dataset):
    """Train a fastText routing classifier from a labelled DataFrame.

    Args:
        labelled_dataset: DataFrame with an 'Input_Text' column (the prompt
            text) and a 'Chosen_Model' column (the label to predict).

    Returns:
        The model returned by `fasttext.train_supervised` after 100 epochs.

    Side effects:
        Overwrites 'fasttext_train.txt' in the current working directory.
    """
    # Write the training file by hand instead of via `to_csv`: CSV quoting
    # wraps any row containing a comma, quote or newline in double quotes, so
    # the line would start with `"__label__...` and embedded newlines would
    # split one sample over several lines. One sample per line is required.
    with open('fasttext_train.txt', 'w', encoding='utf-8') as train_file:
        for chosen_model, input_text in zip(labelled_dataset['Chosen_Model'],
                                            labelled_dataset['Input_Text']):
            # Collapse all whitespace (including newlines) to single spaces.
            flat_text = ' '.join(str(input_text).split())
            train_file.write(f"__label__{chosen_model} {flat_text}\n")

    fasttext_classifier = fasttext.train_supervised('fasttext_train.txt', epoch=100)
    return fasttext_classifier

In [None]:
def evaluate_classifier(classifier, model_outputs, task, idx):
    """Score the classifier-routed model against the 13b model on one sample.

    The classifier picks a model for the sample's input text; that model's
    cached output is compared with the cached 13b output using the task's
    metric (BLEU for "wmt14", ROUGE for "cnn_dailymail", final-number match
    for "gsm8k").

    Args:
        classifier: object whose `predict(text)` returns `(labels, scores)`;
            `labels[0]` must contain "7b", "tiny" or "13b".
        model_outputs: mapping task -> sequence of samples, each a dict with
            keys "input", "label", "7b", "tiny" and "13b". Each model entry is
            a sequence whose first element is the generated text.
        task: "wmt14", "cnn_dailymail" or "gsm8k".
        idx: position of the sample inside `model_outputs[task]`.

    Returns:
        `(score, emissions)`. `score` is the normalized metric difference
        (routed model minus 13b) divided by 3; only the metric belonging to
        `task` is computed, the other two terms stay 0. `emissions` is
        `carbon_emissions[model][task]` for the routed model.

    Raises:
        ValueError: if the predicted label names none of the three models.
    """
    sample = model_outputs[task][idx]
    input_text, label = sample["input"], sample["label"]
    # fastText's predict() handles one line at a time, so flatten any
    # newlines/extra whitespace in the input before classifying.
    predicted_label, _ = classifier.predict(" ".join(input_text.split()))
    large_model_output = sample["13b"]

    # Same substring checks, in the same order, as the label names allow.
    model = next((name for name in ("7b", "tiny", "13b") if name in predicted_label[0]), None)
    if model is None:
        raise ValueError(f"Unrecognised classifier label: {predicted_label[0]!r}")
    # Using `model` as the key keeps the scored output and the charged
    # emissions in sync (a 13b prediction must be charged as 13b, not tiny).
    output = sample[model]

    normalized_bleu_diff = 0
    normalized_rouge_diff = 0
    normalized_acc_diff = 0

    if task == "wmt14":
        bleu_score_classifier = sentence_bleu([label.split()], output[0].split())
        bleu_score_13b = sentence_bleu([label.split()], large_model_output[0].split())
        bleu_diff = bleu_score_classifier - bleu_score_13b
        # 1e-6 floor avoids dividing by zero when both scores are 0.
        normalized_bleu_diff = bleu_diff / max(bleu_score_classifier, bleu_score_13b, 1e-6)

    if task == "cnn_dailymail":
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores_classifier = scorer.score(label, output[0])
        scores_13b = scorer.score(label, large_model_output[0])
        rouge1_diff = scores_classifier['rouge1'].fmeasure - scores_13b['rouge1'].fmeasure
        rouge2_diff = scores_classifier['rouge2'].fmeasure - scores_13b['rouge2'].fmeasure
        rougeL_diff = scores_classifier['rougeL'].fmeasure - scores_13b['rougeL'].fmeasure
        avg_rouge_diff = (rouge1_diff + rouge2_diff + rougeL_diff) / 3
        # The mean of the three differences is normalized by the larger ROUGE-L F-measure.
        normalized_rouge_diff = avg_rouge_diff / max(scores_classifier['rougeL'].fmeasure, scores_13b['rougeL'].fmeasure, 1e-6)

    if task == "gsm8k":
        def extract_final_number(raw_outputs):
            # Drop thousands separators first so "1,000" reads as "1000"
            # rather than yielding "000" as the last integer.
            text = re.sub(r'(?<=\d),(?=\d{3}\b)', '', raw_outputs)
            matches = re.findall(r'\b\d+\b', text)
            return matches[-1] if matches else None

        # The stored label is the dataset's raw answer field. When it carries
        # a "#### <number>" suffix the number after it is the gold answer; a
        # label that is already a bare number passes through unchanged.
        gold_answer = extract_final_number(str(label).split("####")[-1])
        answer = extract_final_number(output[0])
        large_model_answer = extract_final_number(large_model_output[0])

        # `gold_answer is not None` stops two failed extractions (None == None)
        # from being counted as a correct answer.
        classifier_correct = int(gold_answer is not None and answer == gold_answer)
        large_model_correct = int(gold_answer is not None and large_model_answer == gold_answer)

        acc_diff = classifier_correct - large_model_correct
        normalized_acc_diff = acc_diff / max(classifier_correct, large_model_correct, 1e-6)

    emissions = carbon_emissions[model][task]

    return (normalized_bleu_diff + normalized_rouge_diff + normalized_acc_diff) / 3, emissions

In [None]:
def create_new_dataset(similarity_scores, mid):
    """Label every input with the cheapest model that is "close enough" to 13b.

    For each task, row `i` is labelled 'tiny' if the tiny-vs-13b similarity is
    at least `mid`, else '7b' if the 7b-vs-13b similarity is at least `mid`,
    else '13b'. Input texts are read from the module-level `wmt14_dataset`,
    `cnn_dailymail_dataset` and `gsm8k_dataset` by row position.

    Args:
        similarity_scores: mapping task -> {'tiny_13b': scores, '7b_13b': scores},
            both indexable by row position.
        mid: similarity threshold.

    Returns:
        DataFrame with columns 'Input_Text' and 'Chosen_Model', rows ordered
        wmt14, then cnn_dailymail, then gsm8k.
    """
    # Lambdas keep the dataset globals lazily resolved, exactly as a direct
    # reference inside each loop body would be.
    readers = (
        ('wmt14', lambda row: wmt14_dataset[row]['translation']['de']),
        ('cnn_dailymail', lambda row: cnn_dailymail_dataset[row]['article']),
        ('gsm8k', lambda row: gsm8k_dataset[row]['question']),
    )

    texts, routes = [], []
    for task, read_input in readers:
        for row, _ in enumerate(similarity_scores[task]['tiny_13b']):
            text = read_input(row)
            tiny_sim = similarity_scores[task]['tiny_13b'][row]
            mid_sim = similarity_scores[task]['7b_13b'][row]

            # Cheapest qualifying model wins.
            route = 'tiny' if tiny_sim >= mid else ('7b' if mid_sim >= mid else '13b')

            texts.append(text)
            routes.append(route)

    return pd.DataFrame({'Input_Text': texts, 'Chosen_Model': routes})

In [None]:
# Emissions budget used by the threshold search. Each term is 1000x the
# matching per-sample "7b" entry of `carbon_emissions`.
middle_model_carbon = (
    3.569e-2    # wmt14
    + 6.511e-2  # cnn_dailymail
    + 1.067e-1  # gsm8k
)

In [None]:
# Tiny-model counterpart of the budget above. The first two terms are 1000x the
# per-sample "tiny" entries of `carbon_emissions`.
# NOTE(review): the last term (1.616e-1) does not match 1000x the dict's
# "tiny"/"gsm8k" entry (3.616e-4) - one of the two looks like a typo; confirm.
small_model_carbon = (
    9.278e-3    # wmt14
    + 4.141e-2  # cnn_dailymail
    + 1.616e-1  # gsm8k
)

In [None]:
# Show the 7b emissions total that the threshold search uses as its budget.
print(middle_model_carbon)

In [None]:
# Emissions figure per model and task, looked up once per evaluated sample by
# `evaluate_classifier` as carbon_emissions[model][task].
# NOTE(review): units are not stated anywhere in the notebook - confirm.
carbon_emissions = {
    "tiny": {
        "wmt14": 9.278e-6,
        "cnn_dailymail": 4.141e-5,
        "gsm8k": 3.616e-4,
    },
    "7b": {
        "wmt14": 3.569e-5,
        "cnn_dailymail": 6.511e-5,
        "gsm8k": 1.067e-4,
    },
    "13b": {
        "wmt14": 5.059e-5,
        "cnn_dailymail": 1.430e-4,
        "gsm8k": 1.774e-4,
    },
}

In [None]:
def find_optimal_threshold(similarity_scores, model_outputs):
    """Bisection-style search for a similarity threshold in [0, 1].

    At each midpoint a dataset is labelled with `create_new_dataset`, a
    classifier is trained on it, and the classifier is evaluated on the first
    1000 samples of each task. A midpoint is accepted when its absolute summed
    score difference beats the best seen so far AND its summed emissions stay
    below `middle_model_carbon`; the search then moves up, otherwise down.

    Args:
        similarity_scores: passed through to `create_new_dataset`.
        model_outputs: passed through to `evaluate_classifier`.

    Returns:
        The last accepted midpoint, or None if no midpoint was accepted.
    """
    step = 0.01
    lower, upper = 0, 1
    chosen = None
    smallest_gap = float('inf')

    while lower <= upper:
        mid = (lower + upper) / 2
        print(mid)

        router = train_classifier(create_new_dataset(similarity_scores, mid))

        gap_total = 0
        carbon_total = 0
        for task in ("wmt14", "cnn_dailymail", "gsm8k"):
            for sample_idx in range(1000):
                gap, carbon = evaluate_classifier(router, model_outputs, task, sample_idx)
                gap_total += gap
                carbon_total += carbon

        print(carbon_total)
        print(middle_model_carbon)

        accepted = abs(gap_total) < smallest_gap and carbon_total < middle_model_carbon
        if not accepted:
            # Rejected (no improvement or over budget): search lower thresholds.
            upper = mid - step
            continue

        smallest_gap = abs(gap_total)
        chosen = mid
        lower = mid + step

    return chosen

In [None]:
# Load the cached similarity scores and the combined per-sample model outputs,
# then run the threshold search on them.
# NOTE: pickle.load can execute arbitrary code from the file - only load
# pickles produced by a trusted run of this project.
with open("similarity_scores.pkl", "rb") as f:
    similarity_scores = pickle.load(f)
with open("combined_outputs.pkl", "rb") as f:
    model_outputs = pickle.load(f)

# Trains one classifier per probed threshold, so this call is expensive.
optimal_threshold = find_optimal_threshold(similarity_scores, model_outputs)

print(f"Optimal threshold: {optimal_threshold}")

In [None]:
# Manually chosen similarity threshold used by the cells below.
threshold = 0.88

In [None]:
# Label every input at the manually chosen `threshold`, then train the
# routing classifier on those labels.
labelled_dataset = create_new_dataset(similarity_scores, threshold)
fasttext_classifier = train_classifier(labelled_dataset)

In [None]:
# Average sentence-level BLEU on the first 1000 WMT14 samples when each input
# is answered by whichever model `fasttext_classifier` routes it to, plus a
# count of how often each model was chosen.
num_7b = num_tiny = num_13b = 0
bleu_scores_sum = 0

for idx in range(1000):
    sample = model_outputs["wmt14"][idx]
    input_text = sample["input"]
    label = sample["label"]
    predicted_label, confidence_score = fasttext_classifier.predict(input_text)
    routed_label = predicted_label[0]

    if "7b" in routed_label:
        num_7b += 1
        output = sample["7b"]
    elif "tiny" in routed_label:
        num_tiny += 1
        output = sample["tiny"]
    elif "13b" in routed_label:
        num_13b += 1
        output = sample["13b"]

    reference_tokens = label.split()
    bleu_scores_sum += sentence_bleu([reference_tokens], output[0].split())

avg_bleu_wmt14 = bleu_scores_sum / 1000
print(f"Avg Bleu WMT14: {avg_bleu_wmt14}")
# Routing counts, printed in the order 7b, tiny, 13b.
for routed_count in (num_7b, num_tiny, num_13b):
    print(routed_count)

In [None]:
# Single-model baseline: average sentence-level BLEU on the first 1000 WMT14
# samples when every input is answered by `baseline_model`.
# The cell scores the "13b" outputs, so the printed label now says so instead
# of claiming "7b". The `_7b` variable names are kept only so that any cell
# reading them keeps working.
# NOTE(review): if a 7b baseline was intended, set baseline_model = "7b".
baseline_model = "13b"
bleu_scores_sum_7b = 0

for idx in range(1000):
    sample = model_outputs["wmt14"][idx]
    input_text = sample["input"]
    label = sample["label"]
    candidate = sample[baseline_model][0]

    bleu_score = sentence_bleu([label.split()], candidate.split())
    bleu_scores_sum_7b += bleu_score

avg_bleu_wmt14_7b = bleu_scores_sum_7b / 1000
print(f"Avg BLEU {baseline_model} WMT14: {avg_bleu_wmt14_7b}")

In [None]:
# Train and save the classifier for threshold 0.87. Thresholds elsewhere in
# this notebook are searched in [0, 1] and set to .88; passing 87 would label
# every sample '13b' unless its similarity score were at least 87.
new_dataset = create_new_dataset(similarity_scores, 0.87)
classifier = train_classifier(new_dataset)
classifier.save_model('fasttext_classifier87.bin')

In [None]:
def find_optimal_threshold_linear(similarity_scores, model_outputs):
    """Exhaustively scan thresholds 0.00, 0.01, ..., 1.00 and keep the best.

    For each threshold a dataset is labelled with `create_new_dataset`, a
    classifier is trained on it, and the classifier is evaluated on the first
    1000 samples of each task. A threshold becomes the new best when its
    absolute summed score difference is strictly smaller than the best so far
    and its summed emissions are below `middle_model_carbon`.

    Args:
        similarity_scores: passed through to `create_new_dataset`.
        model_outputs: passed through to `evaluate_classifier`.

    Returns:
        The best threshold found, or None if no threshold met the budget.
    """
    low = 0
    high = 1
    epsilon = 0.01
    best_threshold = None
    best_difference = float('inf')
    tasks = ["wmt14", "cnn_dailymail", "gsm8k"]

    # Derive each threshold from an integer step rather than accumulating
    # `threshold += epsilon`: repeated float addition drifts to
    # 1.0000000000000007 after 100 steps, which fails `<= high` and silently
    # skips the final threshold (1.0).
    num_steps = round((high - low) / epsilon)
    for step in range(num_steps + 1):
        threshold = low + step * epsilon

        new_dataset = create_new_dataset(similarity_scores, threshold)
        classifier = train_classifier(new_dataset)

        total_difference = 0
        total_carbon_emissions = 0

        for task in tasks:
            for idx in range(1000):
                difference, emissions = evaluate_classifier(classifier, model_outputs, task, idx)
                total_difference += difference
                total_carbon_emissions += emissions

        if abs(total_difference) < best_difference and total_carbon_emissions < middle_model_carbon:
            best_difference = abs(total_difference)
            best_threshold = threshold

    return best_threshold

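Pre-Inferencing (run this section first: it generates the cached model outputs and writes `combined_outputs.pkl`, which the thresholding cells above load)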

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import torch

In [None]:
# Llama-2 7B chat: tokenizer plus model weights loaded in float16 and moved to "cuda".
# `model_name` is reassigned by each of the following model-loading cells.
model_name = 'meta-llama/Llama-2-7b-chat-hf'
llama7b_tokenizer = AutoTokenizer.from_pretrained(model_name)
llama7b = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")

In [None]:
# TinyLlama 1.1B chat: tokenizer plus model weights loaded in float16 and moved to "cuda".
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
tinyllama_tokenizer = AutoTokenizer.from_pretrained(model_name)
tinyllama = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")

In [None]:
# Quantization settings for the 13B model: 4-bit weights, float16 compute dtype.
llama13_config = BitsAndBytesConfig(load_in_4bit=True,
                                    bnb_4bit_compute_dtype=torch.float16)

In [None]:
# Llama-2 13B chat: loaded with the 4-bit `llama13_config`; device placement is
# left to device_map='auto' instead of an explicit .to("cuda").
model_name = 'meta-llama/Llama-2-13b-chat-hf'

llama13b_tokenizer = AutoTokenizer.from_pretrained(model_name)
llama13b = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', quantization_config=llama13_config)

In [None]:
# WMT14 de-en validation split. The WMT14 inference cells below read source
# sentences from this object by row index.
from datasets import load_dataset
wmt14_dataset = load_dataset('wmt14', 'de-en', split='validation')

In [None]:
def generate_output(model, tokenizer, dataset, current_idx):
    """Translate one WMT14 sentence (German -> English) with `model`.

    NOTE: this notebook defines `generate_output` three times (WMT14,
    CNN/DailyMail, GSM8K); whichever definition cell ran last is the one in
    effect. This is the WMT14 variant.

    Args:
        model: causal LM exposing `generate(...)` and a `device` attribute.
        tokenizer: tokenizer matching `model`.
        dataset: unused - the sentence is always read from the module-level
            `wmt14_dataset`. Kept so existing call sites keep working.
        current_idx: row index into `wmt14_dataset`.

    Returns:
        A one-element list holding the generated text after the answer
        prefix, cut after its first '.' when it contains one.
    """
    outputs = []

    input_text = wmt14_dataset[current_idx]['translation']['de']
    input_prompt = "Translate the sentence from German to English: \n\n" + input_text + "\n\n Write the translation here: "

    # Send the inputs to the model's own device instead of assuming "cuda";
    # the 13b model is placed by device_map='auto'.
    inputs = tokenizer(input_prompt, return_tensors="pt", truncation=True).to(model.device)
    # Pass the attention mask along with the ids so generate() does not have
    # to guess it.
    # NOTE(review): no max_new_tokens is set, so the output length is governed
    # by the model's own generation settings - confirm this is intended.
    output = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # The decoded text echoes the prompt; keep only what follows the prefix.
    answer_prefix = "Write the translation here: "
    if answer_prefix in output_text:
        cleaned_output = output_text.split(answer_prefix)[-1].strip()
    else:
        cleaned_output = output_text.strip()

    # Keep only the first sentence (text up to and including the first '.').
    if '.' in cleaned_output:
        first_sentence = cleaned_output.split('.')[0] + '.'
    else:
        first_sentence = cleaned_output
    outputs.append(first_sentence)

    return outputs

In [None]:
# Fresh, independent buffers for the WMT14 run (one list per model).
input_texts = list()
outputs_7b, outputs_tiny, outputs_13b = [], [], []

In [None]:
def inference_wmt14(model, tokenizer, start, array, model_name):
    """Generate WMT14 outputs for rows `start`..999 and checkpoint after each.

    Each result of `generate_output` is appended to `array`; the whole list is
    then re-pickled to the file belonging to `model_name` ("Llama7b",
    "TinyLlama" or "Llama13b"). Any other name runs without checkpointing.

    NOTE: `generate_output` is defined three times in this notebook; run the
    WMT14 definition cell before calling this function.

    Args:
        model, tokenizer: forwarded to `generate_output`.
        start: first row index to process.
        array: list extended in place with one result per row.
        model_name: selects the checkpoint file.
    """
    checkpoint_by_model = {
        "Llama7b": 'THRESHOLDING_input_output_pairs_wmt14_7b',
        "TinyLlama": 'THRESHOLDING_input_output_pairs_wmt14_tiny',
        "Llama13b": 'THRESHOLDING_input_output_pairs_wmt14_13b',
    }
    checkpoint_path = checkpoint_by_model.get(model_name)

    for current_idx in range(start, 1000):
        source_sentence = wmt14_dataset[current_idx]['translation']['de']
        array.append(generate_output(model, tokenizer, source_sentence, current_idx))

        print(f"{model_name} | CURRENT IDX: {current_idx} | Length: {len(array)}")
        if checkpoint_path is None:
            continue
        # Rewrite the full list every row so an interrupted run loses nothing.
        with open(checkpoint_path, 'wb') as f:
            pickle.dump(array, f)

In [None]:
# inference_wmt14(llama7b, llama7b_tokenizer, 0, outputs_7b, "Llama7b")

In [None]:
# inference_wmt14(tinyllama, tinyllama_tokenizer, 0, outputs_tiny, "TinyLlama")

In [None]:
# Resume the 13b WMT14 run at row 971. inference_wmt14 re-pickles the whole
# `outputs_13b` list after every row.
# NOTE(review): if `outputs_13b` is still the empty list from the reset cell,
# the checkpoint file will end up holding only rows 971-999 - confirm that
# rows 0-970 were reloaded into `outputs_13b` before running this.
inference_wmt14(llama13b, llama13b_tokenizer, 971, outputs_13b, "Llama13b")

In [None]:
# Reload the three WMT14 checkpoints and print how many rows each holds.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook.
with open("THRESHOLDING_input_output_pairs_wmt14_7b", "rb") as f:
    outputs_7b = pickle.load(f)
print(len(outputs_7b))

with open("THRESHOLDING_input_output_pairs_wmt14_tiny", "rb") as f:
    outputs_tiny = pickle.load(f)
print(len(outputs_tiny))

with open("THRESHOLDING_input_output_pairs_wmt14_13b", "rb") as f:
    outputs_13b = pickle.load(f)
print(len(outputs_13b))

In [None]:
# CNN/DailyMail (config 2.0.0) validation split. The CNN/DailyMail inference
# cells below read articles from this object by row index.
from datasets import load_dataset
cnn_dailymail_dataset = load_dataset('abisee/cnn_dailymail', '2.0.0', split='validation')

In [None]:
def generate_output(model, tokenizer, dataset, current_idx):
    """Summarize one CNN/DailyMail article with `model`.

    NOTE: this notebook defines `generate_output` three times (WMT14,
    CNN/DailyMail, GSM8K); whichever definition cell ran last is the one in
    effect. This is the CNN/DailyMail variant.

    Args:
        model: causal LM exposing `generate(...)` and a `device` attribute.
        tokenizer: tokenizer matching `model`.
        dataset: unused - the article is always read from the module-level
            `cnn_dailymail_dataset`. Kept so existing call sites keep working.
        current_idx: row index into `cnn_dailymail_dataset`.

    Returns:
        A one-element list holding the generated text after the summary
        prefix (at most 100 new tokens are generated).
    """
    outputs = []

    input_text = cnn_dailymail_dataset[current_idx]['article']
    input_prompt = "Summarize the following text in under 50 words: \n\n" + input_text + "\n\n Write the summary here: "

    # Send the inputs to the model's own device instead of assuming "cuda";
    # the 13b model is placed by device_map='auto'.
    inputs = tokenizer(input_prompt, return_tensors="pt", truncation=True).to(model.device)
    # Pass the attention mask along with the ids so generate() does not have
    # to guess it.
    output = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_new_tokens=100)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # The decoded text echoes the prompt; keep only what follows the prefix.
    summary_prefix = "Write the summary here: "
    if summary_prefix in output_text:
        cleaned_output = output_text.split(summary_prefix)[-1].strip()
    else:
        cleaned_output = output_text.strip()

    outputs.append(cleaned_output)

    return outputs

In [None]:
# Fresh, independent buffers for the CNN/DailyMail run (one list per model).
input_texts = []
outputs_7b, outputs_tiny, outputs_13b = list(), list(), list()

In [None]:
def inference_cnn_dailymail(model, tokenizer, start, array, model_name):
    """Generate CNN/DailyMail outputs for rows `start`..999, checkpointing each.

    Each result of `generate_output` is appended to `array`; the whole list is
    then re-pickled to the file belonging to `model_name` ("Llama7b",
    "TinyLlama" or "Llama13b"). Any other name runs without checkpointing.

    NOTE: `generate_output` is defined three times in this notebook; run the
    CNN/DailyMail definition cell before calling this function.

    Args:
        model, tokenizer: forwarded to `generate_output`.
        start: first row index to process.
        array: list extended in place with one result per row.
        model_name: selects the checkpoint file.
    """
    checkpoint_by_model = {
        "Llama7b": 'THRESHOLDING_input_output_pairs_cnn_dailymail_7b',
        "TinyLlama": 'THRESHOLDING_input_output_pairs_cnn_dailymail_tiny',
        "Llama13b": 'THRESHOLDING_input_output_pairs_cnn_dailymail_13b',
    }
    checkpoint_path = checkpoint_by_model.get(model_name)

    for current_idx in range(start, 1000):
        article = cnn_dailymail_dataset[current_idx]['article']
        array.append(generate_output(model, tokenizer, article, current_idx))

        print(f"{model_name} | CURRENT IDX: {current_idx} | Length: {len(array)}")
        if checkpoint_path is None:
            continue
        # Rewrite the full list every row so an interrupted run loses nothing.
        with open(checkpoint_path, 'wb') as f:
            pickle.dump(array, f)

In [None]:
# inference_cnn_dailymail(llama7b, llama7b_tokenizer, 0, outputs_7b, "Llama7b")

In [None]:
# inference_cnn_dailymail(tinyllama, tinyllama_tokenizer, 0, outputs_tiny, "TinyLlama")

In [None]:
# Resume the 13b CNN/DailyMail run at row 860. inference_cnn_dailymail
# re-pickles the whole `outputs_13b` list after every row.
# NOTE(review): if `outputs_13b` is still the empty list from the reset cell,
# the checkpoint file will end up holding only rows 860-999 - confirm that
# rows 0-859 were reloaded into `outputs_13b` before running this.
inference_cnn_dailymail(llama13b, llama13b_tokenizer, 860, outputs_13b, "Llama13b")

In [None]:
# Reload the three CNN/DailyMail checkpoints and print how many rows each holds.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook.
with open("THRESHOLDING_input_output_pairs_cnn_dailymail_7b", "rb") as f:
    outputs_7b = pickle.load(f)
print(len(outputs_7b))

with open("THRESHOLDING_input_output_pairs_cnn_dailymail_tiny", "rb") as f:
    outputs_tiny = pickle.load(f)
print(len(outputs_tiny))

with open("THRESHOLDING_input_output_pairs_cnn_dailymail_13b", "rb") as f:
    outputs_13b = pickle.load(f)
print(len(outputs_13b))

In [None]:
# GSM8K 'main' config, train split. The GSM8K inference cells below read
# questions from this object by row index, so every cached GSM8K output is
# aligned to this split; any other cell that pairs those outputs with GSM8K
# rows must load the same split.
from datasets import load_dataset
gsm8k_dataset = load_dataset('openai/gsm8k', 'main', split='train')

In [None]:
def generate_output(model, tokenizer, dataset, current_idx):
    """Answer one GSM8K question with `model` using a step-by-step prompt.

    NOTE: this notebook defines `generate_output` three times (WMT14,
    CNN/DailyMail, GSM8K); whichever definition cell ran last is the one in
    effect. This is the GSM8K variant.

    Args:
        model: causal LM exposing `generate(...)` and a `device` attribute.
        tokenizer: tokenizer matching `model`.
        dataset: unused - the question is always read from the module-level
            `gsm8k_dataset`. Kept so existing call sites keep working.
        current_idx: row index into `gsm8k_dataset`.

    Returns:
        A one-element list holding the generated text after the
        "Lets think step by step: " prefix.
    """
    outputs = []

    input_text = gsm8k_dataset[current_idx]['question']
    input_prompt = "Answer the following math question: \n\n" + input_text + "\n\n Lets think step by step: "

    # Send the inputs to the model's own device instead of assuming "cuda";
    # the 13b model is placed by device_map='auto'.
    inputs = tokenizer(input_prompt, return_tensors="pt", truncation=True).to(model.device)
    # Pass the attention mask along with the ids so generate() does not have
    # to guess it.
    # NOTE(review): no max_new_tokens is set, so the output length is governed
    # by the model's own generation settings - confirm this is intended.
    output = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # The decoded text echoes the prompt; keep only what follows the prefix.
    answer_prefix = "Lets think step by step: "
    if answer_prefix in output_text:
        cleaned_output = output_text.split(answer_prefix)[-1].strip()
    else:
        cleaned_output = output_text.strip()

    outputs.append(cleaned_output)

    return outputs

In [None]:
# Fresh, independent buffers for the GSM8K run (one list per model).
input_texts = []
outputs_7b = list()
outputs_tiny = list()
outputs_13b = list()

In [None]:
def inference_gsm8k(model, tokenizer, start, array, model_name):
    """Generate GSM8K outputs for rows `start`..999 and checkpoint after each.

    Each result of `generate_output` is appended to `array`; the whole list is
    then re-pickled to the file belonging to `model_name` ("Llama7b",
    "TinyLlama" or "Llama13b"). Any other name runs without checkpointing.

    NOTE: `generate_output` is defined three times in this notebook; run the
    GSM8K definition cell before calling this function.

    Args:
        model, tokenizer: forwarded to `generate_output`.
        start: first row index to process.
        array: list extended in place with one result per row.
        model_name: selects the checkpoint file.
    """
    checkpoint_by_model = {
        "Llama7b": 'THRESHOLDING_input_output_pairs_gsm8k_7b',
        "TinyLlama": 'THRESHOLDING_input_output_pairs_gsm8k_tiny',
        "Llama13b": 'THRESHOLDING_input_output_pairs_gsm8k_13b',
    }
    checkpoint_path = checkpoint_by_model.get(model_name)

    for current_idx in range(start, 1000):
        question = gsm8k_dataset[current_idx]['question']
        array.append(generate_output(model, tokenizer, question, current_idx))

        print(f"{model_name} | CURRENT IDX: {current_idx} | Length: {len(array)}")
        if checkpoint_path is None:
            continue
        # Rewrite the full list every row so an interrupted run loses nothing.
        with open(checkpoint_path, 'wb') as f:
            pickle.dump(array, f)

In [None]:
# inference_gsm8k(llama7b, llama7b_tokenizer, 0, outputs_7b, "Llama7b")

In [None]:
# inference_gsm8k(tinyllama, tinyllama_tokenizer, 0, outputs_tiny, "TinyLlama")

In [None]:
# inference_gsm8k(llama13b, llama13b_tokenizer, 997, outputs_13b, "Llama13b")

In [None]:
# Reload all nine checkpoints (3 tasks x 3 models) into task-specific names and
# print how many rows each holds.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook.

# Load outputs for WMT14
with open("THRESHOLDING_input_output_pairs_wmt14_7b", "rb") as f:
    outputs_wmt14_7b = pickle.load(f)
print(f"WMT14 7b: {len(outputs_wmt14_7b)}")

with open("THRESHOLDING_input_output_pairs_wmt14_tiny", "rb") as f:
    outputs_wmt14_tiny = pickle.load(f)
print(f"WMT14 Tiny: {len(outputs_wmt14_tiny)}")

with open("THRESHOLDING_input_output_pairs_wmt14_13b", "rb") as f:
    outputs_wmt14_13b = pickle.load(f)
print(f"WMT14 13b: {len(outputs_wmt14_13b)}")

# Load outputs for CNN/DailyMail
with open("THRESHOLDING_input_output_pairs_cnn_dailymail_7b", "rb") as f:
    outputs_cnn_dailymail_7b = pickle.load(f)
print(f"CNN/DailyMail 7b: {len(outputs_cnn_dailymail_7b)}")

with open("THRESHOLDING_input_output_pairs_cnn_dailymail_tiny", "rb") as f:
    outputs_cnn_dailymail_tiny = pickle.load(f)
print(f"CNN/DailyMail Tiny: {len(outputs_cnn_dailymail_tiny)}")

with open("THRESHOLDING_input_output_pairs_cnn_dailymail_13b", "rb") as f:
    outputs_cnn_dailymail_13b = pickle.load(f)
print(f"CNN/DailyMail 13b: {len(outputs_cnn_dailymail_13b)}")

# Load outputs for GSM8K
with open("THRESHOLDING_input_output_pairs_gsm8k_7b", "rb") as f:
    outputs_gsm8k_7b = pickle.load(f)
print(f"GSM8K 7b: {len(outputs_gsm8k_7b)}")

with open("THRESHOLDING_input_output_pairs_gsm8k_tiny", "rb") as f:
    outputs_gsm8k_tiny = pickle.load(f)
print(f"GSM8K Tiny: {len(outputs_gsm8k_tiny)}")

with open("THRESHOLDING_input_output_pairs_gsm8k_13b", "rb") as f:
    outputs_gsm8k_13b = pickle.load(f)
print(f"GSM8K 13b: {len(outputs_gsm8k_13b)}")

In [None]:
# Reload the three GSM8K checkpoints into the generic buffer names and print
# how many rows each holds.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook.
with open("THRESHOLDING_input_output_pairs_gsm8k_7b", "rb") as f:
    outputs_7b = pickle.load(f)
print(len(outputs_7b))

with open("THRESHOLDING_input_output_pairs_gsm8k_tiny", "rb") as f:
    outputs_tiny = pickle.load(f)
print(len(outputs_tiny))

with open("THRESHOLDING_input_output_pairs_gsm8k_13b", "rb") as f:
    outputs_13b = pickle.load(f)
print(len(outputs_13b))

In [None]:
def create_combined_dataset(outputs_7b, outputs_tiny, outputs_13b, dataset, input_key, label_key):
    """Zip dataset rows with the three models' outputs, matched by position.

    Args:
        outputs_7b, outputs_tiny, outputs_13b: per-row model outputs; the
            number of rows produced equals `len(outputs_7b)`.
        dataset: indexable collection of row dicts.
        input_key: row key holding the input text. The special value
            'translation' selects row['translation']['de'] as input and
            row['translation']['en'] as label, ignoring `label_key`.
        label_key: row key holding the reference (non-translation case).

    Returns:
        List of dicts with keys 'input', 'label', '7b', 'tiny', '13b'.
    """
    translation_task = input_key == 'translation'  # Special case for WMT14

    combined = []
    for idx, out_7b in enumerate(outputs_7b):
        if translation_task:
            source = dataset[idx]['translation']['de']
            reference = dataset[idx]['translation']['en']
        else:
            source = dataset[idx][input_key]
            reference = dataset[idx][label_key]

        combined.append({
            'input': source,
            'label': reference,
            '7b': out_7b,
            'tiny': outputs_tiny[idx],
            '13b': outputs_13b[idx],
        })
    return combined

In [None]:
# Build one combined list per task. For WMT14 the 'translation' key triggers
# the de -> input / en -> label special case; the other two tasks name their
# input and label columns directly.
# Rows are matched by position, so each dataset object must be the one the
# corresponding outputs were generated from.
combined_wmt14 = create_combined_dataset(outputs_wmt14_7b, outputs_wmt14_tiny, outputs_wmt14_13b, wmt14_dataset, 'translation', 'translation')
combined_cnn_dailymail = create_combined_dataset(outputs_cnn_dailymail_7b, outputs_cnn_dailymail_tiny, outputs_cnn_dailymail_13b, cnn_dailymail_dataset, 'article', 'highlights')
# NOTE(review): the GSM8K label stored here is the raw 'answer' column -
# presumably the full reference solution rather than a bare number; confirm
# that downstream scoring extracts the final answer from it.
combined_gsm8k = create_combined_dataset(outputs_gsm8k_7b, outputs_gsm8k_tiny, outputs_gsm8k_13b, gsm8k_dataset, 'question', 'answer')

In [None]:
# Bundle the per-task lists under the task names used as keys by the
# evaluation code, and persist them as 'combined_outputs.pkl' (the file the
# thresholding cells load as `model_outputs`).
combined_outputs = {
    'wmt14': combined_wmt14,
    'cnn_dailymail': combined_cnn_dailymail,
    'gsm8k': combined_gsm8k
}

with open("combined_outputs.pkl", "wb") as f:
    pickle.dump(combined_outputs, f)

print("Combined outputs have been saved to 'combined_outputs.pkl'")

In [None]:
# Spot-check one combined WMT14 sample (input, label and the three model outputs).
print(combined_outputs['wmt14'][100])