In [1]:
from rich import print as rprint
import pandas as pd

from src.dataloaders import OverrulingDataset, HeadlineDataset, CoQADataset, GSM8KDataset
from src.methods import frugalgpt_scorer
from src.methods.methods import EnsembleCascade, MOTLLMCascade, FrugalGPT, AutoMix
from src.api_service import TogetherAIAPI, OpenAIAPI



In [2]:
import os
os.environ['TOGETHER_API_KEY'] = '5de421f4d56d44ac7400e98c3cac5dc98e184bc92e297e552aadd7198def0661'

## GSM8K Task

In [3]:
Task1 = GSM8KDataset()
API1 = TogetherAIAPI(TaskData=Task1)


### Single Models Run

In [4]:
single_models = [
    'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
    'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
    'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    'meta-llama/Meta-Llama-3-8B-Instruct-Turbo',
    'meta-llama/Meta-Llama-3-70B-Instruct-Turbo',
    'meta-llama/Meta-Llama-3-8B-Instruct-Lite',
    'meta-llama/Meta-Llama-3-70B-Instruct-Lite',
    'meta-llama/Llama-3-8b-chat-hf',
    'meta-llama/Llama-3-70b-chat-hf',
    # 'microsoft/WizardLM-2-8x22B',
    'google/gemma-2-27b-it',
    'google/gemma-2-9b-it',
    'databricks/dbrx-instruct',
    'deepseek-ai/deepseek-llm-67b-chat',
    'google/gemma-2b-it',
    'Gryphe/MythoMax-L2-13b',
    'meta-llama/Llama-2-13b-chat-hf',
    'mistralai/Mistral-7B-Instruct-v0.1',
    'mistralai/Mistral-7B-Instruct-v0.2',
    'mistralai/Mistral-7B-Instruct-v0.3',
    # 'mistralai/Mixtral-8x7B-Instruct-v0.1',
    # 'mistralai/Mixtral-8x22B-Instruct-v0.1',
    # 'NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO',
    # 'NousResearch/Nous-Hermes-2-Yi-34B',
    # 'NousResearch/Hermes-3-Llama-3.1-405B-Turbo',
    'Qwen/Qwen1.5-72B-Chat',
    'Qwen/Qwen1.5-110B-Chat',
    # 'Qwen/Qwen2-72B-Instruct',
    'togethercomputer/StripedHyena-Nous-7B'
]

In [5]:
results = []

In [None]:
for model in single_models:
    print(f"Running inference on {model}...")
    single_run = EnsembleCascade( 
        # ensemble cascade works well for just a single model, if only one model is passed in
        API1, Task1, [model],
    )
    accurracy, avg_latency, total_cost = single_run.inference_cascade(len_data=50)
    print(accurracy, avg_latency, total_cost)
    results.append({
        "model": model.split('/')[-1],
        "accuracy": accurracy,
        "cost": total_cost,
        "avg_latency": avg_latency,
    })

Running inference on meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo...
Starting inference engine...
Pattern error in regex. Using the old-fashioned way...
Answer not found in ==> `{r}`


Pattern error in regex. Using the old-fashioned way...
Answer not found in ==> `{r}`


Pattern error in regex. Using the old-fashioned way...
Answer not found in ==> `{r}`


Pattern error in regex. Using the old-fashioned way...
Answer not found in ==> `{r}`


Pattern error in regex. Using the old-fashioned way...
Answer not found in ==> `{r}`


Calculating accuracy with offline labels...
0.78 1.0289650964736938 0.0136188
Running inference on meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo...
Starting inference engine...
Pattern error in regex. Using the old-fashioned way...
Answer not found in ==> `{r}`


Calculating accuracy with offline labels...
0.96 2.5431721925735475 0.06450576
Running inference on meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo...
Starting inference engine...
Calculating accuracy with o

In [None]:
df_results = pd.DataFrame(results) # add cascade results later

df_results.to_csv("single_models_gsm8k.csv", index=False)
df_results

### Two-level Cascade

In [None]:
test_mot = MOTLLMCascade(
    ServiceProvider=API1,
    TaskData=Task1,
    cascade_tier_models=[
        'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
    ]
)
test_mot.inference_cascade(len_data=25)

In [None]:
test_ensemble = EnsembleCascade(
    ServiceProvider=API1,
    TaskData=Task1,
    cascade_tier_models=[
        [
            'meta-llama/Llama-3-8b-chat-hf',
            'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
            'google/gemma-2b-it',
        ],
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
    ],
)
test_ensemble.inference_cascade(len_data=25)

### Three-level Cascade

In [None]:
test_mot3 = MOTLLMCascade(
    ServiceProvider=API1,
    TaskData=Task1,
    cascade_tier_models=[
        'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    ]
)
test_mot.inference_cascade(len_data=25)

In [None]:
test_ensemble3 = EnsembleCascade(
    ServiceProvider=API1,
    TaskData=Task1,
    cascade_tier_models=[
        [
            'meta-llama/Llama-3-8b-chat-hf',
            'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
            'mistralai/Mistral-7B-Instruct-v0.3',
        ],
        [
            'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
            'meta-llama/Meta-Llama-3-70B-Instruct-Lite',
            'google/gemma-2-27b-it',
        ],
        'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    ],
)
test_ensemble.inference_cascade(len_data=25)

It is quite interesting! A 3-level cascade can achieve a perfect accuracy score, comparable with the best model (Llama-3.1-405) performance; 0.02285:0.183515 = 0.12 (almost 10 times cheaper)

## OVERRULING TASK

In [None]:
Task2 = OverrulingDataset()
API2 = TogetherAIAPI(TaskData=Task2)

### Single Models Run

In [None]:
results = []

In [None]:
for model in single_models:
    print(f"Running inference on {model}...")
    single_run = EnsembleCascade( 
        # ensemble cascade works well for just a single model, if only one model is passed in
        API2, Task2, [model],
    )
    accurracy, avg_latency, total_cost = single_run.inference_cascade(len_data=50)
    print(accurracy, avg_latency, total_cost)
    results.append({
        "model": model.split('/')[-1],
        "accuracy": accurracy,
        "cost": total_cost,
        "avg_latency": avg_latency,
    })

In [None]:
df_results = pd.DataFrame(results) # add cascade results later

df_results.to_csv("single_models_overruling.csv", index=False)
df_results

### Two-level Cascade

In [None]:
test2_mot = MOTLLMCascade(
    ServiceProvider=API2,
    TaskData=Task2,
    cascade_tier_models=[
        'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
    ]
)
test2_mot.inference_cascade(len_data=25)

In [None]:
test2_ensemble = EnsembleCascade(
    ServiceProvider=API2,
    TaskData=Task2,
    cascade_tier_models=[
        [
            'meta-llama/Llama-3-8b-chat-hf',
            'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
            'google/gemma-2b-it',
        ],
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
    ],
)
test2_ensemble.inference_cascade(len_data=25)

### Three-level Cascade

In [None]:
test2_mot3 = MOTLLMCascade(
    ServiceProvider=API2,
    TaskData=Task2,
    cascade_tier_models=[
        'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    ]
)
test2_mot3.inference_cascade(len_data=25)

In [None]:
test2_ensemble3 = EnsembleCascade(
    ServiceProvider=API2,
    TaskData=Task2,
    cascade_tier_models=[
        [
            'meta-llama/Llama-3-8b-chat-hf',
            'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
            'mistralai/Mistral-7B-Instruct-v0.3',
        ],
        [
            'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
            'meta-llama/Meta-Llama-3-70B-Instruct-Lite',
            'google/gemma-2-27b-it',
        ],
        'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    ],
)
test2_ensemble3.inference_cascade(len_data=25)

## HEADLINE Task

In [None]:
Task3 = HeadlineDataset()
API3 = TogetherAIAPI(TaskData=Task3)

### Single Models Run

In [None]:
results = []

In [None]:
for model in single_models:
    print(f"Running inference on {model}...")
    single_run = EnsembleCascade( 
        # ensemble cascade works well for just a single model, if only one model is passed in
        API3, Task3, [model],
    )
    accurracy, avg_latency, total_cost = single_run.inference_cascade(len_data=50)
    print(accurracy, avg_latency, total_cost)
    results.append({
        "model": model.split('/')[-1],
        "accuracy": accurracy,
        "cost": total_cost,
        "avg_latency": avg_latency,
    })

In [None]:
df_results = pd.DataFrame(results) # add cascade results later

df_results.to_csv("single_models_headlines.csv", index=False)
df_results

### Two-level Cascade

In [None]:
test3_mot = MOTLLMCascade(
    ServiceProvider=API3,
    TaskData=Task3,
    cascade_tier_models=[
        'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
    ]
)
test3_mot.inference_cascade(len_data=25)

In [None]:
test3_ensemble = EnsembleCascade(
    ServiceProvider=API3,
    TaskData=Task3,
    cascade_tier_models=[
        [
            'meta-llama/Llama-3-8b-chat-hf',
            'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
            'google/gemma-2b-it',
        ],
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
    ],
)
test3_ensemble.inference_cascade(len_data=25)

### Three-level Cascade

In [None]:
test3_mot3 = MOTLLMCascade(
    ServiceProvider=API3,
    TaskData=Task3,
    cascade_tier_models=[
        'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    ]
)
test3_mot3.inference_cascade(len_data=25)

In [None]:
test3_ensemble3 = EnsembleCascade(
    ServiceProvider=API3,
    TaskData=Task3,
    cascade_tier_models=[
        [
            'meta-llama/Llama-3-8b-chat-hf',
            'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
            'mistralai/Mistral-7B-Instruct-v0.3',
        ],
        [
            'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
            'meta-llama/Meta-Llama-3-70B-Instruct-Lite',
            'google/gemma-2-27b-it',
        ],
        'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    ],
)
test3_ensemble3.inference_cascade(len_data=25)

## CoQA Task

In [None]:
Task4 = CoQADataset()
API4 = TogetherAIAPI(TaskData=Task4)

### Single Models Run

In [5]:
results = []

In [None]:
for model in single_models:
    print(f"Running inference on {model}...")
    single_run = EnsembleCascade( 
        # ensemble cascade works well for just a single model, if only one model is passed in
        API4, Task4, [model],
    )
    accurracy, avg_latency, total_cost = single_run.inference_cascade(len_data=50)
    print(accurracy, avg_latency, total_cost)
    results.append({
        "model": model.split('/')[-1],
        "accuracy": accurracy,
        "cost": total_cost,
        "avg_latency": avg_latency,
    })

Running inference on meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo...
Starting inference engine...
18 1
Answer: Janet eats 3 + 4 = 7 eggs for her breakfast and muffins. This leaves 16 - 7 = 9 eggs to sell. Janet makes 9 x $2 = $<<9*2=18>>18 every day at the farmers' market.
#### 18
['18']
Exiting at tier  0
3 1
Answer: Since the robe takes 2 bolts of blue fiber, it takes 2/2 = 1 bolt of white fiber. So in total, it takes 2 + 1 = 3 bolts. 
#### 3
['3']
Exiting at tier  0
90 1
Answer: The value of the house increased to 150% of $80,000, which is 1.5 * $80,000 = $120,000. The amount he spent on repairs was $50,000, so the total value of the house after repairs is $120,000 + $50,000 = $170,000. The profit he made is $170,000 - $80,000 = $<<170000-80000=90000>>90,000.
#### 90,000
['90']
Exiting at tier  0
540 1
Answer: James runs 60 meters each sprint. Since he runs 3 sprints, he runs 60 * 3 = 180 meters each time he runs. He runs 3 times a week, so he runs a total of 180 * 3 = 540 meters a w

In [None]:
df_results = pd.DataFrame(results) # add cascade results later

df_results.to_csv("single_models_coqa.csv", index=False)
df_results

### Two-level Cascade

In [5]:
test4_mot = MOTLLMCascade(
    ServiceProvider=API4,
    TaskData=Task4,
    cascade_tier_models=[
        'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
    ]
)
test4_mot.inference_cascade(len_data=25)

Starting inference engine...
18 3
("Answer: Janet eats 3 + 4 = 7 eggs every day. She has 16 - 7 = 9 eggs left. Janet sells 9 eggs at the farmers' market for 9 x 2 = $<<9*2=18>>18.\n#### 18", "Answer: Janet eats 3 eggs for breakfast and bakes 4, so she uses 3+4 = <<3+4=7>>7 eggs. 16 - 7 = 9 eggs are left. Janet makes 9 x $2 = $<<9*2=18>>18 at the farmers' market every day.\n#### 18", "Answer: Janet's ducks lay 16 eggs a day. Janet eats 3 for breakfast, and uses 4 for muffins. So she has 16 - 3 - 4 = 9 eggs left. Each egg is sold for $2. So she makes 9 * 2 = $<<9*2=18>>18 every day.\n#### 18")
['18', '18', '18']
Exiting at tier  0
Answer not found in ==> `Answer: A robe takes 2 bolts of blue fiber. It takes half that much, which is 2 / 2 = 1, of white fiber. In total, a robe takes 2 + 1 = 3 bolts of fiber.`


3 2
('Answer: It takes 2 bolts of blue fiber. It takes half that amount of white fiber, which is 2 / 2 = 1 bolt. In total, it takes 2 + 1 = 3 bolts.\n#### 3', 'Answer: The robe take

(0.7, 2.0737594604492187, 0.01604366)

In [6]:
test4_ensemble = EnsembleCascade(
    ServiceProvider=API4,
    TaskData=Task4,
    cascade_tier_models=[
        [
            'meta-llama/Llama-3-8b-chat-hf',
            'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
            'google/gemma-2b-it',
        ],
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
    ],
)
test4_ensemble.inference_cascade(len_data=25)

Starting inference engine...
Answer not found in ==> `Answer: She eats 3 eggs for breakfast 3*16 = <<3*16=48>>48 times per day. She bakes 4 muffins for her friends 4 muffins * 4 = <<4*4=16>>16 times per day. She sells the remainder at the farmers' market 48-16 = <<48-16=32>>32 times per day. So, she sells 32 * 2 = $<<32*2=64>>64 eggs every day at the farmers' market. That means she makes 64 * 2 = $<<64*2=128>>128 in dollars every day at the farmers' market.`


18 2
('Answer: Janet lays 16 eggs per day. She eats 3 eggs and bakes 4 eggs, so she has 16 - 3 - 4 = 9 eggs left. She sells these eggs for $2 each, so she makes 9 * 2 = $18. \n#### 18', "Answer: Janet eats 3 + 4 = 7 eggs per day. She lays 16 eggs per day, so she sells 16 - 7 = 9 eggs per day. Since she sells 9 eggs at $2 each, she makes 9 * 2 = $<<9*2=18>>18 per day at the farmers' market. \n#### 18", "Answer: She eats 3 eggs for breakfast 3*16 = <<3*16=48>>48 times per day. She bakes 4 muffins for her friends 4 muffins * 4 = <<4

(0.8, 3.8748287916183473, 0.012942160000000003)

### Three-level Cascade

In [7]:
test4_mot3 = MOTLLMCascade(
    ServiceProvider=API4,
    TaskData=Task4,
    cascade_tier_models=[
        'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
        'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    ]
)
test4_mot.inference_cascade(len_data=25)

Starting inference engine...
18 3
("Answer: Janet eats 3 + 4 = 7 eggs every day. She lays 16 eggs per day, so she has 16 - 7 = 9 eggs left. She sells 9 eggs at $2 each, so she makes 9 x 2 = $18 every day at the farmers' market.\n#### 18", "Answer: Janet eats 7 eggs a day for breakfast because 3 + 4 = 7 She has 16 - 7 = 9 eggs left after breakfast She sells the rest at the farmers' market for $2 each, so she makes 9 x 2 = $18.\n#### 18", "Answer: Janet eats 3 + 4 = <<3+4=7>>7 eggs in the morning. So, she is left with 16 - 7 = 9 eggs per day. Since she makes $2 per egg, she makes 9 * 2 = $<<9*2=18>>18 per day at the farmers' market.\n#### 18")
['18', '18', '18']
Exiting at tier  0
3 3
('Answer: A robe takes 2 bolts of blue fiber and half that much white fiber. Half of 2 is 1, so it takes 1 bolt of white fiber. In total, it takes 2 + 1 = 3 bolts.\n#### 3', 'Answer: A robe takes 2 bolts of blue fiber. It takes half as much white fiber as blue, so it takes 2 / 2 = 1 bolts of white fiber.  I

(0.9, 2.439996600151062, 0.038672759999999994)

In [8]:
test4_ensemble3 = EnsembleCascade(
    ServiceProvider=API4,
    TaskData=Task4,
    cascade_tier_models=[
        [
            'meta-llama/Llama-3-8b-chat-hf',
            'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
            'mistralai/Mistral-7B-Instruct-v0.3',
        ],
        [
            'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
            'meta-llama/Meta-Llama-3-70B-Instruct-Lite',
            'google/gemma-2-27b-it',
        ],
        'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    ],
)
test4_ensemble3.inference_cascade(len_data=25)

Starting inference engine...
18 3
("Here is the completed text:\n\nAnswer: Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. That leaves 16 - 3 - 4 = 9 eggs. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. So, she makes 9 x 2 = <<9*2=18>>18 dollars every day at the farmers' market.\n#### 18", 'Answer: Janet eats 3 for breakfast and bakes 4, so she eats and bakes 3+4 = 7 eggs. There are 16 - 7 = 9 eggs left. Janet sells 9 eggs at $2 each so 9*2 = $<<9*2=18>>18.\n#### 18', " Answer: Janet sells the remaining eggs after eating three for breakfast and using four for muffins. So, she sells 16 - 3 - 4 = <<16-3-4=9>>9 eggs daily. She sells these eggs for $2 each, so her daily income at the farmers' market is 9 * 2 = $<<9*2=18>>18.\n#### 18")
['18', '18', '18']
Exiting at tier  0
3 3
('#### 3\n\nExplanation:\nA robe takes 2 bolts of blue fiber. It takes half that much white fiber,

(1.0, 5.176269102096557, 0.02285)

In [None]:

test = OverrulingDataset(train_frugalgpt_scorer=True)
test2 = HeadlineDataset(train_frugalgpt_scorer=True)
test3 = CoQADataset(train_frugalgpt_scorer=True)
test4 = GSM8KDataset(train_frugalgpt_scorer=True)

#### Plots

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from adjustText import adjust_text

sns.set_style("whitegrid")

In [None]:
font_ = 15
legendfont_ = 15
matplotlib.rcParams.update({"text.usetex": False, 'font.size': font_})
matplotlib.rcParams.update({"legend.fontsize": legendfont_})
plt.rcParams['font.family'] = 'serif'

In [None]:
plt.figure(figsize=(12, 8))
scatter = plt.scatter(df_results['cost'], df_results['accuracy'], s=30, color='blue')

texts = []
for i, model in enumerate(df_results['model']):
    texts.append(plt.text(df_results['cost'][i], df_results['accuracy'][i], model, fontsize=10))

plt.title('Accuracy vs Cost', fontsize=14)
plt.xlabel('Cost', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)

plt.grid(True, linestyle='--', alpha=0.7)

# Adjust the text labels to minimize overlap
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red', lw=0.5))

plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(12, 8))
scatter = plt.scatter(df_results['avg_latency'], df_results['accuracy'], s=30, color='blue')

texts = []
for i, model in enumerate(df_results['model']):
    texts.append(plt.text(df_results['avg_latency'][i], df_results['accuracy'][i], model, fontsize=10))

plt.title('Accuracy vs Latency', fontsize=14)
plt.xlabel('Latency', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)

plt.grid(True, linestyle='--', alpha=0.7)

# Adjust the text labels to minimize overlap
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red', lw=0.5))

plt.tight_layout()
plt.show()