In [1]:
from run_bench import save_overall_results, dataset_metadata
from pathlib import Path
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
import dash

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the overall comparison from the saved benchmark runs and display it.
# NOTE(review): presumably returns a Plotly figure when method="plotly" —
# confirm against run_bench.save_overall_results.
fig = save_overall_results(
    method="plotly",
    results_folder=Path("results"),
)
fig

In [3]:
# Load the aggregated results and order them from highest to lowest raw score.
results_df = pl.read_parquet(Path("results/results.parquet")).sort("score").reverse()

# Only the leaderboard columns are printed. The Config context widens string
# cells to 50 characters and raises the row limit to 1000 for this print only.
leaderboard = results_df.select(["model", "normalized_score", "score"])
with pl.Config(fmt_str_lengths=50, tbl_rows=1000):
    print(leaderboard)

shape: (42, 3)
┌─────────────────────────────────────────────────────┬──────────────────┬───────┐
│ model                                               ┆ normalized_score ┆ score │
│ ---                                                 ┆ ---              ┆ ---   │
│ str                                                 ┆ f64              ┆ i64   │
╞═════════════════════════════════════════════════════╪══════════════════╪═══════╡
│ dolphin-2_2-yi-34b.Q3_K_L.gguf                      ┆ 0.635556         ┆ 2860  │
│ yi-34b-giftedconvo-merged.Q3_K_L.gguf               ┆ 0.618889         ┆ 2785  │
│ yi-34b.Q4_K_S.gguf                                  ┆ 0.615778         ┆ 2771  │
│ openchat_3.5.Q6_K.gguf                              ┆ 0.615333         ┆ 2769  │
│ nous-capybara-34b.Q3_K_L.gguf                       ┆ 0.607333         ┆ 2733  │
│ openhermes-2.5-mistral-7b.Q6_K.gguf                 ┆ 0.603111         ┆ 2714  │
│ mistral-7b-openorca-oasst_top1_2023-08-25-v1.Q6_K.… ┆ 0.602889        

In [4]:
# Flatten the per-category score mapping into one column per category
# (alphabetical order), keeping the model name and overall normalized score.
row_dicts = results_df.select(
    ["model", "normalized_score", "normalized_category_scores"]
).to_dicts()
for record in row_dicts:
    per_category = record.pop("normalized_category_scores")
    record.update(
        (category, per_category.get(category)) for category in sorted(per_category)
    )
scores_df = pl.DataFrame(row_dicts)
scores_df


model,normalized_score,Comprehension/Reading Comprehension,Knowledge/Multi-subject Test,Math/Mathmatical Reasoning,Reasoning/Commonsense Reasoning,Safety/Truthfulness
str,f64,f64,f64,f64,f64,f64
"""dolphin-2_2-yi…",0.635556,0.85,0.765,0.334,0.596,0.622
"""yi-34b-giftedc…",0.618889,0.858,0.774,0.286,0.558,0.646
"""yi-34b.Q4_K_S.…",0.615778,0.864,0.75,0.314,0.5745,0.566
"""openchat_3.5.Q…",0.615333,0.824,0.695,0.316,0.606,0.584
"""nous-capybara-…",0.607333,0.846,0.76,0.264,0.557,0.608
"""openhermes-2.5…",0.603111,0.796,0.717,0.258,0.5765,0.634
"""mistral-7b-ope…",0.602889,0.792,0.698,0.268,0.6135,0.516
"""yi-34b-200k-ll…",0.594667,0.87,0.735,0.374,0.4985,0.644
"""speechless-mis…",0.594667,0.81,0.704,0.258,0.588,0.524
"""neural-chat-7b…",0.592,0.794,0.683,0.276,0.5615,0.646


In [5]:
# Flatten the per-dataset normalized scores into one column per dataset
# (alphabetical order), next to the model name and its overall scores.
selected_rows = results_df.select(
    ["model", "normalized_score", "score", "normalized_scores"]
).to_dicts()
row_dicts = []
for record in selected_rows:
    per_dataset = record.pop("normalized_scores")
    flattened = {name: per_dataset.get(name) for name in sorted(per_dataset)}
    row_dicts.append({**record, **flattened})
scores_df = pl.DataFrame(row_dicts)
scores_df

model,normalized_score,score,ai2_arc-arc-challenge-test,alekseykorshuk_hellaswag-arc-easy-validation,anli--test_r1,cais_mmlu-all-validation,cosmos_qa--validation,math_qa--validation,race-high-validation,truthful_qa-multiple_choice-validation,winogrande-winogrande_s-validation
str,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""dolphin-2_2-yi…",0.635556,2860,0.876,0.754,0.64,0.654,0.852,0.334,0.85,0.622,0.138
"""yi-34b-giftedc…",0.618889,2785,0.882,0.614,0.622,0.666,0.862,0.286,0.858,0.646,0.134
"""yi-34b.Q4_K_S.…",0.615778,2771,0.868,0.766,0.538,0.632,0.866,0.314,0.864,0.566,0.128
"""openchat_3.5.Q…",0.615333,2769,0.808,0.702,0.686,0.582,0.826,0.316,0.824,0.584,0.21
"""nous-capybara-…",0.607333,2733,0.864,0.68,0.558,0.656,0.862,0.264,0.846,0.608,0.128
"""openhermes-2.5…",0.603111,2714,0.83,0.682,0.638,0.604,0.798,0.258,0.796,0.634,0.188
"""mistral-7b-ope…",0.602889,2713,0.808,0.666,0.65,0.588,0.798,0.268,0.792,0.516,0.34
"""yi-34b-200k-ll…",0.594667,2676,0.858,0.672,0.466,0.612,0.85,0.374,0.87,0.644,0.006
"""speechless-mis…",0.594667,2676,0.83,0.686,0.65,0.578,0.792,0.258,0.81,0.524,0.224
"""neural-chat-7b…",0.592,2664,0.796,0.702,0.57,0.57,0.746,0.276,0.794,0.646,0.228


In [6]:
# Convert to pandas, then list the 90th-percentile value of every float/int
# column as (column name, value) pairs.
df = scores_df.to_pandas()

upper_decile = df.select_dtypes(include=["float", "int"]).quantile(0.9)
list(zip(upper_decile.index.tolist(), upper_decile.to_list()))

[('normalized_score', 0.6069111111111111),
 ('score', 2731.1),
 ('ai2_arc-arc-challenge-test', 0.8552),
 ('alekseykorshuk_hellaswag-arc-easy-validation', 0.6878),
 ('anli--test_r1', 0.6374),
 ('cais_mmlu-all-validation', 0.6112),
 ('cosmos_qa--validation', 0.8475999999999999),
 ('math_qa--validation', 0.3112),
 ('race-high-validation', 0.8572),
 ('truthful_qa-multiple_choice-validation', 0.6458),
 ('winogrande-winogrande_s-validation', 0.4758)]

In [7]:
# Serve the score table as an interactive, sortable Dash DataTable.
# In every numeric column, cells at or above the column's HIGH_QUANTILE are
# shaded green and cells at or below its LOW_QUANTILE are shaded red.
HIGH_QUANTILE = 0.95
LOW_QUANTILE = 0.1

# Select the numeric columns once; both threshold sets are derived from this.
numeric_scores = df.select_dtypes(include=["float", "int"])


def _quantile_highlight_rules(frame, quantile, comparison, background):
    """Build one DataTable conditional-style rule per column of ``frame``.

    Args:
        frame: pandas DataFrame holding only the numeric columns to highlight.
        quantile: Quantile (between 0 and 1) used as each column's threshold.
        comparison: Operator placed in the filter query, e.g. ``">="``.
        background: Background colour for cells matching the query.

    Returns:
        A list of ``style_data_conditional`` entries, one per column, each
        restricted to its own column via ``column_id``.
    """
    thresholds = frame.quantile(quantile)
    return [
        {
            'if': {
                # Renders as "{column name} <comparison> <threshold>".
                'filter_query': '{{{}}} {} {}'.format(col, comparison, value),
                'column_id': col
            },
            'backgroundColor': background,
            'color': 'white'
        }
        for col, value in zip(thresholds.index.tolist(), thresholds.to_list())
    ]


app = dash.Dash(__name__)

app.layout = dash.dash_table.DataTable(
    id="table",
    columns=[{"name": i, "id": i} for i in scores_df.columns],
    data=df.to_dict("records"),
    style_cell={"textAlign": "left"},
    sort_action="native",
    # Green rules come first and red rules second, matching the original order.
    style_data_conditional=(
        _quantile_highlight_rules(numeric_scores, HIGH_QUANTILE, '>=', 'green')
        + _quantile_highlight_rules(numeric_scores, LOW_QUANTILE, '<=', 'red')
    ),
    # dark theme
    style_header={
        'backgroundColor': 'rgb(30, 30, 30)',
        'color': 'white'
    },
    style_data={
        'backgroundColor': 'rgb(50, 50, 50)',
        'color': 'white'
    },
)


app.run_server(debug=True, use_reloader=False)  # Turn off reloader if inside Jupyter

In [8]:
# NOTE: disabled experiment. Everything in this cell is commented out, so
# running it does nothing. It sketches sending one multiple-choice prompt to a
# model through Petals and timing the generation.
# from transformers import AutoTokenizer
# from petals import AutoDistributedModelForCausalLM

# # Choose any model available at https://health.petals.dev
# model_name = "petals-team/StableBeluga2"  # This one is fine-tuned Llama 2 (70B)

# # Connect to a distributed network hosting model layers
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoDistributedModelForCausalLM.from_pretrained(model_name)
# prompt = """
# There are three sisters in a room alone. Anna is reading a book. Alice is playing a game called Zonda. Zonda requires two people to play it.
# What is the third sister, Amanda, most likely doing? Explain why.
# Select from the following options:
# (A) Also reading a book, like Anna
# (B) Having music lessons from Alice
# (C) Playing Zonda with Alice
# (D) Observing the other sisters, while they do their activities only
# (E) Trying to think of something to so
# Answer: (
# """
# from time import time
# # Run the model as if it were on your computer
# inputs = tokenizer(prompt, return_tensors="pt")["input_ids"]
# s = time()
# outputs = model.generate(inputs, max_new_tokens=100)
# print(tokenizer.decode(outputs[0]))
# print(f"Time taken: {time() - s:.2f}s")

In [9]:
from pathlib import Path
import json
from run_bench import get_datasets, calculate_category_scores

# Recompute the leaderboard directly from the per-run folders under
# ``results/`` instead of reading the aggregated results.parquet.
results_folder = Path("results")

# Every non-file entry in the results folder is treated as one benchmark run.
results_folders = [f for f in results_folder.iterdir() if not f.is_file()]

# Collect one record per run and build the DataFrame once at the end, rather
# than rebuilding it from to_dicts() on every iteration.
results = []
for result_folder in results_folders:
    print(result_folder)
    # Context manager so the metadata file handle is closed after each run.
    with open(result_folder / "metadata.json", "r") as metadata_file:
        metadata = json.load(metadata_file)
    datasets_df = pl.read_parquet(
        result_folder / "datasets_results.parquet",
    )
    number_of_samples = metadata["number_of_samples"]

    datasets = get_datasets(number_of_samples, metadata["dataset_seed"])

    # Raw score per dataset: sum of the "score" column within each dataset.
    scores = {
        r["dataset"]: r["score"]
        for r in datasets_df.group_by("dataset")
        .agg(pl.sum("score").alias("score"))
        .to_dicts()
    }
    normalized_scores = {k: v / number_of_samples for k, v in scores.items()}
    category_scores = calculate_category_scores(datasets_df)
    # Each category is normalized by (datasets in that category) * samples.
    normalized_category_scores = {
        k: v / (len([d for d in datasets if d.category == k]) * number_of_samples)
        for k, v in category_scores.items()
    }
    score = sum(scores.values())
    # Overall score is normalized by the total number of samples across the
    # datasets listed in the run's metadata.
    normalized_score = score / (len(metadata["datasets"]) * number_of_samples)
    # normalized_score = sum(normalized_category_scores.values())/len(normalized_category_scores)

    results.append(
        {
            "model": metadata["model"],
            "model_seed": metadata["model_seed"],
            "dataset_seed": metadata["dataset_seed"],
            "score": score,
            "normalized_score": normalized_score,
            "scores": scores,
            "normalized_scores": normalized_scores,
            "category_scores": category_scores,
            "normalized_category_scores": normalized_category_scores,
            "number_of_samples": number_of_samples,
            "run_folder_name": result_folder.name,
        }
    )
results_df = pl.DataFrame(results)
results_df.sort("normalized_score").reverse().select(["model", "normalized_score", "score"])

results/zephyr-7b-beta.Q6_K.gguf-ms-1-ds-1-2023-11-01T03-04-40-459619
results/LLaMA2-13B-Tiefighter.Q6_K.gguf-ms-1-ds-1-2023-10-31T17-42-32-959945
results/tinyllama-1.1b-chat-v0.3.Q6_K.gguf-ms-1-ds-1-2023-11-01T00-46-15-309444
results/yi-6b.Q6_K.gguf-ms-1-ds-1-2023-11-10T20-18-50-468379
results/causallm_14b.Q5_1.gguf-ms-1-ds-1-2023-11-03T10-35-36-677324
results/dolphin-2_2-yi-34b.Q3_K_L.gguf-ms-1-ds-1-2023-11-14T10-12-24-796197
results/xwin-lm-13b-v0.2.Q6_K.gguf-ms-1-ds-1-2023-11-01T00-58-44-761255
results/yi-34b.Q4_K_S.gguf-ms-1-ds-1-2023-11-05T16-42-46-725311
results/calm2-7b-chat.Q6_K.gguf-ms-1-ds-1-2023-11-05T20-31-36-555148
results/deepseek-coder-6.7b-instruct.Q6_K.gguf-ms-1-ds-1-2023-11-05T14-31-32-915729
results/Nous-Capybara-7B-V1.9.f16.gguf-ms-1-ds-1-2023-11-06T16-26-44-714790
results/dolphin-2.2.1-mistral-7b.Q6_K.gguf-ms-1-ds-1-2023-10-31T21-36-34-229872
results/Toppy-M-7B.q6_k.gguf-ms-1-ds-1-2023-11-05T15-49-14-211762
results/speechless-mistral-dolphin-orca-platypus-samantha

model,normalized_score,score
str,f64,i64
"""dolphin-2_2-yi…",0.635556,2860
"""yi-34b-giftedc…",0.618889,2785
"""yi-34b.Q4_K_S.…",0.615778,2771
"""openchat_3.5.Q…",0.615333,2769
"""nous-capybara-…",0.607333,2733
"""openhermes-2.5…",0.603111,2714
"""mistral-7b-ope…",0.602889,2713
"""yi-34b-200k-ll…",0.594667,2676
"""speechless-mis…",0.594667,2676
"""neural-chat-7b…",0.592,2664
