In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pprint import pprint

In [2]:
def text_similarity(ref,message):
    model = SentenceTransformer('all-MiniLM-L6-v2')  
    embeddings = model.encode([ref, message])
    similarity_score = cosine_similarity(
        [embeddings[0]], 
        [embeddings[1]] 
    )[0][0]

    return similarity_score

In [3]:
def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
    return json_data

In [4]:
bench_file = "../docs/qa_benchmark.json"
benchmark = read_json(bench_file)

In [8]:
qa_file = "../docs/qa_records_gemma-1.1-2b.json"
qa = read_json(qa_file)

In [5]:
def compare_qa(qa1,qa2):
    q_score = text_similarity(qa1["Query"],qa2["Query"])
    r_score = text_similarity(qa1["Response"],qa2["Response"])
    result = {"Query":qa1["Query"],
              "q_score":q_score,
              "r_score":r_score}
    return result


In [6]:
def evaluate_qa(qa,benchmark):
    #q_score_total = 0
    r_score_total = 0

    for i in range(len(qa)-1):
        if qa[i+1]["Query"]==benchmark[i+1]["Query"]:
            qa[i+1]["r_score"] = text_similarity(qa[i+1]["Response"],benchmark[i+1]["Response"])
            r_score_total += qa[i+1]["r_score"]   
    r_score_average = r_score_total/(len(qa)-1)
    qa[0]["r_score_average"] = r_score_average

    return qa


In [10]:
qa_files = [
    "qa_records_Llama-3-8B.json",
    "qa_records_Llama-3.2-3B.json",
    "qa_records_Llama-3.2-3B_p.json",
    "qa_records_Llama-3.2-1B.json",
    "qa_records_Llama-3.2-1B_p.json",
    "qa_records_Llama-3.1-8B.json",
    "qa_records_gemma-1.1-2b.json",
    "qa_records_Qwen2.5-1.5B.json",
    "qa_records_Qwen2.5-0.5B.json",
    "qa_records_Phi-4-mini.json"
]

In [7]:
qa_files_llama_3_2 = [    
    "qa_records_Llama-3.2-3B_p.json",
    "qa_records_Llama-3.2-3B_1.json",
    "qa_records_Llama-3.2-3B_2.json",
    "qa_records_Llama-3.2-3B_3.json",
    "qa_records_Llama-3.2-3B_4.json",
]

In [None]:
qa_records = []
for f in qa_files:
    qf = read_json("../docs/"+f)
    #print(f"{f} has {len(qf)} items.")
    qf_ev = evaluate_qa(qf,benchmark)
    qa_records.append(qf_ev[0])

pprint(qa_records)

[{'Context Length(k)': '128',
  'Model': 'Llama-3.2-3B-Instruct(control tokens)',
  'Response Time(s)': '43.70',
  'Size(GB)': '6.43',
  'r_score_average': 0.6064372857411703},
 {'Context Length(k)': '128',
  'Model': 'Llama-3.2-3B-Instruct',
  'Response Time(s)': '40.31',
  'Size(GB)': '6.43',
  'r_score_average': 0.4558170661330223},
 {'Context Length(k)': '128',
  'Model': 'Llama-3.2-3B-Instruct',
  'Response Time(s)': '45.36',
  'Size(GB)': '6.43',
  'r_score_average': 0.41377802938222885},
 {'Context Length(k)': '128',
  'Model': 'Llama-3.2-3B-Instruct',
  'Response Time(s)': '44.38',
  'Size(GB)': '6.43',
  'r_score_average': 0.4472521295150121},
 {'Context Length(k)': '128',
  'Model': 'Llama-3.2-3B-Instruct',
  'Response Time(s)': '51.09',
  'Size(GB)': '6.43',
  'r_score_average': 0.42030222713947296}]


In [None]:
qa_records_json = "../docs/qa_records_comparison.json"

In [10]:
with open(qa_records_json, "w", encoding="utf-8") as f:
        json.dump(qa_records, f, indent=4, ensure_ascii=False)

In [11]:
data = read_json(qa_records_json)
pprint(data)

[{'Context Length(k)': '128',
  'Model': 'Llama-3.2-3B-Instruct(control tokens)',
  'Response Time(s)': '43.70',
  'Size(GB)': '6.43',
  'r_score_average': 0.6064372857411703},
 {'Context Length(k)': '128',
  'Model': 'Llama-3.2-3B-Instruct',
  'Response Time(s)': '40.31',
  'Size(GB)': '6.43',
  'r_score_average': 0.4558170661330223},
 {'Context Length(k)': '128',
  'Model': 'Llama-3.2-3B-Instruct',
  'Response Time(s)': '45.36',
  'Size(GB)': '6.43',
  'r_score_average': 0.41377802938222885},
 {'Context Length(k)': '128',
  'Model': 'Llama-3.2-3B-Instruct',
  'Response Time(s)': '44.38',
  'Size(GB)': '6.43',
  'r_score_average': 0.4472521295150121},
 {'Context Length(k)': '128',
  'Model': 'Llama-3.2-3B-Instruct',
  'Response Time(s)': '51.09',
  'Size(GB)': '6.43',
  'r_score_average': 0.42030222713947296}]


In [12]:
# Filter the data to exclude models with '8B' in their names
filtered_data = [d for d in data if '8B' not in d['Model'] and 'gemma' not in d['Model']]

# Convert the filtered list of dictionaries into a pandas DataFrame
df = pd.DataFrame(filtered_data)

# Rename the columns for the metrics that will be plotted
df.rename(columns={'Response Time(s)': 'Inference Speed', 
                   'Size(GB)': 'Size Index',
                   'r_score_average': 'Average Accuracy'}, inplace=True)

# 'Response Time(s)' and 'Size(GB)' have values where smaller is better.
# To make the comparison consistent with 'Context Length(k)' and 'r_score_average' (where bigger is better),
# we normalize the data. A simple way is to take the inverse,
# but to keep it on a similar scale, we'll invert it after scaling.
# A more robust solution is to use a Min-Max scaling, which we will do here.

# Define the metrics to be scaled and those to keep as-is
metrics_to_normalize = ['Inference Speed', 'Size Index']
metrics_to_keep = ['Average Accuracy']
all_metrics = metrics_to_normalize + metrics_to_keep

# Create a copy of the dataframe to store the scaled values
df_scaled = df.copy()

for metric in metrics_to_normalize:
    values = df_scaled[metric].values
    min_val = values.min()
    max_val = values.max()

    if max_val - min_val == 0:
        # Avoid division by zero if all values are the same
        df_scaled[metric] = 0.5
    else:
        df_scaled[metric] = (values - min_val) / (max_val - min_val)
        # Invert the scale for metrics where lower is better
        if metric in ['Inference Speed', 'Size Index']:
            df_scaled[metric] = 1 - df_scaled[metric]

# Melt the DataFrame to a "long" format, which is required for seaborn lineplot
df_melted = df_scaled.melt(id_vars='Model', value_vars=all_metrics, var_name='Metric', value_name='Value')


# Create the line chart
plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")
sns.lineplot(data=df_melted, x='Metric', y='Value', hue='Model', marker='o', style='Model')

# Add plot title and labels
plt.title('Normalized Performance Comparison of Language Models', fontsize=16)
plt.xlabel('Metrics', fontsize=12)
plt.ylabel('Normalized Value', fontsize=12)
plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


TypeError: unsupported operand type(s) for -: 'str' and 'str'