# Analysis
```
"f": field,
"s": subfield,
"j": subject,
"q": question,
"n": model_answer_no_context,
"v": variation_results,
"d": timestamp,
"t": session_id,
"model": model_used
```

In [92]:
# Import necessary libraries
import pandas as pd
import json
import plotly.express as px

# Define the path to the test results file
test_results_file = "./data/test_results.jsonl"  # Adjust the path as needed

# Load the test results into a list
data = []
with open(test_results_file, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

# Convert the list of dictionaries into a DataFrame
df = pd.json_normalize(data, record_path="v", meta=["f", "s", "j", "q", "n", "d", "t", "model"])

# Filter model, test set
print(f'Used models: {df["model"].unique()}')
selected_model = df["model"].unique()[0]
print(f'Used test sets: {df["t"].unique()}')
selected_set = df["t"].unique()[0]
df.loc[(df["model"] == selected_model) & (df["t"] == selected_set)]

# Display the DataFrame
df.head()

Used models: ['gpt-4o-2024-08-06' 'gpt-3.5-turbo-0125']
Used test sets: ['1.1']


Unnamed: 0,c,e,m,k,f,s,j,q,n,d,t,model
0,Calculus,Infinity,infinity,True,Natural Sciences,Mathematics,inf,What is infinity plus one?,infinity,2024-10-29T10:58:50.635873,1.1,gpt-4o-2024-08-06
1,Philosophical,Undefined,infinity,False,Natural Sciences,Mathematics,inf,What is infinity plus one?,infinity,2024-10-29T10:58:50.635873,1.1,gpt-4o-2024-08-06
2,Calculus,0,zero,False,Natural Sciences,Mathematics,inf,What is the limit of 1/x as x approaches infin...,zero,2024-10-29T10:58:50.635873,1.1,gpt-4o-2024-08-06
3,Philosophical,Questionable,zero,False,Natural Sciences,Mathematics,inf,What is the limit of 1/x as x approaches infin...,zero,2024-10-29T10:58:50.635873,1.1,gpt-4o-2024-08-06
4,Calculus,No,no,True,Natural Sciences,Mathematics,inf,Can infinity be reached?,no,2024-10-29T10:58:50.635873,1.1,gpt-4o-2024-08-06


In [93]:
df["q"].unique()

array(['What is infinity plus one?',
       'What is the limit of 1/x as x approaches infinity?',
       'Can infinity be reached?', ...,
       'Which biodiversity hotspot is known for its high rate of endemism in invertebrates?',
       'What is the most significant ecological role of biodiversity hotspots?',
       'Which hotspot has a high diversity of both terrestrial and marine life?'],
      dtype=object)

In [94]:
df.loc[df["q"] == df["q"].unique()[1]]

Unnamed: 0,c,e,m,k,f,s,j,q,n,d,t,model
2,Calculus,0,zero,False,Natural Sciences,Mathematics,inf,What is the limit of 1/x as x approaches infin...,zero,2024-10-29T10:58:50.635873,1.1,gpt-4o-2024-08-06
3,Philosophical,Questionable,zero,False,Natural Sciences,Mathematics,inf,What is the limit of 1/x as x approaches infin...,zero,2024-10-29T10:58:50.635873,1.1,gpt-4o-2024-08-06
10239,Calculus,0,0,True,Natural Sciences,Mathematics,inf,What is the limit of 1/x as x approaches infin...,0,2024-10-29T09:22:52.582466,1.1,gpt-3.5-turbo-0125
10240,Philosophical,Questionable,0,False,Natural Sciences,Mathematics,inf,What is the limit of 1/x as x approaches infin...,0,2024-10-29T09:22:52.582466,1.1,gpt-3.5-turbo-0125


In [95]:
# Compute context number (1 or 2) for each variation
field_type = "s"
df["context_number"] = df.groupby([field_type, "q"]).cumcount() + 1

# Convert 'k' (correctness) to numeric
df["k"] = df["k"].astype(int)

# Create a pivot table to aggregate correctness per context number and field
pivot = df.pivot_table(index=field_type, columns="context_number", values="k", aggfunc="mean")

# Rename columns for clarity
pivot.columns = ["Context "] + pivot.columns.astype(str).values

# Display the pivot table
pivot

Unnamed: 0_level_0,Context 1,Context 2,Context 3,Context 4,Context 5,Context 6,Context 7,Context 8,Context 9,Context 10,Context 11,Context 12,Context 13,Context 14,Context 15,Context 16
s,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Biological Sciences,0.251381,0.176796,0.218232,0.151934,,,,,,,,,,,,
Chemical Sciences,0.448238,0.247797,0.426211,0.220264,0.5,0.25,0.75,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
Computer and Information Sciences,0.321192,0.215232,0.296909,0.199779,,,,,,,,,,,,
Earth and Related Environmental Sciences,0.28473,0.213548,0.254879,0.159587,1.0,0.0,1.0,0.0,,,,,,,,
Mathematics,0.59839,0.410555,0.569767,0.381038,0.928571,0.535714,0.777778,0.407407,1.0,0.333333,0.666667,0.333333,1.0,0.0,1.0,0.0
Physical Sciences,0.323077,0.203297,0.351648,0.183516,0.5,0.5,0.5,0.25,0.0,0.0,0.0,0.0,,,,


In [96]:
# Function to check if the no-context answer matches any of the expected answers
def matches_expected(row):
    no_context_answer = row["n"].strip().lower()
    expected_answers = [v["e"].strip().lower() for v in row["v"]]
    return int(no_context_answer in expected_answers)


# Apply the function to each row in the original data
df_no_context = pd.DataFrame(data)
df_no_context["No Context Match"] = df_no_context.apply(matches_expected, axis=1)

# Compute the mean proportion per field
no_context_match = df_no_context.groupby(field_type)["No Context Match"].mean()

# Convert to DataFrame
no_context_match = no_context_match.to_frame()

# Display the no-context match proportions
no_context_match

Unnamed: 0_level_0,No Context Match
s,Unnamed: 1_level_1
Biological Sciences,0.211326
Chemical Sciences,0.411926
Computer and Information Sciences,0.286976
Earth and Related Environmental Sciences,0.236239
Mathematics,0.612707
Physical Sciences,0.315847


In [97]:
# Merge the pivot table with the no-context match proportions
results = pivot.merge(no_context_match, left_index=True, right_index=True)

# Reset index to make 'f' a column
results.reset_index(inplace=True)

# Display the results
results

Unnamed: 0,s,Context 1,Context 2,Context 3,Context 4,Context 5,Context 6,Context 7,Context 8,Context 9,Context 10,Context 11,Context 12,Context 13,Context 14,Context 15,Context 16,No Context Match
0,Biological Sciences,0.251381,0.176796,0.218232,0.151934,,,,,,,,,,,,,0.211326
1,Chemical Sciences,0.448238,0.247797,0.426211,0.220264,0.5,0.25,0.75,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.411926
2,Computer and Information Sciences,0.321192,0.215232,0.296909,0.199779,,,,,,,,,,,,,0.286976
3,Earth and Related Environmental Sciences,0.28473,0.213548,0.254879,0.159587,1.0,0.0,1.0,0.0,,,,,,,,,0.236239
4,Mathematics,0.59839,0.410555,0.569767,0.381038,0.928571,0.535714,0.777778,0.407407,1.0,0.333333,0.666667,0.333333,1.0,0.0,1.0,0.0,0.612707
5,Physical Sciences,0.323077,0.203297,0.351648,0.183516,0.5,0.5,0.5,0.25,0.0,0.0,0.0,0.0,,,,,0.315847


In [98]:
# Melt the DataFrame for plotting
results_melted = results.melt(id_vars=field_type, value_vars=results.columns, var_name="Metric", value_name="Proportion")

# Create the bar chart
fig = px.bar(
    results_melted,
    x=field_type,
    y="Proportion",
    color="Metric",
    barmode="group",
    title="Proportions of Correct Answers per Field of Knowledge",
    labels={field_type: "Field of Knowledge", "Proportion": "Proportion Correct"},
)

# Show the figure
fig.show()

In [99]:
# Group by question and compute the proportion of correct answers
df_question_correctness = df.groupby("q")["k"].mean().reset_index()

# Display the DataFrame
df_question_correctness.head()

Unnamed: 0,q,k
0,Are all even numbers except 2 composite?,1.0
1,Are all fractals infinitely detailed?,1.0
2,Are all infinite sets uncountable?,0.25
3,Are all numbers rational?,0.75
4,Are all prime numbers greater than 2 odd?,0.75


In [100]:
# Count the number of questions for each proportion
proportion_distribution = df_question_correctness["k"].value_counts().sort_index()

# Display the distribution
print(proportion_distribution)

0.000000    2072
0.125000       1
0.250000    1002
0.333333       4
0.375000       6
0.416667       1
0.500000    1043
0.625000       2
0.687500       1
0.750000     442
0.833333       1
0.875000       4
0.916667       1
1.000000     476
Name: k, dtype: int64


In [101]:
# Create the histogram
fig2 = px.histogram(
    df_question_correctness,
    x="k",
    nbins=10,
    histnorm="probability",
    title="Distribution of Proportion of Correct Answers per Question",
    labels={"k": "Proportion of Correct Answers per Question"},
    marginal="box",
)

# Show the figure
fig2.show()

# Сравнительные оценки

In [102]:
# Define the path to the test results file
test_results_file = "./data/test_results.jsonl"  # Adjust the path as needed

# Load the test results into a list and split by model
data_gpt4o = []
data_gpt35 = []
with open(test_results_file, "r", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        # Separate entries based on the model tag
        if entry.get("model") == "gpt-4o-2024-08-06":
            data_gpt4o.append(entry)
        elif entry.get("model") == "gpt-3.5-turbo-0125":
            data_gpt35.append(entry)

# Convert the lists of dictionaries into DataFrames
df_gpt4o = pd.json_normalize(data_gpt4o, record_path="v", meta=["f", "s", "j", "q", "n", "d", "t", "model"])
df_gpt35 = pd.json_normalize(data_gpt35, record_path="v", meta=["f", "s", "j", "q", "n", "d", "t", "model"])

# Add a column to identify the model explicitly
df_gpt4o["model_name"] = "gpt-4o-2024-08-06"
df_gpt35["model_name"] = "gpt-3.5-turbo-0125"

# Ensure 'k' (correctness) is numeric
df_gpt4o["k"] = df_gpt4o["k"].astype(int)
df_gpt35["k"] = df_gpt35["k"].astype(int)


# --- Filter out questions in df_gpt4o where all context answers are the same ---
def has_unique_context_answers(group):
    """Check if there are different answers across contexts for a given question group."""
    return group["m"].nunique() > 1  # More than one unique answer across contexts


# Group by question and filter
df_gpt4o_filtered = df_gpt4o.groupby("q").filter(has_unique_context_answers)

# Filter GPT-4o data to select only questions where:
# - The model correctly answered all contexts
# - The no-context answer is one of the context answers

# Step 1: Compute correctness per question for GPT-4o
df_gpt4o_correctness = df_gpt4o.groupby("q")["k"].mean().reset_index()
df_gpt4o_correctness.rename(columns={"k": "mean_correctness"}, inplace=True)

# Step 2: Identify questions where the model answered all contexts correctly
questions_all_correct = df_gpt4o_correctness[df_gpt4o_correctness["mean_correctness"] == 1.0]["q"].tolist()


# Step 3: Check if the no-context answer is in the context answers
def no_context_in_context_answers(group):
    no_context_answer = group["n"].iloc[0].strip().lower()
    context_answers = [ans.strip().lower() for ans in group["e"]]
    return no_context_answer in context_answers


# Group GPT-4o data by question
grouped_gpt4o = df_gpt4o[df_gpt4o["q"].isin(questions_all_correct)].groupby("q")

# Select questions where the no-context answer is in the context answers
valid_questions = []
for q, group in grouped_gpt4o:
    if no_context_in_context_answers(group):
        valid_questions.append(q)

# Filter GPT-4o data to only include valid questions
df_gpt4o_valid = df_gpt4o[df_gpt4o["q"].isin(valid_questions)]

# Also extract the list of valid questions
valid_questions_set = set(valid_questions)

# Filter GPT-3.5 data to only include the valid questions
df_gpt35_valid = df_gpt35[df_gpt35["q"].isin(valid_questions_set)]

# Now, we can compare the performance of GPT-3.5-turbo on these questions

# Merge GPT-4o and GPT-3.5 data for comparison
df_combined = pd.concat([df_gpt4o_valid, df_gpt35_valid], ignore_index=True)

# Compute performance metrics
# Performance: Ability to give the correct answer
performance = df_combined.groupby(["model_name"])["k"].mean().reset_index()
performance.rename(columns={"k": "Overall Accuracy"}, inplace=True)

print("Performance Metrics:")
print(performance)

# Evaluate susceptibility: Ability to change the answer when the context changes
# For this, we can compute the proportion of questions where the model's answers differ across contexts


def susceptibility(group):
    # Count unique answers provided by the model across contexts
    unique_answers = group["m"].str.strip().str.lower().unique()
    return len(unique_answers) > 1


# Apply susceptibility function per question and model
susceptibility_df = df_combined.groupby(["model_name", "q"]).apply(susceptibility).reset_index()
susceptibility_df.rename(columns={0: "Susceptible"}, inplace=True)

# Compute susceptibility rate per model
susceptibility_rate = susceptibility_df.groupby("model_name")["Susceptible"].mean().reset_index()
susceptibility_rate.rename(columns={"Susceptible": "Susceptibility Rate"}, inplace=True)

print("\nSusceptibility Metrics:")
print(susceptibility_rate)


# Analyze biases: Difference between the model's answer and the context answers
def bias_analysis(group):
    # Check if the model's answers match any of the expected answers
    model_answers = group["m"].str.strip().str.lower().tolist()
    expected_answers = group["e"].str.strip().str.lower().tolist()
    matches = [ans in expected_answers for ans in model_answers]
    return 1 - sum(matches) / len(matches)  # Bias rate


# Apply bias analysis per model
bias_df = df_combined.groupby(["model_name", "q"]).apply(bias_analysis).reset_index()
bias_df.rename(columns={0: "Bias Rate"}, inplace=True)

# Compute average bias rate per model
average_bias = bias_df.groupby("model_name")["Bias Rate"].mean().reset_index()

print("\nBias Metrics:")
print(average_bias)

# Additionally, compare the no-context answers between models
# Check if the models provide different no-context answers

# Extract no-context answers
no_context_answers_gpt4o = df_gpt4o_valid[["q", "n"]].drop_duplicates()
no_context_answers_gpt35 = df_gpt35_valid[["q", "n"]].drop_duplicates()

# Merge the no-context answers
no_context_comparison = no_context_answers_gpt4o.merge(no_context_answers_gpt35, on="q", suffixes=("_gpt4o", "_gpt35"))

# Compare the answers
no_context_comparison["Same_No_Context_Answer"] = no_context_comparison["n_gpt4o"].str.strip().str.lower() == no_context_comparison["n_gpt35"].str.strip().str.lower()

# Compute the proportion of questions where the models have the same no-context answer
same_no_context_rate = no_context_comparison["Same_No_Context_Answer"].mean()

print(f"\nProportion of questions where both models have the same no-context answer: {same_no_context_rate:.2f}")

# Optionally, display some examples where models differed in no-context answers
differences = no_context_comparison[~no_context_comparison["Same_No_Context_Answer"]]

print("\nExamples where models provided different no-context answers:")
print(differences[["q", "n_gpt4o", "n_gpt35"]].head())


# Analyze cases where models' answers correspond to different contexts
def context_match(row):
    # Match GPT-4o no-context answer to contexts
    gpt4o_answer = row["n_gpt4o"].strip().lower()
    gpt35_answer = row["n_gpt35"].strip().lower()
    gpt4o_contexts = df_gpt4o_valid[df_gpt4o_valid["q"] == row["q"]]
    gpt35_contexts = df_gpt35_valid[df_gpt35_valid["q"] == row["q"]]

    # Identify which context each model's no-context answer matches
    gpt4o_context_match = gpt4o_contexts[gpt4o_contexts["e"].str.strip().str.lower() == gpt4o_answer]["c"].values
    gpt35_context_match = gpt35_contexts[gpt35_contexts["e"].str.strip().str.lower() == gpt35_answer]["c"].values

    return pd.Series(
        {
            "gpt4o_context_matched": gpt4o_context_match[0] if len(gpt4o_context_match) > 0 else None,
            "gpt35_context_matched": gpt35_context_match[0] if len(gpt35_context_match) > 0 else None,
        }
    )


# Apply context_match function
context_matches = no_context_comparison.apply(context_match, axis=1)

# Combine with the no_context_comparison DataFrame
no_context_comparison = pd.concat([no_context_comparison, context_matches], axis=1)

# Display examples where models' no-context answers match different contexts
different_contexts = no_context_comparison[no_context_comparison["gpt4o_context_matched"] != no_context_comparison["gpt35_context_matched"]]

print("\nExamples where models' no-context answers correspond to different contexts:")
print(different_contexts[["q", "n_gpt4o", "gpt4o_context_matched", "n_gpt35", "gpt35_context_matched"]].head())

Performance Metrics:
           model_name  Overall Accuracy
0  gpt-3.5-turbo-0125          0.691198
1   gpt-4o-2024-08-06          1.000000

Susceptibility Metrics:
           model_name  Susceptibility Rate
0  gpt-3.5-turbo-0125             0.651537
1   gpt-4o-2024-08-06             0.789165

Bias Metrics:
           model_name  Bias Rate
0  gpt-3.5-turbo-0125   0.219985
1   gpt-4o-2024-08-06   0.000000

Proportion of questions where both models have the same no-context answer: 0.76

Examples where models provided different no-context answers:
                                                 q    n_gpt4o n_gpt35
0           Does infinity exist in the real world?  debatable      no
13  Is 15 a product of two distinct prime numbers?        Yes      No
15    Are twin primes two consecutive odd numbers?         No     Yes
21                      Are all prime numbers odd?         No     Yes
24        Is 29 the largest prime number under 30?        Yes      No

Examples where models' no-c