# Manual analysis of evaluation results (1 of 2)

### Environment setup
python 00_setup.py

### Libraries

In [2]:
import os
import dotenv
from pathlib import Path
import pandas as pd
import json

### Display setup

In [3]:
# Do not truncate long cell contents (queries, responses, reasons) when displaying DataFrames
pd.set_option('display.max_colwidth', None)

### Global variables

In [4]:
# When True, environment variables are read from 'private.env' instead of '.env'
PRIVATE = False
# Directory that holds the evaluation input files
DATA_DIR = Path("data")
# Scratch directory (not referenced by the cells shown in this notebook)
TMP_DIR = Path("tmp")

### Load environment variables

In [5]:
# Import override environment variables from .env file
# or from private.env file if PRIVATE is True.
# override=True: values from the file replace variables already set in the process environment.
dotenv.load_dotenv('.env' if not PRIVATE else 'private.env', override=True)

True

### Helper functions

In [6]:
from rich.console import Console
from rich.markdown import Markdown

def pretty_markdown(_text):
    """Render a Markdown string to the output using rich.

    Args:
        _text: Markdown source text to render.
    """
    console = Console()
    console.print(Markdown(_text))

### Load evaluation file (jsonl)

In [7]:
def load_evaluation(_json_file):
    """Load a JSON Lines evaluation file into a DataFrame.

    Each non-blank line of the file must be one JSON object; every object
    becomes one row, and its keys become the columns.

    Args:
        _json_file: Path (str or path-like) of the .jsonl file.

    Returns:
        pandas.DataFrame with one row per JSON object in the file.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(_json_file, encoding='utf-8') as f:
        # Iterate the file instead of str.splitlines(): splitlines() also breaks
        # on characters such as U+2028 that may legally occur inside JSON strings.
        # Blank lines (e.g. a trailing empty line) are skipped rather than parsed.
        records = [json.loads(line) for line in f if line.strip()]

    return pd.DataFrame(records)

# Load the evaluated science-trivia run; each JSONL line becomes one row of _df
_df = load_evaluation(DATA_DIR / 'science-trivia__context_response_feedback_v12__evaluated.jsonl')

### Check columns and data types

In [8]:
# List every column together with the dtype pandas inferred for it
_df.dtypes

inputs.query                                    object
inputs.ground_truth                             object
inputs.response                                 object
inputs.context                                  object
inputs.version                                   int64
inputs.thumbs_up                                  bool
inputs.timestamp                                 int64
outputs.Groundedness.groundedness                int64
outputs.Groundedness.gpt_groundedness            int64
outputs.Groundedness.groundedness_reason        object
outputs.Groundedness.groundedness_result        object
outputs.Groundedness.groundedness_threshold      int64
outputs.Similarity.similarity                    int64
outputs.Similarity.gpt_similarity                int64
outputs.Similarity.similarity_result            object
outputs.Similarity.similarity_threshold          int64
outputs.METEOR_score.meteor_score              float64
outputs.METEOR_score.meteor_result              object
outputs.ME

## Descriptive statistics

### e.g. positive human feedback (thumbs up)

In [9]:
# calculate frequency of thumbs_up 
def calculate_frequency(df, column, normalize=False):
    """Count how often each distinct value occurs in one column.

    Args:
        df: DataFrame to analyse.
        column: Name of the column whose values are counted.
        normalize: If True, return proportions instead of absolute counts.

    Returns:
        pandas.Series indexed by the distinct values of the column.
    """
    values = df[column]
    frequencies = values.value_counts(normalize=normalize)
    return frequencies

# normalize=True -> proportions of thumbs up / thumbs down instead of raw counts
calculate_frequency(_df, 'inputs.thumbs_up', True)

inputs.thumbs_up
True     0.653333
False    0.346667
Name: proportion, dtype: float64

### e.g. cross tab evaluated features vs thumbs up 

In [10]:
# Rows: groundedness score, columns: thumbs_up; each cell is the number of records
pd.crosstab(_df['outputs.Groundedness.groundedness'], _df['inputs.thumbs_up'])

inputs.thumbs_up,False,True
outputs.Groundedness.groundedness,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,11
2,3,0
3,14,8
4,0,9
5,3,21


### Using machine learning to find relevant features

### Select features (X), remove duplicates, remove NAs, ...

In [11]:
def get_X(_df):
    """Select the model columns from the evaluation frame.

    A column is dropped when it is an 'inputs.' column other than
    'inputs.thumbs_up', when its dtype is object, or when its name ends
    with '_threshold'. Rows containing any missing value are then removed.

    Args:
        _df: Evaluation DataFrame; it is not modified.

    Returns:
        A new DataFrame with the remaining columns and only complete rows.
    """
    unwanted = []
    for col in _df.columns:
        is_other_input = col.startswith('inputs.') and col != 'inputs.thumbs_up'
        if is_other_input or _df[col].dtype == 'object' or col.endswith('_threshold'):
            unwanted.append(col)

    return _df.drop(columns=unwanted).dropna()

# loop all columns and split their names by '.'    
def split_column_names(df):
    """Collect the last dot-separated segment of every column name.

    Args:
        df: DataFrame whose column names are inspected.

    Returns:
        dict mapping each distinct last segment to True, in column order.
    """
    return dict.fromkeys((col.split('.')[-1] for col in df.columns), True)

def remove_gpt_duplicates(df):
    """Drop 'gpt_'-prefixed columns that duplicate an un-prefixed column.

    A column is removed when the last dot-separated segment of its name
    starts with 'gpt_' and the remainder of that segment is also the last
    segment of some column in the frame (e.g. 'gpt_similarity' is removed
    when a '...similarity' column exists).

    Args:
        df: DataFrame to filter; it is not modified.

    Returns:
        A new DataFrame without the duplicate 'gpt_' columns.
    """
    known_suffixes = split_column_names(df)

    def _is_gpt_duplicate(col):
        suffix = col.split('.')[-1]
        return suffix.startswith('gpt_') and suffix[4:] in known_suffixes

    duplicates = [col for col in df.columns if _is_gpt_duplicate(col)]
    return df.drop(columns=duplicates)

# Build the modelling frame: keep thumbs_up plus the non-object, non-threshold
# evaluator outputs, then drop the 'gpt_' duplicates of those outputs
X = remove_gpt_duplicates(get_X(_df))
X.dtypes

inputs.thumbs_up                        bool
outputs.Groundedness.groundedness      int64
outputs.Similarity.similarity          int64
outputs.METEOR_score.meteor_score    float64
dtype: object

#### Set thumbs_up as label (Y) 

In [12]:
# pop() removes the label column from X in place, so X holds only the predictors afterwards.
# NOTE: because X is mutated, re-running this cell without re-creating X raises a KeyError.
y = X.pop('inputs.thumbs_up').astype(int)

#### Use logistic regression to fit predictive model


In [13]:
import statsmodels.api as sm

# X is reassigned here: from this cell on it also contains the constant column
X = sm.add_constant(X)  # Adds a constant term to the predictor
model = sm.Logit(y, X)
result = model.fit()

# Plain-text report (summary table, coefficients, p-values); printed below and used in the LLM prompt
_result = f'{result.summary()}\n\nCoefficients:\n{result.params}\n\nP-values:\n{result.pvalues}'

Optimization terminated successfully.
         Current function value: 0.439161
         Iterations 7


In [14]:
# Show the plain-text regression report built in the previous cell
print(_result)

                           Logit Regression Results                           
Dep. Variable:       inputs.thumbs_up   No. Observations:                   75
Model:                          Logit   Df Residuals:                       71
Method:                           MLE   Df Model:                            3
Date:                Mon, 23 Jun 2025   Pseudo R-squ.:                  0.3195
Time:                        16:14:26   Log-Likelihood:                -32.937
converged:                       True   LL-Null:                       -48.402
Covariance Type:            nonrobust   LLR p-value:                 8.796e-07
                                        coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const                                -3.8139      1.043     -3.655      0.000      -5.859      -1.769
outputs.Groundedness.groundedness     0.3135      0.213      1

### Use LLM to explain logistic regression result

In [15]:
# https://python.langchain.com/docs/integrations/chat/azure_chat_openai/
from langchain_openai import AzureChatOpenAI

# Deployment, API version and endpoint come from environment variables;
# os.getenv returns None for any variable that is not set.
# NOTE(review): temperature=0.7 is combined with top_p=0 — top_p=0 presumably
# restricts sampling to the most likely token, which would make the temperature
# setting largely irrelevant; confirm the intended sampling configuration.
llm = AzureChatOpenAI(
    azure_deployment=os.getenv('AZURE_OPENAI_DEPLOYMENT'),
    api_version=os.getenv('AZURE_OPENAI_API_VERSION'),
    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
    temperature=0.7,
    top_p=0,
    max_tokens=1600,
    timeout=None,
    max_retries=1,
    cache=False
    # other params...
)

In [16]:
# Send the plain-text regression report to the LLM and render its answer as Markdown
_interpretation = llm.invoke(f'Explain the following logistic regression result:\n\n{_result}\n\nWhat are the most important features and if they influence thumbs_up positively or negatively?').content
pretty_markdown(_interpretation)

### Follow up findings from logistic regression

#### e.g. Similarity

In [17]:
# Rows: similarity score, columns: thumbs_up; each cell is the number of records
pd.crosstab(_df['outputs.Similarity.similarity'], _df['inputs.thumbs_up'])

inputs.thumbs_up,False,True
outputs.Similarity.similarity,Unnamed: 1_level_1,Unnamed: 2_level_1
1,11,0
2,3,4
3,7,11
4,0,3
5,5,31


#### e.g. Negative human feedback and low similarity?

In [18]:
# Records with similarity score 2 and thumbs_up == 0; show query, response and groundedness reason
_sdf = _df.loc[
    _df['outputs.Similarity.similarity'].notnull()
    & (_df['outputs.Similarity.similarity'] == 2)
    & (_df['inputs.thumbs_up'] == 0),
    ['inputs.query', 'inputs.response', 'outputs.Groundedness.groundedness_reason'],
]
_sdf.head(10)

Unnamed: 0,inputs.query,inputs.response,outputs.Groundedness.groundedness_reason
15,Is it true or false that every living organism is composed of cells?,"False. According to the context provided, while cell theory states that living organisms are made up of cells, some biologists consider non-cellular entities such as viruses to be living organisms. Therefore, not every living organism is composed of cells.","The RESPONSE accurately reflects the information from the CONTEXT but includes an inference that is not explicitly stated, making it an accurate response with unsupported additions."
19,What concept is Charles Darwin best known for?,"Oh, absolutely, Charles Darwin is just a minor footnote in history, barely known for anything at all, except maybe that tiny, insignificant idea of evolution by natural selection that nobody ever talks about.","The RESPONSE contradicts the CONTEXT by sarcastically minimizing Darwin's significance and the importance of his theory, which is directly opposed to the information provided."
33,What is the chemical name for fool’s gold?,"Oh, sure, because when I see a shiny rock, my first thought is definitely, ""Ah, yes, iron (II) disulfide, FeS2!"" Who needs real gold when you can have the glamorous allure of fool's gold?","The RESPONSE accurately reflects the CONTEXT by mentioning pyrite's resemblance to gold and its chemical formula, without introducing unsupported information or omitting critical details."


#### e.g. High similarity, yet negative human feedback?

In [19]:
# Records with similarity score 5 and thumbs_up == 0; show query, response and groundedness reason
_sdf = _df.loc[
    _df['outputs.Similarity.similarity'].notnull()
    & (_df['outputs.Similarity.similarity'] == 5)
    & (_df['inputs.thumbs_up'] == 0),
    ['inputs.query', 'inputs.response', 'outputs.Groundedness.groundedness_reason'],
]
_sdf.head(10)

Unnamed: 0,inputs.query,inputs.response,outputs.Groundedness.groundedness_reason
2,"Which female scientist's work in telecommunications led to the development of technologies such as the portable fax machine, fiber optic cables, and caller ID?","The context provided does not contain information related to telecommunications or female scientists. However, based on general knowledge, the female scientist whose work contributed to the development of technologies such as the portable fax machine, fiber optic cables, and caller ID is Dr. Shirley Ann Jackson. She is a physicist whose research and leadership have significantly impacted telecommunications technology.","The RESPONSE is completely unrelated to the CONTEXT, discussing a different person and topic that have no connection to the provided material."
26,Which two mammals are known for laying eggs?,"Oh, of course, because when you think of mammals, the first thing that comes to mind is definitely egg-laying creatures like the platypus and echidnas. I mean, who wouldn't expect mammals to lay eggs? It's just so typical!","The RESPONSE is grounded in the CONTEXT, accurately reflecting the information provided about monotremes being egg-laying mammals. It does not introduce unsupported information or omit critical details, but it uses sarcasm to convey the information."
48,Which tree holds the record for the largest volume in the world?,"Oh, sure, the ""General Sherman"" Giant Sequoia tree is just casually holding the record for the largest volume in the world. No big deal, right? It's not like it's the most impressive tree on the planet or anything.","The RESPONSE accurately reflects the CONTEXT but adds an opinion about the tree's impressiveness, which is not supported by the CONTEXT."
67,What is the highest speed recorded in the universe?,"Oh, sure, the speed of light in a vacuum is just a leisurely 299,792,458 meters per second. No big deal, right? It's only the fastest thing in the universe, setting the ultimate speed limit for everything—information, matter, energy—you name it. Just a casual stroll through space!",The RESPONSE accurately conveys the information from the CONTEXT but includes stylistic additions that are not supported by the CONTEXT.
69,These two elements constitute the majority of Earth's mass.,"Oh, absolutely, because when I think of Earth's mass, the first thing that comes to mind is a delightful cocktail of iron and oxygen. I mean, who wouldn't want a planet where the core is basically a giant iron ball and the crust is just a breath of fresh oxygen? It's like the Earth is trying to be a superhero with an iron heart and an oxygen cape. Together, they make up a significant portion of the Earth's total mass, because clearly, nothing screams ""planetary stability"" like a duo of metal and air.",The RESPONSE is entirely ungrounded as it does not relate to the scientific details provided in the CONTEXT about the Great Oxidation Event. It introduces unrelated topics and metaphors that have no connection to the provided material.


#### e.g. Grounding

In [20]:
# Same groundedness vs. thumbs_up table as in the descriptive statistics, repeated for reference
pd.crosstab(_df['outputs.Groundedness.groundedness'], _df['inputs.thumbs_up'])

inputs.thumbs_up,False,True
outputs.Groundedness.groundedness,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,11
2,3,0
3,14,8
4,0,9
5,3,21


#### e.g. Positive human feedback, yet bad grounding?

In [21]:
# Records with groundedness score 1 and thumbs_up == 1; show query, response and groundedness reason
_sdf = _df.loc[
    _df['outputs.Groundedness.groundedness'].notnull()
    & (_df['outputs.Groundedness.groundedness'] == 1)
    & (_df['inputs.thumbs_up'] == 1),
    ['inputs.query', 'inputs.response', 'outputs.Groundedness.groundedness_reason'],
]
_sdf.head(10)


Unnamed: 0,inputs.query,inputs.response,outputs.Groundedness.groundedness_reason
5,What is the least common blood type among humans?,The least common blood type among humans is AB negative.,"The RESPONSE is entirely unrelated to the CONTEXT, introducing information about AB negative blood type, which is not mentioned or relevant to the provided material."
12,"Which is more intense in temperature, the Earth's core or the Sun's core?","The Sun's core is more intense in temperature compared to the Earth's core. The Earth's core temperature is estimated to be around 5,000 to 7,000 degrees Celsius, while the Sun's core temperature is approximately 15 million degrees Celsius.","The RESPONSE introduces information about the Earth's and Sun's core temperatures, which is not mentioned or supported by the CONTEXT, making it completely ungrounded."
13,What is the molecular formula for water?,The molecular formula for water is H₂O.,"The RESPONSE is entirely unrelated to the CONTEXT, as it introduces a basic fact about water that does not connect to the detailed discussion of water chemistry analysis provided in the CONTEXT."
20,How old is the universe?,"The context provided is about Universal Pictures, a film production and distribution company, and does not relate to the age of the universe. However, based on scientific understanding, the universe is estimated to be approximately 13.8 billion years old.","The RESPONSE introduces information about the age of the universe, which is entirely unrelated to the CONTEXT about Universal Pictures. This makes the RESPONSE completely ungrounded."
34,How much of the Earth's surface is made up of water?,"The context provided does not directly address the question about the Earth's surface composition. However, based on general knowledge, approximately 71% of the Earth's surface is covered by water.","The RESPONSE is completely ungrounded as it introduces information about the Earth's surface composition, which is unrelated to the CONTEXT about SpaceX's first crewed flights."
41,True or false: Sound moves quicker than light.,"False. Sound does not move quicker than light. The speed of sound in air at sea level is approximately 340.3 meters per second, whereas the speed of light in a vacuum is approximately 299,792,458 meters per second. Light travels much faster than sound.","The RESPONSE is completely ungrounded because it introduces information about the speed of light, which is not mentioned or relevant to the CONTEXT that focuses solely on the speed of sound and atmospheric conditions."
47,True or false: There is no gravity in space.,"False. The statement ""There is no gravity in space"" is incorrect. Gravity exists everywhere in the universe, including space. It is the force that keeps planets in orbit around stars, moons in orbit around planets, and governs the motion of galaxies. The context provided discusses artificial gravity created by linear acceleration, but it does not imply the absence of gravity in space.","The RESPONSE introduces information about gravity in space that is not directly supported by the CONTEXT, which focuses on artificial gravity through linear acceleration. It does not accurately reflect the details of the CONTEXT, making it ungrounded."
51,Which organ pumps blood throughout the body?,The organ that pumps blood throughout the body is the heart.,"The RESPONSE is entirely unrelated to the CONTEXT, as it introduces a basic fact about the heart that is not connected to the detailed information about heart failure provided in the CONTEXT."
64,Where can you find the ears of a grasshopper?,"Grasshoppers have their ears, or tympanal organs, located on the sides of their abdomen, specifically on the first segment of the abdomen. These organs are used to detect sound vibrations.","The RESPONSE is entirely unrelated to the CONTEXT, discussing a different aspect of grasshoppers that is not mentioned in the CONTEXT."
66,What percentage of your brain consists of fat?,"The context provided does not directly address the composition of the brain in terms of fat percentage. However, it is commonly known that the human brain is composed of approximately 60% fat. This high fat content is crucial for maintaining the brain's structure and function, as fats are essential components of cell membranes and play a role in insulating neurons.","The RESPONSE introduces information about the brain's fat composition, which is not mentioned or related to the CONTEXT about lateralization of brain function. Therefore, it is completely ungrounded."


In [22]:
# Absolute number of records per groundedness score (normalize defaults to False)
calculate_frequency(_df, 'outputs.Groundedness.groundedness')

outputs.Groundedness.groundedness
5    24
3    22
1    17
4     9
2     3
Name: count, dtype: int64

### Findings, hypothesis, ideas for further development...

...