# Manual analysis of evaluation results (2 of 2)

### Environment setup
python 00_setup.py

In [9]:
import os
import dotenv
from pathlib import Path
import pandas as pd
import json

In [10]:
# Global variables
PRIVATE = False
DATA_DIR = Path("data")
TMP_DIR = Path("tmp")

In [11]:
def x(_json_file):
    """Load a JSON Lines file into a DataFrame, one row per non-blank line.

    Args:
        _json_file: Path (str or path-like) of a UTF-8 text file holding one
            JSON document per line.

    Returns:
        pd.DataFrame built from the parsed documents, in file order. An empty
        file yields an empty DataFrame.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(_json_file, encoding='utf-8') as f:
        # Iterate the file instead of using str.splitlines(): splitlines() also
        # breaks on characters such as U+2028/U+2029, which may legally appear
        # unescaped inside a JSON string and would cut a record in half.
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no record.
            if not line.strip():
                continue
            records.append(json.loads(line))

    return pd.DataFrame(records)

_df = x(TMP_DIR / 'science-trivia__context_response_feedback_v12_locally_evaluated.jsonl')

In [12]:
_df.dtypes

inputs.query                                   object
inputs.ground_truth                            object
inputs.response                                object
inputs.context                                 object
inputs.version                                  int64
inputs.thumbs_up                                 bool
outputs.Groundedness.groundedness               int64
outputs.Groundedness.gpt_groundedness           int64
outputs.Groundedness.groundedness_reason       object
outputs.Groundedness.groundedness_result       object
outputs.Groundedness.groundedness_threshold     int64
outputs.Answer_length.answer_length             int64
outputs.Friendliness.score                      int64
outputs.Friendliness.reason                    object
line_number                                     int64
dtype: object

## Display helper

In [13]:
pd.set_option('display.max_colwidth', None)


In [14]:
# Build X from _df by dropping: every 'inputs.*' column except 'inputs.thumbs_up', every text (object-dtype) column, every '*_threshold' column, and every column whose name contains 'groundedness_pro'
def get_X(_df):
    """Return the columns of ``_df`` that are usable as regression inputs.

    Dropped columns:
      * every ``inputs.*`` column except ``inputs.thumbs_up``;
      * every text column (``object`` dtype or a pandas ``StringDtype``);
      * every column whose name ends with ``_threshold``;
      * every column whose name contains ``groundedness_pro``.

    Args:
        _df: DataFrame whose column labels are strings.

    Returns:
        A new DataFrame holding the remaining columns; ``_df`` is not modified.
    """
    def _is_text(dtype):
        # Text may be stored as plain ``object`` or as a dedicated string
        # dtype (explicitly, or inferred by newer pandas); treat both as text
        # so free-form columns never reach the model.
        return dtype == 'object' or isinstance(dtype, pd.StringDtype)

    def _is_excluded(col, dtype):
        is_other_input = col.startswith('inputs.') and col != 'inputs.thumbs_up'
        return (
            is_other_input
            or _is_text(dtype)
            or col.endswith('_threshold')
            or 'groundedness_pro' in col
        )

    to_drop = [col for col, dtype in _df.dtypes.items() if _is_excluded(col, dtype)]
    X = _df.drop(columns=to_drop)
    return X

In [15]:
# Build the feature frame and keep only the rows without missing values.
X = get_X(_df).dropna()

In [16]:
len(X)

75

In [17]:
# Collect the last '.'-separated segment of every column name (e.g. 'outputs.Friendliness.score' -> 'score')
def split_column_names(df):
    """Map the last '.'-separated segment of each column name to ``True``.

    The returned dict serves as a lookup of "short" column names; columns
    that share a final segment collapse into a single key.
    """
    return {label.rsplit('.', 1)[-1]: True for label in df.columns}

def remove_gpt_duplicates(df):
    """Drop ``gpt_``-prefixed columns that duplicate an unprefixed column.

    A column is dropped when the last '.'-separated segment of its name
    starts with ``gpt_`` and the rest of that segment is itself the last
    segment of some column in ``df``.

    Returns a new DataFrame; ``df`` is left unchanged.
    """
    known_suffixes = split_column_names(df)

    def _is_gpt_duplicate(label):
        suffix = label.split('.')[-1]
        return suffix.startswith('gpt_') and suffix[len('gpt_'):] in known_suffixes

    redundant = [label for label in df.columns if _is_gpt_duplicate(label)]
    return df.drop(columns=redundant)

X = remove_gpt_duplicates(X)

In [18]:
X.dtypes

inputs.thumbs_up                        bool
outputs.Groundedness.groundedness      int64
outputs.Answer_length.answer_length    int64
outputs.Friendliness.score             int64
line_number                            int64
dtype: object

In [19]:
# Split off the target: DataFrame.pop removes 'inputs.thumbs_up' from X in place and returns it,
# so re-running this cell without rebuilding X first raises KeyError.
y = X.pop('inputs.thumbs_up').astype(int)

In [20]:
# Logistic regression to predict inputs.thumbs_up from the other columns
import statsmodels.api as sm

# NOTE(review): per the X.dtypes output above, X still contains `line_number`, so it enters the
# model as a predictor — presumably it is just a row index; confirm that this is intended.
X = sm.add_constant(X)  # Adds a constant term to the predictor
model = sm.Logit(y, X)
result = model.fit()

# Keep the summary table plus coefficients and p-values as one text blob; it is printed and
# embedded in the LLM prompt further down.
_result = f'{result.summary()}\n\nCoefficients:\n{result.params}\n\nP-values:\n{result.pvalues}'

Optimization terminated successfully.
         Current function value: 0.599052
         Iterations 6


In [22]:
print(_result)

                           Logit Regression Results                           
Dep. Variable:       inputs.thumbs_up   No. Observations:                   75
Model:                          Logit   Df Residuals:                       70
Method:                           MLE   Df Model:                            4
Date:                Mon, 23 Jun 2025   Pseudo R-squ.:                 0.07175
Time:                        16:33:10   Log-Likelihood:                -44.929
converged:                       True   LL-Null:                       -48.402
Covariance Type:            nonrobust   LLR p-value:                    0.1388
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
const                                  -1.4232      2.361     -0.603      0.547      -6.052       3.205
outputs.Groundedness.groundedness       0.0581      0.18

In [23]:
# https://python.langchain.com/docs/integrations/chat/azure_chat_openai/
from langchain_openai import AzureChatOpenAI

# Chat client used further down to explain the regression output in prose.
# Deployment, API version and endpoint come from environment variables; os.getenv returns None
# for any variable that is not set.
# NOTE(review): no dotenv load call is visible in this part of the notebook — presumably the
# environment is prepared elsewhere (e.g. 00_setup.py); confirm.
llm = AzureChatOpenAI(
    azure_deployment=os.getenv('AZURE_OPENAI_DEPLOYMENT'),
    api_version=os.getenv('AZURE_OPENAI_API_VERSION'),
    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
    temperature=0.7,
    top_p=0,  # NOTE(review): top_p=0 together with temperature=0.7 — confirm this pairing is intentional
    max_tokens=1600,
    timeout=None,
    max_retries=1,
    cache=False
    # other params...
)

In [24]:
from rich.console import Console
from rich.markdown import Markdown

def pretty_markdown(_text):
    """Render ``_text`` as Markdown on a freshly created rich console."""
    console = Console()
    rendered = Markdown(_text)
    console.print(rendered)

In [25]:
# Send the regression text (_result) to the chat model and keep the `.content` of its reply,
# which is then rendered as Markdown.
_interpretation = llm.invoke(f'Explain the following logistic regression result:\n\n{_result}\n\nWhat are the most important features and how do they influence thumbs_up?').content
pretty_markdown(_interpretation)

### Follow-up on findings from the logistic regression

In [26]:
pd.crosstab(_df['outputs.Friendliness.score'], _df['inputs.thumbs_up'])

inputs.thumbs_up,False,True
outputs.Friendliness.score,Unnamed: 1_level_1,Unnamed: 2_level_1
2,1,0
3,5,2
4,16,43
5,4,4


#### e.g. High friendliness, yet negative human feedback?

In [28]:
# Rows where the friendliness evaluator gave the top score (5) but the human feedback was negative (thumbs_up == 0).
# Note how the evaluator's reason (outputs.Friendliness.reason) helps interpretation.
# NOTE(review): the previous comments here said the evaluator "excepted non-similar answers to be correct" and
# referred to the groundedness reason — presumably carried over from a groundedness cell; confirm and reword.
_sdf = _df[(_df['outputs.Friendliness.score'].notnull()) & (_df['outputs.Friendliness.score'] == 5) & (_df['inputs.thumbs_up'] == 0)][['inputs.query', 'inputs.ground_truth', 'inputs.response', 'outputs.Friendliness.reason']]
_sdf[:10]


Unnamed: 0,inputs.query,inputs.ground_truth,inputs.response,outputs.Friendliness.reason
18,"What is responsible for the coloration of human hair, skin, and nails?",Melanin.,"Oh, absolutely, melanin is the grand maestro orchestrating the dazzling symphony of colors in our hair, skin, and nails. Those melanocytes, tucked away in the basal layer of the epidermis, are clearly working overtime to ensure we all have our unique shades. And while the context might not roll out the red carpet for nails, let's not forget that melanin is the unsung hero behind the pigmentation in every nook and cranny of our bodies. Bravo, melanin, bravo!","The response is enthusiastic, engaging, and celebrates the topic in a warm and friendly manner."
28,True or false: An object's gravitational pull increases with its mass.,True.,"Oh, absolutely! Because, you know, when objects get heavier, they just can't help but throw around their gravitational weight like they're the center of the universe. It's like they're saying, ""Look at me, I'm so massive, I just have to pull everything towards me!""","The response is playful, engaging, and uses humor to explain the concept, making it very friendly."
67,What is the highest speed recorded in the universe?,"Light travels at a speed of 299,792,458 meters per second.","Oh, sure, the speed of light in a vacuum is just a leisurely 299,792,458 meters per second. No big deal, right? It's only the fastest thing in the universe, setting the ultimate speed limit for everything—information, matter, energy—you name it. Just a casual stroll through space!","The response is very friendly, engaging, and uses humor to make the information approachable and enjoyable."
70,What is the force responsible for attracting objects towards Earth's center?,Gravity.,"Oh, sure, gravity is just that little thing that keeps us from floating off into space. It's the magical force that makes sure your keys always fall to the ground instead of flying away to join the birds. According to Newton, it's the universal magnet that pulls everything towards everything else, but on Earth, it's just obsessed with dragging everything down to the planet's core. Because, you know, why would we want to defy gravity and have a little fun?","The response is playful, engaging, and friendly while explaining the concept of gravity in an approachable manner."
