# Manual analysis of evaluation results (2 of 2)

### Environment setup
python 00_setup.py

In [None]:
import os
import dotenv
from pathlib import Path
import pandas as pd
import json

In [None]:
# Global variables
# NOTE(review): PRIVATE and DATA_DIR are not referenced in the visible cells of
# this notebook — confirm whether they are still needed.
PRIVATE = False
DATA_DIR = Path("data")
# Directory the locally evaluated results file is loaded from below.
TMP_DIR = Path("tmp")

In [None]:
def x(_json_file):
    """Load a JSON Lines file into a pandas DataFrame.

    Each non-blank line of the file is parsed as one JSON document and
    becomes one row of the result.

    Args:
        _json_file: Path (``str`` or ``os.PathLike``) of the JSON Lines file.

    Returns:
        A ``pandas.DataFrame`` built from the parsed records, in file order.
        An empty file yields an empty DataFrame.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification, so do not depend on the platform's
    # default encoding.
    with open(_json_file, encoding="utf-8") as f:
        # Iterate the file line by line instead of ``read().splitlines()``:
        # ``splitlines`` also breaks on characters such as U+2028 that may
        # appear unescaped inside a JSON string. Blank / whitespace-only
        # lines (e.g. a doubled trailing newline) are skipped because
        # ``json.loads`` would reject them.
        line_dicts = [json.loads(line) for line in f if line.strip()]

    return pd.DataFrame(line_dicts)

# Load the locally evaluated results (one JSON record per line) into _df.
_df = x(TMP_DIR / 'science-trivia__context_response_feedback_v12_locally_evaluated.jsonl')

In [None]:
# Inspect the dtype pandas assigned to each loaded column.
_df.dtypes

## Display helper

In [None]:
# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)


In [None]:
def get_X(_df):
    """Return a copy of ``_df`` reduced to the columns kept for modelling.

    A column is dropped when any of the following holds:
      * its name starts with 'inputs.' and it is not 'inputs.thumbs_up';
      * its dtype is 'object';
      * its name ends with '_threshold';
      * its name contains 'groundedness_pro'.

    The input frame itself is left unchanged.
    """
    def _is_excluded(name):
        # Same checks, in the same order, as the original one-line condition.
        if name.startswith('inputs.') and name != 'inputs.thumbs_up':
            return True
        if _df[name].dtype == 'object':
            return True
        return name.endswith('_threshold') or 'groundedness_pro' in name

    unwanted = [name for name in _df.columns if _is_excluded(name)]
    return _df.drop(columns=unwanted)

In [None]:
# Keep only the modelling columns and discard every row with a missing value.
X = get_X(_df).dropna()

In [None]:
# Number of rows remaining in X.
len(X)

In [None]:
def split_column_names(df):
    """Collect the last '.'-separated segment of every column name in ``df``.

    Returns a dict whose keys are those trailing segments (in column order,
    duplicates collapsed) and whose values are all ``True``; it is used as a
    membership lookup.
    """
    trailing_segments = (column.split('.')[-1] for column in df.columns)
    return dict.fromkeys(trailing_segments, True)

def remove_gpt_duplicates(df):
    """Drop 'gpt_'-prefixed columns that duplicate an un-prefixed column.

    Only the last '.'-separated segment of each column name is compared. A
    column is dropped when that segment starts with 'gpt_' and the rest of
    the segment (after 'gpt_') is the last segment of some column in ``df``.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    known_segments = split_column_names(df)
    prefix = 'gpt_'

    def _is_gpt_duplicate(column):
        leaf = column.rsplit('.', 1)[-1]
        return leaf.startswith(prefix) and leaf[len(prefix):] in known_segments

    redundant = [column for column in df.columns if _is_gpt_duplicate(column)]
    return df.drop(columns=redundant)

# Drop 'gpt_'-prefixed columns whose un-prefixed counterpart is also present.
X = remove_gpt_duplicates(X)

In [None]:
# Inspect the dtypes of the remaining columns.
X.dtypes

In [None]:
# Split off the target: pop() removes 'inputs.thumbs_up' from X and returns it;
# it is cast to int for the regression.
y = X.pop('inputs.thumbs_up').astype(int)

In [None]:
# Logistic regression to predict inputs.thumbs_up from the other columns
import statsmodels.api as sm

X = sm.add_constant(X)  # Adds a constant term to the predictor
model = sm.Logit(y, X)
result = model.fit()

# Bundle the summary table, coefficients and p-values into one text block so it
# can be printed and embedded in the LLM prompt in the later cells.
_result = f'{result.summary()}\n\nCoefficients:\n{result.params}\n\nP-values:\n{result.pvalues}'

In [None]:
# Show the regression summary, coefficients and p-values.
print(_result)

In [None]:
# https://python.langchain.com/docs/integrations/chat/azure_chat_openai/
from langchain_openai import AzureChatOpenAI

# Load the Azure OpenAI settings from a local .env file (if one exists) so the
# os.getenv() lookups below do not silently return None. Variables that are
# already set in the environment are left untouched.
dotenv.load_dotenv()

# NOTE(review): temperature=0.7 is combined with top_p=0; top_p=0 restricts
# sampling to the most likely token, so the temperature setting probably has
# little effect — confirm which of the two is intended.
llm = AzureChatOpenAI(
    azure_deployment=os.getenv('AZURE_OPENAI_DEPLOYMENT'),
    api_version=os.getenv('AZURE_OPENAI_API_VERSION'),
    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
    temperature=0.7,
    top_p=0,
    max_tokens=1600,
    timeout=None,
    max_retries=1,
    cache=False
    # other params...
)

In [None]:
from rich.console import Console
from rich.markdown import Markdown

def pretty_markdown(_text):
    """Print ``_text`` to a new rich ``Console``, rendered as Markdown."""
    console = Console()
    rendered = Markdown(_text)
    console.print(rendered)

In [None]:
# Ask the LLM to interpret the regression output and keep the reply's
# `.content` (presumably the reply text — confirm against the langchain API).
_interpretation = llm.invoke(f'Explain the following logistic regression result:\n\n{_result}\n\nWhat are the most important features and how do they influence thumbs_up?').content
# Render the reply as Markdown.
pretty_markdown(_interpretation)

### Follow-up findings from logistic regression

In [None]:
# Contingency table: Friendliness score (rows) against human thumbs_up (columns).
pd.crosstab(_df['outputs.Friendliness.score'], _df['inputs.thumbs_up'])

#### e.g. High friendliness, yet negative human feedback?

In [None]:
# Select rows where the Friendliness score is 5 but the human feedback is
# negative (thumbs_up == 0), keeping the query, ground truth, response and the
# evaluator's Friendliness reason.
# Evaluator accepted non-similar answers as correct.
# Note how the groundedness reason helps interpretation.
# NOTE(review): the note above mentions groundedness, but this cell selects
# 'outputs.Friendliness.reason' — confirm which evaluator is meant.
_sdf = _df.loc[
    _df['outputs.Friendliness.score'].notnull()
    & (_df['outputs.Friendliness.score'] == 5)
    & (_df['inputs.thumbs_up'] == 0),
    ['inputs.query', 'inputs.ground_truth', 'inputs.response', 'outputs.Friendliness.reason'],
]
_sdf.head(10)
