# Manual analysis of evaluation results (1 of 2)

### Environment setup
python 00_setup.py

### Libraries

In [None]:
import os
import dotenv
from pathlib import Path
import pandas as pd
import json

### Display setup

In [None]:
pd.set_option('display.max_colwidth', None)

### Global variables

In [None]:
PRIVATE = False
DATA_DIR = Path("data")
TMP_DIR = Path("tmp")

### Load environment variables

In [None]:
# Import override environment variables from .env file
# or from private.env file if PRIVATE is True
dotenv.load_dotenv('.env' if not PRIVATE else 'private.env', override=True)

### Helper functions

In [None]:
from rich.console import Console
from rich.markdown import Markdown

def pretty_markdown(_text):
    """Render the Markdown string `_text` through a fresh rich console."""
    console = Console()
    rendered = Markdown(_text)
    console.print(rendered)

### Load evaluation file (jsonl)

In [None]:
def load_evaluation(_json_file):
    """Load a JSON Lines file into a DataFrame, one row per JSON document.

    Args:
        _json_file: Path (str or path-like) of the ``.jsonl`` file to read.

    Returns:
        A ``pandas.DataFrame`` built from the parsed records, in file order.
        A file without any records yields an empty DataFrame.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Iterate over the file handle rather than using str.splitlines():
    # splitlines() also breaks on characters such as U+2028 or U+0085, which
    # may appear unescaped inside a JSON string and would cut a record in two.
    # Decoding is pinned to UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(_json_file, encoding='utf-8') as f:
        # Blank (whitespace-only) lines carry no record and are skipped.
        records = [json.loads(line) for line in f if line.strip()]

    return pd.DataFrame(records)

_df = load_evaluation(DATA_DIR / 'science-trivia__context_response_feedback_v12__evaluated.jsonl')

### Check columns and data types

In [None]:
_df.dtypes

## Descriptive statistics

### e.g. positive human feedback (thumbs up)

In [None]:
# calculate frequency of thumbs_up 
def calculate_frequency(df, column, normalize=False):
    """Count how often each distinct value occurs in `df[column]`.

    With `normalize=True` the counts are returned as relative frequencies
    instead of absolute numbers (passed straight to `value_counts`).
    """
    values = df[column]
    return values.value_counts(normalize=normalize)

calculate_frequency(_df, 'inputs.thumbs_up', True)

### e.g. cross tab evaluated features vs thumbs up 

In [None]:
pd.crosstab(_df['outputs.Groundedness.groundedness'], _df['inputs.thumbs_up'])

### Using machine learning to find relevant features

### Select features (X), remove duplicates, remove NAs, ...

In [None]:
def get_X(_df):
    """Build the feature frame used for modelling from the evaluation frame.

    Columns are dropped when they
      * start with 'inputs.' (except 'inputs.thumbs_up', which is kept),
      * have dtype 'object', or
      * end with '_threshold'.
    Rows that still contain a missing value afterwards are removed.
    The frame passed in is not modified.
    """
    def _is_excluded(name):
        other_input = name.startswith('inputs.') and name != 'inputs.thumbs_up'
        return other_input or _df[name].dtype == 'object' or name.endswith('_threshold')

    excluded = [name for name in _df.columns if _is_excluded(name)]
    return _df.drop(columns=excluded).dropna()

# loop all columns and split their names by '.'    
def split_column_names(df):
    """Return a dict keyed by the last '.'-separated part of every column name.

    Every value is True; the dict serves as an ordered membership lookup of
    the short column names present in `df`.
    """
    short_names = (name.rsplit('.', 1)[-1] for name in df.columns)
    return dict.fromkeys(short_names, True)

def remove_gpt_duplicates(df):
    """Drop 'gpt_'-prefixed columns that duplicate an un-prefixed column.

    A column is dropped when the last '.'-separated part of its name starts
    with 'gpt_' and the remainder of that part is also the last name part of
    some column in `df`. A new DataFrame is returned.
    """
    prefix = 'gpt_'
    known_short_names = split_column_names(df)

    def _is_duplicate(column):
        short = column.rsplit('.', 1)[-1]
        return short.startswith(prefix) and short[len(prefix):] in known_short_names

    return df.drop(columns=[column for column in df.columns if _is_duplicate(column)])

X = remove_gpt_duplicates(get_X(_df))
X.dtypes

#### Set thumbs_up as label (Y) 

In [None]:
# pop() removes the label column from X in place, so X keeps only the
# predictor columns; the label values are cast to int.
y = X.pop('inputs.thumbs_up').astype(int)

#### Use logistic regression to fit predictive model


In [None]:
import statsmodels.api as sm

# Fit a logit model of the thumbs_up label (y) on the selected feature columns (X).
X = sm.add_constant(X)  # Adds a constant term to the predictor
model = sm.Logit(y, X)
result = model.fit()

# Combine the summary table, the coefficients and the p-values into a single
# text block so it can be printed and embedded into a prompt later on.
_result = f'{result.summary()}\n\nCoefficients:\n{result.params}\n\nP-values:\n{result.pvalues}'

In [None]:
print(_result)

### Use LLM to explain logistic regression result

In [None]:
# https://python.langchain.com/docs/integrations/chat/azure_chat_openai/
from langchain_openai import AzureChatOpenAI

# Chat client for Azure OpenAI; deployment name, API version and endpoint are
# read from environment variables.
llm = AzureChatOpenAI(
    azure_deployment=os.getenv('AZURE_OPENAI_DEPLOYMENT'),
    api_version=os.getenv('AZURE_OPENAI_API_VERSION'),
    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
    temperature=0.7,
    # NOTE(review): top_p=0 together with temperature=0.7 looks contradictory
    # (nucleus sampling reduced to the top token vs. a fairly high
    # temperature) - confirm which sampling behaviour is intended.
    top_p=0,
    max_tokens=1600,
    timeout=None,
    max_retries=1,
    cache=False
    # other params...
)

In [None]:
_interpretation = llm.invoke(f'Explain the following logistic regression result:\n\n{_result}\n\nWhat are the most important features and if they influence thumbs_up positively or negatively?').content
pretty_markdown(_interpretation)

### Follow up findings from logistic regression

#### e.g. Similarity

In [None]:
pd.crosstab(_df['outputs.Similarity.similarity'], _df['inputs.thumbs_up'])

#### e.g. Positive human feedback, yet low similarity?

In [None]:
# Rows with positive human feedback (thumbs_up == 1) although the similarity
# score is low (2). Selecting thumbs_up == 1 is what makes this the
# "positive feedback, yet low similarity" view; thumbs_up == 0 would show
# negative feedback instead.
_sdf = _df[
    (_df['outputs.Similarity.similarity'].notnull())
    & (_df['outputs.Similarity.similarity'] == 2)
    & (_df['inputs.thumbs_up'] == 1)
][['inputs.query', 'inputs.response', 'outputs.Groundedness.groundedness_reason']]
# Show at most the first ten matching rows.
_sdf[:10]

#### e.g. High similarity, yet negative human feedback?

In [None]:
_sdf = _df[(_df['outputs.Similarity.similarity'].notnull()) & (_df['outputs.Similarity.similarity'] == 5) & (_df['inputs.thumbs_up'] == 0)][['inputs.query', 'inputs.response', 'outputs.Groundedness.groundedness_reason']]
_sdf[:10]

#### e.g. Grounding

In [None]:
pd.crosstab(_df['outputs.Groundedness.groundedness'], _df['inputs.thumbs_up'])

#### e.g. Positive human feedback, yet bad grounding?

In [None]:
_sdf = _df[(_df['outputs.Groundedness.groundedness'].notnull()) & (_df['outputs.Groundedness.groundedness'] == 1) & (_df['inputs.thumbs_up'] == 1)][['inputs.query', 'inputs.response', 'outputs.Groundedness.groundedness_reason']]
_sdf[:10]


In [None]:
calculate_frequency(_df, 'outputs.Groundedness.groundedness')

### Findings, hypotheses, ideas for further development...

...