# Result Analysis for the ROSA document search application

This notebook runs the test script for the ROSA document search application over different prompts and queries. The generated answers are then compared with the ground-truth answers using two similarity metrics: a normalised Levenshtein similarity and the BERTScore F1.

In [2]:
import pandas as pd
import Levenshtein
import subprocess

In [1]:
# Define metric functions
# Levenshtein similarity
def string_similarity(str1: str, str2: str) -> float:
    """Return a length-normalised Levenshtein similarity between two strings.

    The similarity is ``1 - distance / longest``, where ``distance`` is the
    value returned by ``Levenshtein.distance`` and ``longest`` is the length
    of the longer input.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The similarity as a float; 1.0 when both strings are empty.
    """
    edit_distance = Levenshtein.distance(str1, str2)
    longest = max(len(str1), len(str2))

    # Two empty strings leave nothing to normalise by, so report a perfect
    # match instead of dividing by zero.
    return 1.0 if longest == 0 else 1.0 - edit_distance / longest

# Bert similarity
# Metric objects that have already been loaded, keyed by metric name, so that
# scoring many rows reuses one instance instead of re-loading it per call.
_LOADED_METRICS = {}


def bert_score(generated_answer, answer):
    """Return the BERTScore F1 of a generated answer against a reference.

    NOTE(review): ``load`` is not imported in any visible cell of this
    notebook, so this fails with NameError on a fresh kernel. It is presumably
    ``from evaluate import load`` -- confirm and add it to the imports cell.

    Args:
        generated_answer: The text produced by the application.
        answer: The reference text to compare against.

    Returns:
        The first (and only) entry of the ``"f1"`` list returned by the
        metric's ``compute`` call for this single prediction/reference pair.
    """
    metric = _LOADED_METRICS.get("bertscore")
    if metric is None:
        # Load once; this function is called once per DataFrame row.
        metric = load("bertscore")
        _LOADED_METRICS["bertscore"] = metric

    score = metric.compute(predictions=[generated_answer],
                           references=[answer],
                           model_type="bert-base-uncased")
    # One prediction was passed in, so the per-example list has one entry.
    return score["f1"][0]

In [4]:
# Load evaluation dataset
# Keep only the question/answer pair columns; the questions are also pulled
# out as a plain list for use as the queries sent to the application.
validation_frame = pd.read_csv('../data/processed/validation_data.csv')
eval_dataset = validation_frame[['Question', 'Answer']]
questions = eval_dataset['Question'].tolist()

In [26]:
# Define prompt list
# Paths of the prompt template files to evaluate, one per prompt index.
prompts = [f'prompts/prompt_{index}.txt' for index in range(2)]

In [None]:
# Run the app for the combinations of prompts and questions
# Every (prompt, question) pair is run as a separate `test-app.py` process.
# Non-zero exit codes are collected and reported afterwards so that a failed
# run is visible instead of silently leaving a gap in the results; the loop
# itself keeps going so one bad query does not abort the whole sweep.
failed_runs = []
for prompt in prompts:
    for query in questions:
        completed = subprocess.run(['python3',
                                    'test-app.py',
                                    '--query',
                                    query,
                                    '--prompt',
                                    prompt])
        if completed.returncode != 0:
            failed_runs.append((prompt, query, completed.returncode))

for failed_prompt, failed_query, exit_code in failed_runs:
    print(f"test-app.py exited with code {exit_code} "
          f"for prompt={failed_prompt!r}, query={failed_query!r}")


## Prompt 0 Analysis

In [5]:
# Load the recorded results for prompt 0 (a JSON list of records).
# NOTE(review): presumably written by the test-app.py runs above -- confirm.
prompt_0_results_path = "results/prompt_0.json"
df = pd.read_json(prompt_0_results_path, orient="records")

In [6]:
# Attach the reference answer to each generated answer by matching the query
# text against the evaluation questions, then drop the duplicated question
# column and give the reference answer a clearer name.
result_df = (
    df.merge(eval_dataset, left_on='query', right_on='Question')
      .drop(columns='Question')
      .rename(columns={"Answer": "actual_answer"})
)
result_df.head()

Unnamed: 0,prompt,query,sources,answer,actual_answer
0,"""<s>[INST] <<SYS>> \n""\n""Below is the only inf...",What is Red Hat OpenShift Service on AWS (ROSA)?,../data/external/rosa-docs/rosa_architecture.m...,"Based on the information provided, here is a...",Red Hat Openshift Service on AWS (ROSA) is a f...
1,"""<s>[INST] <<SYS>> \n""\n""Below is the only inf...",Where can I go to get more information/details?,../data/external/rosa-docs/logging.md\n../data...,"Based on the information provided, it appear...",- [ROSA Webpage](https://www.openshift.com/pro...
2,"""<s>[INST] <<SYS>> \n""\n""Below is the only inf...",What are the benefits of Red Hat OpenShift Ser...,../data/external/rosaworkshop/14-faq.md\n../da...,"Based on the information provided, the benef...",- **Native AWS service:** Access and use Red H...
3,"""<s>[INST] <<SYS>> \n""\n""Below is the only inf...",What are the differences between Red Hat OpenS...,../data/external/rosaworkshop/14-faq.md\n../da...,"Based on the information provided, the main ...",Everything you need to deploy and manage conta...
4,"""<s>[INST] <<SYS>> \n""\n""Below is the only inf...",What exactly am I responsible for and what is ...,../data/external/rosaworkshop/14-faq.md\n../da...,Based on the information provided in the tab...,"In short, anything that is related to deployin..."


In [None]:
# Remove the boilerplate lead-in from the generated answers, then score every
# generated answer against its reference answer with both metrics.
result_df['answer'] = result_df['answer'].map(
    lambda text: text.replace("Based on the information provided,", ""))
answer_pairs = list(zip(result_df['answer'], result_df['actual_answer']))
result_df['bert_score'] = [bert_score(generated, reference)
                           for generated, reference in answer_pairs]
result_df['lv_sim'] = [string_similarity(generated, reference)
                       for generated, reference in answer_pairs]

In [None]:
# Average BERTScore F1 over all scored answers for prompt 0.
result_df.loc[:, 'bert_score'].mean()

0.5498255989514291

In [None]:
# Average normalised Levenshtein similarity over all scored answers for prompt 0.
result_df.loc[:, 'lv_sim'].mean()

0.23166314659038895

In [None]:
# Show full cell contents and all rows when displaying the results.
# `None` is the supported way to disable the column-width limit; the old
# `-1` sentinel is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.colheader_justify', 'center')

  pd.set_option('display.max_colwidth', -1)


In [None]:
def insert_newlines(text, line_length=20):
    """Re-wrap text so that each line holds at most ``line_length`` words.

    The text is split on whitespace, so existing runs of spaces and line
    breaks are collapsed. Words on a line are joined by single spaces and
    lines are joined by a bare newline, so no line carries leading or
    trailing spaces and no dangling newline is added at the end.

    Args:
        text: The string to wrap.
        line_length: Maximum number of words per line; must be at least 1.

    Returns:
        The wrapped text; an empty string if ``text`` contains no words.

    Raises:
        ValueError: If ``line_length`` is smaller than 1.
    """
    if line_length < 1:
        raise ValueError("line_length must be a positive integer")

    words = text.split()
    lines = [' '.join(words[start:start + line_length])
             for start in range(0, len(words), line_length)]
    return '\n'.join(lines)

In [None]:
# Print each query with its reference answer and the generated answer,
# wrapped to a readable width, for side-by-side manual inspection.
for query, expected, generated in zip(result_df['query'],
                                      result_df['actual_answer'],
                                      result_df['answer']):
    print("Query:")
    print(query, '\n')
    print("Actual answer:")
    print(insert_newlines(expected), '\n')
    print("Generated answer")
    print(insert_newlines(generated), '\n')

## Prompt 1 Analysis

In [15]:
# Load the recorded results for prompt 1 (a JSON list of records).
# NOTE(review): presumably written by the test-app.py runs above -- confirm.
prompt_1_results_path = "results/prompt_1.json"
df = pd.read_json(prompt_1_results_path, orient="records")

In [16]:
# Attach the reference answer to each generated answer by matching the query
# text against the evaluation questions, then drop the duplicated question
# column and give the reference answer a clearer name.
result_df = (
    df.merge(eval_dataset, left_on='query', right_on='Question')
      .drop(columns='Question')
      .rename(columns={"Answer": "actual_answer"})
)
result_df.head()

Unnamed: 0,prompt,query,sources,answer,actual_answer
0,"""<s>[INST] <<SYS>> \n""\nYou are an engineer re...",What is Red Hat OpenShift Service on AWS (ROSA)?,../data/external/rosa-docs/rosa_architecture.m...,Thank you for your question. Red Hat OpenShi...,Red Hat Openshift Service on AWS (ROSA) is a f...
1,"""<s>[INST] <<SYS>> \n""\nYou are an engineer re...",Where can I go to get more information/details?,../data/external/rosa-docs/logging.md\n../data...,Thank you for your question. For more inform...,- [ROSA Webpage](https://www.openshift.com/pro...
2,"""<s>[INST] <<SYS>> \n""\nYou are an engineer re...",What are the benefits of Red Hat OpenShift Ser...,../data/external/rosaworkshop/14-faq.md\n../da...,Thank you for your question. Red Hat OpenShi...,- **Native AWS service:** Access and use Red H...
3,"""<s>[INST] <<SYS>> \n""\nYou are an engineer re...",What are the differences between Red Hat OpenS...,../data/external/rosaworkshop/14-faq.md\n../da...,Thank you for your question. Red Hat OpenShi...,Everything you need to deploy and manage conta...
4,"""<s>[INST] <<SYS>> \n""\nYou are an engineer re...",What exactly am I responsible for and what is ...,../data/external/rosaworkshop/14-faq.md\n../da...,Thank you for your question. Based on the in...,"In short, anything that is related to deployin..."


In [None]:
# Remove the boilerplate lead-in from the generated answers, then score every
# generated answer against its reference answer with both metrics.
result_df['answer'] = result_df['answer'].map(
    lambda text: text.replace("Thank you for your question.", ""))
answer_pairs = list(zip(result_df['answer'], result_df['actual_answer']))
result_df['bert_score'] = [bert_score(generated, reference)
                           for generated, reference in answer_pairs]
result_df['lv_sim'] = [string_similarity(generated, reference)
                       for generated, reference in answer_pairs]

In [None]:
# Average BERTScore F1 over all scored answers for prompt 1.
result_df.loc[:, 'bert_score'].mean()

0.5743495482664842

In [None]:
# Average normalised Levenshtein similarity over all scored answers for prompt 1.
result_df.loc[:, 'lv_sim'].mean()

0.22871913382263181

In [None]:
# Print each query with its reference answer and the generated answer,
# wrapped to a readable width, for side-by-side manual inspection.
for query, expected, generated in zip(result_df['query'],
                                      result_df['actual_answer'],
                                      result_df['answer']):
    print("Query:")
    print(query, '\n')
    print("Actual answer:")
    print(insert_newlines(expected), '\n')
    print("Generated answer")
    print(insert_newlines(generated), '\n')

# Conclusion
This notebook provides a reusable structure for testing the document search application against different prompts and queries. Next steps: (1) use it to evaluate more prompts, and (2) add RAG metric evaluation to this notebook.