# Imports

In [None]:
import evaluate
import pandas as pd
from tqdm import tqdm
import json

from llama_index.core import load_index_from_storage
from llama_index.core import StorageContext
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [None]:
# Load the three Hugging Face `evaluate` metrics once, in the order ROUGE,
# METEOR, BLEU; the scoring helper below reuses these module-level objects.
rouge, meteor, bleu = (
    evaluate.load(metric_name) for metric_name in ('rouge', 'meteor', 'bleu')
)

# Methods

## Score computing method

In [None]:
def evaluate_with_hf(founds, answer):
    """Score retrieved text against a reference answer with ROUGE, METEOR and BLEU.

    Args:
        founds: The retrieved passages. May be a list of strings, a single
            string, or the ``str()`` of a list of strings (which is what a
            list-valued column looks like after a CSV round trip).
        answer: The reference answer, either a string or a list of strings.

    Returns:
        A tuple ``(rouge_scores, meteor_score, bleu_score)`` holding the values
        returned by the ``compute`` call of the respective metric.
    """
    import ast  # local import: only needed to decode CSV-serialised lists

    def to_text(value, decode_list=False):
        # A plain string already is one text. Joining it with " " would put a
        # space between every single character and distort all n-gram metrics.
        if isinstance(value, str):
            candidate = value.strip()
            if decode_list and candidate.startswith("[") and candidate.endswith("]"):
                try:
                    decoded = ast.literal_eval(candidate)
                except (ValueError, SyntaxError):
                    # Not a serialised list after all: score the raw text.
                    return value
                if isinstance(decoded, (list, tuple)):
                    return " ".join(str(part) for part in decoded)
            return value
        return " ".join(value)

    # Turn the retrieved passages and the reference into one string each
    predicted_text = to_text(founds, decode_list=True)
    reference_text = to_text(answer)

    # Rouge
    rouge_scores = rouge.compute(predictions=[predicted_text], references=[reference_text])

    # Meteor
    meteor_score = meteor.compute(predictions=[predicted_text], references=[reference_text])

    # Bleu
    bleu_score = bleu.compute(predictions=[predicted_text], references=[reference_text])

    return rouge_scores, meteor_score, bleu_score

## Token Based Methods

In [None]:
def get_response(resp):
    """Return the passages found in the first delimited section of ``resp``.

    The section is the text that follows the first dashed divider line (up to
    the next divider, if any); it is split into passages on blank lines.
    Raises ``IndexError`` when ``resp`` contains no divider.
    """
    divider = "\n---------------------\n"
    # Only the piece right after the first divider is used, so two splits suffice.
    sections = resp.split(divider, 2)
    context = sections[1]
    return context.split("\n\n")

def tb_query(index, df, top_k, size, min_k=1):
    """Query the token-based index with every question for each k in min_k..top_k.

    For every k a column ``answer_tb_{size}_k{k}`` holding the parsed responses
    is added to ``df`` itself (the caller's frame is modified in place).

    Args:
        index: Object providing ``as_query_engine(similarity_top_k=...)``.
        df: Frame with a ``question`` column.
        top_k: Largest ``similarity_top_k`` to query with (inclusive).
        size: Chunk size, used in the column name and the progress label.
        min_k: Smallest ``similarity_top_k`` to query with.

    Returns:
        The columns of ``df`` whose name contains "answer" (case-insensitive).
    """
    for k in range(min_k, top_k+1):
        engine = index.as_query_engine(similarity_top_k=k)
        progress = tqdm(df["question"].values, desc=f"({size} token, k = {k}) Get responses", leave=False)

        df[f"answer_tb_{size}_k{k}"] = [
            get_response(engine.query(question).response) for question in progress
        ]

    is_answer_column = df.columns.str.contains('answer', case=False)
    return df.loc[:, is_answer_column]

In [None]:
def compute_scores_tb(dataframe, size):
    """Compute per-question ROUGE, METEOR and BLEU scores for each token-based top-k column.

    Args:
        dataframe: Frame with the reference text in ``answer`` and the
            retrieved answers in ``answer_tb_{size}_k{k}`` for k = 1..K.
        size: Chunk size that is part of the answer column names.

    Returns:
        A new DataFrame that holds, for every k, the columns
        ``rouge_score_tb_{size}_k{k}``, ``meteor_score_tb_{size}_k{k}`` and
        ``bleu_score_tb_{size}_k{k}`` (the values returned by
        ``evaluate_with_hf``) plus the scored ``answer_tb_{size}_k{k}`` values.
    """
    # Count only this size's answer columns. Deriving K from the total column
    # count raises a KeyError as soon as the frame carries any other column,
    # e.g. the answer columns of a different chunk size.
    answer_prefix = f"answer_tb_{size}_k"
    num_k = sum(str(name).startswith(answer_prefix) for name in dataframe.columns)

    res_df = pd.DataFrame()

    for k in range(1, num_k + 1):
        # Fetch both columns once per k instead of once per row.
        predictions = dataframe[f"{answer_prefix}{k}"].values
        references = dataframe["answer"].values

        rouge_scores = []
        meteor_scores = []
        bleu_scores = []

        for i in tqdm(range(0,len(dataframe)), desc=f"(k = {k}) Computing {size} scores"):
            rouge_score, meteor_score, bleu_score = evaluate_with_hf(predictions[i], references[i])

            rouge_scores.append(rouge_score)
            meteor_scores.append(meteor_score)
            bleu_scores.append(bleu_score)

        res_df[f"rouge_score_tb_{size}_k{k}"] = rouge_scores
        res_df[f"meteor_score_tb_{size}_k{k}"] = meteor_scores
        res_df[f"bleu_score_tb_{size}_k{k}"] = bleu_scores
        # Keep the answer value itself; a stray trailing comma used to wrap
        # every answer in a 1-tuple.
        res_df[f"answer_tb_{size}_k{k}"] = list(predictions)

    return res_df

In [None]:
def compute_mean_scores_tb(dataframe, size):
    """Average the per-question token-based scores for every k.

    Args:
        dataframe: Frame with the columns ``rouge_score_tb_{size}_k{k}``,
            ``meteor_score_tb_{size}_k{k}`` and ``bleu_score_tb_{size}_k{k}``
            for k = 1..K. A cell is either a score dict or the ``str()`` of
            one (single-quoted, as read back from CSV).
        size: Chunk size that is part of the score column names.

    Returns:
        A DataFrame with one row per k and the columns ``k``, ``rouge1_avg``,
        ``rouge2_avg``, ``rougeL_avg``, ``rougeLsum_avg``, ``meteor_avg``,
        ``bleu_avg``, ``precision1_avg``..``precision4_avg``,
        ``brevity_penalty_avg``, ``length_ratio_avg``,
        ``translation_length_avg`` and ``reference_length_avg``.
    """
    def parse(cell):
        # In-memory frames hold the dict itself; CSV cells hold its str() form,
        # which becomes valid JSON once the quotes are swapped.
        if isinstance(cell, dict):
            return cell
        return json.loads(cell.replace("'", "\""))

    def mean_of(records, pick):
        return pd.Series([pick(record) for record in records], dtype="float64").mean()

    # Count the ROUGE columns of this size instead of assuming that the frame
    # consists of exactly four columns per k.
    rouge_prefix = f"rouge_score_tb_{size}_k"
    num_k = sum(str(name).startswith(rouge_prefix) for name in dataframe.columns)

    results = []

    for k in range(1, num_k + 1):
        # Parse every cell once and reuse the dicts for all derived averages.
        rouge_records = [parse(cell) for cell in dataframe[f'rouge_score_tb_{size}_k{k}']]
        meteor_records = [parse(cell) for cell in dataframe[f'meteor_score_tb_{size}_k{k}']]
        bleu_records = [parse(cell) for cell in dataframe[f'bleu_score_tb_{size}_k{k}']]

        row = {"k": k}

        # ROUGE averages
        for key in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
            row[f"{key}_avg"] = mean_of(rouge_records, lambda record, key=key: record[key])

        # METEOR average
        row["meteor_avg"] = mean_of(meteor_records, lambda record: record["meteor"])

        # BLEU average
        row["bleu_avg"] = mean_of(bleu_records, lambda record: record["bleu"])

        # Averages of the four entries of the BLEU "precisions" list
        for position in range(4):
            row[f"precision{position + 1}_avg"] = mean_of(
                bleu_records, lambda record, position=position: record["precisions"][position]
            )

        # Averages of the remaining BLEU statistics
        for key in ("brevity_penalty", "length_ratio", "translation_length", "reference_length"):
            row[f"{key}_avg"] = mean_of(bleu_records, lambda record, key=key: record[key])

        results.append(row)

    results_df = pd.DataFrame(results)

    return results_df

## Sentence Based Methods

In [None]:
def get_response(resp):
    """Return the passages found in the first delimited section of ``resp``.

    The section is the text that follows the first dashed divider line (up to
    the next divider, if any); it is split into passages on blank lines.
    Raises ``IndexError`` when ``resp`` contains no divider.
    """
    divider = "\n---------------------\n"
    # Only the piece right after the first divider is used, so two splits suffice.
    sections = resp.split(divider, 2)
    context = sections[1]
    return context.split("\n\n")

def sb_query(index, df, top_k, min_k=1):
    """Query the sentence-based index with every question for each k in min_k..top_k.

    For every k a column ``answer_sb_k{k}`` holding the parsed responses is
    added to ``df`` itself (the caller's frame is modified in place).

    Args:
        index: Object providing ``as_query_engine(similarity_top_k=...)``.
        df: Frame with a ``question`` column.
        top_k: Largest ``similarity_top_k`` to query with (inclusive).
        min_k: Smallest ``similarity_top_k`` to query with.

    Returns:
        The columns of ``df`` whose name contains "answer" (case-insensitive).
    """
    for k in range(min_k, top_k+1):
        engine = index.as_query_engine(similarity_top_k=k)
        progress = tqdm(df["question"].values, desc=f"(sentence based, k = {k}) Get responses", leave=False)

        df[f"answer_sb_k{k}"] = [
            get_response(engine.query(question).response) for question in progress
        ]

    is_answer_column = df.columns.str.contains('answer', case=False)
    return df.loc[:, is_answer_column]

In [None]:
def compute_scores_sb(dataframe):
    """Compute per-question ROUGE, METEOR and BLEU scores for each sentence-based top-k column.

    Args:
        dataframe: Frame with the reference text in ``answer`` and the
            retrieved answers in ``answer_sb_k{k}`` for k = 1..K.

    Returns:
        A new DataFrame that holds, for every k, the columns
        ``rouge_score_sb_k{k}``, ``meteor_score_sb_k{k}`` and
        ``bleu_score_sb_k{k}`` (the values returned by ``evaluate_with_hf``)
        plus the scored ``answer_sb_k{k}`` values.
    """
    # Count only the sentence-based answer columns. Deriving K from the total
    # column count raises a KeyError as soon as the frame carries any other
    # column besides "answer".
    answer_prefix = "answer_sb_k"
    num_k = sum(str(name).startswith(answer_prefix) for name in dataframe.columns)

    res_df = pd.DataFrame()

    for k in range(1, num_k + 1):
        # Fetch both columns once per k instead of once per row.
        predictions = dataframe[f"{answer_prefix}{k}"].values
        references = dataframe["answer"].values

        rouge_scores = []
        meteor_scores = []
        bleu_scores = []

        for i in tqdm(range(0,len(dataframe)), desc=f"(k = {k}) Computing scores"):
            rouge_score, meteor_score, bleu_score = evaluate_with_hf(predictions[i], references[i])

            rouge_scores.append(rouge_score)
            meteor_scores.append(meteor_score)
            bleu_scores.append(bleu_score)

        res_df[f"rouge_score_sb_k{k}"] = rouge_scores
        res_df[f"meteor_score_sb_k{k}"] = meteor_scores
        res_df[f"bleu_score_sb_k{k}"] = bleu_scores
        # Keep the answer value itself; a stray trailing comma used to wrap
        # every answer in a 1-tuple.
        res_df[f"answer_sb_k{k}"] = list(predictions)

    return res_df


In [None]:
def compute_mean_scores_sb(dataframe):
    """Average the per-question sentence-based scores for every k.

    Args:
        dataframe: Frame with the columns ``rouge_score_sb_k{k}``,
            ``meteor_score_sb_k{k}`` and ``bleu_score_sb_k{k}`` for k = 1..K.
            A cell is either a score dict or the ``str()`` of one
            (single-quoted, as read back from CSV).

    Returns:
        A DataFrame with one row per k and the columns ``k``, ``rouge1_avg``,
        ``rouge2_avg``, ``rougeL_avg``, ``rougeLsum_avg``, ``meteor_avg``,
        ``bleu_avg``, ``precision1_avg``..``precision4_avg``,
        ``brevity_penalty_avg``, ``length_ratio_avg``,
        ``translation_length_avg`` and ``reference_length_avg``.
    """
    def parse(cell):
        # In-memory frames hold the dict itself; CSV cells hold its str() form,
        # which becomes valid JSON once the quotes are swapped.
        if isinstance(cell, dict):
            return cell
        return json.loads(cell.replace("'", "\""))

    def mean_of(records, pick):
        return pd.Series([pick(record) for record in records], dtype="float64").mean()

    # Count the sentence-based ROUGE columns instead of assuming that the
    # frame consists of exactly four columns per k.
    rouge_prefix = "rouge_score_sb_k"
    num_k = sum(str(name).startswith(rouge_prefix) for name in dataframe.columns)

    results = []

    for k in range(1, num_k + 1):
        # Parse every cell once and reuse the dicts for all derived averages.
        rouge_records = [parse(cell) for cell in dataframe[f'rouge_score_sb_k{k}']]
        meteor_records = [parse(cell) for cell in dataframe[f'meteor_score_sb_k{k}']]
        bleu_records = [parse(cell) for cell in dataframe[f'bleu_score_sb_k{k}']]

        row = {"k": k}

        # ROUGE averages
        for key in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
            row[f"{key}_avg"] = mean_of(rouge_records, lambda record, key=key: record[key])

        # METEOR average
        row["meteor_avg"] = mean_of(meteor_records, lambda record: record["meteor"])

        # BLEU average
        row["bleu_avg"] = mean_of(bleu_records, lambda record: record["bleu"])

        # Averages of the four entries of the BLEU "precisions" list
        for position in range(4):
            row[f"precision{position + 1}_avg"] = mean_of(
                bleu_records, lambda record, position=position: record["precisions"][position]
            )

        # Averages of the remaining BLEU statistics
        for key in ("brevity_penalty", "length_ratio", "translation_length", "reference_length"):
            row[f"{key}_avg"] = mean_of(bleu_records, lambda record, key=key: record[key])

        results.append(row)

    results_df = pd.DataFrame(results)

    return results_df

## Sliding Window Based Methods

In [None]:
def compute_scores_sw(dataframe, window, overlap):
    """Compute per-question ROUGE, METEOR and BLEU scores for each sliding-window top-k column.

    Args:
        dataframe: Frame with the reference text in ``answer`` and the
            retrieved answers in ``answer_sw_{window}_{overlap}_k{k}`` for
            k = 1..K.
        window: Window size that is part of the answer column names.
        overlap: Overlap that is part of the answer column names.

    Returns:
        A new DataFrame that holds, for every k, the columns
        ``rouge_score_sw_{window}_{overlap}_k{k}``,
        ``meteor_score_sw_{window}_{overlap}_k{k}`` and
        ``bleu_score_sw_{window}_{overlap}_k{k}`` (the values returned by
        ``evaluate_with_hf``) plus the scored
        ``answer_sw_{window}_{overlap}_k{k}`` values.
    """
    # Count only this configuration's answer columns. Deriving K from the total
    # column count raises a KeyError as soon as the frame carries any other
    # column, e.g. the answer columns of a different overlap.
    answer_prefix = f"answer_sw_{window}_{overlap}_k"
    num_k = sum(str(name).startswith(answer_prefix) for name in dataframe.columns)

    res_df = pd.DataFrame()

    for k in range(1, num_k + 1):
        # Fetch both columns once per k instead of once per row.
        predictions = dataframe[f"{answer_prefix}{k}"].values
        references = dataframe["answer"].values

        rouge_scores = []
        meteor_scores = []
        bleu_scores = []

        for i in tqdm(range(0,len(dataframe)), desc=f"(k = {k}) Computing scores {window}-{overlap}"):
            rouge_score, meteor_score, bleu_score = evaluate_with_hf(predictions[i], references[i])

            rouge_scores.append(rouge_score)
            meteor_scores.append(meteor_score)
            bleu_scores.append(bleu_score)

        res_df[f"rouge_score_sw_{window}_{overlap}_k{k}"] = rouge_scores
        res_df[f"meteor_score_sw_{window}_{overlap}_k{k}"] = meteor_scores
        res_df[f"bleu_score_sw_{window}_{overlap}_k{k}"] = bleu_scores
        # Keep the answer value itself; a stray trailing comma used to wrap
        # every answer in a 1-tuple.
        res_df[f"answer_sw_{window}_{overlap}_k{k}"] = list(predictions)

    return res_df

In [None]:
def compute_mean_scores_sw(dataframe, window, overlap):
    """Average the per-question sliding-window scores for every k.

    Args:
        dataframe: Frame with the columns
            ``rouge_score_sw_{window}_{overlap}_k{k}``,
            ``meteor_score_sw_{window}_{overlap}_k{k}`` and
            ``bleu_score_sw_{window}_{overlap}_k{k}`` for k = 1..K. A cell is
            either a score dict or the ``str()`` of one (single-quoted, as
            read back from CSV).
        window: Window size that is part of the score column names.
        overlap: Overlap that is part of the score column names.

    Returns:
        A DataFrame with one row per k and the columns ``k``, ``rouge1_avg``,
        ``rouge2_avg``, ``rougeL_avg``, ``rougeLsum_avg``, ``meteor_avg``,
        ``bleu_avg``, ``precision1_avg``..``precision4_avg``,
        ``brevity_penalty_avg``, ``length_ratio_avg``,
        ``translation_length_avg`` and ``reference_length_avg``.
    """
    def parse(cell):
        # In-memory frames hold the dict itself; CSV cells hold its str() form,
        # which becomes valid JSON once the quotes are swapped.
        if isinstance(cell, dict):
            return cell
        return json.loads(cell.replace("'", "\""))

    def mean_of(records, pick):
        return pd.Series([pick(record) for record in records], dtype="float64").mean()

    # Count the ROUGE columns of this configuration instead of assuming that
    # the frame consists of exactly four columns per k.
    rouge_prefix = f"rouge_score_sw_{window}_{overlap}_k"
    num_k = sum(str(name).startswith(rouge_prefix) for name in dataframe.columns)

    results = []

    for k in range(1, num_k + 1):
        # Parse every cell once and reuse the dicts for all derived averages.
        rouge_records = [parse(cell) for cell in dataframe[f'rouge_score_sw_{window}_{overlap}_k{k}']]
        meteor_records = [parse(cell) for cell in dataframe[f'meteor_score_sw_{window}_{overlap}_k{k}']]
        bleu_records = [parse(cell) for cell in dataframe[f'bleu_score_sw_{window}_{overlap}_k{k}']]

        row = {"k": k}

        # ROUGE averages
        for key in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
            row[f"{key}_avg"] = mean_of(rouge_records, lambda record, key=key: record[key])

        # METEOR average
        row["meteor_avg"] = mean_of(meteor_records, lambda record: record["meteor"])

        # BLEU average
        row["bleu_avg"] = mean_of(bleu_records, lambda record: record["bleu"])

        # Averages of the four entries of the BLEU "precisions" list
        for position in range(4):
            row[f"precision{position + 1}_avg"] = mean_of(
                bleu_records, lambda record, position=position: record["precisions"][position]
            )

        # Averages of the remaining BLEU statistics
        for key in ("brevity_penalty", "length_ratio", "translation_length", "reference_length"):
            row[f"{key}_avg"] = mean_of(bleu_records, lambda record, key=key: record[key])

        results.append(row)

    results_df = pd.DataFrame(results)

    return results_df

In [None]:
def get_response(resp):
    """Return the passages found in the first delimited section of ``resp``.

    The section is the text that follows the first dashed divider line (up to
    the next divider, if any); it is split into passages on blank lines.
    Raises ``IndexError`` when ``resp`` contains no divider.
    """
    divider = "\n---------------------\n"
    # Only the piece right after the first divider is used, so two splits suffice.
    sections = resp.split(divider, 2)
    context = sections[1]
    return context.split("\n\n")

def sw_query(index, df, top_k, window, overlap, min_k=1):
    """Query the sliding-window index with every question for each k in min_k..top_k.

    For every k a column ``answer_sw_{window}_{overlap}_k{k}`` holding the
    parsed responses is added to ``df`` itself (the caller's frame is modified
    in place).

    Args:
        index: Object providing ``as_query_engine(similarity_top_k=...)``.
        df: Frame with a ``question`` column.
        top_k: Largest ``similarity_top_k`` to query with (inclusive).
        window: Window size, used in the column name and the progress label.
        overlap: Overlap, used in the column name and the progress label.
        min_k: Smallest ``similarity_top_k`` to query with.

    Returns:
        The columns of ``df`` whose name contains "answer" (case-insensitive).
    """
    for k in range(min_k, top_k+1):
        engine = index.as_query_engine(similarity_top_k=k)
        progress = tqdm(df["question"].values, desc=f"({window}-{overlap} token, k = {k}) Get responses")

        collected = []
        for question in progress:
            raw_response = engine.query(question).response
            collected.append(get_response(raw_response))

        df[f"answer_sw_{window}_{overlap}_k{k}"] = collected

    is_answer_column = df.columns.str.contains('answer', case=False)
    return df.loc[:, is_answer_column]


# Load Data

In [None]:
# Root directory of the validation set, the persisted vector stores and the
# result CSVs that the cells below read and write.
dir_data = "../data/"

# No LLM is configured.
# NOTE(review): presumably the query responses then echo the prompt with the
# retrieved context, which get_response parses — confirm against llama_index.
Settings.llm = None
# Embedding model used by the query engines.
# NOTE(review): presumably has to match the model the persisted indexes were
# built with — confirm.
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L12-v2")

# Dataframes

## Token Based

In [None]:
# Token sizes to evaluate; each one selects its own "token_based_{size}" vector store.
token_size_list = [512, 128, 64, 32, 16]

In [None]:
# 128 token based cross validation medquad variants

# medquad = medquad[:1000] #v1
# medquad = medquad[2000:3000] #v2
# medquad = medquad[4000:5000] #v3
# medquad = medquad[6000:7000] #v4
# medquad = medquad[8000:9000] #v5

Generate the answers

In [None]:
# Load the validation questions and keep the first 1000 records.
medquad = pd.read_json(f"{dir_data}validations/mqdquad.json", orient="records")
medquad = medquad[:1000]

for size in token_size_list:
    storage_context_tb = StorageContext.from_defaults(persist_dir=f"{dir_data}vectors/token_based_{size}")
    index_tb = load_index_from_storage(storage_context_tb)

    # tb_query adds its answer columns to the frame it receives. Hand it a
    # fresh copy per size so that each CSV only contains the answers of its
    # own size; otherwise the columns of all earlier sizes pile up and the
    # column-count based scoring step fails on the later files.
    answer_df = tb_query(index_tb, medquad.copy(), 10, size)
    answer_df.to_csv(f"{dir_data}new validation datas/1000_tb_{size}_answers.csv", index=False)

Compute the scores

In [None]:
# For every token size: read the generated answers, score them per question
# and store the scores in a CSV next to the answers.
for size in token_size_list:
    df_tb = pd.read_csv(f"{dir_data}new validation datas/1000_tb_{size}_answers.csv")

    scores_df = compute_scores_tb(df_tb, size)

    scores_df.to_csv(f"{dir_data}new validation datas/1000_tb_{size}_scores.csv", index=False)

Compute the mean scores

In [None]:
# For every token size: read the per-question scores back from CSV (the score
# dicts arrive as strings) and store their per-k averages.
for size in token_size_list:
    scores_df = pd.read_csv(f"{dir_data}new validation datas/1000_tb_{size}_scores.csv")

    mean_df = compute_mean_scores_tb(scores_df, size)
    mean_df.to_csv(f"{dir_data}new validation datas/1000_tb_{size}_scores_avg.csv", index=False)

## Sentence Based

Generate the answers

In [None]:
# Reload the validation questions so the frame carries no columns from earlier cells,
# and keep the first 1000 records.
medquad = pd.read_json(f"{dir_data}validations/mqdquad.json", orient="records")
medquad = medquad[:1000]

# Load the persisted sentence-based index.
storage_context_sb = StorageContext.from_defaults(persist_dir=f"{dir_data}vectors/sentence_based")
index_sb = load_index_from_storage(storage_context_sb)

# Query with similarity_top_k = 1..10 and store the answer columns.
answer_df = sb_query(index_sb, medquad, 10)
answer_df.to_csv(f"{dir_data}new validation datas/1000_sb_answers.csv", index=False)


Compute the scores

In [None]:
# Read the sentence-based answers, score them per question and store the scores.
answer_df = pd.read_csv(f"{dir_data}new validation datas/1000_sb_answers.csv")

scores_df = compute_scores_sb(answer_df)

scores_df.to_csv(f"{dir_data}new validation datas/1000_sb_scores.csv", index=False)

Compute the mean scores

In [None]:
# Read the per-question sentence-based scores back from CSV and store their per-k averages.
scores_df = pd.read_csv(f"{dir_data}new validation datas/1000_sb_scores.csv")

mean_df = compute_mean_scores_sb(scores_df)

mean_df.to_csv(f"{dir_data}new validation datas/1000_sb_scores_avg.csv", index=False)

## Sliding Window

In [None]:
# Sliding-window configuration: a single window size that is combined with
# each of the overlaps below to select a vector store and name the result files.
window = 128
overlaps = [64, 32, 16]

Generate the answers

In [None]:
# Load the validation questions and keep the first 1000 records.
medquad = pd.read_json(f"{dir_data}validations/mqdquad.json", orient="records")
medquad = medquad[:1000]

for o in overlaps:
    storage_context_sw = StorageContext.from_defaults(persist_dir=f"{dir_data}vectors/token_based_sliding_window_{window}_{o}")
    index_sw = load_index_from_storage(storage_context_sw)

    # sw_query adds its answer columns to the frame it receives. Hand it a
    # fresh copy per overlap so that each CSV only contains the answers of its
    # own configuration; otherwise the columns of all earlier overlaps pile up
    # and the column-count based scoring step fails on the later files.
    answer_df = sw_query(index_sw, medquad.copy(), 10, window, o)

    answer_df.to_csv(f"{dir_data}new validation datas/1000_sw_{window}_{o}_answers.csv", index=False)

Compute the scores

In [None]:
# For every overlap: read the generated answers, score them per question and
# store the scores in a CSV next to the answers.
for o in overlaps:
    sw_df = pd.read_csv(f"{dir_data}new validation datas/1000_sw_{window}_{o}_answers.csv")

    scores_df = compute_scores_sw(sw_df, window, o)

    scores_df.to_csv(f"{dir_data}new validation datas/1000_sw_{window}_{o}_scores.csv", index=False)

Compute the mean scores

In [None]:
# For every overlap: read the per-question scores back from CSV (the score
# dicts arrive as strings) and store their per-k averages.
for o in overlaps:
    scores_df = pd.read_csv(f"{dir_data}new validation datas/1000_sw_{window}_{o}_scores.csv")

    mean_df = compute_mean_scores_sw(scores_df, window, o)

    mean_df.to_csv(f"{dir_data}new validation datas/1000_sw_{window}_{o}_scores_avg.csv", index=False)