# Set up Phoenix

In [None]:
from phoenix.otel import register
import os
from openinference.instrumentation.openai import OpenAIInstrumentor

project_name = "RAG_Rio"

# Read the Phoenix API key used for tracing.
# The key is stripped because a text file usually ends with a newline, and a
# newline inside the header / environment values below would corrupt them.
with open('phoenix_key.txt', 'r') as file:
    phoenix_key = file.read().strip()

# Expose the key, collector endpoint and project name through the environment
# variables read by the Phoenix client and the OTLP exporter.
os.environ["PHOENIX_CLIENT_HEADERS"] = f"api_key={phoenix_key}"
os.environ["PHOENIX_API_KEY"] = phoenix_key
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "https://app.phoenix.arize.com"
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"api_key={phoenix_key}"
os.environ['PHOENIX_PROJECT_NAME'] = project_name

# configure the Phoenix tracer
tracer_provider = register(
  project_name=project_name, # Default is 'default'
  auto_instrument=True # Auto-instrument your app based on installed OI dependencies
)
# Explicitly instrument the OpenAI client with the same tracer provider.
OpenAIInstrumentor().instrument(tracer_provider = tracer_provider)
tracer = tracer_provider.get_tracer(__name__)


# Basic imports and setup

In [None]:
import nest_asyncio
import pandas as pd
import phoenix as px
from basic_rag import MilvusKnowledgeStorage, get_paragraphs, search_pdf
from time import sleep
import datetime
from langchain_experimental.text_splitter import SemanticChunker
import matplotlib.pyplot as plt
import re
from sklearn.metrics.pairwise import cosine_similarity
from langchain_huggingface import HuggingFaceEmbeddings

# Patch asyncio so event loops can be nested (the notebook kernel already runs one).
nest_asyncio.apply()

## Load best dataset from Phoenix

The answers were generated with the Anthropic Claude 3 Opus model. No noise.

In [None]:
# Phoenix client used below to upload the dataset or fetch it by name.
phoenix_client = px.Client()

In [None]:
# The dataset name doubles as the base name of the local JSON file it is read from.
dataset_phoenix_name = "qa_data_rio"
df_best = pd.read_json(f"./{dataset_phoenix_name}.json")

In [None]:
# Upload the dataset; if the upload fails (presumably because a dataset with
# this name already exists -- TODO confirm), fall back to fetching it by name.
try:
    dataset_best = phoenix_client.upload_dataset(
        dataframe=df_best,
        dataset_name=dataset_phoenix_name,
        input_keys=["question"],
        output_keys=["human",  "chatgpt"],
    )
except Exception as e:
    # Report why the upload failed instead of swallowing the error silently,
    # so an unrelated failure (network, auth) is visible before the fallback.
    print(f"Dataset upload failed ({e!r}); loading existing dataset '{dataset_phoenix_name}' instead.")
    dataset_best = phoenix_client.get_dataset(name=dataset_phoenix_name)

# Write Phoenix evaluators

In [None]:
from phoenix.experiments.evaluators.base import EvaluationResult, Evaluator

In [None]:
from typing import Any, Dict
import torch
from bert_score import score

class BERTScore(Evaluator):
    """Phoenix evaluator that scores an answer against the reference with BERTScore F1."""
    name="BERT Score"
    def evaluate(self, output: str, expected: Dict[str, Any], **kwargs) -> EvaluationResult:
        """Return the BERTScore F1 between ``output`` and ``expected["chatgpt"]``.

        Args:
            output: The answer produced by the task under evaluation.
            expected: The dataset example's expected outputs; the ``"chatgpt"``
                entry is used as the reference answer.
            **kwargs: Ignored; accepted so extra evaluator arguments do not fail.

        Returns:
            EvaluationResult whose ``score`` is the F1 value as a plain float.
        """
        expected_answer = expected["chatgpt"]

        # Compute BERTScore: precision, recall and F1 (only F1 is reported).
        # NOTE(review): bert_score picks a default layer per known model_type;
        # confirm "ProsusAI/finbert" is supported or pass num_layers explicitly.
        _, _, f1 = score([output], [expected_answer], lang="en", model_type="ProsusAI/finbert")
        # One candidate/reference pair was scored, so take the single element
        # and convert it to a Python float rather than a NumPy scalar.
        return EvaluationResult(score=float(f1[0]))

In [None]:
import tensorflow_hub as hub
from scipy.spatial import distance

class USESimilarity(Evaluator):
    """Evaluator scoring an answer by the cosine similarity of its sentence embedding to the reference's."""
    name="USE"

    def __init__(self):
        super().__init__()
        # Load the Universal Sentence Encoder once per evaluator instance.
        self.embed = hub.load("https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2")

    def evaluate(self, output: str, expected, **kwargs) -> EvaluationResult:
        """Embed ``output`` and ``expected["chatgpt"]`` together and return their cosine similarity."""
        vectors = self.embed([output, expected["chatgpt"]])
        answer_vec = vectors[0]
        reference_vec = vectors[1]
        # scipy returns the cosine *distance*; similarity is its complement.
        cosine_distance = distance.cosine(answer_vec, reference_vec)
        return EvaluationResult(score=1 - cosine_distance)


uses = USESimilarity()

In [None]:
from nltk.translate.meteor_score import single_meteor_score
import nltk
"""
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
"""

import re
from nltk.corpus import stopwords

# English stop words dropped by my_tokkenizer; needs the NLTK 'stopwords'
# corpus (see the disabled nltk.download calls above).
stop_word_list = stopwords.words('english')


def my_tokkenizer(text):
    """Normalise ``text`` and split it into tokens with English stop words removed.

    Steps: remove a fixed set of punctuation characters, lowercase, strip
    surrounding whitespace, tokenize with NLTK's ``WordPunctTokenizer`` and
    drop every token found in the module-level ``stop_word_list``.

    Args:
        text: The string to tokenize.

    Returns:
        list: The remaining tokens, in their original order.
    """
    # Kept as a function-local import, as in the original cell.
    from nltk.tokenize import WordPunctTokenizer

    # Remove special characters.
    pattern = r"[{}]".format("(),.;:%\"")
    text = re.sub(pattern, "", text)

    # Lowercase and trim surrounding whitespace.
    text = text.lower().strip()

    # Split into words.
    tokens = WordPunctTokenizer().tokenize(text)

    # Remove stop words; a set gives O(1) membership tests instead of
    # scanning the whole stop-word list for every token.
    stop_words = set(stop_word_list)

    # No lemmatization here: METEOR handles that itself.
    return [token for token in tokens if token not in stop_words]

class Meteor(Evaluator):
    """Phoenix evaluator that scores an answer against the reference with METEOR."""
    name="METEOR"
    def evaluate(self, output: str, expected, **kwargs) -> EvaluationResult:
        """Return the METEOR score of ``output`` against ``expected["chatgpt"]``.

        Args:
            output: The answer produced by the task (the hypothesis).
            expected: The dataset example's expected outputs; the ``"chatgpt"``
                entry is the reference answer.
            **kwargs: Ignored.

        Returns:
            EvaluationResult carrying the METEOR score.
        """
        reference_tokens = my_tokkenizer(expected["chatgpt"])
        hypothesis_tokens = my_tokkenizer(output)
        # single_meteor_score takes (reference, hypothesis) in that order and
        # is not symmetric, so the expected answer must be passed first.
        meteor = single_meteor_score(reference_tokens, hypothesis_tokens)
        return EvaluationResult(score=meteor)

In [None]:
from scipy.spatial.distance import euclidean
from sentence_transformers import SentenceTransformer, SimilarityFunction

class SBERTFinance(Evaluator):
    """Evaluator scoring answers by cosine similarity of finance-domain sentence embeddings."""
    name="SBERT finance"

    def __init__(self):
        super().__init__()
        # The model is configured to compare embeddings with cosine similarity.
        self.model = SentenceTransformer(
            "FinLang/finance-embeddings-investopedia",
            similarity_fn_name=SimilarityFunction.COSINE,
        )

    def evaluate(self, output: str, expected, **kwargs) -> EvaluationResult:
        """Encode ``output`` and ``expected["chatgpt"]`` and return their similarity."""
        answer_vectors = self.model.encode([output])
        reference_vectors = self.model.encode([expected["chatgpt"]])
        pairwise = self.model.similarity(answer_vectors, reference_vectors)
        # One sentence on each side, so the score is the [0][0] entry.
        return EvaluationResult(score=pairwise.numpy(force=True)[0][0])


finance = SBERTFinance()

In [None]:
class SBERTQwen4(Evaluator):
    """Evaluator scoring answers by cosine similarity of Qwen3-Embedding-4B sentence embeddings."""
    name="SBERT Qwen3 4B"

    def __init__(self):
        super().__init__()
        self.model = SentenceTransformer(
            "Qwen/Qwen3-Embedding-4B",
            similarity_fn_name=SimilarityFunction.COSINE,
            tokenizer_kwargs={"padding_side": "left"},
        )

    def evaluate(self, output: str, expected, **kwargs) -> EvaluationResult:
        """Encode ``output`` and ``expected["chatgpt"]`` and return their similarity."""
        candidate = self.model.encode([output])
        reference = self.model.encode([expected["chatgpt"]])
        scores = self.model.similarity(candidate, reference)
        # One sentence on each side, so the score is the [0][0] entry.
        return EvaluationResult(score=scores.numpy(force=True)[0][0])

class SBERTQwen06(Evaluator):
    """Evaluator scoring answers by cosine similarity of Qwen3-Embedding-0.6B sentence embeddings."""
    name="SBERT Qwen3 0.6B"

    def __init__(self):
        super().__init__()
        self.model = SentenceTransformer(
            "Qwen/Qwen3-Embedding-0.6B",
            similarity_fn_name=SimilarityFunction.COSINE,
            model_kwargs={"device_map": "auto"},
            tokenizer_kwargs={"padding_side": "left"},
        )

    def evaluate(self, output: str, expected, **kwargs) -> EvaluationResult:
        """Encode ``output`` and ``expected["chatgpt"]`` and return their similarity."""
        generated = self.model.encode([output])
        target = self.model.encode([expected["chatgpt"]])
        similarity_matrix = self.model.similarity(generated, target)
        # One sentence on each side, so the score is the [0][0] entry.
        return EvaluationResult(score=similarity_matrix.numpy(force=True)[0][0])


# Only the 0.6B variant is instantiated; the 4B evaluator is left disabled.
# qwen4 = SBERTQwen4()
qwen06 = SBERTQwen06()

# Szomszédos mondatok szemantikus távolsága

In [None]:
# Texts for ticker "RIO"; indexed by PDF path further below.
# NOTE(review): the return type of search_pdf is not visible here -- confirm.
df_texts = search_pdf(ticker="RIO")

In [None]:
def szemantikus_tavolsag(text: str, embeddings) -> list:
    """
    Compute and plot the semantic distance between adjacent sentences of a text.

    The input text is split into sentences, every sentence is embedded, and for
    each pair of neighbouring sentences the distance ``1 - cosine similarity``
    is computed. The distances are shown as a histogram.

    Parameters:
        text (str): The input text to analyse.
        embeddings: An embedding object that provides an ``embed_documents`` method.

    Returns:
        list: The semantic distance of each adjacent sentence pair, in text
        order (empty when the text has fewer than two sentences).
    """

    # Split the text the same way the SemanticChunker class does.
    sentences = re.split(r"(?<=[.?!])\s+", text)

    # Embed the individual sentences.
    sentence_embeddings = embeddings.embed_documents(sentences)

    # Semantic distance of each sentence to the one that follows it.
    differences = [
        1 - cosine_similarity([current], [following])[0][0]
        for current, following in zip(sentence_embeddings, sentence_embeddings[1:])
    ]

    # Plot the distribution of the distances.
    plt.hist(differences, bins=10, edgecolor="black")
    plt.title("A szomszédos mondatok szemantikus távolsága")
    plt.xlabel("Távolság (1 - koszinusz-hasonlóság)")
    plt.ylabel("Frekvencia")
    plt.show()

    return differences

# Embedding model used to embed the individual sentences.
embeddings = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B"
)
# Plot the adjacent-sentence distances for one document.
# NOTE(review): assumes df_texts[...] yields that document's text as a str -- confirm against search_pdf.
distances = szemantikus_tavolsag(df_texts['knowledge/rio_20210323_20210323_qa_1.pdf'], embeddings)

# Evaluate basic RAG

In [None]:
from phoenix.experiments.types import Example
from basic_rag import MyRAG
from phoenix.experiments import run_experiment


def task(input, expected) -> str:
    """Answer one dataset example's question with the RAG pipeline.

    Args:
        input: The example's inputs; only ``input['question']`` is read.
        expected: The example's expected outputs; unused here.

    Returns:
        Whatever ``MyRAG().invoke`` returns for the question.
    """
    # A new MyRAG instance is created for every call.
    return MyRAG().invoke(input['question'])


In [None]:
result_file = "rio_validation.json"

# Resume from previously saved results when the file exists; otherwise start
# with an empty frame.
if os.path.exists(result_file):
    df_all_result = pd.read_json(result_file)
else:
    df_all_result = pd.DataFrame()
df_all_result.head()

In [None]:
# Embedding models used by the semantic splitter configurations.
_QWEN_MODEL = "Qwen/Qwen3-Embedding-0.6B"
_FINANCE_MODEL = "FinLang/finance-embeddings-investopedia"


def _semantic(model_name, threshold_type, threshold_amount, **extra):
    """Build a "Semantic" splitter setting; extra keys are appended last."""
    return {
        "splitter": "Semantic",
        "model_name": model_name,
        "breakpoint_threshold_type": threshold_type,
        "breakpoint_threshold_amount": threshold_amount,
        **extra,
    }


def _agentic(max_content):
    """Build an "Agentic" splitter setting without markdown pre-splitting."""
    return {
        "splitter": "Agentic",
        "max_content": max_content,
        "markdown_presplit": False,
    }


def _recursive(chunk_overlap, chunk_size):
    """Build a "RecursiveCharacter" splitter setting."""
    return {
        "splitter": "RecursiveCharacter",
        "chunk_overlap": chunk_overlap,
        "chunk_size": chunk_size,
    }


# Settings to evaluate. Key order inside each dict matters: str(setting) is
# used elsewhere in this file as the identifier of an already-tested setting.
splitting_sets = [
    _semantic(_QWEN_MODEL, "percentile", 70.0),
    _semantic(_QWEN_MODEL, "percentile", 80.0),
    _semantic(_QWEN_MODEL, "standard_deviation", 1.25),
    _semantic(_FINANCE_MODEL, "percentile", 70.0),
    _semantic(_FINANCE_MODEL, "standard_deviation", 1.25),
    _agentic(300),
    _agentic(60),
    _agentic(200),
    _recursive(0, 1000),
    _recursive(300, 3000),
    _semantic(_FINANCE_MODEL, "percentile", 70.0, markdown_presplit=False),
    _semantic(_FINANCE_MODEL, "standard_deviation", 1.25, markdown_presplit=False),
]

In [None]:
# Filter out settings which were tested before.
# The set of already-tested settings is built once instead of on every loop
# iteration; an empty result frame has no "setting" column to read.
if df_all_result.shape[0] == 0:
    tested_settings = set()
else:
    tested_settings = set(df_all_result["setting"])

# Settings are identified by their str() form, which is how they are stored.
filtered_splitting_sets = [
    splitting_setting
    for splitting_setting in splitting_sets
    if str(splitting_setting) not in tested_settings
]
print(filtered_splitting_sets)

In [None]:
# Run one experiment per not-yet-tested splitting setting and persist the
# scores after each run, so an interrupted session loses at most one run.
for splitting_setting in filtered_splitting_sets:
    start_timestamp = datetime.datetime.now()

    # Split the source documents according to the current setting.
    # (`max_lenght` is the keyword as spelled at this call site; it must match
    # get_paragraphs' own parameter name.)
    paragraphs = get_paragraphs(
        ticker="RIO",
        max_lenght=200,
        **splitting_setting  # Unpack the rest of the settings dynamically
    )

    try:
        knowledge = MilvusKnowledgeStorage()
    except Exception:
        # Retry once after a short pause. `except Exception` (rather than a
        # bare `except`) lets KeyboardInterrupt / SystemExit propagate.
        sleep(1)
        knowledge = MilvusKnowledgeStorage()
    knowledge.initialize_knowledge_storage()

    # Load the chunks and their metadata into the knowledge storage.
    documents = [ doc.page_content for doc in paragraphs]
    metadata = [ doc.metadata for doc in paragraphs]
    knowledge.save(documents=documents, metadata=metadata)

    data_load_timestamp = datetime.datetime.now()

    experiment = run_experiment(
        dataset_best,
        task,
        experiment_name="rag-experiment",
        evaluators=[qwen06, BERTScore() ],
        experiment_metadata=splitting_setting
    )

    try:
        evaluation_result = experiment.get_evaluations()
    except Exception:
        # Single immediate retry; a second failure propagates.
        evaluation_result = experiment.get_evaluations()

    # Keep only evaluator name and score, tagged with the setting (as a
    # string) and the time spent preparing and loading the data.
    evaluation_result_filtered = pd.DataFrame({
        "name": evaluation_result["name"].values,
        "score": evaluation_result["score"].values,
        "setting": str(splitting_setting),
        "data_load_time": (data_load_timestamp-start_timestamp).total_seconds()
    })

    df_all_result = pd.concat([df_all_result, evaluation_result_filtered])
    df_all_result.to_json(result_file, orient="records")

In [None]:
import plotly.express as plotly_express
import plotly.graph_objects as plotly_go

# Plotly template shared by the figures: fonts, white backgrounds and the
# G10 qualitative colour sequence.
custom_template = {
    "layout": plotly_go.Layout(
        font=dict(family="Nunito", size=12, color="#707070"),
        title=dict(
            font=dict(family="Lato", size=18, color="#1f1f1f"),
        ),
        plot_bgcolor="#ffffff",
        paper_bgcolor="#ffffff",
        colorway=plotly_express.colors.qualitative.G10,
    )
}

def format_title(title, subtitle=None, subtitle_font_size=14):
    """Return an HTML figure title: bold ``title`` plus an optional subtitle line.

    Args:
        title: Main title text, wrapped in ``<b>``.
        subtitle: Optional second line; omitted when falsy.
        subtitle_font_size: Subtitle font size in pixels.

    Returns:
        str: The HTML snippet for the title.
    """
    heading = f'<b>{title}</b>'
    if subtitle:
        styled_subtitle = f'<span style="font-size: {subtitle_font_size}px;">{subtitle}</span>'
        return f'{heading}<br>{styled_subtitle}'
    return heading

In [None]:
# Order the rows by evaluator name, then by setting (in place).
df_all_result.sort_values(["name", "setting"], inplace=True)

In [None]:
# Box plot of the scores: one group per evaluator, one box per setting.
axis_labels = {"name": "Evaluation Type", "score": "Similarity Score",  "setting": "Setting"}
plot_title = format_title("Szöveg darabolása", "RAG megvalósítás teljesítménye")
fig = plotly_express.box(
    df_all_result,
    x="name",
    y="score",
    color="setting",
    title=plot_title,
    labels=axis_labels,
    template=custom_template,
)

# Axis titles, slanted tick labels, and no legend.
fig.update_layout(
    xaxis_title="Mérési módszer",
    yaxis_title="Hasonlóság",
    xaxis=dict(tickangle=45),
    showlegend=False,
)

# Display the figure, then save it as HTML next to the JSON results.
fig.show()
html_path = result_file.replace(".json", ".html")
fig.write_html(html_path)

In [None]:
# Persist the (now sorted) results as a JSON list of records.
df_all_result.to_json(result_file, orient="records")

# Cleaning up

To clean up Hugging Face models that are no longer needed, run the following command: `huggingface-cli delete-cache`