In [5]:
from rag_module.rag import RAG
import mlflow
import inspect
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Set the `embedding_path` environment variable to the ELOQ news benchmark
# embeddings directory.
# NOTE(review): presumably `rag_module` reads this variable when a RAG instance
# is constructed, so this cell must run before any RAG is built — confirm.
os.environ["embedding_path"] = "./embeddings/ELOQ_news/"

In [2]:
# Load the ELOQ benchmark files: `silver` is sampled for training, `gold` for
# testing.
silver = pd.read_csv("eval_data/ScopeQA/ELOQ_silver.csv")
gold = pd.read_csv("eval_data/ScopeQA/ELOQ_gold.csv")

# A fixed random_state keeps both samples identical across kernel restarts.
trainset = silver.sample(500, random_state=1)
print('trainset:')
# Double-quoted f-string on purpose: reusing single quotes inside a
# single-quoted f-string is a SyntaxError on Python versions before 3.12.
print(f"{trainset['llm_confusion_label'].value_counts()}\n")

testset = gold.sample(50, random_state=1)
print('testset:')
print(testset['llm_confusion_label'].value_counts())

trainset:
llm_confusion_label
yes    278
no     222
Name: count, dtype: int64

testset:
llm_confusion_label
yes    25
no     25
Name: count, dtype: int64


In [33]:
# set models
# Each entry pairs a configured RAG pipeline ("model") with a short "name".
# `RAG` is the class imported from rag_module.rag at the top of the notebook;
# the name `explainaRAG` used here before is not imported or defined anywhere
# in this notebook and raises NameError on a fresh kernel.
rags = [{"model": RAG(hybrid_embedder=True, reranker_name="flashrank", retrieve_top_k=5), "name": "topk5"}]

# set mlflow experiment (kept as the last expression so the Experiment is displayed)
mlflow.set_experiment("eval_scope_models")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...


<Experiment: artifact_location='file:///c:/Users/SjoerdO/OneDrive%20-%20Info%20Support%20B.V/Documents/Afstudeeronderzoek/RAG/mlruns/316985839508092231', creation_time=1744120060800, experiment_id='316985839508092231', last_update_time=1744120060800, lifecycle_stage='active', name='eval_scope_models', tags={}>

In [34]:
def closest_chunk_scores(rag, df):
    """Annotate ``df`` in place with the best retrieval scores per question.

    For every value ``q`` in ``df['question']`` these columns are written:

    * ``top_sparse_score``: ``rag.embedder.search_bm25(q, top_k=1)[0][0]``
    * ``top_dense_score``: ``rag.embedder.search(q, top_k=1)[0][0][0]``
    * ``top_reranker_doc`` / ``top_reranker_score``: the ``'filename'`` and
      ``'score'`` entries of the first item returned by ``rag.retrieve(q)``

    Nothing is returned; the frame passed in is mutated.
    """
    questions = df['question']

    def best_sparse(question):
        return rag.embedder.search_bm25(question, top_k=1)[0][0]

    def best_dense(question):
        return rag.embedder.search(question, top_k=1)[0][0][0]

    def best_reranked(question):
        return rag.retrieve(question)[0]

    df['top_sparse_score'] = questions.apply(best_sparse)
    df['top_dense_score'] = questions.apply(best_dense)

    # Retrieve once per question, then split the hit into its two columns.
    best_hits = questions.apply(best_reranked)
    df['top_reranker_doc'] = best_hits.apply(lambda hit: hit['filename'])
    df['top_reranker_score'] = best_hits.apply(lambda hit: hit['score'])

def train_model(df, features=['top_reranker_score']):
    """Fit a logistic-regression classifier on the given feature columns.

    Args:
        df: DataFrame containing the ``features`` columns and an
            ``llm_confusion_label`` column used as the target.
        features: Column names used as model inputs.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    inputs, labels = df[features].values, df['llm_confusion_label'].values

    # Constructed with its default settings, then fitted on the raw arrays.
    classifier = LogisticRegression()
    classifier.fit(inputs, labels)
    return classifier

def log_constructor_params(instance, cls):
    """Log the constructor parameters of ``cls`` to the active MLflow run.

    For each named parameter of ``cls.__init__`` (``self`` excluded) the logged
    value is the attribute of the same name on ``instance`` when it exists,
    otherwise the parameter's declared default. Every value is logged as a
    string through ``mlflow.log_param``.

    Parameters without a default are logged when ``instance`` exposes a
    matching attribute and skipped otherwise (there is no value to report).
    Variadic ``*args`` / ``**kwargs`` entries are skipped because they do not
    name a single setting.

    Args:
        instance: Object whose attributes hold the values actually in use.
        cls: Class whose ``__init__`` signature decides which names are logged.
    """
    missing = object()  # sentinel: tells "no such attribute" apart from a stored None
    variadic = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)

    for name, param in inspect.signature(cls.__init__).parameters.items():
        if name == 'self' or param.kind in variadic:
            continue

        # Prefer the live value on the instance over the signature default.
        value = getattr(instance, name, missing)
        if value is missing:
            if param.default is inspect.Parameter.empty:
                # Required parameter that the instance did not keep: nothing to log.
                continue
            value = param.default

        mlflow.log_param(name, str(value))

In [36]:
# Feature columns fed to the logistic regression; all three are written by
# closest_chunk_scores.
features = ['top_reranker_score', 'top_sparse_score', 'top_dense_score']

# One MLflow run per configured RAG model.
for rag in rags:
    with mlflow.start_run(run_name=rag["name"]) as run:

        # retrieve features (adds the score columns to both frames in place)
        closest_chunk_scores(rag['model'], trainset)
        closest_chunk_scores(rag['model'], testset)

        # train model
        model = train_model(trainset, features=features)

        # predict on testset
        testset['logreg_prediction'] = model.predict(testset[features].values)
        # NOTE(review): column 1 of predict_proba is the second entry of
        # model.classes_, presumably 'yes' — confirm.
        testset['logreg_probability'] = model.predict_proba(testset[features].values)[:, 1]

        # get metrics
        # retrieve_accuracy: share of test questions whose top reranked document
        # equals the question's doc_id.
        retrieve_accuracy = accuracy_score(testset['doc_id'], testset['top_reranker_doc'])
        accuracy = accuracy_score(testset['llm_confusion_label'], testset['logreg_prediction'])
        f1 = f1_score(testset['llm_confusion_label'], testset['logreg_prediction'], pos_label='yes')
        precision = precision_score(testset['llm_confusion_label'], testset['logreg_prediction'], pos_label='yes')
        recall = recall_score(testset['llm_confusion_label'], testset['logreg_prediction'], pos_label='yes')

        # log to mlflow: constructor parameters of the first base class, then of
        # the model's own class
        log_constructor_params(rag['model'], rag['model'].__class__.__bases__[0])
        log_constructor_params(rag['model'], rag['model'].__class__)
        mlflow.log_param("model_name", rag["name"])
        # Log the feature list that was actually used for training rather than a
        # hard-coded one, so the run record matches the fitted model.
        mlflow.log_param("features", str(features))
        mlflow.log_param("trainset_size", len(trainset))
        mlflow.log_param("testset_size", len(testset))

        mlflow.sklearn.log_model(model, artifact_path="logreg_model")

        mlflow.log_metric("retrieve_accuracy", retrieve_accuracy)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)

Batches: 100%|██████████| 1/1 [00:00<00:00, 21.76it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 22.04it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.46it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 37.25it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.25it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 37.65it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.34it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 34.42it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.54it/s]
Batches: 1

In [29]:
# Show the LLM confusion labels side by side with the logistic-regression
# predictions for the sampled test questions.
testset[['llm_confusion_label', 'logreg_prediction']]

Unnamed: 0,llm_confusion_label,logreg_prediction
78,yes,yes
33,yes,yes
34,yes,yes
62,no,yes
97,yes,yes


In [30]:
# Pair every feature name with its fitted coefficient and rank the pairs by
# absolute coefficient size, largest first.
coefficients = model.coef_[0]

feature_importance = pd.DataFrame(
    {'feature': features, 'coefficient': coefficients}
).sort_values(by='coefficient', key=abs, ascending=False)

print(feature_importance)


              feature  coefficient
0  top_reranker_score    -0.836298


In [31]:
# List the test rows where the logistic-regression prediction differs from the
# LLM confusion label.
testset[testset['llm_confusion_label'] != testset['logreg_prediction']]

Unnamed: 0,doc_id,q_id,llm_confusion_label,human_confusion_label,llm_defusion_label,human_defusion_label,question,answer,top_sparse_score,top_dense_score,top_reranker_doc,top_reranker_score,logreg_prediction,logreg_probability
62,tech_111,tech_111_0_5,no,no,,,How does Sifan Hassan’s perspective on trainin...,Sifan Hassan's perspective on training and com...,45.619014,0.798171,tech_111,0.966061,yes,0.734595


In [8]:
import numpy as np

In [3]:
# Build a RAG pipeline with hybrid embedding, the flashrank reranker and top-5
# retrieval, with in_scope=True.
# NOTE(review): in_scope presumably switches on the scope classifier inside
# rag.query — confirm against rag_module.
rag = RAG(hybrid_embedder=True, reranker_name="flashrank", retrieve_top_k=5, in_scope=True)

Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...


In [4]:
# Smoke-test the pipeline with a general-knowledge question.
# NOTE(review): the saved output of this cell ends in a ValueError (1 feature
# supplied, 3 expected by the LogisticRegression) — re-run to check whether it
# still reproduces.
rag.query("What is the capital of France?")

Batches: 100%|██████████| 1/1 [00:00<00:00, 11.16it/s]
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


ValueError: X has 1 features, but LogisticRegression is expecting 3 features as input.

In [11]:
# predict from logreg model
import pickle

# Load the trained model
# NOTE: pickle.load can execute arbitrary code; only open model files you trust.
modelpath = './embeddings/ELOQ_news/model.pkl'
with open(modelpath, 'rb') as f:
    model = pickle.load(f)

# Predict using the model on a single hand-written row of three feature values
# NOTE(review): the value order is assumed to match the feature order the model
# was trained with — confirm.
predictions = model.predict(np.array([[0.89, 0.66, 0.66]]))

# Output predictions
print("Predictions:", predictions)


Predictions: ['yes']


In [12]:
# Display the current `model` object.
model

# User testing

In [None]:
# load data
# Reads the same two ELOQ CSV files as the loading cell near the top of the
# notebook, giving this section its own full (unsampled) frames.
silver = pd.read_csv("eval_data/ScopeQA/ELOQ_silver.csv")
gold = pd.read_csv("eval_data/ScopeQA/ELOQ_gold.csv")

In [30]:
# Preview rows at positions 52 through 59 of the silver set.
silver.iloc[52:60]

Unnamed: 0,doc_id,q_id,llm_confusion_label,question
52,news_6,news_6_1_3,yes,How do historical patterns of tropical storms ...
53,news_6,news_6_1_4,yes,What specific climate change factors contribut...
54,news_6,news_6_1_5,yes,Which community initiatives are most effective...
55,news_6,news_6_1_6,yes,What are the recommended evacuation plans for ...
56,news_7,news_7_0_1,no,What motivated Fred and Sheila McCoy to become...
57,news_7,news_7_0_2,no,How did the couple's search efforts contribute...
58,news_7,news_7_0_4,no,What methods were used to attempt to identify ...
59,news_7,news_7_0_5,no,How did the community respond to the shooting ...


In [37]:
# Collect the 'text' of every entry in the embedder's chunk data whose
# 'filename' is 'sport_198'.
[chunk['text'] for chunk in rag.embedder.chunkdata if chunk['filename'] == 'sport_198']

["Aston Villa will be attempting to end a 20-year streak when they play host to Wolverhampton Wanderers in the West Midlands derby on Saturday afternoon. Aston Villa will be hoping to end a 20-year streak when they square off against Wolverhampton Wanderers on Saturday afternoon. Unai Emery has been left delighted with Villa's start to the new season, collecting nine points from their opening four games in the Premier League. Furthermore, they kicked off their first-ever Champions League campaign on Tuesday evening with a 3-0 victory at Swiss side Young Boys. That has started a hectic schedule for Villa, who must now play five more matches in the space of two weeks before the next international break. While they are not necessarily entering new territory given last season's exhausting Europa Conference League campaign, it remains to be seen whether opting for less rotation in higher-profile matches will take its toll. © Imago\n\nWhat streak are Villa looking to end? Having made no chan

In [25]:
# Pick the question stored under index label 56 of the silver set and show it.
q = silver.loc[56, 'question']
print(q)

What motivated Fred and Sheila McCoy to become involved in the search for the highway shooter's remains?


In [7]:
# Build a RAG pipeline with hybrid embedding, the flashrank reranker and top-5
# retrieval (no in_scope argument is passed here).
rag = RAG(hybrid_embedder=True, reranker_name="flashrank", retrieve_top_k=5)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...


In [26]:
# Answer the same question through two entry points so the results can be
# compared in the cells below.
# NOTE(review): judging by the saved log, query_LLM issues only a chat-completion
# request, while query also runs an embedding batch first (presumably retrieval)
# — confirm against rag_module.
LLM_response = rag.query_LLM(q)
RAG_response = rag.query(q)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Batches: 100%|██████████| 1/1 [00:00<00:00, 48.44it/s]
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [27]:
# Show the text returned by rag.query_LLM for the question.
LLM_response

"Fred and Sheila McCoy became involved in the search for the highway shooter's remains due to a personal connection; their son was a victim of the highway shooter in 1997. Motivated by their loss and desire for closure, they joined the efforts to help locate his remains. Their involvement reflects a deep emotional commitment to finding justice and addressing the pain caused by the tragic event."

In [28]:
# Show only the 'answer' entry of the rag.query result.
RAG_response['answer']

"Fred and Sheila McCoy were motivated to become involved in the search for the highway shooter's remains after a shooter attacked an interstate and disappeared, causing fear in the Kentucky community. They decided to take action by searching the rugged terrain and ultimately found what they believe to be the remains of Joseph Couch, the suspected shooter. Fred McCoy mentioned that for one week, they took on the role of bounty hunters, indicating a sense of duty to help their community during this troubling time."

In [29]:
# Show the full rag.query result, including the retrieved 'chunks'.
RAG_response

{'chunks': [{'filename': 'news_7',
   'text': "A couple found the Kentucky highway shooter's remains by being bounty hunters for a week, they say\n\nIn this photo made from video provided by Sheila and Fred McCoy shows the couple while searching for the remains of a suspected highway shooter in London, Ky., Wednesday, Sept. 18, 2024. (Sheila and Fred McCoy via AP)\n\nLOUISVILLE, Ky. (AP) — Days after a shooter attacked an interstate and disappeared, leaving a Kentucky community scared and on guard, Fred and Sheila McCoy decided to lace up their boots for the first time in a long time and spend days in rugged terrain searching until, finally, they found a body. Kentucky State Police credited Fred and Sheila McCoy, who typically spend their retired days creating YouTube videos about the Hatfield-McCoy feud, with helping investigators find what they believe are the remains of Joseph Couch. Couch, 32, is suspected of firing randomly at vehicles on Interstate 75 on Sept. 7, wounding five pe

In [32]:
# Ask a follow-up question about the same people and display the full result.
rag.query('Who are Fred and Sheila McCoy?')

Batches: 100%|██████████| 1/1 [00:00<00:00, 33.37it/s]
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'chunks': [{'filename': 'news_7',
   'text': "A couple found the Kentucky highway shooter's remains by being bounty hunters for a week, they say\n\nIn this photo made from video provided by Sheila and Fred McCoy shows the couple while searching for the remains of a suspected highway shooter in London, Ky., Wednesday, Sept. 18, 2024. (Sheila and Fred McCoy via AP)\n\nLOUISVILLE, Ky. (AP) — Days after a shooter attacked an interstate and disappeared, leaving a Kentucky community scared and on guard, Fred and Sheila McCoy decided to lace up their boots for the first time in a long time and spend days in rugged terrain searching until, finally, they found a body. Kentucky State Police credited Fred and Sheila McCoy, who typically spend their retired days creating YouTube videos about the Hatfield-McCoy feud, with helping investigators find what they believe are the remains of Joseph Couch. Couch, 32, is suspected of firing randomly at vehicles on Interstate 75 on Sept. 7, wounding five pe

In [12]:
# Send an open-ended writing request through rag.query and print only the
# 'answer' entry; the full result stays in `response` for the next cell.
response = rag.query('Write a one-paragraph article about the New York Yankees')
print(response['answer'])

Batches: 100%|██████████| 1/1 [00:00<00:00, 56.53it/s]
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The New York Yankees are currently in a pivotal phase as they seek a successor to the legendary radio announcer John Sterling, who has announced his retirement after an illustrious career but will return to call the playoffs for WFAN this fall. Among the leading candidates for this coveted position is Dave Sims, the Seattle Mariners' TV play-by-play voice, who has deep roots in New York sports media, having begun his career at the New York Daily News and hosted on WFAN in the early '90s. Additionally, Rickie Ricardo, a seasoned Spanish language announcer who has filled in alongside Suzyn Waldman this season, is also in the running for the role. While the Yankees are exploring their options, including a brief discussion with Marlins announcer Paul Severino, the decision is narrowing down to Sims and Ricardo, raising excitement among fans eager to see who will carry on Sterling's legacy.


In [13]:
# Show the full result of the previous query, including the retrieved 'chunks'.
response

{'chunks': [{'filename': 'sport_133',
   'text': 'Syndication: The Record\n\nThe New York Yankees have been searching near and far for the heir apparent to John Sterling. While the legendary voice who retired earlier this season will return to call the playoffs for WFAN this fall, the radio giant still hasn’t landed on who will be his successor.\n\nIt was reported earlier this week that Seattle Mariners TV play-by-play voice Dave Sims had interviewed for the position. Replacing Sterling would be a full circle moment for the 71-year-old Sims, who got his start at the New York Daily News, before venturing into radio, hosting a midday show on WFAN with Ed Coleman from 1989-93. He also was a weekend sports anchor at WCBS-TV in New York, as well as a radio host for the New York Knicks.\n\nSims has worn many hats, but has yet to wear a Yankees cap. Perhaps that’s in the cards, but he’s not the only candidate to replace Sterling. On Friday, the New York Post reported that in addition to Sims,