# Cohere RAG with dense retriever and ReRank model
- references: https://docs.cohere.com/v2/docs/rag-complete-example

In [7]:
import cohere
import numpy as np
from typing import List, Tuple, Dict
import os
from dotenv import load_dotenv
import json
import time # for timing functions
import sys
from colorama import Fore, Style, Back


In [8]:
# load secret from local .env file
def get_key():
    #load secret .env file
    load_dotenv()

    #store credentials
    _key = os.getenv('COHERE_API_KEY')

    #verify if it worked
    if _key is not None:
        print(Fore.GREEN + "all is good, beautiful!")
        return _key
    else:
        print(Fore.LIGHTRED_EX + "API Key is missing")

# initilize client
co = cohere.ClientV2(get_key())
        

[32mall is good, beautiful!


In [66]:
# load documents
#read documents as .txt files in data director
def read_documents_with_doi(directory_path: str) -> List[Dict[str, str]]:
    """
    Reads documents and their DOIs from individual files in a directory.

    Args:
        directory_path (str): Path to the directory containing the document files.

    Returns:
        List[Dict[str, str]]: A list of dictionaries, each containing 'doi' and 'text' keys.
    """
    global documents_with_doi
    documents_with_doi = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory_path, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                lines = file.readlines()
                if len(lines) >= 1:
                    doi = lines[0]
                    text = "".join(lines[1:]).strip()
                    documents_with_doi.append(f"{doi} {text}\n")
    return documents_with_doi

# initialize empty search query
search_queries = []
# Embed the documents
def document_embed(documents:List[str])->List[float]:
    """
    Embeds the documents from a list provided from read_documents_with_doi()
    """
    doc_emb = co.embed(
        model="embed-v4.0",
        input_type="search_document",
        texts=[doc for doc in documents],
        embedding_types=["float"],
        ).embeddings.float
    return doc_emb

# Embed the search query
def query_embed(search_queries:List[str])->List[float]:
    """
    Embeds the query from a list provided in search_queries variable
    """
    query_emb = co.embed(
        model="embed-v4.0",
        input_type="search_query",
        texts=search_queries,
        embedding_types=["float"],
        ).embeddings.float
    return query_emb

# retrieve top_k and compute similarity using dot product
def retrieve_top_k(top_k, query_embedded, documents_embedded, documents)->List[str]:
    """
    returns the top_k documents based on dot product similarity
    """

    scores = np.dot(query_embedded, np.transpose(documents_embedded))[0]#ordered list!
    # takes top scores, and returns sorted list and returns indices sliced by top_k
    max_idx = np.argsort(-scores)[:top_k]
    # returns documents by index
    retrieved_docs = [documents[item] for item in max_idx]
    # returns a list of documents
    return retrieved_docs

def rerank_documents(retrieved_documents,search_queries,threshold,top_k)->List[str]:
    """
    takes retrieved_documents as input along with search_queries and runs them through the 
    rerank model from cohere for semantic similarity. 

    top_n = top_k
    Limits those returned by a threshold score. this is to reduce those that are irrelevant.
    """
    # Rerank the documents
    results = co.rerank(
        model="rerank-v3.5",
        query=search_queries[0],
        documents=[doc for doc in retrieved_documents],
        top_n=top_k,
        max_tokens_per_doc=4096,# defaults to 4096
    )

    # Display the reranking results
    for idx, result in enumerate(results.results):
        print(f"Rank: {idx+1}")
        print(f"Score: {result.relevance_score}")
        print(f"Document: {retrieved_documents[result.index]}\n")

    #returns only those over threshold
    reranked_docs = [
        retrieved_documents[result.index] for result in results.results if result.relevance_score >=threshold
    ]

    print(f"reranked_documents: {reranked_docs}")
    print(f"length of reranked_documents: {len(reranked_docs)}")

    return reranked_docs

def cohere_rag_pipeline(directory_path,search_queries,top_k,threshold):

    # retrieve documents from directory
    documents = read_documents_with_doi(directory_path)
    print(f"Length of documents: {len(documents)}")
    # embed the documents
    documents_embedded = document_embed(documents)

    #embed the query:
    query_embedded = query_embed(search_queries)

    # retrieve the top_k documents
    retrieved_documents = retrieve_top_k(top_k, query_embedded, documents_embedded, documents)

    # rerank the documents using the Rerank model from Cohere
    reranked_documents = rerank_documents(retrieved_documents,search_queries,threshold,top_k)
    # set system instructions
    instructions = """
                    You are an academic research assistant.
                    You must include the DOI in your response.
                    If there is no content provided, ask for a different question.
                    Please structure your response like this:
                    Summary: summary statement here. 
                    DOI: summary of the text associated with this DOI.
                    Address me as, 'my lady'.
                    """
    # create messages to model
    messages = [{"role":"user",
                "content": search_queries[0]},
                {"role":"system",
                "content":instructions}]

    # Generate the response
    resp = co.chat(
        model="command-a-03-2025",
        messages=messages,
        documents=reranked_documents,
    )

    return resp, reranked_documents

# ****** Pipeline ********
# set directory path
directory_path = "/Users/poppyriddle/Documents/PhD/Research_proposal/Part_3/part_3_cohere/data"
# initialize search_queries 
search_queries = [input("what is your query?")]#could be a list of multiple queries
# set top_k
top_k = 5
#set threshold 
threshold = 0.1

response, reranked_documents_end = cohere_rag_pipeline(directory_path,search_queries,top_k,threshold)
# Display the response
print(response.message.content[0].text)
print(Fore.LIGHTCYAN_EX + f"------\nReranked documents:")
for doc in reranked_documents_end:
    print(doc)

# Display the citations and source documents
if response.message.citations:
    print(Fore.LIGHTYELLOW_EX + "\nCITATIONS:")
    for citation in response.message.citations:
        print(f"source text: {citation.text},\nsource: {citation.sources[0].document.get('content').split("\n")[0]}\n------")


Length of documents: 45
Rank: 1
Score: 0.22735819
Document: DOI: 10.1007/s11192-022-04367-w
 Title: Identifying and correcting invalid citations due to DOI errors in Crossref data
Abstract: This work aims to identify classes of DOI mistakes by analysing the open bibliographic metadata available in Crossref, highlighting which publishers were responsible for such mistakes and how many of these incorrect DOIs could be corrected through automatic processes. By using a list of invalid cited DOIs gathered by OpenCitations while processing the OpenCitations Index of Crossref open DOI-to-DOI citations (COCI) in the past two years, we retrieved the citations in the January 2021 Crossref dump to such invalid DOIs. We processed these citations by keeping track of their validity and the publishers responsible for uploading the related citation data in Crossref. Finally, we identified patterns of factual errors in the invalid DOIs and the regular expressions needed to catch and correct them. The o

# Analysis
Precision, recall, accuracy, F1 scores and faithfulness
## Precision, recall, F1 score
### references
- https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
- https://vitalflux.com/accuracy-precision-recall-f1-score-python-example/


In [16]:
from typing import List, Dict, Set
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score
import openpyxl
import pandas as pd

In [17]:
#initial dataframe to capture results from each query and results
#ONLY DO THIS AT THE BEGINNING OF THE ANALYSIS PROCEDURE, OTHERWISE, IT WILL ERASE THE PREVIOUS RESULTS!!

results_df = pd.DataFrame(columns=['Query','Precision','Recall','F1-Score','Accuracy', 'Balanced accuracy', 'Faithfulness score', 'Response'])
results_df

Unnamed: 0,Query,Precision,Recall,F1-Score,Accuracy,Balanced accuracy,Faithfulness score,Response


## set up functions

In [57]:
from typing import List, Dict
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score
"""
change this to read in an excel sheet of queries and ground_truth dois.
Then it should be isolated as a function.
Run the function to iterature through the list.
"""

# Extract DOIs from retrieved documents
retrieved_dois = [doc.split("\n")[0].strip("DOI: ") for doc in reranked_documents_end]
print("Retrieved DOIs:", retrieved_dois)

# initiates the variable
ground_truth = []

def evaluate_retrieval(retrieved_dois, ground_truth):
    corpus_doi_list = []
    #corpus_list is a global variable in rag_pipeline()
    for each in range(len(documents_with_doi)):
        a = documents_with_doi[each].split("\n")[0].strip("DOI: ")
        corpus_doi_list.append(a)
    print(len(corpus_doi_list))

    def compare_lists(list1, list2, list3):
        for val in list1:
            if val in list2:
                list3.append(1)
            else:
                list3.append(0)

    #set y_true so that len(y_true)==len(corpus_doi_list)
    y_true = []
    compare_lists(corpus_doi_list,ground_truth,y_true)
    y_true = np.array(y_true)
    y_pred = []
    compare_lists(corpus_doi_list,retrieved_dois,y_pred)
    y_pred = np.array(y_pred)

    # calculate metrics - could also use sklearn.metrics functions such as precision_score, but this is easier to read
    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    f1 = f1_score(y_true, y_pred, average="micro")
    accuracy = accuracy_score(y_true, y_pred, normalize=True)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

    faithfulness_score = 0
    for each in retrieved_dois:
        if each in response.message.content[0].text:
            faithfulness_score+=1
        else:
            faithfulness_score+=0

    return {
        'Query':f"{search_queries[0]}",
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "Accuracy":accuracy,
        "Balanced accuracy":balanced_accuracy,
        "Faithfulness score":faithfulness_score,
        "Response":response.message.content[0].text
    }

def print_results()->Dict:
    """
    Prints a nicely ordered set of results from evalaute_retrieval()
    """
    global results
    results = evaluate_retrieval(retrieved_dois, ground_truth)
    print(f"For query: {results['Query']}:")
    print(f"Precision: {results['Precision']:.3f}")
    print(f"Recall: {results['Recall']:.3f}")
    print(f"F1-Score: {results['F1-Score']:.3f}")
    print(f"Accuracy: {results['Accuracy']:.3f}")
    print(f"Balanced accuracy: {results['Balanced accuracy']:.3f}")
    print(f"Faithfulness score: {results['Faithfulness score']}")
    return results

#for debugging

print_results()


Retrieved DOIs: ['10.1162/qss_a_00286', '10.48550/arXiv.2303.17661', '10.31222/osf.io/smxe5']
45
For query: which studies examined the abstract in metadata?:
Precision: 0.000
Recall: 0.000
F1-Score: 0.933
Accuracy: 0.933
Balanced accuracy: 0.933
Faithfulness score: 2


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


{'Query': 'which studies examined the abstract in metadata?',
 'Precision': 0.0,
 'Recall': 0.0,
 'F1-Score': 0.9333333333333333,
 'Accuracy': 0.9333333333333333,
 'Balanced accuracy': 0.9333333333333333,
 'Faithfulness score': 2,
 'Response': 'Summary: Two studies examined the abstract in metadata.\n\nDOI: 10.1162/qss_a_00286 - This study compared the amount of metadata and the completeness degree of research publications in new academic databases.\n\nDOI: 10.31222/osf.io/smxe5 - This study presented an up-to-date overview of the availability of six metadata elements in Crossref, including abstracts.\n\nIs there anything else I can help you with, my lady?'}

## run the test from here

In [73]:
#run the test from here


# Extract DOIs from retrieved documents
retrieved_dois = [doc.split("\n")[0].strip("DOI: ") for doc in reranked_documents_end]
print("Retrieved DOIs:", retrieved_dois)

# Ground truth relevant documents (DOIs) for each query
ground_truth = ["10.1007/s11192-022-04367-w","10.1371/journal.pbio.1002542","10.1007/s11192-015-1765-5","10.1162/qss_a_00112","10.1162/qss_a_00210"]

#***** Begin chat session *****
# set directory path
directory_path = "/Users/poppyriddle/Documents/PhD/Research_proposal/Part_3/part_3_cohere/data_multi_lang"
# initialize search_queries 
search_queries = [input("what is your query?")]#could be a list of multiple queries

# set top_k
top_k = 5
#set threshold 
threshold = 0.10

response, reranked_documents_end = cohere_rag_pipeline(directory_path,search_queries,top_k,threshold)
# Display the response
print(response.message.content[0].text)

new_result = print_results()
# add the new result to the df
results_df.loc[len(results_df)] = new_result

#save the queries and responses to separate dataframe to be manually annontated
answer_relevance_df = results_df[['Query','Response']].copy(deep=True)

# save out answer_relevance_df
filename="analysis/dense_answer_relevance_results.xlsx"
answer_relevance_df.to_excel(filename)

filename = "analysis/dense_analysis_results.xlsx"
os.makedirs(os.path.dirname(filename), exist_ok=True)
results_df.to_excel(filename)
results_df.tail(5)

Retrieved DOIs: ['10.1007/s11192-022-04367-w', '10.1162/qss_a_00112', '10.1371/journal.pbio.1002542']
Length of documents: 45
Rank: 1
Score: 0.09165022
Document: DOI: 10.1007/s11192-022-04367-w
 Title: CrossRefデータのDOIエラーによる無効な引用の識別と修正 Identifying and correcting invalid citations due to DOI errors in Crossref data
Abstract: この作業は、CrossRefで利用可能なオープンな参考文献メタデータを分析することにより、DOIの間違いのクラスを特定し、どの出版社がそのような間違いを担当し、これらの誤ったDOIの数を自動プロセスで修正できるかを強調することを目的としています。 By using a list of invalid cited DOIs gathered by OpenCitations while processing the OpenCitations Index of Crossref open DOI-to-DOI citations (COCI) in the past two years, we retrieved the citations in the January 2021 Crossref dump to such invalid DOIs.これらの引用は、CrossRefの関連する引用データをアップロードする責任を負う責任者の有効性と出版社を追跡することで処理しました。最後に、無効なDOIの事実上の誤りのパターンと、それらをキャッチして修正するために必要な正規表現を特定しました。この研究の結果は、無効な引用の大部分に責任を負い、および/または影響を受けた出版社は少数であることを示しています。過去の研究で提案されたDOI名エラーの分類法を拡張し、以前のアプローチよりも無効なDOIでより多くのミスをきれいにすることができる、より詳細に精巧な正規表現を定義しました。私たちの研究で収集されたデータは、定性的観点からDOIの間違い

Unnamed: 0,Query,Precision,Recall,F1-Score,Accuracy,Balanced accuracy,Faithfulness score,Response
11,which studies examined citations?,1.0,0.2,0.911111,0.911111,0.6,1,Summary: two studies examined citations.\n\nDO...
12,which studies examined citations?,1.0,0.5,0.953488,0.953488,0.75,0,"My lady, it seems there is no specific content..."
13,which studies examined citations?,0.0,0.0,0.888889,0.888889,0.5,0,Summary: Three studies examined citations.\n\n...
14,which studies examined citations?,1.0,0.6,0.955556,0.955556,0.8,3,Summary: Three studies examined citations.\n\n...
15,which studies examied citations?,1.0,0.6,0.955556,0.955556,0.8,0,"My lady, it seems there is no specific content..."
