# Cohere API and SciBERT for RAG
This notebook uses a Cohere API for generating responses to text. A query input is required from the user. 
SciBERT is used for embeddings in a dense vector array both the text and the query. 
A DOI is supplied with the text as both an identifier and locator. 

- [ ] set up venv
- [ ] install transformers torch cohere in command line

### todo
- [ ] create script that compiles data/documents.txt with DOI || text for all documents
- [ ] store vectorized documents in a db
    - https://huggingface.co/learn/cookbook/rag_with_hugging_face_gemma_mongodb

### options
- Batch Processing:
    If large number of texts, process them in batches to avoid memory issues.
    Example: Use a loop or torch.utils.data.DataLoader.

- Change model size: smaller models require less processing

- fine tune model on corpus

- look into pooling strategies

- Tokenizer
    - put cleaning process distincly prior to the tokenizer, using the default values as much as possible. 



In [1]:
# imports
import cohere
from cohere import Client
from transformers import AutoTokenizer, AutoModel
import numpy as np
from typing import List, Tuple, Dict
import os
from dotenv import load_dotenv
import json

def main():
    #load secret .env file
    load_dotenv()

    #store credentials
    global key,email
    key = os.getenv('COHERE_API_KEY')
    email = os.getenv('EMAIL')

    #verify if it worked
    if email is not None and key is not None:
        print("all is good, man!")

main()

all is good, man!


In [2]:

# Initialize Cohere client with key from secrets
co = cohere.Client(key)

# Load SciBERT model and tokenizer
"""
Autotokenizer documentation can be found here: https://huggingface.co/docs/transformers/v4.50.0/en/model_doc/auto#transformers.AutoTokenizer

Model documentation can be found here: https://huggingface.co/allenai/scibert_scivocab_uncased
Citation for SciBERT:
@inproceedings{beltagy-etal-2019-scibert,
    title = "SciBERT: A Pretrained Language Model for Scientific Text",
    author = "Beltagy, Iz  and Lo, Kyle  and Cohan, Arman",
    booktitle = "EMNLP",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/D19-1371"
}


"""
# Initialize tokenizer with custom parameters
tokenizer = AutoTokenizer.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    max_len=512,
    use_fast=True,  # Use the fast tokenizer
    do_lower_case=False,  # Preserve case
    add_prefix_space=False,  # No prefix space
    never_split=["[DOC]", "[REF]"],  # Tokens to never split
    additional_special_tokens=["<doi>", "</doi>"]  # Add custom special tokens
)

# This is the SciBERT model that is used to embed the text and query.
# other models: 'allenai-specter', 
# Load model directly

model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased", torch_dtype="auto")

# Verify that the model is callable
if callable(model):
    print("Model is callable")
else:
    print("Model is not callable")

Model is callable


In [3]:
"""
Basic RAG with Cohere model
Document source: data/documents.txt where the DOI with resolver is separated from the abstract by ||. One record per line. 
Saved as UTF-8

Returns:  answers based on query from input()
"""

# Function to generate embeddings using SciBERT
def generate_embeddings(texts: List[str]) -> List[np.ndarray]:
    """
    converts raw text to numerical representations using a pretrained model, in this case, SciBERT.
    Currently this is applied to both the document text and the query. 
    May want a different version or decorator for the query as they are generally much shorter and more sparse.

    Input: text from tokenizer step above as a list of strings
    Output: np.array
    """
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        max_length=512, # returns PyTorch tensors which are compatible with model
        padding="max_length",
        truncation=True,
        return_attention_mask=True # return the attention mask - need to learn more
        )
    # this passes the tokenized inputs through the model
    outputs = model(**inputs)

    # applies mean pooling to get a fixed size embedding
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings

# Function to read documents and their DOIs from a file
def read_documents_with_doi(file_path: str) -> List[Dict[str, str]]:
    documents_with_doi = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split("||")  # Assuming DOI and document are separated by "||"
            if len(parts) == 2:
                doi, document = parts
                documents_with_doi.append({"doi": doi.strip(), "text": document.strip()})
    return documents_with_doi

# Path to the file containing documents and DOIs
file_path = "data/documents.txt"  # Replace with your file path

# Read documents and DOIs from the file
documents_with_doi = read_documents_with_doi(file_path)

# Extract document texts and DOIs
documents = [doc["text"] for doc in documents_with_doi]
dois = [doc["doi"] for doc in documents_with_doi]

# Example query
query = input(" What is your query: ")

# Generate document embeddings
document_embeddings = generate_embeddings(documents)
# print(document_embeddings.shape) # to see the output shape of the array

# Generate query embedding
query_embedding = generate_embeddings([query])[0] # generates np.array for the query text

# Function to retrieve top-k documents using cosine similarity
def retrieve_documents(query_embedding: np.ndarray, document_embeddings: List[np.ndarray], top_k: int = 2) -> List[Tuple[float, Dict[str, str]]]:
    similarities = []
    for doc_emb in document_embeddings:
        # cosine similarity
        similarity = np.dot(query_embedding, doc_emb) / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_emb)) 
        similarities.append(similarity)
    # ranking
    top_indices = np.argsort(similarities)[::-1][:top_k]
    return [(similarities[i], documents_with_doi[i]) for i in top_indices]

# Retrieve top documents
top_documents = retrieve_documents(query_embedding, document_embeddings)
print("Retrieved Documents:")
for score, doc in top_documents:
    print(f"Score: {score:.4f}, DOI: {doc['doi']}, Document: {doc['text']}")

# Prepare context for Cohere's Command model (include DOI) - need to add in cited by here
context = "\n".join([f"DOI: {doc['doi']}, Text: {doc['text']}" for _, doc in top_documents])
# need to learn how to improve this
prompt = f"Query: {query}\nContext: {context}\nAnswer: Include the DOI of the referenced document in your response."

# Generate response using Cohere's Command model
response = co.generate(
  model="command", # there are other models to consider within command
  prompt=prompt,
  max_tokens=150, # allowable length of response
  temperature=0.5 # lower for less creativity, more for more creativity
)

# Print the generated response
print("\nGenerated Response:")
print(response.generations[0].text)


Retrieved Documents:
Score: 0.7492, DOI: https://doi.org/10.1162/qss_a_00286, Document: ABSTRACT  The main objective of this study is to compare the amount of metadata and the completeness degree of research publications in new academic databases. Using a quantitative approach, we selected a random Crossref sample of more than 115,000 records, which was then searched in seven databases (Dimensions, Google Scholar, Microsoft Academic, OpenAlex, Scilit, Semantic Scholar, and The Lens). Seven characteristics were analyzed (abstract, access, bibliographic info, document type, publication date, language, and identifiers), to observe fields that describe this information, the completeness rate of these fields, and the agreement among databases. The results show that academic search engines (Google Scholar, Microsoft Academic, and Semantic Scholar) gather less information and have a low degree of completeness. Conversely, third-party databases (Dimensions, OpenAlex, Scilit, and The Lens) have

## V2: implementing chat history

calls a JSON file of documents

In [14]:
# Load SciBERT model and tokenizer 
"""
REMOVE THIS ONCE RUNNING TO GO BACK TO THE CHANGED TOKENIZER AND MODEL ABOVE
"""
#tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
#model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

# Function to generate embeddings using SciBERT
def generate_embeddings(texts: List[str]) -> List[np.ndarray]:
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True
    )
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings


# Function to read documents and their DOIs from a file
"""
Input: a .txt file with || as separators
    doi with resolver \\ Abstract: followed by text

Returns: a list of dictionaries containing doi and text

"""
def read_documents_with_doi(file_path: str) -> List[Dict[str, str]]:
    documents_with_doi = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split("||")  # Assuming DOI and document are separated by "||"
            if len(parts) == 2:
                doi, document = parts
                documents_with_doi.append({"doi": doi.strip(), "text": document.strip()})
    return documents_with_doi


# Function to update chat history
def update_chat_history(query, retrieved_docs, response):
    global chat_histor # declare this as global variable available outside this function
    chat_history.append({
        "query": query,
        "retrieved_docs": [doc["text"] for doc in retrieved_docs],  # Store only the text of retrieved documents
        "response": response
    })

#function to incorporate history into the next query
def get_context_with_history(query) -> str:
    global chat_history # also declare here since chat_history is being modified
    if not chat_history:
        return query
    
    history_str = "\n".join([
        f"User: {entry['query']}\n"
        f"Context: {'; '.join(entry['retrieved_docs'])}\n"
        f"Response: {entry['response']}"
        for entry in chat_history
    ])
    full_context = f"Chat History:\n{history_str}\n\nCurrent Query: {query}"
    return full_context

#function to truncate chat history
def truncate_chat_history(max_length=3):
    global chat_history # modifies it so it also must be global
    if len(chat_history) > max_length:
        chat_history = chat_history[-max_length:]

#function to retrieve top-k documents using cosine similarity
"""
retrieves documents from the embedded documents
Args:
    query: this is the query passed
    top_k: number of references to provide

Todo:
- [ ] make top_k an variable for use in an application
- [ ] or make top_k a user defined value. example: top_k = input("how many results do you want?")
"""
def retrieve_documents(query: str, top_k: int = 2) -> List[Dict[str, str]]:
    query_embedding = generate_embeddings([query])[0]
    document_embeddings = generate_embeddings(documents)
    similarities = [
        np.dot(query_embedding, doc_emb) / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_emb))
        for doc_emb in document_embeddings
    ]
    top_indices = np.argsort(similarities)[::-1][:top_k]
    return [documents_with_doi[i] for i in top_indices]

#RAG pipeline function
def rag_pipeline(query):
    #incorporate chat history
    full_context = get_context_with_history(query)
    
    #retrieve documents
    global retrieved_docs
    retrieved_docs = retrieve_documents(query)
    
    #prepare context for Cohere's Command model
    context = "\n".join([f"DOI: {doc['doi']}, Text: {doc['text']}" for doc in retrieved_docs])
    prompt = f"Query: {query}\nContext: {context}\nAnswer: Include the DOI of the referenced document in your response."
    
    # Generate response
    response = co.generate(
        model="command",
        prompt=prompt,
        max_tokens=150,
        temperature=0.2
    ).generations[0].text
    
    # Update chat history
    update_chat_history(query, retrieved_docs, response)
    
    # Truncate history if necessary
    truncate_chat_history()
    
    # Print the response
    print("Generated Response:")
    print(response)
    return response


# Path to the file containing documents and DOIs
file_path = "data/documents.txt"

# Read documents and DOIs from the file
documents_with_doi = read_documents_with_doi(file_path)
documents = [doc["text"] for doc in documents_with_doi]

# Main loop for user interaction
chat_history = []#initialize chat history
while True:
    query = input("What is your query (or type 'exit' to quit): ")
    if query.lower() == "exit":
        break
    rag_pipeline(query)

Generated Response:
 The random Crossref sample was collected by the authors of the study linked to by the first query. Invalid citations are unfortunately not addressed in the source. 

Is there anything else I can help you with? 


# Analysis
## Test One: 
- [ ] count specific terms found in responses and in post-tokenized text. 
- [ ] compare post-tokenized and pre-tokenized text


In [6]:
from typing import List, Dict, Set
from sklearn.metrics import precision_score, recall_score, f1_score

In [15]:
retrieved_docs


[{'doi': 'https://doi.org/10.1162/qss_a_00286',
  'text': 'ABSTRACT  The main objective of this study is to compare the amount of metadata and the completeness degree of research publications in new academic databases. Using a quantitative approach, we selected a random Crossref sample of more than 115,000 records, which was then searched in seven databases (Dimensions, Google Scholar, Microsoft Academic, OpenAlex, Scilit, Semantic Scholar, and The Lens). Seven characteristics were analyzed (abstract, access, bibliographic info, document type, publication date, language, and identifiers), to observe fields that describe this information, the completeness rate of these fields, and the agreement among databases. The results show that academic search engines (Google Scholar, Microsoft Academic, and Semantic Scholar) gather less information and have a low degree of completeness. Conversely, third-party databases (Dimensions, OpenAlex, Scilit, and The Lens) have more metadata quality and a 

In [18]:
"""
This code is from the huggingface notebook 
    - [ ] track down source

todo:
- [ ] rewrite queuries 
- [ ] reqrite retrieved_docs - the dictionary is not quite what is needed
- [ ] create ground truth from dataset
"""

# Queries go here - change this to an input or a list from a file
queries = [
    "Who collected a random Crossref sample?",
    "Who is responsible for invalid citaitons?"
]

# This needs to come from the RAG above 
# REWRITE this as the two dictionaries are not the same. Need to take kv pair for 'doi' in retrieved_docs and
# create a new dictionary in a list
retrieved_docs = []
for i in retrieved_docs:
    retrieved_doi = i.get('doi',"")
    retrieved_docs.append({'doi':retrieved_doi})


# Ground truth relevant documents (DOIs) for each query
ground_truth = [
    {"DOI": "https://doi.org/10.1162/qss_a_00286"},  # For query 1
    {"DOI": "https://doi.org/10.1007/s11192-022-04367-w"}       # For query 2
]

# Function to compute precision, recall, and F1-score
def evaluate_retrieval(
    queries: List[str],
    retrieved_docs2: List[Dict[str, str]],
    ground_truth: List[Dict[str, str]]
) -> Dict[str, float]:
    all_labels = []
    all_predictions = []

    for query, retrieved, gt in zip(queries, retrieved_docs, ground_truth):
        # Convert DOIs to sets for easy comparison
        retrieved_dois = set([doc["DOI"] for doc in retrieved])
        print(retrieved_dois)
        gt_dois = set([doc["DOI"] for doc in gt])
        print(gt_dois)
        
        # Binary labels: 1 if document is relevant, 0 otherwise
        labels = [1 if doi in gt_dois else 0 for doi in documents_with_doi]
        predictions = [1 if doi in retrieved_dois else 0 for doi in documents_with_doi]
        
        all_labels.extend(labels)
        all_predictions.extend(predictions)

    # Compute metrics
    if sum(all_predictions) == 0:
        precision = 0.0
        recall = 0.0 if sum(all_labels) > 0 else 1.0  # If no predictions but all labels are 0, recall is 1.0
        f1 = 0.0
    else:
        precision = precision_score(all_labels, all_predictions, zero_division=0)
        recall = recall_score(all_labels, all_predictions, zero_division=0)
        f1 = f1_score(all_labels, all_predictions, zero_division=0)

    return {
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
        }

# Example usage
results = evaluate_retrieval(queries, retrieved_docs, ground_truth)
print(f"Precision: {results['Precision']:.4f}")
print(f"Recall: {results['Recall']:.4f}")
print(f"F1-Score: {results['F1-Score']:.4f}")


Precision: 0.0000
Recall: 1.0000
F1-Score: 0.0000
