# Create questions
This notebook creates questions with LLM assistance from Cohere models. <br>
It follows the method outlined in Teixeira de Lima et al. (2025), Wang et al. (2024), and Bruni et al. (2025).


In [1]:
import cohere
from typing import List, Dict, Tuple
import numpy as np
import os
from dotenv import load_dotenv
import json
import time # for timing functions
import sys
from colorama import Fore, Style, Back
import pandas as pd

# load secret from local .env file
def get_key():
    """Load the Cohere API key from the environment.

    Calls ``load_dotenv()`` first so a local ``.env`` file can supply the
    ``COHERE_API_KEY`` variable, then reads it with ``os.getenv``. A coloured
    status message is printed in either case.

    Returns:
        The value of ``COHERE_API_KEY``, or ``None`` when the variable is not set.
    """
    # load secret .env file
    load_dotenv()

    # store credentials
    _key = os.getenv('COHERE_API_KEY')

    # Style.RESET_ALL ends the colour escape; without it every later print in
    # the session inherits the green/red foreground.
    if _key is None:
        print(Fore.LIGHTRED_EX + "API Key is missing" + Style.RESET_ALL)
        return None

    print(Fore.GREEN + "all is good, beautiful!" + Style.RESET_ALL)
    return _key

# Initialize Cohere client
# get_key() returns None when COHERE_API_KEY is unset (it only prints a warning),
# so the client is still constructed in that case.
# NOTE(review): confirm how ClientV2 behaves when handed a None key.
co = cohere.ClientV2(get_key(),client_name="little_steve")

[32mall is good, beautiful!


In [15]:
# set up functions

def embed_documents(documents: List[str]) -> List[List[float]]:
    """Generate embeddings for a list of documents.

    Args:
        documents: Texts to embed with the module-level Cohere client ``co``.

    Returns:
        The ``embeddings`` attribute of the embed response.
        NOTE(review): with ``ClientV2`` this is presumably a by-type container
        rather than a plain list of vectors; confirm before relying on the
        ``List[List[float]]`` annotation.
    """
    # These texts are the corpus being searched, not queries, so they are
    # embedded with input_type="search_document" (the original passed
    # "search_query", which is meant for the query side of retrieval).
    response = co.embed(
        texts=documents,
        model="embed-v4.0",
        input_type="search_document",
    )
    return response.embeddings

def generate_questions(documents: List[str]) -> List[str]:
    """Generate a question from the documents using the chat endpoint in the V2 API.

    Args:
        documents: Document texts appended to the prompt after the instruction.

    Returns:
        A one-element list holding the text of the first content item of the
        model's reply.
    """
    instruction = """
    Develop questions based on the given documents. 
    The questions should be easy to understandby a novice and only require looking up facts. 
    An example question: What is augmentation in RAG? 
    Generate one question that can be answered with at least 5 of the following documents.
    """
    # The instruction must lead the prompt exactly once. The original used it
    # as the *separator* in str.join, which repeated it between documents,
    # never placed it first, and dropped it entirely for a single document.
    prompt = instruction + " \n" + "\n".join(documents)
    response = co.chat(
        messages=[{"role": "user", "content": prompt}],  # V2 requires an object
        model="command-a-03-2025",
        temperature=0.2,  # API default is 0.3
        max_tokens=500,
    )
    return [response.message.content[0].text]  # Adjust if you want multiple questions

def find_true_positives(question: str, documents: List[str]) -> List[str]:
    """Rank ``documents`` against ``question`` via the rerank endpoint; keep the top 5.

    Returns a new list holding the entries of the rerank response's
    ``results``. Callers in this notebook read ``.index`` from each entry to
    look the document up again.
    """
    reranked = co.rerank(
        model="rerank-english-v3.0",
        query=question,
        documents=documents,
        top_n=5,
    )
    return list(reranked.results)

#read documents as .txt files in data director
def read_documents_with_doi(directory_path: str) -> List[str]:
    """
    Reads every ``.txt`` file in a directory into one string per document.

    The first line of each file is treated as the DOI line and kept verbatim
    (including its trailing newline); the remaining lines are joined and
    stripped of surrounding whitespace. Empty files are skipped.

    Args:
        directory_path (str): Path to the directory containing the document files.

    Returns:
        List[str]: One ``"<first line> <body>\\n"`` string per file, ordered by
        filename.
    """
    documents_with_doi = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result reproducible across runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(directory_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            lines = file.readlines()
        if not lines:
            continue
        doi = lines[0]
        text = "".join(lines[1:]).strip()
        documents_with_doi.append(f"{doi} {text}\n")
    return documents_with_doi


def main():
    """Generate a question from the global ``documents`` and print its top matches.

    For each generated question the documents are reranked against it, and the
    DOI taken from the first line of each top-ranked document is printed.
    """
    # Generate questions
    questions = generate_questions(documents)

    # Find true positives for each question
    for question in questions:
        true_positives = find_true_positives(question, documents)

        dois = []
        for doc in true_positives:
            first_line = documents[doc.index].split("\n", 1)[0]
            # Remove only the literal "DOI:" label. str.strip("DOI: ") treats
            # its argument as a character set and would also eat a trailing
            # D, O, I or ':' that belongs to the DOI itself.
            if first_line.startswith("DOI:"):
                first_line = first_line[len("DOI:"):]
            dois.append(first_line.strip())

        # The list is built outside the f-string: nesting the same quote type
        # and a backslash inside an f-string only parses on Python 3.12+.
        print(f"Question: {question}")
        print(f"True Positives by DOI: {dois}")
        print("-" * 80)

# Load the corpus from disk once so later cells can reuse `documents`.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
directory_path = "/Users/poppyriddle/Documents/PhD/Research_proposal/Part_3/part_3_cohere/data"
documents = read_documents_with_doi(directory_path)
# Embedding is not performed here; the call below is left commented out.
#embeddings = embed_documents(documents)




In [None]:
# TODO notes for this cell (bare string literal, has no runtime effect):
"""
 - need to run loop for x number questions
 - collect questions, true positives, and answer in pandas df
 - save to excel file called golden_set.xlsx
"""
import random

# randomize order of documents
# NOTE: shuffles the module-level `documents` list in place and no seed is set,
# so the order (and therefore the prompt) differs on every run.
random.shuffle(documents)
# The embeddings are stored here; main() as defined in this notebook does not read them.
embeddings = embed_documents(documents)
# now, generate a question
main()


## Question verification

In [29]:
# set up functions

def embed_documents(documents: List[str]) -> List[List[float]]:
    """Generate embeddings for a list of documents.

    Args:
        documents: Texts to embed with the module-level Cohere client ``co``.

    Returns:
        The ``embeddings`` attribute of the embed response.
        NOTE(review): with ``ClientV2`` this is presumably a by-type container
        rather than a plain list of vectors; confirm before relying on the
        ``List[List[float]]`` annotation.
    """
    # These texts are the corpus being searched, not queries, so they are
    # embedded with input_type="search_document" (the original passed
    # "search_query", which is meant for the query side of retrieval).
    response = co.embed(
        texts=documents,
        model="embed-v4.0",
        input_type="search_document",
    )
    return response.embeddings

def generate_questions(documents: List[str]) -> List[str]:
    """Ask the chat model to evaluate a fixed question/answer pair against the documents.

    Despite the name (kept so existing callers continue to work), this version
    does not generate a question: the instruction contains a hard-coded
    QUESTION and a list of DOIs as the ANSWER, and asks for a yes/no verdict
    with a reason.

    Args:
        documents: Document texts appended to the prompt as evidence.

    Returns:
        A one-element list holding the text of the first content item of the
        model's reply.
    """
    instruction = """
    Evaluate the following question and answer pair for correctness. 
    Use the provided documents as evidence. 
    Please answer yes or no, with a reason. 
    QUESTION: What are the key challenges and potential solutions for improving metadata quality?
    ANSWER: 
    ['10.48550/arXiv.2404.17663', '10.48550/arXiv.2409.10633', '10.48550/arXiv.2502.03627', '10.48550/arXiv.2508.18620', '10.48550/arXiv.2401.16359']
    """
    # The instruction must lead the prompt exactly once. The original used it
    # as the *separator* in str.join, which repeated it between documents,
    # never placed it first, and dropped it entirely for a single document.
    prompt = instruction + " \n" + "\n".join(documents)
    response = co.chat(
        messages=[{"role": "user", "content": prompt}],  # V2 requires an object
        model="command-a-03-2025",
        temperature=0.2,  # API default is 0.3
        max_tokens=500,
    )
    return [response.message.content[0].text]  # Adjust if you want multiple questions

def find_true_positives(question: str, documents: List[str]) -> List[str]:
    """Rank ``documents`` against ``question`` via the rerank endpoint; keep the top 5.

    Returns a new list holding the entries of the rerank response's
    ``results``. Callers in this notebook read ``.index`` from each entry to
    look the document up again.
    """
    reranked = co.rerank(
        model="rerank-english-v3.0",
        query=question,
        documents=documents,
        top_n=5,
    )
    return list(reranked.results)

#read documents as .txt files in data director
def read_documents_with_doi(directory_path: str) -> List[str]:
    """
    Reads every ``.txt`` file in a directory into one string per document.

    The first line of each file is treated as the DOI line and kept verbatim
    (including its trailing newline); the remaining lines are joined and
    stripped of surrounding whitespace. Empty files are skipped.

    Args:
        directory_path (str): Path to the directory containing the document files.

    Returns:
        List[str]: One ``"<first line> <body>\\n"`` string per file, ordered by
        filename.
    """
    documents_with_doi = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result reproducible across runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(directory_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            lines = file.readlines()
        if not lines:
            continue
        doi = lines[0]
        text = "".join(lines[1:]).strip()
        documents_with_doi.append(f"{doi} {text}\n")
    return documents_with_doi


def EvaluateQuestion():
    """Run the chat evaluation on the global ``documents`` and print top reranked DOIs.

    Each text returned by ``generate_questions`` is printed under the label
    "Question" and is also used as the rerank query.
    NOTE(review): in this cell ``generate_questions`` returns the model's
    evaluation text rather than a question; confirm that reranking against
    that text is intended.
    """
    # Get the model's response(s)
    questions = generate_questions(documents)

    # Find the top-ranked documents for each response
    for question in questions:
        true_positives = find_true_positives(question, documents)

        dois = []
        for doc in true_positives:
            first_line = documents[doc.index].split("\n", 1)[0]
            # Remove only the literal "DOI:" label. str.strip("DOI: ") treats
            # its argument as a character set and would also eat a trailing
            # D, O, I or ':' that belongs to the DOI itself.
            if first_line.startswith("DOI:"):
                first_line = first_line[len("DOI:"):]
            dois.append(first_line.strip())

        # The list is built outside the f-string: nesting the same quote type
        # and a backslash inside an f-string only parses on Python 3.12+.
        print(f"Question: {question}")
        print(f"True Positives by DOI: {dois}")
        print("-" * 80)

# Load the corpus from disk once so later cells can reuse `documents`.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
directory_path = "/Users/poppyriddle/Documents/PhD/Research_proposal/Part_3/part_3_cohere/data"
documents = read_documents_with_doi(directory_path)
# Embedding is not performed here; the call below is left commented out.
#embeddings = embed_documents(documents)




In [None]:
# TODO notes for this cell (bare string literal, has no runtime effect):
"""
 - need to run loop for x number questions
 - collect questions, true positives, and answer in pandas df
 - save to excel file called golden_set.xlsx
"""
import random

# randomize order of documents
# NOTE: shuffles the module-level `documents` list in place and no seed is set,
# so the order (and therefore the prompt) differs on every run.
random.shuffle(documents)
# The embeddings are stored here; EvaluateQuestion() as defined in this notebook does not read them.
embeddings = embed_documents(documents)
# now, generate a question
EvaluateQuestion()


# Check questions from a list of DOIs
Questions and true positives were adjusted until there was complete agreement between the LLM and the human evaluator. 

In [16]:
# set up functions

def embed_documents(documents: List[str]) -> List[List[float]]:
    """Generate embeddings for a list of documents.

    Args:
        documents: Texts to embed with the module-level Cohere client ``co``.

    Returns:
        The ``embeddings`` attribute of the embed response.
        NOTE(review): with ``ClientV2`` this is presumably a by-type container
        rather than a plain list of vectors; confirm before relying on the
        ``List[List[float]]`` annotation.
    """
    # These texts are the corpus being searched, not queries, so they are
    # embedded with input_type="search_document" (the original passed
    # "search_query", which is meant for the query side of retrieval).
    response = co.embed(
        texts=documents,
        model="embed-v4.0",
        input_type="search_document",
    )
    return response.embeddings

def generate_questions(documents: List[str]) -> List[str]:
    """Ask the chat model to check a fixed question and DOI list against the documents.

    Despite the name (kept so existing callers continue to work), this version
    does not generate a question: the instruction contains a hard-coded
    QUESTION and a list of DOIs as the ANSWER, and asks for a yes/no verdict
    with a brief summary of each DOI.

    Args:
        documents: Document texts appended to the prompt as evidence.

    Returns:
        A one-element list holding the text of the first content item of the
        model's reply.
    """
    instruction = """
    Evaluate the following question and list of DOIs for correctness. 
    Use the provided documents as evidence. IF there are less than 5, recommend a document to include.
    Please answer yes or no, with a brief summary of each DOI. 
    QUESTION: which studies examined the abstract in metadata?
    ANSWER: 
    ["10.1002/leap.1411","10.1007/s11192-020-03632-0","10.1162/qss_a_00286","10.1162/qss_a_00022","10.31222/osf.io/smxe5"]
    """
    # The instruction must lead the prompt exactly once. The original used it
    # as the *separator* in str.join, which repeated it between documents,
    # never placed it first, and dropped it entirely for a single document.
    prompt = instruction + " \n" + "\n".join(documents)
    response = co.chat(
        messages=[{"role": "user", "content": prompt}],  # V2 requires an object
        model="command-a-03-2025",
        temperature=0.2,  # API default is 0.3
        max_tokens=500,
    )
    return [response.message.content[0].text]  # Adjust if you want multiple questions

def find_true_positives(question: str, documents: List[str]) -> List[str]:
    """Rank ``documents`` against ``question`` via the rerank endpoint; keep the top 5.

    Returns a new list holding the entries of the rerank response's
    ``results``. Callers in this notebook read ``.index`` from each entry to
    look the document up again.
    """
    reranked = co.rerank(
        model="rerank-english-v3.0",
        query=question,
        documents=documents,
        top_n=5,
    )
    return list(reranked.results)

#read documents as .txt files in data director
def read_documents_with_doi(directory_path: str) -> List[str]:
    """
    Reads every ``.txt`` file in a directory into one string per document.

    The first line of each file is treated as the DOI line and kept verbatim
    (including its trailing newline); the remaining lines are joined and
    stripped of surrounding whitespace. Empty files are skipped.

    Args:
        directory_path (str): Path to the directory containing the document files.

    Returns:
        List[str]: One ``"<first line> <body>\\n"`` string per file, ordered by
        filename.
    """
    documents_with_doi = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result reproducible across runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(directory_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            lines = file.readlines()
        if not lines:
            continue
        doi = lines[0]
        text = "".join(lines[1:]).strip()
        documents_with_doi.append(f"{doi} {text}\n")
    return documents_with_doi


def EvaluateQuestion():
    """Run the chat evaluation on the global ``documents`` and print top reranked DOIs.

    Each text returned by ``generate_questions`` is printed under the label
    "Question" and is also used as the rerank query.
    NOTE(review): in this cell ``generate_questions`` returns the model's
    evaluation text rather than a question; confirm that reranking against
    that text is intended.
    """
    # Get the model's response(s)
    questions = generate_questions(documents)

    # Find the top-ranked documents for each response
    for question in questions:
        true_positives = find_true_positives(question, documents)

        dois = []
        for doc in true_positives:
            first_line = documents[doc.index].split("\n", 1)[0]
            # Remove only the literal "DOI:" label. str.strip("DOI: ") treats
            # its argument as a character set and would also eat a trailing
            # D, O, I or ':' that belongs to the DOI itself.
            if first_line.startswith("DOI:"):
                first_line = first_line[len("DOI:"):]
            dois.append(first_line.strip())

        # The list is built outside the f-string: nesting the same quote type
        # and a backslash inside an f-string only parses on Python 3.12+.
        print(f"Question: {question}")
        print(f"True Positives by DOI: {dois}")
        print("-" * 80)

# Load the corpus from disk once so later cells can reuse `documents`.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
directory_path = "/Users/poppyriddle/Documents/PhD/Research_proposal/Part_3/part_3_cohere/data"
documents = read_documents_with_doi(directory_path)
# Embedding is not performed here; the call below is left commented out.
#embeddings = embed_documents(documents)




In [17]:
# TODO notes for this cell (bare string literal, has no runtime effect):
"""
 - need to run loop for x number questions
 - collect questions, true positives, and answer in pandas df
 - save to excel file called golden_set.xlsx
"""
import random

# randomize order of documents
# NOTE: shuffles the module-level `documents` list in place and no seed is set,
# so the order (and therefore the prompt) differs on every run.
random.shuffle(documents)
# The embeddings are stored here; EvaluateQuestion() as defined in this notebook does not read them.
embeddings = embed_documents(documents)
# now, generate a question
EvaluateQuestion()


Question: **Evaluation of the provided DOI list for the question "which studies examined the abstract in metadata?"**

**Correct DOIs (Yes):**
1. **10.1002/leap.1411**  
   - *Summary*: Examines the lexical content of abstracts in preprints during the COVID-19 crisis, comparing them to pre-pandemic abstracts. Focuses on promotional language, positive/negative words, and hedge words.  
   - *Relevance*: Directly analyzes abstracts and their metadata.  

2. **10.1007/s11192-020-03632-0**  
   - *Summary*: Assesses the type of articles published during the first 3 months of the COVID-19 pandemic, comparing them to H1N1 articles. Extracts and evaluates abstracts from PubMed.  
   - *Relevance*: Explicitly examines abstracts in metadata.  

3. **10.1162/qss_a_00286**  
   - *Summary*: Compares the completeness of metadata, including abstracts, across eight free-access scholarly databases using a Crossref sample of 115,000 records.  
   - *Relevance*: Directly evaluates abstract metadata.  
