# Create questions
This notebook creates questions with LLM assistance from Cohere models. <br>
It follows the method outlined in Teixeira de Lima et al. (2025), Wang et al. (2024), and Bruni et al. (2025).


In [1]:
import cohere
from typing import List, Dict, Tuple
import numpy as np
import os
from dotenv import load_dotenv
import json
import time # for timing functions
import sys
from colorama import Fore, Style, Back
import pandas as pd

# load secret from local .env file
def get_key():
    """Load the Cohere API key from the environment (via a local ``.env`` file).

    Calls ``load_dotenv()`` so that variables declared in a ``.env`` file are
    visible to ``os.getenv``, then reads ``COHERE_API_KEY``.

    Returns:
        The key string when ``COHERE_API_KEY`` is set, otherwise ``None``.
        A colored status message is printed in both cases.
    """
    # load secret .env file
    load_dotenv()

    # store credentials
    _key = os.getenv('COHERE_API_KEY')

    # Report the outcome. Style.RESET_ALL ends the color sequence so the
    # color does not bleed into whatever is printed next.
    if _key is None:
        print(Fore.LIGHTRED_EX + "API Key is missing" + Style.RESET_ALL)
        return None

    print(Fore.GREEN + "all is good, beautiful!" + Style.RESET_ALL)
    return _key

# Build the Cohere V2 client that the API helper functions in this notebook call.
co = cohere.ClientV2(
    api_key=get_key(),
    client_name="little_steve",
)

[32mall is good, beautiful!


In [10]:
# set up functions

def embed_documents(documents: List[str], input_type: str = "search_document") -> List[List[float]]:
    """Generate embeddings for a list of documents.

    Args:
        documents: Texts to embed.
        input_type: Value forwarded as the embed endpoint's ``input_type``.
            Defaults to ``"search_document"`` because the texts are corpus
            documents; pass ``"search_query"`` when embedding questions.

    Returns:
        The ``embeddings`` attribute of the embed response, or an empty list
        when ``documents`` is empty (no API call is made in that case).
    """
    if not documents:
        return []

    response = co.embed(texts=documents, model="embed-v4.0", input_type=input_type)
    # NOTE(review): the V2 client may return a by-type container here rather
    # than a plain list of vectors -- confirm before indexing into the result.
    return response.embeddings

def generate_questions(documents: List[str]) -> List[str]:
    """Generate questions using the chat endpoint in the V2 API.

    Builds one prompt made of the instruction followed by every document, sends
    it as a single user message, and returns the text of the first content
    item of the reply.

    Args:
        documents: Document strings to base the question on.

    Returns:
        A one-element list holding the model's reply text, or an empty list
        when ``documents`` is empty (no API call is made in that case).
    """
    if not documents:
        return []

    instruction = """
    Develop questions for an undergraduate exam based on the given documents. 
    Each simple, single fact question must not be a compound question using a conjunction.
    Generate a simple, single fact, question that requires reasoning across at least 5 of the following documents:
    """
    # State the instruction once, then list the documents. Joining the
    # documents *with* the instruction as separator would repeat it between
    # documents and drop it entirely when only one document is given.
    prompt = instruction + "\n" + "\n".join(documents)
    response = co.chat(
        messages=[{"role": "user", "content": prompt}],  # V2 expects a list of role/content messages
        model="command-a-03-2025",
        temperature=0.2,  # slightly below the API default of 0.3
        max_tokens=500,
    )
    return [response.message.content[0].text]  # Adjust if you want multiple questions

def find_true_positives(question: str, documents: List[str], top_n: int = 5) -> List[object]:
    """Rank ``documents`` against ``question`` with the rerank endpoint and keep the best matches.

    Args:
        question: Query text passed to the rerank endpoint.
        documents: Candidate document strings to rank.
        top_n: Number of results requested from the endpoint (default 5).

    Returns:
        The entries of the rerank response's ``results`` as a list. These are
        result objects rather than document strings; use each entry's
        ``index`` attribute to look up the matching item in ``documents``.
        An empty list is returned, without calling the API, when
        ``documents`` is empty.
    """
    if not documents:
        return []

    response = co.rerank(
        query=question,
        documents=documents,
        model="rerank-english-v3.0",
        top_n=top_n,
    )
    return list(response.results)

# read documents from .txt files in the data directory
def read_documents_with_doi(directory_path: str) -> List[str]:
    """
    Reads documents and their DOI lines from the ``.txt`` files in a directory.

    The first line of each file is treated as the DOI line and the remaining
    lines as the body text. Files without any content are skipped, as are
    files whose name does not end in ``.txt``.

    Args:
        directory_path (str): Path to the directory containing the document files.

    Returns:
        List[str]: One string per file, in filename order. Each string is the
        file's first line (unchanged), a space, the stripped body text, and a
        final newline.
    """
    documents_with_doi = []
    # os.listdir returns names in arbitrary order; sort so the result is
    # stable across runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(directory_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            doi = file.readline()
            if not doi:
                # readline() returns "" only for an empty file
                continue
            text = file.read().strip()
        # The DOI line keeps its own line ending, so the DOI stays alone on
        # the first line of the combined string.
        documents_with_doi.append(f"{doi} {text}\n")
    return documents_with_doi


def _extract_doi(document: str) -> str:
    """Return the DOI found on the first line of a document string, without the ``DOI:`` label."""
    first_line = document.split("\n")[0].strip()
    # Remove the label as a prefix. str.strip("DOI: ") treats its argument as
    # a set of characters and would also eat DOI characters such as a
    # trailing "D", "O" or "I".
    if first_line.startswith("DOI:"):
        first_line = first_line[len("DOI:"):]
    return first_line.strip()


def main(docs: List[str] = None):
    """Generate questions and print each one with the DOIs of its top reranked documents.

    Args:
        docs: Document strings whose first line is the DOI line. When omitted,
            the notebook-level ``documents`` list is used.
    """
    if docs is None:
        docs = documents

    # Step 2: Generate questions
    questions = generate_questions(docs)

    # Step 3: Find true positives for each question
    for question in questions:
        true_positives = find_true_positives(question, docs)
        # Each rerank result carries the index of its document in ``docs``.
        dois = [_extract_doi(docs[doc.index]) for doc in true_positives]
        print(f"Question: {question}")
        print(f"True Positives by DOI: {dois}")
        print("-" * 80)

# Load the corpus once so later cells can reuse it.
# The location can be overridden with the COHERE_DATA_DIR environment variable,
# so the notebook also runs on machines where the default path does not exist.
directory_path = os.getenv(
    "COHERE_DATA_DIR",
    "/Users/poppyriddle/Documents/PhD/Research_proposal/Part_3/part_3_cohere/data",
)
documents = read_documents_with_doi(directory_path)
# Embedding is not done here; it happens in the cell that shuffles the documents.




In [11]:
"""
 - need to run loop for x number questions
 - collect questions, true positives, and answer in pandas df
 - save to excel file called golden_set.xlsx
"""
# The string above is a to-do list; the code in this cell does not perform those steps.
import random

# randomize order of documents
# NOTE: shuffle reorders the shared `documents` list in place, and no seed is set
# in this cell, so every run of the cell leaves the list in a different order.
random.shuffle(documents)
# Embed the shuffled documents; `embeddings` is not read again in this cell.
embeddings = embed_documents(documents)
# now, generate a question
# main() prints the generated question and the DOIs of its top reranked documents.
main()


Question: **Question:**  
What are the key challenges and opportunities associated with the use of OpenAlex as a bibliometric data source, and how does it compare to traditional databases like Web of Science and Scopus in terms of metadata accuracy, coverage, and inclusivity?  

**Explanation:**  
This question requires reasoning across at least five documents:  
1. **DOI: 10.48550/arXiv.2401.16359** (Reference Coverage Analysis of OpenAlex compared to Web of Science and Scopus)  
2. **DOI: 10.48550/arXiv.2404.17663** (An analysis of the suitability of OpenAlex for bibliometric analyses)  
3. **DOI: 10.48550/arXiv.2406.15154** (Analysis of the Publication and Document Types in OpenAlex, Web of Science, Scopus, Pubmed and Semantic Scholar)  
4. **DOI: 10.48550/arXiv.2409.10633** (Evaluating the Linguistic Coverage of OpenAlex: An Assessment of Metadata Accuracy and Completeness)  
5. **DOI: 10.1590/SciELOPreprints.11205** (On the Open Road to Universal Indexing: OpenAlex and OpenJournal