In [3]:
import os
from pathlib import Path

# Filesystem locations
ROOT_DIR = Path.cwd().parent  # parent of the directory the notebook is run from
EFS_DIR = Path("/efs/shared_storage/pcmoritz")
print(ROOT_DIR)

# Name of the experiment handled by this notebook run
experiment_name = "full-questions-0300"

# Mappings
# Embedding dimension, keyed by embedding model name.
embedding_dimensions = dict(
    [
        ("thenlper/gte-base", 768),
        ("BAAI/bge-large-en", 1024),
        ("text-embedding-ada-002", 1536),
    ]
)
# Maximum context length, keyed by LLM name.
max_context_lengths = dict(
    [
        ("gpt-4", 8192),
        ("gpt-3.5-turbo", 4096),
        ("gpt-3.5-turbo-16k", 16384),
        ("meta-llama/Llama-2-7b-chat-hf", 4096),
        ("meta-llama/Llama-2-13b-chat-hf", 4096),
        ("meta-llama/Llama-2-70b-chat-hf", 4096),
    ]
)

# Evaluate responses
# System prompt for the evaluator LLM. It is forwarded as-is (never passed through
# str.format), so the evaluator sees the literal {generated_answer}, {query} and
# {reference_answer} markers.
# NOTE(review): the user payload built by evaluate_responses uses the key
# "question", not "query" — confirm the evaluator copes with that mismatch.
evaluation_system_content = """
        Your job is to rate the quality of our generated answer {generated_answer}
        given a query {query} and a reference answer {reference_answer}.
        Your score has to be between 1 and 5.
        You must return your response in a line with only the score.
        Do not return answers in any other format.
        On a separate line provide your reasoning for the score as well.
        """

# Both response files are named "<model>-<experiment_name>.json"; derive them from
# experiment_name so the inputs cannot drift from the experiment being evaluated.
responses_dir = Path(ROOT_DIR, "experiments", "responses")
evaluate_responses(
        experiment_name=experiment_name,
        reference_loc=str(responses_dir / f"gpt-4-{experiment_name}.json"),
        response_loc=str(responses_dir / f"llama-2-70b-{experiment_name}.json"),
        evaluator="gpt-4",
        temperature=0.0,
        max_context_length=max_context_lengths["gpt-4"],
        system_content=evaluation_system_content)

/home/ray/default/llm-applications


100%|██████████| 300/300 [34:22<00:00,  6.88s/it]


In [2]:
import time
import openai
import json
from tqdm import tqdm
import numpy as np

from dotenv import load_dotenv; load_dotenv(override=True)

def generate_response(
    llm, temperature=0.0,
    system_content="", assistant_content="", user_content="",
    max_retries=3, retry_interval=60):
    """Generate a chat response from an LLM, retrying on failure.

    Args:
        llm: model name passed to `openai.ChatCompletion.create`.
        temperature: sampling temperature passed to the API.
        system_content: content of the "system" message.
        assistant_content: content of the "assistant" message.
        user_content: content of the "user" message.
        max_retries: maximum number of attempts.
        retry_interval: seconds to wait between two attempts.

    Returns:
        The message content of the last choice in the API response, or ""
        when every attempt raised (the errors are printed, not re-raised).
    """
    messages = [
        {"role": "system", "content": system_content},
        {"role": "assistant", "content": assistant_content},
        {"role": "user", "content": user_content},
    ]
    for attempt in range(1, max_retries + 1):
        try:
            response = openai.ChatCompletion.create(
                model=llm,
                temperature=temperature,
                messages=messages,
            )
            return response["choices"][-1]["message"]["content"]
        except Exception as e:  # deliberate best-effort: report and retry
            print(e)
            # Only wait when another attempt follows; sleeping after the final
            # failure would just delay the empty result by `retry_interval`.
            if attempt < max_retries:
                time.sleep(retry_interval)  # default is per-minute rate limits
    return ""


class QueryAgent:
    """Answer a query with an LLM, using chunks retrieved from a vector database.

    NOTE(review): `OpenAIEmbeddings`, `HuggingFaceEmbeddings`, `psycopg` and
    `register_vector` are not imported in the cells shown here — confirm they
    are brought into scope before this class is instantiated.
    """

    def __init__(self, embedding_model_name="thenlper/gte-base",
                 llm="meta-llama/Llama-2-70b-chat-hf",
                 temperature=0.0, max_context_length=4096,
                 system_content="", assistant_content=""):
        """Load the embedding model, store the LLM settings and open the DB connection.

        Args:
            embedding_model_name: "text-embedding-ada-002" selects
                `OpenAIEmbeddings`; any other name is loaded through
                `HuggingFaceEmbeddings`.
            llm: model name forwarded to `generate_response`.
            temperature: sampling temperature forwarded to `generate_response`.
            max_context_length: budget from which the length of the system and
                assistant content is subtracted.
            system_content: system message sent with every query.
            assistant_content: assistant message sent with every query.
        """
        # Embedding model
        device_options = {"device": "cuda"}
        encoding_options = {"device": "cuda", "batch_size": 100}
        wants_openai = embedding_model_name == "text-embedding-ada-002"
        if not wants_openai:
            self.embedding_model = HuggingFaceEmbeddings(
                model_name=embedding_model_name,
                model_kwargs=device_options,
                encode_kwargs=encoding_options)
        else:
            self.embedding_model = OpenAIEmbeddings(
                model=embedding_model_name,
                model_kwargs=device_options,
                encode_kwargs=encoding_options,
                openai_api_base=os.environ["OPENAI_API_BASE"],
                openai_api_key=os.environ["OPENAI_API_KEY"])

        # LLM settings
        self.llm = llm
        self.temperature = temperature
        # NOTE(review): the overhead is measured in characters; if
        # max_context_length is a token limit this is only an approximation.
        prompt_overhead = len(system_content + assistant_content)
        self.context_length = max_context_length - prompt_overhead
        self.system_content = system_content
        self.assistant_content = assistant_content

        # VectorDB connection
        self.conn = psycopg.connect(os.environ["DB_CONNECTION_STRING"])
        register_vector(self.conn)

    def __call__(self, query, num_chunks=6):
        """Retrieve the `num_chunks` closest chunks for `query` and ask the LLM.

        Returns:
            A dict with the keys "question" (the query), "sources" (column 2 of
            each retrieved row) and "answer" (the LLM output).
        """
        # Get context
        query_embedding = np.array(self.embedding_model.embed_query(query))
        context, sources = [], []
        with self.conn.cursor() as cursor:
            cursor.execute("SELECT * FROM document ORDER BY embedding <-> %s LIMIT %s", (query_embedding, num_chunks))
            # Rows are read positionally: column 1 is the chunk text, column 2 its source.
            for row in cursor.fetchall():
                context.append({"text": row[1]})
                sources.append(row[2])

        # Generate response
        prompt = f"query: {query}, context: {context}"
        answer = generate_response(
            llm=self.llm,
            temperature=self.temperature,
            system_content=self.system_content,
            assistant_content=self.assistant_content,
            user_content=prompt[: self.context_length],  # character-based truncation
        )

        # Result
        return {
            "question": query,
            "sources": sources,
            "answer": answer,
        }


def set_credentials(llm):
    """Configure the global `openai` module for the endpoint that serves `llm`.

    Models whose name starts with "gpt" use the OPENAI_* environment
    variables; every other model uses the ANYSCALE_* ones. A missing
    variable raises `KeyError`.
    """
    is_gpt_model = llm.startswith("gpt")
    base_var, key_var = (
        ("OPENAI_API_BASE", "OPENAI_API_KEY")
        if is_gpt_model
        else ("ANYSCALE_API_BASE", "ANYSCALE_API_KEY")
    )
    openai.api_base = os.environ[base_var]
    openai.api_key = os.environ[key_var]


# Generate responses
def generate_responses(
    experiment_name, data_path,
    chunk_size, chunk_overlap, num_chunks,
    embedding_model_name,
    llm, temperature, max_context_length,
    system_content, assistant_content="", num_samples=300):
    """Answer the questions in `data_path` with a `QueryAgent` and save the run.

    Args:
        experiment_name: stem of the output file.
        data_path: JSON-lines file (str or Path); every non-blank line is a
            JSON object with a "question" key.
        chunk_size: recorded in the saved config only (index building is
            disabled below).
        chunk_overlap: recorded in the saved config only.
        num_chunks: number of chunks the agent retrieves per question.
        embedding_model_name: embedding model used by the agent.
        llm: model that generates the answers; also selects the credentials.
        temperature: sampling temperature for the LLM.
        max_context_length: context budget handed to the agent.
        system_content: system message for the LLM.
        assistant_content: assistant message for the LLM.
        num_samples: only the first `num_samples` questions are answered
            (default 300, the previously hard-coded limit); None answers all.

    Side effects:
        Writes {"config": ..., "results": ...} to
        <ROOT_DIR>/experiments/responses/<experiment_name>.json.
    """
    # Set credentials
    set_credentials(llm=llm)

    # Build index (currently disabled; an existing index is assumed)
    # create_index(
    #     embedding_model_name=embedding_model_name,
    #     chunk_size=chunk_size,
    #     chunk_overlap=chunk_overlap,
    # )

    # Query agent
    agent = QueryAgent(
        embedding_model_name=embedding_model_name,
        llm=llm,
        temperature=temperature,
        max_context_length=max_context_length,
        system_content=system_content,
        assistant_content=assistant_content,
    )

    # Load questions; blank lines (e.g. a trailing newline) are skipped
    with open(Path(data_path), "r") as f:
        questions = [json.loads(line)["question"] for line in f if line.strip()]
    questions = questions[:num_samples]

    # Generate responses
    results = [agent(query=query, num_chunks=num_chunks) for query in tqdm(questions)]

    # Save to file
    responses_fp = Path(ROOT_DIR, "experiments", "responses", f"{experiment_name}.json")
    responses_fp.parent.mkdir(parents=True, exist_ok=True)
    config = {
        "experiment_name": experiment_name,
        # str() keeps the config JSON-serializable when a Path is passed in;
        # otherwise json.dump would fail after all responses were generated.
        "data_path": str(data_path),
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "num_chunks": num_chunks,
        "embedding_model_name": embedding_model_name,
        "llm": llm,
        "temperature": temperature,
        "max_context_length": max_context_length,
        "system_content": system_content,
        "assistant_content": assistant_content,
    }
    responses = {
        "config": config,
        "results": results,
    }
    with open(responses_fp, "w") as fp:
        json.dump(responses, fp, indent=4)


def _parse_evaluation(response):
    """Split an evaluator reply into (score, reasoning).

    The first line is expected to hold the numeric score and the remainder
    the reasoning. A reply whose first line is not a number (including the
    empty reply returned after exhausted retries) yields a score of 0.0 with
    the raw reply kept as reasoning, instead of raising.
    """
    score_line, _, reasoning = response.lstrip().partition("\n")
    try:
        score = float(score_line)
    except ValueError:
        return 0.0, response
    return score, reasoning.lstrip("\n")


def evaluate_responses(
    experiment_name, reference_loc, response_loc,
    evaluator, temperature, max_context_length,
    system_content, assistant_content=""):
    """Score generated answers against reference answers with an evaluator LLM.

    Args:
        experiment_name: prefix of the output file name.
        reference_loc: JSON file whose "results" list holds the reference
            answers (items with "question" and "answer").
        response_loc: JSON file whose "results" list holds the generated
            answers (items with "question", "answer" and "sources"), in the
            same order as the references.
        evaluator: model used for scoring; also selects the credentials.
        temperature: sampling temperature for the evaluator.
        max_context_length: budget from which the length of the system and
            assistant content is subtracted to truncate the user message.
        system_content: system message (the scoring instructions).
        assistant_content: assistant message for the evaluator.

    Side effects:
        Writes {"config", "quality_score", "results"} to
        <ROOT_DIR>/experiments/evaluations/<experiment_name>_<evaluator>.json.

    Raises:
        AssertionError: if the two files differ in length or question order.
    """
    # Set credentials
    set_credentials(llm=evaluator)

    # Load answers
    with open(Path(reference_loc), "r") as f:
        references = json.load(f)["results"]
    with open(Path(response_loc), "r") as f:
        generated = json.load(f)["results"]
    assert len(references) == len(generated)

    # Quality score
    results = []
    # Character-based budget for the user message.
    context_length = max_context_length - len(system_content + assistant_content)
    for ref, gen in tqdm(zip(references, generated), total=len(references)):
        assert ref["question"] == gen["question"]
        user_content = str(
            {
                "question": gen["question"],
                "generated_answer": gen["answer"],
                "reference_answer": ref["answer"],
            }
        )[:context_length]

        # Generate response
        response = generate_response(
            llm=evaluator,
            temperature=temperature,
            system_content=system_content,
            assistant_content=assistant_content,
            user_content=user_content,
        )

        # Extract from response; a malformed reply must not abort the whole run
        score, reasoning = _parse_evaluation(response)

        # Store result
        results.append(
            {
                "question": gen["question"],
                "generated_answer": gen["answer"],
                "reference_answer": ref["answer"],
                "score": score,
                "reasoning": reasoning,
                "sources": gen["sources"],
            }
        )

    # Save to file
    evaluator_name = evaluator.split("/")[-1].lower()
    evaluation_fp = Path(ROOT_DIR, "experiments", "evaluations", f"{experiment_name}_{evaluator_name}.json")
    evaluation_fp.parent.mkdir(parents=True, exist_ok=True)
    config = {
        "experiment_name": experiment_name,
        # str() keeps the config JSON-serializable when Path objects are passed.
        "reference_loc": str(reference_loc),
        "response_loc": str(response_loc),
        "evaluator": evaluator,
        "temperature": temperature,
        "max_context_length": max_context_length,
        "system_content": system_content,
        "assistant_content": assistant_content,
    }
    evaluation = {
        "config": config,
        # "retrieval_score": get_retrieval_score(references, generated),
        # Items with a falsy score (0.0 marks an unparsable reply) or an empty
        # reference answer are left out of the average.
        "quality_score": np.mean([item["score"] for item in results if (item["score"] and item["reference_answer"])]),
        "results": results,
    }
    with open(evaluation_fp, "w") as fp:
        json.dump(evaluation, fp, indent=4)

# Assemble the dataset

In [9]:
import json
import os

# Collect the per-question evaluation results of both evaluation files into one list.
evaluation_files = (
    "experiments/evaluations/full-questions-0000_gpt-4.json",
    "experiments/evaluations/full-questions-0300_gpt-4.json",
)

records = []
for relative_path in evaluation_files:
    with open(os.path.join(ROOT_DIR, relative_path)) as f:
        records += json.load(f)["results"]

In [13]:
# One example per evaluated question: target 0 when its score is below 3.5, otherwise 1.
dataset = [
    dict(question=rec["question"], targets=(0 if rec["score"] < 3.5 else 1))
    for rec in records
]

In [15]:
# Save the dataset. The target directory is created first (as the other writers
# in this notebook do) so a fresh checkout without "datasets/" does not fail.
routing_fp = os.path.join(ROOT_DIR, "datasets", "routing.json")
os.makedirs(os.path.dirname(routing_fp), exist_ok=True)
with open(routing_fp, "w") as f:
    json.dump(dataset, f)