In [3]:
import os
from pathlib import Path

EFS_DIR = Path("/efs/shared_storage/pcmoritz")
ROOT_DIR = Path(os.getcwd()).parent
print (ROOT_DIR)

# Mappings
embedding_dimensions = {
    "thenlper/gte-base": 768,
    "BAAI/bge-large-en": 1024,
    "text-embedding-ada-002": 1536
}
max_context_lengths = {
    "gpt-4": 8192,
    "gpt-3.5-turbo": 4096,
    "gpt-3.5-turbo-16k": 16384,
    "meta-llama/Llama-2-7b-chat-hf": 4096,
    "meta-llama/Llama-2-13b-chat-hf": 4096,
    "meta-llama/Llama-2-70b-chat-hf": 4096,
}

experiment_name = "full-questions-0300"

# Evaluate responses
evaluation_system_content = """
        Your job is to rate the quality of our generated answer {generated_answer}
        given a query {query} and a reference answer {reference_answer}.
        Your score has to be between 1 and 5.
        You must return your response in a line with only the score.
        Do not return answers in any other format.
        On a separate line provide your reasoning for the score as well.
        """
evaluate_responses(
        experiment_name=experiment_name,
        reference_loc=str(Path(ROOT_DIR, "experiments", "responses", f"gpt-4-full-questions-0300.json")), 
        response_loc=str(Path(ROOT_DIR, "experiments", "responses", f"llama-2-70b-full-questions-0300.json")),
        evaluator="gpt-4", 
        temperature=0.0, 
        max_context_length=max_context_lengths["gpt-4"],
        system_content=evaluation_system_content)

/home/ray/default/llm-applications


100%|██████████| 300/300 [34:22<00:00,  6.88s/it]


In [2]:
import time
import openai
import json
from tqdm import tqdm
import numpy as np

from dotenv import load_dotenv; load_dotenv(override=True)

def generate_response(
    llm, temperature=0.0, 
    system_content="", assistant_content="", user_content="", 
    max_retries=3, retry_interval=60):
    """Generate response from an LLM."""
    retry_count = 0
    while retry_count < max_retries:
        try:
            response = openai.ChatCompletion.create(
                model=llm,
                temperature=temperature,
                messages=[
                    {"role": "system", "content": system_content},
                    {"role": "assistant", "content": assistant_content},
                    {"role": "user", "content": user_content},
                ],
            )
            return response["choices"][-1]["message"]["content"]
        except Exception as e:
            print(e)
            time.sleep(retry_interval)  # default is per-minute rate limits
            retry_count += 1
    return ""


class QueryAgent:
    def __init__(self, embedding_model_name="thenlper/gte-base",
                 llm="meta-llama/Llama-2-70b-chat-hf", 
                 temperature=0.0, max_context_length=4096,
                 system_content="", assistant_content=""):
        
        # Embedding model
        model_kwargs = {"device": "cuda"}
        encode_kwargs = {"device": "cuda", "batch_size": 100}
        if embedding_model_name == "text-embedding-ada-002":
            self.embedding_model = OpenAIEmbeddings(
                model=embedding_model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs,
                openai_api_base=os.environ["OPENAI_API_BASE"],
                openai_api_key=os.environ["OPENAI_API_KEY"])
        else:
            self.embedding_model = HuggingFaceEmbeddings(
                model_name=embedding_model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs)
            
        self.llm = llm
        self.temperature = temperature
        self.context_length = max_context_length - len(system_content + assistant_content)
        self.system_content = system_content
        self.assistant_content = assistant_content

        # VectorDB connection
        self.conn = psycopg.connect(os.environ["DB_CONNECTION_STRING"])
        register_vector(self.conn)

    def __call__(self, query, num_chunks=6):
        # Get context
        embedding = np.array(self.embedding_model.embed_query(query))
        with self.conn.cursor() as cur:
            cur.execute("SELECT * FROM document ORDER BY embedding <-> %s LIMIT %s", (embedding, num_chunks))
            rows = cur.fetchall()
            context = [{"text": row[1]} for row in rows]
            sources = [row[2] for row in rows]

        # Generate response
        user_content = f"query: {query}, context: {context}"
        answer = generate_response(
            llm=self.llm,
            temperature=self.temperature,
            system_content=self.system_content,
            assistant_content=self.assistant_content,
            user_content=user_content[: self.context_length],
        )

        # Result
        result = {
            "question": query,
            "sources": sources,
            "answer": answer,
        }
        return result


def set_credentials(llm):
    if llm.startswith("gpt"):
        openai.api_base = os.environ["OPENAI_API_BASE"]
        openai.api_key = os.environ["OPENAI_API_KEY"]
    else:
        openai.api_base = os.environ["ANYSCALE_API_BASE"]
        openai.api_key = os.environ["ANYSCALE_API_KEY"]


# Generate responses
def generate_responses(
    experiment_name, data_path, 
    chunk_size, chunk_overlap, num_chunks,
    embedding_model_name, 
    llm, temperature, max_context_length, 
    system_content, assistant_content=""):
    
    # Set credentials
    set_credentials(llm=llm)
    
    # Build index
    # create_index(
    #     embedding_model_name=embedding_model_name,
    #     chunk_size=chunk_size,
    #     chunk_overlap=chunk_overlap,
    # )
    
    # Query agent
    agent = QueryAgent(
        embedding_model_name=embedding_model_name,
        llm=llm,
        temperature=temperature,
        max_context_length=max_context_length,
        system_content=system_content,
        assistant_content=assistant_content,
    )

    # Generate responses
    results = []
    with open(Path(data_path), "r") as f:
        questions = [json.loads(item)["question"] for item in list(f)][:300]
    for query in tqdm(questions):
        result = agent(query=query, num_chunks=num_chunks)
        results.append(result)

    # Save to file
    responses_fp = Path(ROOT_DIR, "experiments", "responses", f"{experiment_name}.json")
    responses_fp.parent.mkdir(parents=True, exist_ok=True)
    config = {
        "experiment_name": experiment_name,
        "data_path": data_path,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "num_chunks": num_chunks,
        "embedding_model_name": embedding_model_name,
        "llm": llm,
        "temperature": temperature,
        "max_context_length": max_context_length,
        "system_content": system_content,
        "assistant_content": assistant_content,
    }
    responses = {
        "config": config,
        "results": results,
    }
    with open(responses_fp, "w") as fp:
        json.dump(responses, fp, indent=4)


def evaluate_responses(
    experiment_name, reference_loc, response_loc,
    evaluator, temperature, max_context_length,
    system_content, assistant_content=""):
    
    # Set credentials
    set_credentials(llm=evaluator)
    
    # Load answers
    with open(Path(reference_loc), "r") as f:
        references = [item for item in json.load(f)["results"]]
    with open(Path(response_loc), "r") as f:
        generated = [item for item in json.load(f)["results"]]
    assert len(references) == len(generated)

    # Quality score
    results = []
    context_length = max_context_length - len(system_content + assistant_content)
    for ref, gen in tqdm(zip(references, generated), total=len(references)):
        assert ref["question"] == gen["question"]
        user_content = str(
            {
                "question": gen["question"],
                "generated_answer": gen["answer"],
                "reference_answer": ref["answer"],
            }
        )[:context_length]

        # Generate response
        response = generate_response(
            llm=evaluator,
            temperature=temperature,
            system_content=system_content,
            assistant_content=assistant_content,
            user_content=user_content,
        )

        # Extract from response
        score, reasoning = response.split("\n", 1)

        # Store result
        result = {
            "question": gen["question"],
            "generated_answer": gen["answer"],
            "reference_answer": ref["answer"],
            "score": float(score),
            "reasoning": reasoning.lstrip("\n"),
            "sources": gen["sources"],
        }
        results.append(result)

    # Save to file
    evaluator_name = evaluator.split("/")[-1].lower()
    evaluation_fp = Path(ROOT_DIR, "experiments", "evaluations", f"{experiment_name}_{evaluator_name}.json")
    evaluation_fp.parent.mkdir(parents=True, exist_ok=True)
    config = {
        "experiment_name": experiment_name,
        "reference_loc": reference_loc,
        "response_loc": response_loc,
        "evaluator": evaluator,
        "temperature": temperature,
        "max_context_length": max_context_length,
        "system_content": system_content,
        "assistant_content": assistant_content,
    }
    evaluation = {
        "config": config,
        # "retrieval_score": get_retrieval_score(references, generated),
        "quality_score": np.mean([item["score"] for item in results if (item["score"] and item["reference_answer"])]),
        "results": results,
    }
    with open(evaluation_fp, "w") as fp:
        json.dump(evaluation, fp, indent=4)

# Assemble the dataset

In [9]:
import json
import os

records = []

with open(os.path.join(ROOT_DIR, "experiments/evaluations/full-questions-0000_gpt-4.json")) as f:
    records.extend(json.loads(f.read())["results"])

with open(os.path.join(ROOT_DIR, "experiments/evaluations/full-questions-0300_gpt-4.json")) as f:
    records.extend(json.loads(f.read())["results"])

In [13]:
dataset = [{"question": record["question"], "targets": 0 if record["score"] < 3.5 else 1} for record in records]

In [15]:
with open(os.path.join(ROOT_DIR, "datasets", "routing.json"), "w") as f:
    json.dump(dataset, f)

# Manual routing

In [2]:
import json
import os

import os
from pathlib import Path

EFS_DIR = Path("/efs/shared_storage/pcmoritz")
ROOT_DIR = Path(os.getcwd()).parent
print (ROOT_DIR)

records = []

with open(os.path.join(ROOT_DIR, "experiments/evaluations/full-questions-0000_gpt-4.json")) as f:
    records.extend(json.loads(f.read())["results"])

with open(os.path.join(ROOT_DIR, "experiments/evaluations/full-questions-0300_gpt-4.json")) as f:
    records.extend(json.loads(f.read())["results"])

/home/ray/default/llm-applications


In [18]:
!pip install spacy
!python -m spacy download en_core_web_md


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.6.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spac

In [22]:
import spacy
nlp = spacy.load('en_core_web_md')

doc = nlp(records[2]["question"])
for token in doc:
    print(token.text, token.lemma_)

What what
's be
the the
difference difference
between between
learner learner
worker worker
and and
local local
worker worker
? ?


In [12]:
import numpy as np

# np.mean([record["score"] for record in records if "example" in record["question"]])
np.mean([record["score"] for record in records if "error" in record["question"]])


3.625

In [13]:
# np.mean([record["score"] for record in records if "example" not in record["question"]])
np.mean([record["score"] for record in records if "error" not in record["question"]])

3.896114864864865

In [9]:
records

[{'question': 'how can I use leela chess zero for a similar two player board game called breakthrough?',
  'generated_answer': "Based on the provided context, it seems that LeelaChessZero is an RL agent designed for playing chess, and it is an extension of the AlphaZero algorithm to handle multi-agent competitive environments. It is implemented using the Ray RLlib library, which provides a Python interface for building and training RL agents.\n\nTo use LeelaChessZero for a similar two-player board game called Breakthrough, you would need to make several modifications to the original code. Here are some suggestions on how to approach this:\n\n1. Modify the game environment: The first step would be to modify the game environment to match the rules of Breakthrough. This would involve creating a new class that inherits from the `Environment` class in RLlib, and implementing the necessary methods to simulate the game state, actions, and rewards.\n2. Adapt the LeelaChessZero model: The Leela

# Spacy pipeline

In [31]:
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of stopwords
nlp = spacy.load('en_core_web_md')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words]

    # return preprocessed list of tokens
    return mytokens

spacy_tokenizer(records[1]["question"])

['num_sample', 'tune', '?']

In [30]:
records[1]["question"]

'what is num_samples in tune?'

In [55]:
from sklearn.model_selection import train_test_split

X = [record["question"] for record in records]
ylabels = [record["score"] >= 4 for record in records]
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))


In [57]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)



In [58]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.6944444444444444
Logistic Regression Precision: 0.7345679012345679
Logistic Regression Recall: 0.9083969465648855


In [59]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_train)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_train, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_train, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_train, predicted))

Logistic Regression Accuracy: 0.9285714285714286
Logistic Regression Precision: 0.9090909090909091
Logistic Regression Recall: 1.0


In [63]:
data = {record["question"]: record["score"] for record in records}

(np.mean([data[x] for i, x in enumerate(X_test) if not predicted[i]]), np.mean([data[x] for i, x in enumerate(X_test) if predicted[i]]))

(4.0625, 3.817857142857143)

In [64]:


np.mean([record["score"] for record in records])

3.8925