## Load the packages

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from pathlib import Path
from tqdm.auto import tqdm
from dotenv import load_dotenv


load_dotenv();

  from .autonotebook import tqdm as notebook_tqdm


True

## Define the global constants

In [2]:
DEVICE = 'cuda'
CHAT_MODEL_NAME = 'premai-io/prem-1B-chat'

CHUNK_SIZE = 512
CHUNK_OVERLAP = 50
TOP_K_DOCS = 8
MIN_SCORE=0.3

RAG_SYSTEM_PROMPT = """You are a helpful assistant, optimized for RAG. Please answer the user question, based on the given context. But avoid sentences like 'based on the given context' in the response.
If you can't find the answer in the context, just say that you don't know. Keep the response short. Don't quote anything from the context directly.

Context:
{context}
"""

## Embed the docs

#### Load the docs

In [3]:
docs = []

for f in tqdm(list(Path('docs/papers').iterdir())):
    if f.suffix == '.pdf':
        loader = PyPDFLoader(f)
        docs.extend(loader.load())

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:11<00:00,  1.23it/s]


#### Split the docs

In [4]:
tokenizer = AutoTokenizer.from_pretrained(CHAT_MODEL_NAME)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(tokenizer, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
splits = text_splitter.split_documents(docs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### Create the vectorstore

In [5]:
embedding_gen = OpenAIEmbeddings(model="text-embedding-3-large", api_key=os.environ["OPENAI_API_KEY"])

vectorstore = Qdrant.from_documents(
    splits,
    embedding_gen,
    location=":memory:",  # Local mode with in-memory storage only
    collection_name="my_documents",
)

In [20]:
def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])


def generate_result(model, tokenizer, messages, terminators):
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Generate
    inputs = tokenizer(prompt, return_attention_mask=False, return_tensors="pt", add_special_tokens=False)
    input_ids = inputs['input_ids']
    input_ids = input_ids.to(model.device)
    res = model.generate(input_ids=input_ids, max_new_tokens=400, pad_token_id=tokenizer.pad_token_id, eos_token_id=terminators, do_sample=False)
    generated_text = tokenizer.decode(res[0][input_ids.shape[1]:], skip_special_tokens=True).strip()
    return generated_text


def ask_question(ques, vectorstore, model, tokenizer, terminators):
    docs_with_scores = vectorstore.similarity_search_with_score(ques, k=TOP_K_DOCS)
    docs = [doc for doc, score in docs_with_scores if score > MIN_SCORE]
    if not docs:
        raise Exception('No docs found')
    relevant_context = format_docs(docs)
    
    prompt = ChatPromptTemplate.from_messages(
        [("system", RAG_SYSTEM_PROMPT), ("user", "{input}")]
    )
    messages = prompt.invoke({'context': relevant_context, 'input': 'hi'}).messages
    messages = [
        {'role': 'system', 'content': messages[0].content},
        {'role': 'user', 'content': messages[1].content},
    ]
    res = generate_result(model, tokenizer, messages, terminators)
    return {'response': res, 'relevant_docs': [d.page_content for d in docs]}

## Chat

#### Load the `premai-io/prem-1B-chat` model from Huggingface.

In [7]:
# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained('premai-io/prem-1B-chat', torch_dtype=torch.bfloat16)
model = model.to(DEVICE)

# Setup terminators
terminators = [tokenizer.eos_token_id, tokenizer.encode('<|eot_id|>', add_special_tokens=False)[0]]

#### Ask the questions

In [25]:
questions = [
    "What is the key feature of ChatEval compared to the other evaluation strategies? Give a short answer.",
    "How does the Infini-attention technique aim to address the problem related to limited context in generative models? Give a short answer.",
]

In [26]:
for i, ques in enumerate(questions):
    res = ask_question(ques, vectorstore, model, tokenizer, terminators)
    print('\033[94m' + ques + '\033[0m')
    print('\033[96m' + res['response'] + '\033[0m' + '\n\n')

[94mWhat is the key feature of ChatEval compared to the other evaluation strategies? Give a short answer.[0m
[96mChatEval is a multi-agent evaluation framework that employs a LLM-based approach to evaluate text. The framework is designed to enable human-like evaluation of text by leveraging the strengths of different LLMs. The framework consists of a single-agent model, which is responsible for generating responses to the evaluation task, and a multi-agent model, which is responsible for collaborating with the single-agent model to generate responses. The framework is designed to be scalable and flexible, allowing for the evaluation of text from different perspectives and with different levels of human expertise.

In this paper, we present the design of ChatEval and discuss the key components of the framework. We also discuss the effect of different communication strategies on the evaluation performance of ChatEval. We conclude by discussing the future of ChatEval and its potential 