# Import Libraries

In [None]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import pandas as pd
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain_core.messages import HumanMessage
from langchain_mistralai.chat_models import ChatMistralAI
from WQ.WikiQuery.src import constants
from dotenv import load_dotenv
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter

# Pick the compute device: the GPU when PyTorch reports CUDA as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Read Dataset
# Load the Wikipedia crypto-articles CSV into a DataFrame. The path is
# relative, so it is resolved against the current working directory.
crypto_wiki_articles = pd.read_csv('WQ/WikiQuery/data/Wikipedia Crypto Articles.csv')

In [None]:
# Load dataframe content into a document
# The "article" column supplies the page content of each loaded document.

articles = DataFrameLoader(crypto_wiki_articles,
                           page_content_column = "article")
document = articles.load()

# Chunk the loaded documents with LangChain's recursive character splitter.
# Sizes are measured with len(), i.e. in characters: chunks of up to 500
# characters with a 20-character overlap, split on the listed separators
# (paragraph break, line break, full stop).

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
    separators=['\n\n', '\n', '.'],
    length_function=len,
)


# Apply the splitter to every loaded document.
document_chunks = text_splitter.split_documents(document)

# Create a Knowledge Base
Let's start by loading the content into a pandas DataFrame.

In [None]:
# One row per loaded document, with its page content in a single "text" column.
df = pd.DataFrame({"text": [loaded.page_content for loaded in document]})
df.head(10)

Unnamed: 0,text
0,"A cryptocurrency, crypto-currency, or crypto i..."
1,Bitcoin (abbreviation: BTC or XBT; sign: ₿) is...
2,"Digital currency (digital money, electronic mo..."
3,A central bank digital currency (CBDC; also ca...
4,Litecoin (Abbreviation: LTC; sign: Ł) is a dec...
5,Satoshi Nakamoto is the name used by the pres...
6,A blockchain is a distributed ledger with grow...
7,Ethereum is a decentralized blockchain with sm...
8,"Charlie Lee is a computer scientist, best know..."
9,Namecoin (Abbreviation: NMC; sign: \n \n \...


# Generate the Test Set

In [None]:
# Split articles into chunks of at most 5000 words to meet prompt max tokens
def chunk_article(article, max_chunk_words=5000):
    """Split an article into consecutive chunks of whitespace-separated words.

    Args:
        article: The article text. It is tokenised with ``str.split()``, so
            any run of whitespace separates two words.
        max_chunk_words: Maximum number of words per chunk (an integer).

    Returns:
        A list of strings, each holding up to ``max_chunk_words`` words
        joined by single spaces. Only the last chunk may be shorter, and an
        article without any words yields an empty list.
    """
    tokens = article.split()
    # A limit below one still closes a chunk after every single word, so the
    # effective stride never drops under 1.
    stride = max(1, max_chunk_words)
    return [
        ' '.join(tokens[start:start + stride])
        for start in range(0, len(tokens), stride)
    ]

# Flatten the chunks of every article into one list, keeping article order
chunked_articles = [
    piece
    for text in df['text']
    for piece in chunk_article(text)
]

# Build a new DataFrame with one chunk per row in the 'chunked_article' column
chunked_df = pd.DataFrame({'chunked_article': chunked_articles})


In [None]:
# An OpenAI model is needed for evaluation with the giskard library, which
# reads the key from the OPENAI_API_KEY environment variable.
# `os` is imported here because the notebook's import cell does not provide
# it; without this import the assignment below raises NameError.
import os

os.environ["OPENAI_API_KEY"] = "OPEN AI API KEY" # Add your OPEN AI API KEY

In [None]:
from giskard.rag import KnowledgeBase, generate_testset

# The knowledge base has to exist before a test set can be generated from it.
# Previously it was created in a later cell, so a top-to-bottom run failed
# with NameError. It is built from the DataFrame of chunked articles.
knowledge_base = KnowledgeBase(chunked_df)

# Ask giskard to generate 60 questions from the knowledge base.
# NOTE(review): agent_description="A" looks like a truncated placeholder;
# consider replacing it with a real description of the assistant.
testset = generate_testset(
    knowledge_base,
    num_questions=60,
    agent_description="A",
)

In [None]:
# Preview the first three generated samples from the test set
test_set_df = testset.to_pandas()

for number, (row_label, sample) in enumerate(test_set_df.head(3).iterrows(), start=1):
    print(f"Question {number}: {sample['question']}")
    print(f"Reference answer: {sample['reference_answer']}")
    print("Reference context:")
    print(sample['reference_context'])
    print("******************", end="\n\n")

Question 1: When was Titcoin launched and who are its founders?
Reference answer: Titcoin was launched in 2014 and was founded by Edward Mansfield, Richard Allen, and a third anonymous individual.
Reference context:
Document 21: Titcoin (Ticker Symbol: TIT) is a cryptocurrency launched in 2014. Titcoin's blockchain is derived from the Bitcoin source code, with modifications to improve transaction speed and efficiency. Titcoin is intended for the adult entertainment industry to allow users to pay for adult products and services without the fear of incriminating payment histories appearing on their credit cards.In 2015, Titcoin received two nominations at the 2015 XBIZ Awards. == History == Titcoin was founded by Edward Mansfield, Richard Allen, and a third anonymous individual. The founders developed Titcoin for the adult entertainment industry as a cash alternative payment system for performing anonymous transactions.On June 21, 2014, the Titcoin cryptocurrency wallet and source code w

In [None]:
# Save the test set to a file
# (written to "test-set.jsonl", a path relative to the working directory).
testset.save("test-set.jsonl")

# Prepare the Prompt Template


In [None]:
# Prompt text for the RAG chain. It has two placeholders: {context} and
# {question}. The leading spaces on the continuation lines are inside the
# string literal and are therefore part of the prompt sent to the model.
template = """You are an AI Assistant specialized in cryptocurrencies named WikiQuery.
    You provide detailed answers to customer inquiries within the scope of cryptocurrency.
    If a question is outside this scope, you politely decline without revealing additional information whatsoever.
    Your demeanor is professional and amicable, and you aim to guide and assist users while maintaining ethical standards.
    Your responses are concise and informative, limited to one paragraph.
    Context: {context}
    Question: {question}
    Helpful Answer:"""


# Wrap the raw string in a PromptTemplate that declares both placeholders.
prompt = PromptTemplate(input_variables=["context", "question"], template=template)

# Create the RAG Chain

In [None]:
# Mistral chat model client; the string below is a placeholder for a real key.
mistral = ChatMistralAI(mistral_api_key="MISTRAL API KEY") # Add your MISTRAL API KEY

In [None]:
# Embedding model for the vector store. SentenceTransformerEmbeddings is never
# imported in this notebook (NameError); HuggingFaceEmbeddings, which is
# imported above, is the class LangChain aliases under that name.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Open the Chroma store persisted under WQ/WikiQuery/vector_db.
# NOTE(review): presumably the store was built with this same embedding
# model; confirm, otherwise similarity search results will be meaningless.
vectore_database = Chroma(persist_directory="WQ/WikiQuery/vector_db",embedding_function=embedding_model)
# Expose the store as a retriever for use in the chain.
retriever = vectore_database.as_retriever()

In [None]:
# Create the chain
# The input is a dict with a "question" key. The first step builds the prompt
# variables: "context" is the retriever's output for the question, and
# "question" is passed through unchanged. The prompt is then rendered, sent
# to the Mistral model, and the model output is parsed into a plain string.
# NOTE(review): the retriever output is fed to the prompt without any
# formatting step, so {context} presumably receives the string form of the
# retrieved Document objects rather than their joined text; confirm that is
# intended.
chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | mistral # This is the LLM used in the chain to generate the answers.
    | StrOutputParser()
)

# Evaluating the Model on the Test Set

In [None]:
# Adapter that invokes the chain with a specific question and returns the answer.

def answer_fn(question, history=None):
    """Run the RAG chain on one question and return its answer.

    Args:
        question: The question to put to the chain.
        history: Accepted so callers may pass it, but not used here.

    Returns:
        Whatever ``chain.invoke`` returns for the question.
    """
    chain_input = {"question": question}
    return chain.invoke(chain_input)

We can now use the evaluate() function to evaluate the model on the test set. This function will compare the answers from the chain with the reference answers in the test set.

In [None]:
from giskard.rag import evaluate

# Evaluate answer_fn against the generated test set, passing the knowledge
# base the test set was generated from.
report = evaluate(answer_fn, testset=testset, knowledge_base=knowledge_base)

Asking questions to the agent:   0%|          | 0/29 [00:00<?, ?it/s]

Correctness evaluation:   0%|          | 0/29 [00:00<?, ?it/s]

In [None]:
# Display the report.
# `display` is not imported in this notebook; it is presumably the built-in
# that IPython/Jupyter provides.
display(report)