In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime). The LLM
# weights loaded later in this notebook are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install langchain
!pip install torch
!pip install accelerate
!pip install sentence-transformers
!pip install streamlit
!pip install streamlit-chat
!pip install faiss-cpu
!pip install tiktoken
!pip install huggingface-hub
!pip install pypdf
!pip install llama-cpp-python


Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.32 (from langchain)
  Downloading langchain_community-0.0.34-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2.0,>=0.1.42 (from langchain)
  Downloading langchain_core-0.1.46-py3-none-any.whl (299 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m299.3/299.3 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Down

In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import PyPDFLoader
import os
import tempfile

In [None]:
def initialize_session_state():
    """Build a fresh chat session state.

    Returns:
        dict: A new dictionary with three keys, each mapped to a new list:
            "history"   - empty list,
            "generated" - list holding the bot's greeting message,
            "past"      - list holding the user's greeting message.
    """
    return dict(
        history=[],
        generated=["Hello! Ask me anything about 🤖"],
        past=["Hey! 👋"],
    )


In [None]:
def conversation_chat(query, chain, session_state):
    """Ask the chain one question and record the exchange.

    Args:
        query: The question text to send.
        chain: Callable invoked with a dict holding "question" and
            "chat_history"; its result must be indexable by "answer".
        session_state: Mapping whose "history" list is passed to the chain
            and then extended in place with a (query, answer) tuple.

    Returns:
        The value found under "answer" in the chain's result.
    """
    history = session_state["history"]
    answer = chain({"question": query, "chat_history": history})["answer"]
    history.append((query, answer))
    return answer


In [None]:
def collect_user_questions():
    """Interactively read questions from standard input.

    Prompts repeatedly until the user types 'done' (case-insensitive,
    surrounding whitespace ignored) or the input stream ends.

    Returns:
        list[str]: The non-blank questions, stripped of surrounding
        whitespace, in the order they were entered.
    """
    questions = []
    print("Enter your questions one by one and type 'done' when finished:")
    while True:
        try:
            user_input = input("Enter your question: ").strip()
        except EOFError:
            # Input stream closed (e.g. non-interactive run): finish cleanly
            # with whatever was collected so far.
            break
        if user_input.lower() == 'done':
            break
        if not user_input:
            # A blank line is not a question; skip it instead of queuing an
            # empty prompt for the model.
            continue
        questions.append(user_input)
    return questions

In [None]:
def display_chat_history(chain, session_state):
    """Collect questions from the user, then print each question with its answer.

    All questions are gathered first via collect_user_questions(); only then
    is each one sent through conversation_chat() and printed.

    Args:
        chain: Forwarded unchanged to conversation_chat().
        session_state: Forwarded unchanged to conversation_chat().
    """
    for asked in collect_user_questions():
        reply = conversation_chat(asked, chain, session_state)
        print(f"Question: {asked}")
        print(f"Answer: {reply}\n")


In [None]:
def create_conversational_chain(
    vector_store,
    model_path="/content/drive/MyDrive/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    temperature=0.75,
    n_ctx=4096,
    k=2,
):
    """Build a ConversationalRetrievalChain backed by a local LlamaCpp model.

    Args:
        vector_store: Store whose ``as_retriever`` supplies documents to the chain.
        model_path: Path of the GGUF model file handed to LlamaCpp.
        temperature: Sampling temperature handed to LlamaCpp.
        n_ctx: Context window size handed to LlamaCpp.
        k: Number of documents the retriever is asked for per query.

    Returns:
        The chain produced by ``ConversationalRetrievalChain.from_llm``.
    """
    print('Creating conversational chain...')
    print("Started creating LLM...")

    llm = LlamaCpp(
        streaming=True,
        model_path=model_path,
        temperature=temperature,
        top_p=1,
        verbose=True,
        n_ctx=n_ctx,
    )

    # NOTE(review): the chain is given its own memory under "chat_history",
    # while conversation_chat() in this file also passes a "chat_history"
    # input on every call - confirm which of the two the chain actually uses.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": k}),
        memory=memory,
    )

    print("Completed creating LLM!")

    return chain

In [None]:
def _load_pdf_documents(file_paths):
    """Load every PDF in ``file_paths`` and return the documents as one flat list.

    Files whose extension is not ``.pdf`` (compared case-insensitively) are
    reported and skipped. A path that is not an existing file raises
    FileNotFoundError.
    """
    documents = []
    for file_path in file_paths:
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"No such file: {file_path}")
        print("loading: ", file_path)
        if os.path.splitext(file_path)[1].lower() != ".pdf":
            print("skipping unsupported file type: ", file_path)
            continue
        # Load straight from the given path: no temporary copy is made, so
        # there is nothing to clean up afterwards.
        documents.extend(PyPDFLoader(file_path).load())
    return documents


def main(pdf_paths=None, chunk_size=10000, chunk_overlap=20):
    """Index PDF files and run an interactive question/answer session over them.

    Args:
        pdf_paths: Paths of the PDF files to index. ``None`` selects the
            notebook's default paper under /content.
        chunk_size: ``chunk_size`` given to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` given to RecursiveCharacterTextSplitter.

    Raises:
        FileNotFoundError: If one of the paths is not an existing file.
    """
    session_state = initialize_session_state()
    print("ChatBot using Mistral-7B-Instruct LLM :books:")

    if pdf_paths is None:
        pdf_paths = ["/content/Chainpoll A high efficacy method for LLM hallucination detection - 2310.18344.pdf"]
    if not pdf_paths:
        return

    documents = _load_pdf_documents(pdf_paths)
    if not documents:
        # Nothing was loaded; stop before trying to build an index from an
        # empty document list.
        print("No PDF content could be loaded; nothing to index.")
        return

    # NOTE(review): the embedding model printed in this notebook's output
    # reports max_seq_length 256, so chunks of this size are presumably
    # truncated before being embedded - consider a smaller chunk_size; confirm.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    text_chunks = text_splitter.split_documents(documents)

    # Report only the count; printing every chunk floods the cell output.
    print("chunks: ", len(text_chunks))

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
    )

    print("embeddings:\n", embeddings)

    vector_store = FAISS.from_documents(text_chunks, embedding=embeddings)

    chain = create_conversational_chain(vector_store)

    display_chat_history(chain, session_state)


# Entry point: start the chatbot when this cell (or the exported script) is
# executed directly rather than imported.
if __name__ == "__main__":
    main()

ChatBot using Mistral-7B-Instruct LLM :books:
loading:  /content/Chainpoll A high efficacy method for LLM hallucination detection - 2310.18344.pdf
chunks:
 [Document(page_content='ChainPoll : A H IGHEFFICACY METHOD FOR LLM\nHALLUCINATION DETECTION\nRobert Friel\nGalileo Technologies Inc.Atindriyo Sanyal\nGalileo Technologies Inc.\nOctober 31, 2023\nABSTRACT\nLarge language models (LLMs) have witnessed significant advancements in generating coherent, intelligent,\nand contextually relevant responses. However, the presence of hallucinations – inaccurate or unmotivated\nclaims – remains a persistent challenge, motivating the development of automated metrics for the detection of\nhallucinations in LLM outputs.\nWe make two contributions: ChainPoll , a novel hallucination detection methodology that substantially out-\nperforms existing alternatives, and RealHall , a carefully curated suite of benchmark datasets for evaluating\nhallucination detection metrics proposed in recent literature.\n

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

embeddings:
 client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
) model_name='sentence-transformers/all-MiniLM-L6-v2' cache_folder=None model_kwargs={'device': 'cpu'} encode_kwargs={} multi_process=False show_progress=False
Creating conversational chain...
Started creating LLM...


llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /content/drive/MyDrive/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:    

Completed creating LLM!
Enter your questions one by one and type 'done' when finished:
Enter your question: What is the significance of ChainPoll achieving superior performance across all four benchmarks in RealHall?
Enter your question: How does RealHall address the limitations of existing datasets used in prior work on hallucination detection?
Enter your question: What are the key contributions of ChainPoll in the field of hallucination detection for LLMs?
Enter your question: Done


  warn_deprecated(

llama_print_timings:        load time =     850.28 ms
llama_print_timings:      sample time =      40.35 ms /    74 runs   (    0.55 ms per token,  1834.04 tokens per second)
llama_print_timings: prompt eval time =  279752.85 ms /  1992 tokens (  140.44 ms per token,     7.12 tokens per second)
llama_print_timings:        eval time =   14691.45 ms /    74 runs   (  198.53 ms per token,     5.04 tokens per second)
llama_print_timings:       total time =  295338.41 ms /  2066 tokens
Llama.generate: prefix-match hit


Question: What is the significance of ChainPoll achieving superior performance across all four benchmarks in RealHall?
Answer:  The significance of ChainPoll achieving superior performance across all four benchmarks in RealHall is that it demonstrates the effectiveness of the method for detecting both open-domain and closed-domain hallucinations. It also highlights the importance of carefully engineering prompts for hallucination detection, as well as the value of Boolean judgments over numeric scores.




llama_print_timings:        load time =     850.28 ms
llama_print_timings:      sample time =      14.06 ms /    26 runs   (    0.54 ms per token,  1848.56 tokens per second)
llama_print_timings: prompt eval time =   20482.22 ms /   168 tokens (  121.92 ms per token,     8.20 tokens per second)
llama_print_timings:        eval time =    4901.41 ms /    25 runs   (  196.06 ms per token,     5.10 tokens per second)
llama_print_timings:       total time =   25525.85 ms /   193 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     850.28 ms
llama_print_timings:      sample time =      77.08 ms /   138 runs   (    0.56 ms per token,  1790.46 tokens per second)
llama_print_timings: prompt eval time =  231663.20 ms /  1686 tokens (  137.40 ms per token,     7.28 tokens per second)
llama_print_timings:        eval time =   27041.36 ms /   137 runs   (  197.38 ms per token,     5.07 tokens per second)
llama_print_timings:       total time =  259733.30 ms /  1823 

Question: How does RealHall address the limitations of existing datasets used in prior work on hallucination detection?
Answer:   Limitations of existing datasets in prior work on hallucination detection include lack of diversity, challenge, and realism. RealHall addresses these limitations by carefully selecting four datasets that meet criteria for Challenge, Realism, and Task Diversity. For example, RealHall Closed evaluates how well a metric can detect closed-domain hallucinations in Retrieval Augmented Generation (RAG) use cases by using COVID-QA with retrieval and DROP. RealHall Open evaluates how well a metric can detect open-domain hallucinations in realistic settings by using the Open Assistant prompts and TriviaQA datasets.




llama_print_timings:        load time =     850.28 ms
llama_print_timings:      sample time =      14.84 ms /    26 runs   (    0.57 ms per token,  1752.26 tokens per second)
llama_print_timings: prompt eval time =   43756.84 ms /   334 tokens (  131.01 ms per token,     7.63 tokens per second)
llama_print_timings:        eval time =    4943.24 ms /    25 runs   (  197.73 ms per token,     5.06 tokens per second)
llama_print_timings:       total time =   48893.82 ms /   359 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     850.28 ms
llama_print_timings:      sample time =      64.00 ms /   113 runs   (    0.57 ms per token,  1765.71 tokens per second)
llama_print_timings: prompt eval time =  232252.50 ms /  1686 tokens (  137.75 ms per token,     7.26 tokens per second)
llama_print_timings:        eval time =   22547.03 ms /   112 runs   (  201.31 ms per token,     4.97 tokens per second)
llama_print_timings:       total time =  255760.80 ms /  1798 

Question: What are the key contributions of ChainPoll in the field of hallucination detection for LLMs?
Answer:   The main limitations of existing datasets used in prior work on hallucination detection are that they do not meet the criteria for challenge, realism, and task diversity. RealHall addresses these limitations by carefully selecting four datasets that meet these criteria and dividing them into two groups of two: RealHall Closed and RealHall Open. The former evaluates how well a metric can detect closed-domain hallucinations while the latter tests open-domain hallucinations. This allows for more comprehensive evaluation of metrics in real-world use cases.

