### Import Libraries

In [4]:
import torch # Pytorch library for deep learning and tensor operations
import pandas as pd #Library for data manipulation and analysis
import wget # For downloading files from the internet

# Import modules from LlamaIndex for managing and quering indices
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP # For Llama model usage via C++ bindings
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt, # Converts structured messages to a prompt string
    completion_to_prompt, # Converts a model's completion to a prompt format
)

# Additional imports from LlamaIndex for building and managing indices
from llama_index.core import (
  SimpleDirectoryReader,
  VectorStoreIndex,
  ServiceContext,
)

# Set global tokenizer for LlamaIndex
from llama_index.core import set_global_tokenizer

# Transformers library for pre-trained model tokenization and embedding
from transformers import AutoTokenizer

# Embedding utilities for LlamaIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding # Embedding models from HuggingFace

# Importing SentenceTransformers for advanced sentence embeddings
from sentence_transformers import SentenceTransformer, util

# Metrics from sklearn for evaluation
from sklearn.metrics import precision_score, recall_score

# Natural Language Toolkit (NLTK) for tokenization and BLEU score calculation
import nltk  # Standard NLP library
from nltk.tokenize import word_tokenize  # Tokenizer for splitting sentences into words
from nltk.translate.bleu_score import (
    sentence_bleu,  # Calculate BLEU for individual sentences
    corpus_bleu,   # Calculate BLEU for a corpus
    SmoothingFunction # Adds smoothing to BLEU calculations
)





  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


### Download model

In [5]:
#Create function to download llama 2 model from hugging face to local machine for faster computation
def bar_custom(current, total, width=80):
    print("Downloading %d%% [%d/%d] bytes" % (current/total * 100, current, total))

model_url = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf"
wget.download(model_url)

'llama-2-7b-chat.Q2_K.gguf'

### Wrapping Llama 2 in LlamaCPP
Wrapping the LLaMA 2 model in a LlamaCPP mode offers several significant benefits. These revolve around perfomance optimization, ease of integration and flexibility when working with large langugage models. Some benefits are:
1. Performance optimization
2. Scalability
3. Integration Flexibility
4. Model Efficiency


In [6]:
def select_llm() -> LlamaCPP:
    """
    Initializes and returns a LlamaCPP instance with specific configurations.

    Returns:
        LlamaCPP: Configured instance of the LlamaCPP language model.
    """
    return LlamaCPP(
    model_path=r"C:\Users\asus\llama-2-7b-chat.Q2_K.gguf", # Path to the LLaMA model file
    temperature=0.1, # Sampling temperature (Controls the randomness in generated text)
    max_new_tokens=500, # Maximum number of tokens to generate in a single response
    context_window=3900, # Size of the model's context window (maximum input length in tokens) 
    generate_kwargs={}, # Additional keyword arguments for text generation
    model_kwargs={"n_gpu_layers":1}, # Model-specific arguments, such as GPU layer allocation
    messages_to_prompt=messages_to_prompt, # Function to convert structured messages into a prompt
    completion_to_prompt=completion_to_prompt, # Function to format the model's output as a prompt
    verbose=True, # Enables verbose logging for debugging and monitoring
  )


In [7]:
# Set the global tokenizer for the LlamaIndex framework.
# This ensures that all text tokenization is handled using the tokenizer
# associated with the specified LLaMA-2 model from HuggingFace.
set_global_tokenizer(
    AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf").encode
     # Load the pre-trained tokenizer for "NousResearch/Llama-2-7b-chat-hf"
    # and use its `encode` method to tokenize text into numerical IDs.
)

In [116]:
# Initialize an embedding model for converting text into vector representations.
# "HuggingFaceEmbedding" is used to create embeddings compatible with LlamaIndex.
# The model "BAAI/bge-small-en-v1.5" is a pre-trained small English embedding model
# available from HuggingFace, designed to generate dense vector representations of text.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [8]:
# create a CPP model instance
llm = select_llm()

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from C:\Users\asus\llama-2-7b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32    

In [75]:
# path file for the document to be looked up for RAG
file_path = r"C:\Downloads\Rules and Conditions _ McDonald's Monopoly 2024 IE.pdf"
print(f"Loading file from: {file_path}")

Loading file from: C:\Downloads\Rules and Conditions _ McDonald's Monopoly 2024 IE.pdf


In [76]:
# load documents
documents = SimpleDirectoryReader(input_files=
    [r"C:\Users\asus\Downloads\Rules and Conditions _ McDonald's Monopoly 2024 IE.pdf"]
).load_data()

In [77]:
# create vector store index
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

In [78]:
# set up query engine
query_engine = index.as_query_engine(llm=llm)

In [9]:
#get query and response sample dataset from the policy document for evaluation
dataset_path = r"C:\Users\asus\Desktop\mcdonalds_monopoly_dataset.csv"
data = pd.read_csv(dataset_path)

queries = data['Query'].tolist()
expected_responses = data['Response'].tolist()

In [109]:
data.head()

Unnamed: 0,Query,Response
0,Who is eligible to participate in the promotion?,Entrants must be aged 18 or over and residents...
1,What are the dates of the promotion?,"The promotion runs from September 4, 2024, to ..."
2,How can I enter the promotion?,"To enter, purchase a qualifying menu item to r..."
3,What are the categories of prizes?,"Prizes include Instant Win, Collect to Win, an..."
4,What is the deadline for submitting prize claims?,All prize claims must be submitted by October ...


In [10]:
# Load a pre-trained SentenceTransformer model for computing semantic similarity
# "all-MiniLM-L6-v2" is a small, efficient model that generates embeddings for text
# suitable for tasks like sentence-level similarity comparison.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def evaluate_responses(model, queries, expected_responses):
    """
    Evaluate the model's responses to queries by calculating semantic similarity with expected responses.
    
    Args:
        model: The model used for generating responses (e.g., a query engine).
        queries: A list of input queries/questions.
        expected_responses: A list of expected responses to those queries.
        
    Returns:
        None (prints average semantic similarity).
    """
    precision_scores = [] # List to store precision scores 
    semantic_similarities = [] # List to store semantic similarity values between model responses and expected responses
    # Iterate over each query and its expected response
    for query, expected in zip(queries, expected_responses):
        # Get model's response
        response = query_engine.query(query)  
        # Compute semantic similarity
        #Encode both the model's response and the expected response into embeddings
        embeddings = embedder.encode([response, expected])
        #Calculate the cosine similarity between the embeddings of the response and the expected response
        similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
        semantic_similarities.append(similarity)

        

    # Aggregate metrics
    avg_similarity = sum(semantic_similarities) / len(semantic_similarities)
    print(f"Average Semantic Similarity: {avg_similarity:.2f}")

In [53]:
#Evaluate the semantic similarity of the model 
evaluate_responses(llm,queries,expected_responses)

Llama.generate: 76 prefix-match hit, remaining 1327 prompt tokens to eval

llama_print_timings:        load time =   15842.58 ms
llama_print_timings:      sample time =       3.71 ms /    91 runs   (    0.04 ms per token, 24515.09 tokens per second)
llama_print_timings: prompt eval time =   42600.02 ms /  1327 tokens (   32.10 ms per token,    31.15 tokens per second)
llama_print_timings:        eval time =    9813.79 ms /    90 runs   (  109.04 ms per token,     9.17 tokens per second)
llama_print_timings:       total time =   52478.43 ms /  1417 tokens
Llama.generate: 76 prefix-match hit, remaining 1309 prompt tokens to eval

llama_print_timings:        load time =   15842.58 ms
llama_print_timings:      sample time =       4.09 ms /   101 runs   (    0.04 ms per token, 24664.22 tokens per second)
llama_print_timings: prompt eval time =   42583.59 ms /  1309 tokens (   32.53 ms per token,    30.74 tokens per second)
llama_print_timings:        eval time =   10527.01 ms /   100 runs  

Average Semantic Similarity: 0.68


A semantic score of 0.68 was achieved 

In [110]:
# Download the tokenizer model
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [80]:
# Create a list of all the response that would be generated by the model for the given sample query
generated_responses = []
for query in queries:
    response = query_engine.query(query)
    generated_responses.append(response.response)

Llama.generate: 76 prefix-match hit, remaining 1327 prompt tokens to eval

llama_print_timings:        load time =   15990.94 ms
llama_print_timings:      sample time =       4.44 ms /   105 runs   (    0.04 ms per token, 23627.36 tokens per second)
llama_print_timings: prompt eval time =   43208.98 ms /  1327 tokens (   32.56 ms per token,    30.71 tokens per second)
llama_print_timings:        eval time =   11357.63 ms /   104 runs   (  109.21 ms per token,     9.16 tokens per second)
llama_print_timings:       total time =   54640.39 ms /  1431 tokens
Llama.generate: 76 prefix-match hit, remaining 1309 prompt tokens to eval

llama_print_timings:        load time =   15990.94 ms
llama_print_timings:      sample time =       2.75 ms /    68 runs   (    0.04 ms per token, 24754.28 tokens per second)
llama_print_timings: prompt eval time =   42394.71 ms /  1309 tokens (   32.39 ms per token,    30.88 tokens per second)
llama_print_timings:        eval time =    7478.80 ms /    67 runs  

In [100]:
# Example Data
tokenized_reference_responses =  [word_tokenize(response.lower()) for response in expected_responses]
tokenized_generated_responses =  [word_tokenize(response.lower()) for response in generated_responses]

smooth_fn = SmoothingFunction().method1
# Calculate Sentence-Level BLEU
for ref, gen in zip(tokenized_reference_responses, tokenized_generated_responses):
    score = sentence_bleu(ref, gen, smoothing_function=smooth_fn)
    print(f"BLEU Score: {score:.4f}")

# Calculate Corpus-Level BLEU
corpus_score = corpus_bleu(tokenized_reference_responses, tokenized_generated_responses)
print(f"Corpus BLEU Score: {corpus_score:.4f}")

BLEU Score: 0.0022
BLEU Score: 0.0060
BLEU Score: 0.0016
BLEU Score: 0.0007
BLEU Score: 0.0023
BLEU Score: 0.0010
BLEU Score: 0.0025
BLEU Score: 0.0009
BLEU Score: 0.0056
BLEU Score: 0.0015
BLEU Score: 0.0020
BLEU Score: 0.0013
BLEU Score: 0.0013
Corpus BLEU Score: 0.0000


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [1]:
%%writefile app.py,
import streamlit as st 
from llama_index.core import (
  SimpleDirectoryReader,
  VectorStoreIndex,
  ServiceContext,
)

from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)
from langchain.schema import(SystemMessage, HumanMessage, AIMessage)

from llama_index.core import set_global_tokenizer
from transformers import AutoTokenizer

set_global_tokenizer(
    AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf").encode
)

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# load documents
documents = SimpleDirectoryReader(input_files=
    [r"C:\Users\asus\Downloads\Rules and Conditions _ McDonald's Monopoly 2024 IE.pdf"]
).load_data()

# create vector store index
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)


def init_page() -> None:
  st.set_page_config(
    page_title="Personal Chatbot"
  )
  st.header("Personal Chatbot")
  st.sidebar.title("Options")

def select_llm() -> LlamaCPP:
  return LlamaCPP(
    model_path="llama-2-7b-chat.Q2_K.gguf",
    temperature=0.1,
    max_new_tokens=500,
    context_window=3900,
    generate_kwargs={},
    model_kwargs={"n_gpu_layers":1},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
  )

def init_messages() -> None:
  clear_button = st.sidebar.button("Clear Conversation", key="clear")
  if clear_button or "messages" not in st.session_state:
    st.session_state.messages = [
      SystemMessage(
        content="you are a helpful AI assistant. Reply your answer in markdown format."
      )
    ]

def get_answer(llm, messages) -> str:
    query_engine = index.as_query_engine(llm=llm)
    response = query_engine.query(messages)
    return response

def main() -> None:
  init_page()
  llm = select_llm()
  init_messages()

  if user_input := st.chat_input("Input your question!"):
    st.session_state.messages.append(HumanMessage(content=user_input))
    with st.spinner("Bot is typing ..."):
      answer = get_answer(llm, user_input)
      print(answer)
    st.session_state.messages.append(AIMessage(content=str(answer)))
    

  messages = st.session_state.get("messages", [])
  for message in messages:
    if isinstance(message, AIMessage):
      with st.chat_message("assistant"):
        st.markdown(message.content)
    elif isinstance(message, HumanMessage):
      with st.chat_message("user"):
        st.markdown(message.content)

if __name__ == "__main__":
  main()

Overwriting app.py,
