<a href="https://colab.research.google.com/github/prabhakaran-s-code/genai-python/blob/main/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Abstract
This notebook contains the code to build a contextual chatbot. A set of web pages containing contextual information is provided as input; the relevant text is extracted from those pages, embedded, and stored in a FAISS index. When the user asks a query, the query is embedded in the same way and a semantic search over the FAISS index retrieves the best-matching context. The query, together with that context, is passed to a generative AI model to generate the response. A couple of generative AI models, Databricks Dolly v2 and GPT4All Falcon, were tried. Some basic grounding logic is also implemented to ensure the generated output is relevant.

In [None]:
# Install the third-party packages used by this notebook (--quiet trims pip's log output).
# NOTE(review): bs4, pandas and requests are imported in a later cell but are not installed
# here - presumably they are preinstalled on the target runtime; confirm.
# NOTE(review): torch is installed twice (on its own and together with transformers/accelerate).
#!pip install gpt4all --quiet
!pip install sentence-transformers --quiet
!pip install nltk --quiet
!pip install torch --quiet
!pip install faiss-cpu --quiet
!pip install numpy --quiet
!pip install torch transformers accelerate --quiet

In [None]:
#!wget -P path/to/models https://gpt4all.io/models/gguf/gpt4all-falcon-newbpe-q4_0.gguf

In [None]:
import requests, re, nltk, torch, faiss, sqlite3, pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline,  AutoTokenizer

# Download the 'punkt' NLTK resource; presumably required by the nltk.sent_tokenize /
# nltk.word_tokenize calls made later in this notebook.
nltk.download('punkt')
# Alternative backend (GPT4All Falcon), kept disabled.
#from gpt4all import GPT4All
#model = GPT4All("/content/path/to/models/gpt4all-falcon-newbpe-q4_0.gguf")

# Load the model (replace with the specific Dolly model you choose)
# NOTE(review): trust_remote_code=True allows code shipped with the model repository to run
# locally - confirm the repository is trusted before changing the model name.
generator = pipeline(model="databricks/dolly-v2-7b", torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
# Load the tokenizer associated with the Dolly model
tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-7b")


In [None]:
# We use the bi-encoder to embed every extracted sentence and every user query,
# so that both live in the same vector space for semantic search.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     #Truncate long passages to 256 tokens
# NOTE(review): search() does not read top_k; it uses its own k of 5. Confirm which value is intended.
top_k = 32                          #Number of passages we want to retrieve with the bi-encoder

In [None]:
# Extract all the urls from sitemap.xml for a website

def extract_urls_from_sitemap(sitemap_url, timeout=30):
    """Return the URLs listed in the ``<loc>`` elements of a sitemap.

    The result is a fresh list on every call, so the function no longer
    depends on (or grows) a module-level ``urls`` list.

    Args:
        sitemap_url: Address of the sitemap XML document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of URL strings in document order, with surrounding
        whitespace removed. Empty if the sitemap has no ``<loc>`` elements.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(sitemap_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a sitemap.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'xml')

    # <loc> text is often wrapped in newlines/indentation inside the XML.
    return [link.text.strip() for link in soup.find_all('loc')]

In [None]:
# Function to extract content from URL
def extract_content(url):
    """Download a page and flatten its text into a single string.

    The result is the space-joined concatenation of, in this order:
    the ``content`` of selected ``<meta>`` tags (each prefixed with a space),
    the text of all ``h1``-``h6`` headings, and the text of all ``<p>`` tags.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        One string containing the collected text fragments.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Only meta tags with one of these names contribute to the text.
    wanted_meta_names = ['title', 'description', 'path', 'tags']
    meta = [
        ' ' + tag.get('content')
        for tag in soup.find_all('meta', {'name': wanted_meta_names})
        if tag.get('content') is not None
    ]

    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    headers = [node.get_text() for node in soup.find_all(heading_tags)]
    paragraphs = [node.get_text() for node in soup.find_all('p')]

    fragments = meta + headers + paragraphs
    return ' '.join(fragments)

In [None]:
def encode_content(urls):
    """Extract sentences from each URL, persist them, and index their embeddings.

    Every page is fetched with ``extract_content``, split into sentences,
    and cleaned. The sentences are written to the database with
    ``write_df_to_db`` and then embedded in the same row order and handed to
    ``create_faiss_index``, so row ``i`` of the table and vector ``i`` of the
    index describe the same sentence.

    Args:
        urls: Iterable of page URLs to process.

    Raises:
        ValueError: If no sentence could be extracted from any URL. Nothing
            is written to the database or the index in that case.
    """
    # Allow-list of characters kept in a sentence; anything else becomes a space.
    invalid_chars = re.compile(r'[^a-zA-Z0-9\.,!?/\-’™():% ]')

    # Collect plain records and build the DataFrame once; concatenating a
    # one-row frame per sentence is quadratic in the number of sentences.
    records = []
    for url in urls:
        page_content = extract_content(url)
        for sentence in nltk.sent_tokenize(page_content):
            records.append({'sentence': invalid_chars.sub(' ', sentence), 'url': url})

    df = pd.DataFrame(records, columns=['sentence', 'url'])
    if df.empty:
        raise ValueError('No sentences were extracted from the given URLs; nothing to index.')

    write_df_to_db(df)

    embeddings = bi_encoder.encode(df['sentence'].tolist(), show_progress_bar=True)
    create_faiss_index(embeddings)

In [None]:
def write_df_to_db(df):
    """Replace the ``sentences`` table in ``sentences_db.sqlite`` with ``df``.

    The DataFrame index is stored in a column named ``id`` next to the
    frame's own columns. Any existing ``sentences`` table is dropped and
    recreated by pandas, so no explicit CREATE TABLE is issued here (a
    pre-created schema would be discarded by the replace anyway).

    Args:
        df: DataFrame to store; its index becomes the ``id`` column.
    """
    # Connect to database (the file is created if it doesn't exist)
    conn = sqlite3.connect('sentences_db.sqlite')
    try:
        df.to_sql('sentences', conn, if_exists='replace', index_label='id')
    finally:
        # Release the file handle even when the write fails.
        conn.close()

In [None]:
def create_faiss_index(embeddings):
    """Build a flat inner-product FAISS index from ``embeddings`` and save it.

    Args:
        embeddings: 2-D array whose second dimension is the embedding size
            (read from ``embeddings.shape[1]``).

    The index is written to ``my_index.faiss`` in the working directory.
    """
    embedding_dim = embeddings.shape[1]
    # IndexFlatIP performs exact (non-approximate) inner-product search.
    index = faiss.IndexFlatIP(embedding_dim)
    index.add(embeddings)
    # Persist to disk so the index can be loaded again later.
    faiss.write_index(index, "my_index.faiss")


In [None]:
def search(query, k=5):
    """Retrieve the stored sentences that best match ``query``.

    The query is embedded with the bi-encoder, the ``k`` nearest vectors are
    looked up in the saved FAISS index, and the sentences with the matching
    ids are read from the SQLite ``sentences`` table.

    Args:
        query: The user's question as a string.
        k: Number of nearest neighbours to request from the index.

    Returns:
        The retrieved sentences joined by single spaces, in the order the
        index returned them, or ``None`` when no sentence was found.
    """
    print("Input question:", query)

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    query_embedding = bi_encoder.encode(query)

    # Load Faiss index (assuming it's already created and saved)
    index = faiss.read_index("my_index.faiss")
    _, indices = index.search(query_embedding.reshape(1, -1), k=k)

    conn = sqlite3.connect('sentences_db.sqlite')
    try:
        cursor = conn.cursor()
        retrieved_sentences = []
        for i in indices[0]:
            # Bind the id as a parameter instead of concatenating SQL text; the
            # NumPy integer is cast to a plain int so sqlite3 can bind it.
            cursor.execute("SELECT sentence FROM sentences WHERE id = ?", (int(i),))
            row = cursor.fetchone()
            if row:
                retrieved_sentences.append(row[0])
    finally:
        # Close the connection even if a lookup fails.
        conn.close()

    context = " ".join(retrieved_sentences)

    # An empty join means nothing was retrieved; signal that with None.
    return context or None

In [None]:
# Reset the sentence store before re-crawling.
# IF EXISTS keeps this cell runnable on a fresh runtime where the table has
# not been created yet (a plain DROP TABLE raises in that case).
conn = sqlite3.connect('sentences_db.sqlite')
try:
    cursor = conn.cursor()
    cursor.execute("DROP TABLE IF EXISTS sentences")
    conn.commit()
finally:
    # Always release the database file, even if the drop fails.
    conn.close()

In [None]:
# `urls` is created at module level before the crawl starts.
# NOTE(review): extract_urls_from_sitemap may append to this global rather than to a
# local list - confirm before removing it.
urls = []
# NOTE(review): `sentences` and `sentences_embedding` are not read by any cell shown in this notebook.
sentences = []
sentences_embedding = []
# Collect every URL from the sitemap, then store the extracted sentences and build the search index.
encode_content(extract_urls_from_sitemap('https://www.abc.com/sitemap.xml'))

In [None]:
# This code uses gpt4all model. Hence commented.
#query = '#Some user query#'
#context = search(query)

#system_template = 'System: You are a contextual chat bot, you will be presented a context from which the a question will be asked, give your valuable insights as well. If you cannot find the answer to the user\'s query in the context, politely respond the same to the user.\nContext: '+ context + '\n'
#prompt_template = 'Query: {0}. \n Response: '
#with model.chat_session(system_template, prompt_template):
#    response = model.generate(query, max_tokens=100, temp=0.0, top_k=30, top_p=0.0, min_p=0.0, repeat_penalty=10, repeat_last_n=64, n_batch=1, n_predict=None, streaming=False)
#    print(response)

In [None]:
from nltk.corpus import stopwords
# NLTK data used by extract_keywords below: presumably the tagger model backs nltk.pos_tag,
# and the 'stopwords' corpus backs stopwords.words('english').
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

def extract_keywords(text):
    """Return the set of lower-cased content words found in ``text``.

    A token counts as a keyword when its part-of-speech tag starts with
    ``N``, ``V`` or ``J`` and its lower-cased form is not an English stop word.

    Args:
        text: The string to analyse.

    Returns:
        A set of lower-cased keyword strings.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    stop_words = set(stopwords.words('english'))
    content_tag_prefixes = ('N', 'V', 'J')

    keywords = set()
    for word, tag in tagged_tokens:
        lowered = word.lower()
        if tag.startswith(content_tag_prefixes) and lowered not in stop_words:
            keywords.add(lowered)
    return keywords



In [None]:
# Grounding thresholds and the message shown when no trustworthy answer is available.
MAX_MISSING_KEYWORDS = 3   # more response keywords absent from the context than this triggers one regeneration
MIN_SIMILARITY = 0.75      # minimum context/response cosine similarity for an answer to be shown
FALLBACK_MESSAGE = 'Apologies! I cannot answer this query. Please visit https://www.abc.com and post your query in the Contact Us section. If the query is relevant, one of our associates will get back to you shortly.'

# Define your context and system prompt
query = "# user query #"
# search() returns None when nothing was retrieved; normalise to "" so the
# keyword and prompt code below never receives None.
context = search(query) or ""

system_prompt = "You are a contextual chat bot, you will be presented a context from which the a question will be asked. Please respond to the user precisely based on the context. Cite specific passages from the context to support your claims in the response. Add a reference URL in the response if an URL is provided in the context. If you cannot find the answer to the user\'s query in the context, politely respond the same to the user."
# Construct the prompt with context and system prompt
prompt = f"""{system_prompt}
Context: {context}
User: {query}
Assistant: """  # Simulate a conversation-like structure

# Generation options are forwarded to the pipeline as keyword arguments. The
# pipeline tokenizes the prompt itself, so no pre-tokenized tensors are passed.
# The first attempt decodes greedily: sampling with a temperature of 0 is
# rejected by transformers, and greedy decoding is the deterministic equivalent.
generate_kwargs = dict(
    max_new_tokens=100,
    do_sample=False,
    repetition_penalty=2.0
)
# A greedy retry would reproduce the same text, so the retry samples at a low temperature.
retry_kwargs = dict(generate_kwargs, do_sample=True, temperature=0.3, top_p=1.0)

if not context:
    # Nothing was retrieved, so there is nothing to ground an answer on.
    # `res` keeps the pipeline's output shape so later diagnostic cells still run.
    res = [{'generated_text': ''}]
    print(FALLBACK_MESSAGE)
else:
    res = generator(prompt, **generate_kwargs)

    context_keywords = extract_keywords(context)
    response_keywords = extract_keywords(res[0]['generated_text'])
    missing_keywords = response_keywords - context_keywords

    if not missing_keywords:
        print("All valid keywords in the response are present in the context. \n")
        print(res[0]['generated_text'])
    else:
        print("The following keywords are in the response but not in the context:", missing_keywords)

        # Regenerate the response once if too many of its keywords are unsupported by the context
        if len(missing_keywords) > MAX_MISSING_KEYWORDS:
            print('--Regenerating Response--')
            res = generator(prompt, **retry_kwargs)
            response_keywords = extract_keywords(res[0]['generated_text'])

        # Check the similarity score of the context and the (possibly regenerated) response
        context_embedding = bi_encoder.encode(context)
        response_embedding = bi_encoder.encode(res[0]['generated_text'])

        # pytorch_cos_sim yields a 1x1 tensor here; .item() turns it into a plain float.
        similarity_score = util.pytorch_cos_sim(context_embedding, response_embedding).item()
        print("similarity_score: ", similarity_score)

        if similarity_score > MIN_SIMILARITY:
            print(res[0]['generated_text'])
        else:
            print(FALLBACK_MESSAGE)

In [None]:
# Diagnostic: show the context string that search() produced for the query above.
print(context)

In [None]:
# Diagnostic: compare the keyword sets that the grounding check works with,
# first for the retrieved context, then for the generated response.
print(extract_keywords(context))
print(extract_keywords(res[0]['generated_text']))