In [29]:
import numpy as np
import pandas as pd

# Loading the Dataset


In [30]:
import markdown_to_json
import pprint

with open('./data/omniflow_documentation_complete.md', 'r') as file:
    data = file.read()
    
    
# print(data)

md_dict = markdown_to_json.dictify(data)

# pprint.pprint(md_dict)

# DFS into the dictionary 
# start from the first set of keys?
def dfs_dict(d, path):
    if path is None:
        path = []
        
    for key, value in d.items():
        # path.append({"key": key, "value": str(value)})
        
        if isinstance(value, dict):
            # Recursively apply DFS if the value is another dictionary
            dfs_dict(value, path)
        else:
            # Reached the end
            # cleanup the value and key
            
            path.append({"key": key, "value": str(value)})

all_md_keys_dict = []        
dfs_dict(md_dict, all_md_keys_dict)

pprint.pprint(all_md_keys_dict)

[{'key': '**Table of Contents**',
  'value': "['[**OmniFlow Documentation**](#omniflow-documentation)', "
           "['[**Table of Contents**](#table-of-contents)', '[**1. "
           "Introduction**](#1-introduction)', ['[**Overview**](#overview)', "
           "'[**Purpose**](#purpose)', '[**Key Features**](#key-features)', "
           "'[**System Requirements**](#system-requirements)'], '[**2. "
           "Installation Guide**](#2-installation-guide)', "
           "['[**Pre-installation Checklist**](#pre-installation-checklist)', "
           "'[**Installation Steps**](#installation-steps)', "
           "'[**Post-installation "
           "Configuration**](#post-installation-configuration)'], '[**3. User "
           "Guide**](#3-user-guide)', ['[**Getting "
           "Started**](#getting-started)', '[**Basic "
           "Operations**](#basic-operations)', '[**Advanced "
           "Features**](#advanced-features)', '[**Best "
           "Practices**](#best-practices)'], '[*

In [31]:
# Create a dataframe out of all of the doc
md_df = pd.DataFrame.from_dict(all_md_keys_dict)
md_df['id'] = md_df.index
md_df.head(5)

Unnamed: 0,key,value,id
0,**Table of Contents**,['[**OmniFlow Documentation**](#omniflow-docum...,0
1,**Overview**,"OmniFlow is a versatile, cross-platform applic...",1
2,**Purpose**,The primary purpose of OmniFlow is to provide ...,2
3,**Key Features**,"['**Task Management:** Organize, prioritize, a...",3
4,**System Requirements**,"['**Operating System:** Windows 10 or later, m...",4


# Cleanup the data

In [32]:
import re


# Function to clean up markdown syntax
def clean_markdown(line):
    # Remove headers (e.g., # Header, ## Header)
    line = re.sub(r'^#+\s', '', line)
    # Remove bold and italics (e.g., **bold**, *italic*)
    line = re.sub(r'\*\*(.*?)\*\*', r'\1', line)
    line = re.sub(r'\*(.*?)\*', r'\1', line)
    # Remove inline code (e.g., `code`)
    line = re.sub(r'`(.*?)`', r'\1', line)
    # Remove links (e.g., [text](url))
    line = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', line)
    # Remove images (e.g., ![alt text](url))
    line = re.sub(r'!\[(.*?)\]\((.*?)\)', r'\1', line)
    # Remove blockquotes (e.g., > Quote)
    line = re.sub(r'^>\s', '', line)
    # Remove unordered list items (e.g., - Item, * Item)
    line = re.sub(r'^[-*]\s', '', line)
    # Remove ordered list items (e.g., 1. Item)
    line = re.sub(r'^\d+\.\s', '', line)
    # Remove horizontal rules (e.g., ---)
    line = re.sub(r'^-{3,}$', '', line)
    # Remove other markdown artifacts as needed
    return line.strip()

# # Clean each line and store in the cleaned_lines list
# cleaned_lines = [clean_markdown(line) for line in lines]

# # The 'cleaned_lines' variable now contains the cleaned list of strings
# cleaned_lines


In [33]:
md_df['key'] = md_df['key'].apply(clean_markdown)
md_df['value'] = md_df['value'].apply(clean_markdown)

In [34]:
md_df.head(3)

Unnamed: 0,key,value,id
0,Table of Contents,"'[OmniFlow Documentation', '[Table of Contents...",0
1,Overview,"OmniFlow is a versatile, cross-platform applic...",1
2,Purpose,The primary purpose of OmniFlow is to provide ...,2


# ChromaDB

In [35]:
import chromadb
from chromadb.config import Settings

In [36]:
chroma_client = chromadb.PersistentClient(path="./db")

In [37]:
collection_name = "md_collection"

# delete the collection if it already exists
if len(chroma_client.list_collections()) > 0 and collection_name in [chroma_client.list_collections()[0].name]:
        chroma_client.delete_collection(name=collection_name)
collection = chroma_client.create_collection(name=collection_name)

In [38]:
# some constants
DOCUMENT="value"
TOPIC="key"

In [39]:
# populate the collection
collection.add(
    documents=md_df[DOCUMENT].tolist(),
    metadatas=[{TOPIC: topic} for topic in md_df[TOPIC].tolist()],
    ids=[f"id{x}" for x in range(len(md_df))],
)

In [40]:
len(md_df)

23

# Querying the database

In [41]:
import pprint
results = collection.query(query_texts=["how to install"], n_results=3 )

pprint.pprint(results)

{'data': None,
 'distances': [[1.0831094102464416, 1.1107773940186252, 1.2292424829248871]],
 'documents': [["'Verify that your system meets the [system requirements.', "
                "'Ensure you have administrative privileges for "
                "installation.', 'Disable any antivirus software temporarily "
                "to avoid conflicts during installation.']",
                "['Installation Errors: Ensure you have sufficient permissions "
                "and disable antivirus software during installation.', "
                "'Connection Problems: Check your internet connection and "
                "firewall settings.', 'Performance Issues: Close unnecessary "
                'applications and ensure your system meets the recommended '
                "specifications.']",
                "'Download: Go to the [OmniFlow website and download the "
                "appropriate installer for your operating system.', 'Run the "
                'Installer: Open the downloade

# Loading the Model and Creating the Prompt

In [42]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
#model_id = "databricks/dolly-v2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
lm_model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

In [43]:
pipe = pipeline(
    "text-generation",
    model=lm_model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    device_map="auto",
)

# Getting Keywords from a Question

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
def extract_keywords_tfidf(text, num_keywords=5):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]
    keywords = sorted(list(zip(feature_names, tfidf_scores)), key=lambda x: x[1], reverse=True)
    return [word for word, score in keywords[:num_keywords]]

In [46]:
text = "How much RAM is needed for Omniflow?"
keywords = extract_keywords_tfidf(text)
print(keywords)

['needed', 'omniflow', 'ram']


In [47]:
def getContextFromDB(keywords, results_to_use=1):
  ctxs = []
  for keyword in keywords:
    ctxs.append(collection.query(query_texts=[keyword], n_results=results_to_use))
  context = ""
  for ctx in ctxs:
    temp = " ".join([f"#{str(i)}" for i in ctx["documents"][0]])
    context = context + " " + temp
  return context

# Creating the Extended Prompt

In [48]:
def generate_extended_prompt(question, context_to_lookup=None, results_to_use=1):
  context = ""
  # lookup the vector database for semantically similar data
  if (context_to_lookup):
    # ctxs = collection.query(query_texts=[context_to_lookup], n_results=results_to_use)
    # # join all the results
    # context = " ".join([f"#{str(i)}" for i in ctxs["documents"][0]])
    context = getContextFromDB(context_to_lookup, results_to_use)
    
  # populate the prompt template
  prompt_template = f"""
  Relevant context: {context}
  Considering the above context, answer the following question.
  Question: {question}
  """
  return prompt_template

In [49]:
question = "How much RAM is needed for Omniflow?"
ctx_lookup = extract_keywords_tfidf(question)
ext_prompt = generate_extended_prompt(question, ctx_lookup, 1)
print(ext_prompt)


  Relevant context:  #['Description: Fetches a list of all tasks within a specified project.', 'Parameters:', ['project_id (required): The ID of the project.', 'status (optional): Filter tasks by status (e.g., completed, in-progress).'], 'Response: A JSON object containing the list of tasks.'] #OmniFlow is a versatile, cross-platform application designed to streamline workflows and enhance productivity in both individual and team environments. The software integrates with various tools and platforms, offering a unified interface for managing tasks, communications, and data analytics. #['Operating System: Windows 10 or later, macOS 10.14 or later, Linux (various distributions), iOS 13 or later, Android 8.0 or later.', 'Processor: Dual-core CPU, 2.0 GHz or faster.', 'Memory: 4 GB RAM (8 GB recommended).', 'Storage: 500 MB available space.', 'Internet Connection: Required for cloud-based features and updates.']
  Considering the above context, answer the following question.
  Question: H

In [50]:
lm_response = pipe(ext_prompt)
print(lm_response[0]["generated_text"])


  Relevant context:  #['Description: Fetches a list of all tasks within a specified project.', 'Parameters:', ['project_id (required): The ID of the project.', 'status (optional): Filter tasks by status (e.g., completed, in-progress).'], 'Response: A JSON object containing the list of tasks.'] #OmniFlow is a versatile, cross-platform application designed to streamline workflows and enhance productivity in both individual and team environments. The software integrates with various tools and platforms, offering a unified interface for managing tasks, communications, and data analytics. #['Operating System: Windows 10 or later, macOS 10.14 or later, Linux (various distributions), iOS 13 or later, Android 8.0 or later.', 'Processor: Dual-core CPU, 2.0 GHz or faster.', 'Memory: 4 GB RAM (8 GB recommended).', 'Storage: 500 MB available space.', 'Internet Connection: Required for cloud-based features and updates.']
  Considering the above context, answer the following question.
  Question: H

# Streamlining the Entire Flow

In [51]:
def getLLMResponse(question, results_to_use=1, enableRAG=True):
  ctx_lookup = ""
  if enableRAG:
    ctx_lookup = extract_keywords_tfidf(question)
  ext_prompt = generate_extended_prompt(question, ctx_lookup, results_to_use)
  lm_response = pipe(ext_prompt)
  return lm_response[0]["generated_text"]

In [52]:
ques = "How much RAM is needed for OmniFlow?"
res = getLLMResponse(ques)
print("----")
print(res)

----

  Relevant context:  #['Description: Fetches a list of all tasks within a specified project.', 'Parameters:', ['project_id (required): The ID of the project.', 'status (optional): Filter tasks by status (e.g., completed, in-progress).'], 'Response: A JSON object containing the list of tasks.'] #OmniFlow is a versatile, cross-platform application designed to streamline workflows and enhance productivity in both individual and team environments. The software integrates with various tools and platforms, offering a unified interface for managing tasks, communications, and data analytics. #['Operating System: Windows 10 or later, macOS 10.14 or later, Linux (various distributions), iOS 13 or later, Android 8.0 or later.', 'Processor: Dual-core CPU, 2.0 GHz or faster.', 'Memory: 4 GB RAM (8 GB recommended).', 'Storage: 500 MB available space.', 'Internet Connection: Required for cloud-based features and updates.']
  Considering the above context, answer the following question.
  Questi

# Evaluation

In [53]:
questions = [
  "How much RAM is needed for OmniFlow?",
  "What is OmniFlow used for?",
  "What is the email id for OmniFlow support?",
  "How to make a request to /tasks?",
  "What are some best practices for OmniFlow?",
]

## Running with RAG

In [54]:
responses_rag = []
for ques in questions:
  res = getLLMResponse(ques, 3)
  responses_rag.append(res)

In [55]:
responses_no_rag = []
for ques in questions:
  res = getLLMResponse(ques, 3, enableRAG=False)
  responses_no_rag.append(res)

In [56]:
for i in range(len(questions)):
  print("----RAG----")
  print(responses_rag[i])
  print("----NO-RAG----")
  print(responses_no_rag[i])
  print("==============")

----RAG----

  Relevant context:  #['Description: Fetches a list of all tasks within a specified project.', 'Parameters:', ['project_id (required): The ID of the project.', 'status (optional): Filter tasks by status (e.g., completed, in-progress).'], 'Response: A JSON object containing the list of tasks.'] #['400 Bad Request: The request was invalid or missing required parameters.', '401 Unauthorized: Authentication failed or API key is invalid.', '500 Internal Server Error: An error occurred on the server side.'] #['Description: Creates a new task within a specified project.', 'Parameters:', ['project_id (required): The ID of the project.', 'task_name (required): The name of the task.', 'assigned_to (optional): The user ID of the assignee.'], 'Response: A JSON object containing the details of the created task.'] #OmniFlow is a versatile, cross-platform application designed to streamline workflows and enhance productivity in both individual and team environments. The software integrate