# Creating a voice assistant for a knowledge base

Built with Whisper, OpenAI, ElevenLabs, and Activeloop.

In [1]:
from dotenv import load_dotenv
import os
# python-dotenv: load key/value pairs from a local .env file (if one exists)
# into the process environment so the lookups below can find them.
load_dotenv()

# API credentials read from the environment. os.getenv returns None when a
# variable is unset, so a missing key is not reported here.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ELEVEN_API_KEY = os.getenv("ELEVEN_API_KEY")
ACTIVELOOP_TOKEN = os.getenv("ACTIVELOOP_TOKEN")

## Knowledge Base

In [2]:
# Importing the necessary libraries
import os
import requests
from bs4 import BeautifulSoup
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
import re
import warnings
# Suppress every Python warning from here on (presumably to keep library
# deprecation notices out of the cell output). Remove while debugging.
warnings.filterwarnings("ignore")

In [3]:
# Activeloop organisation (account) ID that owns the dataset. This is NOT the
# API token, which is read separately from the ACTIVELOOP_TOKEN environment
# variable. Set ACTIVELOOP_ORG_ID in the environment / .env file to supply the
# real value; the placeholder default below is kept for backward compatibility.
my_activeloop_org_id = os.getenv("ACTIVELOOP_ORG_ID", "ACTIVELOOP_TOKEN")
dataset_name = "voice-assistant"
# Dataset location of the form hub://<org_id>/<dataset_name>.
dataset_path = f'hub://{my_activeloop_org_id}/{dataset_name}'

# Embedding model handle configured for OpenAI's "text-embedding-ada-002".
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

### Scraping content (Python library guides) from the Hugging Face Hub documentation

In [4]:
# Function to get the documentation URLs

def get_documentation_urls():
    """Return the site-relative paths of the Hugging Face Hub guide pages.

    Returns:
        list[str]: Six paths under ``/docs/huggingface_hub/guides/``, always
        in the same order. A new list is built on every call.
    """
    guide_paths = (
        '/docs/huggingface_hub/guides/overview',
        '/docs/huggingface_hub/guides/download',
        '/docs/huggingface_hub/guides/upload',
        '/docs/huggingface_hub/guides/hf_file_system',
        '/docs/huggingface_hub/guides/repository',
        '/docs/huggingface_hub/guides/search',
    )
    # Hand back a fresh list so callers can mutate it without side effects.
    return list(guide_paths)

In [5]:
# Function to construct the full URL

def construct_full_url(base_url, relative_url):
    """Join a base URL and a relative path with exactly one ``/`` between them.

    Plain concatenation produced ``https://host//docs`` when ``base_url``
    ended with a slash and ``https://hostdocs`` when ``relative_url`` lacked a
    leading one. Both cases are normalised here; inputs that were already
    well-formed give the same result as before.

    Args:
        base_url (str): Leading part of the URL, e.g. ``"https://huggingface.co"``.
        relative_url (str): Path to append, e.g. ``"/docs/huggingface_hub/guides/overview"``.

    Returns:
        str: The combined URL. If either argument is empty, or
        ``relative_url`` starts with ``?`` or ``#``, the two parts are simply
        concatenated.
    """
    if not base_url or not relative_url or relative_url[0] in '?#':
        # Nothing to separate (or a query/fragment suffix): keep concatenation.
        return base_url + relative_url

    # Drop at most ONE slash from each side so a base such as "https://" is
    # not mangled, then re-insert a single separator.
    if base_url.endswith('/'):
        base_url = base_url[:-1]
    if relative_url.startswith('/'):
        relative_url = relative_url[1:]
    return base_url + '/' + relative_url

In [6]:
def scrape_page_content(url, timeout=30):
    """Download a web page and return its body text as a single clean line.

    Args:
        url (str): Absolute URL of the page to fetch.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        str: Text of the page's ``<body>`` (or of the whole document when it
        has no ``<body>``), with control characters and code points
        U+007F-U+00FF removed and every run of whitespace collapsed to one
        space. Note that the removed range includes accented Latin-1 letters.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of storing an error page as content.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # Documents without a <body> leave soup.body as None; fall back to the
    # whole parse tree rather than raising AttributeError.
    root = soup.body if soup.body is not None else soup
    text = root.text.strip()

    # Drop control characters (tab, LF and CR are kept) and U+007F-U+00FF.
    text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\xff]', '', text)
    # Collapse each run of whitespace into a single space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [7]:
# Function to scrape all content from the given URLs and save to a file

def scrape_all_content(base_url,relative_urls,filename):
    """Scrape every listed page and save the results, one page per line.

    Args:
        base_url: Prefix handed to ``construct_full_url`` for each page.
        relative_urls: Iterable of paths to combine with ``base_url``.
        filename: Path of the UTF-8 text file to create or overwrite.

    Returns:
        list: Scraped text of each page, in the order of ``relative_urls``.
    """
    # Fetch all pages first; the output file is only opened afterwards.
    pages = [
        scrape_page_content(construct_full_url(base_url, path)).rstrip('\n')
        for path in relative_urls
    ]

    # Persist one newline-terminated line per page.
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.writelines("%s\n" % page for page in pages)

    return pages

In [None]:
# Define a function to load documents from a file

def load_docs(root_dir,filename):