# Creating a voice assistant for knowledge base!

Using Whisper, OpenAI, Eleven Labs, and ActiveLoop.

In [1]:
from dotenv import load_dotenv
import os
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ELEVEN_API_KEY = os.getenv("ELEVEN_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

## Knowledge Base

In [2]:
# Importing the necessary libraries
import os
import requests
from bs4 import BeautifulSoup
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader

from pinecone import Pinecone
import pinecone
from pinecone import ServerlessSpec

import re
import warnings
warnings.filterwarnings("ignore")



In [None]:
# Initialize Pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment="us-east-1-aws"
)
# Initialize embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Create or get index
index_name = "voice-assistant"

if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=1536,  # OpenAI embeddings dimension
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
# Initialize vector store
vectorstore = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
    text_key="text"
)

### 1. Web scraping content ( Python library articles ) from Hugging Face Hub

In [4]:
# Function to get the documentation URLs

def get_documentation_urls():

    return[
        	'/docs/huggingface_hub/guides/overview',
		    '/docs/huggingface_hub/guides/download',
		    '/docs/huggingface_hub/guides/upload',
		    '/docs/huggingface_hub/guides/hf_file_system',
		    '/docs/huggingface_hub/guides/repository',
		    '/docs/huggingface_hub/guides/search',
    ]

In [5]:
# Function to construct the full URL

def construct_full_url(base_url, relative_url):
    return base_url + relative_url

In [6]:
def scrape_page_content(url):
    response = requests.get(url)    #get request to the url
    soup = BeautifulSoup(response.text, 'html.parser')    #use beautiful soup to parse the html content
    text = soup.body.text.strip()     # Extract the desired content from the page (in this case, the body text)
    text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\xff]', '', text)
    text = re.sub(r'\s+', ' ', text)    # Remove any whitespace characters
    return text.strip()

In [7]:
# Function to scrape all content from the given URLs and save to a file

def scrape_all_content(base_url,relative_urls,filename):

    content = []
    for i in relative_urls:
        full_url = construct_full_url(base_url,i)
        scraped_content = scrape_page_content(full_url)
        content.append(scraped_content.rstrip('\n'))

    # Save the content to a file
    with open(filename, 'w', encoding='utf-8') as file:
        for item in content:
            file.write("%s\n" % item)
    return content

In [8]:
# Define a function to load documents from a file

def load_docs(root_dir,filename):
    docs = []
    try:
        loader = TextLoader(os.path.join(root_dir,filename), encoding='utf-8')
        docs.extend(loader.load_and_split())
    except Exception as e:
        pass      #if an error occurs, pass it and continue
    return docs

def split_docs(docs):
    text_splitter = CharacterTextSplitter(chunk_size=1000,chunk_overlap = 0)
    return text_splitter.split_documents(docs)

### 2. Embedding and storing in Deep Lake


In [9]:
# define the main function

def main():
    base_url = 'https://huggingface.co'
    # Set the name of the file to which the scraped content will be saved
    filename='content.txt'
    # Set the root directory where the content file will be saved
    root_dir ='./'
    relative_urls = get_documentation_urls()

    content = scrape_all_content(base_url,relative_urls,filename)

    docs = load_docs(root_dir,filename)

    texts = split_docs(docs)

    db = DeepLake(dataset_path=dataset_path,embedding=embeddings)

    db.add_documents(texts)

    os.remove(filename)

    print("Content scraped, embedded, and stored in Deep Lake successfully!")

if __name__ == "__main__":
    main()

InvalidTokenException: Token is invalid. Make sure the full token string is included and try again.