In [1]:
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import OllamaLLM
from langchain_chroma import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
import os
from datetime import datetime

In [2]:
import bs4

In [3]:
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [4]:
URL = "https://www.think-cell.com/en/resources/kb/overview.fcgi"
BASE_URL = "https://www.think-cell.com/en/resources/"

In [5]:
# Create the embedder with a specific model
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [6]:
import re

def bs4_extractor(html: str) -> str:
    soup = bs4.SoupStrainer(name=("h1","h3", "h2", "p", "ol",))
    soup = bs4.BeautifulSoup(html, "html.parser", parse_only=soup)
    soup = re.sub(r"think-cell Suite has arrived. Discover your library and new tools.\n   \n  Resources  \n", "", soup.text).strip()
    soup = re.sub(r'\n+', ' ', soup).strip()
    soup = re.sub(r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', soup)
    soup = re.sub(r'Share Products Order Download Resources Career Company', '', soup)
    return soup


def custom_metadata_extractor(html: str, url: str):
    soup = bs4.BeautifulSoup(html, 'html.parser')
    image_urls = []

    for picture in soup.find_all('picture'):
        jpg_url = None

        # Check all <source> tags for jpg
        for source in picture.find_all('source'):
            src = source.get('srcset', '')
            if '.jpg' in src:
                jpg_url = src
                break  # take the first .jpg and stop

        # Fallback to <img> tag if no .jpg found in <source>
        if not jpg_url:
            img_tag = picture.find('img')
            if img_tag and '.jpg' in img_tag.get('src', ''):
                jpg_url = img_tag['src']

        if jpg_url:
            image_urls.append(jpg_url)

    return {'source': url, 'image_urls': image_urls}

def load_data(URL):
    loader = RecursiveUrlLoader(
        URL,
        max_depth=6,
        # use_async=False,
        extractor=bs4_extractor,
        metadata_extractor=custom_metadata_extractor,
        #exclude_dirs=["https://www.think-cell.com/en/resources/kb/overview.fcgi"],
        # timeout=10,
        check_response_status=True,
        # continue_on_failure=True,
        prevent_outside=True,
        base_url=BASE_URL
        # ...
    )
    docs = loader.load()
    return docs

# doc = load_data("https://www.think-cell.com/en/resources/kb/0235")

In [7]:
def split_docs(docs, embedder):

    # Split into chunks using the SemanticChunker with the embedder
    text_splitter = SemanticChunker(embeddings=embedder, breakpoint_threshold_type="standard_deviation")
    documents = text_splitter.split_documents(docs)

    return documents


def retrieve(saved_vector):
    
    retriever = saved_vector.as_retriever(
        search_type="similarity", 
        search_kwargs={"k": 1}
        )
    return retriever

In [8]:
vector_store_path = "/Users/abubakarmuktar/Documents/Think-Cell/VectorStoreDB"
index_name = "faiss_index"
full_index_path = os.path.join(vector_store_path, index_name)
start = ""
end = ""


def embed_docs(documents, embedder):

    # Ensure the directory exists
    os.makedirs(vector_store_path, exist_ok=True)

    # just query if it exists
    if os.path.exists(full_index_path):
        saved_vector = FAISS.load_local(full_index_path, 
                                        embeddings=embedder, 
                                        allow_dangerous_deserialization=True)

        return saved_vector
    else:
        embedded_vector = FAISS.from_documents(documents=documents, embedding=embedder)
        embedded_vector.save_local(full_index_path)
        
        return embedded_vector

In [9]:

# Define llm
llm = OllamaLLM(
    model="mistral:instruct",
  )

chunks = None
# just query if it exists
if not os.path.exists(full_index_path):
  docs = load_data(URL) #Load Dataset
  chunks = split_docs(docs, embedder=embedder) #Split Document

saved_vector = embed_docs(chunks, embedder=embedder) #Embed Document
retrieved = retrieve(saved_vector) # Retrieve simimlar docs


Unable to load from https://www.think-cell.com/en/resources/content-hub/&bull;%09https:/www.think-cell.com/en/resources/content-hub/role-of-data-visualization-in-business-decision-making. Received error Received HTTP status 404 of type ValueError


In [10]:
# Define the prompt template
prompt = """
1. You are a CUSTOMER SUPPORT ENGINEER helping Think-cell's customers answer questions related to their software product, use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know, you may have to contact CUSTOMER SUPPORT" but don't make up an answer on your own.
3. Keep the answer crisp, straight forward and limited to 3,4 sentences.
4. Add the LINKS for the particular sources for easy accessibility.

Context: {context}

Question: {{question}}

Helpful Answer:"""


QA_CHAIN_PROMPT = PromptTemplate.from_template(template=prompt)

# Create document prompt
document_prompt = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Context:\ncontent:{page_content}\nsource:{source}",
)

# Create the stuff documents chain
combine_docs_chain = create_stuff_documents_chain(
    llm,
    QA_CHAIN_PROMPT,
    document_prompt=document_prompt
)

# Create the retrieval chain
qa_chain = create_retrieval_chain(
    retriever=retrieved,
    combine_docs_chain=combine_docs_chain
)

# Updated response function
def respond(question):
    # Invoke the chain with the question
    result = qa_chain.invoke({"input": question})
    
    # Return the answer
    return result["answer"]


In [17]:
question = "But what about automating periodic reports, or using a web service to create charts based on your parameters, or exporting your business intelligence reports as PowerPoint slides?"
response = respond(question)
print(response)

{'input': 'But what about automating periodic reports, or using a web service to create charts based on your parameters, or exporting your business intelligence reports as PowerPoint slides?', 'context': [Document(id='4a487c4f-40a7-4f72-af23-50cb58c25ee4', metadata={'source': 'https://www.think-cell.com/en/resources/manual/introductionautomation', 'image_urls': ['\n            /img/containers/assets/en/resources/manual/images/slideview/automation_chart_template_no_name.png/4a5ed727fbe509a326f4ee15c9d8f95e/automation_chart_template_no_name.jpg\n          ', '\n            /img/containers/assets/en/resources/manual/images/slideview/automation_chart_template_with_name.png/ea3f9aaba09aedb7d77f0d48e6858df7/automation_chart_template_with_name.jpg\n          ', '\n            /img/containers/assets/en/resources/manual/images/toolbar/namedtext.png/69226f076701dd0524de75302756c587/namedtext.jpg\n          ', '\n            /img/containers/assets/en/resources/manual/images/slideview/text_field_w

In [18]:
response["answer"]

" To automate reports using Think-cell, first, create a PowerPoint presentation with a think-cell chart and a text box for the slide title as an automation template. Name the chart and text box uniquely using the 'Update Chart Name' option in their respective contextual toolbars (source: https://www.think-cell.com/en/resources/manual/introductionautomation#charting).\n\nNext, create programmatic logic to merge your Excel or JSON data with this template slide and supply it with the required data for use in a new presentation (source: https://www.think-cell.com/en/resources/manual/introductionautomation#externaldata). This way, you can generate presentations with various combinations of think-cell charts using your own data."

In [20]:
def bs4_extractor(html: str) -> str:
    soup = bs4.SoupStrainer(name=("h1","h3", "h2", "p", "ol", "ul",) )
    soup = bs4.BeautifulSoup(html, "html.parser", parse_only=soup)
    soup = re.sub(r"think-cell Suite has arrived. Discover your library and new tools.\n   \n  Resources  \n", "", soup.text).strip()
    soup = re.sub(r'\n+', ' ', soup).strip()
    soup = re.sub(r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', soup)
    soup = re.sub(r'Share Products Order Download Resources Career Company', '', soup)
    return soup


def custom_metadata_extractor(html: str, url: str):
    soup = bs4.BeautifulSoup(html, 'html.parser')
    image_urls = []

    for picture in soup.find_all('picture'):
        jpg_url = None

        # Check all <source> tags for jpg
        for source in picture.find_all('source'):
            src = source.get('srcset', '')
            if '.jpg' in src:
                jpg_url = src
                break  # take the first .jpg and stop

        # Fallback to <img> tag if no .jpg found in <source>
        if not jpg_url:
            img_tag = picture.find('img')
            if img_tag and '.jpg' in img_tag.get('src', ''):
                jpg_url = img_tag['src']

        if jpg_url:
            image_urls.append(jpg_url)

    return {'source': url, 'image_urls': image_urls}

def load_data(URL):
    loader = RecursiveUrlLoader(
        URL,
        max_depth=6,
        #use_async=False,
        extractor=bs4_extractor,
        metadata_extractor=custom_metadata_extractor,
        #exclude_dirs=["https://www.think-cell.com/en/resources/kb/overview.fcgi"],
        # timeout=10,
        check_response_status=True,
        # continue_on_failure=True,
        prevent_outside=True,
        base_url="https://www.think-cell.com/en/resources/"
        # ...
    )
    docs = loader.load()
    return docs


dd = load_data("https://www.think-cell.com/en/resources/kb/overview.fcgi")
dd

Unable to load from https://www.think-cell.com/en/resources/content-hub/&bull;%09https:/www.think-cell.com/en/resources/content-hub/role-of-data-visualization-in-business-decision-making. Received error Received HTTP status 404 of type ValueError


[Document(metadata={'source': 'https://www.think-cell.com/en/resources/kb/overview.fcgi', 'image_urls': []}, page_content='think-cell Suite has arrived. Discover your library and new tools.     Knowledge base                               Resources                                            Knowledge base                 Search our knowledge base Our knowledge base covers technical topics that are not part of the user manual.All articleshttps://www.think-cell.com/en/resources/kb/0003https://www.think-cell.com/en/resources/kb/0004https://www.think-cell.com/en/resources/kb/0005https://www.think-cell.com/en/resources/kb/0007https://www.think-cell.com/en/resources/kb/0010https://www.think-cell.com/en/resources/kb/0014https://www.think-cell.com/en/resources/kb/0015https://www.think-cell.com/en/resources/kb/0017https://www.think-cell.com/en/resources/kb/0018https://www.think-cell.com/en/resources/kb/0021https://www.think-cell.com/en/resources/kb/0022https://www.think-cell.com/en/resources/kb

In [22]:
type(retrieved)

langchain_core.vectorstores.base.VectorStoreRetriever

In [25]:
type(saved_vector)

langchain_community.vectorstores.faiss.FAISS