In [1]:
#from langchain_community.document_loaders import WebBaseLoader
#from langchain_community.vectorstores import Chroma
# from langchain.text_splitter import CharacterTextSplitter
# from langchain_community.document_loaders import TextLoader
# from langchain_openai import OpenAIEmbeddings
# from langchain.chains import LLMChain
# from langchain.prompts import PromptTemplate
# from langchain_openai import OpenAI, ChatOpenAI
import pandas as pd

import streamlit as st
import os
import time
import re


import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from fnmatch import fnmatch
import openai

# Publish the OpenAI key from Streamlit's secrets store as an environment variable.
# OpenAIEmbeddings(model=...) further down is constructed without an explicit key,
# so it presumably relies on this variable -- TODO confirm.
os.environ["OPENAI_API_KEY"] = st.secrets["openai_api_key"]
# Configure the `openai` module itself with the same key.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [2]:


# Function to extract domain name from URL
def get_domain(url):
    """Return the ``scheme://netloc/`` root of ``url`` (always with a trailing slash).

    The path, query string and fragment of ``url`` are discarded. A missing
    scheme or host simply yields an empty component in the result.
    """
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}/"

def skip_page(href, patterns):
    """Tell whether ``href`` matches at least one shell-style glob in ``patterns``.

    Returns ``True`` on the first match and ``False`` when nothing matches
    (including when ``patterns`` is empty).
    """
    for glob in patterns:
        if fnmatch(href, glob):
            return True
    return False

# Shell-style globs (matched with fnmatch via skip_page) for links the crawler
# must not follow.
patterns = [
    # Static image assets. "*.jpg" and "*.gif" were excluded by the earlier
    # inline filter and are restored here so image links are not fetched and
    # parsed as HTML.
    "*.jpeg",
    "*.jpg",
    "*.png",
    "*.gif",
    #"*/nl/*",

    # English site sections that are excluded from the crawl
    "*/treatment/*",
    "*/specialty/*",
    "*/terms-and-conditions/*",
    "*/all-specialists*",
    "*/specialist/*",
    "*/privacy-policy/*",
    "*/cookie-policy*",
    "*/disclaimer*",
    "*/category/*",
    "*/location/*",

    # Any link carrying a fragment
    "*#*",

    # Dutch counterparts of the excluded sections
    "*/specialiteit*",
    "*/alle-specialisten*",
    "*/privacybeleid/*",
    "*/locatie*",
    "*/algemene-voorwaarden*",
    "*/behandeling*",
]

# Function to crawl a page and its subpages
def _same_site_links(url, timeout):
    """Fetch ``url`` and return its crawlable same-site links in document order.

    A link is kept when it resolves to the same scheme and host as ``url`` and
    matches none of the module-level ``patterns``. Fetch or parse failures are
    reported on stdout and yield an empty list, so one bad page never aborts
    the crawl. A non-200 response yields an empty list as well.
    """
    try:
        # Without a timeout a single stalled server would hang the crawl forever.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return []
        soup = BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        print("Error crawling page:", url)
        print(e)
        return []

    site = get_domain(url)
    links = []
    for anchor in soup.find_all('a'):
        raw_href = anchor.get('href')
        if not raw_href:
            # <a> without a target; resolving it would just yield the current page.
            continue
        try:
            # Resolve relative links and drop the trailing slash so that
            # ".../page" and ".../page/" count as the same page.
            href = urljoin(url, raw_href).rstrip('/')
            same_site = get_domain(href) == site
        except ValueError:
            # Malformed URL (urllib can raise on e.g. broken IPv6 literals).
            continue
        if not same_site:
            continue
        # Several globs end in "/*" (e.g. "*/privacy-policy/*"). The trailing
        # slash was stripped above, so also test the slash-terminated form;
        # otherwise those globs never match a section's leaf page.
        if skip_page(href, patterns) or skip_page(href + '/', patterns):
            continue
        links.append(href)
    return links


def crawl(url, visited_pages, out_file, timeout=10):
    """Crawl ``url`` and every same-site page reachable from it, depth first.

    Pages are visited in the same pre-order a recursive crawl would produce,
    but an explicit stack is used so that deep link chains cannot hit Python's
    recursion limit.

    Args:
        url: Page to start from. It is always crawled, even if it is already
            in ``visited_pages``.
        visited_pages: Set of URLs already crawled; updated in place.
        out_file: Writable text file; receives one ``N;<url>`` line per page.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        None. Results are the side effects on ``visited_pages`` and ``out_file``.
    """
    def visit(page):
        # Record the page first so links back to it are ignored from now on.
        visited_pages.add(page)
        print("Crawling:", page)
        out_file.write(f"N;{page}\n")
        return iter(_same_site_links(page, timeout))

    # Each stack entry is the iterator over one page's not-yet-followed links.
    pending = [visit(url)]
    while pending:
        candidate = next(pending[-1], None)
        if candidate is None:
            pending.pop()
        elif candidate not in visited_pages:
            pending.append(visit(candidate))



In [3]:
# Root page the (currently disabled) crawl below starts from.
start_url = 'https://myscarspecialist.com'

# URLs already crawled; crawl() adds every page it visits to this set.
visited_pages = set()

# Crawl driver, kept disabled. When enabled it writes a header line followed by
# the "N;<url>" rows emitted by crawl() into pages.csv.
# with open("pages.csv","w") as out_file:
#     out_file.write("include;url\n")
#     crawl(start_url, visited_pages, out_file)


In [4]:
# for i in visited_pages:
#     print('"'+i+'",')

In [5]:


# Function to fetch and scrape the textual content from a given URL
def fetch_text_from_url(url, timeout=10):
    """Download ``url`` and return its text content with page chrome removed.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            single unresponsive server would block a batch run indefinitely.

    Returns:
        The page text with runs of four or more newlines collapsed to one, or
        ``None`` when the response status is not 200 or any error occurs (the
        problem is reported on stdout in both cases).
    """
    try:
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print(f"Error: Unable to fetch content, status code {response.status_code}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        # First pass: drop the first header, the first footer and the two
        # cookie-consent containers.
        for selector in ('header', 'footer', '#cmplz-cookiebanner-container', '#cmplz-manage-consent'):
            element = soup.select_one(selector)
            if element:
                element.decompose()

        # Second pass: drop one more header/footer if the page has another.
        # NOTE(review): together with the first pass this removes at most two
        # of each; kept as-is because it is unclear whether further <header>
        # elements (e.g. inside articles) carry wanted content.
        for tag_name in ('header', 'footer'):
            element = soup.find(tag_name)
            if element:
                element.decompose()

        text = soup.get_text()

        # Collapse only long runs of blank lines (4+ newlines become one);
        # shorter runs are left untouched.
        return re.sub(r'\n{4,}', '\n', text)

    except Exception as e:
        # Best effort: callers skip pages for which None is returned.
        print(f"Error: {e}")
        return None

# Example usage
# url = "https://myscarspecialist.com/treatments/surgical-scar-revision"  # Replace with the URL you want to scrape
# scraped_text = fetch_text_from_url(url)

# if scraped_text:
#     print(scraped_text)


In [16]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from uuid import uuid4
from langchain_core.documents import Document

# Embedding client used both to index documents and to embed queries for this store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Chroma collection persisted on disk under ./chroma_langchain_db.
# NOTE(review): "example_collection" looks like a placeholder name -- confirm.
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)




In [17]:
# Read the page inventory sheet and pull out the two columns the ingestion
# loop iterates over: the page address and its language tag.
df_pages = pd.read_excel("scraped_webpages.xlsx", sheet_name="SCRAPED WEBPAGES")
urls, langs = (df_pages[column].values for column in ("url", "language"))

In [18]:
from uuid import NAMESPACE_URL, uuid5

# Scrape every page of the inventory and index it, one Document per page.
documents = []
uuids = []
seen_urls = set()

for page_url, language in zip(urls, langs):
    # A URL listed twice would be scraped twice and produce a duplicate id.
    if page_url in seen_urls:
        continue
    seen_urls.add(page_url)

    print(f"Processing: {page_url}")
    scraped_text = fetch_text_from_url(page_url)
    if not scraped_text:
        # Fetch failed or the page had no text; already reported by the scraper.
        continue

    documents.append(
        Document(
            page_content=scraped_text,
            metadata={"url": page_url, "language": language},
        )
    )
    # Derive the id from the URL instead of drawing a random uuid4, so
    # re-running this cell reuses the same ids rather than minting new ones.
    # With random ids every re-run added a second copy of each page to the
    # persistent store.
    # NOTE(review): relies on the store overwriting entries with an existing
    # id (upsert) -- confirm for the installed langchain_chroma version.
    uuids.append(str(uuid5(NAMESPACE_URL, str(page_url))))

if documents:
    vector_store.add_documents(documents=documents, ids=uuids)
else:
    print("No pages could be scraped; vector store left unchanged.")

Processing: https://myscarspecialist.com
Processing: https://myscarspecialist.com/dashboard
Processing: https://myscarspecialist.com/scar-characteristics
Processing: https://myscarspecialist.com/pain
Processing: https://myscarspecialist.com/thickness
Processing: https://myscarspecialist.com/itch
Processing: https://myscarspecialist.com/pliability
Processing: https://myscarspecialist.com/redness
Processing: https://myscarspecialist.com/pigmentation
Processing: https://myscarspecialist.com/texture
Processing: https://myscarspecialist.com/contractures
Processing: https://myscarspecialist.com/scar-gallery
Processing: https://myscarspecialist.com/hypertrophic-scar
Processing: https://myscarspecialist.com/burn-scar
Processing: https://myscarspecialist.com/linear-scar
Processing: https://myscarspecialist.com/atrophic-scar
Processing: https://myscarspecialist.com/horizontal-keloid
Processing: https://myscarspecialist.com/vertical-keloid
Processing: https://myscarspecialist.com/small-keloid
Pro

OperationalError: attempt to write a readonly database

In [4]:
# Smoke-test retrieval: ask the store for the two best matches to a sample question.
# A metadata filter can be supplied as well, e.g.
#   filter={"url": "https://myscarspecialist.com/treatments/micro-needling"}
sample_query = "Give me an example of a treatment for burn scars"
results = vector_store.similarity_search(sample_query, k=2)
for hit in results:
    print(f"* {hit.page_content} [{hit.metadata}]")

* 
Surgical Scar Revision - My Scar Specialist
 
 
Surgical Scar Revision
 by My Scar Specialist | Aug 16, 2022 | XIST - atrophic, XIST - burn, XIST - contract, XIST - HK, XIST - hypertrophy, XIST - plia, XIST - SK, XIST - texture, XIST - VK
Surgical Scar Revision
← Pulsed Dye Laser (PDL)Surgical Wound Closure Techniques →
Scar Treatments >

Why Surgical Scar Revision?

Annually, 200 million incisions (surgical cuts) are performed worldwide. The treatment of aesthetically unacceptable scars can be challenging, although several options are available. These options vary from non-invasive options to invasive scar revision. As there are lots of options along this spectrum it is important that you seek advice for your specific concerns from a surgeon specialised in scar.
Patients often see a plastic surgeon once they have tried lots of other non-invasive options. However, sometimes there are non-invasive options that could be more appropriate than surgery; again, an assessment by your scar 

In [None]:
# Pages of howtotreatscars.com; the commented-out example call after
# create_vs_from_website passes this list as website_array.
# NOTE(review): many entries differ only by a trailing slash or a "#..."
# fragment (e.g. "/partners", "/partners/", "/partners/#bap-medical",
# "/partners#bap-medical"), so they look like the same page -- confirm
# whether those variants should be deduplicated before indexing.
website_list = [
    "https://howtotreatscars.com/scar-gallery",
    "https://howtotreatscars.com/itch/#aboutsection",
    "https://howtotreatscars.com/atrophic-scar/",
    "https://howtotreatscars.com",
    "https://howtotreatscars.com/project/intense-pulsed-light-ipl/",
    "https://howtotreatscars.com/home-treatments/",
    "https://howtotreatscars.com/project/vacuum-massage/",
    "https://howtotreatscars.com/partners/#bap-medical",
    "https://howtotreatscars.com/small-keloid/#aboutsection",
    "https://howtotreatscars.com/partners",
    "https://howtotreatscars.com/pain/#aboutsection",
    "https://howtotreatscars.com/project/topical-ointments-and-creams/",
    "https://howtotreatscars.com/",
    "https://howtotreatscars.com/project/scar-massage/",
    "https://howtotreatscars.com/linear-scar/",
    "https://howtotreatscars.com/home/",
    "https://howtotreatscars.com/texture/",
    "https://howtotreatscars.com/partners/",
    "https://howtotreatscars.com/project/injectables/",
    "https://howtotreatscars.com/vertical-keloid/",
    "https://howtotreatscars.com/scar-characteristics",
    "https://howtotreatscars.com/pigmentation/",
    "https://howtotreatscars.com/scar-treatments/",
    "https://howtotreatscars.com/pliability/",
    "https://howtotreatscars.com/project/pulsed-dye-laser-pdl/",
    "https://howtotreatscars.com/partners#bap-medical",
    "https://howtotreatscars.com/texture/#aboutsection",
    "https://howtotreatscars.com/scar-characteristics/",
    "https://howtotreatscars.com/project/exercise-and-splinting/",
    "https://howtotreatscars.com/redness/#aboutsection",
    "https://howtotreatscars.com/project/non-ablative-fractional-laser-nafl/",
    "https://howtotreatscars.com/pigmentation/#aboutsection",
    "https://howtotreatscars.com/scar-gallery/",
    "https://howtotreatscars.com/project/moisturizers/",
    "https://howtotreatscars.com/contact/",
    "https://howtotreatscars.com/burn-scar/",
    "https://howtotreatscars.com/redness/",
    "https://howtotreatscars.com/project/pressure-therapy/",
    "https://howtotreatscars.com/hypertrophic-scar/",
    "https://howtotreatscars.com/project/corticosteroid-therapy/",
    "https://howtotreatscars.com/project/ablative-fractional-laser-afl/",
    "https://howtotreatscars.com/surgery/",
    "https://howtotreatscars.com/linear-scar/#aboutsection",
    "https://howtotreatscars.com/vertical-keloid/#aboutsection",
    "https://howtotreatscars.com/hypertrophic-scar/#aboutsection",
    "https://howtotreatscars.com/project/surgical-scar-revision/",
    "https://howtotreatscars.com/atrophic-scar/#aboutsection",
    "https://howtotreatscars.com/horizontal-keloid/",
    "https://howtotreatscars.com/about/",
    "https://howtotreatscars.com/project/silicone-therapy/",
    "https://howtotreatscars.com/pliability/#aboutsection",
    "https://howtotreatscars.com/project/surgical-wound-closure-techniques/",
    "https://howtotreatscars.com/contractures/#aboutsection",
    "https://howtotreatscars.com/small-keloid/",
    "https://howtotreatscars.com/non-invasive-treatments/",
    "https://howtotreatscars.com/pain/",
    "https://howtotreatscars.com/contractures/",
    "https://howtotreatscars.com/minimally-invasive-treatments/",
    "https://howtotreatscars.com/cookie-policy-eu/",
    "https://howtotreatscars.com/itch/",
    "https://howtotreatscars.com/project/micro-needling/",
    "https://howtotreatscars.com/disclaimer/",
    "https://howtotreatscars.com/thickness/#aboutsection",
    "https://howtotreatscars.com/burn-scar/#aboutsection",
    "https://howtotreatscars.com/project/shockwave-therapy/",
    "https://howtotreatscars.com/project/skin-camouflage/",
    "https://howtotreatscars.com/horizontal-keloid/#aboutsection",
    "https://howtotreatscars.com/thickness/",
    "https://howtotreatscars.com/project/scar-taping/",
]

In [57]:

# website_list=[
#     "https://howtotreatscars.com/project/scar-massage/",
#     "https://howtotreatscars.com/pigmentation/"
# ]

def create_vs_from_website(
    website_array,
    chunk_size=2000,
    chunk_overlap=50
):
    """Load web pages, split them into chunks and index them in ./chroma_db.

    Args:
        website_array: Iterable of page URLs. Entries that differ only by a
            ``#fragment`` or a trailing slash are treated as the same page and
            loaded once; the fragment is never sent to the server, so fetching
            each variant would only index identical text several times.
        chunk_size: Maximum chunk size handed to ``CharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The Chroma vector store built from all chunks, persisted under
        ``./chroma_db``.
    """
    # Imported here because the notebook's top-level imports of these two names
    # are commented out, which left them undefined on a fresh kernel.
    from langchain_community.document_loaders import WebBaseLoader
    from langchain.text_splitter import CharacterTextSplitter

    embeddings_model = OpenAIEmbeddings(openai_api_key=st.secrets["openai_api_key"])
    # The splitter does not depend on the page, so build it once.
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    docs = []
    loaded_pages = set()
    for idx, raw_url in enumerate(website_array):
        page_url = raw_url.split('#', 1)[0]
        page_key = page_url.rstrip('/')
        if page_key in loaded_pages:
            continue
        loaded_pages.add(page_key)

        print(idx, page_url)
        docs += text_splitter.split_documents(WebBaseLoader(page_url).load())

    db = Chroma.from_documents(docs, embeddings_model, persist_directory="./chroma_db")

    print(len(docs))
    return db

# vs = create_vs_from_website(
#    website_array = website_list
# )



In [58]:
# Re-open the store persisted under ./chroma_db, the directory that
# create_vs_from_website writes to. The embedding client is constructed the
# same way as inside that function (no explicit model argument).
embeddings_model = OpenAIEmbeddings(openai_api_key=st.secrets["openai_api_key"])
vs = Chroma(persist_directory="./chroma_db", embedding_function=embeddings_model)


In [61]:
def get_context_from_db(
    query,
    vectorstore,
    n_retrieve=3
):
    """Retrieve context chunks for ``query`` and format them for the prompt.

    Args:
        query: Question to search the vector store with.
        vectorstore: Store exposing ``as_retriever``; queried with MMR search
            (``lambda_mult`` 0.6).
        n_retrieve: Number of chunks to retrieve.

    Returns:
        One string in which every chunk is wrapped in ``###`` delimiter lines
        and followed by the address it was retrieved from. Empty when nothing
        is retrieved.
    """
    retriever = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": n_retrieve, "lambda_mult": 0.6},
    )

    chunks = []
    # `invoke` replaces the deprecated `get_relevant_documents`.
    for doc in retriever.invoke(query):
        # Some documents in this notebook carry their origin under "source",
        # others under "url"; accept either instead of raising KeyError.
        origin = doc.metadata.get("source") or doc.metadata.get("url") or "unknown source"
        chunks.append(
            "###\n" + doc.page_content
            + "\n This info was retrieved from: " + str(origin)
            + "\n###\n"
        )

    return "".join(chunks)

# Sample retrieval against the ./chroma_db store, using the default n_retrieve.
context= get_context_from_db(
    query="What are Possible beneficial effects of massage",
    vectorstore=vs,
)
# Uncomment to inspect the retrieved context.
#print(context)

In [70]:
"""
If the question is completely unrelated to the treatment of scars, do NOT make up an answer but instead reply with:
'Sorry, this information can not be found on the website.'
"""

# These three names only appear as commented-out imports in the first cell, so
# the cell failed with NameError on a fresh kernel. Import them here.
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Prompt sent to the chat model; {question} and {context} are filled per call.
template = """
You are a friendly assistant that helps people who are browsing a website with information on scar treatments.
You are polite, provide extensive accurate answers, and point the user to the right location for more information.

You have to answer a question that you find below, but only using information in the context below.
Do not use any other information and make sure your answer is almost an exact copy of the relevant text in the context.
The provided context in split in different chunks of information delimited by triple '#', and at the end of each
piece of context you find a urls where the info is retrieved from. You are allowed to combine information from
different parts of the context into one consistent and complete answer

If the question is completely unrelated to the treatment of scars, do NOT make up an answer but instead reply with:
'Sorry, this information can not be found on the website.'

If you give an answer, end your answer by stating on which website this info can be found, which is given at the end of each piece of context.
Make sure to give the entire link, starting with 'https:'
You are also allowed to give multiple URLs.

Question: {question}
Context: {context}

"""
#model = 'gpt-3.5-turbo-1106'
model = 'gpt-4-1106-preview'

prompt = PromptTemplate.from_template(template)
llm = ChatOpenAI(
    model=model,
    temperature=0.1,
    openai_api_key=st.secrets["openai_api_key"],
)
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Alternative sample questions:
#question = "How many people suffer from burn injury every year"
#question = "Why would I choose for scar massage?"
#question = "What are Possible beneficial effects of massage on scar tissue"
#question = "which scar treatment options are there? What are their benefits and downsides?"
#question = "How does shocktherapy work, and are there any risks associated to it?"
question = "What can you tell me about burn scars?"

# Retrieve five chunks for the question and let the chain answer from them only.
context = get_context_from_db(
    query=question,
    vectorstore=vs,
    n_retrieve=5,
)

answer = llm_chain.invoke({"question": question, "context": context})
print(answer["text"])

Burn scars are usually inevitable when the dermis is damaged and without surgery the wound takes up to three weeks or longer to heal. Although split skin grafts reduce healing times and scarring, they still leave a scar. Even with excellent treatment a scar may be present once the wound has healed. Although scarring can be minimized or improved by various non-invasive, semi-invasive treatments and surgical procedures the scar can’t completely be eliminated. Burn scars are often characterized by hypertrophy, contraction and an uneven texture. Treatment options for burn scars include home treatments like pressure therapy, silicone, topicals, etc., non-invasive treatments performed by scar specialists, minimally invasive treatments which have become increasingly popular due to highly visible results and short recuperation times, and surgery. Reconstructive surgery is often seen as the last option after all other treatment options have failed, but with burn scars, it is often a necessary m

In [64]:
#answer