In [2]:
import pandas as pd

import streamlit as st
import os
import time
import re


import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from fnmatch import fnmatch
import openai

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams

from uuid import uuid4
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

os.environ["OPENAI_API_KEY"] = st.secrets["openai_api_key"]
openai.api_key  = os.environ['OPENAI_API_KEY']

# CRAWL WEBPAGES

In [3]:


# Function to extract domain name from URL
def get_domain(url):
    parsed_uri = urlparse(url)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    return domain

def skip_page(href,patterns):
    return any(fnmatch(href, pattern) for pattern in patterns)

patterns=[
    "*.jpeg",
    "*.png",
    #"*/nl/*",
    "*/treatment/*",
    "*/specialty/*",
    "*/terms-and-conditions/*",
    "*/all-specialists*",
    "*/specialist/*",
    "*/privacy-policy/*",
    "*/cookie-policy*",
    "*/disclaimer*",
    "*/category/*",
    "*/location/*",
    "*#*",
    
    "*/specialiteit*",
    "*/alle-specialisten*",
    "*/privacybeleid/*",
    "*/locatie*",
    "*/algemene-voorwaarden*",
    "*/behandeling*"
]

# Function to crawl a page and its subpages
def crawl(url, visited_pages, out_file):
    # Add the current page to visited pages
    visited_pages.add(url)
    print("Crawling:", url)
    out_file.write(f"N;{url}\n")
    
    try:
        # Send a GET request
        response = requests.get(url)
        # Check if request was successful
        if response.status_code == 200:
            # Parse the HTML content
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find all anchor tags
            for link in soup.find_all('a'):
                # Extract the href attribute
                href = link.get('href')
                # Join the URL with the base URL if it's a relative URL
                href = urljoin(url, href)
                href = href.rstrip('/') 
                # Check if the URL belongs to the same domain and hasn't been visited yet
                #'nl' not in href and 
                if (get_domain(href) != get_domain(url)) or (href in visited_pages):
                    continue
                if skip_page(href,patterns):
                    continue
                
                
#                 and \
#                                 not href.endswith('.jpeg') and not href.endswith('.jpg') \
#                                 and "pll_switcher" not in href and "cookie-policy" not in href \
#                                 and not href.endswith('.png') and not href.endswith('.gif')\
#                                 and "/treatment/" not in href \
#                                 and "/specialty/" not in href \
#                                 and "/terms-and-conditions/" not in href \
#                                 and "/all-specialists/" not in href \
#                                 and "/specialist/" not in href \
#                                 and "/privacy-policy/" not in href \
#                                 and "/category/" not in href \
#                                 and "#" not in href \
#                                 and "/location/" not in href :
                    # Recursively crawl the subpage
                crawl(href, visited_pages, out_file)
    except Exception as e:
        print("Error crawling page:", url)
        print(e)



In [4]:
# Starting URL
start_url = 'https://myscarspecialist.com'

# Set to keep track of visited pages
visited_pages = set()

# Start crawling
# with open("pages.csv","w") as out_file:
#     out_file.write("include;url\n")
#     crawl(start_url, visited_pages, out_file)


In [5]:
# for i in visited_pages:
#     print('"'+i+'",')

# SCRAPE RELEVANT PAGES AND PUT IN VECTOR DB

## scrape function

In [6]:


# Function to fetch and scrape the textual content from a given URL
def fetch_text_from_url(url):
    try:
        # Send a GET request to the URL
        response = requests.get(url)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove header and footer
            for tag in ['header', 'footer', '#cmplz-cookiebanner-container','#cmplz-manage-consent']:
                element = soup.select_one(tag)
                if element:
                    element.decompose()
            
            header = soup.find('header')
            if header:
                header.decompose()  # Removes the header tag and its contents
            
            footer = soup.find('footer')
            if footer:
                footer.decompose()  # Removes the footer tag and its contents
            
            # Extract all text from the parsed HTML
            text = soup.get_text()

            # Clean up the text by removing excessive whitespace and newlines
            #cleaned_text = text
            cleaned_text = re.sub(r'\n{4,}', '\n', text)
            #cleaned_text = ' '.join(text.split())

            return cleaned_text
        else:
            print(f"Error: Unable to fetch content, status code {response.status_code}")
            return None

    except Exception as e:
        print(f"Error: {e}")
        return None

# Example usage
# url = "https://myscarspecialist.com/treatments/surgical-scar-revision"  # Replace with the URL you want to scrape
# scraped_text = fetch_text_from_url(url)

# if scraped_text:
#     print(scraped_text)


## read urls from excel file

In [7]:
df_pages = pd.read_excel("scraped_webpages_reviewed.xlsx",sheet_name="SCRAPED_REVIEWED")
urls = df_pages["url"].values
langs = df_pages["language"].values

assert len(urls)==len(langs), "urls and langs not equally long"

## create or load vectorDB

In [8]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

create = False

client = QdrantClient(path="./myscarspecialist_qdrant_db")

if create:
    client.create_collection(
        collection_name="myscarspecialist",
        vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
    )

vector_store = QdrantVectorStore(
    client=client,
    collection_name="myscarspecialist",
    embedding=embeddings,
)

In [9]:

# embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# vector_store = Chroma(
#     collection_name="myscarspecialist",
#     embedding_function=embeddings,
#     persist_directory="./myscarspecialist_chroma_db",  # Where to save data locally, remove if not necessary
# )


## Fill vector store

In [10]:
documents = []
uuids = []


if create:
    for page_url,language in zip(urls,langs):
        print(f"Processing: {page_url}")
        scraped_text = fetch_text_from_url(page_url)
        if scraped_text:
            documents.append(
                Document(
                    page_content = scraped_text,
                    metadata = {"url": page_url,
                                 "language": language,
                                 },
                    # id=10,
                )
            )
            uuids.append(str(uuid4()))
    
    vector_store.add_documents(documents=documents, ids=uuids)

## retrieve chunks

In [14]:
query="Give me an example of a treatment for burn scars"
results = vector_store.similarity_search(
    query,
    k=2,
    # filter={"language": "nl",
    #        },
    filter=models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.language",
                match=models.MatchValue(value='nl'),
            ),
        ]
    ),
    
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* 
Litteken­behandelingen - My Scar Specialist
 
 
Litteken­behandelingen
Hieronder zie je verschillende littekenbehandelingen. Een of meer van deze behandelingen kunnen worden gebruikt om uw litteken te behandelen. U kunt op de afbeelding klikken van de behandeling waar u meer over wilt weten.

Vind een littekenspecialist
Zelfzorg
Vocht­inbrengende Crèmes
Druktherapie
Siliconen­therapie
Topicale Zalven en Crèmes
Huid­camouflage
Omgaan met littekens
Niet-invasieve behandelingen
Beweging en Spalken
Litteken­massage
Litteken­taping
Shockwave Therapie
Vacuüm massage
Minimaal invasieve behandelingen
Cortico­steroïde Therapie
Injecteerbare Producten
Micro-Needling
Ablatieve Fractionele Laser (AFL)
Niet-Ablatieve Fractionele Laser (NAFL)
Intense Pulsed Light (IPL)
Pulsed Dye Laser (PDL)
Cortico­steroïde Therapie
Injecteerbare Producten
Micro-Needling
Ablatieve Fractionele Laser (AFL)
Niet-Ablatieve Fractionele Laser (NAFL)
Intense Pulsed Light (IPL)
Pulsed Dye Laser (PDL)
Chirurgie
Chirurgis

# FULL RAG

In [30]:
template_text_system = """ You are a friendly assistant that helps people who are browsing a website with information on scar treatments.
You are polite, provide extensive accurate answers, and point the user to the right location for more information.
You always answer in the same language as the original question.

You have to answer a question that you find below, but only using information in the context below.
Do not use any other information and make sure your answer is almost an exact copy of the relevant text in the context.
The provided context is split in different chunks of information delimited by triple '#', and at the end of each
piece of context you find a urls where the info is retrieved from. You are allowed to combine information from
different parts of the context into one consistent and complete answer.

If the question is completely unrelated to the treatment of scars, do NOT make up an answer but instead reply with:
'Sorry, this information can not be found on the website.'. If however you can not find an exact answer in the context, 
but you find some related information, you can still give a reply acknowleding that it might not exactly answer their question,
but more info might be available on the website.

If you give an answer, end your answer by stating on which website this info can be found, which is given at the end of each piece of context.
Make sure to give the entire link, starting with 'https:'
You are also allowed to give multiple URLs.
Add the URL in the following form: "You can read more about <topic_the_question_was_about> on: https://..."
"""

template_text_user = """
Question: 
{question}

Context: 
{context}
"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", template_text_system,),
        ("human", template_text_user),
    ]
)
# prompt.invoke(
#     {
#         "question":query,
#         "context":context,
#     }
# )

In [31]:
#query="What are Possible beneficial effects of massage"
query="Can you give an overview of different characteristics of scars?"
n_chunks = 3
lang="en" #"nl"

chunks = vector_store.similarity_search(
    query,
    k=2,
    # filter={"language": "nl",
    #        },
    filter=models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.language",
                match=models.MatchValue(value='nl'),
            ),
        ]
    ),
    
)

context = ""
for _chunk in chunks:
    summary = "###\n"+ _chunk.page_content + "\n This info was retrieved from: " + _chunk.metadata["url"] + "\n###\n"
    context+=summary

llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.2,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

chain = prompt | llm

response = chain.invoke(
    {
        "question":query,
        "context":context,
    }
)
print(response.content)

Er zijn verschillende kenmerken van littekens die vaak voorkomen. Deze omvatten:

- **Pijn**: Littekens kunnen pijnlijk zijn.
- **Jeuk**: Jeuk is een veelvoorkomend symptoom bij littekens.
- **Dikte**: Littekens kunnen dikker worden, wat vaak "littekenhypertrofie" wordt genoemd. Dit gebeurt als er te veel collageen wordt geproduceerd tijdens het genezingsproces.
- **Plooibaarheid**: Littekens zijn meestal stijver dan gewone huid, wat het moeilijker maakt om te bewegen, vooral rondom gewrichten.
- **Roodheid**: Littekens kunnen rood zijn.
- **Pigmentatie**: Veranderingen in pigmentatie kunnen optreden bij littekens.
- **Textuur**: De textuur van littekens kan variëren.
- **Contracturen**: Dit zijn strakke littekens die de beweging kunnen beperken.

Je kunt meer lezen over littekenkenmerken op: https://myscarspecialist.com/nl/littekenkenmerken en https://myscarspecialist.com/nl/author/lisamsp.


In [32]:
response

AIMessage(content='Er zijn verschillende kenmerken van littekens die vaak voorkomen. Deze omvatten:\n\n- **Pijn**: Littekens kunnen pijnlijk zijn.\n- **Jeuk**: Jeuk is een veelvoorkomend symptoom bij littekens.\n- **Dikte**: Littekens kunnen dikker worden, wat vaak "littekenhypertrofie" wordt genoemd. Dit gebeurt als er te veel collageen wordt geproduceerd tijdens het genezingsproces.\n- **Plooibaarheid**: Littekens zijn meestal stijver dan gewone huid, wat het moeilijker maakt om te bewegen, vooral rondom gewrichten.\n- **Roodheid**: Littekens kunnen rood zijn.\n- **Pigmentatie**: Veranderingen in pigmentatie kunnen optreden bij littekens.\n- **Textuur**: De textuur van littekens kan variëren.\n- **Contracturen**: Dit zijn strakke littekens die de beweging kunnen beperken.\n\nJe kunt meer lezen over littekenkenmerken op: https://myscarspecialist.com/nl/littekenkenmerken en https://myscarspecialist.com/nl/author/lisamsp.', additional_kwargs={'refusal': None}, response_metadata={'token_