In [171]:
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
load_dotenv()

True

In [19]:
BASE_URL = 'https://saatva.com'
categories = ['mattresses', 'furniture', 'bedding']

### Step 1: Get a List of Subpages to Scrape

In [20]:
# Get a set of all unique product pages.
product_urls = set()

for category in categories:
    url = f"{BASE_URL}/{category}"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    product_links = soup.find_all("a", class_="detailProductTile__imageContainer")
    for product_link in product_links:
        product_urls.add(product_link["href"])

In [21]:
product_urls = list(product_urls)
print(len(product_urls))
product_urls

84


['/bedding/waffle-towels',
 '/furniture/halle-storage-platform-bed',
 '/mattresses/crib-mattress',
 '/mattresses/saatva-youth',
 '/bedding/weighted-blanket',
 '/bedding/silk-eye-mask',
 '/bedding/aero-quilt',
 '/furniture/halle-with-storage',
 '/bedding/heavyweight-comforter',
 '/bedding/herringbone-knit-blanket',
 '/furniture/lucerne',
 '/bedding/flannel-sheet-set',
 '/mattresses/loom-and-leaf',
 '/bedding/embroidered-hotel-style-sheets',
 '/furniture/siena-leather',
 '/furniture/como-swivel-chair',
 '/furniture/navi-bedroom-rug',
 '/mattresses/dog-bed',
 '/bedding/embroidered-sateen-duvet-set',
 '/furniture/santorini-platform-bed',
 '/bedding/down-alternative-pillow',
 '/furniture/brienne-channel-ottoman',
 '/bedding/organic-sateen-duvet-cover-set',
 '/bedding/waterproof-mattress-protector',
 '/furniture/amalfi',
 '/bedding/graphite-memory-foam-mattress-topper',
 '/bedding/organic-mattress-pad',
 '/furniture/adjustable-base',
 '/furniture/kanan-bedroom-rug',
 '/bedding/linen-sheet-se

### Step 2: Gather relevant data from each PDP

- Extract the name of each element
- Scrape the buttons (any kind of general structure on the page)
- Write all visible text on the screen (for all the details)

A more specialized scraping approach would likely result in higher accuracy to remove all the useless noise, and better formatting would help the selective search filter out relevant chunks.

In [None]:
"""
Returns a list of all the formToggle elements within a specific search space. Useful for getting button options on the page.
:param element: the outer element type that wraps a group of button options.
:param class: the css class to select the corresponding element.
:param search_area: the bs4 element containing all the html to search through.
"""
def get_form_options(element, class_, search_area):
    if not search_area:
        return []
    option_container = search_area.find(element, class_=class_)
    if option_container:
        option_groups = option_container.find_all('div', class_='formToggle')
        return option_groups
    return []

In [None]:
from bs4.element import Comment
'''Returns whether an HTML element should be considered visible or not.'''
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]', 'button']:
        return False
    if isinstance(element, Comment):
        return False
    return True

In [161]:
for subpage in product_urls:

    url = f"{BASE_URL}{subpage}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    page_name = subpage.split('/')[-1]

    # grab all the visible text on the page
    main = soup.find('main', class_='main')
    texts = main.findAll(string=True)
    visible_texts = filter(tag_visible, texts)  
    page_text = u" ".join(t.strip() for t in visible_texts).replace('Learn More', '')

    product_info = soup.find('div', class_='productPanel__content')

    # get sizes
    sizes = get_form_options('div', 'formRadioGroup productPanel__toggles--size', product_info)
    sizes_list = [size.text for size in sizes]

    # get heights
    heights = get_form_options('div', 'formRadioGroup productPanel__toggles--height', product_info)
    heights_list = [height.text for height in heights]
    
    # get options
    options = get_form_options('div', 'productPanel__option', product_info)
    options_list = [option.text.replace('\n', '') for option in options]

    # get colors
    colors_list = []
    color_search_area = product_info.find('div', class_='productSwatch')
    if color_search_area:
        labels = color_search_area.find_all('label', class_='formRadio__label')
        colors_list = [label['aria-label'] for label in labels]
    
    # get comfort levels
    comfort_list = []
    comfort_container = product_info('div', {'data-selector': 'buystackToggle--comfort'})
    if comfort_container:
        comfort_list = [cc.text for cc in comfort_container]

    # product title
    product = soup.find('h1', class_='productPanel__headingTitle').text

    print('-----')
    print(product)
    print(url)
    print('Sizes:', sizes_list)
    print('Heights:', heights_list)
    print('Options:', options_list)
    print('Colors: ', colors_list)
    print('Comfort: ', comfort_list)

    with open(f"./page_data/{page_name}.txt", 'w') as text_file:
        text_file.write(product)
        text_file.write('\n')
        text_file.write(url)
        text_file.write('\n')
        
        if sizes_list:
            text_file.write(f"{product} Sizes: {sizes_list}\n")
        if heights_list:
            text_file.write(f"{product} Heights: {heights_list}\n")
        if options_list:
            text_file.write(f"{product} Options: {options_list}\n")
        if colors_list:
            text_file.write(f"{product} Colors: {colors_list}\n")
        if comfort_list:
            text_file.write(f"{product} Comfort: {comfort_list}\n")
        
        text_file.write(f"{product} DETAILS:\n")
        text_file.write(page_text)

    print(f'wrote to ./page_data/{page_name}.txt')
    print('-----')
    

-----
Waffle Towel Collection
https://saatva.com/bedding/waffle-towels
Sizes: []
Heights: []
Options: ['3-piece Bath Towel Set', '4-pack Bath Towels', '2-pack Bath Sheets']
Colors:  ['White', 'Sand', 'Slate']
Comfort:  []
wrote to ./page_data/waffle-towels.txt
-----
-----
Halle Storage Platform
https://saatva.com/furniture/halle-storage-platform-bed
Sizes: []
Heights: []
Options: []
Colors:  ['Taupe Vintage Velvet', 'Graphite Vintage Velvet', 'Natural Linen']
Comfort:  []
wrote to ./page_data/halle-storage-platform-bed.txt
-----
-----
Crib Mattress
https://saatva.com/mattresses/crib-mattress
Sizes: []
Heights: []
Options: []
Colors:  []
Comfort:  []
wrote to ./page_data/crib-mattress.txt
-----
-----
Saatva Youth Mattress
https://saatva.com/mattresses/saatva-youth
Sizes: ['Twin', 'Twin XL', 'Full']
Heights: []
Options: []
Colors:  []
Comfort:  []
wrote to ./page_data/saatva-youth.txt
-----
-----
Weighted Blanket
https://saatva.com/bedding/weighted-blanket
Sizes: []
Heights: []
Options: 

### Integrate Data into Langchain

In [162]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader

#### Create Documents

In [163]:
loader = DirectoryLoader('./page_data/', glob='*.txt', show_progress=True)
docs = loader.load()

100%|██████████| 84/84 [00:03<00:00, 22.88it/s]


In [165]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 0,
    length_function = len,
)
split_docs = text_splitter.split_documents(docs)

#### Run Embedding

In [169]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings import SentenceTransformerEmbeddings
import pinecone
import os

#### Upload to Pinecone DB

In [173]:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# initialize pinecone
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV"),
)
index_name = "saatva-bot"
docsearch = Pinecone.from_documents(split_docs, embeddings, index_name=index_name)

In [178]:
query = 'What colors does the dog bed come in?'
docs = docsearch.similarity_search(query)
docs

[Document(page_content='you would like to exchange your merchandise, simply return your order and repurchase the item of your choice. Will you offer more colors in the future? Our current bedding color options are thoughtfully curated to fit a more neutral color palette because they complement almost any room style with timeless simplicity. In keeping with our eco-friendly ethos, we either leave the natural fabric undyed in its beautiful natural state or use nontoxic, eco-friendly dyes for our bed sheets.', metadata={'source': 'page_data/embroidered-hotel-style-sheets.txt'}),
 Document(page_content="Saatva Dog Bed https://saatva.com/mattresses/dog-bed Saatva Dog Bed Options: ['Small', 'Medium', 'Large'] Saatva Dog Bed Colors: ['Natural Linen', 'Taupe Boucle', 'Slate Boucle'] Saatva Dog Bed DETAILS: Home Mattresses Saatva Dog Bed Click to zoom New slide page 1 of 5 45-day free returns  Free shipping  1-year limited warranty  Made in the U.S.A.  Saatva Dog Bed Saatva comfort & luxury, de