In [2144]:
import os
import re
from time import perf_counter, sleep
import traceback
import seaborn as sns
import pandas as pd
from gazpacho import Soup
from selenium.webdriver import Chrome
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

In [2145]:
timestamp = f"{pd.Timestamp('today'):%Y-%m-%d %I-%M %p}"

# create project directories
os.makedirs('data', exist_ok=True)
os.makedirs('screenshots', exist_ok=True)

# define CSV path
cwd = os.getcwd()
csv_path = ''.join((cwd, '/data/', timestamp, '.csv'))


In [2146]:
chromedriver = "/Users/renadalbishri/Downloads/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [2147]:
url="https://www.brilliantearth.com/loose-diamonds/search/"
driver = webdriver.Chrome(chromedriver)
driver.get(url)

In [2148]:
def take_screenshot():
    """Saves a screenshot of the current window in the 'screenshots' directory."""
    path = ''.join((
        './screenshots/', 'screenshot ', timestamp, '.png')
    )
    driver.save_screenshot(path)



In [2149]:
def make_soup():
    """Makes soup on raw html to enable parsing."""
    html = driver.page_source
    return Soup(html)

In [2150]:
def load_url(diamond_type: str):
    
    """Navigates to Brilliant Earth's diamonds search page."""
    base = 'https://www.brilliantearth.com/'
    natural_url = base + "/loose-diamonds/search/"
    lab_url = base + "lab-diamonds-search/"
    if diamond_type == 'natural':
        driver.get(natural_url)
    else:
        driver.get(lab_url)
    

In [2151]:
def click_filter():
    sleep(3)
    first=driver.find_element_by_xpath('/html/body/div[6]/div[4]/span')
    first.click()
    sleep(2)

In [2152]:
def close_marketing_box():
    
    """Closes the marketing box when first loading the page."""
    # wait a maximum of 60 seconds to close the box
    try:
        WebDriverWait(driver, 60).until(
            ec.presence_of_element_located((By.CLASS_NAME, 'sailthru-overlay-close'))
        ).click()
    except:
        pass
    

In [2153]:
def get_shapes():
    sleep(2)
    """Returns a list of available shapes."""
    soup = make_soup()
    a = soup.find('div', {'class': 'ir246-product-shape-wrap'})
    b = a.find('a')
    return [shape.text.lower() for shape in b]
    sleep(1)

In [2154]:
def select_shapes(ix: int):
    """Selects diamond shapes on the first pass."""
    sleep(25)
    if ix == 0:
        shapes = get_shapes()
        for shape in shapes:
            shape_element = '-'.join((shape, 'details'))
            driver.find_element_by_class_name(shape_element).click()
        
    else:
        pass
    sleep(10)

In [2155]:
def perform_actions(element: str, box_input: str):
    
    """Takes actions on input box elements."""
    # find element
    e = driver.find_element_by_id(element)
    print(e)
    actions = ActionChains(driver)
    actions.move_to_element(e)
    actions.click()
    actions.send_keys(Keys.BACKSPACE * 10)
    actions.send_keys(box_input + Keys.RETURN)
    actions.perform()
    sleep(1)

    # click header
    header = driver.find_element_by_tag_name('h1')
    header.click();


In [2156]:
def set_LW():
    sleep(2)
    search_box = driver.find_element_by_xpath("//input[@id='min_ratio']")
    search_box.clear()
    search_box.send_keys("1.00")
    # search_box.send_keys(Keys.RETURN) #enter button
    # perform_actions('min_ratio', '1')
    sleep(2)

In [2157]:
#  search_box = driver.find_element_by_xpath("//input[@id='max_price_display']")
#     search_box.clear()
# #     search_box.send_keys("10000000")

In [2158]:
def set_max_price():
    sleep(1)
    """Re-adjusts the max price box in the results table."""
    perform_actions('max_price_display', '100000000')
    sleep(2)

In [2159]:
def set_max_carat():
    sleep(10)
    """Re-adjusts the carat box in the results table."""
    perform_actions('max_carat_display', '11')
    sleep(5)

In [2160]:
def table_scroll():
    sleep(5)
    """Scrolls down the diamond data table.
    The table loads a maximum of 200 items per position.
    """
    base_script = "document.querySelector('#diamond_search_wrapper').scrollTop="
    positions = ['6766', '13566', '20366', '27166', '33966']
    prev_n_items = 0

    for p in positions:
        # make soup & find items
        soup = make_soup()
        items = soup.find('div', {'class': 'inner item'})

        # check if 'items' is a list
        if isinstance(items, list):
            n_items = len(items)
            diff = n_items - prev_n_items

            # if 200 items loaded, track 'n_items' & scroll down to load more
            if diff == 200:
                prev_n_items = n_items
                scroll_by = ''.join((base_script, p))
                driver.execute_script(scroll_by)
                sleep(8)
            else:
                # if there are fewer than 200 items, all items have been loaded
                break
        # if 'items' is not a list (a single item), break
        else:
            break

In [2161]:
def create_dataframe():
    sleep(2)
    """Returns pandas DataFrame from diamonds HTML page."""
    html = driver.page_source
    dfs = pd.read_html(html)

    # return the second table which contains target data
    return dfs[1]
    sleep(1)

In [2162]:
#.drop(columns=['0', 'compare', 'checkbox']))
#'compare', 'checkbox',

In [2163]:
def clean_table_df(df):
    sleep(3)
    """"Returns clean diamonds pandas DataFrame."""
    # rename columns
    df.columns = ['0','shape', 'price', 'carat', 'cut', 'color', 'clarity',
                  'report','LW','compare']

    # drop blank rows & useless columns
    df = (df.dropna(axis=0, how='all', thresh=3)
    .drop(columns=['compare']))

    # remove '$' and commas, and convert float to int
    df['price'] = df['price'].replace({'\\$': '', ',': ''}, regex=True)
    df['price'] = pd.to_numeric(df['price'], downcast='integer')

    return df
    sleep(1)

In [2164]:
def get_url_list():
    sleep(1)
    """Returns list of html containing url sub-directories."""
    soup = make_soup()

    # find html with diamond url page and return it
    return soup.find('a', {'class': 'td-n2'})
    sleep(1)

In [2165]:
def create_url_df():
    """Returns DataFrame with diamond id and individual diamond urls."""
    sleep(1)
    url_list = get_url_list()
    url_dict = {}
    base = 'https://www.brilliantearth.com/'

    # extract url sub-directory & id and add to dict
    for ix, i in enumerate(url_list[1:-1], start=1):
        href = i.attrs.get('href')
        d_id = re.findall("([0-9]+)", href)[0]

        # add diamond id and url to dict
        url_dict[ix] = {'id': d_id, 'url': base + href}

    # construct pandas DataFrame from url_dict and return it
    return pd.DataFrame.from_dict(url_dict, orient='index')
    sleep(2)

In [2166]:
def merge_dfs(left_df, right_df):
    sleep(3)
    """Merges 'df' and 'url_df' and returns merged DataFrame."""
    return pd.merge(left_df, right_df, left_index=True, right_index=True)
    sleep(1)



In [2167]:
def adjust_price(max_price: str):
    sleep(2)
    """Filters diamonds results based on price range."""
    perform_actions('min_price_display', max_price)
    sleep(10)


In [2168]:
def final_cleaning(df, diamond_type):
    sleep(2)
    """Returns DataFrame - removes duplicates, adds 'type' & 'date_fetched' columns."""
    clean_df = df.copy()
    clean_df = clean_df.drop_duplicates()
    clean_df['type'] = diamond_type
    clean_df['date_fetched'] = timestamp
    return clean_df
    sleep(1)

In [2169]:
def to_csv(df):
    """Writes a CSV file in the 'data' directory."""
    df = df.drop_duplicates()
    df.to_csv(csv_path, index=False)
    sleep(1)

In [2170]:
def get_max_price(df):
    sleep(2)
    """Returns string of the max 'price' in the DataFrame."""
    return str(df['price'].max())
    sleep(1)

In [2171]:
def get_last_id(df):
    """Returns the 'id' of the last row in the DataFrame."""
    return df['id'].iloc[-1]
    sleep(1)

In [2172]:
def main():
    """Run script."""
    print('Attempting to scrape diamonds data. This could take a while...')
    tic = perf_counter()
    diamond_type = ['natural', 'lab']
    final_df = pd.DataFrame()

    try:
        for ix, dt in enumerate(diamond_type):
            # first scrape attempt
            print("load dt")
            load_url(dt)
            print("closing marketing")
            close_marketing_box()
            print("select shape")
            select_shapes(ix)
            print("click filter")
            click_filter()
            print("max carat")
            set_max_carat()
            print("set lw")
            set_LW()
            print("max pricde")
            set_max_price()
            print("tbl scroll")
            table_scroll()

            raw_df = create_dataframe()
            table_df = clean_table_df(raw_df)
            url_df = create_url_df()
            df1 = merge_dfs(url_df, table_df)
            
            # get max price & id from the DataFrame to filter diamonds for next scrape
            prev_max_price = get_max_price(df1)
            prev_last_id = get_last_id(df1)

            # scrape remaining rows by iterating the price range
            while True:
                # scrape diamonds table
                print(prev_max_price)
                adjust_price(prev_max_price)
                table_scroll()

                # create and clean DataFrame, and append to 'df1' (created in first pass)
                raw_df = create_dataframe()
                table_df = clean_table_df(raw_df)
                url_df = create_url_df()
                merged_df = merge_dfs(url_df, table_df)
                df1 = df1.append(merged_df)

                # set current max price & id using the last row scraped
                current_max_price = get_max_price(df1)
                current_last_id = get_last_id(df1)

                # check if price and id of the last row have been scraped
                if current_max_price != prev_max_price and \
                        current_last_id != prev_last_id:
                    prev_max_price = current_max_price
                    prev_last_id = current_last_id

                # else there are no new diamond results, export DataFrame
                else:
                    clean_df = final_cleaning(df1, dt)
                    final_df = final_df.append(clean_df)
                    break
        else:
            to_csv(final_df)
            print(f"CSV path: {csv_path}")
    except:
        traceback.print_exc()
        take_screenshot()
    finally:
        
        toc = perf_counter()
        duration = (toc - tic) / 60
        print(f"Finished in {duration:0.1f} minutes")


if __name__ == '__main__':
    main()

Attempting to scrape diamonds data. This could take a while...
load dt
closing marketing
select shape
click filter
max carat
<selenium.webdriver.remote.webelement.WebElement (session="a3c6d4afa975858233ec72824d956037", element="f60404c2-aa75-4349-aa09-dbf571e40348")>
set lw
max pricde
<selenium.webdriver.remote.webelement.WebElement (session="a3c6d4afa975858233ec72824d956037", element="7431b319-a111-4da5-b276-e21cd938806b")>
tbl scroll
570
<selenium.webdriver.remote.webelement.WebElement (session="a3c6d4afa975858233ec72824d956037", element="c377cffc-fdc6-4737-ba95-75e49aa46848")>
600
<selenium.webdriver.remote.webelement.WebElement (session="a3c6d4afa975858233ec72824d956037", element="c377cffc-fdc6-4737-ba95-75e49aa46848")>
620
<selenium.webdriver.remote.webelement.WebElement (session="a3c6d4afa975858233ec72824d956037", element="c377cffc-fdc6-4737-ba95-75e49aa46848")>
640
<selenium.webdriver.remote.webelement.WebElement (session="a3c6d4afa975858233ec72824d956037", element="c377cffc-fdc

In [2173]:
#df=pd.read_csv('data/2021-10-21 07-40 PM.csv')
#df

In [2174]:
#df=pd.read_csv('data/2021-10-21 08-26 PM.csv')
#df

In [2175]:
#df=pd.read_csv('data/2021-10-21 07-40 PM.csv')
#df

In [2176]:
#df=pd.read_csv('data/2021-10-21 08-45 PM.csv')
#df

In [2177]:
df=pd.read_csv('data/2021-10-21 09-37 PM.csv')
df

Unnamed: 0,id,url,0,shape,price,carat,cut,color,clarity,report,LW,type,date_fetched
0,12316229,https://www.brilliantearth.com//loose-diamonds...,,Round,430,0.32,Super Ideal,I,SI2,GIA,1.00,natural,2021-10-21 09-37 PM
1,12316283,https://www.brilliantearth.com//loose-diamonds...,,Round,440,0.30,Very Good,F,SI2,GIA,1.00,natural,2021-10-21 09-37 PM
2,12335848,https://www.brilliantearth.com//loose-diamonds...,,Round,440,0.30,Very Good,E,SI2,GIA,1.00,natural,2021-10-21 09-37 PM
3,12320460,https://www.brilliantearth.com//loose-diamonds...,,Round,450,0.30,Very Good,I,SI2,GIA,1.00,natural,2021-10-21 09-37 PM
4,12219956,https://www.brilliantearth.com//loose-diamonds...,,Round,450,0.30,Very Good,I,SI2,GIA,1.00,natural,2021-10-21 09-37 PM
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88169,11944611,https://www.brilliantearth.com//lab-diamonds-s...,,Cushion,97000,10.25,Good,E,VS1,IGI,1.05,lab,2021-10-21 09-37 PM
88170,12319628,https://www.brilliantearth.com//lab-diamonds-s...,,Oval,102160,8.24,Super Ideal,D,VS1,IGI,1.39,lab,2021-10-21 09-37 PM
88171,12332529,https://www.brilliantearth.com//lab-diamonds-s...,,Asscher,115560,10.03,Ideal,E,VVS2,IGI,1.03,lab,2021-10-21 09-37 PM
88172,12319822,https://www.brilliantearth.com//lab-diamonds-s...,,Radiant,131260,6.11,Ideal,E,VVS2,GCAL,1.37,lab,2021-10-21 09-37 PM
