## Watch U Seek Review Scraper
* The issue with the initial scraping attempts was that the main post content (the review) was missing: the page's JavaScript had not loaded it by the time the page load finished
    * Used `wait_until="domcontentloaded"` followed by a short fixed wait so the review has time to render before it is scraped
* Initially tried to scrape with Selenium (unsuccessfully), then switched to Playwright and found success
    * Both libraries appear below because Selenium was able to collect the thread URLs successfully
    * Playwright is used to get the review from each of those threads
* Successful run = 4.18 hours

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from playwright.async_api import async_playwright
import csv
import time



#### Successfully grabbing review from a wus thread using Playwright

In [8]:
async def run(
    url="https://www.watchuseek.com/threads/a-review-of-a-new-oceaneva-deep-marine-explorer-vi.5635925/",
):
    """Load one thread page in headless Chromium and print its review text.

    The text is taken from the first element matching ``div.bbWrapper``;
    if no such element exists, ``"NO REVIEW FOUND"`` is printed instead.

    Args:
        url: Thread page to load. Defaults to the thread this
            proof-of-concept was originally tried against.
    """
    async with async_playwright() as p:
        # Launch flags, user agent and viewport were chosen by the author so
        # that the scrape works in headless mode. For debugging, launching
        # with headless=False and a default context is the simpler setup.
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
            ],
        )
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
                viewport={"width": 1280, "height": 800},
            )
            page = await context.new_page()

            await page.goto(url, timeout=60000, wait_until="domcontentloaded")
            # Fixed pause so the post body has time to appear in the DOM
            # after the initial document load.
            await page.wait_for_timeout(3000)

            # DOM access: read the visible text of the first post body.
            content = await page.evaluate("""
            () => {
                const el = document.querySelector("div.bbWrapper");
                return el ? el.innerText : "NO REVIEW FOUND";
            }
            """)

            print(content)
        finally:
            # Close the browser even when navigation or evaluation raises.
            await browser.close()

# Top-level ``await`` works here because this is a notebook cell; in a plain
# script the equivalent call would be ``asyncio.run(run())``.
await run()


A new watch from Oceaneva with impressive 6000m WR was launched last week. Check my impressions..



    
        
            https://www.cinciwatches.com/my-reviews-and-articles/2620466_diving-into-the-abyss-a-review-of-the-oceaneva-deep-marine-explorer-vi-6000m-diver-watch
        
    


#### Finding Review Threads

In [2]:
# Collect thread URLs from the review forum's index pages with Selenium.
options = Options()
# NOTE: "--headless=new" is intentionally left off; the author found this
# step did not work when running headless.
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)

all_thread_urls = []

try:
    # all reviews from first 100 pages
    for page_num in range(1, 101):
        url = f"https://www.watchuseek.com/forums/reviews.67/page-{page_num}?sorting=latest-activity"
        driver.get(url)
        time.sleep(3)  # fixed pause so the listing has time to render

        threads = driver.find_elements(By.CSS_SELECTOR, "h3.structItem-title > a")
        # Read each href once; get_attribute returns None when the attribute
        # is missing, which would otherwise break the substring test.
        hrefs = (t.get_attribute("href") for t in threads)
        thread_urls = [href for href in hrefs if href and "/threads/" in href]
        all_thread_urls.extend(thread_urls)
finally:
    # Always shut the browser down, even if a page load fails midway.
    driver.quit()

# Remove duplicates while keeping first-seen order, so repeated runs over the
# same listing give the URLs in a stable order.
all_thread_urls = list(dict.fromkeys(all_thread_urls))
print(f"\nThreads: {len(all_thread_urls)}")


Threads: 3501


#### Scraping Reviews and saving to csv

In [5]:
#function to grab review from thread
async def scrape_review(url, *, timeout_ms=60000, settle_ms=3000):
    """Return the text of the first ``div.bbWrapper`` element on a thread page.

    A fresh headless Chromium instance is started and closed on every call.

    Args:
        url: Thread page to load.
        timeout_ms: Navigation timeout passed to ``page.goto``, in
            milliseconds.
        settle_ms: Fixed pause after ``domcontentloaded`` before the DOM is
            read, in milliseconds, giving the post body time to appear.

    Returns:
        The element's ``innerText``, or ``None`` when no ``div.bbWrapper``
        element is present on the page.

    Raises:
        Whatever Playwright raises on launch, navigation or evaluation
        failures (for example a navigation timeout); the browser is closed
        before the exception propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
            ],
        )
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
                viewport={"width": 1280, "height": 800},
            )
            page = await context.new_page()

            await page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
            await page.wait_for_timeout(settle_ms)

            review_text = await page.evaluate("""
            () => {
                const el = document.querySelector("div.bbWrapper");
                return el ? el.innerText : null;
            }
            """)
            return review_text
        finally:
            # Close the browser on both the success and the failure path.
            await browser.close()

reviews = []

#going through collected urls to get
for url in all_thread_urls:
    try: #'try' so an error for one url does not cause stopping in middle
        review = await scrape_review(url)
        if review:
            reviews.append({'url': url, 'review': review})
        else:
            continue #if unsuccessful, skip and continue
    except Exception as e:
        print(f"Error scraping {url}: {e}")

#csv
with open("watchuseek_reviews.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["url", "review"])
    writer.writeheader()
    writer.writerows(reviews)
