# In [1]: (notebook cell marker left over from the export)
import csv
import json
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

# Load search terms from the provided text file, one term per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# default, and blank or whitespace-only lines are dropped so an empty string
# is never submitted as a search keyword.
with open('top_1000_words.txt', 'r', encoding='utf-8') as file:
    search_terms = [line.strip() for line in file if line.strip()]

# Build the Chrome options object used when a WebDriver is created; every
# entry below is handed to Chrome as a command-line argument.
chrome_options = Options()
for chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(chrome_flag)

# Load or initialize progress tracking file.
# A missing file means there is no earlier progress. A file that cannot be
# parsed, or whose top-level JSON value is not an object, is reported and then
# treated the same way, so a damaged progress file cannot block every later run.
progress_file = "progress.json"
progress_data = None
try:
    with open(progress_file, 'r') as f:
        loaded_progress = json.load(f)
except FileNotFoundError:
    pass  # nothing saved yet
except json.JSONDecodeError as e:
    print(f"Ignoring unreadable progress file '{progress_file}': {e}")
else:
    if isinstance(loaded_progress, dict):
        progress_data = loaded_progress
    else:
        print(f"Ignoring progress file '{progress_file}': expected a JSON object.")

if progress_data is None:
    progress_data = {"total_reviews_collected": 0}

# Initialize total reviews counter from progress file
total_reviews_collected = progress_data.get("total_reviews_collected", 0)

# Keep prompting until the user supplies a whole number from 1 to 20; that
# value is stored in max_workers.
max_workers = 0
while not 1 <= max_workers <= 20:
    try:
        max_workers = int(input("Enter the number of places to process in parallel (max 20): "))
    except ValueError:
        print("Invalid input. Please enter a valid number.")
        continue
    if not 1 <= max_workers <= 20:
        print("Please enter a number between 1 and 20.")

# Prompt for one name/URL pair per worker and make sure each place has an
# entry in progress_data (existing entries are left untouched).
places = []
for place_number in range(1, max_workers + 1):
    name = input(f"Enter the name of place {place_number}: ")
    url = input(f"Enter the URL for {name}: ")
    places.append({"name": name, "url": url})
    progress_data.setdefault(name, {"completed_keywords": []})

# Serializes writers: save_progress() is called from several worker threads.
_progress_lock = threading.Lock()


def save_progress():
    """Write the current progress to the JSON progress file.

    Copies the module-level ``total_reviews_collected`` counter into
    ``progress_data`` and stores the whole dictionary in ``progress_file``.

    The write is done under a lock and goes to a temporary file that is then
    moved over the real one with ``os.replace``. Concurrent callers therefore
    cannot interleave their output, and an interruption in the middle of a
    write leaves the previous progress file in place instead of a truncated
    one.
    """
    with _progress_lock:
        progress_data["total_reviews_collected"] = total_reviews_collected
        # Serialize before touching the disk so a serialization error cannot
        # leave a partial file behind.
        payload = json.dumps(progress_data)
        temp_path = f"{progress_file}.tmp"
        with open(temp_path, 'w') as f:
            f.write(payload)
        os.replace(temp_path, progress_file)

# Guards the shared review counter, which several worker threads increment.
_review_count_lock = threading.Lock()


def _scroll_reviews_to_end(driver, review_container):
    """Scroll the review pane down until its height stops changing.

    The pane is scrolled to its bottom once per second; scrolling stops after
    the reported scrollHeight has been unchanged on three consecutive checks.
    """
    last_scroll_height = driver.execute_script("return arguments[0].scrollHeight;", review_container)
    unchanged_checks = 0

    while True:
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", review_container)
        time.sleep(1)
        new_scroll_height = driver.execute_script("return arguments[0].scrollHeight;", review_container)

        if new_scroll_height == last_scroll_height:
            unchanged_checks += 1
            if unchanged_checks > 2:
                break
        else:
            unchanged_checks = 0
        last_scroll_height = new_scroll_height


def _read_review_fields(review):
    """Return the author, rating, text and date of one review element as a dict."""
    return {
        "Author": review.find_element(By.CLASS_NAME, 'd4r55').text,
        "Rating": review.find_element(By.CLASS_NAME, 'kvMYJc').get_attribute("aria-label"),
        "Review": review.find_element(By.CLASS_NAME, 'wiI7pd').text,
        "Date": review.find_element(By.CLASS_NAME, 'rsqaWe').text,
    }


def collect_reviews_for_keyword(driver, place_name, keyword, writer, keyword_bar):
    """Search a place's reviews for one keyword and append the hits to the CSV.

    Args:
        driver: WebDriver whose page already shows the review search input.
        place_name: Key of this place in ``progress_data``.
        keyword: Search term typed into the review search box.
        writer: ``csv.DictWriter`` that receives one row per review with the
            fields Author, Rating, Review, Date and Keyword.
        keyword_bar: Progress bar whose postfix shows the running review total.

    The keyword is appended to the place's ``completed_keywords`` only when
    the search, scrolling and extraction finish without an unexpected
    exception. Any such exception is printed, not raised, so the caller can
    continue with the next keyword.
    """
    global total_reviews_collected
    try:
        # Locate and enter the search keyword in the search bar
        search_bar = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//input[contains(@aria-labelledby, "placeholder.")]'))
        )
        search_bar.click()
        search_bar.clear()
        search_bar.send_keys(keyword)
        search_bar.send_keys(Keys.ENTER)
        time.sleep(3)

        # Locate the review container and scroll it until all results are loaded
        review_container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "m6QErb DxyBCb kA9KIf dS8AEf XiKgde ")]'))
        )
        _scroll_reviews_to_end(driver, review_container)

        reviews = driver.find_elements(By.CLASS_NAME, 'jftiEf')

        for index, review in enumerate(reviews):
            for attempt in range(1, 4):  # Up to 3 attempts per review
                try:
                    row = _read_review_fields(review)
                except StaleElementReferenceException:
                    if attempt == 3:
                        print(f"Skipping review for keyword '{keyword}' after multiple attempts due to stale element.")
                        break
                    print(f"Retrying review extraction for keyword '{keyword}' due to stale element...")
                    # A stale reference stays stale, so retrying it cannot
                    # succeed. Look the review up again by its position.
                    # NOTE(review): assumes the result order is unchanged
                    # after the page re-renders -- confirm.
                    refreshed = driver.find_elements(By.CLASS_NAME, 'jftiEf')
                    if index >= len(refreshed):
                        print(f"Skipping review for keyword '{keyword}': element is no longer on the page.")
                        break
                    review = refreshed[index]
                except NoSuchElementException as e:
                    print(f"Element not found during review extraction for keyword '{keyword}':", e)
                    break
                else:
                    row["Keyword"] = keyword
                    writer.writerow(row)

                    # Update total reviews count; the lock keeps concurrent
                    # increments from other worker threads from being lost.
                    with _review_count_lock:
                        total_reviews_collected += 1
                        current_total = total_reviews_collected
                    keyword_bar.set_postfix(total_reviews=current_total)  # Update review count on the same line
                    save_progress()  # Save progress after each review
                    break

        # Mark this keyword as completed for progress tracking
        progress_data[place_name]["completed_keywords"].append(keyword)
        save_progress()

    except Exception as e:
        print(f"Error processing keyword '{keyword}' for {place_name}:", e)

def _click_button_by_label(driver, label):
    """Wait up to 10 s for a button whose aria-label contains *label*, click it, then pause 3 s."""
    button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, f'//button[contains(@aria-label, "{label}")]'))
    )
    button.click()
    time.sleep(3)


def collect_reviews_for_place(place):
    """Collect reviews for one place across every pending search term.

    Args:
        place: Dict with the keys ``"name"`` and ``"url"``.

    Opens the place URL in a new Chrome WebDriver, clicks the "Reviews" and
    "Search reviews" buttons, then runs each search term that is not yet in
    the place's ``completed_keywords`` and appends the results to
    ``<name>_AllReviews.csv`` (spaces in the name replaced by underscores).

    If either button cannot be clicked the error is printed and the function
    returns without collecting anything. The browser is closed on every exit
    path, including when an exception propagates, so no Chrome process is
    left running.
    """
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(place["url"])
        time.sleep(5)

        try:
            _click_button_by_label(driver, "Reviews")
        except Exception as e:
            print(f"Error clicking the 'Reviews' tab button for {place['name']}:", e)
            return

        try:
            _click_button_by_label(driver, "Search reviews")
        except Exception as e:
            print(f"Error clicking 'Search reviews' button for {place['name']}:", e)
            return

        csv_file_path = f"{place['name'].replace(' ', '_')}_AllReviews.csv"
        fieldnames = ["Author", "Rating", "Review", "Date", "Keyword"]

        with open(csv_file_path, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            # Move to the end explicitly so tell() reports the real file size;
            # the header is written only when the file is still empty.
            file.seek(0, 2)
            if file.tell() == 0:
                writer.writeheader()

            # Progress bar for keywords
            keyword_bar = tqdm(search_terms, desc=f"Keywords for {place['name']}", unit="keyword", position=0)
            for keyword in keyword_bar:
                if keyword not in progress_data[place["name"]]["completed_keywords"]:
                    collect_reviews_for_keyword(driver, place["name"], keyword, writer, keyword_bar)
                    time.sleep(2)
    finally:
        driver.quit()

    print(f"Total reviews saved to {csv_file_path} for {place['name']}")

# Submit one collection task per place to a thread pool, then wait for each
# task as it finishes; result() re-raises any exception from the worker.
with ThreadPoolExecutor(max_workers=max_workers) as pool:
    pending = []
    for place in places:
        pending.append(pool.submit(collect_reviews_for_place, place))
    for finished in as_completed(pending):
        finished.result()

print("Review collection completed for all places.")


# Notebook output: KeyboardInterrupt (the recorded run was interrupted).