In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import re
import pandas as pd
import time

In [2]:
def initialize_driver():
    """
    Builds a Chrome WebDriver configured to run without a visible window.

    The browser is started with the ``--headless``, ``--disable-gpu`` and
    ``--no-sandbox`` command-line switches, in that order.

    Returns:
    - WebDriver: The newly created Chrome WebDriver instance.
    """
    launch_flags = ("--headless", "--disable-gpu", "--no-sandbox")

    options = Options()
    for flag in launch_flags:
        options.add_argument(flag)

    # No explicit Chromedriver path is supplied here; only the options are
    # passed. To pin a specific binary, build Service('path/to/chromedriver')
    # and hand it to webdriver.Chrome as well.
    return webdriver.Chrome(options=options)

In [3]:



def parse_shopee_url(url):
    """
    Extracts shop_id and item_id from a Shopee product URL.

    The IDs are taken from the first ``i.<shop_id>.<item_id>`` token in the
    URL whose leading ``i`` is not the tail of a longer word, so text such as
    ``wifi.1.2`` inside the product slug is not mistaken for the ID token.

    Parameters:
    - url (str): Shopee product URL.

    Returns:
    - tuple: (shop_id, item_id) as digit strings if successful; otherwise
      (None, None). A message is printed when the IDs cannot be extracted.
    """
    # A non-string value (e.g. None) cannot contain the token; report it as an
    # invalid URL instead of letting re.search raise TypeError.
    if isinstance(url, str):
        # The lookbehind rejects an 'i' that directly follows a letter or
        # digit, i.e. one that is merely the last character of a slug word.
        match = re.search(r'(?<![A-Za-z0-9])i\.(\d+)\.(\d+)', url)
        if match:
            return match[1], match[2]

    print("Invalid URL format. Could not extract shop_id and item_id.")
    return None, None

def scroll_and_fetch_reviews(driver, shop_id, item_id, scroll_pause_time=2, max_scrolls=5,
                             review_selector='your_review_element_selector_here'):
    """
    Scrolls the currently loaded page to the bottom repeatedly and collects the
    text of every element matching ``review_selector``.

    After each scroll the page is queried again; an element that was already
    collected on an earlier pass is skipped, so each review element contributes
    its text exactly once. Two distinct elements with identical text are both
    kept.

    Parameters:
    - driver (WebDriver): Selenium WebDriver instance with the product page loaded.
    - shop_id (str): Shopee shop ID. Currently unused; kept for interface
      compatibility.
    - item_id (str): Shopee item ID. Currently unused; kept for interface
      compatibility.
    - scroll_pause_time (int): Seconds to sleep after each scroll.
    - max_scrolls (int): Maximum number of scroll attempts.
    - review_selector (str): CSS selector identifying one review element. The
      default is a placeholder that matches nothing on a real page and must be
      replaced with the selector of the page's review containers.

    Returns:
    - list: Review texts (str), in the order the elements were first seen.
      If fetching fails, the error is printed and the reviews gathered so far
      are returned.
    """
    reviews = []
    # Elements already collected. This relies on element objects being hashable
    # and comparing equal when they refer to the same DOM node, which Selenium's
    # WebElement provides.
    seen_elements = set()

    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        try:
            review_elements = driver.find_elements(By.CSS_SELECTOR, review_selector)
            for element in review_elements:
                if element in seen_elements:
                    continue
                # Read the text before marking the element as seen so a failed
                # read never records an element whose text was not stored.
                text = element.text
                seen_elements.add(element)
                reviews.append(text)
        except Exception as e:
            # Best effort: stop scrolling but keep what was already collected.
            print(f"Error fetching reviews: {e}")
            break
    return reviews

def reviews_to_dataframe(reviews):
    """
    Wraps the collected reviews in a single-column pandas DataFrame.

    Parameters:
    - reviews (list): Reviews to tabulate, one entry per row.

    Returns:
    - DataFrame: Frame with one column named 'Review' holding the entries of
      `reviews`.
    """
    column_names = ['Review']
    return pd.DataFrame(data=reviews, columns=column_names)

def main(url):
    """
    Main function to parse the URL, initialize the WebDriver, and fetch reviews.

    Parameters:
    - url (str): Shopee product URL.

    Returns:
    - DataFrame: DataFrame with the product reviews. If shop_id or item_id
      cannot be extracted from the URL, an empty DataFrame (no columns) is
      returned and no browser is started.
    """
    shop_id, item_id = parse_shopee_url(url)
    if not shop_id or not item_id:
        return pd.DataFrame()

    driver = initialize_driver()
    try:
        driver.get(url)
        time.sleep(3)  # Wait for page to load

        reviews = scroll_and_fetch_reviews(driver, shop_id, item_id)
        return reviews_to_dataframe(reviews)
    finally:
        # Always shut the browser down, even when loading or scraping raises;
        # otherwise the Chrome process is left running.
        driver.quit()




In [4]:
# Example usage: scrape one product page and preview the collected reviews.
url = 'https://shopee.co.id/YOU-Hy!-Amino-AC-Ttack-Anti-Acne-Facial-Wash-Jerawat-Kulit-Berminyak-i.72375863.14266733286'
df = main(url=url)
# Show at most the first five rows of the result.
print(df.head(5))

Empty DataFrame
Columns: [Review]
Index: []
