# Influenster Web Scraping - Unstructured Data Analytics Final Project

## Get 100 URLs

In [1]:
!pip install selenium
!pip install webdriver_manager
!pip install requests
!pip install beautifulsoup4
!apt-get update
!apt install chromium-chromedriver
!pip install selenium webdriver_manager
!pip install google-colab-selenium



'apt-get' is not recognized as an internal or external command,
operable program or batch file.
'apt' is not recognized as an internal or external command,
operable program or batch file.


Collecting google-colab-selenium
  Downloading google_colab_selenium-1.0.14-py3-none-any.whl.metadata (2.7 kB)
Downloading google_colab_selenium-1.0.14-py3-none-any.whl (8.2 kB)
Installing collected packages: google-colab-selenium
Successfully installed google-colab-selenium-1.0.14


In [36]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time

from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--start-maximized')
chrome_options.add_experimental_option('detach', True)  # Keeps the browser open


try:
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )
    driver.implicitly_wait(10)
except Exception as e:
    print(f"Browser initialization error: {e}")


wait = WebDriverWait(driver, 30)

url = 'https://www.influenster.com/search?categories=Hair&categories=Hair+Products&categories=Shampoo&tab=product'
driver.get(url)
# Wait for page load
wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete")
print("Initial page load complete")
try:
    print("Waiting for cookie banner...")
    # First wait for the cookie banner container
    cookie_banner = wait.until(EC.presence_of_element_located((By.ID, "onetrust-banner-sdk")))
    print("Cookie banner found")
    
    # Then locate and click the Accept All Cookies button within the banner container
    accept_button = cookie_banner.find_element(By.ID, "onetrust-accept-btn-handler")
    print("Accept button found")
    
    # Add a small pause to ensure the button is interactable
    time.sleep(1)
    
    # Use Actions class to move to and click the button
    from selenium.webdriver.common.action_chains import ActionChains
    actions = ActionChains(driver)
    actions.move_to_element(accept_button).click().perform()
    print("Clicked Accept All Cookies button")

    # Verify the cookie banner is gone
    WebDriverWait(driver, 5).until_not(
        EC.presence_of_element_located((By.ID, "onetrust-banner-sdk"))
    )
    print("Cookie banner closed successfully")

except Exception as e:
    print(f"Error accepting cookies: {str(e)}")

# Wait for the page to fully load after handling cookies
try:
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".SearchResultsPage_search-results__grid__")))
    print("Page loaded successfully.")
except TimeoutException:
    print("Page did not load properly within the specified time.")


# Create an empty list to store URLs
urls = []

# Function to scroll and load products until we have at least 100
def load_products_until(limit=100):
    total_products = 0
    scroll_pause_time = 2  # Adjust if necessary

    while total_products < limit:
        # Scroll down to the bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)  # Wait for new products to load

        # Update the total number of products
        product_links = driver.find_elements(By.XPATH, '//a[@data-link-overlay="true"]')
        new_total_products = len(product_links)

        if new_total_products > total_products:
            total_products = new_total_products
            print(f"Total products loaded: {total_products}")
        else:
            # If no new products are loaded after scrolling, break the loop
            print("No more products loaded after scrolling.")
            break

# Function to scrape URLs
def url_scrape(limit=100):
    # Find all product link elements using the data-link-overlay attribute
    product_links = driver.find_elements(By.XPATH, '//a[@data-link-overlay="true"]')

    print(f"Found {len(product_links)} products on the page.")

    for product_link in product_links[:limit]:
        try:
            # Get the product URL from the href attribute
            product_url = product_link.get_attribute('href')

            # Ensure the URL is absolute
            if product_url.startswith('/'):
                product_url = 'https://www.influenster.com' + product_url

            # Remove any trailing slashes and append '/reviews/'
            product_url = product_url.rstrip('/') + '/reviews/'

            # Append the URL to the list
            urls.append({'URL': product_url})
        except Exception as e:
            print(f'Error finding information: {e}')


# Load products until we have at least 200
load_products_until(limit=200)

# Call the function to scrape URLs
url_scrape(limit=200)

# Close the browser when done
driver.quit()

# Convert the URLs list to a DataFrame and display it
urls_df = pd.DataFrame(urls)
print("Number of URLs pulled:", len(urls_df))
urls_df.to_csv('Influenster URL List.csv', sep='\t', encoding='utf-8', index=False, header=True)
urls_df[:5]

Initial page load complete
Waiting for cookie banner...
Cookie banner found
Accept button found
Clicked Accept All Cookies button
Error accepting cookies: Message: 

Page did not load properly within the specified time.
Total products loaded: 60
Total products loaded: 80
Total products loaded: 100
Total products loaded: 120
Total products loaded: 140
Total products loaded: 160
Total products loaded: 180
Total products loaded: 200
Found 200 products on the page.
Number of URLs pulled: 200


Unnamed: 0,URL
0,https://www.influenster.com/reviews/garnier-sl...
1,https://www.influenster.com/reviews/head-shoul...
2,https://www.influenster.com/reviews/pantene-da...
3,https://www.influenster.com/reviews/tresemme-s...
4,https://www.influenster.com/reviews/pantene-pr...


## Get 50 Reviews for Every URL

In [37]:
list = urls_df['URL'] #this is for all 100 URLs
#list = urls_df['URL'][:3] #this is for testing purposes only

In [38]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--start-maximized')
chrome_options.add_experimental_option('detach', True)

# Initialize the WebDriver
driver = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(driver, 20)

# List of product URLs to scrape
product_urls = list

# Initialize an empty list to store all review data
all_data = []

# Loop over each product URL
for url in product_urls:
    print(f"Processing URL: {url}")
    driver.get(url)
    data = []

    # Handle cookie preferences
    try:
        print("Waiting for cookie banner...")
        cookie_banner = wait.until(EC.presence_of_element_located((By.ID, "onetrust-banner-sdk")))
        print("Cookie banner found")
        
        accept_button = cookie_banner.find_element(By.ID, "onetrust-accept-btn-handler")
        time.sleep(1)  # Small pause to ensure button is interactable
        accept_button.click()
        print("Clicked Accept All Cookies button")

        # Verify cookie banner is gone
        WebDriverWait(driver, 5).until_not(
            EC.presence_of_element_located((By.ID, "onetrust-banner-sdk"))
        )
        print("Cookie banner closed successfully")
    except Exception as e:
        print(f"Error handling cookies: {e}")

    # Wait for the page to load
    try:
        wait.until(EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "InfiniteScroll_infinite-scroll__")]')))
        print("Page loaded successfully.")
    except TimeoutException:
        print("Page did not load properly.")
        continue

    # Extract the product title from the page title
    try:
        full_title = driver.title
        print(f"Full page title: {full_title}")
        # Split the title at 'Reviews' to get the product name
        product_title = full_title.split('Reviews')[0].strip()
        print(f"Product title: {product_title}")
    except Exception as e:
        print(f'Error finding product title: {e}')
        product_title = "Unknown Product"

    # Function to load reviews until we have at least 50
    def load_reviews_until(limit=50):
        while True:
            # Get the current number of reviews
            reviews = driver.find_elements(By.XPATH, '//div[@data-cy="review-ugc-container__body"]')
            total_reviews = len(reviews)
            print(f'Current number of reviews loaded: {total_reviews}')
            if total_reviews >= limit:
                print(f'Reached the limit of {limit} reviews.')
                break
            try:
                # Scroll to the bottom to make sure the "Load more" button is in view
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Wait for the "Load more" button to be clickable and click it
                load_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Load more")]')))
                print('Clicking "Load more" button...')
                driver.execute_script("arguments[0].click();", load_more_button)
                # Wait for new reviews to load
                time.sleep(2)  # Adjust the sleep time if necessary
            except (NoSuchElementException, TimeoutException):
                print("No more 'Load more' button found.")
                break
            except Exception as e:
                print(f"Error clicking 'Load more' button: {e}")
                break

    # Function to scrape reviews on the current page
    def scrape_reviews(limit=50):
        # Find all individual review elements
        reviews = driver.find_elements(By.XPATH, '//div[@data-cy="review-ugc-container__body"]')

        # Limit the number of reviews to the specified limit
        reviews = reviews[:limit]
        print(f'Scraping {len(reviews)} reviews.')

        for review_element in reviews:
            try:
                # Get the star rating element
                star_rating = review_element.find_element(By.XPATH, './/div[starts-with(@class, "StarRating_star-rating__rating-text__")]').text.strip()

                # Get the review text element
                review_text = review_element.find_element(By.XPATH, './/div[starts-with(@class, "Review_review__body-text__")]').text.strip()

                # Append the data
                data.append({
                    'product_name': product_title,
                    'user_rating': star_rating,
                    'product_review': review_text
                })
            except Exception as e:
                print(f'Error finding information: {e}')

    # Load reviews until we have at least 50
    load_reviews_until(limit=50)

    # Scrape the reviews
    scrape_reviews(limit=50)

    # Add the data from this product to the all_data list
    all_data.extend(data)

    # Optional: Pause between products to be polite to the server
    time.sleep(2)

# Close the browser when done
driver.quit()

# Convert the data to a DataFrame and display it
df = pd.DataFrame(all_data)
print("Total number of reviews pulled: ", len(df))
df[:5]

# Optionally, save the DataFrame to a CSV file
df.to_csv('Influenster Reviews.csv', index=False)


Processing URL: https://www.influenster.com/reviews/garnier-sleek-shine-intensely-smooth-leave-in-conditioning-cream/reviews/
Waiting for cookie banner...
Cookie banner found
Clicked Accept All Cookies button
Error handling cookies: Message: 

Page loaded successfully.
Full page title: Garnier Sleek & Shine Intensely Smooth Leave-In Conditioning Cream Reviews | Find the Best Hair Products Products | Influenster
Product title: Garnier Sleek & Shine Intensely Smooth Leave-In Conditioning Cream
Current number of reviews loaded: 10
Clicking "Load more" button...
Current number of reviews loaded: 20
Clicking "Load more" button...
Current number of reviews loaded: 30
Clicking "Load more" button...
Current number of reviews loaded: 40
Clicking "Load more" button...
Current number of reviews loaded: 50
Reached the limit of 50 reviews.
Scraping 50 reviews.
Processing URL: https://www.influenster.com/reviews/head-shoulders-classic-clean-anti-dandruff-shampoo/reviews/
Waiting for cookie banner...

## Get Website's Product Description for Every URL

In [41]:
list = urls_df['URL'] #this is for all 100 URLs
#list = urls_df['URL'][:3] #this is for testing purposes only

list = list.str.replace('/reviews/$', '', regex=True)


In [58]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time


# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--start-maximized')
chrome_options.add_experimental_option('detach', True)

# Initialize the WebDriver
driver = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(driver, 5) # Adjust the wait time if necessary

# List of product URLs to scrape
product_urls = list

# Initialize an empty list to store product descriptions
product_data = []

# Loop over each product URL
for url in product_urls:
    print(f"Processing URL: {url}")
    driver.get(url)
    time.sleep(2)  # Wait for the page to load

    # Accept cookies if the button appears
    try:
        cookie_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Accept All Cookies")]')))
        driver.execute_script("arguments[0].click();", cookie_button)
        print('Accepted cookies')
    except (NoSuchElementException, TimeoutException):
        print('No cookie button found or already accepted.')

    # Click the "See more" button to expand the description if it exists
    try:
        # Locate the "See more" button
        see_more_button = driver.find_element(By.XPATH, '//button[contains(text(), "See more")]')
        driver.execute_script("arguments[0].click();", see_more_button)
        print("Clicked 'See more' button to expand the product description.")
        # Wait a moment for the content to expand
        time.sleep(1)
    except NoSuchElementException:
        print("'See more' button not found; description is already fully visible.")

    # Extract the product description
    try:
        # Use XPath with starts-with to handle dynamic class names
        description_element = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//p[starts-with(@class, "ProductDescription_product-description__text__")]')
        ))
        product_description = description_element.text.strip()
        print(f"Product Description: {product_description}")
    except TimeoutException:
        print("Product description not found.")
        product_description = None

    # Extract the product title from the page title
    try:
        full_title = driver.title
        # Split the title at 'Reviews' to get the product name
        product_title = full_title.split('Reviews')[0].strip()
    except Exception as e:
        print(f'Error finding product title: {e}')
        product_title = "Unknown Product"

    # Inside the loop after extracting product description, add:
    try:
        # Get average rating
        rating_element = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//div[contains(@class, "StarRating_star-rating__rating-text__")]')
        ))
        average_rating = rating_element.text.strip()
        print(f"Average Rating: {average_rating}")
    except TimeoutException:
        print("Average rating not found")
        average_rating = None
    
    try:
        # Get total number of reviews using the specific class
        reviews_count_element = wait.until(EC.presence_of_element_located(
            (By.CLASS_NAME, "StarRating_star-rating__review-count__ba91T")
        ))
        total_reviews = reviews_count_element.text.replace(' REVIEWS', '').strip()
        print(f"Total Reviews: {total_reviews}")
    except TimeoutException:
        print("Total reviews count not found")
        total_reviews = None
    # Update the product_data dictionary to include the new fields
    product_data.append({
        'URL': url,
        'Product Name': product_title,
        'Product Description': product_description,
        'Average Rating': average_rating,
        'Total Reviews': total_reviews
    })
    # Optional: Pause between products to be polite to the server
    time.sleep(2)

# Close the driver when done
driver.quit()

# Convert the data to a DataFrame and display it
description_df = pd.DataFrame(product_data)
print("\nProduct Descriptions:")


# Optionally, save the DataFrame to a CSV file
description_df.to_csv('Influenster Product Descriptions.csv', index=False)
description_df[:5]

Processing URL: https://www.influenster.com/reviews/garnier-sleek-shine-intensely-smooth-leave-in-conditioning-cream
Accepted cookies
'See more' button not found; description is already fully visible.
Product Description: Leave-in conditioning cream for up to 3 day sleek* *With shampoo, conditioner and leave-in cream
Average Rating: 4 / 5
Total Reviews: 24370
Processing URL: https://www.influenster.com/reviews/head-shoulders-classic-clean-anti-dandruff-shampoo
No cookie button found or already accepted.
Clicked 'See more' button to expand the product description.
Product Description: Head & Shoulders® Classic Clean Shampoo is paraben free America’s #1 dandruff shampoo (*based on volume sales) Clinically proven to protect against flakes, itch, oil and dryness with regular use (**flakes and itch associated with dandruff; washes away oil & flakes) Clinically proven. Up to 100% dandruff protection. ( ***visible flakes, with regular use)
Average Rating: 5 / 5
Total Reviews: 18494
Processing

Unnamed: 0,URL,Product Name,Product Description,Average Rating,Total Reviews
0,https://www.influenster.com/reviews/garnier-sl...,Garnier Sleek & Shine Intensely Smooth Leave-I...,Leave-in conditioning cream for up to 3 day sl...,4 / 5,24370
1,https://www.influenster.com/reviews/head-shoul...,Head & Shoulders Classic Clean Anti-Dandruff S...,Head & Shoulders® Classic Clean Shampoo is par...,5 / 5,18494
2,https://www.influenster.com/reviews/pantene-da...,Pantene Daily Moisture Renewal Shampoo,Daily Moisture Renewal Shampoo,3 / 5,17030
3,https://www.influenster.com/reviews/tresemme-s...,Tresemme Silky & Smooth Shampoo for Frizzy Hair,"Looking for that high glam, smooth look? TRESe...",4 / 5,14324
4,https://www.influenster.com/reviews/pantene-pr...,Pantene Pro-V Smooth & Sleek Shampoo & Conditi...,Smooth things over with this frizz-fighting co...,5 / 5,13594
