# Team Members:

- Pranav Garg
- Ronak Goyal
- Utkarsh Garg
- Alexander Imhoff
- John Izzo
- Akash Barathan


IMPORTING THE REQUIRED LIBRARIES

In [1]:
# Data manipulation
import pandas as pd


# Selenium WebDriver for web scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Chrome launch options for running the browser headless (no visible window).
# The sandbox / dev-shm switches are the ones typically required when Chrome
# runs inside a container or hosted notebook -- presumably why they are set here.
chrome_options = webdriver.ChromeOptions()
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)

SCRAPER CODE

In [5]:
def _rating_from_image_src(rating_src):
    """Translate a rating icon URL into a numeric score.

    Returns 1 for 'thumbs3.png', -1 for 'thumbs1.png', 0 for 'thumbs2.png',
    and None when the URL is missing or matches none of those icons.
    """
    # NOTE(review): the icon-to-score mapping is carried over unchanged from the
    # original code, which flagged it as "adjust based on actual image mappings".
    icon_scores = (('thumbs3.png', 1), ('thumbs1.png', -1), ('thumbs2.png', 0))
    for icon_name, score in icon_scores:
        if rating_src and icon_name in rating_src:
            return score
    return None


def web_scrapper(number_of_pages = 1000, wait_seconds = 2, output_csv = 'scraped_reviews.csv'):
    """Scrape review listings from basenotes.com and save them to a CSV file.

    For every listing page from 1 to ``number_of_pages`` the function collects,
    per review block: the page number, perfume name, review text, numeric
    rating and review date. The combined table is printed and written to
    ``output_csv``.

    Parameters
    ----------
    number_of_pages : int
        Number of listing pages to visit, starting at page 1.
    wait_seconds : float
        Maximum time to wait for the first review block to become visible on
        each page.
    output_csv : str
        Path of the CSV file the scraped reviews are written to.

    Notes
    -----
    * Uses the module-level ``chrome_options`` (headless configuration) defined
      in the imports cell; that cell must be run first.
    * A failure on one page (navigation, timeout, missing element) is printed
      and the scraper moves on to the next page; reviews already collected
      from that page are kept.
    * ``rating`` is ``None`` when the rating icon is not one of the known
      images.
    """
    # Base URL; the page number is substituted for each request
    base_url = "https://basenotes.com/reviews/?&page={}"

    # Accumulates one dict per review across all pages
    all_reviews = []

    # Apply the headless options configured alongside the imports
    wd = webdriver.Chrome(options=chrome_options)

    try:
        for page in range(1, number_of_pages + 1):
            try:
                # Navigation is inside the try so a single bad page load does
                # not abort the whole run
                wd.get(base_url.format(page))

                # Wait until at least one review block is visible
                WebDriverWait(wd, wait_seconds).until(
                    EC.visibility_of_element_located((By.CLASS_NAME, 'fragreviewouter'))
                )

                for review_block in wd.find_elements(By.CLASS_NAME, 'fragreviewouter'):
                    perfume_name = review_block.find_element(By.TAG_NAME, 'h3').text
                    review_text = review_block.find_element(By.CLASS_NAME, 'fragreview').text

                    # The rating is encoded in the file name of the thumbs icon
                    rating_image = review_block.find_element(
                        By.CLASS_NAME, 'reviewrating'
                    ).find_element(By.TAG_NAME, 'img')
                    # Computed fresh per review so an unknown icon can never
                    # inherit the previous review's rating
                    rating = _rating_from_image_src(rating_image.get_attribute('src'))

                    # The date is the first line of the review footer
                    footer_text = review_block.find_element(By.CLASS_NAME, 'fragreviewfooter').text
                    review_date = footer_text.split('\n')[0].strip()

                    all_reviews.append({
                        'page': page,
                        'perfume': perfume_name,
                        'review': review_text,
                        'rating': rating,
                        'date': review_date,
                    })

            except Exception as e:
                # Best-effort scraping: report and continue with the next page
                print(f"An error occurred on page {page}: {e}")
    finally:
        # Always shut the browser down, even if the run is interrupted
        wd.quit()

    # Convert to DataFrame after all pages are scraped
    df = pd.DataFrame(all_reviews)

    # Show the collected reviews
    print(df)

    # Persist to CSV
    df.to_csv(output_csv, index=False)


In [6]:
# Quick run over the first two listing pages; prints the table and writes the CSV
web_scrapper(number_of_pages=2)

    page                                            perfume  \
0      1                         Psychedelic Love by Initio   
1      1                      Addictive Vibration by Initio   
2      1                    Oud For Greatness NEO by Initio   
3      1                              Goat by Wolf Brothers   
4      1                      Dates Delight by House of Oud   
5      1                       Lavender Extrême by Tom Ford   
6      1                          Venice Rococo by Arquiste   
7      1                     A Grove by the Sea by Arquiste   
8      1                           Almond Suede by Arquiste   
9      1                     Lovely by Sarah Jessica Parker   
10     1  Lolita Lempicka Fleur Défendue by Lolita Lempicka   
11     1                     Cuir Mauresque by Serge Lutens   
12     1       Guimauve de Noël / 31 by Parle Moi de Parfum   
13     1                 Lelong pour Femme by Lucien Lelong   
14     1              Forever Elizabeth by Elizabeth Ta