# Webscraper

For best results, use a *specific* search term and a small number of pages.

To start, click "Run All" and enjoy!


## imports

In [49]:
#pip install selenium
#pip install bs4

from bs4 import BeautifulSoup
from selenium import webdriver
import numpy as np


## helper functions 

In [50]:
def get_url(search_term):
    """Build the Amazon search URL template for ``search_term``.

    The term is percent-encoded (spaces become ``+``) so that characters such
    as ``&``, ``#``, ``+`` or braces cannot corrupt the query string or the
    page placeholder appended below.

    Args:
        search_term: Free-text product query.

    Returns:
        A URL string ending in ``&page={}``; call ``.format(page)`` on it to
        obtain the URL of a specific results page.
    """
    # Function-scope import keeps this helper self-contained.
    from urllib.parse import quote_plus

    query = quote_plus(search_term)

    # The trailing '{}' is left unformatted on purpose: it is the page slot.
    return 'https://www.amazon.com/s?k=' + query + '&ref=nb_sb_noss_1' + '&page={}'

In [51]:
def get_url_newegg(search_term):
    """Build the Newegg search URL template for ``search_term``.

    The term is percent-encoded (spaces become ``+``) so that characters such
    as ``&``, ``#``, ``+`` or braces cannot corrupt the query string or the
    page placeholder appended below.

    Args:
        search_term: Free-text product query.

    Returns:
        A URL string ending in ``&page={}``; call ``.format(page)`` on it to
        obtain the URL of a specific results page.
    """
    # Function-scope import keeps this helper self-contained.
    from urllib.parse import quote_plus

    query = quote_plus(search_term)

    # The trailing '{}' is left unformatted on purpose: it is the page slot.
    return 'https://www.newegg.com/p/pl?d=' + query + '&page={}'

In [52]:
def extract_products_data(item):
    """Extract one product record from an Amazon search-result element.

    Args:
        item: A parsed search-result element exposing ``h2`` (whose ``a``
            child is the title link) and a ``find(name, css_class)`` method.

    Returns:
        A ``(description, price, url)`` tuple of strings, or ``None`` when no
        price markup is found. ``price`` is the displayed price text with its
        first character dropped (presumably the currency symbol).
    """
    # Title link. A result without an <h2> yields None here instead of
    # raising, so the fallbacks below are used.
    atag = getattr(item.h2, 'a', None)

    try:
        description = atag.text.strip()
    except AttributeError:
        description = 'description is not available for this product'

    try:
        url = 'https://www.amazon.com' + atag.get('href')
    except (AttributeError, TypeError):
        # AttributeError: no link tag; TypeError: link has no href (None).
        url = 'url not found - please try another product'

    try:
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text[1:]
    except AttributeError:
        # No price markup: signal "skip this product" to the caller.
        return None

    return (description, price, url)

In [53]:
def extract_products_data_newegg(item):
    """Extract one product record from a Newegg search-result element.

    Args:
        item: A parsed result cell exposing ``div`` (whose ``a`` child is the
            product link, with an ``img`` carrying the title) and a
            ``find(name, css_class)`` method.

    Returns:
        A ``(description, price, url)`` tuple, or ``None`` when no price
        markup is found. ``price`` is the text of the ``strong`` element
        followed by the text of the ``sup`` element inside the price node.
    """
    missing_description = 'description is not available for this product'
    missing_url = 'url not found - please try another product'

    # Product link. A cell without a leading <div> yields None here instead
    # of raising, so the fallbacks below are used.
    atag = getattr(item.div, 'a', None)

    try:
        # `or` also covers an image that has no title attribute.
        description = atag.img.get('title') or missing_description
    except AttributeError:
        description = missing_description

    try:
        # `or` also covers a link that has no href attribute.
        url = atag.get('href') or missing_url
    except AttributeError:
        url = missing_url

    try:
        price_parent = item.find('li', 'price-current')
        price = price_parent.find('strong').text + price_parent.find('sup').text
    except AttributeError:
        # No price markup: signal "skip this product" to the caller.
        return None

    return (description, price, url)

In [54]:
def cheapest_in_list(data, prices):
    """Return the entry of ``data`` whose price is the lowest.

    Args:
        data: Product records, index-aligned with ``prices``.
        prices: One numeric price per record in ``data``.

    Returns:
        The record at the position of the smallest price (the first such
        position on ties), or ``None`` when there is nothing to choose from.
    """
    # np.argmin raises ValueError on an empty sequence, so handle that case
    # explicitly instead of crashing when a search produced no products.
    if len(data) == 0 or len(prices) == 0:
        return None

    cheapest_index = int(np.argmin(np.array(prices)))
    return data[cheapest_index]

In [55]:
def display_cheapest(cheapest_product):
    """Open a product's page in a newly started Chrome session.

    The URL is read from index 2 of ``cheapest_product``. The driver created
    here is not closed by this function.
    """
    browser = webdriver.Chrome()
    product_url = cheapest_product[2]
    browser.get(product_url)

## Amazon function

In [56]:
def amazon_scraper(search_term, page_number):
    """Scrape Amazon search results for ``search_term``.

    Args:
        search_term: Text to search for.
        page_number: Exclusive upper bound of the page range; pages
            ``1 .. page_number - 1`` are fetched.

    Returns:
        ``(products, prices, cheapest)``: ``products`` is a list of the
        tuples returned by ``extract_products_data``, ``prices`` holds the
        matching prices as floats (same order), and ``cheapest`` is the
        result of ``cheapest_in_list(products, prices)``.
    """
    amazon_products_data = []
    amazon_prices = []
    url = get_url(search_term)

    driver = webdriver.Chrome()
    try:
        for page in range(1, page_number):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                product = extract_products_data(item)
                if not product:
                    continue
                try:
                    price = float(product[1].replace(',', ''))
                except ValueError:
                    # Price text that is not a plain number: skip the product
                    # so both lists stay index-aligned.
                    continue
                amazon_products_data.append(product)
                amazon_prices.append(price)
    finally:
        # Always end the browser session, even when a page load or parse
        # step raised; quit() shuts down the driver, not just the window.
        driver.quit()

    # get cheapest product from amazon
    cheapest_amazon_product = cheapest_in_list(amazon_products_data, amazon_prices)

    return (amazon_products_data, amazon_prices, cheapest_amazon_product)



## Newegg function

In [57]:
def newegg_scraper(search_term, page_number):
    """Scrape Newegg search results for ``search_term``.

    Args:
        search_term: Text to search for.
        page_number: Exclusive upper bound of the page range; pages
            ``1 .. page_number - 1`` are fetched.

    Returns:
        ``(products, prices, cheapest)``: ``products`` is a list of the
        tuples returned by ``extract_products_data_newegg``, ``prices`` holds
        the matching prices as floats (same order), and ``cheapest`` is the
        result of ``cheapest_in_list(products, prices)``.
    """
    newegg_products_data = []
    newegg_prices = []
    url = get_url_newegg(search_term)

    driver = webdriver.Chrome()
    try:
        for page in range(1, page_number):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'class': 'item-cell'})

            for item in results:
                product = extract_products_data_newegg(item)
                if not product:
                    continue
                try:
                    price = float(product[1].replace(',', ''))
                except ValueError:
                    # Price text that is not a plain number: skip the product
                    # so both lists stay index-aligned.
                    continue
                newegg_products_data.append(product)
                newegg_prices.append(price)
    finally:
        # Always end the browser session, even when a page load or parse
        # step raised; quit() shuts down the driver, not just the window.
        driver.quit()

    # get cheapest product from newegg
    cheapest_newegg_product = cheapest_in_list(newegg_products_data, newegg_prices)

    return (newegg_products_data, newegg_prices, cheapest_newegg_product)



### inputs function

In [58]:
def take_inputs():
    """Prompt the user for the search parameters.

    Returns:
        ``(search_term, prime_or_not, number_of_pages)``.

        ``prime_or_not`` is the answer as typed; it is accepted only once it
        reads ``yes`` or ``no`` (ignoring case and surrounding whitespace).

        ``number_of_pages`` is the requested page count plus one, so it can
        be used directly as the exclusive end of ``range(1, n)``. It falls
        back to 5 when the answer is not a positive integer.
    """
    search_term = input("Please enter your product of interest: ")

    prime_or_not = input("Are you an Amazon-Prime user (yes/no)? ")
    # Re-prompt until the answer is usable rather than only printing a hint
    # and carrying on with the invalid value.
    while prime_or_not.strip().lower() not in ('yes', 'no'):
        print('Type yes or no')
        prime_or_not = input("Are you an Amazon-Prime user (yes/no)? ")

    number_of_pages = input("Please type the number of pages you would like to search thru (keep in mind that high numbers can cause long runtime and less accurate results): ")
    try:
        number_of_pages = int(number_of_pages) + 1
    except ValueError:
        number_of_pages = 5
    else:
        if number_of_pages < 2:
            # A request for zero or fewer pages would give an empty page
            # range; use the same default as for unparsable input.
            number_of_pages = 5

    return search_term, prime_or_not, number_of_pages


### Main helper function

In [59]:
def cheapest_overall(cheapest_amazon, amazon_prices, amazon_data, cheapest_newegg, newegg_data, newegg_prices):
    """Pick the cheaper of the two per-site candidates and consume it.

    The chosen product and its price are removed from that site's lists (in
    place), so a later search over the lists yields the next candidate.

    Args:
        cheapest_amazon: ``(description, price, url)`` candidate from Amazon,
            or ``None`` if Amazon has no candidate.
        amazon_prices: Float prices for ``amazon_data``; mutated.
        amazon_data: Amazon product tuples; mutated.
        cheapest_newegg: Candidate from Newegg, or ``None``.
        newegg_data: Newegg product tuples; mutated.
        newegg_prices: Float prices for ``newegg_data``; mutated.

    Returns:
        ``(product, website)`` where ``website`` is ``'Amazon'`` or
        ``'Newegg'``. On equal prices the Newegg product is chosen.

    Raises:
        ValueError: If both candidates are ``None``, or if the chosen price
            or product is not present in the corresponding list.
    """
    def _as_number(product):
        # Price text may contain thousands separators ("1,234.50").
        return float(str(product[1]).replace(',', ''))

    if cheapest_amazon is None and cheapest_newegg is None:
        raise ValueError('no products left to compare')

    pick_amazon = cheapest_newegg is None or (
        cheapest_amazon is not None
        and _as_number(cheapest_amazon) < _as_number(cheapest_newegg)
    )

    if pick_amazon:
        amazon_prices.remove(_as_number(cheapest_amazon))
        amazon_data.remove(cheapest_amazon)
        return cheapest_amazon, 'Amazon'

    newegg_prices.remove(_as_number(cheapest_newegg))
    newegg_data.remove(cheapest_newegg)
    return cheapest_newegg, 'Newegg'

# main function 

In [60]:

def main():
    """Run the interactive price comparison.

    Prompts for the search parameters, scrapes both sites, then repeatedly
    shows the cheapest remaining product (printed and opened in a browser)
    until the user presses Enter on an empty prompt or no further product can
    be selected.
    """
    # take input from user
    search_term, prime_or_not, number_of_pages = take_inputs()

    # scrape Amazon and Newegg for products data and find cheapest products
    amazon_data, amazon_prices, cheapest_amazon_product = amazon_scraper(search_term, number_of_pages)
    newegg_data, newegg_prices, cheapest_newegg_product = newegg_scraper(search_term, number_of_pages)

    while True:
        # find cheapest product overall; this also removes it from its list
        try:
            cur_cheapest, cur_cheapest_website = cheapest_overall(
                cheapest_amazon_product, amazon_prices, amazon_data,
                cheapest_newegg_product, newegg_data, newegg_prices)
        except ValueError:
            print('No further product could be selected.')
            break

        # display product and print its data in case the client would like
        # to use it later
        print('The following product is brought to you from ' + cur_cheapest_website)
        print(cur_cheapest)
        display_cheapest(cur_cheapest)

        # if client is not satisfied with this result, try the next cheapest
        if not input("If you would like to check the next cheapest product type anything, otherwise press Enter"):
            break

        try:
            cheapest_amazon_product = cheapest_in_list(amazon_data, amazon_prices)
            cheapest_newegg_product = cheapest_in_list(newegg_data, newegg_prices)
        except ValueError:
            # Raised when a product list has been used up.
            print('No further product could be selected.')
            break
            

        
        


# Start the interactive comparison: main() prompts for input on the console
# and opens browser windows.
main()

The following product is brought to you from Amazon
('OGX Hydrating + Tea Tree Mint Shampoo, Nourishing & Invigorating Scalp Shampoo with Tea Tree & Peppermint Oil & Milk Proteins, Paraben-Free, Sulfate-Free Surfactants, 13 fl oz', '5.59', 'https://www.amazon.com/OGX-Invigorating-Paraben-Free-Sulfate-Free-Surfactants/dp/B000TGC8D2/ref=sr_1_38?keywords=shampoo&qid=1672176601&sr=8-38')
