In [18]:
############################
# 1. # IMPORTING LIBRARIES #
############################

import sys
import re
import os
import time
import pytz
import logging
import pandas as pd
from tqdm import tqdm
from dateutil import parser
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import TimeoutException, NoSuchElementException


# Timestamp (local time, YYYYMMDDHHMMSS) used to give every run its own log file.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# exist_ok=True replaces the separate os.path.exists() test, which could race
# with another process creating the folder between the check and the create.
log_folder = 'logs'
os.makedirs(log_folder, exist_ok=True)

# Root logger writes DEBUG and above to logs/log_file_<timestamp>.log;
# filemode='w' truncates the file if it already exists.
log_filename = os.path.join(log_folder, f"log_file_{timestamp}.log")
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    filename=log_filename,
                    filemode='w')



def create_directory():
    """Ensure the working folders Data, Output, Exceptions and Lookup exist.

    Each folder is handled independently: a failure is logged and the
    remaining folders are still attempted. Because ``exist_ok=True`` is used,
    a folder that already exists is also reported as created.
    """
    for dir_name in ("./Data", "./Output", "./Exceptions", "./Lookup"):
        try:
            os.makedirs(dir_name, exist_ok=True)
        except OSError as error:
            logging.error(f"Creation of the directory '{dir_name}' failed due to: {error}")
        else:
            logging.info(f"Directory '{dir_name}' created successfully")



###############################
# 2. # URL EXTRACTOR FUNCTION #
###############################
def get_urls():
    """Collect simplehuman product names and links from Walmart search pages.

    Steps:
      1. Open the brand search page and read the pagination links.
      2. Load every result page in a fresh browser session and extract each
         product tile's name and link.
      3. Normalise the links to plain ``https://www.walmart.com/...`` URLs.
      4. Write the result to ``Data/Walmart_Links.xlsx``.

    Returns:
        pandas.DataFrame: one row per product with ``Name`` and ``Link``
        columns (the same data that is written to the Excel file).
    """

    def convert_type1_link(link):
        # Site-relative link: prefix the host and drop the query string.
        return "https://www.walmart.com" + re.sub(r'\?.*$', '', link)

    def convert_type2_link(link):
        # Redirect link: pull the percent-encoded product URL out of the
        # ``rd=`` parameter, up to and including the first numeric path part.
        match = re.search(r'rd=(https%3A%2F%2Fwww\.walmart\.com.*?%2F(\d+))', link)
        if match:
            return match.group(1).replace('%2F', '/').replace('%3A', ':')
        return None

    ###############################################
    # 1. # READING THE PAGE NUMBERS OF THE SEARCH #
    ###############################################
    driver = Driver(uc=True)
    try:
        driver.get('https://www.walmart.com/search?q=simplehuman&facet=brand%3Asimplehuman&affinityOverride=default')
        page_number_elements = driver.find_elements(By.CSS_SELECTOR, 'a[data-automation-id="page-number"]')
        page_numbers = [element.text for element in page_number_elements]
    finally:
        # Always release the browser, even when the page load fails.
        driver.quit()

    print('List of Page Numbers:', page_numbers)

    if not page_numbers:
        # No pagination links were found (e.g. a single page of results):
        # still scrape the first page instead of exporting an empty file.
        logging.warning('No page numbers found; falling back to page 1 only.')
        page_numbers = ['1']

    #########################################################
    # 2. # SCRAPING PRODUCT NAME AND LINKS FROM BRAND PAGES #
    #########################################################
    # Name and link are stored as a pair so the two columns can never drift
    # out of step with each other.
    products = []

    for page in tqdm(page_numbers):
        # A new browser session is created for every result page.
        driver = Driver(uc=True)
        try:
            driver.get(f'https://www.walmart.com/search?q=_&facet=brand%3Asimplehuman&page={page}')

            # Element lookups below wait up to 10 seconds for a match.
            driver.implicitly_wait(10)

            # Keep refreshing while the verification dialog is displayed.
            # NOTE(review): this loop has no retry limit.
            human_dialog = driver.find_elements(By.XPATH, "//div[@aria-labelledby='ld_modalTitle_0']")
            while human_dialog:
                driver.refresh()
                time.sleep(5)
                human_dialog = driver.find_elements(By.XPATH, "//div[@aria-labelledby='ld_modalTitle_0']")
                logging.warning('ROBOT or HUMAN Verfication FOUND!')

            page_source = driver.page_source
        finally:
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')
        for item_div in soup.find_all('div', class_='mb0 ph1 pa0-xl bb b--near-white w-25'):
            nested_div = item_div.find('div', class_='h-100 pb1-xl pr4-xl pv1 ph1')
            if not nested_div:
                continue

            anchor = nested_div.find('a', href=True)
            title = nested_div.find('span', class_='w_iUH7')
            if anchor is None or title is None:
                # A tile without a link or a title cannot be exported.
                logging.warning(f'Skipping a product tile without link or title on page: {page}')
                continue

            products.append((title.text.strip(), anchor['href']))

        logging.info(f'Data Extracted for Page: {page}')

    print('Length of the Lists should be 150+:', len(products), len(products))

    ############################################
    # 3. # CONVERTING LINKS INTO CORRECT FORMAT #
    ############################################
    item_names = []
    converted_links = []
    for name, link in products:
        if link.startswith("/"):
            converted_link = convert_type1_link(link)
        else:
            converted_link = convert_type2_link(link)

        if not converted_link:
            # Drop the name together with its link so both columns keep the
            # same length.
            logging.warning(f'Link could not be converted, product skipped: {name}')
            continue

        item_names.append(name)
        converted_links.append(converted_link)

    #############################
    # 4. # EXPORTING THE OUTPUT #
    #############################
    df_ = pd.DataFrame({'Name': item_names, 'Link': converted_links})
    os.makedirs('Data', exist_ok=True)  # the export folder may not exist yet
    df_.to_excel('Data/Walmart_Links.xlsx', index=False)
    logging.info('Formatted Links File Exported.')
    return df_


#####################
# 3. # WEB SCRAPING #
#####################


def _open_zip_dialog(driver):
    """Click whichever button opens the delivery-location dialog."""
    try:
        zip_click_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@aria-label, 'Delivery to') and contains(@class, 'pointer')]")))
        zip_click_button.click()
    except TimeoutException:
        # The "Delivery to" button did not become clickable: use the
        # "Change shipping address" button instead.
        change_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[@aria-label='Change shipping address']")))
        change_button.click()


def set_zip_code_and_get_message(driver, zip_code):
    """Return the stock/delivery message of the open product page for a ZIP code.

    The page is refreshed first. If an element containing "Out of stock" is
    found, its text is returned straight away. Otherwise the delivery-location
    dialog is opened, ``zip_code`` is typed into the postal-code field and
    saved, and the delivery message shown afterwards is returned.

    Args:
        driver: Selenium WebDriver that already shows a product page.
        zip_code: ZIP code typed into the postal-code input.

    Returns:
        str: the text of the matched element, or ``'Out of Stock'`` when the
        delivery message element is missing after the ZIP code was saved.

    Raises:
        TimeoutException: when the "Change shipping address" button, the
            postal-code input or the save button does not become available
            within its wait.
    """
    try:
        driver.refresh()
        return driver.find_element(By.XPATH, "//div[contains(text(),'Out of stock')]").text
    except Exception:
        # Any failure here means "no usable out-of-stock banner": continue
        # with the ZIP flow. ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt / SystemExit still stop the run.
        logging.debug('No "Out of stock" element found; setting ZIP code %s', zip_code)

    try:
        _open_zip_dialog(driver)

        time.sleep(10)  # fixed pause before looking for the input (presumably lets the dialog render)

        input_element = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[autocomplete='postal-code']")))
        input_element.send_keys(Keys.END)  # Go to the end of the input field
        input_element.send_keys(Keys.HOME + Keys.SHIFT + Keys.END)  # Select the entire text
        input_element.send_keys(Keys.BACK_SPACE)  # Clear the selected text
        input_element.send_keys(zip_code)

        time.sleep(5)

        save_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[form='update-postal-code-form']")))
        save_button.click()

        time.sleep(15)  # allow page to refresh

        # Extract the delivery message text
        return driver.find_element(By.CSS_SELECTOR, 'div.f7.mt1.ws-normal.ttn').text
    except NoSuchElementException:
        return 'Out of Stock'
    
    
def calculate_days_remaining(date_string):
    """Convert a delivery message into the number of days until arrival.

    A leading ``"Arrives "`` prefix is removed as a prefix. The previous
    ``str.strip("Arrives ")`` removed any of those *characters* from both
    ends and turned e.g. "Arrives Apr 5" into "pr 5".

    Args:
        date_string: message such as "Arrives Dec 18", "Arrives tomorrow" or
            "Out of stock".

    Returns:
        int | list[str]: ``0`` / ``1`` as plain ints for "today" /
        "tomorrow" (case-insensitive). Otherwise a one-element list holding
        either the day count as a string (delivery date minus today's date in
        US/Pacific), or the cleaned message itself when no date can be parsed
        from it. The mixed return shape is kept for existing callers.
    """
    prefix = "Arrives "
    date_string = date_string.strip()
    if date_string.startswith(prefix):
        date_string = date_string[len(prefix):].strip()

    print("date_string: ", date_string)

    relative_days = {'today': 0, 'tomorrow': 1}
    lowered = date_string.lower()
    if lowered in relative_days:
        return relative_days[lowered]

    # "Today" is evaluated in US/Pacific.
    current_date_pst = datetime.now(pytz.timezone('US/Pacific')).date()
    # Fields missing from the message (normally the year) are taken from this
    # default, so parsing uses the same calendar day as the subtraction below.
    today_start = datetime(current_date_pst.year, current_date_pst.month, current_date_pst.day)

    try:
        delivery_date = parser.parse(date_string, default=today_start).date()
        if delivery_date < current_date_pst:
            # A date in the past means the message refers to next year
            # (e.g. "Jan 2" read in late December). If the message carries an
            # explicit year, re-parsing yields the same date again.
            next_year_start = datetime(current_date_pst.year + 1, 1, 1)
            delivery_date = parser.parse(date_string, default=next_year_start).date()
        days_difference = [str((delivery_date - current_date_pst).days)]
    except (ValueError, OverflowError):
        # Not a date (e.g. "Out of stock"): hand the message back unchanged.
        days_difference = [date_string]

    print(days_difference)

    return days_difference

def stock_information_extractor(zip_code):
    """Map a stock/delivery message to a stock label.

    Despite its name, ``zip_code`` is the message text to classify. Matching
    is case-insensitive and the first keyword found wins, checked in the
    order below; a message with none of the keywords is 'Available'.
    """
    message = zip_code.lower()
    keyword_labels = (
        ('out of stock', 'Out of Stock'),
        ('discontinued', 'Discontinued'),
        ('unavailable', 'Not Available'),
    )
    for keyword, label in keyword_labels:
        if keyword in message:
            return label
    return 'Available'

def _scrape_product(link):
    """Open one product page and return (name, price, zip-1 message, zip-2 message)."""
    driver = Driver(uc=True)
    try:
        driver.implicitly_wait(10)
        driver.get(link)
        time.sleep(2)

        product_name = driver.find_element(By.ID, "main-title").text

        product_price = driver.find_element(By.CSS_SELECTOR, 'span[data-testid="price-wrap"] span[itemprop="price"]').text
        if 'Now' in product_price:
            # Remove the word itself; str.strip('Now') only strips the
            # characters N/o/w from the ends and leaves the separating space.
            product_price = product_price.replace('Now', '', 1).strip()

        delivery_estimate_to_zip_1 = set_zip_code_and_get_message(driver, '90501')
        print("delivery_estimate_to_zip_1: ", delivery_estimate_to_zip_1)
        time.sleep(15)

        delivery_estimate_to_zip_2 = set_zip_code_and_get_message(driver, '08404')
        print("delivery_estimate_to_zip_2: ", delivery_estimate_to_zip_2)
    finally:
        # quit() ends the whole browser session; close() only closes the
        # current window. Runs on failure too.
        driver.quit()

    return product_name, product_price, delivery_estimate_to_zip_1, delivery_estimate_to_zip_2


def scrapper(df_link, start=12, stop=25):
    """Scrape name, price and delivery estimates for a slice of product links.

    Every selected link is opened in its own browser session. The delivery
    message is read for ZIP codes 90501 and 08404 (reported in the "LBC" and
    "NJ" columns). Results are written to
    ``Output/Walmart_<US/Pacific timestamp>.xlsx`` with two sheets:
    "Main Data" and "Manual review needed".

    A product that fails is logged and listed on the "Manual review needed"
    sheet, then the run continues with the next link.

    Args:
        df_link: DataFrame with a ``Link`` column of product URLs.
        start: first position of ``df_link['Link']`` to process. The default
            of 12 reproduces the previously hard-coded ``[12:25]`` slice;
            pass ``None`` to start at the beginning.
        stop: position to stop before (default 25); ``None`` runs to the end.

    Returns:
        pandas.DataFrame: the "Main Data" table, one row per scraped product.
    """
    product_name_list = []
    product_price_list = []
    zip_1_stock = []
    zip_2_stock = []
    ship_zip_code_1 = []
    ship_zip_code_2 = []
    product_url_list = []
    error = []

    link_list = df_link['Link'].to_list()[start:stop]

    for link in tqdm(link_list):
        try:
            (product_name, product_price,
             delivery_estimate_to_zip_1, delivery_estimate_to_zip_2) = _scrape_product(link)

            # calculate_days_remaining returns a bare int for today/tomorrow
            # and a one-element list otherwise; call it once per message and
            # unwrap the list form.
            days_to_zip_1 = calculate_days_remaining(delivery_estimate_to_zip_1)
            if isinstance(days_to_zip_1, list):
                days_to_zip_1 = days_to_zip_1[0]

            days_to_zip_2 = calculate_days_remaining(delivery_estimate_to_zip_2)
            if isinstance(days_to_zip_2, list):
                days_to_zip_2 = days_to_zip_2[0]

            stock_zip_1 = stock_information_extractor(delivery_estimate_to_zip_1)
            stock_zip_2 = stock_information_extractor(delivery_estimate_to_zip_2)
        except Exception as exc:
            # Record the failure for manual review and keep going; rows
            # scraped so far are not lost.
            logging.exception(f'Scraping failed for: {link}')
            error.append({'URL': link, 'Error': f'{type(exc).__name__}: {exc}'})
            continue

        # Append only once every value is available so all columns keep the
        # same length.
        product_name_list.append(product_name)
        product_price_list.append(product_price)
        ship_zip_code_1.append(days_to_zip_1)
        ship_zip_code_2.append(days_to_zip_2)

        print("ship_zip_code_1: ", ship_zip_code_1)
        print("ship_zip_code_2: ", ship_zip_code_2)

        zip_1_stock.append(stock_zip_1)
        zip_2_stock.append(stock_zip_2)
        product_url_list.append(link)

    ###########################
    # 4. # EXPORTING THE FILE #
    ###########################
    # Overall status is 'Available' only when both ZIP codes are 'Available'.
    stock_status = [
        'Available' if (lbc == 'Available' and nj == 'Available') else 'Not Available'
        for lbc, nj in zip(zip_1_stock, zip_2_stock)
    ]

    df_prod = pd.DataFrame({
        'Product Name': product_name_list,
        'Price': product_price_list,
        '# of days to LBC': ship_zip_code_1,
        'Stock for LBC': zip_1_stock,
        '# of days to NJ': ship_zip_code_2,
        'Stock for NJ': zip_2_stock,
        'Stock Status': stock_status,
        'URL': product_url_list,
    })

    # Day counts mix ints and strings; store them uniformly as text.
    df_prod['# of days to LBC'] = df_prod['# of days to LBC'].astype(str)
    df_prod['# of days to NJ'] = df_prod['# of days to NJ'].astype(str)

    df_errors = pd.DataFrame(error, columns=['URL', 'Error'])

    os.makedirs('Output', exist_ok=True)  # the export folder may not exist yet
    pst_time_now = datetime.now(pytz.timezone('US/Pacific')).strftime('%Y-%m-%d %H-%M-%S')
    with pd.ExcelWriter(f'Output/Walmart_{pst_time_now}.xlsx') as writer:
        df_prod.to_excel(writer, sheet_name='Main Data', index=False)
        df_errors.to_excel(writer, sheet_name='Manual review needed', index=False)

    logging.info('Scraping completed.')
    return df_prod
    
    
def main():
    """Scrape the products listed in the existing ``Walmart_Links.xlsx`` workbook."""
    # Link discovery via get_urls() is not run here; the links are read from
    # a workbook in the current working directory.
    # NOTE(review): get_urls() exports to 'Data/Walmart_Links.xlsx', a
    # different path from the one read below - confirm which is intended.
    scrapper(pd.read_excel("Walmart_Links.xlsx"))


# Entry point: run the scraper only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

delivery_estimate_to_zip_1:  Out of stock
delivery_estimate_to_zip_2:  Out of stock
date_string:  Out of stock
['Out of stock']
date_string:  Out of stock
['Out of stock']
ship_zip_code_1:  ['Out of stock']
ship_zip_code_2:  ['Out of stock']


  8%|██████▍                                                                            | 1/13 [00:53<10:47, 53.97s/it]

delivery_estimate_to_zip_1:  Arrives Dec 18
delivery_estimate_to_zip_2:  Arrives tomorrow
date_string:  Dec 18
['5']
date_string:  tomorrow
date_string:  tomorrow
ship_zip_code_1:  ['Out of stock', '5']
ship_zip_code_2:  ['Out of stock', 1]


 15%|████████████▊                                                                      | 2/13 [02:58<13:46, 75.16s/it]

delivery_estimate_to_zip_1:  Arrives Dec 18
delivery_estimate_to_zip_2:  Arrives tomorrow
date_string:  Dec 18
['5']
date_string:  tomorrow
date_string:  tomorrow
ship_zip_code_1:  ['Out of stock', '5', '5']
ship_zip_code_2:  ['Out of stock', 1, 1]


 23%|███████████████████▏                                                               | 3/13 [05:28<16:15, 97.55s/it]

delivery_estimate_to_zip_1:  Arrives tomorrow
delivery_estimate_to_zip_2:  Arrives Dec 15
date_string:  tomorrow
date_string:  tomorrow
date_string:  Dec 15
['2']
ship_zip_code_1:  ['Out of stock', '5', '5', 1]
ship_zip_code_2:  ['Out of stock', 1, 1, '2']


 31%|█████████████████████████▏                                                        | 4/13 [07:38<16:06, 107.40s/it]

delivery_estimate_to_zip_1:  Arrives Dec 18
delivery_estimate_to_zip_2:  Arrives Dec 20
date_string:  Dec 18
['5']
date_string:  Dec 20
['7']
ship_zip_code_1:  ['Out of stock', '5', '5', 1, '5']
ship_zip_code_2:  ['Out of stock', 1, 1, '2', '7']


 38%|███████████████████████████████▌                                                  | 5/13 [09:59<15:39, 117.47s/it]

delivery_estimate_to_zip_1:  Arrives Dec 18
delivery_estimate_to_zip_2:  Arrives tomorrow
date_string:  Dec 18
['5']
date_string:  tomorrow
date_string:  tomorrow
ship_zip_code_1:  ['Out of stock', '5', '5', 1, '5', '5']
ship_zip_code_2:  ['Out of stock', 1, 1, '2', '7', 1]


 46%|█████████████████████████████████████▊                                            | 6/13 [12:10<14:09, 121.36s/it]

delivery_estimate_to_zip_1:  Arrives Dec 18
delivery_estimate_to_zip_2:  Arrives tomorrow
date_string:  Dec 18
['5']
date_string:  tomorrow
date_string:  tomorrow
ship_zip_code_1:  ['Out of stock', '5', '5', 1, '5', '5', '5']
ship_zip_code_2:  ['Out of stock', 1, 1, '2', '7', 1, 1]


 54%|████████████████████████████████████████████▏                                     | 7/13 [14:30<12:42, 127.17s/it]

delivery_estimate_to_zip_1:  Out of stock
delivery_estimate_to_zip_2:  Out of stock
date_string:  Out of stock
['Out of stock']
date_string:  Out of stock
['Out of stock']
ship_zip_code_1:  ['Out of stock', '5', '5', 1, '5', '5', '5', 'Out of stock']
ship_zip_code_2:  ['Out of stock', 1, 1, '2', '7', 1, 1, 'Out of stock']


 62%|██████████████████████████████████████████████████▍                               | 8/13 [15:22<08:42, 104.50s/it]

delivery_estimate_to_zip_1:  Arrives Dec 18
delivery_estimate_to_zip_2:  Arrives Dec 20
date_string:  Dec 18
['5']
date_string:  Dec 20
['7']
ship_zip_code_1:  ['Out of stock', '5', '5', 1, '5', '5', '5', 'Out of stock', '5']
ship_zip_code_2:  ['Out of stock', 1, 1, '2', '7', 1, 1, 'Out of stock', '7']


 69%|████████████████████████████████████████████████████████▊                         | 9/13 [17:34<07:31, 112.81s/it]

delivery_estimate_to_zip_1:  Arrives Dec 18
delivery_estimate_to_zip_2:  Arrives Dec 20
date_string:  Dec 18
['5']
date_string:  Dec 20
['7']
ship_zip_code_1:  ['Out of stock', '5', '5', 1, '5', '5', '5', 'Out of stock', '5', '5']
ship_zip_code_2:  ['Out of stock', 1, 1, '2', '7', 1, 1, 'Out of stock', '7', '7']


 77%|██████████████████████████████████████████████████████████████▎                  | 10/13 [19:41<05:51, 117.12s/it]

delivery_estimate_to_zip_1:  Arrives Dec 18
delivery_estimate_to_zip_2:  Arrives tomorrow
date_string:  Dec 18
['5']
date_string:  tomorrow
date_string:  tomorrow
ship_zip_code_1:  ['Out of stock', '5', '5', 1, '5', '5', '5', 'Out of stock', '5', '5', '5']
ship_zip_code_2:  ['Out of stock', 1, 1, '2', '7', 1, 1, 'Out of stock', '7', '7', 1]


 85%|████████████████████████████████████████████████████████████████████▌            | 11/13 [21:51<04:01, 120.89s/it]

delivery_estimate_to_zip_1:  Arrives Dec 18
delivery_estimate_to_zip_2:  Arrives tomorrow
date_string:  Dec 18
['5']
date_string:  tomorrow
date_string:  tomorrow
ship_zip_code_1:  ['Out of stock', '5', '5', 1, '5', '5', '5', 'Out of stock', '5', '5', '5', '5']
ship_zip_code_2:  ['Out of stock', 1, 1, '2', '7', 1, 1, 'Out of stock', '7', '7', 1, 1]


 92%|██████████████████████████████████████████████████████████████████████████▊      | 12/13 [24:08<02:05, 125.85s/it]

delivery_estimate_to_zip_1:  Arrives Dec 18
delivery_estimate_to_zip_2:  Arrives tomorrow
date_string:  Dec 18
['5']
date_string:  tomorrow
date_string:  tomorrow
ship_zip_code_1:  ['Out of stock', '5', '5', 1, '5', '5', '5', 'Out of stock', '5', '5', '5', '5', '5']
ship_zip_code_2:  ['Out of stock', 1, 1, '2', '7', 1, 1, 'Out of stock', '7', '7', 1, 1, 1]


100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [26:18<00:00, 121.44s/it]


Unnamed: 0,Name,Link
0,simplehuman Code H Custom Fit Drawstring Trash...,https://www.walmart.com/ip/simplehuman-Code-H-...
1,simplehuman 45 Liter Rectangular Hands-Free Ki...,https://www.walmart.com/ip/simplehuman-45-Lite...
2,simplehuman Code J Custom Fit Drawstring Trash...,https://www.walmart.com/ip/simplehuman-Code-J-...
3,simplehuman 10 gal Stainless Steel Rectangular...,https://www.walmart.com/ip/simplehuman-10-gal-...
4,simplehuman 58 Liter / 15.3 gal Stainless Stee...,https://www.walmart.com/ip/simplehuman-58-Lite...
...,...,...
166,"Simplehuman Code G Custom Fit Liners, 20 Count",https://www.walmart.com/ip/Simplehuman-Code-G-...
167,simplehuman 12 gal Plastic Rectangular Kitchen...,https://www.walmart.com/ip/simplehuman-12-gal-...
168,simplehuman Code K Custom Fit Drawstring Trash...,https://www.walmart.com/ip/simplehuman-Code-K-...
169,simplehuman Code Q Custom Fit Drawstring Trash...,https://www.walmart.com/ip/simplehuman-Code-Q-...
