# ShoppingSpreeScript (s3)
Tested and working as of 05/08/2021. All rights reserved.

In [1]:
### importing all the libraries needed
import re
import time
from time import sleep
from urllib.parse import urljoin
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

## Part 1: Setting up Selenium

In [2]:
### initial set up of selenium and chrome settings. Selenium is imitating as a Chrome broswer. 
### credits @ https://stackoverflow.com/questions/62057645/how-to-scrape-data-from-shopee-using-beautiful-soup

driver = webdriver.Chrome(ChromeDriverManager().install()) #this ensures you are using the latest chrome version everytime! Dont even need to find driver_path.  
chrome_options = Options()
chrome_options.headless = True
chrome_options.add_argument("--window-size=1920,1200")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('user-data-dir=C:\\Users\\username\\AppData\\Local\\Google\\Chrome\\User Data\\Default')

# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument("disable-infobars")

# Pass the argument 1 to allow and 2 to block
chrome_options.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 2})

### get the browser load the url and display stuffs 
driver.get("https://shopee.sg/flash_deals")

### To get broswer to scroll endlessly. Credits @ https://stackoverflow.com/questions/20986631/how-can-i-scroll-a-web-page-using-selenium-webdriver-in-python
SCROLL_PAUSE_TIME = 0.5

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

### transferring to BeautifulSoup

sleep(30) # seconds. it is v impt to give browser some time to sleep, so that the scrolling can be completed and get full HTML

selenium_html = driver.page_source
soup = BeautifulSoup(selenium_html, 'lxml') #lxml parsing is faster than html parsing 

driver.quit() #to automatically close the browser



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [C:\Users\user\.wdm\drivers\chromedriver\win32\92.0.4515.107\chromedriver.exe] found in cache


## Part 2: Data Extraction

In [3]:
### Data wrangling - selecting portions of interest from the css 
sold_out = soup.find_all('div', class_ = "flash-sale-item-card flash-sale-item-card--landing-page flash-sale-item-card--SG flash-sale-item-card--sold-out")
still_selling = soup.find_all('div', class_ = "flash-sale-item-card flash-sale-item-card--landing-page flash-sale-item-card--SG")
items_type = [sold_out, still_selling]

# empty lists for loop below
items_name = []
original_price = []
sale_price = []
items_url = []
sold_quantity = []

In [4]:
# A double loop to get all fields of interest into lists. Credits @ QYGoh
for types in range(len(items_type)):
    for item in range(len(items_type[types])):
        items_name.append(items_type[types][item].find('div', class_ ="flash-sale-item-card__item-name-box").get_text())
        original_price.append(items_type[types][item].find('div', class_ = "flash-sale-item-card__original-price flash-sale-item-card__original-price--landing-page").get_text())
        sale_price.append(items_type[types][item].find('div', class_ = "flash-sale-item-card__current-price flash-sale-item-card__current-price--landing-page").get_text())
        
        # to get url- updated version on 05/08/2021
        links = items_type[types][item].find_all('a', {'class': "flash-sale-item-card-link", 'href': True})
        for link in links:
            items_url.append(link['href'])
        
        # special arrangement for sold quantity. Fully sold out have diff classes vs still selling
        if types == 0:
            sold_quantity.append(items_type[types][item].find('div', class_ = "flash-sale-sold-out flash-sale-sold-out--landing-page").get_text()) 
        elif types == 1:
            sold_quantity.append(items_type[types][item].find('div', class_ = "flash-sale-progress-bar__text").get_text())

## Part 3: Reading & Saving outputs

In [5]:
### reading extracted data in a pandas dataframe

df = pd.DataFrame(list(zip(items_name, original_price, sale_price, sold_quantity, items_url)),\
                 columns =['Name','Original_price','Sale_price','Sold_quantity','URL'])

### Making good of the URL
for link in range(len(df['URL'])):
    df['URL'][link] = "https://shopee.sg"+ df['URL'][link]
    
df

Unnamed: 0,Name,Original_price,Sale_price,Sold_quantity,URL
0,5-Clip Earphone Cable Winder， Organizer Charge...,$ 5.20,$ 0.10,18 sold in 1 minute,https://shopee.sg/5-Clip-Earphone-Cable-Winder...
1,Hi/ Korean Fashion Scrunchies Blue Hair Tie Sw...,$ 1.80,$ 0.10,200 sold in 1 minute,https://shopee.sg/Hi-Korean-Fashion-Scrunchies...
2,BreadTalk Tuna,$ 1.90,$ 0.80,500 sold in 1 minute,https://shopee.sghttps://shopee.sg/dp
3,Improve Blackhead Bamboo Charcoal Nose Mask Ca...,$ 0.95,$ 0.10,200 sold in 1 minute,https://shopee.sg/Improve-Blackhead-Bamboo-Cha...
4,Women Nose Shaper Clip / Nose Up Lifting Nose ...,$ 4.56,$ 0.10,200 sold in 1 minute,https://shopee.sg/Women-Nose-Shaper-Clip-Nose-...
...,...,...,...,...,...
290,POCO F3 5G 8GB+256GB Global Version[1 year war...,$ 599.00,$ 499.00,5 sold,https://shopee.sg/POCO-F3-5G-8GB-256GB-Global-...
291,[Extra 25% Off + Buy 2 Get 15% Off] Tongkat Al...,$ 89.90,$ 49.90,0 sold,https://shopee.sg/-Ali-King-Tongkat-Ali-Cordyc...
292,The North Face Router Backpack,$ 266.00,$ 151.90,0 sold,https://shopee.sg/The-North-Face-Router-Backpa...
293,Levi's Loose Sleeve Trucker Jacket 22773-0003,$ 139.90,$ 59.80,0 sold,https://shopee.sg/Levi's-Loose-Sleeve-Trucker-...


In [6]:
# saving excel file named according to datetime of extraction 
timestr = time.strftime("%Y%m%d-%H%M")
df.to_excel('Shoppee_'+ timestr + '.xlsx')