In [1]:
# import
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import time
import pandas as pd
import duckdb
import win32com.client
from pretty_html_table import build_table
from datetime import datetime
import os
import warnings

# Silence all library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

# accumulators
# Wall-clock start; the elapsed scraping time is reported from this later on.
start_time = time.time()

# page
# No positional driver path and no list-valued `options`: newer Selenium
# releases reject both. With no arguments the driver binary is resolved
# from PATH (older releases default to the same 'chromedriver' name).
driver = webdriver.Chrome()
url = 'https://www.shwapno.com/SearchResults.aspx?search=unilever'
driver.get(url)
time.sleep(3)  # fixed pause to let the page finish loading

# city (the <select id="state">), then area (the <select id="city">).
# The pauses before and after each selection give the page time to
# populate the dropdown and react to the choice.
for element_id, visible_text in (("state", "Dhaka"), ("city", "Banani")):
    dropdown = driver.find_element(By.XPATH, f'//*[@id="{element_id}"]')
    dd = Select(dropdown)
    time.sleep(3)
    dd.select_by_visible_text(visible_text)
    time.sleep(3)

# submit
elem = driver.find_element(By.XPATH, '//*[@id="btnFindStore"]')
elem.click()
time.sleep(5)  # wait for the store-specific results to load

# scroll
# Keep jumping to the bottom of the page until its height stops growing,
# i.e. until no further products are appended after a scroll.
SCROLL_PAUSE_TIME = 5  # seconds to wait after each scroll for new content
last_height = driver.execute_script("return document.body.scrollHeight")
reached_bottom = False
while not reached_bottom:
    # jump to the current bottom of the document
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # give the page time to load more results
    time.sleep(SCROLL_PAUSE_TIME)
    # an unchanged height means nothing new was loaded
    new_height = driver.execute_script("return document.body.scrollHeight")
    reached_bottom = new_height == last_height
    last_height = new_height

# soup
# Parse the fully scrolled page, then always shut the browser down.
# quit() is used instead of close(): close() only closes the current
# window, while quit() ends the session and stops the driver process.
try:
    soup_init = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    driver.quit()
# One <div class="mob-showright"> per product card. `attrs` must be a dict;
# the previous set literal {"class", "mob-showright"} only worked by accident.
soup = soup_init.find_all("div", attrs={"class": "mob-showright"})

# scrape
def _price_in_label(card, label_class):
    """Return the amount shown in one of a product card's price labels.

    Looks up ``<label class=label_class>`` inside ``card`` and converts the
    text of its ``<span class="sp_amt">`` child to float.

    Returns None when the label or the span is missing, or when the span
    text is not a valid float.
    """
    label = card.find("label", attrs={"class": label_class})
    try:
        return float(label.find("span", attrs={"class": "sp_amt"}).get_text())
    except (AttributeError, ValueError):
        # AttributeError: find() returned None (label or span absent).
        # ValueError: the span text is not numeric.
        return None


def _first_word(title):
    """Return the first whitespace-separated word of ``title`` without
    apostrophes, or None when the title contains no words."""
    words = title.split()
    return words[0].replace("'", "") if words else None


# One row per product card found in `soup`.
df = pd.DataFrame()
df['title'] = [s.find("h4", attrs={"class": "mtb-title"}).get_text() for s in soup]
# The brand is approximated by the first word of the title.
df['brand_derived'] = [_first_word(t) for t in df['title']]
df['quantity'] = [s.find("div", attrs={"class": "mtb-desc"}).get_text() for s in soup]
df['option'] = [
    s.find("a", attrs={"class": "snackbar-msg btn_addqty locationpopup"}).get_text()
    for s in soup
]
# Column names follow the original script: "mtb-mrp" feeds discounted_price
# and "mtb-ofr" feeds original_price; a missing or unparsable price is None.
df['discounted_price'] = [_price_in_label(s, "mtb-mrp") for s in soup]
df['original_price'] = [_price_in_label(s, "mtb-ofr") for s in soup]

# stats
# Report how many product rows were scraped and how long the run took
# (measured from start_time), then renumber the rows from 0.
sku_count = len(df)
print(f"Total SKUs found: {sku_count}")
elapsed_time = time.time() - start_time
elapsed_minutes = round(elapsed_time / 60.00, 2)
print(f"Elapsed time to scrape (mins): {elapsed_minutes}")
df = df.reset_index(drop=True)

# csv
# Re-derive the two price columns from the scraped DataFrame `df` and export
# every row to CSV.
#   - output original_price: discounted_price when
#     original_price < discounted_price, otherwise original_price.
#   - output discounted_price: original_price when discounted_price is not
#     null, otherwise discounted_price (i.e. NULL).
# NOTE(review): both CASE expressions reuse the input column names as output
# aliases; the second CASE presumably reads the source column
# `original_price`, not the alias defined on the line above it -- confirm
# against DuckDB's name-resolution rules.
qry = '''
select 
    title,
    brand_derived, 
    quantity,
    case when original_price<discounted_price then discounted_price else original_price end original_price, 
    case when discounted_price is not null then original_price else discounted_price end discounted_price, 
    option
from df; 
'''
# `from df` in the query refers to the pandas DataFrame named `df` above;
# .df() converts the query result back into a pandas DataFrame.
df_csv = duckdb.query(qry).df()
# Written to the current working directory, without the index column.
df_csv.to_csv("shwapno_unilever_skus_data.csv", index=False)

# analyse
# Per-brand summary computed from `df_csv`, grouped by brand_derived:
#   - "SKUs enlisted": number of rows for the brand
#   - "SKUs giving discount": rows whose discounted_price is not null
#   - "SKUs giving discount pct": the ratio of the two counts (a fraction,
#     not multiplied by 100)
#   - "avg. discount pct": average of (original - discounted) / original;
#     rows without a discounted_price yield NULL and are presumably skipped
#     by avg() -- standard SQL aggregate behaviour, confirm for DuckDB
# Only the 7 brands with the most rows are kept (order by column 2 desc).
qry = '''
select 
    brand_derived brand, 
    count(*) "SKUs enlisted", 
    count(case when discounted_price is not null then title else null end) "SKUs giving discount",
    count(case when discounted_price is not null then title else null end)*1.00/count(*) "SKUs giving discount pct",
    avg((original_price-discounted_price)/original_price) "avg. discount pct"
from df_csv
group by 1
order by 2 desc
limit 7; 
'''
# `from df_csv` refers to the pandas DataFrame named `df_csv`.
res_df = duckdb.query(qry).df()

# # email
# ol = win32com.client.Dispatch("outlook.application")
# olmailitem = 0x0
# newmail = ol.CreateItem(olmailitem)
#
# # subject, recipients
# newmail.Subject = 'Scraped & Analysed: shwapno.com'
# newmail.To = 'avra.barua@unilever.com'
# newmail.CC = 'mehedi.asif@unilever.com'
#
# # body
# newmail.HTMLbody = f'''
# Hello Bhaiya,<br><br>
# Please find below an analysis of popular Unilever SKUs available on shwapno.com.<br>
# ''' + build_table(res_df, 'blue_light') + '''
# Also, the complete search results are available in the attachment for your convenience.<br><br>
# Note that, the data was extracted at ''' + time.strftime('%d-%b-%y, %I:%M %p') + '''. This is an auto generated email using smtplib.<br><br>
# Thanks,<br>
# Shithi Maitra<br>
# Asst. Manager, Cust. Service Excellence<br>
# Unilever BD Ltd.<br>
# '''
#
# # attachment(s)
# attachment = ['shwapno_unilever_skus_data.csv']
# for atch in attachment:
#     newmail.Attachments.Add(os.getcwd() + '\\' + atch)
#
# # display, send
# # newmail.Display()
# newmail.Send()

Total SKUs found: 188
Elapsed time to scrape (mins): 1.15
