In [1]:
# import
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import re
import pandas as pd
import random
import warnings
import duckdb
import win32com.client
from pretty_html_table import build_table
from datetime import datetime
import os

# Suppress library warnings so the per-page progress log stays readable.
warnings.filterwarnings("ignore")

# Seconds to wait after each scroll before re-measuring the page height.
SCROLL_PAUSE_TIME = 5
# Pack-size label: a number (optionally decimal) followed by a unit, e.g. "100ml".
QTY_PATTERN = re.compile(r"\s*([0-9]+(?:\.[0-9]+)?)\s*([a-zA-Z]+)")


def _parse_quantity(label):
    """Split a pack-size label such as "100ml" into ``(100.0, "ml")``.

    Returns ``(None, "")`` when the label is empty or does not start with a
    number followed by a unit, so one unexpected label cannot abort the run.
    """
    found = QTY_PATTERN.match(label)
    if found is None:
        return None, ""
    return float(found.group(1)), found.group(2).lower()


def _parse_price(text):
    """Return ``(original_price, discounted_price)`` from a price holder's text.

    The text is split on the currency marker; the first amount is the original
    price and the second amount, when present, is the discounted price. With a
    single amount both values are equal.

    Raises:
        IndexError: if the text contains no currency marker.
        ValueError: if an amount is not numeric.
    """
    parts = text.split('৳\xa0')
    amounts = [
        float(part.replace('\n', "").replace('\t', "").replace(',', ""))
        for part in parts[1:3]
    ]
    return amounts[0], amounts[-1]


# accumulators
start_time = time.time()
page_frames = []  # one DataFrame per non-empty result page
pg = 0

while True:
    # page
    url = 'https://shop.shajgoj.com/shop/#q=unilever&hPP=21&idx=wp_posts_product&p=' + str(pg) + '&is_v=1'
    # Default arguments resolve chromedriver and use default options; unlike a
    # positional driver path, this is accepted by old and new Selenium releases.
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # scroll to the bottom repeatedly until the page height stops growing
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # wait for the page to load more content
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # soup
        soup_init = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # quit() also shuts down the chromedriver process; close() would only
        # close the window and leave one orphaned driver per page.
        driver.quit()
    soup = soup_init.find_all("div", attrs={"class": "ais-infinite-hits--item ais-hits--item"})

    # title
    title = [s.find("a", attrs={"class": "product_title"}).get_text() for s in soup]

    # quantity
    qty = [s.find("div", attrs={"class": "alg-variation"}).get_text().replace('\n', "") for s in soup]
    parsed_qty = [_parse_quantity(q) for q in qty]
    qty_val = [value for value, _ in parsed_qty]
    qty_unit = [unit for _, unit in parsed_qty]

    # price
    price = [s.find("p", attrs={"class": "alg-hit__priceholder product_price_placeholder"}).get_text() for s in soup]
    parsed_price = [_parse_price(p) for p in price]
    original_price = [original for original, _ in parsed_price]
    discounted_price = [discounted for _, discounted in parsed_price]

    # rating: first integer in the inline style divided by 20
    # (presumably a percentage width mapped onto a 0-5 scale -- TODO confirm)
    rating_styles = [s.find("span", attrs={"class": "alg-rating"})['style'] for s in soup]
    rating = [float(re.findall(r'\d+', style)[0]) / 20 for style in rating_styles]

    # option
    opt = [s.find("div", attrs={"class": "alg-hit__actions"}).get_text() for s in soup]
    opt = [o.replace('\n', "").replace('\t', "") for o in opt]

    # offer
    offer = [s.find("div", attrs={"class": "alg-product-ribbon-container"}).get_text() for s in soup]
    offer = [o.replace('\n', "").replace('\t', "").replace('\xa0', "") for o in offer]

    # scraped data
    df = pd.DataFrame()
    df['title'] = title
    df['brand_derived'] = [t.split()[0].replace("'", "") for t in title]
    df['quantity_value'] = qty_val
    df['quantity_unit'] = qty_unit
    df['original_price'] = original_price
    df['discounted_price'] = discounted_price
    df['offer'] = offer
    df['rating'] = rating
    df['option'] = opt
    df['source_page'] = pg

    # loop control: a page without hits means we are past the last result page
    if df.shape[0] == 0:
        break
    page_frames.append(df)
    print("Data scraped from page: " + str(pg) + ", SKUs found: " + str(df.shape[0]))
    pg = pg + 1

# Concatenate once at the end (DataFrame.append was removed in pandas 2.0).
# If no page had hits, fall back to the last (empty) frame so the expected
# columns still exist for the SQL steps that follow.
df_acc = pd.concat(page_frames) if page_frames else df

# stats: report how many rows were collected and how long the scrape took
print()
print(f"Total SKUs found: {len(df_acc)}")
elapsed_time = time.time() - start_time
elapsed_mins = round(elapsed_time / 60.00, 2)
print(f"Elapsed time to scrape (mins): {elapsed_mins}")
# replace the accumulated index with a fresh 0..n-1 index
df_acc = df_acc.reset_index(drop=True)

# csv
# Tidy the scraped rows with SQL before exporting them:
#   - discounted_price is kept only when it is below original_price, else null
#   - discount_pct is the relative price cut, null when there is no discount
#   - offer is blanked when it contains 'SOLD OUT'
#   - a rating of 0 is turned into null
# DuckDB resolves `df_acc` in the query to the in-scope pandas DataFrame.
qry = '''
select 
    title,
    brand_derived, 
    quantity_value, 
    quantity_unit, 
    original_price, 
    case when (discounted_price<original_price) then discounted_price else null end discounted_price, 
    case when (discounted_price<original_price) then (original_price-discounted_price)/original_price else null end discount_pct,
    case when offer like '%SOLD OUT%' then '' else offer end offer, 
    case when rating=0 then null else rating end rating, 
    option, 
    source_page
from df_acc; 
'''
# materialise the query result as a DataFrame and write it without the index column
df_acc_csv = duckdb.query(qry).df()
df_acc_csv.to_csv("shajgoj_unilever_skus_data.csv", index=False)

# analyse
# Per-brand summary restricted to the 7 brands with the most listed SKUs:
# listing count, out-of-stock count and share (option = 'REQUEST STOCK'),
# SKUs with an offer, SKUs with a discount, average discount and rating.
# Both ORDER BY clauses break ties on the brand name so that the brands kept
# by LIMIT and the row order of the report are reproducible between runs.
qry = '''
select
    brand_derived brand,
    count(*) "SKUs enlisted",
    count(case when option='REQUEST STOCK' then title else null end) "SKUs out of stock",
    count(case when option='REQUEST STOCK' then title else null end)*1.00/count(*) "SKUs out of stock pct",
    count(case when offer!='' then title else null end) "SKUs giving offer",
    count(case when discounted_price is not null then title else null end) "SKUs giving discount",
    avg(discount_pct) "avg. discount pct",
    avg(rating) "avg. rating"
from
    df_acc_csv tbl1

    inner join

    (-- top-7 brands by number of SKUs
    select brand_derived, count(*) skus
    from df_acc_csv
    group by 1
    order by 2 desc, 1
    limit 7
    ) tbl2 using(brand_derived)
group by 1
order by 2 desc, 1;
'''
res_df = duckdb.query(qry).df()

# # email (currently disabled): mails the res_df summary table and the CSV via Outlook (win32com)
# ol = win32com.client.Dispatch("outlook.application")
# olmailitem = 0x0
# newmail = ol.CreateItem(olmailitem)
#
# # subject, recipients
# newmail.Subject = 'Scraped & Analysed: shajgoj.com'
# newmail.To = 'avra.barua@unilever.com'
# newmail.CC = 'mehedi.asif@unilever.com'
#
# # body
# newmail.HTMLbody = f'''
# Hello Bhaiya,<br><br>
# Please find below an analysis of popular Unilever SKUs available on shajgoj.com.<br>
# ''' + build_table(res_df, 'blue_light') + '''
# Also, the complete search results are available in the attachment for your convenience.<br><br>
# Note that, the data was extracted at ''' + time.strftime('%d-%b-%y, %I:%M %p') + '''. This is an auto generated email using smtplib.<br><br>
# Thanks,<br>
# Shithi Maitra<br>
# Asst. Manager, Cust. Service Excellence<br>
# Unilever BD Ltd.<br>
# '''
#
# # attachment(s)
# attachment = ['shajgoj_unilever_skus_data.csv']
# for atch in attachment:
#     newmail.Attachments.Add(os.getcwd() + '\\' + atch)
#
# # display, send
# # newmail.Display()
# newmail.Send()

Data scraped from page: 0, SKUs found: 21
Data scraped from page: 1, SKUs found: 21
Data scraped from page: 2, SKUs found: 21
Data scraped from page: 3, SKUs found: 21
Data scraped from page: 4, SKUs found: 21
Data scraped from page: 5, SKUs found: 21
Data scraped from page: 6, SKUs found: 21
Data scraped from page: 7, SKUs found: 8

Total SKUs found: 155
Elapsed time to scrape (mins): 7.91
