In [2]:
# import
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
import random
import duckdb
import win32com.client
from pretty_html_table import build_table
from datetime import datetime
import os
import warnings

warnings.filterwarnings("ignore")

# accumulators
start_time = time.time()
df_acc = pd.DataFrame()

# right brands from ushopbd.com
brands_set = set()
urls = [
    'https://ushopbd.com//collections/skin-cleansing',
    'https://ushopbd.com//collections/oral-care',
    'https://ushopbd.com//collections/skin-care',
    'https://ushopbd.com//collections/haircare'
]
for url in urls:
    pg = 1
    while (1):
        # soup
        link = url + '?page=' + str(pg)
        html = requests.get(link, verify=False).text
        soup_init = BeautifulSoup(html, 'lxml')
        soup = soup_init.find("div", attrs={"class": "productList"})

        # brand
        sku = soup.find_all("a", attrs={"class": "grid-view-item__title"})
        brand = [s.get_text().split()[0].replace("'", "") for s in sku]
        for b in brand: brands_set.add(b)
        time.sleep(3)

        # next page if available
        if len(brand) == 0: break
        pg = pg + 1
    print("Fetched brands from: " + url)
print()

# scrape OHSOGO
for b in brands_set:
    # page
    driver = webdriver.Chrome('chromedriver', options=[])
    url = 'https://ohsogo.com/search/' + b
    print("Scraping started on: " + url)
    driver.get(url)
    driver.maximize_window()
    pg = 1

    while (1):
        # scroll till 'Next'-button/end
        SCROLL_PAUSE_TIME = 5
        last_height = driver.execute_script("return document.body.scrollHeight")
        while (1):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            try:
                elem = driver.find_element(By.XPATH, "//button[normalize-space()='Next']")
                break
            except:
                pass
            if new_height == last_height: break
            last_height = new_height

        # soup
        soup_init = BeautifulSoup(driver.page_source, 'html.parser')
        soup = soup_init.find_all("li", attrs={"class": "ProductCard ProductCard_layout_grid"})

        # data
        df = pd.DataFrame()
        df['title'] = [s.find("p", attrs={"class": "ProductCard-Name"}).get_text() for s in soup]
        df['brand_derived'] = [s.find("p", attrs={"class": "ProductCard-Name"}).get_text().split()[0].replace("'", "")
                               for s in soup]
        original_price = []
        for i in range(0, df['title'].shape[0]):
            try:
                original_price.append(
                    float(soup[i].find("del", attrs={"class": "ProductPrice-HighPrice"}).get_text().split()[0]))
            except:
                original_price.append(
                    float(soup[i].find("span", attrs={"class": "ProductPrice-PriceValue"}).get_text().split()[0]))
        df['original_price'] = original_price
        df['discounted_price'] = [
            float(s.find("span", attrs={"class": "ProductPrice-PriceValue"}).get_text().split()[0]) for s in soup]
        df['option'] = [s.find("div", attrs={"class": "ProductCard-Footer"}).get_text().lower() for s in soup]
        df['source'] = "search " + str(b) + " pg " + str(pg)
        df_acc = df_acc.append(df)

        # loop control
        print("Data scraped from page: " + str(pg) + ", SKUs found: " + str(df.shape[0]))
        time.sleep(random.randint(3, 5))
        try:
            elem.send_keys("\n")
        except:
            break
        pg = pg + 1

    # close window
    driver.close()
    print()

# stats
print("Total SKUs found: " + str(df_acc.shape[0]))
elapsed_time = time.time() - start_time
print("Elapsed time to scrape (mins): " + str(round(elapsed_time / 60.00, 2)))
df_acc = df_acc.reset_index(drop=True)

# csv
qry = '''
select 
    title,
    brand_derived, 
    original_price, 
    case when (discounted_price<original_price) then discounted_price else null end discounted_price, 
    case when (discounted_price<original_price) then (original_price-discounted_price)/original_price else null end discount_pct,
    option, 
    source
from df_acc; 
'''
df_acc_csv = duckdb.query(qry).df()
df_acc_csv.to_csv("ohsogo_unilever_skus_data.csv", index=False)

# analyse
qry = '''
select 
    brand_derived brand, 
    count(*) "SKUs enlisted", 
    count(case when option='out of stock' then title else null end) "SKUs out of stock", 
    count(case when option='out of stock' then title else null end)*1.00/count(*) "SKUs out of stock pct", 
    count(case when discounted_price is not null then title else null end) "SKUs giving discount",
    count(case when discounted_price is not null then title else null end)*1.00/count(*) "SKUs giving discount pct",
    avg(discount_pct) "avg. discount pct"
from df_acc_csv
where brand_derived in('Sunsilk', 'Vaseline', 'Tresemme', 'Dove', 'Ponds')
group by 1
order by 2 desc; 
'''
res_df = duckdb.query(qry).df()

# # email
# ol = win32com.client.Dispatch("outlook.application")
# olmailitem = 0x0
# newmail = ol.CreateItem(olmailitem)

# # subject, recipients
# newmail.Subject = 'Scraped & Analysed: OHSOGO.com'
# newmail.To = 'avra.barua@unilever.com'
# newmail.CC = 'mehedi.asif@unilever.com'

# # body
# newmail.HTMLbody = f'''
# Hello Bhaiya,<br><br>
# Please find below an analysis of popular Unilever SKUs available on OHSOGO.<br>
# ''' + build_table(res_df, 'blue_light') + '''
# Also, the complete search results are available in the attachment for your convenience.<br><br>
# Note that, the data was extracted at ''' + time.strftime('%d-%b-%y, %I:%M %p') + '''. This is an auto generated email using smtplib.<br><br>
# Thanks,<br>
# Shithi Maitra<br>
# Asst. Manager, Cust. Service Excellence<br>
# Unilever BD Ltd.<br>
# '''

# # attachment(s)
# attachment = ['ohsogo_unilever_skus_data.csv']
# for atch in attachment:
#     newmail.Attachments.Add(os.getcwd() + '\\' + atch)

# # display, send
# # newmail.Display()
# newmail.Send()

ConnectionError: ('Connection aborted.', ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None))