In [1]:
# import
import requests
from bs4 import BeautifulSoup
import lxml
from sympy.parsing.sympy_parser import parse_expr
import pandas as pd
import warnings
import time
import random
import duckdb
import win32com.client
from pretty_html_table import build_table
import os
from datetime import datetime

# NOTE(review): this silences *every* warning for the rest of the session,
# including library deprecation notices. It is presumably here to hide the
# InsecureRequestWarning triggered by verify=False below -- confirm and narrow.
warnings.filterwarnings("ignore")

# accumulators
start_time = time.time()
page_frames = []  # one DataFrame per scraped page; concatenated once after the loop
pg = 1

# Walk the search-result pages (page=1, 2, ...) until a page no longer
# carries usable listing data.
while True:
    # soup
    link = 'https://www.daraz.com.bd/catalog/?from=filter&location=-21&page=' + str(pg) + '&q=unilever'
    # NOTE(review): verify=False disables TLS certificate checking -- confirm
    # this is really required in the environment where the script runs.
    # The timeout keeps a stalled connection from hanging the run forever.
    html = requests.get(link, verify=False, timeout=30).text
    soup = BeautifulSoup(html, 'lxml')

    # list of key-pair vals
    # The listing is embedded in the page text between the '"listItems":' key
    # and the following ',"breadcrumb"' key.
    page_text = str(soup)
    if '"listItems":' not in page_text:
        print("No listing data on page " + str(pg) + ", stopping.")
        break
    try:
        list_of_dic_str = page_text.split('"listItems":', 1)[1].split(',"breadcrumb"', 1)[0]
        # Rewrite the boolean literals so the text parses as Python literals.
        # NOTE(review): this is a blanket text replacement, so it also rewrites
        # "true"/"false" occurring inside product names.
        list_of_dic_str = list_of_dic_str.replace("true", "True")
        list_of_dic_str = list_of_dic_str.replace("false", "False")
        # SECURITY NOTE(review): parse_expr evaluates its input as code (eval)
        # and the input here is untrusted web content. A pure data parser such
        # as json.loads would be safer -- left unchanged pending confirmation
        # that the extracted slice is always valid JSON.
        list_of_dic = parse_expr(list_of_dic_str, evaluate=False)
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print("Could not parse listing data on page " + str(pg) + " (" + type(exc).__name__ + "), stopping.")
        break
    if not list_of_dic:
        # An empty listing would otherwise keep the loop running forever.
        print("Empty listing on page " + str(pg) + ", stopping.")
        break

    # SKUs' data
    title = []
    price = []
    original_price = []
    rating = []
    review = []
    brand = []

    for item in list_of_dic:
        title.append(item['name'])
        # Brand is derived as the first word of the product name, apostrophes removed.
        brand.append(item['name'].split()[0].replace("'", ""))
        price.append(float(item['price']))
        review.append(int(item['review']))
        try:
            original_price.append(float(item['originalPrice']))
        except (KeyError, TypeError, ValueError):
            # No usable original price: fall back to the current price.
            original_price.append(float(item['price']))
        try:
            rating.append(float(item['ratingScore']))
        except (KeyError, TypeError, ValueError):
            rating.append(None)

    df = pd.DataFrame({
        'title': title,
        'brand_derived': brand,
        'discounted_price': price,
        'original_price': original_price,
        'rating': rating,
        'review': review,
    })
    df['source_page'] = pg
    page_frames.append(df)

    # Random 3-5 second pause between page requests.
    wt = random.randint(3, 5)
    print("Data scraped from page: " + str(pg) + ", SKUs found: " + str(df.shape[0]) + ", waiting time (sec): " + str(wt))
    time.sleep(wt)

    pg = pg + 1

# stats
elapsed_time = time.time() - start_time
# DataFrame.append was removed in pandas 2.0; a single concat replaces the
# per-page append, and ignore_index=True yields the same 0..n-1 index that
# reset_index(drop=True) produced.
df_acc = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
print()
print("Total SKUs found: " + str(df_acc.shape[0]))
print("Elapsed time to scrape (mins): " + str(round(elapsed_time / 60.00, 2)))

# csv
# Reshape the scraped data with DuckDB and write it to disk.
# - discounted_price is kept only when it is strictly below original_price,
#   otherwise it becomes NULL. Rows whose original price was missing were given
#   original_price == price by the scraping loop, so they land in the NULL branch.
# - discount_pct = (original_price - discounted_price) / original_price, under
#   the same condition, NULL otherwise.
# The SQL refers to df_acc by name; duckdb.query resolves that to the DataFrame
# bound to the Python variable df_acc. The result then overwrites df_acc and is
# saved as daraz_unilever_skus_data.csv without the index column.
qry = '''
select 
    title, 
    brand_derived, 
    original_price, 
    case 
        when (discounted_price<original_price) then discounted_price
        else null
    end discounted_price, 
    case 
        when (discounted_price<original_price) then (original_price-discounted_price)/original_price
        else null
    end discount_pct, 
    rating, 
    review, 
    source_page
from df_acc; 
'''
df_acc = duckdb.query(qry).df()
df_acc.to_csv("daraz_unilever_skus_data.csv", index=False)

# analysis
# Per-brand summary, restricted to the 7 brands with the most SKUs on
# source_page 1 (subquery tbl2); the inner join keeps those brands' rows from
# every page. Output columns per brand:
#   "SKUs enlisted"       - number of rows
#   "SKUs given discount" - rows whose discounted_price is not NULL
#   "avg. discount pct"   - average of discount_pct
#   "avg. rating"         - average of rating
#   "reviews recorded"    - sum of review, cast to int
# Rows are ordered by "SKUs enlisted" descending. The result is stored in
# res_df, which in the visible code is only referenced by the commented-out
# email section below.
qry = '''
select 
    brand_derived brand, 
    count(*) "SKUs enlisted", 
    count(case when discounted_price is not null then title else null end) "SKUs given discount",
    avg(discount_pct) "avg. discount pct",
    avg(rating) "avg. rating", 
    sum(review)::int "reviews recorded"
from 
    df_acc tbl1 

    inner join 

    (select brand_derived, count(*) skus
    from df_acc 
    where source_page=1
    group by 1 
    order by 2 desc
    limit 7
    ) tbl2 using(brand_derived)
group by 1
order by 2 desc; 
'''
res_df = duckdb.query(qry).df()

# # email
# ol = win32com.client.Dispatch("outlook.application")
# olmailitem = 0x0
# newmail = ol.CreateItem(olmailitem)
#
# # subject, recipients
# newmail.Subject = 'Scraped & Analysed: daraz.com.bd'
# newmail.To = 'avra.barua@unilever.com'
# newmail.CC = 'mehedi.asif@unilever.com'
#
# # body
# newmail.HTMLbody = f'''
# Hello Bhaiya,<br><br>
# Please find below an analysis of popular Unilever SKUs available on daraz.com.bd (filtered for Bangladesh).<br>
# ''' + build_table(res_df, 'blue_light') + '''
# Also, the complete search results are available in the attachment for your convenience.<br><br>
# Note that, the data was extracted at ''' + time.strftime('%d-%b-%y, %I:%M%p') + '''. This is an auto generated email using smtplib.<br><br>
# Thanks,<br>
# Shithi Maitra<br>
# Asst. Manager, Cust. Service Excellence<br>
# Unilever BD Ltd.<br>
# '''
#
# # attachment(s)
# attachment = ['daraz_unilever_skus_data.csv']
# for atch in attachment:
#   newmail.Attachments.Add(os.getcwd() + '\\' + atch)
#
# # display, send
# # newmail.Display()
# newmail.Send()

Data scraped from page: 1, SKUs found: 40, waiting time (sec): 3
Data scraped from page: 2, SKUs found: 40, waiting time (sec): 3
Data scraped from page: 3, SKUs found: 40, waiting time (sec): 3
Data scraped from page: 4, SKUs found: 40, waiting time (sec): 4
Data scraped from page: 5, SKUs found: 40, waiting time (sec): 4
Data scraped from page: 6, SKUs found: 40, waiting time (sec): 4
Data scraped from page: 7, SKUs found: 40, waiting time (sec): 5
Data scraped from page: 8, SKUs found: 40, waiting time (sec): 5
Data scraped from page: 9, SKUs found: 40, waiting time (sec): 3
Data scraped from page: 10, SKUs found: 40, waiting time (sec): 3
Data scraped from page: 11, SKUs found: 40, waiting time (sec): 3
Data scraped from page: 12, SKUs found: 40, waiting time (sec): 4
Data scraped from page: 13, SKUs found: 40, waiting time (sec): 5
Data scraped from page: 14, SKUs found: 40, waiting time (sec): 5
Data scraped from page: 15, SKUs found: 40, waiting time (sec): 3
Data scraped from p