In [None]:
## Shajgoj

# import
import pandas as pd
import duckdb
from selenium import webdriver
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from google.oauth2 import service_account
import time

# time
start_time = time.time()

# credentials
SERVICE_ACCOUNT_FILE = 'read-write-to-gsheet-apis-1-04f16c652b1e.json'
SAMPLE_SPREADSHEET_ID = '1gkLRp59RyRw4UFds0-nNQhhWOaS4VFxtJ_Hgwg2x2A0'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# API
def sheet_api():
    creds = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    service = build('sheets', 'v4', credentials=creds)
    sheet = service.spreadsheets()
    return sheet

# open window
driver = webdriver.Chrome('chromedriver', options=[])
driver.maximize_window()

# url
url = "https://shop.shajgoj.com/unilever-bangladesh/"
driver.get(url)
time.sleep(10)

# soup
soup = BeautifulSoup(driver.page_source, 'html.parser')
soup = soup.find_all("div", attrs={"class": "product_page shajgoj_upsell"})

# scrape
skus = []
quants = []
links = []
descriptions = []
for s in soup:
    # sku
    try: val = s.find("div", attrs={"class": "upsell_name"}).get_text()
    except: val = None
    skus.append(val)
    # quantity
    try: val = s.find("div", attrs={"class": "upsell_weight"}).get_text()[0:-3].replace('(', '').replace(')', '')
    except: val = None
    quants.append(val)
    # link
    val = s.find("a", attrs={"class": "shaj_freq ga_link_open upsell_product"})["href"]
    links.append(val)
# description
sku_count = len(skus)
for i in range(0, sku_count):
    print("Describing SKU: " + str(i+1) + ", of " + str(sku_count))
    driver.get(links[i])
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    try: val = soup.find("div", attrs={"class": "woocommerce-product-details__short-description"}).get_text().replace('\n', ' ')
    except: val = None
    descriptions.append(val)
    
# close window
driver.close()

# accumulate
df = pd.DataFrame()
df['sku'] = [s + ' ' + q for s, q in zip(skus, quants)]
df['description'] = descriptions
df['if_described'] = [0 if d is None else 1 for d in descriptions]
df['platform'] = 'Shajgoj'
df['report_time'] = time.strftime('%d-%b-%y, %I:%M %p')

# call API
sheet = sheet_api()
# extract
values = sheet.values().get(spreadsheetId=SAMPLE_SPREADSHEET_ID, range='Descriptions!A1:E').execute().get('values', [])
df_acc_rd = pd.DataFrame(values[1:] , columns = values[0])
# transform
qry = '''select * from df_acc_rd where platform!=''' + "'" + 'Shajgoj' + "'" + ''' union all select * from df'''
df_acc_wrt = duckdb.query(qry).df().fillna('')
# load
sheet.values().clear(spreadsheetId=SAMPLE_SPREADSHEET_ID, range='Descriptions').execute()
sheet.values().update(spreadsheetId=SAMPLE_SPREADSHEET_ID, range="Descriptions!A1", valueInputOption='USER_ENTERED', body={'values': [df_acc_wrt.columns.values.tolist()] + df_acc_wrt.values.tolist()}).execute()

# stats
display(df.head(5))
print("Total SKUs found: " + str(sku_count))
elapsed_time = time.time() - start_time
print("Elapsed time to report (mins): " + str(round(elapsed_time / 60.00, 2)))


In [None]:
## Chaldal

# import
import pandas as pd
import duckdb
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from google.oauth2 import service_account
import time

# accumulators
start_time = time.time()
skus = []
quants = []
descs = []
report_times = []

# credentials
SERVICE_ACCOUNT_FILE = 'read-write-to-gsheet-apis-1-04f16c652b1e.json'
SAMPLE_SPREADSHEET_ID = '1gkLRp59RyRw4UFds0-nNQhhWOaS4VFxtJ_Hgwg2x2A0'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# API
def sheet_api():
    creds = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    service = build('sheets', 'v4', credentials=creds)
    sheet = service.spreadsheets()
    return sheet

# open window
driver = webdriver.Chrome('chromedriver', options=[])
driver.maximize_window()

# url
url = "https://www.chaldal.com/Unilever"
driver.get(url)

# scroll
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height: break
    last_height = new_height

# soup
soup = BeautifulSoup(driver.page_source, 'html.parser')
soup = soup.find_all("div", attrs={"class": "product"})

# SKU
for s in soup:
    # basepack
    try: val = s.find("div", attrs={"class": "name"}).get_text()
    except: val = None
    skus.append(val)
    # quantity
    try: val = s.find("div", attrs={"class": "subText"}).get_text()
    except: val = None
    quants.append(val)
    
# description
sku_count = len(skus)
moves = driver.find_elements(By.CLASS_NAME, "imageWrapper")
for i in range(0, sku_count): 
    print("Describing SKU: " + str(i+1) + ", of " + str(sku_count))
    descs.append("ERROR")
    try:
        # move
        elem = moves[i]
        ActionChains(driver).move_to_element(elem).perform()
        # details
        elem = driver.find_element(By.XPATH, '//*[@id="page"]/div/div[6]/section/div/div/div/div/section/div[2]/div/div['+str(i+1)+']/div/div/div[5]/span/a')
        elem.click()
        # content
        elem = driver.find_element(By.CLASS_NAME, "details")
        descs[i] = elem.text.replace("\n", " ")
        # close
        elem = driver.find_element(By.CLASS_NAME, "close")
        elem.click()
    except: pass
    report_times.append(time.strftime('%d-%b-%y, %I:%M %p'))

# accumulate
df = pd.DataFrame()
df['sku'] = [str(s) + ' ' + str(q) for s, q in zip(skus, quants)]
df['description'] = descs
df['if_described'] = [0 if d == '' else 1 for d in descs]
df['platform'] = 'Chaldal'
df['report_time'] = report_times

# close window
driver.close()

# call API
sheet = sheet_api()
# extract
values = sheet.values().get(spreadsheetId=SAMPLE_SPREADSHEET_ID, range='Descriptions!A1:E').execute().get('values', [])
df_acc_rd = pd.DataFrame(values[1:] , columns = values[0])
# transform
qry = '''select * from df_acc_rd where platform!=''' + "'" + 'Chaldal' + "'" + ''' union all select * from df'''
df_acc_wrt = duckdb.query(qry).df().fillna('')
# load
sheet.values().clear(spreadsheetId=SAMPLE_SPREADSHEET_ID, range='Descriptions').execute()
sheet.values().update(spreadsheetId=SAMPLE_SPREADSHEET_ID, range="Descriptions!A1", valueInputOption='USER_ENTERED', body={'values': [df_acc_wrt.columns.values.tolist()] + df_acc_wrt.values.tolist()}).execute()

# stats
display(df.head(5))
print("Total SKUs found: " + str(sku_count))
elapsed_time = time.time() - start_time
print("Elapsed time to report (mins): " + str(round(elapsed_time / 60.00, 2)))


In [1]:
# Pandamart

# import
import pandas as pd
import duckdb
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from google.oauth2 import service_account
from random import randint
import time

# accumulators
start_time = time.time()
sku = []
desc = []
report_time = []

# credentials
SERVICE_ACCOUNT_FILE = 'read-write-to-gsheet-apis-1-04f16c652b1e.json'
SAMPLE_SPREADSHEET_ID = '1gkLRp59RyRw4UFds0-nNQhhWOaS4VFxtJ_Hgwg2x2A0'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# API
def sheet_api():
    creds = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    service = build('sheets', 'v4', credentials=creds)
    sheet = service.spreadsheets()
    return sheet

# SKUs
result = sheet_api().values().get(spreadsheetId=SAMPLE_SPREADSHEET_ID, range='Pandamart OLA').execute()
values = result.get('values', [])
desc_df = pd.DataFrame(values[1:], columns = values[0])
desc_df = duckdb.query('''
select sku, max(site) site 
from desc_df 
where ola_status like '%online%' 
group by 1
order by 2
''').df()
skus = desc_df['sku'].tolist()
sites = desc_df['site'].tolist()

# urls
urls = [
    'https://www.foodpanda.com.bd/darkstore/w2lx/pandamart-gulshan-w2lx',
    'https://www.foodpanda.com.bd/darkstore/h5rj/pandamart-bashundhara',
    'https://www.foodpanda.com.bd/darkstore/ta7z/pandamart-dhanmondi',
    'https://www.foodpanda.com.bd/darkstore/n7ph/pandamart-uttara',
    'https://www.foodpanda.com.bd/darkstore/v1ts/pandamart-mogbazar',
    'https://www.foodpanda.com.bd/darkstore/q4hz/pandamart-sylhet-02',
    'https://www.foodpanda.com.bd/darkstore/a2er/pandamart-khulna',
    'https://www.foodpanda.com.bd/darkstore/w2nv/pandamart-chittagong-1'
]

# open window
def get_window():
    driver = webdriver.Chrome('chromedriver', options=[])
    driver.implicitly_wait(5)
    driver.maximize_window()
    return driver

# initialize
sku_count = len(skus)
if_window_open = 0
site = 'loc'

# scrape
for i in range(0, sku_count):
    if site != sites[i]: 
        site = sites[i]
        # link
        for url in urls: 
            if site in url: 
                # close window
                if if_window_open == 1: 
                    driver.close()
                    time.sleep(randint(80, 100))
                # open window
                driver = get_window()
                driver.get(url + "/search?q=unilever")
                if_window_open = 1
                print("Fetching from: " + url)
                # cross
                elem = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div/button')
                elem.click()   
                time.sleep(1)
    # search
    print("Describing SKU: " + str(i+1) + ", of " + str(sku_count))
    elem = driver.find_element(By.XPATH, '//*[@id="groceries-menu-react-root"]/div/div[2]/section/div[2]/div/div/div/div/div[1]/input')
    elem.send_keys(skus[i] + "\n")
    time.sleep(2)
    # soup
    soup_init = BeautifulSoup(driver.page_source, 'html.parser')
    soup = soup_init.find_all('div', attrs={'class', 'box-flex product-card-attributes'})
    # sku
    try: val = soup[0].find('p', attrs={'class', 'product-card-name'}).get_text()
    except: continue
    sku.append(val)
    # details
    desc.append('ERROR')
    try: 
        # fetch
        elem = driver.find_element(By.CLASS_NAME, "groceries-image-wrapper")
        elem.click()
        time.sleep(1)
        elem = driver.find_element(By.CLASS_NAME, "info-description")
        desc[len(desc) - 1] = elem.text.replace('\n', ' ')
        # close
        elem = driver.find_element(By.XPATH, "/html/body/div[3]/div[2]/div/button")
        elem.click()
    except: print('ERROR\n')
    # record
    report_time.append(time.strftime('%d-%b-%y, %I:%M %p'))
    
# close window
driver.close()

# accumulate
df = pd.DataFrame()
df['sku'] = sku
df['description'] = desc
df['if_described'] = [0 if d == '' else 1 for d in desc]
df['platform'] = 'Pandamart'
df['report_time'] = report_time

# call API
sheet = sheet_api()
# extract
values = sheet.values().get(spreadsheetId=SAMPLE_SPREADSHEET_ID, range='Descriptions!A1:E').execute().get('values', [])
df_acc_rd = pd.DataFrame(values[1:] , columns = values[0])
# transform
qry = '''select * from df_acc_rd where platform!=''' + "'" + 'Pandamart' + "'" + ''' union all select * from df'''
df_acc_wrt = duckdb.query(qry).df().fillna('')
# load
sheet.values().clear(spreadsheetId=SAMPLE_SPREADSHEET_ID, range='Descriptions').execute()
sheet.values().update(spreadsheetId=SAMPLE_SPREADSHEET_ID, range="Descriptions!A1", valueInputOption='USER_ENTERED', body={'values': [df_acc_wrt.columns.values.tolist()] + df_acc_wrt.values.tolist()}).execute()

# stats
display(df.head(5))
print("Total SKUs found: " + str(df.shape[0]))
elapsed_time = time.time() - start_time
print("Elapsed time to report (mins): " + str(round(elapsed_time / 60.00, 2)))


Fetching from: https://www.foodpanda.com.bd/darkstore/h5rj/pandamart-bashundhara
Describing SKU: 1, of 211
ERROR

Describing SKU: 2, of 211
ERROR

Describing SKU: 3, of 211
ERROR

Describing SKU: 4, of 211
Describing SKU: 5, of 211
ERROR

Describing SKU: 6, of 211
ERROR

Fetching from: https://www.foodpanda.com.bd/darkstore/w2nv/pandamart-chittagong-1
Describing SKU: 7, of 211
Describing SKU: 8, of 211
Describing SKU: 9, of 211
Describing SKU: 10, of 211
Describing SKU: 11, of 211
Describing SKU: 12, of 211
Describing SKU: 13, of 211
ERROR

Describing SKU: 14, of 211
Describing SKU: 15, of 211
Fetching from: https://www.foodpanda.com.bd/darkstore/ta7z/pandamart-dhanmondi
Describing SKU: 16, of 211
Describing SKU: 17, of 211
Describing SKU: 18, of 211
Fetching from: https://www.foodpanda.com.bd/darkstore/w2lx/pandamart-gulshan-w2lx
Describing SKU: 19, of 211
Describing SKU: 20, of 211
Fetching from: https://www.foodpanda.com.bd/darkstore/a2er/pandamart-khulna
Describing SKU: 21, of 211


Unnamed: 0,sku,description,if_described,platform,report_time
0,Dove Detox Nourishment Shampoo 450ml,ERROR,1,Pandamart,"08-Jul-23, 04:16 PM"
1,Ponds Facial Foam - Bright Beauty 100g,ERROR,1,Pandamart,"08-Jul-23, 04:16 PM"
2,Dove Shampoo Hair Fall Rescue Green 480ml,ERROR,1,Pandamart,"08-Jul-23, 04:16 PM"
3,Sunsilk Shampoo Hijab Anti-Breakage 350 ml,ERROR,1,Pandamart,"08-Jul-23, 04:16 PM"
4,Vaseline Healthy Bright Lotion 400ml,ERROR,1,Pandamart,"08-Jul-23, 04:16 PM"


Total SKUs found: 162
Elapsed time to report (mins): 33.47
