# Sastodeal web scraper

In [2]:
import requests
from bs4 import BeautifulSoup

In [10]:
# Listing page for the dry-fruits category (`p=1` presumably selects the first page — TODO confirm).
URL = "https://www.sastodeal.com/sd-fast/food-essentials/dry-fruits.html?p=1&is_scroll=1"
# requests waits indefinitely unless a timeout is given; bound it so a stalled
# connection raises instead of hanging the kernel.
response = requests.get(URL, timeout=30)

In [11]:
# HTTP status of the listing request; the following cell only reads the body on 200.
response.status_code

200

In [12]:
# Preview the first 500 characters of the page body, or report a failed request.
if response.status_code != 200:
    print("Error")
else:
    response_data = response.text
    print(response_data[:500])

 <!doctype html><html lang="en"><head > <meta charset="utf-8"/>
<meta name="title" content="Dry Fruits | Food Essentials | Daily Needs | Sastodeal"/>
<meta name="description" content="Shop various Dry Fruits from Food Essentials | Daily Needs products only at your local online shopping web-store in Nepal. Visit now!"/>
<meta name="keywords" content="Dry Fruits | Food Essentials | Daily Needs | Sastodeal | Online Shopping in Nepal | Buy Products in Nepal"/>
<meta name="robots" content="INDEX,FOLL


In [13]:
# Parse the downloaded markup with the "html.parser" backend and show the page
# title as a quick sanity check.
html_resp = BeautifulSoup(markup=response.text, features="html.parser")
html_resp.title.get_text()

'Dry Fruits | Food Essentials | Daily Needs | Sastodeal'

In [98]:
def text_parser(html_element):
    """Return ``html_element.text``, or ``None`` when the element is falsy (e.g. a failed lookup)."""
    if not html_element:
        return None
    return html_element.text

In [107]:
# Pagination: the total page count is the text of div#am-page-count; the current
# page number is the second <span> inside the "item current" list entry.
number_of_pages = int(text_parser(html_resp.find("div", attrs={"id": "am-page-count"})))
current_page = int(
    html_resp.find("li", attrs={"class": "item current"}).find_all("span")[1].get_text()
)
(number_of_pages, current_page)

(6, 1)

In [116]:
# Collect every div with class "product details product-item-details"
# (one per product on the listing page).
products = html_resp.find_all("div", attrs={"class": "product details product-item-details"})
product_num = 4  # index of the sample product inspected in the following cells
(
    len(products),
    products[product_num].find("strong", attrs={"class": "product name product-item-name"}),
)

(36,
 <strong class="product name product-item-name"><a class="product-item-link" href="https://www.sastodeal.com/dry-fruits-yellow-kismis-1-kg-gh-dfn-08-21.html">Dry Fruits ( Yellow Kismis) 1 Kg</a></strong>)

In [112]:
# Name and detail-page link of the sample product: both come from the
# <strong class="product name product-item-name"> element and the <a> inside it.
product_name = (
    products[product_num]
    .find("strong", attrs={"class": "product name product-item-name"})
    .get_text()
)
detail_link = (
    products[product_num]
    .find("strong", attrs={"class": "product name product-item-name"})
    .find("a")["href"]
)
dict(name=product_name, url=detail_link)

{'name': 'Dry Fruits ( Yellow Kismis) 1 Kg',
 'url': 'https://www.sastodeal.com/dry-fruits-yellow-kismis-1-kg-gh-dfn-08-21.html'}

In [30]:
# Product price: listed (old) price, final price and the discount between them.
price_box = products[product_num].find("div", {"class": "price-box price-final_price"})

# Amounts are parsed with float(), matching partial_scraper below; int() would
# raise ValueError on a decimal amount such as "744.5".
# Any missing piece (price box, span, attribute) or a non-numeric amount yields None.
try:
    newPrice = float(price_box.find("span", {"data-price-type": "finalPrice"})["data-price-amount"])
except (AttributeError, TypeError, KeyError, ValueError):
    newPrice = None
try:
    oldPrice = float(price_box.find("span", {"data-price-type": "oldPrice"})["data-price-amount"])
except (AttributeError, TypeError, KeyError, ValueError):
    oldPrice = None

# A discount exists only when both prices are known and the old price is non-zero.
if oldPrice and newPrice is not None:
    discount = (oldPrice - newPrice) / oldPrice * 100
else:
    discount = None

# Products without an old price are simply not discounted: fall back to the final
# price and leave the discount as None instead of calling round(None, 3).
{
    "price": oldPrice if oldPrice is not None else newPrice,
    "discount%": round(discount, 3) if discount is not None else None,
}

{'price': 800, 'discount%': 7.0}

In [115]:
# Listing-level record for the sample product plus the page's pagination info.
partial_data = {
    "name": product_name,
    "url": detail_link,
    # A product without an old price is not discounted: report its final price.
    "price": oldPrice if oldPrice is not None else newPrice,
    # round(None, 3) raises TypeError, so only round an actual discount.
    "discount%": round(discount, 3) if discount is not None else None,
    "pagination": {
        "currentPage": current_page,
        "numberOfPages": number_of_pages,
    },
}
partial_data

{'name': 'Dry Fruits ( Yellow Kismis) 1 Kg',
 'url': 'https://www.sastodeal.com/dry-fruits-yellow-kismis-1-kg-gh-dfn-08-21.html',
 'price': 800,
 'discount%': 7.0,
 'pagination': {'currentPage': 1, 'numberOfPages': 6, 'numberOfProducts': 36}}

# Partial Page Scraping

In [117]:
import requests
from bs4 import BeautifulSoup

In [120]:
def text_parser(html_element):
    """Text of ``html_element``; ``None`` if the element is falsy (for instance a lookup that found nothing)."""
    if html_element:
        return html_element.text
    return None

In [140]:
def partial_scraper(URL = "https://www.sastodeal.com/sd-fast/food-essentials/dry-fruits.html?p=1&is_scroll=1", timeout = 30):
    """Scrape one Sastodeal listing page into product summaries plus pagination info.

    Parameters
    ----------
    URL : str
        Listing page to download.
    timeout : float
        Seconds ``requests`` waits for the server before raising; without it a
        stalled connection would block forever.

    Returns
    -------
    dict or int
        ``-1`` when the response status is not 200. Otherwise
        ``{"products": [...], "pagination": {"currentPage", "numberOfPages"}}``,
        where each product is ``{"name", "url", "price", "discount"}``.
        ``price`` is the old price for discounted products and the final price
        otherwise; ``discount`` is a percentage rounded to 3 decimals.
        Any value whose markup is missing or unparsable is ``None``.
    """
    response = requests.get(URL, timeout=timeout)
    if response.status_code != 200:
        return -1

    html_resp = BeautifulSoup(response.text, "html.parser")

    def parse_int(html_element):
        """Integer value of an element's text, or None if the element is missing or not numeric."""
        try:
            return int(text_parser(html_element))
        except (TypeError, ValueError):
            return None

    def parse_amount(price_dom, price_type):
        """Float ``data-price-amount`` of the span with the given ``data-price-type``, or None."""
        try:
            return float(price_dom.find("span", {"data-price-type": price_type})["data-price-amount"])
        except (AttributeError, TypeError, KeyError, ValueError):
            # price box, span or attribute missing, or the amount is not a number
            return None

    def parse_product(product_dom):
        """Turn one "product-item-details" element into a summary dict."""
        name_dom = product_dom.find("strong", {"class": "product name product-item-name"})
        link_dom = name_dom.find("a") if name_dom is not None else None

        price_dom = product_dom.find("div", {"class": "price-box price-final_price"})
        new_price = parse_amount(price_dom, "finalPrice")
        old_price = parse_amount(price_dom, "oldPrice")

        # A discount needs both prices and a non-zero old price.
        if old_price and new_price is not None:
            discount = (old_price - new_price) / old_price * 100
        else:
            discount = None

        return {
            "name": text_parser(name_dom),
            "url": link_dom.get("href") if link_dom is not None else None,
            "price": old_price if discount else new_price,
            "discount": round(discount, 3) if discount else discount,
        }

    # Pagination: the total page count is the text of div#am-page-count; the current
    # page is the second <span> of the "item current" entry. Either may be absent,
    # in which case the value is None rather than an exception.
    number_of_pages = parse_int(html_resp.find("div", {"id": "am-page-count"}))
    current_item = html_resp.find("li", {"class": "item current"})
    current_spans = current_item.find_all("span") if current_item is not None else []
    current_page = parse_int(current_spans[1]) if len(current_spans) > 1 else None

    raw_products = html_resp.find_all("div", {"class": "product details product-item-details"})
    products = [parse_product(product_dom) for product_dom in raw_products]
    return {
        "products": products,
        "pagination": {
            "currentPage": current_page,
            "numberOfPages": number_of_pages,
        },
    }
        


In [141]:
# Run the scraper against its default listing URL and display the result.
partial_scraper()

{'products': [{'name': 'Combo Dry Fruits 4 Kg',
   'url': 'https://www.sastodeal.com/combo-dry-fruits-4-kg-dryfruits-sd-002.html',
   'price': 5800.0,
   'discount': None},
  {'name': 'Wallnut ( Without Shell) 1 Kg (Free 100Gm Almond)',
   'url': 'https://www.sastodeal.com/wallnut-without-shell-1-kg-free-100gm-almond-gh-dfn-08-05.html',
   'price': 1800.0,
   'discount': 7.0},
  {'name': 'Mato Almond 200 gm',
   'url': 'https://www.sastodeal.com/mato-almond-200-gm-mato-almond-nut-200gm.html',
   'price': 425.0,
   'discount': None},
  {'name': 'Almond 1 Kg (Free 100Gm Almond)',
   'url': 'https://www.sastodeal.com/almond-1-kg-free-100gm-almond-gh-dfn-08-04.html',
   'price': 1600.0,
   'discount': 2.0},
  {'name': 'Dry Fruits ( Yellow Kismis) 1 Kg',
   'url': 'https://www.sastodeal.com/dry-fruits-yellow-kismis-1-kg-gh-dfn-08-21.html',
   'price': 800.0,
   'discount': 7.0},
  {'name': 'Mato Pistacho (Pista) - 500 gm',
   'url': 'https://www.sastodeal.com/mato-pistacho-pista-500-gm-mato

# Detail Page Scraping

In [61]:
URL = detail_link
# Alternative detail pages for trying the parser on other products:
# URL = "https://www.sastodeal.com/redmi-10-prime-2022-4-64-gb-tdl-rdm-102022.html"
# URL = "https://www.sastodeal.com/britannia-digestive-500gm-sd-gurjbis-002.html"
# Bound the wait so a stalled connection raises instead of hanging the kernel.
response = requests.get(URL, timeout=30)
if response.status_code == 200:
    # NOTE: `response` is rebound from the HTTP response to the parsed document;
    # the next cell calls response.find(...) on it.
    response = BeautifulSoup(response.text,"html.parser")
    print(response.title.text)
else:
    print("Error")

Dry Fruits ( Yellow Kismis) 1 Kg | Dry Fruits | Sastodeal


In [109]:
# Detail-page record. Every field starts as None and is filled in below where the
# page provides it; "name", "url", "price", "discount", "review" and "rating"
# are not populated by this cell.
data = {
    "name": None,
    "description": None,
    "url": None,
    "additionalProperty":{
        "typeOfProduct":None,
        "nutritionalFacts":None,
        "storingRecommendation":None,
        "detail":None,
        "countryOfOrigin":None
        },
    "offers":{
        "price":None,
        "discount":None,
        "availability":None,
        "warranty":None
    },
    "brand":None,
    "seller":None,
    "sellerUrl":None,
    "image":None,
    "location":None,
    "review":None,
    "rating":None,
    }

# First div.row of the page; everything except the main image is looked up inside it.
detail_info_raw = response.find("div",{"class":"row"})

# vendor info
vendor_shipping_raw = detail_info_raw.find("div",{"class":"ratingsOut estimateRate"})

vendor_raw = vendor_shipping_raw.find("a",{"id":"profileconnect","class":"shoptitle"})
data["seller"] = vendor_raw.text
data["sellerUrl"] = vendor_raw["href"]

# The second <p> of the vendor block is expected to read "Shipping from: <place>";
# keep only the place, or None when the text has a different shape.
SHIPPING_PREFIX = "Shipping from:"
shipping_text = vendor_shipping_raw.find_all("p")[1].text
data["location"] = (
    shipping_text[len(SHIPPING_PREFIX):].strip()
    if shipping_text.startswith(SHIPPING_PREFIX)
    else None
)

# stock availability
data["offers"]["availability"] = (
    detail_info_raw.find("div",{"class":"product-info-price"})
    .find("span", {"class":"stockqty"})
    .find("span")
    .text
)

data["additionalProperty"]["detail"] = detail_info_raw.find("div",{"class":"product attribute overview"}).text

# description (optional on the page)
try:
    data["description"] = detail_info_raw.find("div",{"class":"product attribute description"}).find("p").text
except AttributeError:
    # The description block or its <p> is absent, so a lookup returned None.
    data["description"] = None

# additional attributes: each value is a <span> tagged with a data-th label;
# text_parser turns a missing span into None.
additional_attribute_raw = detail_info_raw.find("div",{"id":"product-attribute-specs-table","class":"data table additional-attributes"})
data["brand"] = text_parser(additional_attribute_raw.find("span",{"data-th":"Brand"}))
for property_key, label in (
    ("typeOfProduct", "Type Of Product"),
    ("nutritionalFacts", "Nutritional Facts"),
    ("storingRecommendation", "Storing Recommendation"),
    ("countryOfOrigin", "Country Of Origin"),
):
    data["additionalProperty"][property_key] = text_parser(additional_attribute_raw.find("span",{"data-th":label}))
data["offers"]["warranty"] = text_parser(additional_attribute_raw.find("span",{"data-th":"Warranty"}))

data["image"] = response.find("img",{"alt":"main product photo"})["src"]
data

{'name': None,
 'description': None,
 'url': None,
 'additionalProperty': {'typeOfProduct': 'Yellow Kismis',
  'nutritionalFacts': 'Rich in minerals, proteins, fibre and vitamins',
  'storingRecommendation': 'Store at a room temperature of 70°F',
  'detail': '  Delivery inside Kathmandu Valley only.',
  'countryOfOrigin': 'India'},
 'offers': {'price': None,
  'discount': None,
  'availability': 'In Stock',
  'warranty': None},
 'brand': 'Generic',
 'seller': 'GGOrganicStore',
 'sellerUrl': 'https://www.sastodeal.com/marketplace/seller/profile/shop/GGOrganicStore',
 'image': 'https://cdn.sastodeal.com/catalog/product/12935/GH-DFN-08-21/22.jpg',
 'location': 'Kathmandu',
 'review': None,
 'rating': None,
 'pagination': {'currentPage': 1, 'numberOfPages': 6}}

In [76]:
from collections import defaultdict
def remove_none(data):
    """Return a copy of ``data`` without its ``None``-valued keys, recursively.

    Only dictionaries are traversed; any other value (lists included) is
    returned unchanged. Falsy but meaningful values such as ``0``, ``0.0``,
    ``False`` and ``""`` are kept, because only ``None`` marks a missing field.
    """
    if not isinstance(data, dict):
        return data
    return {
        key: remove_none(value)
        for key, value in data.items()
        if value is not None
    }
    
#     return {k:v for k,v in data.items() if v }

In [77]:
# Hand-made check: None appears at the top level ("f") and inside a nested dict ("c").
d = {
    "a":["1,2"],
    "b" : {"c": None,"D":1},
    "f": None,
    "g": 'a'
}
remove_none(d)

{'a': ['1,2'], 'b': {'D': 1}, 'g': 'a'}

In [78]:
# Drop the fields of the scraped detail record that are still None and display what is left.
data_clean = remove_none(data)
data_clean

{'additionalProperty': {'typeOfProduct': 'Yellow Kismis',
  'nutritionalFacts': 'Rich in minerals, proteins, fibre and vitamins',
  'storingRecommendation': 'Store at a room temperature of 70°F',
  'detail': '  Delivery inside Kathmandu Valley only.',
  'countryOfOrigin': 'India'},
 'offers': {'availability': 'In Stock'},
 'brand': 'Generic',
 'seller': 'GGOrganicStore',
 'sellerUrl': 'https://www.sastodeal.com/marketplace/seller/profile/shop/GGOrganicStore',
 'location': 'Kathmandu'}

In [67]:
# fields in sasto deal
sasto_deal_fields = [
     'vendor', 'vendor link', 'shipping_from',
     'stock_availability', 'detail', 'raw detail',
     'description', 'brand', 'type of product',
     'nutritional facts', 'storing recommendation',
     'warranty', "product_name", "detail_link",
     "old price", "new price"
    ]
# daraz fields
daraz_fields = [
    "productUrl","name","nid", "image",
    "price","ratingScore","review", "location",
    "brandId","brandName","sellerId","sellerName"
]

# Union of the fields of both sasto deal and daraz

# schema for product type nuts
union_fields = {
    "products":[{"name": None,
    "description": None,
    "url": None,
    "additionalProperty":{
        "typeOfProduct":None,
        "nutritionalFacts":None,
        "storingRecommendation":None,
        "detail":None,
        # scraped from the detail page's attribute table, so it belongs in the union
        "countryOfOrigin":None
        },
    "offers":{
        "price":None,
        "discount":None,
        "availability":None,
        "warranty":None
    },
    "brand":None,
    "seller":{
        "name":None,
        "url":None,
        "isNew":None,
        "positiveRating":None,
        "rateLevel":None,
        "ID":None #it can be derived from URL
    },
    "image":None,
    "location":None,
    "review":None,
    "rating":None
    }],
    "pagination":{
        "currentPage":None,
        "numberOfPages":None
        }
    }