# Daraz Data Baker

In [370]:
import os
import json


class DarazDataBaker:
    """
        this class will be responsible for robust scraping data from the daraz officilal website, 
        save the data on the disk and 
        prepare the data for easy use 
    """
    def __init__(self, path="./"):
        self.data_instances = {} # holds the data object

        if os.path.exists(path):
            self.path = path
        else:
            print("error! invalid path")
        
        
    def show_data_instances(self):
        """ returns the name of all the data instances """
        return self.data_instances.keys()
    
    
    def run_scrapper(self, query="nuts", num_pages=-1, resume=True, save=True):
        """
            - it is the heard of this class which is responsible for scraping data and storing 
            - into disk
            parameters:
                * query : (str) search query,
                * last_page: (int) scrap at most up to that page, if -1 then scrap next page 
                * resume: (bool) determine whether scrapping should be 
                          resume or completely state from the begining for the given query
                * save: (bool) determine whether the data should be store on the disk or not
        """
        
    

        def get_current_page_number(query):
            data_json = self.__load_page_context(os.path.join(self.path, "page_context.json"), query)
            current_page = data_json[query]["current_page"]
            return current_page
        
        current_page = get_current_page_number(query) if resume else 0
        print(f"resume : {resume}  current page: {current_page}")
        
        last_page = current_page+1 if num_pages == -1 else current_page+num_pages
            
        if current_page >= last_page:
            print("page already scrapped, please change the number\
            of last page greater than current page")
            return 
        
        # scraping the data
     
        scraped_data_file = self.__get_fp(os.path.join(self.path,query+".json"),"r+", default_value="[]")            
        page_context_file = self.__get_fp(os.path.join(self.path,"page_context.json"), "r+")

        # loading old data from disk
        #move file point to the beginning
        scraped_data = json.load(scraped_data_file)
        print(scraped_data)
        scraped_data_file.seek(0)
        
        page_context = json.load(page_context_file)
        scraped_data_file.seek(0)
        
        #-----------helper functions---------------------------
        import requests
        import re
        def quantity_parser(name):
            # normalizing the text
            name = ' '.join(name.split()).lower()
            amount_pattern  = re.compile("([0-9]+ [a-z]+|[0-9]+[a-z]+)")
            find = amount_pattern.search(name)
            amount = find.group(1)

            price_unit = re.compile("([0-9]+)\ ?([a-z]+)")
            find = price_unit.search(amount)
            return {"qty": find.group(1),"qty_unit": find.group(2)}
        
        def name_parser(name):
            # normalizing the text
            name = ' '.join(name.split()).lower()

            hyphen_search = re.compile("-\ ?[0-9]+").search(name)
            hyphen_sub_str = hyphen_search.group() if hyphen_search else ''

            hyphen_index = name.find(hyphen_sub_str) 

            by_index = name.find("by")

            num_search = re.compile("[0-9]+").search(name)
            num_sub_str = num_search.group() if num_search else ''

            num_index = name.find(num_sub_str)
            indexes = []
            for num in [hyphen_index, by_index, num_index]:
                if num != -1 and num != 0:
                    indexes.append(num)

            index = min(indexes) if len(indexes) > 0 else None
            return {"product_name":name[:index].strip()}
        
        def parse_response(response):
            interested_keys = ["name","nid", "image", "price","ratingScore","review", "location", "brandId","brandName","sellerId","sellerName"]
            filtered_product_data = []
            for item in json.loads(response.text)["mods"]["listItems"]:
                filtered_item = {} 
                for key in interested_keys:
                    filtered_item[key] = item[key]
                    if key == "name":
                        name = item[key]
                        filtered_item.update(quantity_parser(name))
                        filtered_item.update(name_parser(name))

                filtered_product_data.append(filtered_item)
            return filtered_product_data
        # ---------------------------------------------------------------------
        
        page_pointer = page_context[query]["current_page"]
        for i in range(current_page, last_page):
            # preparing api
            end_point_api = f"https://www.daraz.com.np/groceries-canned-dry-packaged-food-dried-goods-dried-fruit-nuts/?ajax=true&from=input&page={i}&q={query}"
            #updating page number
            page_pointer += 1
            print(f"scraping page :{ page_pointer }")
            
            # scraping new data
            response = requests.get(end_point_api) 
            print(response.status_code)
            if response.status_code == 200:
                # updating old data
                scraped_data += parse_response(response)
                #saving updated data
                json.dump(scraped_data, scraped_data_file)
                #saving updated page context
                page_context[query]["current_page"] = page_pointer
                json.dump(page_context, page_context_file)
                
        print("scraper ran successfully")
            #         except Exception as e:
            #             print("!exception: ",e)
            #         finally:
        scraped_data_file.close()
        page_context_file.close()
            
    def __load_page_context(self, file_path, query=None):
        import json
        if os.path.isfile(file_path):
            with open(file_path,"r") as f:
                return json.load(f)
        else:
            print(f"{file_path} doesnot exist! creating a new file with initial values")
            data = {
                query:{
                    "current_page":0
                }
            }

            with open(file_path,"w") as f:
                json.dump(data, f)
                print("file created")
            return data

    def __get_fp(self,file_name, mode="r+", default_value=''):
        if not os.path.exists(file_name):
            #create a file with initial value
            with open(file_name,"w") as f:
                f.write(default_value)
                
        return open(file_name,mode)
            
        

In [371]:
engine = DarazDataBaker()

In [372]:
engine.run_scrapper(num_pages=2)

./page_context.json doesnot exist! creating a new file with initial values
file created
resume : True  current page: 0
[]
scraping page :1
200
scraping page :2
200
scraper ran successfully


In [16]:
def sum(x,y):
    return x+y

class Name:
    def __init__(self, x,y):
        self.x = x
        self.y = y    
    def show(self):
        return sum(self.x,self.y)
    
    def get_x(self):
        return self.x

In [17]:
a = Name(1,2)
a.show()

3