<a id='auct_top'>

# Scrape results of auction
Monthly auction results are published on https://verkoop.domeinenrz.nl. This notebook scrapes the results from the DRZ website, parses the text and stores it in a dataframe.

1. <a href="#auct_dl_results">Download results</a>  
    This will read all raw text from pages.  
2. <a href="#auct_basic_parse">Basic parsing</a>  
    Raw text is parsed for the first time. Some basic elements are stored in a pandas.DataFrame (price, image urls, title, ..)
3. <a href="#auct_regex">Regex parsing</a>   
    Do some more sophisticated parsing by using regular expressions.  
4. <a href="#auct_save">Save to disk</a>

- - - 

### Read settings

In [1]:
import drz_config

# Read the settings file once and expose the values used throughout the notebook.
cfg = drz_config.read_config()
DATE, VERBOSE, OPBOD, URL, EXTEND_URL, SKIPSAVE = (
    cfg[key] for key in ('DATE', 'VERBOSE', 'OPBOD', 'URL', 'EXTEND_URL', 'SKIPSAVE')
)
month_counter = DATE[5:8]  # mm

# Show the complete configuration when verbose, otherwise only the url.
if not VERBOSE > 0:
    print(URL)
else:
    display(cfg)



{'settings_fn': '../code/assets/drz-auction-settings.ini',
 'DATE': '2021-05',
 'VERBOSE': 1,
 'OPBOD': False,
 'URL': 'http://verkoop.domeinenrz.nl/verkoop_bij_inschrijving_2021-0005',
 'EXTEND_URL': False,
 'CLOSEDDATA': True,
 'closed_data_fields': '*',
 'SKIPSAVE': False}

### Import modules

In [2]:
import pandas as pd
import re
import os

# needed for as of feb '18 url format
import locale
# Dutch month/day names are needed; only a failing locale lookup triggers the fallback.
try:
    locale.setlocale(locale.LC_TIME, 'nl_NL')
except locale.Error:
    # Retry with the UTF-8 spelling of the same locale. If this fails too the
    # locale.Error is deliberately left to propagate.
    locale.setlocale(locale.LC_TIME, 'nl_NL.utf8')

# virtualenv: report the "(name) " prefix of the shell prompt, if there is one.
# The pipe is closed explicitly instead of being left to the garbage collector.
with os.popen('echo $PS1') as prompt_pipe:
    M = re.match(r'\((.*?)\) ', prompt_pipe.read())
if M is not None:
    print(f'Virtual environment: {M[1]}')
else:
    print('Virtual environment not activated')

Virtual environment: py38-satdatsci


In [3]:
# The month in the settings file has to be the current month.
now = pd.to_datetime('now').strftime('%Y-%m')
if now == DATE:
    del now
else:
    raise ValueError(f'''
    Settings file has date set at [{DATE}] but expected [{now}]. Has <{cfg['settings_fn']}> file been updated?
    With older auctions it sometimes works to add [url_add_veilingen=True]
    ''')

# Display today's date next to the first day of the configured month.
(
    pd.to_datetime('now').strftime('%A %d %B'),
    pd.to_datetime(DATE, format='%Y-%m').strftime('%A %d %B'),
)


('vrijdag 07 mei', 'zaterdag 01 mei')

In [4]:
## If auction was in past. Url needs to be extended by setting `add_veilingen = True`:
# add_veilingen = True

### Functions

In [5]:
def get_kavel_url(OPBOD, base_url, add_veilingen, lot_id):

    '''
    Create the url of the page of a single lot.

    Parameters
    ----------
    OPBOD : bool
        True for an "opbod" auction. Adds ``status=both`` to the query.
        When False an empty key/value pair is added, which yields the
        ``=&`` fragment the site expects.
    base_url : str
        Url of the auction, e.g. ``.../verkoop_bij_inschrijving_2021-0005``.
    add_veilingen : bool
        When True the auction id (``yyyy-nnnn``) is extracted from
        `base_url` and added as the ``veilingen`` field.
    lot_id : str
        Lot id, passed in the ``meerfotos`` field.

    Returns
    -------
    str
        `base_url` followed by ``?`` and the url-encoded query.
    '''

    # Import the submodule explicitly: a plain `import urllib` does not
    # guarantee that `urllib.parse` is loaded.
    from urllib.parse import urlencode

    # Fields are collected in a dict that is passed to urlencode (order matters)
    urldata = {}
    if not OPBOD:
        # to create '=&'. This might be a bug in the site
        urldata[''] = ''

    # Add auction id
    if add_veilingen:
        # get date from the url that was passed in
        date_string = re.findall(r'_([0-9]{4}-[0-9]{4})', base_url)
        urldata['veilingen'] = ''.join(date_string)

    # Status is specific for "opbod"
    if OPBOD:
        urldata['status'] = 'both' # or "closed"

    # Add lot number
    urldata['meerfotos'] = lot_id

    # Generate string by using urldata
    return base_url + '?' + urlencode(urldata)

# Example: url of lot 1800 of the configured month (lot id looks like Kyy00mm1800)
example_url = URL
example_lot_id = 'K{}00{}1800'.format(DATE[2:4], DATE[-2:])
get_kavel_url(OPBOD, example_url, False, example_lot_id)


'http://verkoop.domeinenrz.nl/verkoop_bij_inschrijving_2021-0005?=&meerfotos=K2100051800'

In [6]:
def gettree(kavel_url, disp=False):

    '''
    Download a page and return it as an lxml html tree.

    The request is repeated until the server answers with status 200.
    Progress of the retries is printed. From the 51st attempt on an
    increasing pause is inserted between attempts.

    Parameters
    ----------
    kavel_url : str
        Url to download.
    disp : bool, optional
        Print the response object and the attempt counter of every request.

    Returns
    -------
    lxml.html.HtmlElement
        Root of the parsed page.

    Raises
    ------
    Exception
        When more than 100 attempts failed.
    '''

    import codecs
    import re
    import time  # needed for the pause between late retries

    import requests
    from lxml import html

    # Request page, try several times
    attempt = 0
    while True:
        attempt += 1

        try:
            page = requests.get(kavel_url)
        except requests.RequestException as err:
            # Connection problems and the like: remember why and retry.
            failure = err
        else:
            if disp:
                print(page, attempt)
            if page.status_code == 200:
                break
            failure = f'status code {page.status_code}'

        if attempt == 1:
            print('retry', end=',')
        elif attempt > 100:
            raise Exception(f'Retried {attempt} times, but failed ({failure})')
        else:
            if attempt > 50:
                # Add extra pause after many tries
                time.sleep(attempt - 50)
            print(f'{attempt}', end='x')

    # Find encoding in header. Fall back on detection by requests when the
    # header does not name a charset.
    content_type = page.headers.get('Content-type', '')
    charset = re.search(r'charset=([^\s;]+)', content_type, flags=re.IGNORECASE)
    if charset:
        encoding = charset.group(1).strip('"\'')
    else:
        encoding = page.apparent_encoding or 'utf-8'

    # and convert to unicode
    htmlstring = codecs.decode(page.content, encoding)

    # Convert string to tree object
    return html.fromstring(htmlstring)


# Example: download the page of the example lot and show the request progress
gettree(get_kavel_url(False, example_url, False, example_lot_id), disp=True)

<Response [200]> 1


<Element html at 0x7f434f2849f0>

In [7]:
class Lot:

    '''
    Page of a single lot, wrapped around its parsed html tree.

    Every ``get_*`` method reads one piece of information from the tree and
    stores it on the instance in an attribute with a trailing underscore
    (``title_``, ``price_``, ``draw_``, ``date_``, ``lot_index_``,
    ``nextlot_``, ``images_``, ``text_``).

    Attributes set by the constructor
    ---------------------------------
    tree : object with an ``xpath(path)`` method (e.g. an lxml element)
    OPBOD : bool, selects the way the price is looked up
    disp : bool, print intermediate values
    has_result : -1 when the page says 'Niets gevonden.', True when other
        bold text was found, False when no bold text was found at all.
    '''

    def __init__(self, tree, OPBOD, disp=False):
        self.tree = tree
        self.OPBOD = OPBOD
        self.disp = disp

        # Has content? Price should be bold.
        # if nothing is bold. Page may exist but results are not in yet.
        paths = [
            '//*[@id="content"]/div[1]/div[1]/strong/text()',
            '//*[@id="content"]/div[1]/div[1]/b/text()',
            '//*[@id="content"]/div[1]/b/text()',
        ]
        contents = [self.tree.xpath(path) for path in paths]
        content = [c[0] for c in contents if len(c) > 0]
        if len(content) > 0:
            if content[0] == 'Niets gevonden.':
                self.has_result = -1
            else:
                self.has_result = True
        else:
            self.has_result = False

    def __str__(self):
        '''
        One line summary: page title followed by every field read so far.
        '''
        out = list(self.tree.xpath('/html/head/title/text()'))
        if hasattr(self, 'title_'):
            out += [self.title_]
        if hasattr(self, 'lot_index_'):
            out += [self.lot_index_]
        if hasattr(self, 'date_'):
            out += [self.date_]
        if hasattr(self, 'price_'):
            out += [f'EUR {self.price_:6.0f}']
            if hasattr(self, 'draw_') and (self.draw_):
                out[-1] += ' (draw)'
        if hasattr(self, 'nextlot_'):
            out += [f'next [{self.nextlot_:4.0f}]']
        if hasattr(self, 'images_'):
            out += [f'{len(self.images_):3.0f} images']
        if hasattr(self, 'text_'):
            out += [f'{len(self.text_):3.0f} text lines']

        if self.has_result == -1:
            out += ['no content.']
        elif self.has_result == False:
            out += ['no result.']
        return ' | '.join(out)

    def get_title(self):

        '''
        Store title of this page in `title_`. This can be found in a H4 with class name 'title'.
        '''

        path = '//h4[@class="title"]/text()'

        self.title_ = self.tree.xpath(path)[0].strip()

    def get_images(self):

        '''
        Store urls (src) of images in `images_`. These are inside links in divs of class 'photo'
        '''

        self.images_ = self._image_sources('//div[@class="photo"]/a/img')

    def get_text(self):

        '''
        Store all relevant text in `text_`, which is in class 'catalogusdetailitem split-item-first'.
        '''

        self.text_ = self.tree.xpath('//div[@class="catalogusdetailitem split-item-first"]/text()')

    def split_title(self):

        '''
        Split title into fields.

        Returns a tuple (lot_id, yy, type, mm, lot_nr). For an id like
        'K2100051800' all fields are strings ('21', '00', '05', '1800').
        For an id without leading 'K' the id is converted to int and
        returned as lot_nr; yy, type and mm are None.

        Raises ValueError when the title or the lot id has an unexpected format.
        '''

        # Only read the title from the tree when it is not available yet.
        if not hasattr(self, 'title_'):
            self.get_title()

        title_match = re.match('Kavel (.*)', self.title_)
        if title_match is None:
            raise ValueError(f'Title <{self.title_}> does not start with "Kavel "')
        lot_id = title_match.group(1)

        if lot_id.startswith('K'):
            M = re.match(r'^K([0-9]{2})(00|01)(0[1-9]|1[0-2])([0-9]{4})$', lot_id)
            if M is None:
                raise ValueError(f'Lot id <{lot_id}> is not formatted as Kyy00mmllll or Kyy01mmllll')
            argout = tuple(M[i] for i in range(1, 5))
        else:
            lot_nr = int(lot_id)
            argout = (None, None, None, lot_nr)

        if self.disp: print(argout)

        return (lot_id, *argout)

    def get_date(self):
        '''
        Store date (yyyy-mm) based on title in `date_`.
        Year and month are None for a lot id without leading 'K'.
        '''
        _, yy, _, mm, _ = self.split_title()

        self.date_ = f'20{yy}-{mm}'

    def get_date_from_tree(self):

        '''
        Store date of this auction in `date_` by taking the title of the page.
        This is pretty obsolete, because date is given at start of this notebook.

        Raises NotImplementedError when the page title has an unknown format.
        '''

        lines = self.tree.xpath('//title/text()')
        page_title = lines[0]

        # title like "Verkoop catalogus 2017-12"
        M = re.match('Verkoop catalogus (.*)', page_title)
        if M:
            self.date_ = M[1]
            return

        # title like "Verkoop bij inschrijving 2019-0001 januari"
        M = re.match('Verkoop bij inschrijving (20[0-9]{2})-00([0-9]{2}).*', page_title)
        if M:
            self.date_ = '-'.join([M.group(1), M.group(2)])
            return

        raise NotImplementedError(f'TODO: implement a date formatted as <{page_title}>.')

    def get_nextlot(self):

        '''
        Store number of next lot in `nextlot_` by checking out the link to the next lot in the current page.
        'K1900011801' will become 1801
        '''

        self._read_nextlot('//div[@class="catalogusdetailitem split-item-first"]/div[2]/div[3]/a')

    def get_price(self):

        '''
        Store price in `price_` and whether the lot was assigned by a draw in `draw_`.

        The price is a float, or 0 when the lot was not awarded or is part
        of a combination lot.

        Raises
        ------
        ValueError
            When OPBOD is None or the price line has an unknown format.
        Exception
            When no price line was found on the page.
        '''

        # Input error
        if self.OPBOD is None:
            raise ValueError('Set [OPBOD] before running this function.')

        # price can be bold or strong
        price_line = self.tree.xpath('//div[@class="catalogusdetailitem split-item-first"]/strong/text()')

        if self.OPBOD:
            price_line = self._price_line_opbod(price_line)
        else:
            price_line = self._price_line_insch(price_line)

        if self.disp: print(price_line)

        if (price_line is None) or (len(price_line) == 0):
            print('No price found! use 0 for now')
            print(*self.tree.xpath('//*[@class="catalogusdetailitem split-item-first"]/text()'))
            raise Exception('Fix this')

        if price_line == 'Na loting':
            price_line = self.tree.xpath('//strong/text()')[0]
            Draw = True
        else:
            Draw = False

        self.price_ = self._parse_price_line(price_line)
        self.draw_ = Draw

    def get_index(self):

        '''
        Store unique id to this lot in `lot_index_`. Includes date.
        yyyy-mm-xxxx
        '''

        _, yy, _, mm, lot_nr = self.split_title()

        self.lot_index_ = f'20{yy}-{mm}-{lot_nr}'

    def get_images_v1(self):

        '''
        Store urls (src) of images in `images_`. These are directly inside divs of class 'photo'
        '''

        self.images_ = self._image_sources('//div[@class="photo"]/img')

    def get_nextlot_v1(self):

        '''
        Store number of next lot in `nextlot_` by checking out the link to the next lot in the current page.
        'K1900011801' will become 1801

        update 202007: layout changed. Link for next lot is in different div
        '''

        self._read_nextlot('//div[@class="catalogusdetailitem split-item-first"]/div[4]/div[3]/a')

    def _image_sources(self, img_path):
        # Return the src attribute of every element found at img_path.
        return [item.get('src') for item in self.tree.xpath(img_path)]

    def _read_nextlot(self, link_path):
        # Read the href of the first element at link_path and store the last
        # four characters of its 'meerfotos' lot id as integer in nextlot_.

        # link to next lot
        link = self.tree.xpath(link_path)
        target = link[0].get("href")

        # extract lot name
        next_lot = re.match(r'.*[?,&]meerfotos=(.*)(&.*)?', target).group(1)

        # the greedy group above also swallows a trailing "&veilingen=..."
        if "&veilingen=" in next_lot:
            next_lot = re.match('(.*)&', next_lot).group(1)

        # convert to integer
        next_lot = int(next_lot[-4:])

        if self.disp:
            # imported here because it is only needed for this debug output
            from lxml import etree
            print(next_lot, target, etree.tostring(link[0]))

        self.nextlot_ = next_lot

    def _price_line_opbod(self, price_line):
        # Pick the price text for an "opbod" auction, None when nothing is found.
        if len(price_line) < 2:
            # fall back: no bold
            plain_text = self.tree.xpath('//div[@class="catalogusdetailitem split-item-first"]/text()')
            if len(plain_text) == 0:
                print('Price not found, return None')
                return None
            price_line = [plain_text[0]]

        if len(price_line) > 1:
            print('Opbod has more than 1 prices. Take last')
            print(price_line)

        return price_line[-1] # Return scalar, not list

    def _price_line_insch(self, price_line):
        # Pick the price text for an "inschrijving" auction, None when nothing is found.
        if len(price_line) == 0:
            price_line = self.tree.xpath('//b/text()')

        if len(price_line) == 0:
            print('Price not found, return None')
            return None
        return price_line[0] # Return scalar, not list

    def _parse_price_line(self, price_line):
        # Convert the text of a price line to a number.
        tags = ['Zie kavel', 'Zie massakavel', 'Zie Kavel'] # part of combination lot
        if any(t in price_line for t in tags) or price_line == 'Niet gegund':
            return 0

        M = re.match(
            r'Gegund voor: \u20ac *([0-9,.]*,[0-9]{2}) *\(excl. alle eventuele bijkomende kosten en belastingen\)',
            price_line
        )
        if M is None:
            raise ValueError(f'Price line has unknown format: <{price_line}>')
        if self.disp: print(M.group(0))

        # "1.755,00" -> 1755.00
        return float(M.group(1).replace('.', '').replace(',', '.'))
            
# Example: read one lot step by step and print the summary after every step
kavel_url = get_kavel_url(False, example_url, False, example_lot_id)
print(kavel_url)
tree = gettree(kavel_url, True)
Item = Lot(tree, OPBOD)
print(Item)
for step in (
    Item.get_index,
    Item.get_date,
    Item.get_title,
    Item.get_nextlot,
    Item.get_images,
    Item.get_text,
    Item.get_price,  # This might throw an error if auction is still open
):
    step()
    print(Item)
Item.has_result, Item.price_


http://verkoop.domeinenrz.nl/verkoop_bij_inschrijving_2021-0005?=&meerfotos=K2100051800
<Response [200]> 1
Verkoop bij inschrijving 2021-0005
Verkoop bij inschrijving 2021-0005 | Kavel K2100051800 | 2021-05-1800
Verkoop bij inschrijving 2021-0005 | Kavel K2100051800 | 2021-05-1800 | 2021-05
Verkoop bij inschrijving 2021-0005 | Kavel K2100051800 | 2021-05-1800 | 2021-05
Verkoop bij inschrijving 2021-0005 | Kavel K2100051800 | 2021-05-1800 | 2021-05 | next [1801]
Verkoop bij inschrijving 2021-0005 | Kavel K2100051800 | 2021-05-1800 | 2021-05 | next [1801] |   6 images
Verkoop bij inschrijving 2021-0005 | Kavel K2100051800 | 2021-05-1800 | 2021-05 | next [1801] |   6 images |  19 text lines
Verkoop bij inschrijving 2021-0005 | Kavel K2100051800 | 2021-05-1800 | 2021-05 | EUR   1755 | next [1801] |   6 images |  19 text lines


(True, 1755.0)

<H1><a href="#auct_top">^</a></H1><a id='auct_dl_results'>

# Get all results from all pages

The "**next lot**" is linked in the current result. The function will look for this link and proceed. 
Because it is not known what the first lot will be, the start is hard coded at `lot_counter = 1799`. 
The counter is incremented with a step of `+1` until the first lot is found.  
Searching for next lots will continue until the next lot has a **smaller** value than the current one. This will cause the routine to stop when the last lot points back to the first lot.


In [8]:
# info
print('+: add 1 to lot number\n>: follow link to go to next\nX: Done. Reached first lot in carousel\n')

MAX_LOT_NR = 9999  # the lot id has room for four digits (Kyy00mmllll)
MAX_RETRIES = 10   # consecutive failures to read the next-lot link of one page

all_lots = dict()
# first lot
# NOTE(review): the opbod pattern puts the month before '01', while Lot.split_title
# expects (00|01) before the month. Confirm against a real opbod lot id.
if OPBOD:
    lot_counter = 1000
    lot_pat = 'K{:s}{:s}01{:.0f}'
else:
    lot_counter = 1799
    lot_pat = 'K{:s}00{:s}{:.0f}'

DOLOOP = True; retries = 0
while DOLOOP:
    # Stop searching for a first lot when the counter does not fit the lot id anymore
    if lot_counter > MAX_LOT_NR:
        raise RuntimeError(f'No lot found up to lot number {MAX_LOT_NR}. Is <{URL}> correct?')

    # get lot
    lot_id = lot_pat.format(DATE[2:4], month_counter, lot_counter) # 'K1800091800': Kyy00mmllll
    lot_url = get_kavel_url(OPBOD, URL, EXTEND_URL, lot_id)
    lot_tree = gettree(lot_url, disp = VERBOSE > 2)
    lot_item = Lot(lot_tree, OPBOD)

    # continue with next if no content
    if lot_item.has_result == -1:
        print(lot_counter, end='+')
        lot_counter += 1
        continue

    # find next number
    try:
        lot_item.get_nextlot()
    except Exception as err:
        # Do not go to next, but try this one again. Give up after a few
        # attempts instead of requesting the same page forever.
        retries += 1
        if retries > MAX_RETRIES:
            raise RuntimeError(
                f'Link to next lot not found in <{lot_url}> after {MAX_RETRIES} retries'
            ) from err
        print(lot_url)
        print('try again', end='>')
        continue
    retries = 0
    next_lot = lot_item.nextlot_

    # add current results to list
    all_lots[lot_counter] = {'url': lot_url, 'item': lot_item}
    print(lot_counter, end='>')

    if VERBOSE > 2: print(lot_item)

    if next_lot <= lot_counter:
        # First lot again (or a lot that links to itself). Break loop before entering a carousel
        DOLOOP = False
    else:
        # continue with next_lot
        lot_counter = next_lot

print('X') # done

+: add 1 to lot number
>: follow link to go to next
X: Done. Reached first lot in carousel

1799+1800>1801>1802>1803>1804>1805>1806>1807>1808>1809>1810>1811>1812>1813>1814>1815>1817>1818>1819>1820>1821>1822>1823>1824>1825>1827>1828>1829>1830>1831>1832>1833>1834>1835>1837>1838>1840>1841>1842>1843>1844>1845>1846>1847>1848>1849>1850>1851>1852>1853>1854>1855>1857>1858>1860>1862>1863>1864>1865>1866>1867>1868>1869>1870>1871>1872>1873>1874>1875>1876>1877>1878>2200>2201>2202>2203>2204>2205>2206>2400>2401>2402>2403>2600>2602>2603>8000>8002>8003>8004>8005>8006>8007>8008>8010>8012>8013>8014>8015>8016>8017>8018>8020>8021>8023>8025>8026>8027>8029>8031>8033>8034>8035>8036>8037>8038>8040>8041>8042>8043>8044>8045>8046>8047>8048>8049>8050>8051>8052>8053>8054>8055>8056>8057>8058>8059>8060>8061>8063>8064>8065>8066>8067>8068>8069>8070>8071>8073>8074>8075>8077>8078>8079>8080>8081>8082>8083>8084>8085>8087>8088>8089>8090>8092>8094>8095>8096>8097>8098>8099>8100>8101>8102>8103>8104>8106>8107>8108>8109>8110>811

<H1><a href="#auct_top">^</a></H1><a id='auct_basic_parse'>

# Basic parsing

Simple stuff, without regex.

In [9]:
# Filter out empty lots
all_lots = {k: v for k, v in all_lots.items() if 'item' in v}

# Read info from tree
for lot_nr, lot in all_lots.items():
    lot_item = lot['item']
    lot_item.get_title()
    try:
        lot_item.get_price()
    except Exception:
        # Best effort: mark the price as missing (-1) and carry on with the next lot.
        # The share of -1 prices is used later on to detect an auction that is still open.
        print('catch', end='')
        lot_item.price_ = -1
        lot_item.draw_ = False
    lot_item.get_date()
    lot_item.get_images()
    lot_item.get_text()
    lot_item.get_index()
    if VERBOSE>0: print(lot_nr, lot_item)

# Image sources are relative to the root of the site: scheme and host of URL
root_match = re.search(r'^https?://.*?/', URL)
if root_match is None:
    raise ValueError(f'Cannot find root of site in <{URL}>')
site_root = root_match[0]

lot_items = [lot['item'] for lot in all_lots.values()]
out = pd.DataFrame(
    columns = ['Source', 'Title', 'Price', 'Draw', 'Raw_text', 'N_images', 'Images'],
    index = [item.lot_index_ for item in lot_items],
    data = {
        'Source': [lot['url'] for lot in all_lots.values()],
        'Title': [item.title_ for item in lot_items],
        'Price': [item.price_ for item in lot_items],
        'Draw': [item.draw_ for item in lot_items],
        'Raw_text': [item.text_ for item in lot_items],
        'Images': [
            [site_root + jpg.lstrip('/') for jpg in item.images_] # site_root already ends with "/"
            for item in lot_items
        ]
    }
)
out['N_images'] = out.Images.apply(len)
out['lot_counter'] = list(all_lots.keys())

out

1800 Verkoop bij inschrijving 2021-0005 | Kavel K2100051800 | 2021-05-1800 | 2021-05 | EUR   1755 | next [1801] |   6 images |  19 text lines
1801 Verkoop bij inschrijving 2021-0005 | Kavel K2100051801 | 2021-05-1801 | 2021-05 | EUR    477 | next [1802] |   5 images |  19 text lines
1802 Verkoop bij inschrijving 2021-0005 | Kavel K2100051802 | 2021-05-1802 | 2021-05 | EUR   2251 | next [1803] |   6 images |  20 text lines
1803 Verkoop bij inschrijving 2021-0005 | Kavel K2100051803 | 2021-05-1803 | 2021-05 | EUR    900 | next [1804] |   9 images |  19 text lines
1804 Verkoop bij inschrijving 2021-0005 | Kavel K2100051804 | 2021-05-1804 | 2021-05 | EUR    602 | next [1805] |   5 images |  19 text lines
1805 Verkoop bij inschrijving 2021-0005 | Kavel K2100051805 | 2021-05-1805 | 2021-05 | EUR    500 | next [1806] |   5 images |  19 text lines
1806 Verkoop bij inschrijving 2021-0005 | Kavel K2100051806 | 2021-05-1806 | 2021-05 | EUR   2575 | next [1807] |   5 images |  19 text lines
1807 V

Unnamed: 0,Source,Title,Price,Draw,Raw_text,N_images,Images,lot_counter
2021-05-1800,http://verkoop.domeinenrz.nl/verkoop_bij_insch...,Kavel K2100051800,1755.0,False,"[K2100051800, Bromfiets, PIAGGIO, Type vespa s...",6,[http://verkoop.domeinenrz.nl/ufc/static/16194...,1800
2021-05-1801,http://verkoop.domeinenrz.nl/verkoop_bij_insch...,Kavel K2100051801,477.0,False,"[K2100051801, Bromfiets, SYM, Type hu05w, Kent...",5,[http://verkoop.domeinenrz.nl/ufc/static/16194...,1801
2021-05-1802,http://verkoop.domeinenrz.nl/verkoop_bij_insch...,Kavel K2100051802,2251.0,False,"[K2100051802, Motorfiets, PIAGGIO, Type m45, K...",6,[http://verkoop.domeinenrz.nl/ufc/static/16194...,1802
2021-05-1803,http://verkoop.domeinenrz.nl/verkoop_bij_insch...,Kavel K2100051803,900.0,False,"[K2100051803, Bromfiets, PIAGGIO, Type c38, Ke...",9,[http://verkoop.domeinenrz.nl/ufc/static/16194...,1803
2021-05-1804,http://verkoop.domeinenrz.nl/verkoop_bij_insch...,Kavel K2100051804,602.0,False,"[K2100051804, Bromfiets, SYM, Type mio 50, Ken...",5,[http://verkoop.domeinenrz.nl/ufc/static/16194...,1804
...,...,...,...,...,...,...,...,...
2021-05-8158,http://verkoop.domeinenrz.nl/verkoop_bij_insch...,Kavel K2100058158,2222.0,False,"[K2100058158, Personenauto, LANDROVER, Type ra...",13,[http://verkoop.domeinenrz.nl/ufc/static/16194...,8158
2021-05-8159,http://verkoop.domeinenrz.nl/verkoop_bij_insch...,Kavel K2100058159,4919.0,False,"[K2100058159, Generator, WILSON, Type XD100P2,...",9,[http://verkoop.domeinenrz.nl/ufc/static/16194...,8159
2021-05-8160,http://verkoop.domeinenrz.nl/verkoop_bij_insch...,Kavel K2100058160,0.0,False,"[K2100058160, Aanhangwagen, Caravan, HOMECAR, ...",15,[http://verkoop.domeinenrz.nl/ufc/static/16194...,8160
2021-05-8161,http://verkoop.domeinenrz.nl/verkoop_bij_insch...,Kavel K2100058161,2450.0,False,"[K2100058161, Aanhangwagen, SARIS, Type pk, Ke...",6,[http://verkoop.domeinenrz.nl/ufc/static/16194...,8161


In [10]:
# ran when auction was still open: more than 80% of the prices is missing (-1)
NO_PRICE = bool(sum(out.Price == -1) / out.shape[0] > 0.8)

file_name = f'../data/drz-data-unparsed-{DATE}.pkl'
if NO_PRICE:
    # add "without-price" to file name
    file_name = file_name.replace('.pkl', '-without-price.pkl')

# Never overwrite an existing file and respect the SKIPSAVE setting.
if (SKIPSAVE != False) or os.path.isfile(file_name):
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
else:
    print(file_name)
    out.to_pickle(file_name)

../data/drz-data-unparsed-2021-05.pkl


<H1><a href="#auct_top">^</a></H1><a id='auct_regex'>

# In depth parsing
Use `Raw_text` as input.  
Modify regex files if fragment is not recognized.


In [11]:
# Read regex patterns
import read_regex_patterns

read_regex_patterns.read_tag_value()
tags, flagtags, repfragments = read_regex_patterns.read_all()

# Replace dataframe stored in memory with dataframe that was just saved to disk.
# The file name follows the same rule as the cell that saved it: results without
# price (auction still open) carry a "-without-price" suffix.
file_name = f'../data/drz-data-unparsed-{DATE}.pkl'
if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')

if os.path.isfile(file_name):
    out = pd.read_pickle(file_name)
else:
    # Nothing on disk (e.g. saving is disabled in settings).
    # It is safe to continue with the dataframe that is still in memory.
    print(f'{file_name} not found. Continue with dataframe in memory.')

In [12]:
# parse raw text
# For every lot the lines in Raw_text are consumed in order:
#   1. fixed header lines: (optional) draw marker, lot number, lot type, (optional) sub type, brand
#   2. text replacements from `repfragments`
#   3. fields pulled out with the regex patterns in `tags` (value in named group 'val')
#   4. boolean fields from the regex patterns in `flagtags`
#   5. whatever is left: the note (line starting with '*') or supplementary info (SupInfo)
for IX in out.index :

    # find info

    # NOTE(review): rt is presumably the very list object stored in the frame, in which case
    # the pop() calls and item assignments below also change Raw_text in `out` -- verify.
    rt = out.loc[IX,"Raw_text"]

    # first line:

    # Is it a draw?
    Val = rt.pop(0) 
    if Val == 'Na loting':
        Val = rt.pop(0) # val is now kavelnr
        out.loc[IX,"Draw"] = True
    else:
        out.loc[IX,"Draw"] = False

    # when lot number is followed by an asterisk there is a note
    # Note is only a flag (True/False) here; the text of the note is filled in further down.
    if Val.endswith('*\r'):
        Val = Val[0:-2]
        out.loc[IX,"Note"] = True
    else :
        Val = Val.strip()
        out.loc[IX,'Note'] = False

    if VERBOSE>0:
        print(Val)

    # store lot nr        
    out.loc[IX,"LotNr"]=Val


    # second line
    out.loc[IX,"LotType"]=rt.pop(0).strip()

    # third line
    Val = rt.pop(0).strip()
    # This line is brand or optional line with type of lot
    # All caps is brand
    # Anything else is appended to LotType between brackets and the brand is taken from the next line.
    if Val in ['Quad','Kampeerwagen/ camper','Pleziervaart motorvaartuig met opbouw en open kuip','Rubberboot'] or not Val.isupper():
        out.loc[IX,"LotType"] += ''.join([' (' + Val + ')'])
        if VERBOSE>0:print(Val, out.loc[IX,"LotType"])
        Val = rt.pop(0).strip() # now it is brand
    out.loc[IX,"ItemBrand"]=Val



    # escape characters, repair typos and translate 
    for i in range(len(rt)):

        # encode string as bytes
        # non-ascii characters become xml character references (&#...;)
        rt[i] = rt[i].encode('ascii',errors='xmlcharrefreplace')

        # replace text
        # pattern and replacement are encoded the same way, so they match the escaped line
        for pat,sub in zip(repfragments.Pattern,repfragments.Replace):
            rt[i] = re.sub(pat.encode('ascii',errors='xmlcharrefreplace'),sub.encode('ascii',errors='xmlcharrefreplace'),rt[i])

        # decode back to string, but special characters escaped to xml
        rt[i]=rt[i].decode('ascii')

    # Pull value after trailing or leading pattern (bgntag/endtag)
    # Patterns are searched in all remaining lines at once (joined by newlines).
    for Tag,Field in zip(tags.Pattern,tags.Field):
        M = re.search(Tag,'\n'.join(rt))
        if M:
            Val = M.group('val')
            if VERBOSE>2:
                print(str(Field) + ' : ' + M.group(0).replace('\n','[newline]') + '\n\t' + '|' + Val + '|')
            # remove pattern and make rt a list again.
            # (rt is bound to a new list from here on)
            rt = '\n'.join(rt).replace(M.group(0),'').split('\n')
        else:
            # field gets an empty string when the pattern is not found
            Val = ''
        out.loc[IX,Field] = Val        

    # Pattern in full text? (flagtag)
    for Tag,Field in zip(flagtags.Pattern,flagtags.Field):
        # flagtags might occur more than once, hence a list of finditer results
        Ms = list(re.finditer(Tag,'\n'.join(rt)))
        if Ms:
            Val = True
            for M in Ms:
                if VERBOSE>2:
                    print(str(Field) + ' : ' + M.group(0).replace('\n','[newline]') + '\n\t' + '|' + str(Val) + '|')
                # remove pattern and make rt a list again.
                rt = '\n'.join(rt).replace(M.group(0),'').split('\n')
        else:
            Val = False
        out.loc[IX,Field] = Val



    # loop through remaining lines

    for line in rt:

        # do comparison in bytes
        line = line.encode('ascii',errors='xmlcharrefreplace')
        if VERBOSE>2:
            print(line)

        # parsing
        isParsed = False # some accounting: in the end this line should be parsed

        # line is empty.. skip .. next
        if not line :# empty
            isParsed = True
            continue

        # line starting with '*' is a note
        # only when the lot number was flagged with an asterisk; the flag is replaced by the note text
        if out.loc[IX,'Note'] and line.startswith(bytes('*','ascii')):
            if VERBOSE>2:
                print('\tNote:',end='')
                print(out.loc[IX,'Note'],end='')
                print(line)
            Val = line[1:].decode('ascii')
            out.loc[IX,'Note'] = Val
            isParsed = True
            continue

        # Line was not recognized: collect it in SupInfo and print it,
        # so the regex files can be extended when needed.
        if isParsed == False:
            line = line.decode('ascii')

            # create empty string if not exist
            if (
                'SupInfo' not in out.loc[IX].index
            ) or (
                (
                    not isinstance(out.loc[IX,'SupInfo'], str)
                ) and (
                    pd.isna(out.loc[IX,'SupInfo'])
                )
            ):
                out.loc[IX,'SupInfo'] = ''
            out.loc[IX,"SupInfo"] = '\n'.join([out.loc[IX,'SupInfo'] , str(line)])
            # print the index only once per lot; following lines of the same lot are indented
            if ('prev_ix' in locals()) and (IX == prev_ix):
                print(''.join([' '] * len(IX)), end='')
            else:
                print(str(IX), end='')
            print(f'[{line:s}]')
            prev_ix = str(IX)
            
            


K2100051800
K2100051801
K2100051802
K2100051803
K2100051804
K2100051805
K2100051806
K2100051807
K2100051808
K2100051809
K2100051810
K2100051811
K2100051812
K2100051813
K2100051814
K2100051815
2021-05-1815[Bromfiets heeft een lekke band achter.]
K2100051817
K2100051818
K2100051819
K2100051820
K2100051821
K2100051822
K2100051823
K2100051824
K2100051825
K2100051827
K2100051828
K2100051829
K2100051830
K2100051831
K2100051832
K2100051833
K2100051834
K2100051835
K2100051837
K2100051838
2021-05-1838[Selectieknop voor km-stand defect]
K2100051840
K2100051841
K2100051842
K2100051843
K2100051844
K2100051845
K2100051846
K2100051847
K2100051848
2021-05-1848[Eventueel via individuele aanvraag een kenteken te verkrijgen.]
            [In dat geval dient het framenummer opnieuw te worden ingeslagen.]
K2100051849
K2100051850
K2100051851
K2100051852
2021-05-1852[Serienummer 08060306E]
K2100051853
2021-05-1853[Motor kubota V1505]
            [Sleutel is stuk maar draait wel in contactslot.]
K2100051854


<H1><a href="#auct_top">^</a></H1><a id='auct_save'>

# Save results to disk

In [13]:
# Pick the target file: opbod results go to their own location.
file_name = (
    f'../../../python-nb/data/drz-data-opbod-{DATE}.pkl' if OPBOD
    else f'../data/drz-data-{DATE}.pkl'
)
if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')

# Never overwrite an existing file and respect the SKIPSAVE setting.
if (SKIPSAVE != False) or os.path.isfile(file_name):
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
else:
    print(file_name)
    out.to_pickle(file_name)


../data/drz-data-2021-05.pkl


# Next: add rdw data

Because rdw data changes constantly it is advisable to run the notebook that adds rdw data to the above results soon.

In [14]:
# NOTE(review): a bare `raise` without an active exception fails with RuntimeError.
# Presumably this is a deliberate stop, so "Run All" does not continue into the
# comparison cells below -- confirm.
raise

RuntimeError: No active exception to reraise

In [None]:
# Compare results

In [None]:
# Load the stored results and check whether they equal the frame in memory.
# Missing values are replaced by 'nn' on both sides before comparing.
file_name = f'../data/drz-data-{DATE}.pkl'
out2 = pd.read_pickle(file_name)
(
    out
    .drop(columns='lot_counter')
    .fillna('nn')
    .equals(out2.fillna('nn'))
)


In [None]:
import numpy as np

# Labels that occur in only one of both frames: first the columns, then the index.
for ours, theirs in ((out.columns, out2.columns), (out.index, out2.index)):
    print(np.setdiff1d(ours, theirs), np.setdiff1d(theirs, ours))

In [None]:
# Element wise comparison; print the rows and columns that hold a difference
iseq = out.drop(columns='lot_counter').fillna('nn').eq(out2.fillna('nn'))
print(
    out.drop(columns='lot_counter').index[(~iseq).any(axis=1)],
    out.drop(columns='lot_counter').columns[(~iseq).any(axis=0)]
)

# Show both versions of one column side by side, only the rows that differ
col = 'Images'
pd.concat(
    [frame.loc[~iseq[col], col] for frame in (out, out2)],
    axis=1
)


In [None]:
# Test regex on misbehaving fragment.