# Store results of auction of Dienst Roerende Zaken
Monthy results of auction are publicized on http://www.domeinenrz.nl/catalogus. This Notebook scrapes the result from the drz website and parses the text and stores it in a dataframe.  
- - - 

### User variables
- `Date`: Date of current results. This is needed to create proper url  
- `Verbose`: Debug variable. 

In [1]:
Date = '2019-04' # yyyy-mm
Verbose = 0 # debug level

### Import modules

In [2]:
import pandas as pd
import urllib
from lxml import html, etree
import requests
import codecs 
import re
import time

In [3]:
# needed for new (as of feb '18) url format
import locale
locale.setlocale(locale.LC_TIME,'nl_NL')
pd.to_datetime('now').strftime('%A %d %B')
pd.to_datetime(Date,format='%Y-%m').strftime('%A %d %B')

'maandag 01 april'

### Internal variables

In [4]:
# website with results
url = 'http://www.domeinenrz.nl/catalogus'

### Read external data
These files are used to recognize text fragments. Regex patterns are mapped to field names.

In [5]:
tags=pd.read_csv('./regex-patterns/drz-re-patt-tag.txt',
                    comment='#',
                    header=None,
                    quotechar='"',
                    delimiter=",",
                    skipinitialspace=True).rename(columns={0 : 'Field',1 : 'Pattern'})
                    
flagtags=pd.read_csv('./regex-patterns/drz-re-patt-hastag.txt',
                     comment='#',
                     header=None,
                     quotechar='"',
                     delimiter=",",
                     skipinitialspace=True).rename(columns={0 : 'Field',1 : 'Pattern'})

repfragments=pd.read_csv('./regex-patterns/drz-re-patt-replace.txt',
                      comment='#',
                      header=None,
                      quotechar='"',
                      delimiter=",",
                      skipinitialspace=True).rename(columns={0 : 'Pattern',1 : 'Replace'}).fillna('')

### Functions

In [6]:
def gettree(baseurl,Lotid,Date=Date,disp=False):
    
    '''
    tree from url. Version where urls are formatted as 
    http://www.domeinenrz.nl/catalogus/verkoop_bij_inschrijving_2018-0009?=&meerfotos=K1800091800"
    later it even changed to:
    .  .  .  .  .  .  . klik_hier_voor_verkoop_bij_inschrijving_2018-0011
    '''
    
    # Change date format and extend to url
    Datestr = Date.replace('-','-00')
    if pd.to_datetime(Datestr,format='%Y-00%m') < pd.to_datetime('2018-0011',format='%Y-00%m'):
        baseurl += '/verkoop_bij_inschrijving_{:s}'.format(Datestr)
    elif pd.to_datetime(Datestr,format='%Y-00%m') < pd.to_datetime('2019-0002',format='%Y-00%m'):
        baseurl += '/klik_hier_voor_verkoop_bij_inschrijving_{:s}'.format(Datestr)
    else:
        baseurl += '/verkoop_bij_inschrijving_{:s}_{:s}'.format(Datestr,pd.to_datetime(Datestr,format='%Y-00%m').strftime('%B'))
        
    # create url
    urldata = {}
    urldata[''] = '' # to create '=&'. This might be a bug in the site 
    
    # Add auction id
    urldata['veilingen'] = Datestr

    # Add lot number
    urldata['meerfotos'] = Lotid
    # generate url with urldata
    KavelUrl = baseurl + '?' + urllib.parse.urlencode(urldata)
    if disp: print(KavelUrl)
    
    # get html string
    req_success = False
    c=0
    while req_success == False:
        c+=1
        try:
            page = requests.get(KavelUrl)
            req_success = True
        except:
            if c > 10:
                raise Exception('Retried, but failed')
            else:
                print('pause 1 sec and try again!')
                time.sleep(1)
                req_success = False

    # find encoding
    DecodeType = page.headers["Content-type"]
    T = 'charset='
    DecodeType = DecodeType[DecodeType.find(T)+len(T):]
    # convert to unicode
    htmlstring = codecs.decode(page.content, DecodeType)
    # convert string to tree object
    tree = html.fromstring(htmlstring)
    
    return tree,KavelUrl

def gettree_v1(baseurl,Lot,Date=None,disp=False):
    
    '''
    tree from url. Version where urls are formatted as 
    http://www.domeinenrz.nl/catalogus?=&meerfotos=1799&veilingen=2018-09"
    '''
    
    # create url
    urldata = {}
    urldata[''] = '' # to create '=&'. This might be a bug in the site 
    # was date in input?
    if Date != None and len(Date) != 0:
        urldata['veilingen'] = Date
    # Add lot number
    urldata['meerfotos'] = Lot
    # generate url with urldata
    KavelUrl = baseurl + '?' + urllib.parse.urlencode(urldata)
    if disp: print(KavelUrl)
    
    # get html string
    page = requests.get(KavelUrl)
    # find encoding
    DecodeType = page.headers["Content-type"]
    T = 'charset='
    DecodeType = DecodeType[DecodeType.find(T)+len(T):]
    # convert to unicode
    htmlstring = codecs.decode(page.content, DecodeType)
    # convert string to tree object
    tree = html.fromstring(htmlstring)
    
    return tree,KavelUrl


def extractitem(tree,name,disp=False):
    "extract lines from tree"
    
    if name == "title":
        
        '''
        Return title of this page. This can be found in a H4 with class name 'title'.
        '''
            
        # path = '//body/div[@id="mainwrapper"]/div[@id="main"]/div[@class="wrapper"]/div[@class="article"]/div[@class="catalogus"]/div[@class="catalogusdetailitem split-item-first"]/a/h4[@class="title"]//text()'
        path = '//h4[@class="title"]/text()'
        return tree.xpath(path)[0].strip()

    elif name == "images":
        
        '''
        Return urls (src) of images. These are inside divs of class 'photo'
        '''
        
        lines = [item.get('src') for item in tree.xpath('//div[@class="photo"]/img')]
        
        if disp:
            print(lines)
        
        return lines
    
    elif name == "text":
        
        '''
        Just return all relevant text, which is in class 'catalogusdetailitem split-item-first'.
        '''
        
        lines=tree.xpath('//div[@class="catalogusdetailitem split-item-first"]/text()')
        
        if disp:
            print(len(lines))
        
        return lines
    
    elif name == "date":
        
        '''
        Return date of this auction by taking the title of the page.
        This is pretty obsolete, because date is given at start of this notebook.
        '''
        
        lines = tree.xpath('//title/text()')
        Date = lines[0]
        
        if 'Verkoop catalogus ' in Date:
            # title like "Verkoop catalogus 2017-12"
            Date = re.match('Verkoop catalogus (.*)',Date)[1]

        elif 'Verkoop bij inschrijving ' in Date:
            # title like "Verkoop bij inschrijving 2019-0001 januari"
            M = re.match('Verkoop bij inschrijving (20[0-9]{2})-00([0-9]{2}).*',Date)
            print(M.group(2))
            Date = '-'.join([M.group(1),M.group(2)])

        else:
            raise Exception('TODO: implement')

        # Date = Date.strip()
        # T = 'Verkoop catalogus '
        # Date = Date[Date.index(T)+len(T):]
        
        return Date
    
    elif name == "nextlot":
        
        '''
        Return number of next lot by checking out the link to the next lot in the current page.
        'K1900011801' will become 1801
        '''
        
        
        # link to next lot
        Link = tree.xpath('//div[@class="catalogusdetailitem split-item-first"]/div[4]/div[3]/a')
        Tar = Link[0].get("href")
        
        # extract lot name
        #nextLot = re.match('\?meerfotos=(.*)',Tar).group(1)
        nextLot = re.match('.*[\?,\&]meerfotos=(.*)(\&.*)?',Tar).group(1)

        if "&veilingen=" in nextLot:
            nextLot = re.match('(.*)&',nextLot).group(1)
            
        # convert to integer
        nextLot = int(nextLot[-4:])

        if disp:
            print(nextLot,Tar,etree.tostring(Link[0]))
                
        return nextLot
    
    elif name == "price":
        
        '''
        Return price as float
        '''
        
        
        # price can be bold or strong
        Price = tree.xpath('//div[@class="catalogusdetailitem split-item-first"]/strong/text()')
        if len(Price) == 0:
            Price = tree.xpath('//b/text()')

        if disp: print(Price)
        
        if len(Price) == 0:
            print('No Price found! use 0 for now')
            print(*tree.xpath('//*[@class="catalogusdetailitem split-item-first"]/text()'))
            Price = ['Niet gegund']
            raise Exception('Fix this')
        
        # select first in list (xpath returns lists)
        Price = Price[0]
            
        if Price == 'Na loting':
            Price = tree.xpath('//strong/text()')[0]
            Draw = True
        else:
            Draw = False        
 
        Tags = ['Zie kavel','Zie massakavel']# part of combination lot
        if any([Tag in Price for Tag in Tags]) :
            Price = 0
        elif Price == 'Niet gegund':
            Price = 0
        else:
            M = re.match(u'Gegund voor: \u20ac *([0-9,.]*,[0-9]{2}) *\(excl. alle eventuele bijkomende kosten en belastingen\)',Price)
            if disp:print(M.group(0))
            Price = float(M.group(1).replace('.','').replace(',','.'))
            # Tag1 = u'Gegund voor: \u20ac'
            # Tag2 = u'(excl. alle eventuele bijkomende kosten en belastingen)'
            # Price = float(Price[Price.index(Tag1)+len(Tag1):Price.index(Tag2)].strip().replace('.','').replace(',','.'))
            
        return Price,Draw
    

### First: Get all results from all pages
This will read all pages and the raw text is stored for later use.  
The "**next lot**" is linked in the current result. The function will look for this link and proceed. Because it is not know what the first lot will be, it is hard coded at `Lot = 1799`. It will increment with a step of `+1` to find the first lot. If the first lot is not (yet) found a period (`.`) is printed, otherwise the lot nummer will be printed. The console output should start with "`.`" (a period).  
Searching for next lots will continue untill the next lot has a **smaller** value that the current. This will cause the routine to stop when the last lot points back to the first lot.

In [7]:
# empty lists
AllLot = []
AllTree = []
AllKavelUrl = []
doLoop = True # set to false at the end.
Lot = 1799 # first
while doLoop:
    # Lot id
    # 'K1800091800'
    Lotid = 'K{:s}00{:s}{:.0f}'.format(Date[2:4],Date[5:8],Lot)
    
    # read page
    read_success = False
    c=0
    while read_success == False:
        c+=1
        if pd.to_datetime(Date, format = '%Y-%m') < pd.to_datetime('2018-9-1'):
            [Tree,KavelUrl]=gettree_v1(url,Lot=str(Lot),Date=Date,disp=Verbose>1)
        else:
            [Tree,KavelUrl]=gettree(url,Lotid=Lotid,Date=Date,disp=Verbose>1)
        Content = Tree.xpath('//*[@id="content"]/div[1]/b/text()')
        if Content == 'failed': # future
            if c > 10:
                raise Exception('Retried, but failed')
            else:
                print('pause 1 sec and try again!')
                time.sleep(1)
                read_success = False
        else:
            read_success = True
    
    if Content and Content[0] == 'Niets gevonden.':
        # Lot number does not exist
        NextLot = Lot + 1
        print('.',end='-')
    else :
        # find next number
        try:
            NextLot=extractitem(Tree,'nextlot')
        except:
            print(KavelUrl)
            print('try again',end='>')
            NextLot = Lot
#             continue
#             print (etree.tostring(Tree,pretty_print=True).decode('utf8'))
            raise 
                   
        # add current results to list
        AllLot.append(Lot)
        AllTree.append(Tree)
        AllKavelUrl.append(KavelUrl)
        print(str(Lot),end='>')
    if NextLot < Lot :
        # First Lot again. Break loop
        doLoop = False
    else :
        Lot = NextLot

print('.',end='X')


.-1800>1801>1802>1803>1804>1805>1807>1808>1810>1811>1812>1816>1817>1818>1819>1820>1821>1822>1823>1824>1825>1826>1828>1829>1830>1831>1832>1833>1834>1835>1836>1837>1838>1839>1841>1842>2000>2001>2002>2003>2004>2005>2007>2008>2009>2010>2011>2012>2013>2014>2015>2016>2017>2018>2019>2020>2021>2022>2200>2201>2202>2203>2204>2205>2206>2207>2208>2209>2210>2211>2214>2215>2216>2219>2220>2221>2400>2401>2402>2403>2404>2406>2407>2408>2409>2410>2411>2412>2413>2415>2416>2417>2600>2601>2602>2603>2604>2605>2606>2607>2608>2609>2610>2611>2612>2613>2614>2615>7100>7101>7102>7103>7104>7105>7106>7107>7110>7111>7112>7113>7114>7115>7116>7117>7118>7120>7121>7122>7123>7124>7125>7126>7127>7128>7129>7130>7131>7132>7133>7134>7135>7136>7137>7138>7139>7140>7141>7142>7143>7144>7145>7146>7147>7148>7149>7150>7151>7152>7153>7155>7156>7157>7158>7159>7160>7161>7162>7163>7164>7165>7166>7167>7168>7169>7170>7175>7176>7177>7178>7180>7181>7182>7183>7184>7185>7186>7187>7188>7189>7190>7191>7192>7193>7194>7195>7196>7197>7198>7199>720

### Basic parsing
Raw text is parsed for the first time. Some basics are stored in a pandas.DataFrame:  
- price
- image urls
- title
- ..

In [8]:
Verbose = 0
# empty list
out = None
# loop over all pages
for iK, tree in enumerate(AllTree):
    
    #
    # create an index
    #

    #   date
    if "Date" not in locals() or not Date:
        Date = extractitem(tree,'date',disp=Verbose>2)

    DT = pd.to_datetime(Date,format="%Y-%m")
    
    #   title and lot number
    title = extractitem(tree,'title')
    Lotid = re.match('Kavel (.*)',title).group(1)
    #Lotid = title[len('Kavel '):]
    if Lotid.startswith('K'):
        Lot = int(Lotid[-4:])
    else:
        Lot = int(Lotid)

    #   index
    IX = "-".join([str(DT.year),str(DT.month),str(Lot)])

    if Verbose>0: print(IX)
    
    
    #
    # extract images
    #
    
    
    image_urls = [re.sub('\/catalogus','',url) + item for item in extractitem(tree,'images',disp=Verbose>2)]
    
    #if Verbose>0: print(image_urls)
            
     
    #
    # Price
    #
    
    [Price,Draw] = extractitem(tree,'price',disp=Verbose>2)
        
        
        
    #    
    # add to data frame
    #
    
    out = pd.concat([out,pd.DataFrame({'Source' : AllKavelUrl[iK],
                                      'Title' : title,
                                      'Price' : Price,
                                      'Draw' : Draw,
                                      'Raw_text' : [extractitem(tree,'text')],
                                      'N_images' : len(image_urls),
                                      'Images' : [image_urls]},
                                      index = [IX])])
out.tail()

Unnamed: 0,Source,Title,Price,Draw,Raw_text,N_images,Images
2019-4-7347,http://www.domeinenrz.nl/catalogus/verkoop_bij...,Kavel K1900047347,488.0,False,"[K1900047347\r, Aanhangwagen\r, Enkelasser\r, ...",3,[http://www.domeinenrz.nl/ufc/static/155360666...
2019-4-7348,http://www.domeinenrz.nl/catalogus/verkoop_bij...,Kavel K1900047348,24590.0,False,"[K1900047348\r, Vrachtwagencombinatie\r, Trekk...",3,[http://www.domeinenrz.nl/ufc/static/155360666...
2019-4-7349,http://www.domeinenrz.nl/catalogus/verkoop_bij...,Kavel K1900047349,3115.0,False,"[K1900047349, Trekker, VOLVO, Type fm12 4x2t f...",3,[http://www.domeinenrz.nl/ufc/static/155360666...
2019-4-7350,http://www.domeinenrz.nl/catalogus/verkoop_bij...,Kavel K1900047350,3689.0,False,"[K1900047350, Tankcontainer, VAN HOOL, Inhoud ...",2,[http://www.domeinenrz.nl/ufc/static/155360666...
2019-4-7351,http://www.domeinenrz.nl/catalogus/verkoop_bij...,Kavel K1900047351,3971.0,False,"[K1900047351, Bedrijfswagen, DAF, Type ae 45 c...",3,[http://www.domeinenrz.nl/ufc/static/155360666...


### In depth parsing
Do some more sofisticated parsing. Use `Raw_text` as input.  
When a line is not recognized. It will be printed to console. One might choose to add to the external text files if a fragment of tag occurs often.

In [9]:
Verbose = 0
# parse raw text
for IX in out.index :
    
    # find info
    
    rt = out.loc[IX,"Raw_text"]
    
    # first line:
    
    # Is it a draw?
    Val = rt.pop(0) 
    if Val == 'Na loting':
        Val = rt.pop(0) # val is now kavelnr
        out.loc[IX,"Draw"] = True
    else:
        out.loc[IX,"Draw"] = False
    
    # when lot number is followed by an asteriks there is a note
    if Val.endswith('*\r'):
        Val = Val[0:-2]
        out.loc[IX,"Note"] = True
    else :
        Val = Val.strip()
        out.loc[IX,'Note'] = False
        
    if Verbose>0:
        print(Val)

    # store lot nr        
    out.loc[IX,"LotNr"]=Val
    
    
    # second line
    out.loc[IX,"LotType"]=rt.pop(0).strip()

    # third line
    Val = rt.pop(0).strip()
    # This line is brand or optional line with type of lot
    # All caps is brand
    if Val in ['Quad','Kampeerwagen/ camper','Pleziervaart motorvaartuig met opbouw en open kuip','Rubberboot'] or not Val.isupper():
        out.loc[IX,"LotType"] += ''.join([' (' + Val + ')'])
        if Verbose>0:print(Val)
        Val = rt.pop(0).strip() # now it is brand
    out.loc[IX,"ItemBrand"]=Val

    
    
    # escape characters, repair typos and translate 
    for i in range(len(rt)):
        
        # encode string as bytes
        rt[i] = rt[i].encode('ascii',errors='xmlcharrefreplace')
        
        # replace text
        for pat,sub in zip(repfragments.Pattern,repfragments.Replace):
            rt[i] = re.sub(pat.encode('ascii',errors='xmlcharrefreplace'),sub.encode('ascii',errors='xmlcharrefreplace'),rt[i])
        
        # decode back to string, but special characters escaped to xml
        rt[i]=rt[i].decode('ascii')

    # Pull value after trailing or leading pattern (bgntag/endtag)
    for Tag,Field in zip(tags.Pattern,tags.Field):
        M = re.search(Tag,'\n'.join(rt))
        if M:
            Val = M.group('val')
            if Verbose>2:
                print(str(Field) + ' : ' + M.group(0).replace('\n','[newline]') + '\n\t' + '|' + Val + '|')
            # remove pattern and make rt a list again.
            rt = '\n'.join(rt).replace(M.group(0),'').split('\n')
        else:
            Val = ''
        out.loc[IX,Field] = Val        

    # Pattern in full text? (flagtag)
    for Tag,Field in zip(flagtags.Pattern,flagtags.Field):
        # flagtags might occur more than once, hence a list of finditer results
        Ms = list(re.finditer(Tag,'\n'.join(rt)))
        if Ms:
            Val = True
            for M in Ms:
                if Verbose>2:
                    print(str(Field) + ' : ' + M.group(0).replace('\n','[newline]') + '\n\t' + '|' + str(Val) + '|')
                # remove pattern and make rt a list again.
                rt = '\n'.join(rt).replace(M.group(0),'').split('\n')
        else:
            Val = False
        out.loc[IX,Field] = Val

        
        
    # loop trough remaining lines

    for line in rt:
               
        # do comparison in bytes
        line = line.encode('ascii',errors='xmlcharrefreplace')
        if Verbose>2:
            print(line)
            
        # parsing
        isParsed = False # some accounting: in the end this line should be parsed
         
        # line is empty.. skip .. next
        if not line :# empty
            isParsed = True
            continue
            
        # line starting with '*' is a note
        if out.loc[IX,'Note'] and line.startswith(bytes('*','ascii')):
            if Verbose>2:
                print('\tNote:',end='')
                print(out.loc[IX,'Note'],end='')
                print(line)
            Val = line[1:].decode('ascii')
            out.loc[IX,'Note'] = Val
            isParsed = True
            continue
                
        if isParsed == False:
            line = line.decode('ascii')
            
            # create empty string if not exist
            if (
                'SupInfo' not in out.loc[IX].index
            ) or (
                (
                    type(out.loc[IX,'SupInfo']) != str
                ) and (
                    pd.np.isnan(out.loc[IX,'SupInfo'])
                )
            ):
                out.loc[IX,'SupInfo'] = ''
            out.loc[IX,"SupInfo"] = '\n'.join([out.loc[IX,'SupInfo'] , str(line)])
            print(str(IX) + '[' + str(line) + ']')
  

2019-4-1819[Uitlaatsteun is afgescheurd]
2019-4-1830[Inclusief beenkleed]
2019-4-2005[Diesel*]
2019-4-2005[*Deze kavel is aangepast d.d. 29.03.2019]
2019-4-2411[Voertuig is niet eerder geregistreerd.]
2019-4-2412[Er moeten nieuwe kentekenplaten op het voertuig]
2019-4-7114[Motorkap sluit niet]
2019-4-7168[&#201;&#233;n wiel aan het voertuig gemonteerd]
2019-4-7168[Overige wielen op achterbank]
2019-4-7168[Extra wielen onder auto (zie foto's)]
2019-4-7178[kenteken SJ-166-J]
2019-4-7189[Voertuig heeft hagelschade]
2019-4-7202[In Frankrijk geregistreerd als 2 persoons bedrijfswagen, fiscaal "P".]
2019-4-7202[Modificatie naar personenauto kan alleen met CVO of via IGK]
2019-4-7209[Gat in carterpan]
2019-4-7209[Geen olie in motorblok]
2019-4-7302[Inclusief 4 velgen waarvan 2 met band]
2019-4-7303[Accu niet aanwezig]
2019-4-7326[Oplegger]
2019-4-7326[GROENEWOLD]
2019-4-7326[Type tsph lk]
2019-4-7326[Kenteken WR-09-YP]
2019-4-7326[Datum eerste toelating 10.04.1997]
2019-4-7340[Maximaal vermog

In [10]:
out.tail()

Unnamed: 0,Source,Title,Price,Draw,Raw_text,N_images,Images,Note,LotNr,LotType,...,no_nlreg194,no_regneeded,rhd,no_odo,no_road,disclaim_cr6,crewcab,carwrap,no_vin,SupInfo
2019-4-7347,http://www.domeinenrz.nl/catalogus/verkoop_bij...,Kavel K1900047347,488.0,False,"[Type h434, Niet kentekenplichtig, Zonder kent...",3,[http://www.domeinenrz.nl/ufc/static/155360666...,False,K1900047347,Aanhangwagen (Enkelasser),...,False,True,False,False,False,False,False,False,False,
2019-4-7348,http://www.domeinenrz.nl/catalogus/verkoop_bij...,Kavel K1900047348,24590.0,False,"[Type ft xf 105, Kenteken BV-XS-15, Afgelezen ...",3,[http://www.domeinenrz.nl/ufc/static/155360666...,False,K1900047348,Vrachtwagencombinatie (Trekker),...,False,False,False,True,False,False,False,False,False,\nOplegger\nVAN HOOL\nType t-315\nKenteken OB-...
2019-4-7349,http://www.domeinenrz.nl/catalogus/verkoop_bij...,Kavel K1900047349,3115.0,False,"[Type fm12 4x2t fal7.1 rad-a4; high, Automatic...",3,[http://www.domeinenrz.nl/ufc/static/155360666...,False,K1900047349,Trekker,...,True,False,False,False,False,False,False,False,False,
2019-4-7350,http://www.domeinenrz.nl/catalogus/verkoop_bij...,Kavel K1900047350,3689.0,False,"[Inhoud 22000 liter, De kavelomschrijving op d...",2,[http://www.domeinenrz.nl/ufc/static/155360666...,False,K1900047350,Tankcontainer,...,False,False,False,False,False,False,False,False,False,\nInhoud 22000 liter
2019-4-7351,http://www.domeinenrz.nl/catalogus/verkoop_bij...,Kavel K1900047351,3971.0,False,"[Type ae 45 ce, Nederlands kenteken, Afgelezen...",3,[http://www.domeinenrz.nl/ufc/static/155360666...,False,K1900047351,Bedrijfswagen,...,True,False,False,False,False,False,False,False,False,\nDit voertuig staat voor export geregistreerd...


### Save results to disk

In [11]:
file_name = '../data/drz-data-{}.pkl'.format(Date)
print(file_name)
out.to_pickle(file_name)

../data/drz-data-2019-04.pkl


# Next: add rdw data

Because rdw data changes constantly it is advisable to run the notebook that adds rdw data to the above results soon.