<a id='rdw_top'></a>

# Add extra information from RDW open data

Query the open data datasets of the RDW.


1. <a href="#rdw_registrations">Registration numbers</a>  
    APIs with the registration number as key
2. <a href="#rdw_confcodes">Conformity codes</a>  
    Cars get a conformity code when certified.
3. <a href="#rdw_other_apis">Other APIs</a>  
    It may take a while (10 min) to query all conformity codes individually.
4. <a href="#rdw_ovi">Closed data</a>  
    Get closed data from the RDW website. This also takes a while because of the time-out enforced by the website. Use the config to disable.
5. <a href="#rdw_merge">Merge results</a>  
    Combine all dataframes and save
6. <a href="#rdw_save">Save results</a>  
- - - - 

### User variables


In [None]:
import os

# Move the working directory one level up; drz_config is imported after the
# directory change, in the same order as before.
os.chdir(r'..')

import drz_config

# Read the run settings once and expose the ones used in this notebook.
cfg = drz_config.read_config()
VERBOSE = cfg['VERBOSE']
SKIPSAVE = cfg['SKIPSAVE']
CLOSEDDATA = cfg['CLOSEDDATA']
closed_data_fields = cfg['closed_data_fields']
DATE = cfg['DATE']
OPBOD = cfg['OPBOD']

# Two-character counter cut from the tail of the URL setting: the last two
# characters normally, the two characters before those when OPBOD is set.
month_counter = cfg['URL'][-4:-2] if OPBOD else cfg['URL'][-2:]

QUICK_MERGE = False # check if rdw data already exist (ran when auction was still open)

# Show the full configuration when verbosity is enabled.
if VERBOSE > 0:
    display(cfg)

### Modules and functions

In [2]:
import pandas as pd
import numpy as np
import re 
import os
# to keep api key hidden import this from sub dir
import assets.hidden_api_keys as hidden_api_keys
from time import sleep
import urllib

# Base query url of RDW dataset m9d7-ebf2 with the app token appended;
# ends in '&' so a query parameter can be added directly behind it.
apiurl = ''.join([
    'https://opendata.rdw.nl/resource/m9d7-ebf2.json',
    '?$$app_token=',
    hidden_api_keys.socrata_apptoken,
    '&',
])

def get_json_from_api(url,reg,c=0):

    '''Get json object from api.

    Queries ``url + 'kenteken=' + <registration>`` with ``pd.read_json`` and
    returns the result converted with ``DataFrame.to_dict()``.

    Parameters
    ----------
    url : str
        Query url that already ends in ``?`` or ``&``.
    reg : str
        Registration; dashes are removed and it is upper-cased before use.
    c : int, optional
        Number of attempts already made (kept for backward compatibility with
        the former recursive implementation).

    Returns
    -------
    dict
        ``{column: {row index: value}}`` as produced by ``DataFrame.to_dict()``.

    Raises
    ------
    Exception
        The last error is re-raised once the attempt counter exceeds 10.
        ``KeyboardInterrupt`` and ``SystemExit`` are never retried.
    '''

    import time

    MAX_ATTEMPTS = 10   # the attempt counter may reach this value and still retry
    PAUSE_SEC = 2

    query = url + 'kenteken=' + reg.replace('-','').upper()
    attempt = c
    while True:
        attempt += 1
        try:
            return pd.read_json(query).to_dict()
        except Exception:
            # `Exception` instead of a bare except: interrupting the kernel
            # must stop the loop instead of triggering another retry.
            if attempt > MAX_ATTEMPTS:
                print(url,reg)
                raise
            print('pause 2 sec and try again!')
            time.sleep(PAUSE_SEC)
    
# get_json_from_api(apiurl,'61-sf-FG')

### Load auction results

In [None]:
# Preferred input: the auction results file, with the '-opbod' variant when OPBOD is set.
file_name = f'../data/auctions/results/drz-data-{DATE}-{month_counter}.pkl'
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')

# When that file is absent, fall back to the '-without-price' variant
# and remember this in NO_PRICE.
NO_PRICE = not os.path.isfile(file_name)
if NO_PRICE:
    if OPBOD:
        file_name = file_name.replace('-opbod.pkl', '-opbod-without-price.pkl')
    else:
        file_name = file_name.replace('.pkl', '-without-price.pkl')

print(file_name)
drz = pd.read_pickle(file_name)

In [4]:
# load existing RDW data and merge with price
if QUICK_MERGE:
    file_name = f'../data/auctions/results/rdw-data-{DATE}-{month_counter}.pkl'
    if os.path.isfile(file_name):
        # The final file (the one without the '-without-price' suffix) is
        # already on disk. A bare `raise` outside an except block ends in
        # "RuntimeError: No active exception to reraise"; raise the same
        # exception type with a message that says what is going on.
        raise RuntimeError(f'Quick merge not needed: {file_name} already exists.')
    file_name = file_name.replace('.pkl', '-without-price.pkl')
    print(file_name)
    rdw = pd.read_pickle(file_name)

    # add price
    rdw.update(drz.Price)

    # save
    out = rdw.copy()
    # Deliberate stop kept from the original cell ("is path correct?"):
    # everything below stays unreachable until this line is removed.
    raise RuntimeError('Quick merge stops before saving: verify that the output path is correct.')
    file_name = file_name.replace('-without-price.pkl','.pkl')
    if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
        print(file_name)
        out.to_pickle(file_name)
    else:
        print(f'Skip. {file_name} exists or saving is disabled in settings.')

    assert False, "Stop running. Everything is done"
else:
    if VERBOSE: print('Ok continue running. Quick merge is:', QUICK_MERGE)

Ok continue running. Quick merge is: False


### Collect registrations

In [None]:
# Flag the lots that carry a usable registration (license number): the field
# is filled, is not 'onbekend' or empty, and the lot type is not in the list below.
hasReg = (
    drz.Reg.notnull()
    & drz.Reg.ne('onbekend')
    & drz.Reg.ne('')
    & ~drz.LotType.isin([
        'Vaartuig',
        'Jetski',
        'Sloep',
        'Speedboot',
        'Vaartuig (Type onbekend)',
        'Motorvaartuig met opbouw (Pleziervaartuig)',
    ])
)

print('nr. of registrations:',sum(hasReg))

# adhoc fix: registrations that were checked in the pictures and found wrong
for idx, corrected in (
    ('2022-08-5012', 'LM-82-14'),
    ('2022-29-5001', 'LM-82-14'),
    ('2022-29-2008', 'KT-05-40'),
):
    if idx in drz.index:
        drz.loc[idx, 'Reg'] = corrected

# A normalised registration (upper case, no dashes) may occur in one lot only.
vc = drz.loc[hasReg, 'Reg'].str.upper().str.replace('-','').value_counts()
dup = vc[vc>1]
if len(dup) > 0:
    display(dup)
    display(drz[drz.Reg.str.upper().str.replace('-','').isin(dup.index)])
    raise ValueError('Registration occurs in more than one lot.')


In [6]:
def get_query_url(api_url, keys, token=hidden_api_keys.socrata_apptoken, query_field='kenteken'):
    '''Construct a query url that selects rows whose `query_field` is in `keys`.

    Parameters
    ----------
    api_url : str
        Url of the resource (without query string).
    keys : iterable
        Values to look for; each one is placed between single quotes.
    token : str
        App token added as ``$$app_token``.
    query_field : str
        Field used in the ``$where ... in (...)`` clause.

    Returns
    -------
    str
        ``api_url`` followed by the app token and the url-escaped ``$where`` clause.

    Raises
    ------
    ValueError
        If `query_field` is not listed in the ``X-SODA2-Fields`` response header.
    TypeError
        If the response carries no ``X-SODA2-Fields`` header.
    '''

    import json
    import urllib.parse
    import urllib.request

    # See if field is available. The header is parsed with json.loads instead
    # of eval, so text received from the server is never executed as code.
    # NOTE(review): assumes the header holds a JSON array of field names - confirm.
    with urllib.request.urlopen(api_url) as resp:
        fields_header = resp.headers.get('X-SODA2-Fields')
    if fields_header is None:
        raise TypeError(f'No X-SODA2-Fields header in response of {api_url}.')
    accepted_fields = json.loads(fields_header)
    if query_field not in accepted_fields:
        raise ValueError(f'<{query_field}> not allowed as SODA2 field in {api_url}.\n\tAccepted are: ' + ', '.join(accepted_fields))

    # convert list to string: 'a', 'b', 'c'
    id_list = ', '.join("'{}'".format(k) for k in keys)

    # First part, followed by the escaped soql
    soql = '$where=' + urllib.parse.quote(query_field + ' in (' + id_list + ')')
    return api_url + '?$$app_token=' + token + '&' + soql

def long_to_wide(long, rank_col, index_name=None):

    '''
    Convert dataframe to wide by appending rank.

    Rows that share the same `index_name` value are numbered by the order of
    `rank_col` (ties keep their order of appearance). The result has one row
    per `index_name` value and one column ``<column>_<rank>`` per original
    column and rank, plus a count column named ``nr_of_<index_name>_x_<rank_col>``.

    Parameters
    ----------
    long : pandas.DataFrame
        Input in long format. It is not modified.
    rank_col : str
        Column that determines the rank within a group; it is dropped from the result.
    index_name : str, optional
        Name to group on. Defaults to the name of the index of `long`.

    Returns
    -------
    pandas.DataFrame
        Wide dataframe indexed by `index_name`.
    '''

    # input
    if index_name is None:
        index_name = long.index.name

    # ranking; stored in a new frame so the caller's dataframe stays untouched
    rank_name = f'rank_{index_name}_x_{rank_col}'
    ranks = long.groupby(index_name)[rank_col].rank(method='first').astype(int)
    ranked = long.assign(**{rank_name: ranks})

    # append rank to index
    ranked = ranked.reset_index().set_index([index_name, rank_name]).sort_index()
    assert ranked.index.is_unique

    # make wide and append rank to column name
    wide = ranked.drop(columns=rank_col).unstack()
    wide.columns = wide.columns.map('{0[0]}_{0[1]}'.format)

    # add count: the highest rank within a group is the number of rows in it
    count_col = rank_name.replace('rank_', 'nr_of_')
    wide[count_col] = ranked.reset_index().groupby(index_name)[rank_name].max()

    return wide

Create a dictionary of dataframes, one per API result

In [None]:
# Dictionary that collects one dataframe per api.
# Its first entry holds the registrations themselves: a copy of the drz lots
# with a registration, keyed on the normalised registration ('kenteken');
# the former index is kept as column 'lot_index'.
key = 'registrations'
rdw_per_reg = {
    key: (
        drz.loc[hasReg, ['Reg', 'LotType']]
        .assign(kenteken=lambda lots: lots.Reg.apply(lambda r: r.replace('-','').upper()))
        .rename_axis('lot_index')
        .reset_index()
        .set_index('kenteken')
    )
}

# Show the registrations per lot type, without truncating the table.
with pd.option_context('display.max_rows', 999):
    display(rdw_per_reg[key].reset_index().set_index(['LotType', 'kenteken']).sort_index())

print('\n'.join(rdw_per_reg.keys()))

<a href="#rdw_top" id='rdw_registrations'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Main api 

The main api: `api_gekentekende_voertuigen` points to subsequent apis.

In [None]:
# Registrations to query: upper case, dashes removed
regs = [r.replace('-','').upper() for r in rdw_per_reg['registrations'].Reg.values]
print(len(regs),'registrations in this set')

In [None]:
# Do main api first to get other possible apis
api_name = 'api_gekentekende_voertuigen'
api_url = 'https://opendata.rdw.nl/resource/m9d7-ebf2.json'

# Query api
key = re.sub('^api_','', api_name) # store with this key
q = get_query_url(api_url,regs)
rdw_per_reg[key] = pd.read_json(q)
rdw_per_reg[key].set_index('kenteken', inplace=True)

print(api_name, end=': ')
if VERBOSE > 1:
    display(rdw_per_reg[key])
print(rdw_per_reg[key].shape)

# Some apis return multiple values. Pivot around index number ("volgnummer")
pivot_column_per_api = {
    'api_gekentekende_voertuigen_assen': 'as_nummer',
    'api_gekentekende_voertuigen_brandstof': 'brandstof_volgnummer',
    'api_gekentekende_voertuigen_carrosserie': 'carrosserie_volgnummer',
    'api_gekentekende_voertuigen_carrosserie_specifiek': 'carrosserie_voertuig_nummer_code_volgnummer',
}

# Query other available apis: every column of the main result that starts
# with 'api' holds the url(s) of a follow-up api.
for api_name in [c for c in rdw_per_reg['gekentekende_voertuigen'].columns if c.startswith('api')]:
    print(api_name, end=': ')
    key = re.sub('^api_','',api_name)
    short_name = re.sub('^api_gekentekende_voertuigen_','',api_name)
    for api_url in rdw_per_reg['gekentekende_voertuigen'][api_name].unique():
        print(api_url)
        # query the web
        q = get_query_url(api_url,regs)
        df0 = pd.read_json(q)
        # name of index
        df0.columns.name = api_name

        # query should return 'kenteken', make it the index
        if df0.shape[0] != 0:
            df0.set_index('kenteken', inplace=True)
            if api_name in pivot_column_per_api:
                df0 = pd.pivot(df0, columns=pivot_column_per_api[api_name])

        # squeeze multi index and prefix with the short api name.
        # Only pivoted results have tuples as column labels; plain string
        # labels must not be iterated, that would split them into characters.
        if isinstance(df0.columns, pd.MultiIndex):
            flat_names = ['_'.join(str(part) for part in col) for col in df0.columns]
        else:
            flat_names = [str(col) for col in df0.columns]
        df0.columns = [short_name + '_' + name for name in flat_names]

        # add to list
        if VERBOSE > 1:
            display(df0.tail(3))
        print(df0.shape)
        rdw_per_reg[key]=df0


print('\n'.join(rdw_per_reg.keys()))

In [None]:
# Save: everything collected so far in one frame, stamped with today's date
out = pd.concat(rdw_per_reg, axis=1)
out['TimeStamp'] = pd.Timestamp.now().strftime('%Y%m%d')

# Target path; each active flag rewrites the '.pkl' ending in turn
file_name = f'../data/auctions/enriched-results/rdw-reg/rdw-reg-main-0-data-{DATE}-{month_counter}.pkl'
file_name = file_name.replace('.pkl', '-without-price.pkl' if NO_PRICE else '.pkl')
file_name = file_name.replace('.pkl', '-opbod.pkl' if OPBOD else '.pkl')

# Write only when saving is enabled and the file is not there yet
if not (SKIPSAVE == False) or os.path.isfile(file_name):
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
else:
    print(file_name)
    out.to_pickle(file_name)

# Keys handled so far; the first key is taken out of this list
done_keys = list(rdw_per_reg.keys())
del done_keys[0]

# Inspection api

"APK"

In [None]:
# Query the inspection-report api ('meldingen_keuringsinstantie') for all
# registrations and reshape it to one row per registration.
api_name = 'api_meldingen_keuringsinstantie'
api_url = 'https://opendata.rdw.nl/resource/sgfe-77wx.json'

# Query api; every column gets the key as prefix, the prefixed 'kenteken' becomes the index
key = re.sub('^api_','', api_name) # store with this key
q = get_query_url(api_url,regs)
df0 = pd.read_json(q).add_prefix(key + '_')
df0.set_index(key + '_kenteken', inplace=True)

print(api_name, end=': ')
print(api_url)

# Add a datetime version of the report date (same column name with a trailing
# underscore) and use it to rank the rows per registration when going wide.
long = df0.copy()
long.loc[:, key + '_meld_datum_door_keuringsinstantie_dt_'] = pd.to_datetime(long.loc[:, key + '_meld_datum_door_keuringsinstantie_dt'])
df0 = long_to_wide(long, key + '_meld_datum_door_keuringsinstantie_dt_')
# Shorten the name of the count column created by long_to_wide
df0.rename(columns={'nr_of_meldingen_keuringsinstantie_kenteken_x_meldingen_keuringsinstantie_meld_datum_door_keuringsinstantie_dt_': f'nr_of_{key}'}, inplace=True)

# drop those that didn't need to go long:
# columns named '<key>_api_..._<rank>' are collapsed into one column per api.
api_cols = [re.match('^(.*)_[0-9]+$',c)[1] for c in df0.columns if c.startswith(f'{key}_api_')]
api_cols = np.unique(api_cols)
for col in api_cols:
    sel = df0.columns.str.startswith(col)
    rm_cols = df0.columns[sel]
    # Forward-fill across the ranked columns, drop duplicated columns and keep
    # the first one; it is stored under the name with '<key>_' removed.
    df0.loc[:, re.sub(f'{key}_', '', col)] = df0.loc[:, sel].ffill(axis=1).T.drop_duplicates().T.iloc[:,0]
    df0.drop(columns=rm_cols, inplace=True)

print(df0.shape)
rdw_per_reg[key] = df0
if VERBOSE > 1:
    display(rdw_per_reg[key])
else:
    print('\n'.join(rdw_per_reg.keys()))

In [None]:
# Query other available apis
for api_name in ['api_gebrek_constateringen']: # columns in 'meldingen_keuringsinstantie'
    print(api_name, end=': ')
    key = re.sub('^api_','',api_name)
    for api_url in rdw_per_reg['meldingen_keuringsinstantie'][api_name].unique():
        # `not` instead of `~`: bitwise inversion of a Python bool gives -1 or
        # -2, which are both truthy, so the test used to pass for every url.
        if not api_url.endswith('.json'):
            # translate: last non-empty part of the url is the resource code
            code = [t for t in api_url.split('/') if len(t)>0][-1]
            api_url = f'https://opendata.rdw.nl/resource/{code}.json'
        print(api_url)

        # query the web
        q = get_query_url(api_url,regs)
        df0 = pd.read_json(q).add_prefix(key + '_')
        # name of index
        df0.columns.name = api_name

        # query should return 'kenteken', make it the index
        if df0.shape[0] != 0:
            df0.set_index(key + '_kenteken', inplace=True)

        # add to list
        print(df0.shape)
        rdw_per_reg[key]=df0

# Add description that is in different api
api_name = 'api_gebrek_beschrijving'
print(api_name, end=': ')
key = re.sub('^api_','',api_name)
api_url = rdw_per_reg['meldingen_keuringsinstantie'][api_name].unique()
assert len(api_url) == 1
api_url = api_url[0]
if not api_url.endswith('.json'):
    # translate: last non-empty part of the url is the resource code
    code = [t for t in api_url.split('/') if len(t)>0][-1]
    api_url = f'https://opendata.rdw.nl/resource/{code}.json'
# Printed for every url (as before, when the test above was always true),
# so the line started with the api name is always completed.
print(api_url)

# query the web if there are registrations
if rdw_per_reg['gebrek_constateringen'].shape[0] > 0:
    q = get_query_url(api_url,
                      rdw_per_reg['gebrek_constateringen'].gebrek_constateringen_gebrek_identificatie.unique(),
                      query_field='gebrek_identificatie')
    df0 = pd.read_json(q).add_prefix('gebrek_constateringen_')
    # name of index
    df0.columns.name = api_name

    # query should return 'gebrek_identificatie', make it the index
    if df0.shape[0] != 0:
        df0.set_index('gebrek_constateringen_gebrek_identificatie', inplace=True)
    # Join description
    rdw_per_reg['gebrek_constateringen'] = rdw_per_reg['gebrek_constateringen'].join(df0, on='gebrek_constateringen_gebrek_identificatie')

    # make wide, ranked per registration on a datetime copy of the report date
    long = rdw_per_reg['gebrek_constateringen'].copy()
    long.loc[:, 'gebrek_constateringen_meld_datum_door_keuringsinstantie_dt_'] = pd.to_datetime(long.gebrek_constateringen_meld_datum_door_keuringsinstantie_dt)
    rdw_per_reg['gebrek_constateringen'] = long_to_wide(long, 'gebrek_constateringen_meld_datum_door_keuringsinstantie_dt_')
    rdw_per_reg['gebrek_constateringen'].rename(columns={'nr_of_gebrek_constateringen_kenteken_x_gebrek_constateringen_meld_datum_door_keuringsinstantie_dt_': 'nr_of_gebrek_constateringen'}, inplace=True)

    print(rdw_per_reg['gebrek_constateringen'].shape)

if VERBOSE > 1:
    display(rdw_per_reg['gebrek_constateringen'])
else:
    print('\n'.join(rdw_per_reg.keys()))
    



In [None]:
# .. and some more apis that are queried per registration.
for api_name, api_url in {
    'api_toegevoegde_objecten': 'https://opendata.rdw.nl/resource/sghb-dzxx.json',
    'api_keuringen': 'https://opendata.rdw.nl/resource/vkij-7mwc.json',
}.items():

    print(api_name, end=': ')
    print(api_url)

    # Query api; all columns get the key as prefix
    key = re.sub('^api_','', api_name) # store with this key
    q = get_query_url(api_url,regs)
    df0 = pd.read_json(q).add_prefix(key + '_')
    print(df0.shape)

    # Nothing returned: report it and go on with the next api
    if df0.shape[0] == 0:
        print('NO RESULTS')
        continue
    df0 = df0.set_index(key + '_kenteken')

    # make wide (only done for 'toegevoegde_objecten', ranked on the mounting date column)
    if key == 'toegevoegde_objecten':
        df0 = long_to_wide(df0.copy(), key + '_montagedatum')
        df0 = df0.rename(columns={'nr_of_toegevoegde_objecten_kenteken_x_toegevoegde_objecten_montagedatum': f'nr_of_{key}'})

    rdw_per_reg[key] = df0
    if VERBOSE > 1:
        display(rdw_per_reg[key])
    else:
        print('\n'.join(rdw_per_reg.keys()))

In [None]:
# Save: only the dataframes whose key is not in done_keys, stamped with today's date
out = pd.concat({k: v for k, v in rdw_per_reg.items() if k not in done_keys}, axis=1)
out['TimeStamp'] = pd.Timestamp.now().strftime('%Y%m%d')

# Target path; each active flag rewrites the '.pkl' ending in turn
file_name = f'../data/auctions/enriched-results/rdw-reg/rdw-reg-apk-0-data-{DATE}-{month_counter}.pkl'
file_name = file_name.replace('.pkl', '-without-price.pkl' if NO_PRICE else '.pkl')
file_name = file_name.replace('.pkl', '-opbod.pkl' if OPBOD else '.pkl')

# Write only when saving is enabled and the file is not there yet
if not (SKIPSAVE == False) or os.path.isfile(file_name):
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
else:
    print(file_name)
    out.to_pickle(file_name)

# Keys handled so far; the first key is taken out of this list
done_keys = list(rdw_per_reg.keys())
del done_keys[0]


## Merge all registration

In [None]:
# Are columns unique?
col_names = []
for n in [list(c.columns) for c in rdw_per_reg.values()]:
    col_names += n
if not pd.Series(col_names).is_unique:
    # Name the counts before building the frame: the column that
    # value_counts().to_frame() produces is called 0 in older pandas versions
    # and 'count' in newer ones, so renaming column 0 is not reliable.
    display(pd.Series(col_names).value_counts().rename('occurance').to_frame().query('occurance > 1').sort_index())
    raise LookupError('Dataframes share column names. Add a suffix to column names might solve this.')

# Merge dataframes from different apis
df_regs = pd.concat(rdw_per_reg.values(), axis='columns', sort=False)
# add timestamp
df_regs['TimeStamp'] = pd.Timestamp.now().strftime('%Y%m%d')
# set lot id as index; the registration is kept as column 'kenteken'
df_regs.index.name = 'kenteken'
df_regs = df_regs.reset_index().set_index('lot_index')
assert df_regs.kenteken.is_unique, 'Index <kenteken> is not unique.'
if VERBOSE > 1:
    display(df_regs)
else:
    print(df_regs.shape)

In [None]:
# Save: the merged registration data (same object as df_regs, not a copy),
# with the timestamp refreshed
out = df_regs
out['TimeStamp'] = pd.Timestamp.now().strftime('%Y%m%d')

# Target path; each active flag rewrites the '.pkl' ending in turn
file_name = f'../data/auctions/enriched-results/rdw-reg/rdw-reg-full-0-data-{DATE}-{month_counter}.pkl'
file_name = file_name.replace('.pkl', '-without-price.pkl' if NO_PRICE else '.pkl')
file_name = file_name.replace('.pkl', '-opbod.pkl' if OPBOD else '.pkl')

# Write only when saving is enabled and the file is not there yet
if not (SKIPSAVE == False) or os.path.isfile(file_name):
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
else:
    print(file_name)
    out.to_pickle(file_name)

<a href="#rdw_top" id='rdw_confcodes'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Conformity codes

In [None]:
# empty dict
rdw_per_confcode = dict()
# Conformity codes have sub-divisions. Four fields make a super key
key = 'conformity_codes'
rdw_per_confcode[key] = rdw_per_reg['gekentekende_voertuigen'][[
    'typegoedkeuringsnummer',
    'uitvoering',
    'variant',
    'volgnummer_wijziging_eu_typegoedkeuring'
]].dropna().drop_duplicates()
rdw_per_confcode[key].reset_index(drop=True, inplace=True)
rdw_per_confcode[key].volgnummer_wijziging_eu_typegoedkeuring = rdw_per_confcode[key].volgnummer_wijziging_eu_typegoedkeuring.astype(int)
print(len(rdw_per_confcode[key]),'conformity codes in this set')

if VERBOSE > 1:
    display(rdw_per_confcode[key])

print(f'{key} occurs more than once.')
# Per first key field: number of distinct values of the other fields and of
# 'kenteken'. Counts of 1 are blanked first so that rows with only single
# occurrences can be dropped; remaining blanks are set back to 1.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.
display(
    rdw_per_reg['gekentekende_voertuigen'].loc[:, rdw_per_confcode[key].columns]
    .reset_index()
    .groupby([rdw_per_confcode[key].columns[0]])
    .nunique()
    .replace(1,np.nan)
    .dropna(how='all')
    .fillna(1)
    .astype(int)
    .sort_values(by='kenteken', ascending=False)
)

In [None]:
# do conformity api and again get other possible apis
api_name = 'api_eeg_voertuigtypegoedkeuring'
api_url = 'https://opendata.rdw.nl/resource/55kv-xf7m.json'
key = re.sub('^api_','', api_name) # store with this key

# Query api on the long (year with century) version of the conformity code
q = get_query_url(
    api_url,
    rdw_per_confcode['conformity_codes'].typegoedkeuringsnummer.unique(),
    query_field='typegoedkeuringsnummer'
)
rdw_per_confcode[key] = pd.read_json(q)

# every conformity code may be returned once at most
assert rdw_per_confcode[key].groupby('typegoedkeuringsnummer')['typegoedkeuringsnummer'].count().le(1).all()
rdw_per_confcode[key] = rdw_per_confcode[key].set_index('typegoedkeuringsnummer')

# add slightly different keys (year has no century)
rdw_per_confcode['conformity_codes'] = pd.merge(
    rdw_per_confcode['conformity_codes'],
    rdw_per_confcode['eeg_voertuigtypegoedkeuring'].eu_type_goedkeuringssleutel,
    how='left',
    left_on='typegoedkeuringsnummer',
    right_index=True
)

print(api_name, end=': ')
if VERBOSE > 1:
    display(rdw_per_confcode[key])
print(rdw_per_confcode[key].shape)




<a href="#rdw_top" id='rdw_other_apis'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Other apis

In [19]:
def _query_all_conf_code_from_api(api_url, l = rdw_per_confcode['conformity_codes'].loc[:,'eu_type_goedkeuringssleutel'].to_list()):
    '''Get all rows of an api for a list of conformity keys in one request.

    Parameters
    ----------
    api_url : str
        Url of the resource (without query string).
    l : list
        Values of ``eu_type_goedkeuringssleutel`` to select. Missing values
        (NaN/None) are ignored. The default is evaluated once, when this
        function is defined.

    Returns
    -------
    pandas.DataFrame
        Result of the query; an empty dataframe when `l` holds no usable value.

    Raises
    ------
    NotImplementedError
        When the number of returned rows equals the row limit, i.e. the result
        may be truncated.
    '''

    import urllib.parse

    MAX_ROWS = 50000

    # Missing keys cannot be queried
    codes = [code for code in l if pd.notna(code)]
    if len(codes) == 0:
        return pd.DataFrame()

    # Build "('a','b')" by hand: str(tuple(...)) leaves a trailing comma for a
    # single element, "('a',)", which is not a valid list for the IN clause.
    in_list = '(' + ','.join(repr(code) for code in codes) + ')'
    in_list = re.sub(' ','',in_list)   # spaces are removed, as before

    q = api_url
    q += '?$$app_token=' + hidden_api_keys.socrata_apptoken
    q += '&$where='
    q += urllib.parse.quote(f"eu_type_goedkeuringssleutel IN {in_list:s}")
    q += f'&$limit={MAX_ROWS}'
    all_conf = pd.read_json(q)

    if all_conf.shape[0] == MAX_ROWS:
        raise NotImplementedError('Max nr of rows reached.')

    return all_conf

In [20]:
def _query_one_by_one_from_api(api_url, l = rdw_per_confcode['conformity_codes']):
    '''Query an api once for every row (conformity code) in `l`.

    Each request filters on the four key fields of the row. A text progress
    bar is shown and updated while querying: '.' not queried yet, '|' result
    found, 'x' no result.

    Parameters
    ----------
    api_url : str
        Url of the resource (without query string).
    l : pandas.DataFrame
        One row per conformity code with columns ``eu_type_goedkeuringssleutel``,
        ``uitvoering``, ``variant`` and ``volgnummer_wijziging_eu_typegoedkeuring``.
        The default is evaluated once, when this function is defined.

    Returns
    -------
    pandas.DataFrame
        All results below each other; the index holds the position of the
        queried row in `l`. Empty when nothing was found.

    Raises
    ------
    RuntimeError
        When one request fails 10 times in a row.
    '''
    # unfortunately this needs to be done one by one, because conformity code is not unique

    import urllib.parse

    MAX_TRIES = 10
    PAUSE_SEC = 10

    nr = l.shape[0]
    bar = ['.'] * nr
    disp_id = display({'text/plain': ''.join(bar)}, raw=True, display_id=True)

    results = []
    for c, (ix, row) in enumerate(l.iterrows()):
        disp_id.update({'text/plain': ''.join(bar)}, raw=True)
        q = api_url
        q += '?$$app_token=' + hidden_api_keys.socrata_apptoken
        q += '&{}=\'{}\''.format('eu_type_goedkeuringssleutel', urllib.parse.quote(row.eu_type_goedkeuringssleutel))
        q += '&{}=\'{}\''.format('eeg_uitvoeringscode', urllib.parse.quote(row.uitvoering))
        q += '&{}=\'{}\''.format('eeg_variantcode', urllib.parse.quote(row.variant))
        q += '&{}={:.0f}'.format('uitvoering_wijzigingsnummer', row.volgnummer_wijziging_eu_typegoedkeuring)

        for n_try in range(1, MAX_TRIES + 1):
            try:
                res = pd.read_json(q,
                                   dtype={
                                       'eu_type_goedkeuringssleutel': str,
                                       'eeg_uitvoeringscode': str,
                                       'eeg_variantcode': str,
                                       'uitvoering_wijzigingsnummer': int
                                   })
                break
            except Exception as err:
                if n_try == MAX_TRIES:
                    # A real exception: raising a plain string is a TypeError.
                    # The url is left out of the message, it holds the app token.
                    raise RuntimeError(
                        f'tried too often: {row.eu_type_goedkeuringssleutel} / {row.uitvoering} / {row.variant}'
                    ) from err
                sleep(PAUSE_SEC)

        if len(res) == 0:
            bar[c] = 'x'
            continue
        bar[c] = '|'

        res.index=[c] * res.shape[0]
        results.append(res)

    disp_id.update({'text/plain': ''.join(bar)}, raw=True)

    # Concatenate once at the end instead of growing a frame inside the loop
    if len(results) == 0:
        return pd.DataFrame()
    return pd.concat(results, axis=0)
    

In [21]:
def _reduce_to_current_set(all_conf, this_set = rdw_per_confcode['conformity_codes'].loc[:,['eu_type_goedkeuringssleutel', 'uitvoering', 'variant', 'volgnummer_wijziging_eu_typegoedkeuring']]):
    '''Keep only the rows of `all_conf` that match a key combination in `this_set`.

    Rows are matched on four key fields, which have different names in the two
    dataframes (see `idx_left` and `idx_right`). One character per row of the
    module-level ``rdw_per_confcode['conformity_codes']`` is printed as a
    progress bar: '|' when the key combination is in the result, 'x' otherwise.

    Parameters
    ----------
    all_conf : pandas.DataFrame
        Query result with (at least) the columns in `idx_right`, or an empty dataframe.
    this_set : pandas.DataFrame
        Key combinations to keep, with the columns in `idx_left`. The default
        is evaluated once, when this function is defined.

    Returns
    -------
    pandas.DataFrame
        Inner merge of `this_set` and `all_conf` without the last three
        `idx_left` columns, or an empty dataframe with the `idx_right` columns
        when `all_conf` is empty or no key combination matches.
    '''
    
    # Key fields, in matching order: names in this_set (left) and in all_conf (right)
    idx_left =   ['eu_type_goedkeuringssleutel', 'uitvoering',          'variant',         'volgnummer_wijziging_eu_typegoedkeuring']
    idx_right =  ['eu_type_goedkeuringssleutel', 'eeg_uitvoeringscode', 'eeg_variantcode', 'uitvoering_wijzigingsnummer']
    
    
    # Check whether there is anything to merge. `.T.reset_index().T` moves the
    # column labels into the data and leaves the labels 0..3, so both frames
    # can be merged on position; `indicator=True` adds the `_merge` column that
    # tells whether a combination is present in both.
    if \
    (all_conf.shape[0] == 0) \
    or ((this_set.loc[:,idx_left].T.reset_index().T.merge(
        all_conf.loc[:,idx_right].T.reset_index().T,
        how = 'outer',
        on = [0,1,2,3],
        indicator=True
    )._merge == 'both').any() == False):
        
        # None are in this_set
        out = pd.DataFrame(columns=idx_right) # empty
        
    else:    
        
        out = pd.merge(
            left = this_set,
            right = all_conf,
            how = 'inner',
            left_on = idx_left,
            right_on= idx_right,
        ).drop(columns=idx_left[1:]) # drop duplicate left_indices. They are in right_index.
    
    # progress bar
    # NOTE: built from the module-level dataframe, not from the `this_set` argument.
    df_ = rdw_per_confcode['conformity_codes'].rename(columns={l:r for l,r in zip(idx_left,idx_right)})    
    df_.set_index(idx_right, inplace=True)
    bar = ['|' if i in out.set_index(idx_right).sort_index().index else 'x' for i in df_.index]
    print(''.join(bar))
              
    return out


In [22]:
def _widen_and_add_pfx(res, api_name):
    """Index a result on the conformity key, widen it if needed and prefix its columns.

    `res` gets the four conformity key fields as index (in place). For the
    apis listed in `pivot_columns` a non-empty result is pivoted on the listed
    counter column(s), after which the two-level column labels are joined with
    '_'. All columns are finally prefixed with the api name, stripped from a
    leading 'api_' and a trailing '_eeg_uitvoering', followed by '_'.

    Raises an AssertionError when the resulting index is not unique.
    """

    # Set index of input
    key_fields = ['eu_type_goedkeuringssleutel', 'eeg_uitvoeringscode', 'eeg_variantcode', 'uitvoering_wijzigingsnummer']
    res.set_index(key_fields, inplace=True)

    prefix = re.sub('_eeg_uitvoering$', '', re.sub('^api_','',api_name))

    # This api has no pivot column, but needs pivoting. Number the rows per key instead.
    if api_name == 'api_merk_uitvoering_toegestaan':
        res['volgnummer_door_index'] = res.groupby(list(res.index.names)).cumcount().add(1)

    # Pivot columns.
    #     key: api name,
    #     value: column name to pivot on. Needs to be a counter.
    pivot_columns = {
        'api_as_gegevens_eeg_uitvoering': 'asnummer',
        'api_handelsbenaming_uitvoering': 'volgnummer',
        'api_carrosserie_uitvoering': 'carrosserie_volgnummer',
        'api_plaatsaanduiding_uitvoering': 'plaats_aanduiding_volgnummer',
        'api_carrosserie_uitvoering_nummerieke_code': 'carrosserie_uitvoering_numeriek_volgnummer',
        'api_uitvoeringverbruik_per_uitgave': 'uitvgavenummer_verbruikboek',
        'api_motor_uitvoering': 'volgnummer',
        'api_versnellingsbak_uitvoering': 'volgnummer',
        'api_motor_uitvoering_brandstof': ['volgnummer', 'brandstof_volgnummer'],
        'api_merk_uitvoering_toegestaan': 'volgnummer_door_index',
    }

    # Pivot column for this api (None when the api is not listed);
    # an empty input is never pivoted.
    piv_col = pivot_columns.get(api_name)
    if res.shape[0] == 0:
        piv_col = None

    if piv_col is None:
        # No modifications
        piv = res
    else:
        if isinstance(piv_col, str):
            piv = res.pivot(columns = piv_col)
        elif isinstance(piv_col, list):
            piv = pd.pivot_table(res, index = res.index.names, columns = piv_col)

        # flatten column to one level
        flat = []
        for l in piv.columns:
            flat.append('_'.join([str(c) if type(c)==int else c for c in l]))
        piv.columns = flat

    # sanity check: a key may occur only once after widening
    assert piv.index.is_unique, f'There are multiple observations per key: [{", ".join(piv.index.names)}]'

    # Add prefix
    return piv.add_prefix(prefix+'_')


In [23]:
def _get_dtype_from_api(api_url):
    '''Return the field types of an api as ``{field name: type}``.

    The names and types are read from the ``X-SODA2-Fields`` and
    ``X-SODA2-Types`` response headers of `api_url`.

    Raises
    ------
    KeyError
        When one of the two headers is missing in the response.
    '''
    import json
    import urllib.request

    # Close the connection when done; header names are looked up case-insensitively.
    with urllib.request.urlopen(api_url) as resp:
        fields_header = resp.headers.get('X-SODA2-Fields')
        types_header = resp.headers.get('X-SODA2-Types')

    if fields_header is None:
        raise KeyError('X-SODA2-Fields')
    if types_header is None:
        raise KeyError('X-SODA2-Types')

    # Parsed with json.loads instead of eval, so text received from the server
    # is never executed as code.
    # NOTE(review): assumes both headers hold JSON arrays - confirm.
    dtypes = dict(zip(json.loads(fields_header), json.loads(types_header)))
    return dtypes


In [None]:
# query other available apis: every column of the type-approval result that
# starts with 'api' holds the url(s) of a follow-up api
api_list = [c for c in rdw_per_confcode['eeg_voertuigtypegoedkeuring'].columns if c.startswith('api')]
for api_name in api_list:
    # progress line: api name padded to the longest name, then one mark per api ('|' = current)
    print(api_name.ljust(max(len(c) for c in api_list)),
          end=''.join('|' if api_name==list_element else '.' for list_element in api_list))
    key = re.sub('^api_','',api_name)
    for api_url in rdw_per_confcode['eeg_voertuigtypegoedkeuring'][api_name].unique():

        # reformat url: keep the resource code (xxxx-xxxx) at the end of the url
        M=re.search('https://opendata.rdw.nl/.*/([a-z0-9]{4}-[a-z0-9]{4})$', api_url)
        api_url = f'https://opendata.rdw.nl/resource/{M[1]}.json'
        print(api_url)

        # query the web: all conformity codes in one go, or one by one when
        # the single query signals NotImplementedError
        try:
            all_conf = _query_all_conf_code_from_api(api_url)
            df1 = _reduce_to_current_set(all_conf)
        except NotImplementedError:
            df1 = _query_one_by_one_from_api(api_url)

        print(df1.shape)

        # add to dict, widened and prefixed
        df1 = _widen_and_add_pfx(df1, api_name)
        rdw_per_confcode[key] = df1
        if VERBOSE > 1:
            display(rdw_per_confcode[key].tail())
            


<a href="#rdw_top" id='rdw_ovi'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Closed data from rdw (OVI)
Optionally get data from rdw website

In [25]:
def get_closed_data(reg, fields = '*'):
    '''Read fields of one registration from the RDW web page (ovi.rdw.nl).

    Parameters
    ----------
    reg : str
        Registration to look up.
    fields : list of str or '*'
        Ids of the page elements to read. With '*' all ids found in the grid
        blocks of the page are used.

    Returns
    -------
    tuple
        ``(result, message)`` where

        - ``(False, 'X')``: the site returned its "blocked" error page,
        - ``(None, 'x')``: the site has no data for, or rejects, the registration,
        - ``(dict, '[a/b]')``: field id -> text plus a 'TimeStamp' entry;
          ``a`` of the ``b`` requested fields were found.

    Raises
    ------
    AssertionError
        When none of the requested fields was found on the page.
    '''
    web_url = f'https://ovi.rdw.nl/?kenteken={reg}'

    import requests
    import codecs
    from lxml import html
    import re
    import time

    def _all_fields(tree):
        # ids of all grid elements on the page
        elements = tree.xpath('//*[@class="ui-block-d border ovigrid"]') + tree.xpath('//*/div[@class="ui-block-d ovigrid"]')
        return [el.attrib['id'] for el in elements if 'id' in el.keys()]

    # get page
    page = requests.get(web_url)

    # Decode with the charset named in the Content-type header. Fall back to
    # what requests detected (or utf-8) when the header names no charset,
    # instead of failing with an IndexError.
    charset_match = re.search(r'charset=([^;\s]+)', page.headers.get('Content-type', ''))
    if charset_match:
        DecodeType = charset_match.group(1).strip('"\'')
    else:
        DecodeType = page.encoding or 'utf-8'
    htmlstring = codecs.decode(page.content, DecodeType)
    # Convert string to tree object
    tree = html.fromstring(htmlstring)

    # analyze results
    ERROR = len(tree.xpath('//*[@action="FoutPagina.aspx?error=ErrorBlocked"]')) == 1
    warn_text = tree.xpath('//*[@class="warning"]//text()')
    WARNING_NA = (len(warn_text) > 0) and (
        (warn_text[0].startswith(f'Er zijn geen gegevens gevonden voor het ingevulde kenteken {reg.upper()}.')) or \
        (warn_text[0].startswith(f'{reg.upper()} is geen geldig kenteken.'))
    )

    # return result
    if ERROR:
        return False, 'X'
    if WARNING_NA:
        if VERBOSE > 1: print(warn_text[0])
        return None, 'x'

    if fields == '*':
        fields = _all_fields(tree)
    out = dict()
    succes = [0,0]   # [fields not found, fields found]
    for fld in fields:
        txt = tree.xpath(f'//*[@id="{fld}"]/text()')
        if len(txt) != 1:
            # No result with this field. Just skip
            if VERBOSE > 1: print(fld)
            succes[0] += 1
        else:
            succes[1] += 1
            out[fld] = str(txt[0])

    # No result with these fields. Probably limit request.
    # (Stored values are always strings, so "nothing found" means an empty dict.)
    if len(out) == 0:
        raise AssertionError(f'No result with fields: {", ".join(fields)}.\n{web_url}')

    out['TimeStamp'] = time.strftime('%Y%m%d')

    return out, f'[{succes[1]}/{sum(succes)}]'
    

In [26]:
def _get_prog_mtx(prog):
    """Format progress messages as a text matrix with ten messages per line.

    Every line starts with a newline and the one-based range of the items on
    it (``001-010``, ``011-020``, ...), followed by the messages, each
    right-aligned in seven characters. Returns '' for an empty series.
    """
    n_col = 10
    pieces = []
    for position, msg in enumerate(prog.values):
        row, col = divmod(position, n_col)
        if col == 0:
            # first item of a line: start with the range label
            pieces.append(f'\n{row*n_col+1:03.0f}-{(row+1)*n_col:03.0f}')
        pieces.append(f'{msg:>7s}')
    return ''.join(pieces)

In [None]:
if CLOSEDDATA:
    # One entry per registration; False means "not retrieved yet"
    data = {k: False for k in df_regs.kenteken}
    prog = pd.Series(index = data.keys(), data = '-')
    disp_id = display({'text/plain': _get_prog_mtx(prog)}, display_id=True, raw=True)
    if VERBOSE > 1: print('X: error (site time out after 30 requests)\nx: reg invalid\n[x/y]: <x> fields succesfully retrieved from a total of <y> fields\n-: N.A.')
    DO_LOOP = True
    c = 0 #counter
    max_loop = 2 * len(data) / 30 # approx nr of succesful retrievals is 30. Twice the amount should retrieve all
    while DO_LOOP:

        # break
        if c > max_loop:
            raise RuntimeError(f'Max nr of loops ({c}) reached.')

        # pause
        if c > 0:
            if VERBOSE > 1: print(f' time out (60s) for next iteration', end='')
            sleep(60)
            if VERBOSE > 1: print(f': {c:2.0f}/{max_loop:2.0f}')
        c+=1

        # has no value or field has no value
        to_do_regs = [k for k,v in data.items() if v == False]
        if VERBOSE > 0: print(f'\nto do:{len(to_do_regs):3.0f}', end=' ')
        for reg in to_do_regs:
            res, msg = get_closed_data(reg, closed_data_fields)
            prog.loc[reg] = msg
            disp_id.update({'text/plain':_get_prog_mtx(prog)}, raw=True)
            if msg == 'X':
                # blocked by the site: stop this round and wait before the next one
                break
            data.update({reg: res})
        DO_LOOP = any([v==False for v in data.values()])

    # drop None
    data = {k:v for k,v in data.items() if v is not None}
    rdw_closed = pd.DataFrame.from_dict(data=data, orient='index')
else:
    rdw_closed = None

# Report the result. Without closed data there is no dataframe to describe;
# asking None for its shape used to end this cell with an AttributeError.
if rdw_closed is None:
    print('Closed data is disabled in settings.')
elif VERBOSE > 1:
    # explicit display: a bare expression inside an if-block shows nothing
    display(rdw_closed)
else:
    print(rdw_closed.shape)

<a href="#rdw_top" id='rdw_merge'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Data from The National Highway Traffic Safety Administration (NHTSA)
Based on VIN. Product Information Catalog and Vehicle Listing (vPIC)
https://vpic.nhtsa.dot.gov/api/

In [28]:
from vin_lookup import Nhtsa_batch

In [29]:
# container for the NHTSA results, one entry per source
nhtsa_per_vin = {}

In [None]:
# Look up every VIN in the NHTSA vPIC catalog and store the result under nhtsa_per_vin['vpic'].
key = 'vpic'
df_ =  drz.loc[:, ['Vin', 'Mfyear']].copy() # copy from drz

# Borrow the date of first admission from the RDW info; it serves as fallback for the
# manufacturing year. Both frames are joined on the registration (kenteken) and
# re-indexed by lot.
rdw_myr = pd.merge(  left = rdw_per_reg['registrations'].reset_index(),
                     right = rdw_per_reg['gekentekende_voertuigen'].datum_eerste_toelating.reset_index(),
                     how='left',
                     right_on='kenteken',
                     left_on='kenteken'
                    ).loc[:, ['lot_index', 'datum_eerste_toelating']].set_index('lot_index')
# Integer division by 10000 presumably strips month and day from a YYYYMMDD number - TODO confirm
df_ =  pd.concat([df_, (rdw_myr // 10000).astype(pd.Int16Dtype())], axis=1)
# Empty strings count as missing; back-filling along the row copies datum_eerste_toelating
# into a missing Mfyear. np.nan is used because the np.NaN alias no longer exists in NumPy 2.
df_.update(df_.loc[:, ['Mfyear', 'datum_eerste_toelating']].replace({'': np.nan}).bfill(axis=1))
df_.rename(columns={'Vin': 'VIN', 'Mfyear': 'MFY'}, inplace=True)
nhtsa_per_vin[key] = df_.loc[:, ['VIN', 'MFY']]

# lookup vins in batches
Batch = Nhtsa_batch(nhtsa_per_vin[key].iloc[:,:2], verbose=VERBOSE)
Batch.full_parse()
out = Batch.data.copy()

# store in dict, leaving out the columns whose name starts with "system" or "internal"
nhtsa_per_vin[key] = pd.concat([
    nhtsa_per_vin[key],
    out.drop(columns=out.columns[out.columns.str.startswith('system') | out.columns.str.startswith('internal')])
], axis=1)

if VERBOSE > 1:
    display(nhtsa_per_vin[key])
else:
    print('\n'.join(nhtsa_per_vin.keys()))

In [None]:
# Store the complete vPIC lookup together with the VIN / MFY it was queried with
file_name = f'../data/auctions/enriched-results/nhtsa-vpic/nhtsa-vpic-0-data-{DATE}-{month_counter}.pkl'

out = pd.concat(
    [nhtsa_per_vin['vpic'].loc[:, ['VIN', 'MFY']], Batch.data],
    axis = 1,
)
out['TimeStamp'] = pd.Timestamp.now().strftime('%Y%m%d')

# the NO_PRICE and OPBOD settings each add their own suffix to the file name
for _enabled, _tail in ((NO_PRICE, '-without-price.pkl'), (OPBOD, '-opbod.pkl')):
    if _enabled:
        file_name = file_name.replace('.pkl', _tail)

# existing files are never overwritten and the SKIPSAVE setting is honoured
if not (SKIPSAVE == False) or os.path.isfile(file_name):
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
else:
    print(file_name)
    out.to_pickle(file_name)

# Merge datasets
- Merge dataframes from conformity codes apis
- Merge with registration results
- Merge with auction results

In [None]:
# Combine all results that were queried per conformity code into one frame.

# first merge first two results
tmp = rdw_per_confcode['conformity_codes'].merge(
    rdw_per_confcode['eeg_voertuigtypegoedkeuring'],
    how='left',
    left_on='typegoedkeuringsnummer',
    right_index=True,
)

# Rename to the key names used by the remaining conformity-code frames.
# The `_x` suffix is presumably pandas' default for the left-hand copy of a column
# that exists in both merged frames - TODO confirm.
tmp.rename(columns={
    'eu_type_goedkeuringssleutel_x': 'eu_type_goedkeuringssleutel',
    'uitvoering': 'eeg_uitvoeringscode',
    'variant': 'eeg_variantcode',
    'volgnummer_wijziging_eu_typegoedkeuring': 'uitvoering_wijzigingsnummer',
}, inplace=True)
tmp.set_index(['eu_type_goedkeuringssleutel', 'eeg_uitvoeringscode', 'eeg_variantcode', 'uitvoering_wijzigingsnummer'], inplace=True)


# list of df. Remove the ones that are merged into tmp
to_concat = [tmp] + [v for k,v in rdw_per_confcode.items() if k not in ['conformity_codes', 'eeg_voertuigtypegoedkeuring']]
# Are columns unique?
col_names = [name for frame in to_concat for name in frame.columns]
if not pd.Series(col_names).is_unique:
    # Name the counts on the Series itself: the column that to_frame() produces is 0 on
    # older pandas and 'count' on pandas >= 2, so renaming column 0 is not reliable.
    display(
        pd.Series(col_names)
        .value_counts()
        .rename('occurance')
        .to_frame()
        .query('occurance > 1')
        .sort_index()
    )
    raise LookupError('Dataframes share column names. Add a suffix to column names might solve this.')

# merge with subsequent api results
df_confcodes = pd.concat(to_concat, axis='columns', sort=False)
# add timestamp
df_confcodes['TimeStamp'] = pd.Timestamp.now().strftime('%Y%m%d')

if VERBOSE > 1:
    display(df_confcodes)
else:
    print(df_confcodes.shape)

In [None]:
# Store the conformity-code results
file_name = f'../data/auctions/enriched-results/rdw-conf/rdw-conf-0-data-{DATE}-{month_counter}.pkl'

out = df_confcodes

# the NO_PRICE and OPBOD settings each add their own suffix to the file name
for _enabled, _tail in ((NO_PRICE, '-without-price.pkl'), (OPBOD, '-opbod.pkl')):
    if _enabled:
        file_name = file_name.replace('.pkl', _tail)

# existing files are never overwritten and the SKIPSAVE setting is honoured
if not (SKIPSAVE == False) or os.path.isfile(file_name):
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
else:
    print(file_name)
    out.to_pickle(file_name)

In [None]:
# Attach the conformity-code data to every registration; the left join keeps all lots
df_regs.index.name = 'lot_index'
_reg_keys = ['typegoedkeuringsnummer', 'uitvoering', 'variant', 'volgnummer_wijziging_eu_typegoedkeuring']
_conf_keys = ['typegoedkeuringsnummer', 'eeg_uitvoeringscode', 'eeg_variantcode', 'uitvoering_wijzigingsnummer']
df = pd.merge(
    df_regs.reset_index(),
    df_confcodes.reset_index(),
    how='left',
    left_on=_reg_keys,
    right_on=_conf_keys,
).set_index('lot_index')

if VERBOSE > 1:
    display(df)
else:
    print(df.shape)
# every column name must occur exactly once after the merge
assert (df.columns.value_counts() == 1).all()

In [35]:
# Merge closed source data with registrations.
# rdw_closed is None when closed data retrieval is disabled (CLOSEDDATA), in which case
# there is nothing to merge and df is left untouched.
if rdw_closed is None:
    print('Skip. No closed data available (rdw_closed is None).')
else:
    # make fields lowercase snake_case and add "ovi_"; TimeStamp keeps its name
    rdw_closed.index.name='kenteken'
    rdw_closed.columns = ['ovi' + re.sub(r'([A-Z])',r'_\1', c).lower() if c != 'TimeStamp' else c for c in rdw_closed.columns ]

    # Basic operations
    # ovi_eigenaren is split on "/": the first number is stored as private owners, the second as company owners
    _owner_parts = rdw_closed.ovi_eigenaren.str.split('/')
    rdw_closed['ovi_private_owners'] = _owner_parts.apply(lambda x:int(x[0].strip()))
    rdw_closed['ovi_company_owner'] = _owner_parts.apply(lambda x:int(x[1].strip()))
    rdw_closed['ovi_owners'] = rdw_closed['ovi_private_owners'] + rdw_closed['ovi_company_owner']
    # the dict lookup deliberately fails on any value other than 'Ja' / 'Nee'
    rdw_closed['ovi_under_survey'] = rdw_closed.ovi_wachten_op_keuring.apply(lambda x: {'Ja': True, 'Nee': False}[x])

    df = df.merge(rdw_closed, how='left', left_on='kenteken', right_index=True)

In [None]:
# Save the closed data, indexed by lot.
# Without closed data (rdw_closed is None because retrieval is disabled) the whole cell
# is skipped; otherwise the `out` of an earlier cell would end up in the rdw-ovi file.
file_name = f'../data/auctions/enriched-results/rdw-ovi/rdw-ovi-0-data-{DATE}-{month_counter}.pkl'

if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')

if rdw_closed is None:
    print(f'Skip. No closed data available, {file_name} is not written.')
else:
    # look up the lot index that belongs to every registration
    out = rdw_closed.merge(df.reset_index().loc[:, ['lot_index', 'kenteken']], how='left', right_on='kenteken', left_index=True).set_index('lot_index')
    out.rename(columns={'kenteken': 'ovi_kenteken'}, inplace=True)
    out['ovi_TimeStamp'] = pd.Timestamp.now().strftime('%Y%m%d')

    if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
        print(file_name)
        out.to_pickle(file_name)
    else:
        print(f'Skip. {file_name} exists or saving is disabled in settings.')

In [37]:
# Only one NHTSA source (vPIC) exists for now.
# TODO: merge the additional sources here once nhtsa_per_vin holds more than one.
assert len(nhtsa_per_vin) == 1
df_vins = nhtsa_per_vin['vpic']
# add the retrieval date; df_vins is the same object as nhtsa_per_vin['vpic'],
# so that frame receives the column as well
_retrieved_on = pd.Timestamp.now().strftime('%Y%m%d')
df_vins['TimeStamp'] = _retrieved_on

### Merge rdw and drz

In [38]:
# Put the RDW query results, prefixed with "rdw_", next to the auction results
_rdw_prefixed = df.add_prefix('rdw_')
rdw = pd.concat([drz, _rdw_prefixed], axis='columns', sort=False)
# sanity check: no column name may occur twice
assert (rdw.columns.value_counts() == 1).all()
# sanity check: both frames hold exactly the same lots
assert rdw.index.isin(drz.index).all() and drz.index.isin(rdw.index).all()


In [None]:
# Store the open RDW data only: columns prefixed "rdw_", without the closed "rdw_ovi_" ones
file_name = f'../data/auctions/enriched-results/rdw-reg/rdw-reg-0-data-{DATE}-{month_counter}.pkl'

_open_rdw_cols = [
    name for name in rdw.columns
    if name.startswith('rdw_') and not name.startswith('rdw_ovi_')
]
out = rdw.loc[:, _open_rdw_cols].copy()

# the NO_PRICE and OPBOD settings each add their own suffix to the file name
for _enabled, _tail in ((NO_PRICE, '-without-price.pkl'), (OPBOD, '-opbod.pkl')):
    if _enabled:
        file_name = file_name.replace('.pkl', _tail)

# existing files are never overwritten and the SKIPSAVE setting is honoured
if not (SKIPSAVE == False) or os.path.isfile(file_name):
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
else:
    print(file_name)
    out.to_pickle(file_name)

### Merge nhtsa and drz

In [40]:
# Append the NHTSA lookup, prefixed with "nhtsa_"; the VIN / MFY query columns are left out.
# NOTE: rdw is overwritten here, so running this cell twice in a row adds the
# columns twice and trips the uniqueness check below.
_nhtsa_prefixed = df_vins.drop(columns=['VIN', 'MFY']).add_prefix('nhtsa_')
rdw = pd.concat([rdw, _nhtsa_prefixed], axis='columns', sort=False)
# sanity check: no column name may occur twice
assert rdw.columns.is_unique
# sanity check: both frames hold exactly the same lots
assert rdw.index.isin(drz.index).all() and drz.index.isin(rdw.index).all()

# frame that is written to disk in the saving section
out = rdw.copy()

<a href="#rdw_top" id='rdw_save'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Saving

In [None]:
# Store the fully merged frame (`out`)
file_name = f'../data/auctions/enriched-results/rdw-data-{DATE}-{month_counter}.pkl'

# the NO_PRICE and OPBOD settings each add their own suffix to the file name
for _enabled, _tail in ((NO_PRICE, '-without-price.pkl'), (OPBOD, '-opbod.pkl')):
    if _enabled:
        file_name = file_name.replace('.pkl', _tail)

# existing files are never overwritten and the SKIPSAVE setting is honoured
if not (SKIPSAVE == False) or os.path.isfile(file_name):
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
else:
    print(file_name)
    out.to_pickle(file_name)

# Next: download images (or run in parallel)

Because images might be taken down from the drz site, it is advisable to run the notebook that downloads images soon.

In [None]:
# Deliberate stop: this assertion always fails, which halts "Run All" at this point.
# The cells below are an optional manual check that compares a previously saved
# file with the dataframe in memory.
assert False, 'Stop running. Below is to check if existing file has same dataframe.'

In [None]:
# Load an earlier saved result; from here on `rdw` is the frame on disk and `out` the one in memory.
# NOTE(review): the path is hard-coded to one specific file - adjust it before running this check.
rdw = pd.read_pickle('../data/rdw-data-2021-06-opzij.pkl')

In [None]:
# cells that are missing on disk as well as in memory
is_na = rdw.isna() & out.isna()
# element-wise comparison of the frame on disk with the frame in memory
is_eq = rdw.eq(out)

In [None]:
# Row-by-row comparison of a column whose cells hold collections (presumably lists of
# strings - TODO confirm). A row is accepted when both versions hold the same elements,
# or when exactly one element differs and the in-memory element equals the on-disk
# element plus one trailing character. Any other difference is printed and stops the check.
col = 'Raw_text'

for row_nr, (aa, bb) in enumerate(zip(rdw[col], out[col]), start=1):
    sd1 = np.setdiff1d(aa,bb)  # elements that only occur on disk
    sd2 = np.setdiff1d(bb,aa)  # elements that only occur in memory
    if len(sd1) == 0 and len(sd2) == 0:
        # identical rows
        continue
    # `and` short-circuits, so sd2[0] is only read when both arrays hold one element
    if len(sd1) == 1 and len(sd2) == 1 and sd2[0][:-1] == sd1[0]:
        continue
    for l in sd1:
        print(f'|{l}|')
    for l in sd2:
        print(f'|{l}|')
    raise RuntimeError(f'Row {row_nr} of column {col} differs between the file on disk and memory.')

# all rows passed: treat the column as equal
is_eq[col] = True

In [None]:
# Row-by-row comparison of a column whose cells hold collections (presumably lists of
# strings - TODO confirm). A row is accepted when both versions hold the same elements,
# or when exactly one element differs and the in-memory element equals the on-disk
# element plus one trailing character. Any other difference is printed and stops the check.
col = 'LotCat'

for row_nr, (aa, bb) in enumerate(zip(rdw[col], out[col]), start=1):
    sd1 = np.setdiff1d(aa,bb)  # elements that only occur on disk
    sd2 = np.setdiff1d(bb,aa)  # elements that only occur in memory
    if len(sd1) == 0 and len(sd2) == 0:
        # identical rows
        continue
    # `and` short-circuits, so sd2[0] is only read when both arrays hold one element
    if len(sd1) == 1 and len(sd2) == 1 and sd2[0][:-1] == sd1[0]:
        continue
    for l in sd1:
        print(f'|{l}|')
    for l in sd2:
        print(f'|{l}|')
    raise RuntimeError(f'Row {row_nr} of column {col} differs between the file on disk and memory.')

# all rows passed: treat the column as equal
is_eq[col] = True

In [None]:
# a cell counts as unchanged when it is equal or missing in both frames
sel = is_na | is_eq

# time stamps differ by construction, so they are never reported
_timestamp_cols = sel.columns.str.startswith('rdw_TimeStamp')
sel.loc[:, _timestamp_cols] = True


In [None]:
# Side-by-side view of every row and column that holds at least one difference
_rows_with_diff = (sel == False).any(axis=1)
_cols_with_diff = sel.all() == False
_on_disk = pd.concat([rdw.loc[_rows_with_diff, _cols_with_diff]], keys=['on disk'], axis=1)
_in_memory = pd.concat([out.loc[_rows_with_diff, _cols_with_diff]], keys=['memory'], axis=1)
# sorting on the second column level puts both versions of a column next to each other
pd.concat([_on_disk, _in_memory], axis = 1).sort_index(level=1, axis=1)


In [None]:
# Per column with differences: show the on-disk and the in-memory values of the differing rows
_differing_cols = rdw.loc[:, sel.all() == False].columns
for col in _differing_cols:
    print(col)
    _mismatch = sel[col] == False
    _pair = [rdw.loc[_mismatch, col], out.loc[_mismatch, col]]
    display(pd.concat(_pair, axis=1))

In [None]:
# Show all filled-in fields of one registration, one field per row.
# The option is addressed by its full name: the abbreviation 'max_rows' matches more
# than one option in recent pandas versions and is then rejected as ambiguous.
with pd.option_context('display.max_rows', 999):
    display(out[out.rdw_kenteken == 'J892TZ'].T.dropna())