# ORGINEEL
----------

<a id='rdw_top'>

# Add extra information from RDW open data

Query to the open data dataset of the RDW.


1. <a href="#rdw_registrations">Registration numbers</a>  
    Apis with license plates as key
2. <a href="#rdw_confcodes">Conformity codes</a>  
    Cars get a conformity code when certified.
3. <a href="#rdw_other_apis">Other APIs</a>  
    Query all conformity codes in belonging to data set.
4. <a href="#rdw_ovi">Website data</a>  
    Get data from OVI RDW website. This takes a while because of time out enforced by website. Use config to disable.
5. <a href="#rdw_merge">Merge results</a>  
    Combine all dataframes and save
6. <a href="#rdw_save">Save results</a>  
- - - - 

### User variables


In [1]:
import sys
import re
import json
from IPython.display import display

In [8]:
with open('../assets/drz-settings-current.json', 'r') as fid:
    cfg = json.load(fid)

OPBOD = cfg['AUCTION']['kind'] == 'opbod'
AUCTION_ID = cfg['AUCTION']['id']
DATE = cfg['AUCTION']['date']
DATA_DIR = cfg['FILE_LOCATION']['data_dir']
auction_month = DATE[:4] + '-' + DATE[4:6]
if cfg['AUCTION']['kind'] == 'inschrijving':
    month_counter = re.sub('(-)(\d{2})', '\g<1>', AUCTION_ID)[5:8]
elif cfg['AUCTION']['kind'] == 'opbod':
    month_counter = re.sub('(-)(\d{2})(\d{2})', '-\g<2>', AUCTION_ID)[5:8]

sys.path.insert(0, cfg['FILE_LOCATION']['code_dir'])

QUICK_MERGE = False
SKIPSAVE = False
OVIDATA = True
VERBOSE = 1

print(AUCTION_ID)

2025-0003


### Modules and functions

In [9]:
import pandas as pd
import numpy as np
import re 
import os
# to keep api key hidden import this from sub dir
import assets.hidden_api_keys as hidden_api_keys
from rdw_info import *

In [10]:
main_api = 'm9d7-ebf2' # gekentekende_voertuigen
keur_api = 'vkij-7mwc' # keuringen
apk_api = 'sgfe-77wx' # meldingen_keuringsinstantie
gebr_api = 'hx2c-gt7k' # gebreken
toe_api = 'sghb-dzxx' # toegevoegde_objecten

conf_api = '55kv-xf7m' # EEG_Voertuigtypegoedkeuring

# g2s6-ehxa Motor-Uitvoering

# byxc-wwua TGK Basis Uitvoering
# kyri-nuah TGK Merk Uitvoering
# xn6e-huse TGK-Rupsbandset-Uitvoering
# d3ex-xghj TGK-Koppeling-Uitvoering
# 4by9-ammk TGK-Aandrijving-Uitvoering
# m692-vvff TGK-Speciale-Doeleinden
# gr7t-qfnb TGK-Energiebron-Uitvoering
# 9s6a-b42z TGK-Intrekking-Typegoedkeuring

# wx3j-69ie     Basisgegevens_EEG_Uitvoering
# ahsi-8uyu     AS_Gegevens_EEG_Uitvoering
#  xhyb-w7xt     TGK-As-Uitvoering
# q7fi-ijjh     Carrosserie_Uitvoering_Klasse
# w2qp-idms     Carrosserie_Uitvoering
#  ky2r-jqad     TGK-Carrosserie-Uitvoering
# nypm-t8hx     Carrosserie_Uitvoering_Nummerieke_Co
# mdqe-txpd     Handelsbenaming_Uitvoering
#  x5v3-sewk     TGK-Handelsbenaming-Fabrikant
# fj7t-hhik     Merk_Uitvoering_Toegestaan
# g2s6-ehxa     Motor_Uitvoering
# 5w6t-p66a     Motor_Uitvoering_Brandstof
# mt8t-4ep4     Plaatsaanduiding_Uitvoering
# h9pa-e9ta     Subcategorie_Uitvoering
# 2822-t8sx     Uitvoering_Gebruiksgegevens_Per_Uitg
# r7cw-67gs     Versnellingsbak_Uitvoering
#  7rjk-eycs     TGK-Versnelling-Uitvoering

### Load auction results

In [11]:
file_name = f'{DATA_DIR}/auctions/results/drz-data-{auction_month}-{month_counter}.pkl'
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
if not os.path.isfile(file_name):
    # see if -without price- exists
    NO_PRICE = True
    if NO_PRICE:
        file_name = file_name.replace('auctions/results', 'auctions/without-price')
        file_name = file_name.replace('.pkl', '-without-price.pkl')
    if OPBOD:
        file_name = file_name.replace('-opbod-without-price.pkl', '-without-price-opbod.pkl')
#     else:
#         file_name = file_name.replace('.pkl', '-without-price.pkl')
else:
    NO_PRICE = False


print(file_name)
drz = pd.read_pickle(file_name)

/home/tom/bin/satdatsci/Saturday-Datascience/data/auctions/results/drz-data-2025-02-03.pkl


In [12]:
if QUICK_MERGE:
    raise NotImplementedError

### Collect number plate registrations

In [13]:
# see what lots have a Dutch registration (license number).
hasReg = (~drz.Reg.isnull()) & (drz.Reg != 'onbekend') & (drz.Reg != '') & (~drz.LotType.isin([
    'Vaartuig',
    'Jetski',
    'Sloep',
    'Speedboot',
    'Vaartuig (Type onbekend)',
    'Motorvaartuig met opbouw (Pleziervaartuig)',
]))

print('nr. of registrations:',sum(hasReg))

# adhoc fix
idx = '2022-08-5012' # check in pictures. reg is wrong
if idx in drz.index:
    drz.loc[idx, 'Reg'] = 'LM-82-14'
idx = '2022-29-5001' # check in pictures. reg is wrong
if idx in drz.index:
    drz.loc[idx, 'Reg'] = 'LM-82-14'
idx = '2022-29-2008' # check in pictures. reg is wrong
if idx in drz.index:
    drz.loc[idx, 'Reg'] = 'KT-05-40'



vc = drz.loc[hasReg, 'Reg'].str.upper().str.replace('-','').value_counts()
if any(vc > 1):
    display(vc[vc>1])
    display(drz[drz.Reg.str.upper().str.replace('-','').isin(vc[vc>1].index)])
    raise ValueError('Registration occurs in more than one lot.')
# assert all(vc == 1), [, display(vc[vc>1])]

# # make a copy and add info
# rdw = drz.copy()

nr. of registrations: 68


<a href="#rdw_top" id='rdw_registrations'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Main api 

The main api: `api_gekentekende_voertuigen` points to subsequent apis.

In [14]:
# empty dictionary
rdw_per_reg = dict()

# first element of dict is registrations
key = 'registrations'
rdw_per_reg[key] = drz.loc[hasReg,['Reg', 'LotType']].copy() # copy from drz
rdw_per_reg[key]['kenteken'] = rdw_per_reg[key].Reg.apply(lambda r: r.replace('-','').upper())
rdw_per_reg[key].index.name = 'lot_index'
rdw_per_reg[key] = rdw_per_reg[key].reset_index().set_index('kenteken')
with pd.option_context('display.max_rows', 999):
    display(rdw_per_reg[key].reset_index().set_index(['LotType', 'kenteken']).sort_index())

print('\n'.join(rdw_per_reg.keys()))

Unnamed: 0_level_0,Unnamed: 1_level_0,lot_index,Reg
LotType,kenteken,Unnamed: 2_level_1,Unnamed: 3_level_1
Bedrijfswagen,25JZL9,2025-03-7046,25-JZL-9
Bedrijfswagen,36VZP8,2025-03-7013,36-VZP-8
Bedrijfswagen,41VZD9,2025-03-7043,41-VZD-9
Bedrijfswagen,V362DH,2025-03-7000,V-362-DH
Bedrijfswagen,V827GF,2025-03-7041,V-827-GF
Bedrijfswagen,VG210T,2025-03-7027,VG-210-T
Bedrijfswagen,VR430X,2025-03-7038,VR-430-X
Bedrijfswagen,VS509L,2025-03-7015,VS-509-L
Bromfiets,DBB10X,2025-03-1803,DBB-10-X
Bromfiets,DFN04G,2025-03-1816,DFN-04-G


registrations


In [15]:
# Assess these registrations
regs = rdw_per_reg['registrations'].Reg.values

# Main rdw api
Info = RdwInfo(regs, main_api, hidden_api_keys.socrata_apptoken)
Info.process_api()
key = re.sub('\s', '_', Info.metadata_['name'].lower())
rdw_per_reg[key] = Info.get_df().copy()
print(Info)

Class contains
	api_name_ <class 'str'>:
		m9d7-ebf2
	api_url_ <class 'str'>:
		https://opendata.rdw.nl/resource/m9d7-ebf2.json
	data_ <class 'pandas.core.frame.DataFrame'>:
		shape=(66, 69)
	idx_ <class 'list'>:
		len=68
	metadata_ <class 'dict'>:
		contains fields ['online', 'name', 'pivot_columns', 'primary_keys', 'header']
	metadata_.online <class 'pandas.core.frame.DataFrame'>:
		shape=(96, 10)
	metadata_.name <class 'str'>:
		Gekentekende_voertuigen
	metadata_.pivot_columns["m9d7-ebf2"] <class 'str'>:
		-empty-
	metadata_.primary_keys["m9d7-ebf2"] <class 'str'>:
		kenteken
	metadata_.header <class 'str'>:
		{'Server': 'nginx', 'Date': 'S .. 44921f904449415169cae9adfd95'}
	resp_status_ <class 'str'>:
		200


Sub apis

In [18]:
# Get sub apis from main api
from_key = 'gekentekende_voertuigen'
sub_apis,_,_ = get_sub_apis(rdw_per_reg[from_key])
# add extra apis
sub_apis += ['3xwf-ince', '2ba7-embk', '7ug8-2dtt', 't49b-isb7', keur_api, apk_api, toe_api] #,'a34c-vvps', # some extra apis with registrations
print(f'{from_key}')
for api_name in sub_apis:
    Info.set_api_name(api_name)
    Info.process_api()
    key = re.sub('\s', '_', Info.metadata_['name'].lower())
    key = re.sub(f'^{from_key}_', '', key)
    rdw_per_reg[key] = Info.get_df().copy()
    print(api_name, key)

# Get apis from apk api
from_key = 'meldingen_keuringsinstantie'
sub_apis,_,_ = get_sub_apis(rdw_per_reg[from_key])
print(f'{from_key}')
for api_name in set(sub_apis):
    Info.set_api_name(api_name)
    Info.process_api()
    key = re.sub('\s', '_', Info.metadata_['name'].lower())
    key = re.sub(f'^{from_key}_', '', key)
    rdw_per_reg[key] = Info.get_df().copy()
    print(api_name, key)

gekentekende_voertuigen
3huj-srit assen
8ys7-d773 brandstof
vezc-m2t6 carrosserie
jhie-znh9 carrosserie_specificatie
kmfi-hrps voertuigklasse
3xwf-ince rupsbanden
2ba7-embk subcategorie_voertuig
7ug8-2dtt bijzonderheden
t49b-isb7 terugroep_actie_status
vkij-7mwc keuringen
sgfe-77wx meldingen_keuringsinstantie
sghb-dzxx toegevoegde_objecten
meldingen_keuringsinstantie
a34c-vvps geconstateerde_gebreken
hx2c-gt7k gebreken


In [19]:
# Use reference table to add info
df_left = rdw_per_reg['geconstateerde_gebreken'].copy()
df_right = rdw_per_reg['gebreken'].copy()
on_column = 'gebrek_identificatie'

for left_column, left in df_left.loc[:, df_left.columns.str.startswith(on_column)].items():
    suffix = re.sub(on_column, '', left_column)
    df_merge = pd.merge(
        left=left.reset_index(),
        right=df_right,
        how='left',
        left_on=left_column,
        right_on=on_column,
    ).set_index('kenteken')
    df_merge = df_merge.drop(columns=[left_column, on_column, 'TimeStamp']).add_suffix(suffix)
    df_left = df_left.merge(df_merge, left_index=True, right_index=True)

# add extra table
rdw_per_reg['geconstateerde_gebreken_met_beschrijving'] = df_left
# clean up: 
#    remove reference table
del rdw_per_reg['gebreken']
#    remove table without description
del rdw_per_reg['geconstateerde_gebreken']

In [20]:
out = pd.concat(rdw_per_reg, axis=1)
# When TimeStamp is filled, this auction has information from that api.
display(out.loc[:, (slice(None), 'TimeStamp')].bfill(axis=0).iloc[[0],:].reset_index(drop=True).T)

Unnamed: 0,Unnamed: 1,0
gekentekende_voertuigen,TimeStamp,2025-02-15 16:47:40
assen,TimeStamp,2025-02-15 16:48:50
brandstof,TimeStamp,2025-02-15 16:48:50
carrosserie,TimeStamp,2025-02-15 16:48:51
carrosserie_specificatie,TimeStamp,2025-02-15 16:48:52
voertuigklasse,TimeStamp,NaT
rupsbanden,TimeStamp,NaT
subcategorie_voertuig,TimeStamp,2025-02-15 16:48:53
bijzonderheden,TimeStamp,2025-02-15 16:48:54
terugroep_actie_status,TimeStamp,2025-02-15 16:48:55


In [21]:
# Save
file_name = f'{DATA_DIR}/auctions/enriched-results/rdw-reg/rdw-reg-full-0-data-{auction_month}-{month_counter}.pkl'

if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
    
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

Skip. /home/tom/bin/satdatsci/Saturday-Datascience/data/auctions/enriched-results/rdw-reg/rdw-reg-full-0-data-2025-02-03.pkl exists or saving is disabled in settings.


<a href="#rdw_top" id='rdw_confcodes'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Conformity codes

In [22]:
# As of Dec 2024 something changed. This field is no longer returned
if 'volgnummer_wijziging_eu_typegoedkeuring' not in rdw_per_reg['gekentekende_voertuigen']:
    conf = rdw_per_reg['gekentekende_voertuigen'][[
        'typegoedkeuringsnummer', 
        'uitvoering', 
        'variant'
    ]].copy()
    conf.loc[:,'volgnummer_wijziging_eu_typegoedkeuring'] = 0
else:
    conf = rdw_per_reg['gekentekende_voertuigen'][[
        'typegoedkeuringsnummer', 
        'uitvoering', 
        'variant', 
        'volgnummer_wijziging_eu_typegoedkeuring'
    ]].copy()

In [23]:
# empty dict
rdw_per_confcode = dict()
# # Conformity codes consists of four fields that make a composite key
# conf = rdw_per_reg['gekentekende_voertuigen'][[
#     'typegoedkeuringsnummer', 
#     'uitvoering', 
#     'variant', 
#     'volgnummer_wijziging_eu_typegoedkeuring'
# ]].copy()

# drop nan
conf.dropna(inplace=True)
conf = conf.query('typegoedkeuringsnummer != "nan"')

# Add shorter key "eu_type_goedkeuringssleutel"
conf = conf.merge(
    how='left', 
    right=long_to_short_conf(conf.typegoedkeuringsnummer).drop_duplicates(), 
    left_on='typegoedkeuringsnummer', right_index=True
)

# rename fields
conf.volgnummer_wijziging_eu_typegoedkeuring = conf.volgnummer_wijziging_eu_typegoedkeuring.astype('Int8').astype(str)
conf.rename(columns={
    'uitvoering': 'eeg_uitvoeringscode',
    'variant': 'eeg_variantcode',
    'volgnummer_wijziging_eu_typegoedkeuring': 'uitvoering_wijzigingsnummer',
}, inplace=True)

# duplicates
display(
    conf.loc[:, conf.columns]\
    .reset_index()\
    .groupby('eu_type_goedkeuringssleutel')\
    .nunique()\
    .replace(1,np.nan)\
    .dropna(how='all')\
    .fillna(1)\
    .astype(int)\
    .sort_values(by='kenteken', ascending=False)
)

key = 'conformity_codes'
rdw_per_confcode[key] = conf.reset_index().set_index(['eu_type_goedkeuringssleutel', 'eeg_variantcode', 'eeg_uitvoeringscode', 'uitvoering_wijzigingsnummer']).copy()

AttributeError: `np.NaN` was removed in the NumPy 2.0 release. Use `np.nan` instead.

In [18]:
short_confs = rdw_per_confcode['conformity_codes'].reset_index()\
.set_index('typegoedkeuringsnummer').eu_type_goedkeuringssleutel
full_confs = rdw_per_confcode['conformity_codes'].reset_index()\
.set_index('typegoedkeuringsnummer').loc[:, ['eu_type_goedkeuringssleutel', 'eeg_variantcode', 'eeg_uitvoeringscode', 'uitvoering_wijzigingsnummer']]
full_confs_with_long = full_confs.reset_index().drop(columns=['eu_type_goedkeuringssleutel'])

In [19]:
# Get sub apis from main api
from_key = 'tgk'
sub_apis,_,_ = get_apis_with_search('title:"Open Data RDW: TGK"')
print(f'{from_key}')
for api_name in sub_apis:
    if api_name == '9s6a-b42z':
        # This api only needs one field as an index
        Info = RdwInfo(full_confs_with_long.typegoedkeuringsnummer, api_name, hidden_api_keys.socrata_apptoken)
    else:
        Info = RdwInfo(full_confs_with_long, api_name, hidden_api_keys.socrata_apptoken)
    Info.process_api()
    key = re.sub('\s', '_', Info.metadata_['name'].lower())
    key = re.sub(f'^{from_key}_', '', key)
    rdw_per_confcode[key] = Info.get_df().copy()
    if key == 'handelsbenaming_fabrikant':
        # rename typo codevariantgk -> codevarianttgk (missing 't')
        rdw_per_confcode[key].index.names = [re.sub('codevariantgk', 'codevarianttgk', n) for n in rdw_per_confcode[key].index.names]
    print(api_name, key)

tgk
xhyb-w7xt as_uitvoering
ky2r-jqad carrosserie_uitvoering
byxc-wwua basis_uitvoering
x5v3-sewk handelsbenaming_fabrikant
m692-vvff speciale_doeleinden
kyri-nuah merk_uitvoering
gr7t-qfnb energiebron_uitvoering
d3ex-xghj koppeling_uitvoering
7rjk-eycs versnelling_uitvoering
4by9-ammk aandrijving_uitvoering
xn6e-huse rupsbandset_uitvoering
9s6a-b42z intrekking_typegoedkeuring


Merge dataframes from conformity codes apis

In [20]:
print('x: Data can be merged. (should be unique, 4 level key and contain data)')
full_codes = dict()
for k, df in rdw_per_confcode.items():
    if (k != 'conformity_codes') and (df.index.nlevels == 4) and (df.index.is_unique):
        assert df.index.names == ['typegoedkeuringsnummer', 'codevarianttgk', 'codeuitvoeringtgk', 'volgnummerrevisieuitvoering']
        df.reset_index(inplace=True)
        df.volgnummerrevisieuitvoering = df.volgnummerrevisieuitvoering.astype(int).astype(str)
        df.set_index(['typegoedkeuringsnummer', 'codevarianttgk', 'codeuitvoeringtgk', 'volgnummerrevisieuitvoering'], inplace=True)
        full_codes[k] = df
        print(f'[x] {k:64s}', end='')
    else:
        print(f'[ ] {k:64s}', end='')

    print({True: '[idx: unique    ]', False: '[idx: NOT unique]'}[df.index.is_unique],
          f'[keys: {df.index.nlevels}]', 
          f'[shape: {df.shape[0]:3.0f},{df.shape[1]:3.0f}]'
         )

x: Data can be merged. (should be unique, 4 level key and contain data)
[ ] conformity_codes                                                [idx: NOT unique] [keys: 4] [shape:  56,  6]
[x] as_uitvoering                                                   [idx: unique    ] [keys: 4] [shape:  43, 29]
[x] carrosserie_uitvoering                                          [idx: unique    ] [keys: 4] [shape:  31,  4]
[x] basis_uitvoering                                                [idx: unique    ] [keys: 4] [shape:  52, 33]
[x] handelsbenaming_fabrikant                                       [idx: unique    ] [keys: 4] [shape:  52,490]
[ ] speciale_doeleinden                                             [idx: unique    ] [keys: 1] [shape:   0,  1]
[x] merk_uitvoering                                                 [idx: unique    ] [keys: 4] [shape:  52,457]
[x] energiebron_uitvoering                                          [idx: unique    ] [keys: 4] [shape:  52,133]
[x] koppeling_uitvoering

In [21]:
# merge
out = pd.concat(full_codes, axis=1)
display(out.loc[:, (slice(None), 'TimeStamp')].bfill(axis=0).iloc[0,:].to_frame())

Unnamed: 0_level_0,Unnamed: 1_level_0,e1*168/2013*00265*00
Unnamed: 0_level_1,Unnamed: 1_level_1,2
Unnamed: 0_level_2,Unnamed: 1_level_2,01
Unnamed: 0_level_3,Unnamed: 1_level_3,0
as_uitvoering,TimeStamp,2024-12-29 12:08:44
carrosserie_uitvoering,TimeStamp,2024-12-29 12:08:47
basis_uitvoering,TimeStamp,2024-12-29 12:08:51
handelsbenaming_fabrikant,TimeStamp,2024-12-29 12:08:58
merk_uitvoering,TimeStamp,2024-12-29 12:09:02
energiebron_uitvoering,TimeStamp,2024-12-29 12:09:07
koppeling_uitvoering,TimeStamp,2024-12-29 12:09:14
versnelling_uitvoering,TimeStamp,2024-12-29 12:09:16
aandrijving_uitvoering,TimeStamp,2024-12-29 12:09:20


In [22]:
# Save
file_name = f'{DATA_DIR}/auctions/enriched-results/rdw-conf/rdw-conf-0-data-{auction_month}-{month_counter}.pkl'

if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
    
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

Skip. /home/tom/bin/satdatsci/Saturday-Datascience/data/auctions/enriched-results/rdw-conf/rdw-conf-0-data-2024-12-24.pkl exists or saving is disabled in settings.


<a href="#rdw_top" id='rdw_ovi'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Data from rdw website (OVI)
Optionally get data from rdw website

In [23]:
if OVIDATA == False:
    rdw_ovi = None
else:
    regs = rdw_per_reg['registrations'].Reg.to_list()
    Info = OviInfo(regs, verbose=VERBOSE)
    Info.process_api()
    print(Info)

    rdw_ovi = Info.data_.copy()
    # make fields lowercase and add "ovi_"
    rdw_ovi.index.name='kenteken'
    rdw_ovi.columns = [re.sub(r'([A-Z])',r'_\1', c).lower() if c != 'TimeStamp' else c for c in rdw_ovi.columns] # after capital, add _ 
    rdw_ovi.columns = [re.sub(r'^_','', c) for c in rdw_ovi.columns] # remove trailing _
    # Basic operations
    rdw_ovi = pd.concat(
        [rdw_ovi, 
         rdw_ovi.eigenaren.str.split('/', expand=True).rename(columns = {0: 'eigenaren_private', 1: 'eigenaren_company'}).astype('Int8')
        ], axis=1)
    rdw_ovi['eigenaren_total'] = rdw_ovi.eigenaren_private + rdw_ovi.eigenaren_company
    #rdw_ovi['ovi_wachten_op_keuring_ind'] = rdw_ovi.ovi_wachten_op_keuring.apply(lambda x: {'Ja': True, 'Nee': False}[x] if isinstance(x, str) else x).astype('boolean')

    if VERBOSE > 1:
        rdw_ovi
    else:
        print(rdw_ovi.shape)

This takes approximately 3 minutes. Done by 13:12:52.


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0-9,[0/0]73MFDZ,[0/0]11MSDH,[0/0]F910TX,[0/0]MPRZ21,[0/0]DSZ98P,[0/0]FRF75K,[0/0]FGX40T,[0/0]MXFT83,[0/0]FFT21P,[0/0]D850BP
10-19,[0/0]DGD12D,[0/0]FGD21N,[0/0]DHX15J,[0/0]FHK07R,[0/0]90MNNF,[0/0]73MHJB,[0/0]DBX53T,[0/0]FDR38K,[0/0]DLX22R,[0/0]FSN45T
20-29,[0/0]FD426H,[0/0]68MPDR,[0/0]FKG10L,[0/0]DNP37N,[0/0]FJV64D,[0/0]J624PJ,[0/0]N158GV,[0/0]R244KD,[0/0]X035GN,[0/0]J575KJ
30-39,[0/0]70LZV7,[0/0]29XZDH,[0/0]30XZDH,[0/0]49GRG6,[0/0]VZ002K,[0/0]9SRR06,[0/0]G613BG,[0/0]4VHX72,[0/0]6XDB96,[0/0]V983GG
40-49,[0/0]V640VG,[0/0]PJ672G,[0/0]67TFV7,[0/0]S458XL,[0/0]64FLND,[0/0]90LTH8,[0/0]J052TH,[0/0]83GTTL,[0/0]GNB70N,[0/0]6KBJ11
50-59,[0/0]ZV790F,[0/0]29TLTN,[0/0]PP823J,[0/0]RSTN76,[0/0]VHD86F,[0/0]G388DR,[0/0]WDJX57,[0/0]51JPN7,[0/0]9VNS00,[0/0]22TDKX
60-69,[0/0]NJ868F,[0/0]4TZF77,[0/0]3SNF20,[0/0]33SLKF,[0/0]LM3884,[0/0]KM0764,,,,
70-79,done,,,,,,,,,


Class contains
	current_reg_ <class 'str'>:
		None
	data_ <class 'pandas.core.frame.DataFrame'>:
		shape=(66, 1)
	idx_ <class 'list'>:
		len=66
	metadata_ <class 'dict'>:
		contains fields ['header']
	metadata_.header <class 'str'>:
		{'Date': 'Sun, 29 Dec 2024 12: .. tps://authenticatie2.rdw.nl;"}
	resp_status_ <class 'str'>:
		None
	verbose_level_ <class 'str'>:
		1


AttributeError: 'DataFrame' object has no attribute 'eigenaren'

In [None]:
out = rdw_ovi.copy()

In [None]:
# Save
file_name = f'{DATA_DIR}/auctions/enriched-results/rdw-ovi/rdw-ovi-0-data-{auction_month}-{month_counter}.pkl'

if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
    
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

<a href="#rdw_top" id='rdw_merge'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Data from The National Highway Traffic Safety Administration (NHTSA)
Based on VIN. Product Information Catalog and Vehicle Listing (vPIC)
https://vpic.nhtsa.dot.gov/api/

In [24]:
from vin_lookup import Nhtsa_batch

In [25]:
# empty dictionary
nhtsa_per_vin = dict()

In [26]:
key = 'vpic'
df_ =  drz.loc[:, ['Vin', 'Mfyear']].copy().replace({'': np.nan, 'onbekend': np.nan}) # copy from drz

# borrow mfyear from rdw info
rdw_mfy = pd.merge(  left = rdw_per_reg['registrations'].reset_index(),
                     right = rdw_per_reg['gekentekende_voertuigen'].datum_eerste_toelating.reset_index(),
                     how='left',
                     right_on='kenteken',
                     left_on='kenteken'
                    ).loc[:, ['lot_index', 'datum_eerste_toelating']].set_index('lot_index')
df_ =  pd.concat([df_, (rdw_mfy // 10000).astype(pd.Int16Dtype())], axis=1)
df_.update(df_.loc[:, ['Mfyear', 'datum_eerste_toelating']].bfill(axis=1))
df_.rename(columns={'Vin': 'VIN', 'Mfyear': 'MFY'}, inplace=True)
nhtsa_per_vin[key] = df_.loc[:, ['VIN', 'MFY']]

# lookup vins in batches
Batch = Nhtsa_batch(nhtsa_per_vin[key].iloc[:,:2].dropna(subset='VIN'), 
                    data_dict_fn = f"{cfg['FILE_LOCATION']['code_dir']}/assets/nhtsa-data-dict.csv",
                    verbose=VERBOSE)
Batch.full_parse()

# store in dict
out = Batch.data.copy()
out.loc[:, 'TimeStamp'] = pd.Timestamp.now().strftime('%Y%m%d')
nhtsa_per_vin[key] = pd.concat([
    nhtsa_per_vin[key],
    out.drop(columns=out.columns[out.columns.str.startswith('system') | out.columns.str.startswith('internal')])
], axis=1)

if VERBOSE > 1:
    display(nhtsa_per_vin[key])
else:
    print('\n'.join(nhtsa_per_vin.keys()))

  df_.update(df_.loc[:, ['Mfyear', 'datum_eerste_toelating']].bfill(axis=1))


batch [1/2]


  add = pd.read_json(rsp)
  add.replace({'Not Applicable': np.nan, '': np.nan}, inplace=True)


batch [2/2]
Class contains
	batch_size_ <class 'str'>:
		50
	data_dict_ <class 'pandas.core.frame.DataFrame'>:
		shape=(140, 8)
	nbatches_ <class 'str'>:
		2
	succes_ <class 'str'>:
		True
	url_ <class 'str'>:
		https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVINValuesBatch
	verboselevel_ <class 'str'>:
		1
	vins_ <class 'pandas.core.series.Series'>:
		shape=(77,)
data (size): (77, 154)
vpic


  add = pd.read_json(rsp)
  add.replace({'Not Applicable': np.nan, '': np.nan}, inplace=True)


In [27]:
# Merge with input and potentential other sources
nhtsa_per_vin['vpic'].index.name = 'lot_index'
Batch.data.index.name = 'lot_index'
df_vins = pd.concat([
    nhtsa_per_vin['vpic'].loc[:, ['VIN', 'MFY']], 
    Batch.data
], axis = 1)
# add timestamp
df_vins.loc[:, 'TimeStamp'] = pd.Timestamp.now().strftime('%Y%m%d')
# Set index to vin/mfy
df_vins = df_vins.reset_index().set_index(['VIN', 'MFY'])

In [28]:
out = df_vins.copy()

In [31]:
out

Unnamed: 0_level_0,Unnamed: 1_level_0,lot_index,system___count,system___message,system___searchcriteria,system___batch,activesafetysystem___abs,activesafetysystem___activesafetysysnote,activesafetysystem_maintainingsafedistance__adaptivecruisecontrol,activesafetysystem_lightingtechnologies__adaptivedrivingbeam,system___adaptiveheadlights,...,system___vehicledescriptor,general___vehicletype,exterior_dimension__wheelbaselong,exterior_dimension__wheelbaseshort,exterior_body__wheelbasetype,exterior_wheeltire__wheelsizefront,exterior_wheeltire__wheelsizerear,exterior_wheeltire__wheels,exterior_body__windows,TimeStamp
VIN,MFY,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
JS1CF111100103143,2006,2025-03-1800,50,Results returned successfully. NOTE: Any missi...,,1,,,,,,...,JS1CF111*00,MOTORCYCLE,,,,,,,,20250301
MLHRH01A6M5101976,2021,2025-03-1802,50,Results returned successfully. NOTE: Any missi...,,1,,,,,,...,MLHRH01A*M5,MOTORCYCLE,,,,,,,,20250301
ZAPC5330100004448,2015,2025-03-1803,50,Results returned successfully. NOTE: Any missi...,,1,,,,,,...,ZAPC5330*00,MOTORCYCLE,,,,,,,,20250301
ZAPM0700000014573,1998,2025-03-1804,50,Results returned successfully. NOTE: Any missi...,,1,,,,,,...,ZAPM0700*00,MOTORCYCLE,,,,,,,,20250301
L5YADCZB7H1156288,2017,2025-03-1806,50,Results returned successfully. NOTE: Any missi...,,1,,,,,,...,L5YADCZB*H1,MOTORCYCLE,,,,,,2,,20250301
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WDD2210221A077059,2006,2025-03-7057,27,Results returned successfully. NOTE: Any missi...,,2,,,,,,...,WDD22102*1A,PASSENGER CAR,,,,,,,,20250301
WBAUB71040VA91764,2009,2025-03-7058,27,Results returned successfully. NOTE: Any missi...,,2,,,,,,...,WBAUB710*0V,PASSENGER CAR,,,,,,,,20250301
TSMEXB22S00361579,2011,2025-03-7060,27,Results returned successfully. NOTE: Any missi...,,2,,,,,,...,TSMEXB22*00,,,,,,,,,20250301
W0LGM8EP3C1070306,2012,2025-03-7061,27,Results returned successfully. NOTE: Any missi...,,2,,,,,,...,W0LGM8EP*C1,PASSENGER CAR,,,,,,,,20250301


In [29]:
# Save
file_name = f'{DATA_DIR}/auctions/enriched-results/nhtsa-vpic/nhtsa-vpic-0-data-{auction_month}-{month_counter}.pkl'

if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
    
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

/home/tom/bin/satdatsci/Saturday-Datascience/data/auctions/enriched-results/nhtsa-vpic/nhtsa-vpic-0-data-2024-12-24.pkl


# Merge datasets
Now we have a couple of dataset that can be merged.

`rdw_per_reg` dictionary  
`rdw_per_confcode` combined in another dictionary `full_codes`  
`rdw_ovi` a single dataframe  
`nhtsa_per_vin` has only one field and is combined in dataframe `df_vins`  


1. Merge dataframes from `rdw_per_reg` with primary key `kenteken`
2. 
    1) Add conformity codes from `rdw_per_confcode`
    2) Add basic conformity info from `rdw_per_confcode.eeg_voertuigtypegoedkeuring`
3. Merge all conformity code information from other apis `full_codes`
4. Merge with OVI
5. Merge with vpic (nhtsa)
6. Merge with auction results


In [30]:
# Merge first set
rich = pd.concat(rdw_per_reg, axis=1)
rich = pd.concat([rich], keys=['rdw'], axis=1)
rich.index.name='kenteken'
print(f'{rich.shape[1]} columns {rich.shape[0]} {rich.index.name} ')

# Add conformity_codes
codes = rdw_per_confcode['conformity_codes'].reset_index().set_index('kenteken')
codes = pd.concat([codes], keys=['conformity_codes'], axis=1)
codes = pd.concat([codes], keys=['rdw'], axis=1)
rich = rich.merge(
    codes, # add level
    how='outer',
    left_index = True,
    right_index = True,
)
print(f'{rich.shape[1]} columns {rich.shape[0]} {rich.index.name}')

566 columns 66 kenteken 
575 columns 66 kenteken


In [31]:
# Add data with one level EU keys
codes = rdw_per_confcode['intrekking_typegoedkeuring'].reset_index()
# add levels to dataframe
codes = pd.concat([codes], keys=['intrekking_typegoedkeuring'], axis=1)
codes = pd.concat([codes], keys=['rdw'], axis=1)
rich = rich.reset_index().merge(
    codes, 
    how='outer',
    left_on = [('rdw', 'conformity_codes', 'typegoedkeuringsnummer')],
    right_on = [('rdw', 'intrekking_typegoedkeuring', 'typegoedkeuringsnummer')],
).set_index('kenteken')
print(f'{rich.shape[1]} columns {rich.shape[0]} {rich.index.name}')

KeyError: ('rdw', 'intrekking_typegoedkeuring', 'typegoedkeuringsnummer')

In [None]:
# Add full_codes
codes = pd.concat(full_codes, axis=1)#.drop(columns='conformity_codes')
codes = pd.concat([codes], keys=['rdw'], axis=1)
# rename index names to match existing
existing_idx_names = [{
    'typegoedkeuringsnummer': 'typegoedkeuringsnummer',
    'codevarianttgk': 'eeg_variantcode',
    'codeuitvoeringtgk': 'eeg_uitvoeringscode',
    'volgnummerrevisieuitvoering': 'uitvoering_wijzigingsnummer'}[c] for c in codes.index.names]
codes.index.names = existing_idx_names
rich = rich.merge(
    codes,
    how='outer',
    left_on = [('rdw', 'conformity_codes', c) for c in codes.index.names],
    right_index=True
)
print(f'{rich.shape[1]} columns {rich.shape[0]} {rich.index.name}')

In [None]:
# Add ovi
codes = pd.concat([rdw_ovi], keys=['ovi'], axis=1)
codes = pd.concat([codes], keys=['rdw'], axis=1)
rich = rich.merge(
    codes,
    how='outer',
    left_index = True,
    right_index = True,
)
print(f'{rich.shape[1]} columns {rich.shape[0]} {rich.index.name}')

In [None]:
# Add vpic
codes = pd.concat(nhtsa_per_vin, axis=1)
codes = pd.concat([codes], keys=['nhtsa'], axis=1)
rich = rich.reset_index().merge(
    codes,
    how='outer',
    left_on = [('rdw', 'registrations', 'lot_index')],
    right_index = True,
).set_index(('rdw', 'registrations', 'lot_index')) # set to 3d index
rich.index.name = 'lot_index' # make 1d index again
print(f'{rich.shape[1]} columns {rich.shape[0]} {rich.index.name}')

In [None]:
rich.columns.map(lambda x: '_'.join(x))
rich

In [None]:
# Three level column index
existing = pd.concat([drz], keys=[''], axis=1)
existing = pd.concat([existing], keys=['drz'], axis=1)
print(f'{existing.shape[1]} columns {existing.shape[0]} {existing.index.name}')
# Add rich to existing to make enriched
enriched = pd.merge(
    left = existing,
    right = rich,
    how = 'left',
    left_index = True,
    right_index = True
)
print(f'{enriched.shape[1]} columns {enriched.shape[0]} {enriched.index.name}')

<a href="#rdw_top" id='rdw_save'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Saving

In [None]:
file_name = f'{DATA_DIR}/auctions/enriched-results/rdw-data-{auction_month}-{month_counter}.pkl'
if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
    
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    print(file_name)
    enriched.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

# Next: download images (or parallel)

Because images might be taken down from the drz site, it is advisable to run the notebook that downloads images soon.