<a id='rdw_top'>

# Add extra information from RDW open data

Query to the open data dataset of the RDW.


1. <a href="#rdw_registrations">Registration numbers</a>  
    Apis with license plates as key
2. <a href="#rdw_confcodes">Conformity codes</a>  
    Cars get a conformity code when certified.
3. <a href="#rdw_other_apis">Other APIs</a>  
    Query all conformity codes in belonging to data set.
4. <a href="#rdw_ovi">Website data</a>  
    Get data from OVI RDW website. This takes a while because of time out enforced by website. Use config to disable.
5. <a href="#rdw_merge">Merge results</a>  
    Combine all dataframes and save
6. <a href="#rdw_save">Save results</a>  
- - - - 

### User variables


In [1]:
import sys
import re
import json
from IPython.display import display

In [2]:
with open('../assets/drz-settings-current.json', 'r') as fid:
    cfg = json.load(fid)

OPBOD = cfg['AUCTION']['kind'] == 'opbod'
AUCTION_ID = cfg['AUCTION']['id']
DATE = cfg['AUCTION']['date']
DATA_DIR = cfg['FILE_LOCATION']['data_dir']
auction_month = DATE[:4] + '-' + DATE[4:6]
if cfg['AUCTION']['kind'] == 'inschrijving':
    month_counter = re.sub('(-)(\d{2})', '\g<1>', AUCTION_ID)[5:8]
elif cfg['AUCTION']['kind'] == 'opbod':
    month_counter = re.sub('(-)(\d{2})(\d{2})', '-\g<2>', AUCTION_ID)[5:8]

sys.path.insert(0, cfg['FILE_LOCATION']['code_dir'])

QUICK_MERGE = False
SKIPSAVE = False
OVIDATA = True
VERBOSE = 1

### Modules and functions

In [3]:
import pandas as pd
import numpy as np
import re 
import os
# to keep api key hidden import this from sub dir
import assets.hidden_api_keys as hidden_api_keys
from rdw_info import *

enjoy rdw_info


In [4]:
main_api = 'm9d7-ebf2'
keur_api = 'vkij-7mwc'
apk_api = 'sgfe-77wx'
gebr_api = 'hx2c-gt7k'
toe_api = 'sghb-dzxx'

conf_api = '55kv-xf7m'

### Load auction results

In [5]:
file_name = f'{DATA_DIR}/auctions/results/drz-data-{auction_month}-{month_counter}.pkl'
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
if not os.path.isfile(file_name):
    # see if -without price- exists
    NO_PRICE = True
    if NO_PRICE:
        file_name = file_name.replace('auctions/results', 'auctions/without-price')
        file_name = file_name.replace('.pkl', '-without-price.pkl')
    if OPBOD:
        file_name = file_name.replace('-opbod-without-price.pkl', '-without-price-opbod.pkl')
#     else:
#         file_name = file_name.replace('.pkl', '-without-price.pkl')
else:
    NO_PRICE = False


print(file_name)
drz = pd.read_pickle(file_name)

/home/tom/bin/satdatsci/Saturday-Datascience/data/auctions/results/drz-data-2024-05-09.pkl


In [6]:
if QUICK_MERGE:
    raise NotImplementedError

### Collect number plate registrations

In [7]:
# see what lots have a Dutch registration (license number).
hasReg = (~drz.Reg.isnull()) & (drz.Reg != 'onbekend') & (drz.Reg != '') & (~drz.LotType.isin([
    'Vaartuig',
    'Jetski',
    'Sloep',
    'Speedboot',
    'Vaartuig (Type onbekend)',
    'Motorvaartuig met opbouw (Pleziervaartuig)',
]))

print('nr. of registrations:',sum(hasReg))

# adhoc fix
idx = '2022-08-5012' # check in pictures. reg is wrong
if idx in drz.index:
    drz.loc[idx, 'Reg'] = 'LM-82-14'
idx = '2022-29-5001' # check in pictures. reg is wrong
if idx in drz.index:
    drz.loc[idx, 'Reg'] = 'LM-82-14'
idx = '2022-29-2008' # check in pictures. reg is wrong
if idx in drz.index:
    drz.loc[idx, 'Reg'] = 'KT-05-40'



vc = drz.loc[hasReg, 'Reg'].str.upper().str.replace('-','').value_counts()
if any(vc > 1):
    display(vc[vc>1])
    display(drz[drz.Reg.str.upper().str.replace('-','').isin(vc[vc>1].index)])
    raise ValueError('Registration occurs in more than one lot.')
# assert all(vc == 1), [, display(vc[vc>1])]

# # make a copy and add info
# rdw = drz.copy()

nr. of registrations: 122


<a href="#rdw_top" id='rdw_registrations'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Main api 

The main api: `api_gekentekende_voertuigen` points to subsequent apis.

In [8]:
# empty dictionary
rdw_per_reg = dict()

# first element of dict is registrations
key = 'registrations'
rdw_per_reg[key] = drz.loc[hasReg,['Reg', 'LotType']].copy() # copy from drz
rdw_per_reg[key]['kenteken'] = rdw_per_reg[key].Reg.apply(lambda r: r.replace('-','').upper())
rdw_per_reg[key].index.name = 'lot_index'
rdw_per_reg[key] = rdw_per_reg[key].reset_index().set_index('kenteken')
with pd.option_context('display.max_rows', 999):
    display(rdw_per_reg[key].reset_index().set_index(['LotType', 'kenteken']).sort_index())

print('\n'.join(rdw_per_reg.keys()))

Unnamed: 0_level_0,Unnamed: 1_level_0,lot_index,Reg
LotType,kenteken,Unnamed: 2_level_1,Unnamed: 3_level_1
Aanhangwagen (Dubbelasser),WF53XT,2024-09-7037,WF-53-XT
Aanhangwagen (Dubbelasser),WV39LY,2024-09-7054,WV-39-LY
Aanhangwagen (Paardentrailer),WP90NP,2024-09-7019,WP-90-NP
Bedrijfswagen,3VVV29,2024-09-7022,3-VVV-29
Bedrijfswagen,4VKZ29,2024-09-7046,4-VKZ-29
Bedrijfswagen,60VHH5,2024-09-7044,60-VHH-5
Bedrijfswagen,61VRT3,2024-09-7074,61-VRT-3
Bedrijfswagen,6VXS40,2024-09-7151,6-VXS-40
Bedrijfswagen,8VZB94,2024-09-7109,8-VZB-94
Bedrijfswagen,96BBF8,2024-09-7063,96-BBF-8


registrations


In [9]:
# Assess these registrations
regs = rdw_per_reg['registrations'].Reg.values

# Main rdw api
Info = RdwInfo(regs, main_api, hidden_api_keys.socrata_apptoken)
Info.process_api()
key = re.sub('\s', '_', Info.metadata_['name'].lower())
rdw_per_reg[key] = Info.get_df().copy()
print(Info)

[ 196,  256,  148,  280, <NA>, <NA>,  208,  220,  316,  244,  244,  292,  268,
  340,  496,  268,  460,  436, <NA>, <NA>,  280,  268,  256,  628,  244,  280,
  436,  436,  340,  256,  256,  664,  640,  424,  172,  184,  412,  544,  568,
  340,  460,  148,  328,  364,  184,  220,  568,  364,  208,  292,  532, <NA>,
  100,  100,  100,  100, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>,
 <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>,
 <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>,
  496,  544,  568,  688,  604]
Length: 96, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col:

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
Class contains
	api_name_ <class 'str'>:
		m9d7-ebf2
	api_url_ <class 'str'>:
		https://opendata.rdw.nl/resource/m9d7-ebf2.json
	data_ <class 'pandas.core.frame.DataFrame'>:
		shape=(121, 76)
	idx_ <class 'list'>:
		len=122
	metadata_ <class 'dict'>:
		contains fields ['online', 'name', 'pivot_columns', 'primary_keys', 'header']
	metadata_.online <class 'pandas.core.frame.DataFrame'>:
		shape=(96, 17)
	metadata_.name <class 'str'>:
		Gekentekende_voertuigen
	metadata_.pivot_columns["m9d7-ebf2"] <class 'str'>:
		-empty-
	metadata_.primary_keys["m9d7-ebf2"] <class 'str'>:
		kenteken
	metadata_.header <class 'str'>:
		{'Server': 'nginx', 'Date': 'W .. f333c1048e02395f80a4a5c9373d'}
	resp_status_ <class 'str'>:
		200


Sub apis

In [10]:
# Get sub apis from main api
from_key = 'gekentekende_voertuigen'
sub_apis,_,_ = get_sub_apis(rdw_per_reg[from_key])
# add extra apis
sub_apis += ['3xwf-ince', '2ba7-embk', '7ug8-2dtt', 't49b-isb7', keur_api, apk_api, toe_api] #,'a34c-vvps', # some extra apis with registrations
for api_name in sub_apis:
    Info.set_api_name(api_name)
    Info.process_api()
    key = re.sub('\s', '_', Info.metadata_['name'].lower())
    key = re.sub(f'^{from_key}_', '', key)
    rdw_per_reg[key] = Info.get_df().copy()
    print(api_name, key)

# Get apis from apk api
from_key = 'meldingen_keuringsinstantie'
sub_apis,_,_ = get_sub_apis(rdw_per_reg[from_key])
for api_name in set(sub_apis):
    Info.set_api_name(api_name)
    Info.process_api()
    key = re.sub('\s', '_', Info.metadata_['name'].lower())
    key = re.sub(f'^{from_key}_', '', key)
    rdw_per_reg[key] = Info.get_df().copy()
    print(api_name, key)

[ 196,  208,  244,  268,  160,  256,  244,  268,  520,  520, <NA>, <NA>, <NA>,
 <NA>, <NA>, <NA>]
Length: 16, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
 list(['2', '3', '1', '4', '5', '6', '9', '8', '7', '10', '16', '12'])
 list(['J', 'N']) list(['N', 'J', 'n', 'j', 'A', 'a', 'M', 'B', 'V'])
 list(['A', 'V', 'a', 'N', 'v', 'n', 'Q', '0', '~', '4'])
 list(['G', 'A', 'L', 

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
3huj-srit assen


[ 196,  340,  364,  388,  460,  364,  400,  340,  352,  388,  388,  520,  520,
  400,  400,  340,  484,  244,  376, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>,
 <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>]
Length: 36, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)


('field_content', 'non_null')
('field_content', 'not_null')
('field_content', 'null')
('field_content', 'count')
('field_content', 'cardinality')
('field_content', 'average')
('field_content', 'sum')
8ys7-d773 brandstof
('field_content', 'average')
('field_content', 'sum')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the oper

vezc-m2t6 carrosserie
('field_content', 'average')
('field_content', 'sum')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the oper

jhie-znh9 carrosserie_specificatie
('field_content', 'average')
('field_content', 'sum')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
 list(['1', '3', '2', 'B', 'A', 'III', 'II', 'I'])
 list(['Klasse I', 'Klasse III', 'Klasse II', 'Klasse B', 'Klasse A'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setti

kmfi-hrps voertuigklasse
('field_info', 'width')
('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the oper

3xwf-ince rupsbanden
('field_info', 'width')
('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
 list(['Caravan', 'Kampeerwagen', 'Voor rolstoelen toegankelijk voertuig', 'Overig voertuig voor speciale doeleinden', 'Mobiele kraan', 'Ambulance', 'Aanhangwagen vervoer van uitzond lading', 'Lijkwagen', 'Dolly', 'Gepantserd voertuig', 'Motorvoertuig vervoer uitzonderl lading', 'Multifunctionele werktuigdrager'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('

2ba7-embk subcategorie_voertuig
('field_info', 'width')
('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the oper

7ug8-2dtt bijzonderheden
('field_content', 'average')
('field_content', 'sum')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
 list(['Producent heeft herstel gemeld', 'Openstaande terugroepactie'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, 

t49b-isb7 terugroep_actie_status


[196, 328, <NA>]
Length: 3, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].met

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
vkij-7mwc keuringen


[140, 116, 193, 176, 222, 174, 328, <NA>, <NA>, 100, 100]
Length: 11, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
 list(['APK Lichte voertuigen', 'APK Zware voertuigen', 'Controleapparaten', 'Gasinstallaties', 'APK-Landbouw'])
 list(['periodieke controle', 'inbouw', 'manipulatie tacho', 'zegelverbreking tacho', 'uitbouw'])]' has dtype incompatible with float64, please expl

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
sgfe-77wx meldingen_keuringsinstantie


[121, 123, 155, 228, 143, 208, 154, 168, 174, <NA>, <NA>]
Length: 11, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
 list(['G3', 'G2', 'R1'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The be

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
sghb-dzxx toegevoegde_objecten


[340, 328, 292, 376, 352, 328, <NA>, <NA>]
Length: 8, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
hx2c-gt7k gebreken


[196, 163, 199, 175, 197, 207, 186, <NA>]
Length: 8, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
 list(['APK Lichte voertuigen', 'APK Zware voertuigen', 'Controleapparaten', 'APK-Landbouw', 'Gasinstallaties'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('f

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
a34c-vvps geconstateerde_gebreken


add info from reference tables

In [11]:
# Use reference table to add info
df_left = rdw_per_reg['geconstateerde_gebreken'].copy()
df_right = rdw_per_reg['gebreken'].copy()
on_column = 'gebrek_identificatie'

for left_column, left in df_left.loc[:, df_left.columns.str.startswith(on_column)].items():
    suffix = re.sub(on_column, '', left_column)
    df_merge = pd.merge(
        left=left.reset_index(),
        right=df_right,
        how='left',
        left_on=left_column,
        right_on=on_column,
    ).set_index('kenteken')
    df_merge = df_merge.drop(columns=[left_column, on_column, 'TimeStamp']).add_suffix(suffix)
    df_left = df_left.merge(df_merge, left_index=True, right_index=True)

# add extra table
rdw_per_reg['geconstateerde_gebreken_met_beschrijving'] = df_left
# clean up: 
#    remove reference table
del rdw_per_reg['gebreken']
#    remove table without description
del rdw_per_reg['geconstateerde_gebreken']

In [12]:
out = pd.concat(rdw_per_reg, axis=1)
display(out.loc[:, (slice(None), 'TimeStamp')].bfill(axis=0).iloc[-1,:].to_frame())

Unnamed: 0,Unnamed: 1,WF69JH
gekentekende_voertuigen,TimeStamp,2024-05-15 15:19:02
assen,TimeStamp,2024-05-15 15:19:02
brandstof,TimeStamp,NaT
carrosserie,TimeStamp,NaT
carrosserie_specificatie,TimeStamp,NaT
voertuigklasse,TimeStamp,NaT
rupsbanden,TimeStamp,NaT
subcategorie_voertuig,TimeStamp,NaT
bijzonderheden,TimeStamp,NaT
terugroep_actie_status,TimeStamp,NaT


In [13]:
# Save
file_name = f'{DATA_DIR}/auctions/enriched-results/rdw-reg/rdw-reg-full-0-data-{auction_month}-{month_counter}.pkl'

if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
    
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

/home/tom/bin/satdatsci/Saturday-Datascience/data/auctions/enriched-results/rdw-reg/rdw-reg-full-0-data-2024-05-09.pkl


<a href="#rdw_top" id='rdw_confcodes'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Conformity codes

In [14]:
# empty dict
rdw_per_confcode = dict()
# Conformity codes consists of four fields that make a composite key
conf = rdw_per_reg['gekentekende_voertuigen'][[
    'typegoedkeuringsnummer', 
    'uitvoering', 
    'variant', 
    'volgnummer_wijziging_eu_typegoedkeuring'
]].copy()
conf.dropna(inplace=True)

# Add shorter key "eu_type_goedkeuringssleutel"
conf = conf.merge(
    how='left', 
    right=long_to_short_conf(conf.typegoedkeuringsnummer).drop_duplicates(), 
    left_on='typegoedkeuringsnummer', right_index=True
)

# rename fields
conf.volgnummer_wijziging_eu_typegoedkeuring = conf.volgnummer_wijziging_eu_typegoedkeuring.astype('Int8').astype(str)
conf.rename(columns={
    'uitvoering': 'eeg_uitvoeringscode',
    'variant': 'eeg_variantcode',
    'volgnummer_wijziging_eu_typegoedkeuring': 'uitvoering_wijzigingsnummer',
}, inplace=True)
# drop nan
conf = conf.query('typegoedkeuringsnummer != "nan"')

# duplicates
display(
    conf.loc[:, conf.columns]\
    .reset_index()\
    .groupby('eu_type_goedkeuringssleutel')\
    .nunique()\
    .replace(1,np.NaN)\
    .dropna(how='all')\
    .fillna(1)\
    .astype(int)\
    .sort_values(by='kenteken', ascending=False)
)

key = 'conformity_codes'
rdw_per_confcode[key] = conf.reset_index().set_index(['eu_type_goedkeuringssleutel', 'eeg_variantcode', 'eeg_uitvoeringscode', 'uitvoering_wijzigingsnummer']).copy()

Unnamed: 0_level_0,kenteken,typegoedkeuringsnummer,eeg_uitvoeringscode,eeg_variantcode,uitvoering_wijzigingsnummer
eu_type_goedkeuringssleutel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e1*07/46*0607*26,2,1,2,2,1
e1*168/13*00265*01,2,1,1,2,1
e11*01/116*0238*06,2,1,1,1,1


In [15]:
short_confs = rdw_per_confcode['conformity_codes'].reset_index()\
.set_index('typegoedkeuringsnummer').eu_type_goedkeuringssleutel
full_confs = rdw_per_confcode['conformity_codes'].reset_index()\
.set_index('typegoedkeuringsnummer').loc[:, ['eu_type_goedkeuringssleutel', 'eeg_variantcode', 'eeg_uitvoeringscode', 'uitvoering_wijzigingsnummer']]

In [16]:
# Main conformity code api
Info = RdwInfo(short_confs, conf_api, hidden_api_keys.socrata_apptoken)
Info.process_api()
key = re.sub('\s', '_', Info.metadata_['name'].lower())
key = re.sub(f'^{from_key}_', '', key)
rdw_per_confcode[key] = Info.get_df().copy()
print(key)

[ 424,  436,  460,  436,  508,  400,  616,  520,  316,  208,  544,  472,  544,
  508,  268,  364,  424,  424, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>,  484,  460,
  496,  100,  604,  460,  460,  340,  460,  472,  100,  100,  460]
Length: 37, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top'

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
eeg_voertuigtypegoedkeuring


In [17]:
# Get sub apis from main api
from_key = 'eeg_voertuigtypegoedkeuring'
sub_apis,_,_ = get_sub_apis(rdw_per_confcode[from_key])
Info = RdwInfo(full_confs, sub_apis[0], hidden_api_keys.socrata_apptoken)
for api_name in sub_apis:
    Info.set_api_name(api_name)
    Info.process_api()
    key = re.sub('\s', '_', Info.metadata_['name'].lower())
    key = re.sub(f'^{from_key}_', '', key)
    rdw_per_confcode[key] = Info.get_df().copy()
    print(api_name, key)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
 list(['N', 'J', 'S', '4', '5', 'F']) list(['A', 'V', '7', '1', '3', '8'])
 list(['N', 'J', '1']) list(['N', 'J']) list(['J', 'N'])
 list(['N', 'J', '3', '1', '0', '7'])
 list(['N', 'J', 'W', '7', '#', '5', 'Z'])
 list(['N', 'J', 'R', 'G', 'F', 'T', 'L', 'P', 'M', 'E', 'K', 'H', 'Q', 'S', 'W', 'V', 'U', 'Y', 'C'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('

ahsi-8uyu as_gegevens_eeg_uitvoering


[ 424,  280,  328,  424,  436,  364,  496,  364,  352,  340,  268,  280,  256,
  280,  388,  448,  424,  364,  448,  364,  340,  388,  424,  316,  412,  424,
  400,  340,  328,  448,  448,  388,  388,  568,  568,  448,  448,  424,  424,
  448,  448,  460,  460,  556,  556,  568,  568,  556,  556,  436,  436,  520,
  520,  472,  472,  460,  460,  448,  508,  508,  520,  520,  520,  520,  520,
  520,  532,  532,  316,  460,  460,  472,  472,  436,  436,  496,  496,  520,
  520,  496,  496,  496,  496,  544,  544, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>,
 <NA>]
Length: 92, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' 

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
wx3j-69ie basisgegevens_eeg_uitvoering


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the oper

q7fi-ijjh carrosserie_uitvoering_klasse


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(uniq_values[is_int].apply(lambda x: [int(v) for v

w2qp-idms carrosserie_uitvoering


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(uniq_values[is_int].apply(lambda x: [int(v) for v

nypm-t8hx carrosserie_uitvoering_nummerieke_code


[424, 280, 328, 424, 220, 400, 484, <NA>]
Length: 8, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)'

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
mdqe-txpd handelsbenaming_uitvoering


[424, 280, 328, 424, 196, 364, <NA>]
Length: 7, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or d

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
fj7t-hhik merk_uitvoering_toegestaan


[ 424,  280,  328,  424,  220,  208,  364,  256,  508,  340,  292,  364,  352,
  232,  352,  388,  472,  424,  424,  364,  496,  304,  232,  232,  436,  436,
  256,  340,  412,  352,  412,  352,  304,  412,  340,  352,  400,  412,  424,
  436,  328,  328,  472,  472,  388,  388,  508,  508, <NA>]
Length: 49, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
 list(['C4', 'E4', 'I

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
g2s6-ehxa motor_uitvoering


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
 list(['D', 'B', 'E', 'H', 'G', 'A', 'C', 'W'])
 list(['5', '6', '4', '3', '2', 'Z', '0', '1']) list(['J', 'N'])
 list(['N', 'J']) list(['D', 'G', 'V', 'W', 'I', 'A'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because

5w6t-p66a motor_uitvoering_brandstof


[424, 280, 328, 424, 436, 484, 508, <NA>]
Length: 8, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  md.loc[:,fld] = md.loc[:,fld].astype(dtype)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)'

('field_content', 'not_null')
('field_content', 'average')
('field_content', 'sum')
mt8t-4ep4 plaatsaanduiding_uitvoering


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the oper

h9pa-e9ta subcategorie_uitvoering


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the oper

2822-t8sx uitvoering_gebruiksgegevens_per_uitgave


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
  md.loc[:, ('field_content', 'factors')].update(md.loc[is_cat&is_str, ('field_content', 'top')].apply(lambda x: [i['item'] for i in x]))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the oper

r7cw-67gs versnellingsbak_uitvoering


Merge dataframes from conformity codes apis

In [18]:
print('x: Data can be merged. (should be unique, 4 level key and contain data)')
full_codes = dict()
for k, df in rdw_per_confcode.items():
    if (k != 'conformity_codes') and (df.index.nlevels == 4) and (df.index.is_unique):
        assert df.index.names == ['eu_type_goedkeuringssleutel', 'eeg_variantcode', 'eeg_uitvoeringscode', 'uitvoering_wijzigingsnummer']
        df.reset_index(inplace=True)
        df.uitvoering_wijzigingsnummer = df.uitvoering_wijzigingsnummer.astype(int).astype(str)
        df.set_index(['eu_type_goedkeuringssleutel', 'eeg_variantcode', 'eeg_uitvoeringscode', 'uitvoering_wijzigingsnummer'], inplace=True)
        full_codes[k] = df
        print(f'[x] {k:64s}', end='')
    else:
        print(f'[ ] {k:64s}', end='')

    print({True: '[idx: unique    ]', False: '[idx: NOT unique]'}[df.index.is_unique],
          f'[keys: {df.index.nlevels}]', 
          f'[shape: {df.shape[0]:3.0f},{df.shape[1]:3.0f}]'
         )

x: Data can be merged. (should be unique, 4 level key and contain data)
[ ] conformity_codes                                                [idx: NOT unique] [keys: 4] [shape: 102,  2]
[ ] eeg_voertuigtypegoedkeuring                                     [idx: unique    ] [keys: 1] [shape:  99, 32]
[x] as_gegevens_eeg_uitvoering                                      [idx: unique    ] [keys: 4] [shape:  87, 37]
[x] basisgegevens_eeg_uitvoering                                    [idx: unique    ] [keys: 4] [shape: 100, 53]
[ ] carrosserie_uitvoering_klasse                                   [idx: unique    ] [keys: 1] [shape:   0,  1]
[x] carrosserie_uitvoering                                          [idx: unique    ] [keys: 4] [shape:  76,  4]
[ ] carrosserie_uitvoering_nummerieke_code                          [idx: unique    ] [keys: 1] [shape:   0,  1]
[x] handelsbenaming_uitvoering                                      [idx: unique    ] [keys: 4] [shape: 100,816]
[x] merk_uitvoering_toeg

In [19]:
# merge
out = pd.concat(full_codes, axis=1)
display(out.loc[:, (slice(None), 'TimeStamp')].bfill(axis=0).iloc[0,:].to_frame())

Unnamed: 0_level_0,Unnamed: 1_level_0,e1*01/116*0217*17
Unnamed: 0_level_1,Unnamed: 1_level_1,LBLF_F1
Unnamed: 0_level_2,Unnamed: 1_level_2,FM6AG005R8P60T40GG
Unnamed: 0_level_3,Unnamed: 1_level_3,0
as_gegevens_eeg_uitvoering,TimeStamp,2024-05-15 15:19:22
basisgegevens_eeg_uitvoering,TimeStamp,2024-05-15 15:19:26
carrosserie_uitvoering,TimeStamp,2024-05-15 15:19:30
handelsbenaming_uitvoering,TimeStamp,2024-05-15 15:19:40
merk_uitvoering_toegestaan,TimeStamp,2024-05-15 15:19:44
motor_uitvoering,TimeStamp,2024-05-15 15:19:47
motor_uitvoering_brandstof,TimeStamp,2024-05-15 15:19:52
plaatsaanduiding_uitvoering,TimeStamp,2024-05-15 15:19:55
uitvoering_gebruiksgegevens_per_uitgave,TimeStamp,2024-05-15 15:20:05
versnellingsbak_uitvoering,TimeStamp,2024-05-15 15:20:08


In [20]:
# Save
file_name = f'{DATA_DIR}/auctions/enriched-results/rdw-conf/rdw-conf-0-data-{auction_month}-{month_counter}.pkl'

if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
    
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

/home/tom/bin/satdatsci/Saturday-Datascience/data/auctions/enriched-results/rdw-conf/rdw-conf-0-data-2024-05-09.pkl


<a href="#rdw_top" id='rdw_ovi'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Data from rdw website (OVI)
Optionally get data from rdw website

In [21]:
if OVIDATA == False:
    rdw_ovi = None
else:
    regs = rdw_per_reg['registrations'].Reg.to_list()
    Info = OviInfo(regs, verbose=VERBOSE)
    Info.process_api()
    print(Info)

    rdw_ovi = Info.data_.copy()
    # make fields lowercase and add "ovi_"
    rdw_ovi.index.name='kenteken'
    rdw_ovi.columns = [re.sub(r'([A-Z])',r'_\1', c).lower() if c != 'TimeStamp' else c for c in rdw_ovi.columns] # after capital, add _ 
    rdw_ovi.columns = [re.sub(r'^_','', c) for c in rdw_ovi.columns] # remove trailing _
    # Basic operations
    rdw_ovi = pd.concat(
        [rdw_ovi, 
         rdw_ovi.eigenaren.str.split('/', expand=True).rename(columns = {0: 'eigenaren_private', 1: 'eigenaren_company'}).astype('Int8')
        ], axis=1)
    rdw_ovi['eigenaren_total'] = rdw_ovi.eigenaren_private + rdw_ovi.eigenaren_company
    #rdw_ovi['ovi_wachten_op_keuring_ind'] = rdw_ovi.ovi_wachten_op_keuring.apply(lambda x: {'Ja': True, 'Nee': False}[x] if isinstance(x, str) else x).astype('boolean')

    if VERBOSE > 1:
        rdw_ovi
    else:
        print(rdw_ovi.shape)

This takes approximately 5 minutes. Done by 17:25:58.


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0-9,[36/38]51MDNL,[39/43]MTDZ66,[36/38]FGZ01Z,[35/38]DTL50P,[33/38]FDB43N,[35/38]13MFFV,[34/38]FJR06B,[30/38]DZJ51T,[33/38]79MJDX,[34/38]FJP65B
10-19,[33/38]FGJ17K,[33/38]77MLLV,[35/38]19MGBP,[0/0]WTR398,[35/38]DLZ50D,[35/38]78MDRN,[35/38]MVXL42,[35/38]FDF50B,[36/38]D969LK,[36/38]DKB50G
20-29,[35/38]D344DZ,[33/39]FKF54G,[33/38]66MNGV,[34/38]82MRDK,[34/38]MLSX19,[32/38]MZDT40,[41/47]S221TV,[44/47]R640FL,[37/47]XF807H,[43/47]68TVZB
30-39,[43/47]H445BZ,[42/47]82PDKX,[42/47]42LNJB,[42/52]J935NG,[44/47]33LFG8,[44/47]J810KP,[43/47]2KZB84,[44/47]8KPJ85,[43/47]32XSFR,[44/47]36XHP6
40-49,[29/35]WP90NP,[45/56]V798DN,[51/57]3VVV29,[44/47]87JJB2,[43/47]GJ816F,[43/47]51ZBGB,[43/47]7XPS43,[41/47]98KXL8,[42/47]93DZLP,[43/47]39ZDVS
50-59,[42/47]55XHTB,[44/47]GX112Z,[43/47]34LFJZ,[42/47]RT209N,[29/34]WF53XT,[38/47]16SKD3,[44/47]92TTH4,[40/47]77TLKL,[45/61]60VHH5,[55/62]4VKZ29
60-69,[47/52]01PZN7,[40/56]9VPJ35,[43/47]97PLF2,[29/34]WV39LY,[45/47]KH080N,[42/47]89XTJV,[45/52]66GHG1,[43/47]71RKVK,[44/47]37LVX1,[48/57]96BBF8
70-79,[51/57]9VVS67,[34/38]FJ236B,[44/47]47TRPK,[43/47]N767RL,[46/47]2XZJ39,[44/47]9ZTR41,[43/47]KF384Z,[41/56]61VRT3,[43/47]GP642T,[42/47]43NFRF
80-89,[44/47]JL047K,[43/47]2KZS69,[45/52]75JNV3,[40/47]96RFRS,[45/47]JP333T,[43/47]55XSTN,[44/47]KX815P,[43/47]H722JG,[42/47]26NHNK,[44/47]R107GS
90-99,[43/47]06TNSG,[45/47]6SKR46,[41/56]96VTD3,[44/47]SV141J,[37/47]N002PF,[40/47]P183BT,[41/47]92LZF9,[52/57]8VZB94,[43/47]38XGDP,[44/47]TF424D


Class contains
	current_reg_ <class 'str'>:
		None
	data_ <class 'pandas.core.frame.DataFrame'>:
		shape=(122, 71)
	idx_ <class 'list'>:
		len=122
	metadata_ <class 'dict'>:
		contains fields ['header']
	metadata_.header <class 'str'>:
		{'Cache-Control': 'private', ' .. ; Domain=.ovi.rdw.nl; Secure'}
	resp_status_ <class 'str'>:
		200
	tree_ <class 'str'>:
		<Element html at 0x7fb5bd57c450>
	verbose_level_ <class 'str'>:
		1
(122, 74)


In [22]:
out = rdw_ovi.copy()

In [23]:
# Save
file_name = f'{DATA_DIR}/auctions/enriched-results/rdw-ovi/rdw-ovi-0-data-{auction_month}-{month_counter}.pkl'

if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
    
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

/home/tom/bin/satdatsci/Saturday-Datascience/data/auctions/enriched-results/rdw-ovi/rdw-ovi-0-data-2024-05-09.pkl


<a href="#rdw_top" id='rdw_merge'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Data from The National Highway Traffic Safety Administration (NHTSA)
Based on VIN. Product Information Catalog and Vehicle Listing (vPIC)
https://vpic.nhtsa.dot.gov/api/

In [24]:
from vin_lookup import Nhtsa_batch

In [25]:
# empty dictionary
nhtsa_per_vin = dict()

In [26]:
key = 'vpic'
df_ =  drz.loc[:, ['Vin', 'Mfyear']].copy().replace({'': np.NaN, 'onbekend': np.NaN}) # copy from drz

# borrow mfyear from rdw info
rdw_mfy = pd.merge(  left = rdw_per_reg['registrations'].reset_index(),
                     right = rdw_per_reg['gekentekende_voertuigen'].datum_eerste_toelating.reset_index(),
                     how='left',
                     right_on='kenteken',
                     left_on='kenteken'
                    ).loc[:, ['lot_index', 'datum_eerste_toelating']].set_index('lot_index')
df_ =  pd.concat([df_, (rdw_mfy // 10000).astype(pd.Int16Dtype())], axis=1)
df_.update(df_.loc[:, ['Mfyear', 'datum_eerste_toelating']].bfill(axis=1))
df_.rename(columns={'Vin': 'VIN', 'Mfyear': 'MFY'}, inplace=True)
nhtsa_per_vin[key] = df_.loc[:, ['VIN', 'MFY']]

# lookup vins in batches
Batch = Nhtsa_batch(nhtsa_per_vin[key].iloc[:,:2].dropna(subset='VIN'), 
                    data_dict_fn = f"{cfg['FILE_LOCATION']['code_dir']}/assets/nhtsa-data-dict.csv",
                    verbose=VERBOSE)
Batch.full_parse()
out = Batch.data.copy()

# store in dict
nhtsa_per_vin[key] = pd.concat([
    nhtsa_per_vin[key],
    out.drop(columns=out.columns[out.columns.str.startswith('system') | out.columns.str.startswith('internal')])
], axis=1)

if VERBOSE > 1:
    display(nhtsa_per_vin[key])
else:
    print('\n'.join(nhtsa_per_vin.keys()))

batch [1/3]


  df_.update(df_.loc[:, ['Mfyear', 'datum_eerste_toelating']].bfill(axis=1))
  add = pd.read_json(rsp)
  add.replace({'Not Applicable': np.NaN, '': np.NaN}, inplace=True)


batch [2/3]


  add = pd.read_json(rsp)
  add.replace({'Not Applicable': np.NaN, '': np.NaN}, inplace=True)


batch [3/3]
Class contains
	batch_size_ <class 'str'>:
		50
	data_dict_ <class 'pandas.core.frame.DataFrame'>:
		shape=(140, 8)
	nbatches_ <class 'str'>:
		3
	succes_ <class 'str'>:
		True
	url_ <class 'str'>:
		https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVINValuesBatch
	verboselevel_ <class 'str'>:
		1
	vins_ <class 'pandas.core.series.Series'>:
		shape=(140,)
data (size): (140, 154)
vpic


  add = pd.read_json(rsp)
  add.replace({'Not Applicable': np.NaN, '': np.NaN}, inplace=True)


In [27]:
# Merge with input
nhtsa_per_vin['vpic'].index.name = 'lot_index'
Batch.data.index.name = 'lot_index'
df_vins = pd.concat([
    nhtsa_per_vin['vpic'].loc[:, ['VIN', 'MFY']], 
    Batch.data
], axis = 1)
# add timestamp
df_vins.loc[:, 'TimeStamp'] = pd.Timestamp.now().strftime('%Y%m%d')
df_vins = df_vins.reset_index().set_index(['VIN', 'MFY'])

In [28]:
# Save
file_name = f'{DATA_DIR}/auctions/enriched-results/nhtsa-vpic/nhtsa-vpic-0-data-{auction_month}-{month_counter}.pkl'

if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
    
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

/home/tom/bin/satdatsci/Saturday-Datascience/data/auctions/enriched-results/nhtsa-vpic/nhtsa-vpic-0-data-2024-05-09.pkl


# Merge datasets
Now we have a couple of dataset that can be merged.

`rdw_per_reg` dictionary  
`rdw_per_confcode` combined in another dictionary `full_codes`  
`rdw_ovi` a single dataframe  
`nhtsa_per_vin` has only one field and is combined in dataframe `df_vins`  


1. Merge dataframes from `rdw_per_reg` with primary key `kenteken`
2. 
    1) Add conformity codes from `rdw_per_confcode`
    2) Add basic conformity info from `rdw_per_confcode.eeg_voertuigtypegoedkeuring`
3. Merge all conformity code information from other apis `full_codes`
4. Merge with OVI
5. Merge with vpic (nhtsa)
6. Merge with auction results


In [29]:
# Merge first set
rich = pd.concat(rdw_per_reg, axis=1)
rich = pd.concat([rich], keys=['rdw'], axis=1)
rich.index.name='kenteken'
print(rich.shape, )

# Add conformity_codes
codes = rdw_per_confcode['conformity_codes'].reset_index().set_index('kenteken')
codes = pd.concat([codes], keys=['conformity_codes'], axis=1)
codes = pd.concat([codes], keys=['rdw'], axis=1)
rich = rich.merge(
    codes, # add level
    how='outer',
    left_index = True,
    right_index = True,
)
print(rich.shape)

# Add basic conformity info
codes = rdw_per_confcode['eeg_voertuigtypegoedkeuring'].reset_index()
# add levels
codes = pd.concat([codes], keys=['eeg_voertuigtypegoedkeuring'], axis=1)
codes = pd.concat([codes], keys=['rdw'], axis=1)
rich = rich.reset_index().merge(
    codes, 
    how='outer',
    left_on = [('rdw', 'conformity_codes', 'typegoedkeuringsnummer')],
    right_on = [('rdw', 'eeg_voertuigtypegoedkeuring', 'typegoedkeuringsnummer')],
).set_index('kenteken')
print(rich.shape)

# Add full_codes
codes = pd.concat(full_codes, axis=1)#.drop(columns='conformity_codes')
codes = pd.concat([codes], keys=['rdw'], axis=1)
rich = rich.merge(
    codes,
    how='outer',
    left_on = [('rdw', 'conformity_codes', c) for c in codes.index.names],
    right_index=True
)
print(rich.shape)

# Add ovi
codes = pd.concat([rdw_ovi], keys=['ovi'], axis=1)
codes = pd.concat([codes], keys=['rdw'], axis=1)
rich = rich.merge(
    codes,
    how='outer',
    left_index = True,
    right_index = True,
)
print(rich.shape)

# Add vpic
codes = pd.concat(nhtsa_per_vin, axis=1)
codes = pd.concat([codes], keys=['nhtsa'], axis=1)
rich = rich.reset_index().merge(
    codes,
    how='outer',
    left_on = [('rdw', 'registrations', 'lot_index')],
    right_index = True,
).set_index(('rdw', 'registrations', 'lot_index'))
rich.index.name = 'lot_index'
print(rich.shape)

(122, 694)
(122, 699)
(122, 732)
(122, 2311)
(122, 2385)
(148, 2517)


In [30]:
rich.columns.map(lambda x: '_'.join(x))
rich

Unnamed: 0_level_0,kenteken,rdw,rdw,rdw,rdw,rdw,rdw,rdw,rdw,rdw,...,nhtsa,nhtsa,nhtsa,nhtsa,nhtsa,nhtsa,nhtsa,nhtsa,nhtsa,nhtsa
Unnamed: 0_level_1,Unnamed: 1_level_1,registrations,registrations,gekentekende_voertuigen,gekentekende_voertuigen,gekentekende_voertuigen,gekentekende_voertuigen,gekentekende_voertuigen,gekentekende_voertuigen,gekentekende_voertuigen,...,vpic,vpic,vpic,vpic,vpic,vpic,vpic,vpic,vpic,vpic
Unnamed: 0_level_2,Unnamed: 1_level_2,Reg,LotType,voertuigsoort,merk,handelsbenaming,vervaldatum_apk,datum_tenaamstelling,inrichting,aantal_zitplaatsen,...,engine___turbo,engine___valvetraindesign,general___vehicletype,exterior_dimension__wheelbaselong,exterior_dimension__wheelbaseshort,exterior_body__wheelbasetype,exterior_wheeltire__wheelsizefront,exterior_wheeltire__wheelsizerear,exterior_wheeltire__wheels,exterior_body__windows
lot_index,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2024-09-1800,51MDNL,51-MD-NL,Motorfiets,Motorfiets,KAWASAKI,Z1000 ABS,,20240424.0,Niet geregistreerd,2.0,...,,,,,,,,,,
2024-09-1801,MTDZ66,MT-DZ-66,Motorfiets,Motorfiets,SUZUKI,GSX-R750,,20240424.0,Niet geregistreerd,2.0,...,,,,,,,,,,
2024-09-1802,FGZ01Z,FGZ-01-Z,Bromfiets,Bromfiets,PIAGGIO,,,20240404.0,N.v.t.,2.0,...,,,,,,,,,,
2024-09-1803,DTL50P,DTL-50-P,Bromfiets,Bromfiets,PIAGGIO,VESPA SPRINT,,20240404.0,N.v.t.,2.0,...,,,MOTORCYCLE,,,,,,,
2024-09-1805,FDB43N,FDB-43-N,Bromfiets,Bromfiets,PIAGGIO,ZIP,,20240424.0,N.v.t.,2.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-7146,X266LV,X-266-LV,Personenauto,Personenauto,VOLKSWAGEN,POLO,20250216.0,20240418.0,hatchback,5.0,...,,,PASSENGER CAR,,,,,,,
2024-09-7148,KR194J,KR-194-J,Personenauto,Personenauto,PEUGEOT,308,20241105.0,20240513.0,stationwagen,7.0,...,,,PASSENGER CAR,,,,,,,
2024-09-7151,6VXS40,6-VXS-40,Bedrijfswagen,Bedrijfsauto,VOLKSWAGEN,CADDY,20240911.0,20240510.0,gesloten opbouw,2.0,...,,,TRUCK,,,,,,,
2024-09-9600,WF69JH,WF-69-JH,Boottrailer (Dubbelasser),Aanhangwagen,ATLANTA,AT12G-C,,20240403.0,voor vervoer boten,,...,,,,,,,,,,


In [31]:
existing = pd.concat([drz], keys=[''], axis=1)
existing = pd.concat([existing], keys=['drz'], axis=1)
enriched = pd.merge(
    left = existing,
    right = rich,
    how = 'left',
    left_index = True,
    right_index = True
)

<a href="#rdw_top" id='rdw_save'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Saving

In [32]:
file_name = f'{DATA_DIR}/auctions/enriched-results/rdw-data-{auction_month}-{month_counter}.pkl'
if NO_PRICE:
    file_name = file_name.replace('.pkl', '-without-price.pkl')
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
    
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    print(file_name)
    enriched.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

/home/tom/bin/satdatsci/Saturday-Datascience/data/auctions/enriched-results/rdw-data-2024-05-09.pkl


# Next: download images (or parallel)

Because images might be taken down from the drz site, it is advisable to run the notebook that downloads images soon.