# Preprocess car data

In [1]:
import sys
import os
import re
import json

In [2]:
with open('../assets/drz-settings-current.json', 'r') as fid:
    cfg = json.load(fid)
print(cfg['AUCTION'])

OPBOD = cfg['AUCTION']['kind'] == 'opbod'
AUCTION_ID = cfg['AUCTION']['id']
DATA_DIR = cfg['FILE_LOCATION']['data_dir']
VERBOSE = int(cfg['GENERAL']['verbose'])
SAVE_METHOD = cfg['GENERAL']['save_method']


{'kind': 'inschrijving', 'id': '2024-0010', 'date': '20240525'}


In [3]:
if SAVE_METHOD == 'skip_when_exist':
    do_save = lambda fn: not(os.path.isfile(fn))
elif SAVE_METHOD == 'always_overwrite':
    do_save = lambda _: True
elif SAVE_METHOD == 'skip_save':
    do_save = lambda _: False
else:
    raise NotImplementedError(f'SAVE_METHOD: {SAVE_METHOD} not implemented')

In [4]:
TAG_SINGLE = "nbconvert_instruction:remove_single_output"

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os

In [6]:
# progress bar
def _prog(display_id, cur, end, extra_info = 'running', bar_len = 40):
    pct = cur/end
    bar = ''.join(
        ['|'] * int(bar_len*pct) +
        ['-'] * int(bar_len*(1-pct)) 
    ) + f'{pct*100:3.0f}% [{extra_info}]'
    if display_id is None:
        display_id = display({'text/plain': ''}, raw = True, display_id=True)
    display_id.update({'text/plain': bar}, raw = True)
    
    return display_id

## Example
# display_id = None
# display_id = _prog(display_id, 0, 1337)
# display_id = _prog(display_id, 42, 1337)
# display_id = _prog(display_id, 42, 137, '137')

In [7]:
def _split_indexnr(c):
    M = re.match(r'^(((rdw)|(nhtsa))_[a-z,_,0-9]+?)((_[0-9]+)+)$', c)
    if M is None:
        return None
    
    return M[0], M[1], M[5]

# for c in ['rdw_motor_uitvoering_brandstof_emissie_hc_en_nox_type1_1_1',
#           'rdw_motor_uitvoering_brandstof_emissie_co2_buitenweg_1_1',
#           'rdw_motor_uitvoering_cilinderinhoud_cm3_1',
#           'rdw_handelsbenaming_uitvoering_handelsbenaming_fabrikant_10_100',
#           'rdw_motor_uitvoering_cilinderinhoud_cm3_1',
#           'rdw_motor_uitvoering_cilinderinhoud_cm3_2',
#           'rdw_motor_uitvoering_cilinderinhoud_cm3_3',
#           'rdw_motor_uitvoering_brandstof_emissie_stikstofoxide_type_1_1_1',
#           'rdw_motor_uitvoering_brandstof_brandstofverbruik_stadsrit_3_1_1',
#           'rdw_motor_uitvoering_brandstof_brandstofverbruik_stadsrit_3_1',
#           'nhtsa_motor_uitvoering_brandstof_brandstofverbruik_stadsrit_3_1'          
#          ]:
#    print(_split_indexnr(c))

## Load data

In [8]:
if OPBOD:
    fn = f'{DATA_DIR}/cars-from-all-auctions-opbod.pkl'
else:
    fn = f'{DATA_DIR}/cars-from-all-auctions.pkl'
print(fn)
car = pd.read_pickle(fn)

if VERBOSE > 0:
    display(car.tail(), metadata={"tags":(TAG_SINGLE, )})

/home/tom/bin/satdatsci/Saturday-Datascience/data/cars-from-all-auctions.pkl


In [9]:
data_types = pd.read_excel(f"{cfg['FILE_LOCATION']['code_dir']}/assets/fieldname-lookuptable.ods", 
                           sheet_name='data_type', 
                           header=0, 
                           skipfooter=0, 
                           dtype=str, 
                           index_col='final_column_name', 
                           usecols=['final_column_name', 'prefix', 'source_id', 'index_levels', 'nullable',
                                   'modifier_name1', 'modifier_name2', 'modifier_name3', 'modifier_name4', 'modifier_name5'])
data_types = data_types.dropna(how='all', axis=0).dropna(how='all', axis=1)
data_types.drop_duplicates()

Unnamed: 0_level_0,prefix,source_id,index_levels,nullable,modifier_name1,modifier_name2,modifier_name3,modifier_name4,modifier_name5
final_column_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
drz__Source,drz__,100,0,True,str,,,,
drz__Price,drz__,100,0,True,int,zero2null,,,
drz__Draw,drz__,100,0,False,nan2null,false_ifnull,,,
drz__Raw_text,drz__,100,0,True,string_to_list_rawtext,,,,
drz__N_images,drz__,100,0,True,int,,,,
...,...,...,...,...,...,...,...,...,...
rdw_carrosserie_uitvoering_nummerieke_code_TimeStamp,rdw_carrosserie_uitvoering_nummerieke_code_,322,0,,,,,,
rdw_motor_uitvoering_brandstof_emissie_koolmonaxide_etc_test_X_X,rdw_motor_uitvoering_brandstof_,307,2,,,,,,
rdw_subcategorie_nederland,,,0,True,str,,,,
rdw_subcategorie_uitvoering_composite_key_X,rdw_subcategorie_uitvoering_,324,1,,,,,,


In [10]:
cols = []
for col in car.columns:
    c = _split_indexnr(col)
    if c is not None:
        col = c[1] + re.sub('\d+', 'X', c[2])
    cols += [col]
print('exist in data, but no cast information available')
for c in set(cols).difference(data_types.index):
    if (c.endswith('_X')) and (c+'_X' in data_types.index):
        print('\t + _X', c)
    if (c.endswith('_X')) and (c[:-2] in data_types.index):
        print('\t - _X', c)
    
    else:
        print('\t', c)
print('cast information available, but not in data')
for c in set(data_types.index).difference(cols):
    print('\t', c)

exist in data, but no cast information available
	 + _X rdw_toegevoegde_objecten_merkcode_toegevoegd_object_X_X
	 rdw_toegevoegde_objecten_merkcode_toegevoegd_object_X_X
	 + _X rdw_toegevoegde_objecten_soort_toe_te_voegen_object_omschrijving_X_X
	 rdw_toegevoegde_objecten_soort_toe_te_voegen_object_omschrijving_X_X
	 - _X rdw_meldingen_keuringsinstantie_soort_erkenning_keuringsinstantie_X_X
	 - _X rdw_meldingen_keuringsinstantie_meld_tijd_door_keuringsinstantie_X_X
	 - _X rdw_meldingen_keuringsinstantie_api_gebrek_constateringen_X_X
	 - _X rdw_meldingen_keuringsinstantie_meld_datum_door_keuringsinstantie_X_X
	 + _X rdw_carrosserie_uitvoering_nummerieke_code_carrosserie_uitvoering_numeriek_europees_X_X
	 rdw_carrosserie_uitvoering_nummerieke_code_carrosserie_uitvoering_numeriek_europees_X_X
	 + _X rdw_carrosserie_uitvoering_nummerieke_code_carrosserie_volgnummer_X_X
	 rdw_carrosserie_uitvoering_nummerieke_code_carrosserie_volgnummer_X_X
	 + _X rdw_toegevoegde_objecten_demontagedatum_X_X

## Adhoc repair

When all fails

In [11]:
if OPBOD:
    print('skip')
else:
    print(car.shape)
    # '2603 Afkomstig van JFC HQ Brunssum.'
    ix='2017-5-2603'
    car.loc[ix,"drz__LotNr"] = '2603'
    car.loc[ix, 'drz__jfc'] = True
    # K2000098227 Afkomstig van JFC HQ Brunssum.
    ix='2020-9-8227'
    car.loc[ix,"drz__LotNr"] = '8227'
    car.loc[ix, 'drz__jfc'] = True
    
    # "bouwjaar verklaring noodzakelijk."
    # car.loc[[v=="verklaring noodzakelijk." for v in car.Mfyear]]
    # car[car.Mfyear.str.contains('verklaring noodzakelijk.')==True]
    # ixs = ['2020-1-7177']
    # for ix in ixs:
    #     car.loc[ix, 'drz__Mfyear'] = ''

    # fix issus with one lot that has no type
    # print(car.drz__ItemType[~ (car.drz__ItemType.apply(type) == str)])
    # [print(l) for l in eval(car.loc['2017-6-7121','Raw_text'])]
    # car.loc['2017-6-7121','Images']
    ix = '2017-6-7121'
    car.loc[ix,'drz__ItemType'] = 'golf'
    #car.loc[ix,'drz__model'] = car.loc[ix,'drz__ItemType'] 

    # car.loc[car.drz__ItemBrand == 'Kampeerwagen/camper',:]
    # car.loc["2017-5-2408",:]
    ix = '2017-5-2408'
    #car.loc[ix,'drz__ItemBrand'] = 'VOLKSWAGEN'
    car.drop(ix,inplace=True) # remove alltogether
    
    # car.loc[car.drz__ItemType == 'benz',:]
    # car.loc[ix,'Images']
    # This is a w204 mfyear < 2011
    ix = '2017-5-2618'
    car.loc[ix,'drz__ItemType'] = 'c cdi'

    # car.loc["2018-1-3046","Raw_text"]
    # This is combined lot
    ix = "2018-1-3046"
    car.drop(ix,inplace=True) # remove alltogether
    
    # car.ForeignReg=='Het voertuig is voorzien van taxi-kentekenplaten. Taxiregistratie kunt u laten be&#235;indigen via de RDW. Vervanging van de blauwe door gele'
    # Taxi
    ixs = ['2017-11-8302', '2017-11-8305', '2018-1-8163', '2018-4-2404']
    regs = ['54-GLL-5','57-XZ-FV','70-TLF-3','JP-934-S']
    for ix,reg in zip(ixs,regs):
        car.loc[ix,'drz__taxi'] = True
        car.loc[ix,'drz__Reg'] = reg

    # Typo in registration K1900022009
    # 8-SKL-15 not 8-SLK-15
    # sel = car.loc[:,'drz__Reg'] == '8-SLK-15'
    # car.loc[sel,'drz__Reg'] 
    ix = '2019-2-2009'
    car.loc[ix,'drz__Reg'] = '8-SKL-15'
    car.loc[ix, [c for c in car.columns if c.startswith('rdw_')]] = None
    
    # NAP is provided first and impacts Odometer reading
    # sel = car.drz__OdoKM.str.contains('logisch').fillna(False)
    # car.loc[sel, ['drz__NAP', 'drz__OdoKM', 'drz__Raw_text']]
    ixs = ['2019-9-9106', '2019-9-9249']
    naps = ['logisch', 'onlogisch']
    kms = ['251.571', '']
    for ix,nap,km in zip(ixs,naps,kms):
        car.loc[ix,'drz__NAP'] = nap
        car.loc[ix,'drz__OdoKM'] = km
        
    # Text in lot was missing a character: "58.83"
    ix = '2020-12-7138'
    car.loc[ix, 'drz__OdoKM']
    car.loc[ix, 'drz__OdoKM'] = '58.683'
    #car.loc[ix, 'drz__OdoKM_num'] = 58683

    # date format is different
    # car.loc[car.drz__Mfdate.str.contains('-').fillna(False), 'drz__Mfdate']
    ix = '2020-12-7263'
    car.loc[ix, 'drz__Mfdate'] = car.loc[ix, 'drz__Mfdate'].replace('-', '.')
    
    # Outside looks just like audi 80 estate
    # car.loc[car.drz__modelspec.str.lower().str.contains('porsche').fillna(False), ['drz__ItemBrand', 'drz__ItemType', 'drz__model', 'drz__modelspec']]
    ix = '2021-05-8098'
    car.loc[ix, 'drz__ItemType'] = '80; avant rs2 232 kw audi/porsche'

    # 206+ has different front (like 207)
    # car.loc[car.rdw_gekentekende_voertuigen_typegoedkeuringsnummer.str.startswith('e2*2001/116*0374').fillna(False), 
    #         car.columns.str.contains('Item') +
    #         car.columns.str.contains('model') +
    #         car.columns.str.contains('typegoedkeurings')
    # ]
    ix = '2021-11-705111'
    car.loc[ix, 'drz__ItemType'] = '206+'    
    #car.loc[ix,'drz__model'] = car.loc[ix,'drz__ItemType'] 

    # Date ends with a '.'
    # sel = car.loc[:, car.columns.str.startswith('drz__')].applymap(lambda x: x.endswith('.') if isinstance(x,str) else False).any()
    # car.loc[:, car.columns.str.startswith('drz__')].loc[:, sel]
    for c in ['drz__Mfdate', 'drz__APKdate']:
        ixs = car.loc[car.loc[:,c].str.endswith('.').fillna(False)].index
        car.update(car.loc[ixs, c].apply(lambda s: s[:-1]))
    
    # Model name is in brandname
    # car.loc[car.drz__ItemBrand.str.contains('307'), 
    #         car.columns.str.lower().str.contains('_vin') +
    #         car.columns.str.contains('_Reg') +
    #         car.columns.str.contains('Item') +
    #         car.columns.str.contains('drz__model') +
    #         (car.columns.str.contains('vpic') & car.columns.str.contains('make')) +
    #         (car.columns.str.contains('vpic') & car.columns.str.contains('model'))
    #        ]
    # reg = 'KV-05-47'; vin = 'VF33ERHYB83318541'
    # reg = 'LM-82-11'; vin = 'VF33CRHYB83116947'
    ixs = ['2023-04-810607', '2023-04-810707']
    for ix in ixs:
        car.loc[ix, 'drz__ItemBrand'] = 'PEUGEOT'
        car.loc[ix, 'drz__ItemType'] = '307'
        if car.loc[ix, 'drz__Reg'] == 'KV-05-47':
            car.loc[ix, 'drz__ItemType'] += '; sw'

    # no price, but no mention "Niet gegund" in bold
    # car[car.drz__Price < 0]
    ixs = ['2023-10-703920']
    for ix in ixs:
        car.loc[ix, 'drz__Price'] = 0
    # RDW registered this as "VW" not "VOLKSWAGEN"
    # car.loc[
    #     car.loc[:, ['rdw_gekentekende_voertuigen_merk', 'rdw_ovi_merk']].apply(lambda x: x.str.contains('VW')).any(axis=1)
    #     , ['rdw_gekentekende_voertuigen_merk', 'rdw_ovi_merk',
    #        'drz__ItemBrand',
    #        'nhtsa_vpic_general___make', 'brand', 
    #        'drz__ItemType', 
    #        'rdw_basisgegevens_eeg_uitvoering_handelsbenaming', 'rdw_gekentekende_voertuigen_handelsbenaming', 'rdw_ovi_handelsbenaming',
    #        'drz__model',
    #        'nhtsa_vpic_general___model', 'model',
    #       ]]
    ixs = ['2023-11-702122', '2024-02-703403']
    for ix in ixs:
        car.loc[ix, ['rdw_gekentekende_voertuigen_merk', 'rdw_ovi_merk']] = 'VOLKSWAGEN'                
                
    print(car.shape)

(11832, 3929)
(11830, 3929)


  ixs = car.loc[car.loc[:,c].str.endswith('.').fillna(False)].index
  ixs = car.loc[car.loc[:,c].str.endswith('.').fillna(False)].index


### Typecast

In [12]:

def int_1000sep46(values, nullable):
    #nan_vals = ['N.v.t.', 'nvt', 'n.v.t', 'onbekend', '', 'nan', np.NaN]
    assert isinstance(values, pd.Series), type(values)

    #nan_vals += ['volgens NAP logisch', 'volgens nap logisch', 'volgens NAP onlogisch', 'volgens nap onlogisch']
    out = values.copy()
    # replace to None
    #out.replace(nan_vals, None, inplace=True)
    # change thousand seperator
    out = out.apply(lambda x: re.sub('\,','.',re.sub('\.','',x)) if isinstance(x, str) else x)
    # change all to string
    out = out.apply(lambda x: int(x) if isinstance(x, float) and pd.notna(x) else x)
    out = out.apply(lambda x: str(x) if isinstance(x, int) and pd.notna(x) else x)
    
    if nullable:
        return out.astype('Int64')
    else:
        return out.astype('Int')

def mls_to_km(values, nullable, ml2km = 1.609344):
    assert isinstance(values, pd.Series), type(values)
    out = values.copy()
    return out.astype('Float64') * ml2km

def times10(values, nullable):
    assert isinstance(values, pd.Series), type(values)
    out = values.copy()
    return out * 10

def false_ifnull(values, nullable):
    #nan_vals = ['N.v.t.', 'nvt', 'n.v.t', 'onbekend', '', 'nan', np.NaN]
    assert isinstance(values, pd.Series), type(values)

    out = values.copy()
    # replace to False
    
    #out.replace(nan_vals, False, inplace=True)
    out.fillna(False, inplace=True)
    if nullable:
        return out.astype('Int8')
    else:
        return out.astype(bool)
    
def uniform_brand(values, nullable, replace_dict = {
    "ASTON MARTIN":"ASTON-MARTIN",
    'AUTO UNION':'AUDI',
    'JAGUAR CARS':'JAGUAR',
    "MERCEDES BENZ":"MERCEDES-BENZ",
    "MERCEDES":"MERCEDES-BENZ",
    "MERCDES-BENZ": "MERCEDES-BENZ",
    "DAIMLERCHRYSLER AG": "MERCEDES-BENZ",
    "MICRO COMPACT CAR SMART":"SMART",
    "MICRO COMPACT CAR":"SMART",
    "LANDROVER": "LAND ROVER",
    "LAND-ROVER": "LAND ROVER",
    "JAGUAR LAND ROVER": "LAND ROVER",
    "CITRO": "CITROËN",
    "CITROÃÂ\x8bN": "CITROËN",
    "CITROEN": "CITROËN",
    "G.M.C.": "GMC",
    "VOLKWAGEN": "VOLKSWAGEN",
    "TESLA MOTORS": "TESLA",
    "KAISER-JEEP": "JEEP",
    "AUTOMOBILI LAMBORGHINI S.": "LAMBORGHINI",
}):
    # Rename to conventional brand name

    assert isinstance(values, pd.Series), type(values)
    
    out = values.copy()
    out.replace(replace_dict, inplace=True)
    
    return out


def add_model_spec(s, spec):
    '''Adds specification at the end of the model name if not already in name'''
    # When brand name has a specification that needs to go in the model name.
    # E.g. Mercedes <AMG> and Audi <QUATTRO>
    import re 
    
    # addition should not exist
    if not re.search('(?i)' + spec, s):
        
        # add separator
        if not (s.endswith(';')):
            s += ';'
        # add specification
        s += ' ' + spec
    return s

def string_to_list_rawtext(s, nullable):
    '''Convert string to list'''
    if type(s) == str:
        s = s.strip(' ')
        if s.startswith("['") and s.endswith("']"):
            s=eval(s)
        else:
            NotImplementedError
        
    return s

In [13]:
def string_to_int_lotnr(s):
    '''Convert string to int'''
    if type(s) == str:
        s = s.strip(' ')
        if (s[0] == 'K') and (len(s) == 11):
            # "K1800092200"
            s=int(s[-4:])
        else:
            try:
                s=int(s)
            except ValueError:
                s=-1
    return s


In [14]:
car.drz__lot_counter = pd.concat([
    car.drz__lot_counter.astype('Int32'), 
    car.drz__LotNr.map(string_to_int_lotnr)
], axis=1).bfill(axis=1).loc[:, 'drz__lot_counter']


  car.drz__lot_counter = pd.concat([


In [15]:
if 2 > VERBOSE > 0:
    display_id = None
    
for i, fld in enumerate(car.columns[::-1]):

    # Rename _1 to _X
    split = _split_indexnr(fld)
    if split is not None:
        _, pt1, pt2 = split
        idx = pt1 + re.sub('\d+', 'X', pt2)
    else:
        idx = fld

    # hot fixes
    if idx in (
        'rdw_toegevoegde_objecten_demontagedatum_X_X',
        'rdw_toegevoegde_objecten_soort_toe_te_voegen_object_omschrijving_X_X',
        'rdw_toegevoegde_objecten_merk_object_toegevoegd_X_X',
        'rdw_toegevoegde_objecten_merkcode_toegevoegd_object_X_X',
        'rdw_toegevoegde_objecten_uitvoerings_volgnr_toegev_obj_X_X',
        'rdw_toegevoegde_objecten_montagedatum_dt_X_X',
        'rdw_toegevoegde_objecten_classificatie_toegevoegd_obj_X_X',
        'rdw_toegevoegde_objecten_gasinstallatie_tank_inhoud_X_X',
        'rdw_carrosserie_uitvoering_nummerieke_code_carrosserie_volgnummer_X_X',
        'rdw_carrosserie_uitvoering_nummerieke_code_carrosserie_uitvoering_numeriek_volgnummer_X_X',
        'rdw_carrosserie_uitvoering_nummerieke_code_carrosserie_uitvoering_numeriek_europees_X_X'
    ):
        idx += '_X'
        
    if idx in (
        'rdw_meldingen_keuringsinstantie_soort_erkenning_omschrijving_X_X',
        'rdw_meldingen_keuringsinstantie_meld_datum_door_keuringsinstantie_dt_X_X',
        'rdw_meldingen_keuringsinstantie_api_gebrek_beschrijving_X_X',
        'rdw_meldingen_keuringsinstantie_api_gebrek_constateringen_X_X',
        'rdw_meldingen_keuringsinstantie_meld_datum_door_keuringsinstantie_X_X', 
        'rdw_meldingen_keuringsinstantie_vervaldatum_keuring_dt_X_X', 
        'rdw_meldingen_keuringsinstantie_vervaldatum_keuring_X_X', 
        'rdw_meldingen_keuringsinstantie_meld_datum_door_keuringsinstantie_dt_X_X', 
        'rdw_meldingen_keuringsinstantie_soort_erkenning_keuringsinstantie_X_X', 
        'rdw_meldingen_keuringsinstantie_soort_melding_ki_omschrijving_X_X', 
        'rdw_meldingen_keuringsinstantie_meld_tijd_door_keuringsinstantie_X_X', 
        'rdw_meldingen_keuringsinstantie_soort_erkenning_omschrijving_X_X', 
    ):
        idx = idx[:-2]
               
        

    # apply modifiers
    mod_info = data_types.loc[idx]
    for mod_name in mod_info[mod_info.index.str.startswith('modifier_name')].dropna():
        if 2 > VERBOSE > 0:
            display_id = _prog(display_id, i, car.shape[1], f'\n\tapplying {mod_name}\n\t      to {fld}\n')
        
        # Modifiers
        if mod_name == 'add_suffix_num':
            car = pd.concat([car.drop(columns=[fld + '_num'], errors='ignore'), car.loc[:,[fld]].add_suffix('_num')], axis=1)
            fld += '_num'
            continue
            
        if mod_name == 'mls_to_km':
            car = pd.concat([car.drop(columns=[fld + '_KM'], errors='ignore'), car.loc[:,[fld]].add_suffix('_KM')], axis=1)
            fld += '_KM'

        if mod_name == 'brand_to_model':
            fld2 = 'drz__ItemType'
            rename_dict = {
                'QUATTRO': ('AUDI', 'quattro'),
                'MERCEDES-AMG': ('MERCEDES-BENZ', 'amg'),
                'ALPINA': ('BMW', 'alpina'),
                'BMW 3ER REIHE': ('BMW', '3er reihe'),
                'RANGE ROVER': ('LAND ROVER', 'range rover'),
                'FORD C MAX': ('FORD', 'c max'),
                'PORSCHE CAYENNE': ('PORSCHE', 'cayenne'),  
                'PORSCHE PANAMERA': ('PORSCHE', 'panamera'),
                'AUDI/PORSCHE': ('AUDI', 'porsche'),
                'AUDI A4': ('AUDI', 'a4'),
                'MINI COOPER': ('MINI', 'cooper')

            }
            for old, (new, spec) in rename_dict.items():
                sel = car.loc[:, fld] == old
                car.loc[sel, fld2] = car.loc[sel, fld2].apply(lambda s: add_model_spec(s,spec))
                car.loc[sel, fld] = new                
            continue
            
        if mod_name == 'trim_brand':
            fld2 = 'drz__ItemBrand'
            car.loc[:, fld] = car.loc[:, [fld2, fld]].apply(
                lambda x: re.sub(x[0]+' ', '', x[1], flags=re.I) 
                if (isinstance(x[1],str) and isinstance(x[0], str)) else x[1], axis=1
            )
            continue

        if mod_name == 'nap':
            fld2 = 'drz__no_nap'
            rn = {
                'volgens NAP logisch': False,
                'volgens nap logisch': False,
                'volgens NAP onlogisch': True,
                'volgens nap onlogisch': True
            }
            # add NAP score to correct field
            car.loc[:, fld2] = car.loc[:, fld2].combine_first(
                car.loc[: ,fld].apply(lambda x: rn[x] if x in rn.keys() else np.NaN)
            )
            # remove from field
            car.loc[:, fld].replace({k: np.NaN for k in rn.keys()}, inplace=True)
            continue
            
        if mod_name == 'abrv_mpv':
            rn = {'Multipurpose vehicle (MPV)': 'MPV'}
            car.loc[:, fld].replace(rn, inplace=True)
            continue
            
        if mod_name == 'split_spec':
            car = pd.concat([
                car,
                car.loc[:, fld].str.split(pat='; ?', expand=True, n=1).rename(columns={0: 'drz__model', 1: 'drz__modelspec'})
            ], axis=1)
            continue

        if mod_name == 'split_bodycode':
            # rdw_ovi_carrosserie_omschrijving "type (CODE)" -> "type"
            # rdw_ovi_carrosserie_carrosserietype -> "CODE"
            fld2 = 'rdw_ovi_carrosserie_carrosserietype'
            M = car.loc[:, fld].apply(lambda x: re.match('^(.*)\s\(([A-Z]+)\)$', x) if isinstance(x, str) else x)
            split = pd.concat([M.apply(lambda x: x[1] if isinstance(x, re.Match) else x), 
                               M.apply(lambda x: x[2] if isinstance(x, re.Match) else x)], axis=1, keys=[fld, fld2]).astype(car.loc[:, fld].dtype)
            car.update(split[fld])
            car = pd.concat([
                car, 
                split[fld2]
            ], axis=1)
            continue
        
        if mod_name == 'str':
            if mod_info.nullable == 'True':
                car.loc[:,[fld]] = car.loc[:,[fld]].fillna('') # empty if NaN
            car.loc[:,[fld]] = car.loc[:,[fld]].astype('string')
            continue

        if (mod_name == 'int') | (mod_name == 'int_yyyy') | (mod_name == 'int_kg') | (mod_name == 'int_cm') | (mod_name == 'euro_int') | (mod_name == 'int_kmph_fuel'):
            if mod_name == 'int_kg':
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda x: re.sub('\s*kg$', '', x) if isinstance(x, str) else x)
            if mod_name == 'int_cm':
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda x: re.sub('\s*cm$', '', x) if isinstance(x, str) else x)
            if mod_name == 'euro_int':
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda x: re.sub('^(€|(â¬))\s*', '', x) if isinstance(x, str) else x)
            if mod_name == 'int_kmph_fuel':
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda x: re.sub('\s*km/h\s*\(\w+\)$', '', x) if isinstance(x, str) else x)
                
                
            if mod_info.nullable == 'True':
                dtype = 'Int32'
                # drop non numerical 
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda x:float(np.NaN) if (not x.isnumeric() if isinstance(x,str) else False) else x).astype('Float64')
            else:
                dtype = 'int'
            
            car.loc[:,[fld]] = car.loc[:,[fld]].astype(dtype)
            continue
            
        if mod_name == 'bool':
            if mod_info.nullable == 'True':
                dtype = 'boolean'
            else:
                dtype = 'bool'
                # Waring! existing NA will imputed as True
            car.loc[:,[fld]] = car.loc[:,[fld]].astype(dtype)
            continue
        
        if mod_name.startswith('date_') |  mod_name.startswith('datetime_') |  mod_name.startswith('time_'):
            # <float> 20221225.0 
            car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda x: str(x)[:-2] if (isinstance(x,float) and (str(x)[-2:]=='.0')) else x)
            if mod_name == 'date_yyyymmdd':
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda t: pd.to_datetime(t, format='%Y%m%d') 
                                                             if (isinstance(t, str) and len(t)==8) else t)
            elif mod_name == 'date_dd47mm47yyyy':
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda t: pd.to_datetime(t, format=r'%d/%m/%Y') 
                                                             if (isinstance(t, str) and len(t)==10) else t)
            elif mod_name == 'date_dd46mm46yyyy':
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda t: pd.to_datetime(t, format=r'%d.%m.%Y') 
                                                             if (isinstance(t, str) and len(t)==10) else t)
                # trailing .
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda t: pd.to_datetime(t, format=r'%d.%m.%Y.') 
                                                             if (isinstance(t, str) and len(t)==11) else t)
                # short month
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda t: pd.to_datetime(t, format=r'%d.%m.%Y') 
                                                             if (isinstance(t, str) and len(t)==9) else t)
            elif mod_name == 'date_dd45mm45yyyy':
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda t: pd.to_datetime(t, format=r'%d-%m-%Y') 
                                                             if (isinstance(t, str) and len(t)==10) else t)
            elif mod_name == 'datetime_yyyy45mm45ddTHH58mm58ss45000':
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda t: pd.to_datetime(t, format=r'%Y-%m-%dT%H:%M:%S.%f') 
                                                             if (isinstance(t, str) and len(t)==23) else t)
            elif mod_name == 'time_HH58MM':
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda t: pd.to_datetime(t, format=r'%H:%M') 
                                                             if (isinstance(t, str) and len(t)==5) else t)
            else:
                NotImplementedError

            if mod_info.nullable == 'True':
                car.loc[:,[fld]] = car.loc[:,[fld]].fillna(pd.NaT)
            continue
        
        if mod_name in ('JN', 'YN', 'JaNee', 'YesNo'):
            if mod_name == 'JN':
                rn_dict = {'J': True, 'N': False}
            elif mod_name == 'JaNee':
                rn_dict = {'Ja': True, 'Nee': False}
            elif mod_name == 'YesNo':
                rn_dict = {'Yes': True, 'No': False}
            elif mod_name == 'YN':
                rn_dict = {'Y': True, 'N': False}
            else:
                NotImplementedError
            car.loc[:,[fld]] = car.loc[:,[fld]].replace(rn_dict)
            continue
            
        if (mod_name == 'float') | (mod_name == 'float_kwpkg'):
            if mod_name == 'float_kwpkg':
                car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda x: re.sub('\s*kW/kg$', '', x) if isinstance(x, str) else x)
            
            car.loc[:,[fld]] = car.loc[:,[fld]].astype(float)
            continue
        if mod_name == 'str_upper':
            car.loc[:,[fld]] = car.loc[:,[fld]].apply(lambda x: x.str.upper())
            continue        
        if mod_name == 'str_lower':
            car.loc[:,[fld]] = car.loc[:,[fld]].apply(lambda x: x.str.lower())
            continue
            
        if mod_name == 'list_datetime_yyyy45mm45ddTHH58mm58ss45000':
            #skip car.loc[:,cols] = car.loc[:,cols].applymap(lambda x: [] if ((not isinstance(x, list)) and pd.isna(x)) else x)
            continue
        
        if mod_name == 'false2null':
            car.loc[:,[fld]] = car.loc[:,[fld]].replace({False: np.NaN})
            continue
        if mod_name == 'zero2null':
            car.loc[:,[fld]] = car.loc[:,[fld]].replace({0: np.NaN})
            continue
        if mod_name == 'nan2null':
            nan_vals = ['N.v.t.', 'nvt', 'n.v.t', 'onbekend', '', 'nan', 'Niet geregistreerd', 'Geen verstrekking in Open Data']
            for v in nan_vals:
                car.loc[:,[fld]] = car.loc[:,[fld]].replace({v: np.NaN})
            continue
            
        # EVAL function
        if callable(eval(mod_name)):
            car.loc[:,[fld]] = car.loc[:,[fld]].apply(lambda x: eval(f'{mod_name}(x, {mod_info.nullable})'))
        else:
            NotImplementedError
            
    # Sanity check when date field
    if any([mod_name.startswith('date_') |  mod_name.startswith('datetime_') for mod_name in mod_info[mod_info.index.str.startswith('modifier_name')].dropna()]):
        assert car.loc[:,fld].map(lambda x: isinstance(x,(pd.Timestamp, type(pd.NaT)))).all()
    
    # Done
    if '_X' not in idx:
        mod_info[mod_info.index.str.startswith('modifier_name')] = np.NaN 
    
if 2 > VERBOSE > 0:
    display_id = _prog(display_id, i, car.shape[1], f'done')

|||||||||||||||||||||||||||||||||||||||100% [done]

  car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda x:float(np.NaN) if (not x.isnumeric() if isinstance(x,str) else False) else x).astype('Float64')
[<NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>,
 ...
 <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>]
Length: 11830, dtype: Float64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda x:float(np.NaN) if (not x.isnumeric() if isinstance(x,str) else False) else x).astype('Float64')
  car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda x:float(np.NaN) if (not x.isnumeric() if isinstance(x,str) else False) else x).astype('Float64')
[<NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>,
 ...
 <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>]
Length: 11830, dtype: Float64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  car.loc[:,[fld]] = car.loc[:,[fld]].applymap(lambda

## Odometer
convert miles to km and make all numerical

In [16]:
# update KM if MLS is know
car.drz__OdoKM_num = car.drz__OdoKM_num.astype('Float64').combine_first(car.drz__OdoMLS_num_KM)

In [17]:
# plot
if VERBOSE > 0:
    # print new column and source info
    display(car.loc[:,['drz__OdoKM_num', 'drz__OdoMLS_num']]\
            .notna()\
            .sum(axis=1)\
            .value_counts()\
            .rename(index={1: 'km', 2: 'miles', 0:'no odo reading'})\
            .to_frame(name='nr_of_cars')
           )

    # plot odometer
    fig,ax=plt.subplots(figsize=[16,8])
    ax.set_xlabel('lot')
    ax.set_ylabel('odometer (km)')

    car.loc[:,["drz__OdoKM_num"]].astype('Float64').plot(marker=',',linestyle='',alpha=1,ax=ax, mec='None', mfc='lightgray')
    car.loc[:,["drz__OdoMLS_num_KM"]].astype('Float64').plot(marker='+',linestyle='',alpha=1,ax=ax, color='k')


Unnamed: 0,nr_of_cars
km,11218
no odo reading,353
miles,259


### Concatenate columns with index numbers

In [18]:
# Get info from dataframe columns
new_names = np.array([i[1] + '_concat' for i in map(_split_indexnr, car.columns) if i is not None])
old_names = np.array([i[0] for i in map(_split_indexnr, car.columns) if i is not None])
counter = np.array([i[2] for i in map(_split_indexnr, car.columns) if i is not None])

if 2 > VERBOSE > 0:
    # initiate progress bar
    display_id = None
for prg, new_name in enumerate(set(new_names)):
    
    # select columns in df
    sel = new_names == new_name
    
    # make dict from columns
    new_dicts = car[old_names[sel]].apply(lambda row: {
        k[1:]: v # {'1_1_1': 'value'}, "[1:]" to trim off leading "_" 
        for k,v in zip(counter[sel], row) 
        if ~((isinstance(v,float)) and (np.isnan(v)))
    }, axis=1)
    
    # add series to new column 
    car[new_name] = new_dicts
    
    # Remove old columns
    car.drop(columns=old_names[sel], inplace=True)
    
    # progress
    if 2 > VERBOSE > 0:
        display_id = _prog(display_id, prg, len(set(new_names)), new_name)
    elif VERBOSE > 1:
        print(f'{counter[sel][0]:7s} .. {counter[sel][-1]:7s} ({sum(sel):3.0f}) -> {new_name:s}')
        
if 2 > VERBOSE > 0:
    display_id = _prog(display_id, prg, len(set(new_names)), f'{len(old_names)} columns merged to {prg} new ones')

|||||||||||||||||||||||||||||||||||||||100% [3413 columns merged to 256 new ones]

  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = new_dicts
  car[new_name] = ne

In [19]:
if VERBOSE > 1:
    # Unknown registrations
    # Might be able to fix it by looking at the raw text.
    ixs = car.loc[car.Reg.str.lower() == 'onbekend'].index
    display(car.loc[ixs,['Reg','ForeignReg','Raw_text']])
    for ix in ixs:
        rt = car.loc[ix,'Raw_text']
        if type(rt) == list:
            txt = '</br>&nbsp;&nbsp;&nbsp;&nbsp;'.join(rt)
        else:
            txt = ',</br>&nbsp;&nbsp;&nbsp;&nbsp;'.join(rt.split(','))
        
        txt = txt.replace('kenteken', '<B><font color="red">kenteken</font></B>')
        display({'text/html': f'<b>{ix}</b></br>&nbsp;&nbsp;&nbsp;&nbsp;{txt}'},
                raw=True, metadata={'tags': (TAG_SINGLE, )})

if VERBOSE > 1:
    # FUTURE: Do something with foreign registrations
    # Parsing did not always get it right.
    display({'text/html':
        '<b>Foreign registrations:</b></br>&nbsp;&nbsp;&nbsp;&nbsp;' +  
        '</br>&nbsp;&nbsp;&nbsp;&nbsp;'.join(list(car.ForeignReg.dropna().unique())) +
        '</br>'
    }, raw=True)#
    # pd.DataFrame(car.ForeignReg.value_counts())

if VERBOSE > 1:
    # Steering wheel in center? "M"?
    display(
        car.loc[
            ~car.rdw_basisgegevens_kant_van_het_stuur.isin(['R', 'L']), 
            ['rdw_basisgegevens_kant_van_het_stuur', 'Raw_text']].dropna()
    )

    # De zijde van het voertuig waar het stuurwiel is gemonteerd.
    # Waarden 
    # L    Links
    # R    Rechts
    # M    Midden
    # src: https://www.rdw.nl/-/media/rdw/rdw/pdf/sitecollectiondocuments/over-rdw/naslagwerk/beschrijving-dataset-typegoedkeuring-v10.pdf
    # car.rdw_basisgegevens_kant_van_het_stuur.value_counts()



## Other repairs

In [20]:
# reparse raw text NAP
# Wrong ones have 'isch'

# re pattern
patt = '^Km-stand volgens nap (?P<val>(logisch)|(onlogisch))$'
# loop over wrong ones
for ix in car[car.drz__NAP == 'isch'].index:
    # line by line
    for line in car.loc[ix,'drz__Raw_text']:
        M = re.match(patt,line)
        if M:
            car.loc[ix,"drz__NAP"] = M.group('val')
            
            
# Year of manufacturing unknown
# Mfyear: "onbekend"
car.drz__Mfyear.replace('onbekend','', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car.drz__Mfyear.replace('onbekend','', inplace=True)


- - - - 
## Make data type consistent

show data types per column

In [21]:
if VERBOSE > 0:
    
    nan_types = [
        "<class 'pandas._libs.missing.NAType'>", 
        "<class 'pandas._libs.tslibs.nattype.NaTType'>",
        "<class 'NoneType'>",
    ]
    
    # print type per column and example values
    df_ = pd.DataFrame(columns = ['Data Type',  'Example values'], index = [car.columns.values])
    df_ = pd.concat([df_], keys=[0]).swaplevel()
    df_.index.set_names(['Column', 'Type counter'], inplace=True)
    for c in car.columns:
        cnt=-1
        for t in car[c].apply(type).unique():
            if str(t) in nan_types:
                continue
            elif (str(t) == "<class 'float'>") and (all(car[c][car[c].apply(type) == t].isna())):
                continue
            cnt+=1

            # column name, data type
            df_.loc[(c, cnt), 'Data Type'] = str(t)
            # skip large 
            if c in ['SupInfo','Raw_text','rdwinfo','Images']:
                v = '..skip..'
                df_.loc[(c, cnt), 'Example values'] = v
                continue
            if list in car[c].apply(type).unique():
                v = 'max nr of items: ' +\
                str(car[c].apply(lambda s:len(s) if list==type(s) else 0).max())
                df_.loc[(c, cnt), 'Example values'] = v
                continue
            elif dict in car[c].apply(type).unique():
                if t == dict:
                    v = 'max nr of keys: ' +\
                    str(car[c].apply(lambda s: len(s)).max())
                    df_.loc[(c, cnt), 'Example values'] = v
                    continue

            # values
            v = car[c].unique()
            if len(v) < 10:
                # print all
                df_.loc[(c, cnt), 'Example values'] = ', '.join([f'{vv}' for vv in v])
            else:
                # print first and last
                df_.loc[(c, cnt), 'Example values'] = '{} .. {}'.format(v[0],v[-1])

if VERBOSE > 1:
    with pd.option_context('display.max_rows', 999):
        display(df_)

if VERBOSE > 0:
    if df_.reset_index().loc[:,'Type counter'].nunique() > 1:
        print('These columns contain more than one type')

        with pd.option_context("display.max_rows", 999):
            display(
                df_.reset_index()\
                .pivot(columns='Type counter', index='Column', values='Data Type')\
                .dropna(subset=[1])\
                .sort_values(by=[0, 1])\
                .fillna('')
        )

if VERBOSE > 1:
    gb=df_.groupby('Data Type')
    for g in gb.groups:
        with pd.option_context("display.max_rows", 999):
            display(gb.get_group(g).sort_index())


These columns contain more than one type


Type counter,0,1,2
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
drz__OdoKM,<class 'float'>,<class 'str'>,<class 'int'>
drz__Raw_text,<class 'list'>,<class 'str'>,
drz__Title,<class 'lxml.etree._ElementUnicodeResult'>,<class 'str'>,
rdw_ovi_tijd_aanvang_tenaamstelling,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,
drz__LotNr,<class 'str'>,<class 'int'>,
rdw_ovi_bijzonderheid_tekst,<class 'str'>,<class 'lxml.etree._ElementUnicodeResult'>,
rdw_ovi_car_vtg_num_eu_omschrijving,<class 'str'>,<class 'lxml.etree._ElementUnicodeResult'>,
rdw_ovi_eigenaren,<class 'str'>,<class 'lxml.etree._ElementUnicodeResult'>,
rdw_ovi_emissieklasse_diesel,<class 'str'>,<class 'lxml.etree._ElementUnicodeResult'>,
rdw_ovi_inrichting_code_omschrijving,<class 'str'>,<class 'lxml.etree._ElementUnicodeResult'>,


## Dummies to category (reverse one-hot-encode)

Categorize info from auction by converting from dummies to categories. Convert different boolean fields into one field with string.

In [22]:
# fuel
fuels = ['LPG','Benzine','Diesel','Hybrid']
# new field
car['drz__fuel'] = (car.loc[:,['drz__'+f.lower() for f in fuels]] * fuels).fillna('').apply(lambda s: '/'.join([ss for ss in s if len(ss) > 0]), axis=1)

  car['drz__fuel'] = (car.loc[:,['drz__'+f.lower() for f in fuels]] * fuels).fillna('').apply(lambda s: '/'.join([ss for ss in s if len(ss) > 0]), axis=1)


## Date and time operations

Age of car, APK etc.

In [23]:
# Date of auction based on index name
car['drz__auctiondate'] = [pd.to_datetime(re.search('([0-9]{4}-[0-9]+)-.*',i)[1],format='%Y-%m') for i in car.index.values]

# Choose MF year if full date not available
car['drz__MF'] = car.drz__Mfdate.combine_first(car.drz__Mfyear.apply(lambda t: pd.to_datetime(t,format='%Y')))


  car['drz__auctiondate'] = [pd.to_datetime(re.search('([0-9]{4}-[0-9]+)-.*',i)[1],format='%Y-%m') for i in car.index.values]
  car['drz__MF'] = car.drz__Mfdate.combine_first(car.drz__Mfyear.apply(lambda t: pd.to_datetime(t,format='%Y')))



- - - -
# Add rdw info into dataframe

### Age of query

Older queries might not have accurate information that was current at auction (e.g. inspection date)

In [24]:
# Collect all rdw timestamps, and get youngest
car['rdw__ts'] = car.loc[:, 
                         car.columns.str.contains('TimeStamp') & 
                         car.columns.str.contains('rdw')
                        ].apply(
    lambda x: max(x.dropna()) if any(x.notna()) else x[0]
    , axis=1)
car.rdw__ts = car.rdw__ts.apply(lambda d: pd.to_datetime(d) if isinstance(d, str) else d)
car.rdw__ts.fillna(pd.NaT, inplace=True)
car.rdw__ts.apply(lambda x: str(type(x))).value_counts(dropna=False)

  lambda x: max(x.dropna()) if any(x.notna()) else x[0]
  car['rdw__ts'] = car.loc[:,
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car.rdw__ts.fillna(pd.NaT, inplace=True)


rdw__ts
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    10371
<class 'pandas._libs.tslibs.nattype.NaTType'>          1459
Name: count, dtype: int64

In [25]:
if VERBOSE > 0:
    # plot query age
    age = (car.drz__auctiondate - car.rdw__ts.apply(lambda x: x.tz_localize(None))).apply(lambda x: x.days).astype('Int32')
    age.plot(marker=',', figsize=[16,2], linestyle='')
    plt.xlabel('lot')
    plt.ylabel('age (days)')
    plt.title('Age of rdw query since auction')


In [26]:
# Collect all NHTSA timestamps, and get youngest
car['nhtsa__ts'] = car.loc[:, 
                           car.columns.str.contains('TimeStamp') & 
                           car.columns.str.contains('nhtsa')
                          ].apply(
    lambda x: max(x.dropna()) if any(x.notna()) else x[0]
    , axis=1).dropna()
car.nhtsa__ts = car.nhtsa__ts.apply(lambda d: pd.to_datetime(d) if isinstance(d, str) else d)
car.nhtsa__ts.apply(lambda x: str(type(x))).value_counts(dropna=False)

  lambda x: max(x.dropna()) if any(x.notna()) else x[0]
  car['nhtsa__ts'] = car.loc[:,


nhtsa__ts
<class 'pandas._libs.tslibs.nattype.NaTType'>         11634
<class 'pandas._libs.tslibs.timestamps.Timestamp'>      196
Name: count, dtype: int64

In [27]:
if VERBOSE > 0:
    # plot query age
    age = (car.drz__auctiondate - car.nhtsa__ts.apply(lambda x: x.tz_localize(None))).apply(lambda x: x.days).astype('Int32')
    age.plot(marker='s', figsize=[16,2], linestyle='')
    plt.xlabel('lot')
    plt.ylabel('age (days)')
    plt.title('Age of NHTSA query since auction')


In [28]:
if VERBOSE > 0:
    df_ = pd.DataFrame(index=car.index)
    plot_cols = ['drz__', 'rdw_ovi_', 'rdw_gekentekende_voertuigen_', 'nhtsa_vpic_']
    for pfx, cols in data_types.groupby('prefix'):
        if pfx not in plot_cols:
            continue
        cols = car.columns.str.startswith(pfx)
        df_[pfx] = car.loc[:, cols].notna().sum(axis=1)

    f, (ax1, ax2) = plt.subplots(1, 2, figsize=[16,8])
    df_[plot_cols].plot(marker='x', linestyle='', secondary_y = False, ax = ax1)
    (df_.fillna(0)/df_.max(axis=0))[plot_cols].plot(marker='+', linestyle='', secondary_y = True, ax = ax2)

In [29]:
class Handle_concat:
    def add_sep(d, sep='/'):
        list_of_str = [v if v is not None else '' for v in d.values()]
        list_of_str = [v for v in list_of_str if len(v)>0]
        if len(list_of_str) == 0:
            return ''
        elif len(list_of_str) == 1:
            return list_of_str[0]
        return sep.join(list_of_str)

    def avg(d):
        list_of_num = [v if (v is not None) and (pd.notna(v)) else np.NaN for v in d.values()]
        if len(list_of_num) == 0:
            return np.NaN
        if all(np.isnan(list_of_num)):
            return np.NaN
        return np.nanmean(list_of_num)

    def max_num(d):
        list_of_num = [v if (v is not None) and (pd.notna(v)) else np.NaN for v in d.values()]
        if len(list_of_num) == 0:
            return np.NaN
        if all(np.isnan(list_of_num)):
            return np.NaN
        return np.nanmax(list_of_num)

    def max_str(d):
        list_of_str = [v if v is not None else '' for v in d.values()]
        if len(list_of_str) == 0:
            return ''
        return max(list_of_str)

    def first_num(d):
        list_of_num = [v if (v is not None) and (pd.notna(v)) else np.NaN for v in d.values()]
        if len(list_of_num) == 0:
            return None
        return list_of_num[0]
    def first_str(d):
        list_of_str = [v if v is not None else '' for v in d.values()]
        if len(list_of_str) == 0:
            return ''
        return list_of_str[0]

# Handle_concat.add_sep({'1': '', '2': None})
# Handle_concat.add_sep({'1': '', '2': None})
# Handle_concat.add_sep({'1': '', '2': 'foo'})
# Handle_concat.add_sep({'1': 'foo', '2': ''})
# Handle_concat.add_sep({'1': 'foo', '2': 'bar'})
# Handle_concat.add_sep({'1': 'foo', '2': 'bar', '3': 'baz'}, sep=' - ')

# Handle_concat._avg({'1': 120.0, '2': 100.0})
# Handle_concat._avg({'1': 120.0})
# Handle_concat._avg({'1': 120.0, '2': None})
# Handle_concat._avg({'1': None, '2': None})


In [30]:
# aggregate rdw fields with index numbers stored in dicts
#   this can be mean, string join, first or whatever

car['rdw_brandstof_nettomaximumvermogen'] = car.rdw_brandstof_nettomaximumvermogen_concat.apply(Handle_concat.avg)
car['rdw_brandstof_brandstofverbruik_gecombineerd'] = car.rdw_brandstof_brandstofverbruik_gecombineerd_concat.apply(Handle_concat.avg)
car['rdw_brandstof_brandstof_omschrijving'] = car.rdw_brandstof_brandstof_omschrijving_concat.apply(Handle_concat.add_sep)
car['rdw_carrosserie_type_carrosserie_europese_omschrijving'] = car.rdw_carrosserie_type_carrosserie_europese_omschrijving_concat.apply(Handle_concat.first_str)
car['rdw_carrosserie_carrosserietype'] = car.rdw_carrosserie_carrosserietype_concat.apply(Handle_concat.first_str)
car['rdw_motor_uitvoering_aantal_cilinders'] = car.rdw_motor_uitvoering_aantal_cilinders_concat.apply(Handle_concat.first_str)
car['rdw_motor_uitvoering_cilinderinhoud_cm3'] = car.rdw_motor_uitvoering_cilinderinhoud_cm3_concat.apply(Handle_concat.first_num)
car['rdw_versnellingsbak_uitvoering_type_versnellingsbak'] = car.rdw_versnellingsbak_uitvoering_type_versnellingsbak_concat.apply(Handle_concat.first_str)
car['rdw_uitvoering_gebruiksgegevens_per_uitgave_verbruikcategorie_uitvoering'] = car.rdw_uitvoering_gebruiksgegevens_per_uitgave_verbruikcategorie_uitvoering_concat.apply(Handle_concat.max_str)
car['rdw_motor_uitvoering_hybride_elektrisch_voertuig'] = car.rdw_motor_uitvoering_hybride_elektrisch_voertuig_concat.apply(Handle_concat.max_num)
car['rdw_brandstof_klasse_hybride_elektrisch_voertuig'] = car.rdw_brandstof_klasse_hybride_elektrisch_voertuig_concat.apply(Handle_concat.first_str)
car['rdw_versnellingsbak_uitvoering_aantal_versnellingen_bovengrens'] = car.rdw_versnellingsbak_uitvoering_aantal_versnellingen_bovengrens_concat.apply(Handle_concat.max_num)
car['rdw_versnellingsbak_uitvoering_aantal_versnellingen_ondergrens'] = car.rdw_versnellingsbak_uitvoering_aantal_versnellingen_ondergrens_concat.apply(Handle_concat.max_num)
car['rdw_motor_uitvoering_brandstof_netto_max_vermogen_bovengrens'] = car.rdw_motor_uitvoering_brandstof_netto_max_vermogen_bovengrens_concat.apply(Handle_concat.max_num)
car['rdw_motor_uitvoering_brandstof_netto_max_vermogen_ondergrens'] = car.rdw_motor_uitvoering_brandstof_netto_max_vermogen_ondergrens_concat.apply(Handle_concat.max_num)
if 'rdw_brandstof_opgegeven_maximum_snelheid_concat' in car.columns:
    car['rdw_brandstof_opgegeven_maximum_snelheid'] = car.rdw_brandstof_opgegeven_maximum_snelheid_concat.apply(Handle_concat.max_num)

car.rdw_motor_uitvoering_hybride_elektrisch_voertuig = car.rdw_motor_uitvoering_hybride_elektrisch_voertuig.replace({1: True, 0: False}).astype('boolean')

# Add LPG specification to fuel
is_lpg = car.rdw_brandstof_brandstof_omschrijving.str.lower().str.contains('lpg') | car.rdw_brandstof_brandstof_omschrijving.str.lower().str.contains('cng')
car.loc[is_lpg, 'rdw_brandstof_brandstof_omschrijving'] = car.loc[is_lpg, ['rdw_brandstof_brandstof_omschrijving', 'rdw_gekentekende_voertuigen_type_gasinstallatie']].apply('/'.join, axis='columns')

if VERBOSE > 1:
    display(car.loc[car['rdw_brandstof_brandstof_omschrijving'].drop_duplicates().index, [
        'rdw_brandstof_brandstof_omschrijving', 'rdw_brandstof_brandstof_omschrijving_concat', 'rdw_gekentekende_voertuigen_type_gasinstallatie'
    ]])
    

  car['rdw_brandstof_nettomaximumvermogen'] = car.rdw_brandstof_nettomaximumvermogen_concat.apply(Handle_concat.avg)
  car['rdw_brandstof_brandstofverbruik_gecombineerd'] = car.rdw_brandstof_brandstofverbruik_gecombineerd_concat.apply(Handle_concat.avg)
  car['rdw_brandstof_brandstof_omschrijving'] = car.rdw_brandstof_brandstof_omschrijving_concat.apply(Handle_concat.add_sep)
  car['rdw_carrosserie_type_carrosserie_europese_omschrijving'] = car.rdw_carrosserie_type_carrosserie_europese_omschrijving_concat.apply(Handle_concat.first_str)
  car['rdw_carrosserie_carrosserietype'] = car.rdw_carrosserie_carrosserietype_concat.apply(Handle_concat.first_str)
  car['rdw_motor_uitvoering_aantal_cilinders'] = car.rdw_motor_uitvoering_aantal_cilinders_concat.apply(Handle_concat.first_str)
  car['rdw_motor_uitvoering_cilinderinhoud_cm3'] = car.rdw_motor_uitvoering_cilinderinhoud_cm3_concat.apply(Handle_concat.first_num)
  car['rdw_versnellingsbak_uitvoering_type_versnellingsbak'] = car.rdw_versnell

## preprocessing of rdw info

In [31]:
car['rdw__rhd'] = car.rdw_basisgegevens_eeg_uitvoering_kant_van_het_stuur.astype('O').replace({'L': False, 'R': True})
car['rdw__automatic'] = car.rdw_versnellingsbak_uitvoering_type_versnellingsbak.replace({'A': True, 'H': False, 'C': True, 'G': True, 'F': True, 'M': False})
# available options: HACGFWOM
# M: landscaper
# O: postnl
# W: scooter
# F: electric MB

# As of 2024-03-23 options are with following counts
# src: https://opendata.rdw.nl/Typegoedkeuring/Open-Data-RDW-Versnellingsbak-Uitvoering/r7cw-67gs/explore/query/SELECT%20%60type_versnellingsbak%60%2C%20count%28%60volgnummer%60%29%20AS%20%60count_volgnummer%60%0AGROUP%20BY%20%60type_versnellingsbak%60%0AORDER%20BY%20%60count_volgnummer%60%20DESC%20NULL%20LAST/page/aggregate
# M: 3,240,497 (replaces H?)
# A: 2,360,077
# C: 49,158
# G: 36,137
# <empty>: 10,463
# F: 3,523
# W: 2,146
# O: 802


  car['rdw__rhd'] = car.rdw_basisgegevens_eeg_uitvoering_kant_van_het_stuur.astype('O').replace({'L': False, 'R': True})
  car['rdw__automatic'] = car.rdw_versnellingsbak_uitvoering_type_versnellingsbak.replace({'A': True, 'H': False, 'C': True, 'G': True, 'F': True, 'M': False})


### Use auction info or RDW info

In [32]:
# *: extra column, but not used
fldpairs = [
    ['rdw_gekentekende_voertuigen_taxi_indicator',
     'drz__taxi', 
     'taxi'],
    ['rdw_gekentekende_voertuigen_datum_eerste_toelating_dt', 
     'rdw_gekentekende_voertuigen_datum_eerste_toelating', 
     'rdw_ovi_eerste_toelatingsdatum', 
     '*drz__Mfdate', 
     '*drz__Mfyear', 
     '*nhtsa_vpic_MFY', 
     'MF'],
    ['rdw_gekentekende_voertuigen_datum_eerste_tenaamstelling_in_nederland_dt', 
     'rdw_gekentekende_voertuigen_datum_eerste_tenaamstelling_in_nederland', 
     'rdw_ovi_eerste_afgifte_nederland',
     'regnl'],
    ['rdw_gekentekende_voertuigen_vervaldatum_apk_dt',
     'rdw_gekentekende_voertuigen_vervaldatum_apk',
     'rdw_ovi_vervaldatum_apk_keuring',
     'drz__APKdate',
     'apk'],
    ['rdw_carrosserie_type_carrosserie_europese_omschrijving',
     'rdw_ovi_carrosserie_omschrijving',
     '*rdw_basisgegevens_eeg_uitvoering_eur_codering_carrosserietype',
     '*rdw_carrosserie_carrosserietype',
     '*rdw_ovi_carrosserie_carrosserietype',
     '*rdw_ovi_inrichting_code_omschrijving',
     '*nhtsa_vpic_exterior_body__bodyclass',
     'bodytype'],
    ['rdw_brandstof_brandstof_omschrijving',
     'drz__fuel',
     '*nhtsa_vpic_engine___fueltypeprimary',
     '*nhtsa_vpic_engine___fueltypesecondary',
     'fuel'],
    ['rdw_gekentekende_voertuigen_merk',
     'rdw_ovi_merk',
     'drz__ItemBrand',
     'nhtsa_vpic_general___make',
     'brand'],
    ['rdw_gekentekende_voertuigen_handelsbenaming',
     'rdw_ovi_handelsbenaming',
     'drz__model',
     'nhtsa_vpic_general___model',
     '*brand',
     '*drz__ItemType',
     '*rdw_basisgegevens_eeg_uitvoering_handelsbenaming',
     'model'],
    ['rdw_basisgegevens_eeg_uitvoering_handelsbenaming',
     'drz__modelspec',
     '*brand',
     '*drz__ItemType',
     'modelspec'],
    ['rdw_gekentekende_voertuigen_aantal_deuren',
     'rdw_ovi_aantal_deuren',
     'rdw_basisgegevens_eeg_uitvoering_aantal_deuren_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_aantal_deuren_ondergrens',
     'nhtsa_vpic_exterior_body__doors',
     'nDoor'],
    ['rdw_gekentekende_voertuigen_aantal_zitplaatsen',
     'rdw_ovi_aantal_zitplaatsen',
     'rdw_basisgegevens_eeg_uitvoering_aantal_zitplaatsen_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_aantal_zitplaatsen_ondergrens',
     'nhtsa_vpic_interior_seat__seats',
     'nSeat'],
    ['rdw_gekentekende_voertuigen_aantal_cilinders',
     'rdw_ovi_aantal_cilinders',
     'rdw_motor_uitvoering_aantal_cilinders',
     'nhtsa_vpic_engine___enginecylinders',
     'nCyl'],
    ['rdw_gekentekende_voertuigen_cilinderinhoud',
     'rdw_ovi_cilinder_inhoud',
     'rdw_motor_uitvoering_cilinderinhoud_cm3',
     'nhtsa_vpic_engine___displacementcc',
     '*nhtsa_vpic_engine___displacementci',
     '*nhtsa_vpic_engine___displacementl',
     'cylvol'],
    ['rdw__rhd',
     'drz__rhd',
     '*nhtsa_vpic_interior___steeringlocation',
     'rhd'],
    ['rdw_gekentekende_voertuigen_wielbasis',
     'rdw_basisgegevens_eeg_uitvoering_wielbasis_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_wielbasis_ondergrens',
     'nhtsa_vpic_exterior_dimension__wheelbaselong',
     'nhtsa_vpic_exterior_dimension__wheelbaseshort',
     'wheelbase'],
    ['rdw_gekentekende_voertuigen_breedte',
     'rdw_ovi_breedte',
     'rdw_basisgegevens_eeg_uitvoering_breedte_voertuig_uitvoering_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_breedte_voertuig_uitvoering_ondergrens',
     'rdw_ovi_breedte_min_max',
     'width'],
    ['rdw_gekentekende_voertuigen_lengte',
     'rdw_ovi_lengte',
     'rdw_basisgegevens_eeg_uitvoering_lengte_voertuig_uitvoering_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_lengte_voertuig_uitvoering_ondergrens',
     'rdw_ovi_lengte_min_max',
     'length'],
    ['rdw_gekentekende_voertuigen_massa_ledig_voertuig',
     'rdw_ovi_massa_ledig_voertuig',
     'rdw_basisgegevens_eeg_uitvoering_massa_leeg_voertuig_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_massa_leeg_voertuig_ondergrens',
     'rdw_gekentekende_voertuigen_massa_rijklaar',
     'rdw_ovi_massa_bedrijfsklaar',
     'rdw_ovi_massa_rijklaar_min_max',
     'rdw_basisgegevens_eeg_uitvoering_min_massa_voertuig',
     'rdw_basisgegevens_eeg_uitvoering_massa_bedrijfsklaar_toestand_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_massa_bedrijfsklaar_toestand_ondergrens',
     'rdw_basisgegevens_eeg_uitvoering_max_massa_voertuig_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_max_massa_voertuig_ondergrens',
     'rdw_basisgegevens_eeg_uitvoering_max_massa_vrtg_techn_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_max_massa_vrtg_techn_ondergrens',
     'rdw_gekentekende_voertuigen_toegestane_maximum_massa_voertuig',
     'rdw_gekentekende_voertuigen_technische_max_massa_voertuig',
     'rdw_basisgegevens_eeg_uitvoering_max_massa_voertuig_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_max_massa_voertuig_ondergrens',
     'rdw_ovi_technische_maximum_massa_voertuig',
     'rdw_ovi_maximum_massa_voertuig', 
     'rdw_ovi_technische_maximum_massa_voertuig_min_max',
     '*nhtsa_vpic_exterior_dimension__gcwr',
     '*nhtsa_vpic_exterior_dimension__gcwr_to',
     '*nhtsa_vpic_exterior_dimension__gvwr',
     '*nhtsa_vpic_exterior_dimension__gvwr_to',
     'weight'],
    ['rdw__automatic',
     'drz__automatic',
     '*nhtsa_vpic_mechanical_transmission__transmissionstyle',
     'automatic'],
    ['rdw_versnellingsbak_uitvoering_aantal_versnellingen_bovengrens',
     'rdw_versnellingsbak_uitvoering_aantal_versnellingen_ondergrens',
     'nhtsa_vpic_mechanical_transmission__transmissionspeeds',
     'nGear'],
    ['rdw_motor_uitvoering_hybride_elektrisch_voertuig',
     'drz__hybrid',
     '*nhtsa_vpic_engine___electrificationlevel',
     '*rdw_brandstof_klasse_hybride_elektrisch_voertuig',
     'hybrid'],
    ['rdw_gekentekende_voertuigen_bruto_bpm',
     'rdw_ovi_bpm_bedrag',
     'bpm'],
    ['rdw_gekentekende_voertuigen_catalogusprijs',
     'rdw_ovi_catalogus_prijs',
     '*nhtsa_vpic_general___baseprice',
     'newprice'],
    ['rdw_brandstof_nettomaximumvermogen',
     'rdw_motor_uitvoering_brandstof_netto_max_vermogen_bovengrens',
     'rdw_motor_uitvoering_brandstof_netto_max_vermogen_ondergrens',
     'rdw_ovi_vermogen_q',
     'nhtsa_vpic_engine___enginehp',
     'nhtsa_vpic_engine___enginehp_to',
     'power'],
    ['rdw_gekentekende_voertuigen_maximale_constructiesnelheid',
     'rdw_ovi_maximum_constructie_snelheid',
     'rdw_ovi_opgegeven_max_snelheid',
     'rdw_basisgegevens_eeg_uitvoering_max_constructie_snelheid_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_max_constructie_snelheid_ondergrens',
     '*nhtsa_vpic_engine___topspeedmph',
     'maxspeed'],
    ['rdw_gekentekende_voertuigen_hoogte_voertuig',
     'rdw_basisgegevens_eeg_uitvoering_hoogte_voertuig_uitvoering_bovengrens',
     'rdw_basisgegevens_eeg_uitvoering_hoogte_voertuig_uitvoering_ondergrens',
     'height'],
    ['rdw_ovi_eigenaren_private',
     '*rdw_ovi_eigenaren',
     'private_owners'],
    ['rdw_ovi_eigenaren_company',
     '*rdw_ovi_eigenaren',
     'company_owners'],
    ['rdw_uitvoering_gebruiksgegevens_per_uitgave_verbruikcategorie_uitvoering',
     'rdw_gekentekende_voertuigen_zuinigheidsclassificatie',
     'energylab'],
    ['rdw_ovi_wachten_op_keuring',
     'rdw_gekentekende_voertuigen_wacht_op_keuren',
     'drz__wok',
     'under_survey'],
    ['rdw_gekentekende_voertuigen_eerste_kleur',
     '*rdw_gekentekende_voertuigen_tweede_kleur',
     'rdw_ovi_kleur',
     'color'],
]


for all_flds in fldpairs:
    
    show_flds = [f[1:] for f in all_flds if f.startswith('*')]
    flds = [f for f in all_flds if not f.startswith('*')]

    # add result field if not exist
    if OPBOD:
        not_exist = [f for f in flds if f not in car.columns]
        df_ = car.loc[:, np.array(flds)[~np.in1d(flds, not_exist)]]
        df_[not_exist] = np.NaN

    else:
        if flds[-1] not in car.columns:
            df_ = car.loc[:, flds[:-1]]
            df_[flds[-1]] = np.NaN
        else:
            df_ = car.loc[:, flds]
    df_.replace({'': np.NaN}, inplace=True)

    # branch off df for display
    df_disp = df_.copy().astype('O')
    df_disp.fillna('.', inplace=True)

    # select rows that are not all (but one) NaN
    notallna = df_.iloc[:,:-1].notna().sum(axis=1) > 1
    
    # start filling values from left to right
    df_ = df_.bfill(axis=1)
    
    # Last value is result
    choice = df_.iloc[:,0]
    choice.name = f'>{flds[-1]}<'
    car[flds[-1]] = choice # <- UPDATE DATA
    
    # display differences
    
    if VERBOSE > 0:
        
        df_disp = pd.concat([
                car.loc[:, show_flds].add_prefix('*'),
                df_disp], axis=1)
        
        # accented are same
        if flds[-1] == 'brand':
            df_.replace({'CITROËN': 'CITROEN'}, inplace=True)
            
            
        df_disp = pd.concat([df_disp, choice], axis=1)

        nuq = df_.iloc[:,:-1].nunique(axis=1, dropna=True)
        isdiff = (nuq > 1) & notallna


        if all(~isdiff):
            display({'text/html': 
                     f'<b>{flds[-1]}</b>: same in all auctions <b>{", ".join(flds[:-1])}</b>'}, raw=True)
        else:
            if VERBOSE > 1:
                display(df_disp[isdiff])
            else:
                # only current auction
                islast = df_disp.index.str.startswith('-'.join(df_disp.index[-1].split('-')[:-1]))
                if all(~(isdiff & islast)):
                    display({'text/html': 
                             f'<b>{flds[-1]}</b>: same in last auctions <b>{", ".join(flds[:-1])}</b>'}, raw=True)
                else:
                    display(df_disp[isdiff & islast])


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_.replace({'': np.NaN}, inplace=True)
  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_.replace({'': np.NaN}, inplace=True)
  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_.replace({'': np.NaN}, inplace=True)
  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_gekentekende_voertuigen_vervaldatum_apk_dt,rdw_gekentekende_voertuigen_vervaldatum_apk,rdw_ovi_vervaldatum_apk_keuring,drz__APKdate,apk,>apk<
2024-05-700608,2025-05-28 00:00:00,2025-05-28 00:00:00,2025-05-28 00:00:00,2024-05-28 00:00:00,.,2025-05-28 00:00:00
2024-05-701808,2025-06-29 00:00:00,2025-06-29 00:00:00,2025-06-29 00:00:00,2024-06-29 00:00:00,.,2025-06-29 00:00:00
2024-05-702308,2024-04-25 00:00:00,2024-04-25 00:00:00,2025-05-07 00:00:00,.,.,2024-04-25 00:00:00
2024-05-706108,2026-05-29 00:00:00,2026-05-29 00:00:00,2026-05-29 00:00:00,2024-05-29 00:00:00,.,2026-05-29 00:00:00
2024-05-270009,2024-03-31 00:00:00,2024-03-31 00:00:00,2025-05-15 00:00:00,.,.,2024-03-31 00:00:00
2024-05-712909,2024-05-17 00:00:00,2024-05-17 00:00:00,2025-05-17 00:00:00,2024-05-17 00:00:00,.,2024-05-17 00:00:00


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,*rdw_basisgegevens_eeg_uitvoering_eur_codering_carrosserietype,*rdw_carrosserie_carrosserietype,*rdw_ovi_carrosserie_carrosserietype,*rdw_ovi_inrichting_code_omschrijving,*nhtsa_vpic_exterior_body__bodyclass,rdw_carrosserie_type_carrosserie_europese_omschrijving,rdw_ovi_carrosserie_omschrijving,bodytype,>bodytype<
2024-05-701108,,AF,AF,MPV,,Multipurpose vehicle (MPV),MPV,.,Multipurpose vehicle (MPV)
2024-05-703208,,AF,AF,MPV,,Multipurpose vehicle (MPV),MPV,.,Multipurpose vehicle (MPV)
2024-05-706408,AF,AF,AF,MPV,,Multipurpose vehicle (MPV),MPV,.,Multipurpose vehicle (MPV)
2024-05-702609,,AF,AF,MPV,,Multipurpose vehicle (MPV),MPV,.,Multipurpose vehicle (MPV)
2024-05-702809,AF,AF,AF,MPV,,Multipurpose vehicle (MPV),MPV,.,Multipurpose vehicle (MPV)
2024-05-705709,AF,AF,AF,MPV,,Multipurpose vehicle (MPV),MPV,.,Multipurpose vehicle (MPV)
2024-05-705909,AF,AF,AF,MPV,,Multipurpose vehicle (MPV),MPV,.,Multipurpose vehicle (MPV)
2024-05-707009,AF,AF,AF,MPV,,Multipurpose vehicle (MPV),MPV,.,Multipurpose vehicle (MPV)
2024-05-707209,,AF,AF,MPV,,Multipurpose vehicle (MPV),MPV,.,Multipurpose vehicle (MPV)
2024-05-707709,,AF,AF,MPV,,Multipurpose vehicle (MPV),MPV,.,Multipurpose vehicle (MPV)


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,*nhtsa_vpic_engine___fueltypeprimary,*nhtsa_vpic_engine___fueltypesecondary,rdw_brandstof_brandstof_omschrijving,drz__fuel,fuel,>fuel<
2024-05-706408,,,Benzine/Elektriciteit,Hybrid,.,Benzine/Elektriciteit
2024-05-700009,Gasoline,,Benzine/LPG/G3 gasinstallatie,LPG,.,Benzine/LPG/G3 gasinstallatie
2024-05-712009,,,Diesel/Elektriciteit,Hybrid,.,Diesel/Elektriciteit
2024-05-713709,,,Benzine/Elektriciteit,Hybrid,.,Benzine/Elektriciteit
2024-05-714509,Gasoline,Electric,Benzine/Elektriciteit,Hybrid,.,Benzine/Elektriciteit
2024-05-704110,,,Benzine/Elektriciteit,Hybrid,.,Benzine/Elektriciteit
2024-05-704910,Gasoline,,Benzine/LPG/G3 gasinstallatie,Benzine,.,Benzine/LPG/G3 gasinstallatie


  car[flds[-1]] = choice # <- UPDATE DATA
  df_.replace({'CITROËN': 'CITROEN'}, inplace=True)


Unnamed: 0,rdw_gekentekende_voertuigen_merk,rdw_ovi_merk,drz__ItemBrand,nhtsa_vpic_general___make,brand,>brand<
2024-05-702008,RENAULT,RENAULT,RENAULT,EAGLE,.,RENAULT
2024-05-702509,RENAULT,RENAULT,RENAULT,EAGLE,.,RENAULT
2024-05-703809,.,.,RENAULT,EAGLE,.,RENAULT
2024-05-708209,RENAULT,RENAULT,RENAULT,EAGLE,.,RENAULT
2024-05-709309,RENAULT,RENAULT,RENAULT,EAGLE,.,RENAULT
2024-05-710109,.,.,RENAULT,EAGLE,.,RENAULT
2024-05-701110,RENAULT,RENAULT,RENAULT,EAGLE,.,RENAULT
2024-05-701910,RENAULT,RENAULT,RENAULT,EAGLE,.,RENAULT


  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,*brand,*drz__ItemType,*rdw_basisgegevens_eeg_uitvoering_handelsbenaming,rdw_gekentekende_voertuigen_handelsbenaming,rdw_ovi_handelsbenaming,drz__model,nhtsa_vpic_general___model,model,>model<
2024-05-704008,VOLKSWAGEN,golf 66 kw e2,,golf cabr. 66 kw e2,golf cabr. 66 kw e2,golf 66 kw e2,.,.,golf cabr. 66 kw e2
2024-05-704408,AUDI,tt 8n,,8n,8n,tt 8n,.,.,8n
2024-05-705808,VOLKSWAGEN,transporter,,transporter bestel tdi 96 kw,transporter bestel tdi 96 kw,transporter,.,.,transporter bestel tdi 96 kw
2024-05-700009,DODGE,Challenger r/t,,challenger r/t,challenger r/t,Challenger r/t,challenger,.,challenger r/t
2024-05-700209,FORD,Focus,,focus,focus,Focus,.,.,focus
2024-05-700409,VOLVO,v50; 2.4i 140 pk,2.4i 140 pk,v50,v50,v50,s40,.,v50
2024-05-700909,SAAB,9-3 sport sedan 2.0,9-3 sport sedan 2.0 t,9-3,9-3,9-3 sport sedan 2.0,9-3,.,9-3
2024-05-705709,SEAT,Altea,,altea,altea,Altea,.,.,altea
2024-05-705909,KIA,Sportage; 2.0 m/t,2.0 m/t,sportage,sportage,Sportage,.,.,sportage
2024-05-708409,CITROËN,c1,,citroen c1,citroen c1,c1,.,.,citroen c1


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_gekentekende_voertuigen_aantal_deuren,rdw_ovi_aantal_deuren,rdw_basisgegevens_eeg_uitvoering_aantal_deuren_bovengrens,rdw_basisgegevens_eeg_uitvoering_aantal_deuren_ondergrens,nhtsa_vpic_exterior_body__doors,nDoor,>nDoor<
2024-05-700508,.,.,4.0,3.0,.,.,4.0
2024-05-700409,4.0,.,4.0,4.0,5,.,4.0
2024-05-702709,.,.,5.0,3.0,.,.,5.0
2024-05-703209,.,.,5.0,3.0,.,.,5.0
2024-05-713009,5.0,.,5.0,5.0,4,.,5.0
2024-05-714509,4.0,.,4.0,4.0,5,.,4.0
2024-05-714609,5.0,.,5.0,3.0,.,.,5.0
2024-05-700310,4.0,.,4.0,4.0,5,.,4.0
2024-05-700810,4.0,.,5.0,5.0,.,.,4.0


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_gekentekende_voertuigen_aantal_zitplaatsen,rdw_ovi_aantal_zitplaatsen,rdw_basisgegevens_eeg_uitvoering_aantal_zitplaatsen_bovengrens,rdw_basisgegevens_eeg_uitvoering_aantal_zitplaatsen_ondergrens,nhtsa_vpic_interior_seat__seats,nSeat,>nSeat<
2024-05-700508,5.0,5,6.0,.,.,.,5.0
2024-05-705608,5.0,5,5.0,1.0,.,.,5.0
2024-05-708509,5.0,5,5.0,4.0,.,.,5.0


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_gekentekende_voertuigen_aantal_cilinders,rdw_ovi_aantal_cilinders,rdw_motor_uitvoering_aantal_cilinders,nhtsa_vpic_engine___enginecylinders,nCyl,>nCyl<
2024-05-701308,4.0,4,.,6,.,4.0
2024-05-704010,4.0,4,.,6,.,4.0


  df_.replace({'': np.NaN}, inplace=True)
  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,*nhtsa_vpic_engine___displacementci,*nhtsa_vpic_engine___displacementl,rdw_gekentekende_voertuigen_cilinderinhoud,rdw_ovi_cilinder_inhoud,rdw_motor_uitvoering_cilinderinhoud_cm3,nhtsa_vpic_engine___displacementcc,cylvol,>cylvol<
2024-05-701008,115.945114,1.9,1997.0,.,1997.0,1905.0,.,1997.0
2024-05-701308,122.047488,2.0,1984.0,.,.,2000.0,.,1984.0
2024-05-702808,201.378356,3.3,3342.0,.,3342.0,3300.0,.,3342.0
2024-05-700009,347.835341,5.7,5654.0,.,.,5700.0,.,5654.0
2024-05-700909,122.047488,2.0,1998.0,.,1998.0,2000.0,.,1998.0
2024-05-709109,115.945114,1.9,1997.0,.,1997.0,1905.0,.,1997.0
2024-05-711109,97.637991,1.6,1364.0,.,1364.0,1600.0,.,1364.0
2024-05-713009,134.252237,2.2,2184.0,.,2184.0,2200.0,.,2184.0
2024-05-714509,109.842739,1.8,1798.0,.,1798.0,1800.0,.,1798.0
2024-05-703710,115.945114,1.9,1997.0,.,1997.0,1905.0,.,1997.0


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_.replace({'': np.NaN}, inplace=True)
  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_gekentekende_voertuigen_wielbasis,rdw_basisgegevens_eeg_uitvoering_wielbasis_bovengrens,rdw_basisgegevens_eeg_uitvoering_wielbasis_ondergrens,nhtsa_vpic_exterior_dimension__wheelbaselong,nhtsa_vpic_exterior_dimension__wheelbaseshort,wheelbase,>wheelbase<
2024-05-700008,2850.0,2851.0,2851.0,.,.,.,2850.0
2024-05-700408,2450.0,2451.0,2451.0,.,.,.,2450.0
2024-05-700608,2450.0,2451.0,2451.0,.,.,.,2450.0
2024-05-700908,2700.0,2699.0,2699.0,.,.,.,2700.0
2024-05-701008,2610.0,2608.0,2608.0,.,.,.,2610.0
...,...,...,...,...,...,...,...
2024-05-703710,2610.0,2608.0,2608.0,.,.,.,2610.0
2024-05-703910,2910.0,2905.0,2905.0,.,.,.,2910.0
2024-05-704010,2810.0,.,.,.,110.5,.,2810.0
2024-05-704510,2610.0,2605.0,2605.0,.,.,.,2610.0


  df_.replace({'': np.NaN}, inplace=True)
  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_gekentekende_voertuigen_breedte,rdw_ovi_breedte,rdw_basisgegevens_eeg_uitvoering_breedte_voertuig_uitvoering_bovengrens,rdw_basisgegevens_eeg_uitvoering_breedte_voertuig_uitvoering_ondergrens,rdw_ovi_breedte_min_max,width,>width<
2024-05-700008,1830.0,.,1827.0,1827.0,.,.,1830.0
2024-05-700508,2030.0,2030,2032.0,2032.0,.,.,2030.0
2024-05-700908,1780.0,.,1777.0,1777.0,.,.,1780.0
2024-05-701208,.,.,1799.0,1790.0,.,.,1799.0
2024-05-702608,.,.,1799.0,1790.0,.,.,1799.0
2024-05-703008,1750.0,.,1751.0,1751.0,.,.,1750.0
2024-05-703508,1830.0,.,1825.0,1825.0,.,.,1830.0
2024-05-705608,1870.0,.,1871.0,1871.0,.,.,1870.0
2024-05-705908,1800.0,.,1796.0,1796.0,.,.,1800.0
2024-05-706108,.,.,1850.0,1796.0,.,.,1850.0


  df_.replace({'': np.NaN}, inplace=True)
  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_gekentekende_voertuigen_lengte,rdw_ovi_lengte,rdw_basisgegevens_eeg_uitvoering_lengte_voertuig_uitvoering_bovengrens,rdw_basisgegevens_eeg_uitvoering_lengte_voertuig_uitvoering_ondergrens,rdw_ovi_lengte_min_max,length,>length<
2024-05-700008,4710.0,.,4713.0,4713.0,.,.,4710.0
2024-05-700408,.,.,4111.0,4000.0,.,.,4111.0
2024-05-700508,.,0,5487.0,5339.0,.,.,0.0
2024-05-700608,.,.,3993.0,3897.0,.,.,3993.0
2024-05-700908,4690.0,.,4691.0,4691.0,.,.,4690.0
...,...,...,...,...,...,...,...
2024-05-704110,4280.0,.,4275.0,4275.0,.,.,4280.0
2024-05-704510,.,.,4550.0,4427.0,.,.,4550.0
2024-05-704710,.,.,4336.0,4234.0,.,.,4336.0
2024-05-705010,.,.,4357.0,4255.0,.,.,4357.0


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,*nhtsa_vpic_exterior_dimension__gcwr,*nhtsa_vpic_exterior_dimension__gcwr_to,*nhtsa_vpic_exterior_dimension__gvwr,*nhtsa_vpic_exterior_dimension__gvwr_to,rdw_gekentekende_voertuigen_massa_ledig_voertuig,rdw_ovi_massa_ledig_voertuig,rdw_basisgegevens_eeg_uitvoering_massa_leeg_voertuig_bovengrens,rdw_basisgegevens_eeg_uitvoering_massa_leeg_voertuig_ondergrens,rdw_gekentekende_voertuigen_massa_rijklaar,rdw_ovi_massa_bedrijfsklaar,...,rdw_basisgegevens_eeg_uitvoering_max_massa_vrtg_techn_ondergrens,rdw_gekentekende_voertuigen_toegestane_maximum_massa_voertuig,rdw_gekentekende_voertuigen_technische_max_massa_voertuig,rdw_basisgegevens_eeg_uitvoering_max_massa_voertuig_bovengrens,rdw_basisgegevens_eeg_uitvoering_max_massa_voertuig_ondergrens,rdw_ovi_technische_maximum_massa_voertuig,rdw_ovi_maximum_massa_voertuig,rdw_ovi_technische_maximum_massa_voertuig_min_max,weight,>weight<
2024-05-700008,,,,,1645.0,1645,.,.,1745.0,1745,...,2245.0,2245.0,2245.0,.,.,2245,2245,.,.,1645.0
2024-05-700408,,,,,1016.0,1016,.,.,1116.0,1116,...,1571.0,1571.0,1571.0,.,.,1571,1571,.,.,1016.0
2024-05-700508,,,,,1966.0,1966,.,.,2066.0,2066,...,.,2940.0,2940.0,.,.,2940,2940,.,.,1966.0
2024-05-700608,,,,,1028.0,1028,.,.,1128.0,1128,...,1600.0,1600.0,1600.0,.,.,1600,1600,.,.,1028.0
2024-05-700908,,,,,1485.0,1485,.,.,1585.0,1585,...,2075.0,2075.0,2075.0,.,.,2075,2075,.,.,1485.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-704510,,,,,1410.0,1410,.,.,1510.0,1510,...,2080.0,2080.0,2080.0,.,.,2080,2080,.,.,1410.0
2024-05-704710,,,,,1395.0,1395,.,.,1495.0,1495,...,1990.0,1990.0,1990.0,.,.,1990,1990,.,.,1395.0
2024-05-704910,,,"Class 1: 6,000 lb or less (2,722 kg or less)","Class 1: 6,000 lb or less (2,722 kg or less)",1898.0,1898,.,.,1998.0,1998,...,.,2439.0,2439.0,.,.,2439,2439,.,.,1898.0
2024-05-705010,,,,,1213.0,1213,.,.,1313.0,1313,...,1820.0,1820.0,1820.0,.,.,1820,1820,.,.,1213.0


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_versnellingsbak_uitvoering_aantal_versnellingen_bovengrens,rdw_versnellingsbak_uitvoering_aantal_versnellingen_ondergrens,nhtsa_vpic_mechanical_transmission__transmissionspeeds,nGear,>nGear<
2024-05-709309,6.0,6.0,3,.,6.0


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_.replace({'': np.NaN}, inplace=True)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_brandstof_nettomaximumvermogen,rdw_motor_uitvoering_brandstof_netto_max_vermogen_bovengrens,rdw_motor_uitvoering_brandstof_netto_max_vermogen_ondergrens,rdw_ovi_vermogen_q,nhtsa_vpic_engine___enginehp,nhtsa_vpic_engine___enginehp_to,power,>power<
2024-05-701008,100.0,100.0,100.0,.,123.0,.,.,100.0
2024-05-701308,188.0,.,.,.,252.0,.,.,188.0
2024-05-702808,272.0,272.0,272.0,.,365.0,.,.,272.0
2024-05-709109,130.0,130.0,130.0,.,123.0,.,.,130.0
2024-05-714509,73.0,73.0,73.0,.,98.0,.,.,73.0
2024-05-703710,100.0,100.0,100.0,.,123.0,.,.,100.0
2024-05-703910,272.0,272.0,272.0,.,365.0,.,.,272.0
2024-05-704010,188.0,.,.,.,252.0,.,.,188.0


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_gekentekende_voertuigen_hoogte_voertuig,rdw_basisgegevens_eeg_uitvoering_hoogte_voertuig_uitvoering_bovengrens,rdw_basisgegevens_eeg_uitvoering_hoogte_voertuig_uitvoering_ondergrens,height,>height<
2024-05-700408,.,1551.0,1498.0,.,1551.0
2024-05-700908,.,1419.0,1416.0,.,1419.0
2024-05-701008,.,1512.0,1457.0,.,1512.0
2024-05-701208,.,1465.0,1436.0,.,1465.0
2024-05-701808,.,1595.0,1573.0,.,1595.0
...,...,...,...,...,...
2024-05-703910,.,1420.0,1400.0,.,1420.0
2024-05-704510,.,1706.0,1665.0,.,1706.0
2024-05-704710,.,1573.0,1531.0,.,1573.0
2024-05-705010,.,1491.0,1437.0,.,1491.0


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_uitvoering_gebruiksgegevens_per_uitgave_verbruikcategorie_uitvoering,rdw_gekentekende_voertuigen_zuinigheidsclassificatie,energylab,>energylab<
2024-05-700408,D,C,.,D
2024-05-700608,G,E,.,G
2024-05-700908,F,D,.,F
2024-05-701008,G,D,.,G
2024-05-701108,D,A,.,D
...,...,...,...,...
2024-05-703910,G,,.,G
2024-05-704510,F,B,.,F
2024-05-704710,F,,.,F
2024-05-705010,C,B,.,C


  df_.replace({'': np.NaN}, inplace=True)
  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


Unnamed: 0,rdw_ovi_wachten_op_keuring,rdw_gekentekende_voertuigen_wacht_op_keuren,drz__wok,under_survey,>under_survey<
2024-05-702010,True,.,False,.,True


  df_ = df_.bfill(axis=1)
  car[flds[-1]] = choice # <- UPDATE DATA


- - - - 
# calculate extra info

In [33]:
# four wheel drive
car['fourwd'] = car.rdw_basisgegevens_eeg_uitvoering_aantal_aangedreven_assen > 1

  car['fourwd'] = car.rdw_basisgegevens_eeg_uitvoering_aantal_aangedreven_assen > 1


In [34]:
date_cols = (car.columns != 'rdw_merk_registratie_datum_dt') & (car.columns.str.endswith('_dt')) | car.columns.isin(['MF', 'apk'])
age = car.loc[:, date_cols].replace({np.NaN: pd.NaT}).apply(lambda c: car.drz__auctiondate - c).add_suffix('_age')
if all(age.columns.isin(car.columns)):
    car.update(age)
else:
    car = pd.concat([car, age], axis=1)
car["import_age"] = (car.regnl - car.MF).replace({np.NaN: pd.NaT})

  age = car.loc[:, date_cols].replace({np.NaN: pd.NaT}).apply(lambda c: car.drz__auctiondate - c).add_suffix('_age')
  age = car.loc[:, date_cols].replace({np.NaN: pd.NaT}).apply(lambda c: car.drz__auctiondate - c).add_suffix('_age')
  car["import_age"] = (car.regnl - car.MF).replace({np.NaN: pd.NaT})


In [None]:
# plot age
if VERBOSE > 0:
    df_ = car[[c for c in car.columns if c.endswith('_age')]]\
    .applymap(lambda x: x.days/365.25 if isinstance(x, pd.Timedelta) else x/365.25)\
    .replace({pd.NaT: np.nan}).copy()
if VERBOSE > 1:
    for k,s in df_.iteritems():
        plt.figure(figsize=[16,2])
        s.plot(marker=',', linestyle='', alpha=1, figsize=[16,2], ms=2)
        plt.title(k)
        
    plt.ylabel('age (year)')
    
elif VERBOSE > 0:
    df_.plot(marker='s', linestyle='', alpha=0.4, figsize=[16,8], ms=2)
    plt.legend()   
    plt.xlabel('lot')
    plt.ylabel('age (year)')
    


# subselection and save

### Save data for ML

In [36]:
save_cols = [
    "drz__Price",
    "brand",
    "model",
    "MF_age",
    "fuel",
    "drz__OdoKM_num",
    "apk_age",
    "import_age",
    "bodytype",
    "cylvol",
    "nCyl",
    "power",
    "weight",
    "bpm",
    "newprice",
    "nSeat",
    "nDoor", 
    "color",
    'fourwd',
    'maxspeed',
    'length',
    'height',
    'width',
    'automatic',
    'nGear',
    'energylab',
    'private_owners',
    'company_owners',
    'under_survey',
]
map_lowercase = {
    'drz__Price':'price',
    'MF_age':'age',
    'drz__OdoKM_num':'odometer',
    'apk_age':'days_since_inspection_invalid',
    'import_age':'age_at_import',
    'bodytype':'body_type',
    'cylvol':'displacement',
    'nCyl':'number_of_cylinders',
    'bpm':'registration_tax',
    'newprice':'original_sale_price',
    'nSeat':'number_of_seats',
    'nDoor':'number_of_doors',
    'maxspeed':'top_speed',
    'automatic':'automatic_gearbox',
    'nGear':'number_of_gears',
    'energylab': 'energy_label',
}

if VERBOSE > 1:
    print('Columns >> .. << are saved as car dataset')
    
    ncol = 8
    l = [*car.columns]
    l = sorted(l)
    l = [f'>> {i} <<' if i in save_cols else i for i in l]
    l_padded = np.ceil(len(l) / ncol)*ncol
    l += [''] * int(l_padded - len(l))
    df_ = pd.DataFrame(np.reshape(l, (-1,ncol)), columns = [''] * ncol)
    df_['ix'] = ''
    df_.set_index('ix', inplace=True)
    df_.index.name=''
    with pd.option_context("display.max_rows", 999, "max_colwidth", 32):
        display(df_)

out = car.loc[:,save_cols].rename(columns=map_lowercase)

if VERBOSE > 0:
    with pd.option_context("display.max_columns", 999):
        display(out.tail(), metadata = {'tags': (TAG_SINGLE, )})

# save data
file_name = f'{DATA_DIR}/cars-for-ml.pkl'
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
# save
if True | do_save(file_name): # always save
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')    

/home/tom/bin/satdatsci/Saturday-Datascience/data/cars-for-ml.pkl


### Save data for image classification

In [37]:
save_cols = [
    "drz__Images",
    "brand",
    "model",
    "modelspec",
    "color",
    "MF_age",
    "bodytype",
    "drz__cabriolet",
    "nDoor", 
    "length", 
    'height',
    'width',
    "wheelbase",
    "drz__ForeignReg",
    "drz__Reg",
    "taxi",
]
map_lowercase = {
    'drz__Images':'image_urls',
    'modelspec':'model_specification',
    'MF_age':'age',
    'bodytype':'body_type',
    'drz__cabriolet':'convertible',
    'nDoor':'number_of_doors',
    'drz__ForeignReg':'foreign_registration',
    'drz__Reg':'registration_number'
}

out = car.loc[:,save_cols].rename(columns=map_lowercase)
if VERBOSE > 0:
    with pd.option_context("display.max_columns", 999):
        display(out.tail(), metadata={'tags': (TAG_SINGLE, )})


# save data
file_name = f'{DATA_DIR}/cars-for-imageclf.pkl'
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')

# save
if True | do_save(file_name): # always save
    print(file_name, out.shape)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')    

/home/tom/bin/satdatsci/Saturday-Datascience/data/cars-for-imageclf.pkl (11830, 16)


# Write example table to file
based on: https://stackoverflow.com/a/33869154

In [38]:
def pandas_df_to_markdown_table(df):
    fmt = ['-----' for i in range(len(df.columns))]
    df_fmt = pd.DataFrame([fmt], columns=df.columns)
    df_formatted = pd.concat([df_fmt, df])
    return df_formatted.to_csv(sep="|", index=False)


In [39]:
fn = f'{DATA_DIR}/cars-for-ml.pkl'
print(f'load {fn}')
out = pd.read_pickle(fn)

file_name = f"{cfg['FILE_LOCATION']['app_dir']}/assets/example-table-of-ml.md"
example = out.tail(10).copy()
# trim some long fields
example.rdwinfo = '.. rdw info ..'
example.Raw_text = '.. raw text ..'
example.SupInfo = '.. suplm. info. ..'
example.price = example.price.astype(str)

# convert to md
try:
    table_text = example.reset_index().to_markdown()
except ImportError:
    print('Fallback')
    table_text = pandas_df_to_markdown_table(example.reset_index())

# save
if do_save(file_name):
    with open(file_name,'w') as file:
        file.write(table_text)

    print('A markdown table is available as\n\t{}'.format(file_name))
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
    display(example, metadata={'tags': (TAG_SINGLE, )})
    

load /home/tom/bin/satdatsci/Saturday-Datascience/data/cars-for-ml.pkl
Skip. /home/tom/bin/satdatsci/Saturday-Datascience/assets/example-table-of-ml.md exists or saving is disabled in settings.


In [40]:
fn = f'{DATA_DIR}/cars-for-imageclf.pkl'
print(f'load {fn}')
out = pd.read_pickle(fn)

file_name = f"{cfg['FILE_LOCATION']['app_dir']}/assets/example-table-of-imageclf.md"
example = out.tail(10).copy()
# trim some long fields
example.rdwinfo = '.. rdw info ..'
example.Raw_text = '.. raw text ..'
example.SupInfo = '.. suplm. info. ..'


# convert to md
try:
    table_text = example.reset_index().to_markdown()
except ImportError:
    print('Fallback')
    table_text = pandas_df_to_markdown_table(example.reset_index())

# save
if do_save(file_name):
    with open(file_name,'w') as file:
        file.write(table_text)

    print('A markdown table is available as\n\t{}'.format(file_name))
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
    display(example, metadata={'tags': (TAG_SINGLE, )})
    

load /home/tom/bin/satdatsci/Saturday-Datascience/data/cars-for-imageclf.pkl
Skip. /home/tom/bin/satdatsci/Saturday-Datascience/assets/example-table-of-imageclf.md exists or saving is disabled in settings.


In [41]:
with pd.option_context('display.max_rows', 999):
    vc = car.brand.value_counts().sort_index()
    print(vc.shape[0], '(expected 72)')
    display(vc)


72 (expected 72)


brand
AIXAM                          1
ALFA ROMEO                    93
ASTON-MARTIN                  13
AUDI                        1095
AUSTIN-HEALEY                  1
AUVERLAND                      1
BENTLEY                       19
BMW                          980
BUICK                          1
CADILLAC                       9
CHEVROLET                     99
CHRYSLER                      48
CITROËN                      368
DACIA                         16
DAEWOO                        15
DAF                            2
DAIHATSU                      34
DAIMLER                        2
DATSUN                         2
DAX                            1
DODGE                         24
DS                             2
FERRARI                       17
FIAT                         335
FORD                         437
GMC                            2
HONDA                         81
HUMMER                         5
HYMER                          1
HYUNDAI                      116
INFI