# Preprocess car data

In [1]:
import os
# setting path
os.chdir(r'..')

import drz_config
cfg = drz_config.read_config()
DATE = cfg['DATE']
VERBOSE = cfg['VERBOSE']
OPBOD = cfg['OPBOD']
SKIPSAVE = cfg['SKIPSAVE']

TAG_SINGLE = "nbconvert_instruction:remove_single_output"

#VERBOSE = 10

if VERBOSE > 0:
    display(cfg)

{'settings_fn': '../code/assets/drz-auction-settings.ini',
 'DATE': '2022-11',
 'VERBOSE': 1,
 'OPBOD': False,
 'URL': 'http://verkoop.domeinenrz.nl/verkoop_bij_inschrijving_2022-0031',
 'EXTEND_URL': False,
 'CLOSEDDATA': True,
 'closed_data_fields': '*',
 'SKIPSAVE': False}

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os

## Load data

In [3]:
if OPBOD:
    fn = '../data/cars-from-all-auctions-opbod.pkl'
else:
    fn = '../data/cars-from-all-auctions.pkl'
print(fn)
car = pd.read_pickle(fn)

if VERBOSE > 0:
    display(car.tail(), metadata={"tags":(TAG_SINGLE, )})

../data/cars-from-all-auctions.pkl


### Rename columns
to keep track of source of field

In [4]:
car.rename(columns={
    'rhd': 'rhd_drz',
    'automatic': 'automatic_drz',
    'hybrid': 'hybrid_drz',
    'taxi': 'taxi_drz',
    'ItemType': 'model_drz',
    'ItemBrand': 'brand_drz',
}, inplace=True)

## Odometer
convert miles to km and make all numerical

In [5]:
def odo_str2float(df):
    '''
    Convert odometer to numerical values.
    Also convert Miles to KMs.
    '''
    
    ml2km = 1.609344
    
    # substitute to nan
    for repair_to_nan in ['onbekend', 
                          'volgens NAP logisch', 'volgens nap logisch', 
                          'volgens NAP onlogisch', 'volgens nap onlogisch']:
        df.OdoKM = df.OdoKM.apply(lambda x:x.replace(repair_to_nan,'nan') if (type(x) == str) and (len(x)>0) else 'nan')
        df.OdoMLS = df.OdoMLS.apply(lambda x:x.replace(repair_to_nan,'nan') if (type(x) == str) and (len(x)>0) else 'nan')
    # float
    df["OdoKM_num"] = df.OdoKM.apply(lambda x:x.replace('.','')).astype(float)
    df["OdoMLS_num"] = df.OdoMLS.apply(lambda x:x.replace('.','')).astype(float)
    # convert miles to km
    df["Odo"] = df.OdoKM_num.copy()
    chooseMls = (df.OdoKM_num.isna()) & ~(df.OdoMLS_num.isna())
    df.loc[chooseMls,"Odo"] = df.loc[chooseMls,"OdoMLS_num"].apply(float) * ml2km

In [6]:
# do miles to km conversion
odo_str2float(car)

if VERBOSE > 0:
    # Check if conversion went okay
    #   If km or mls exist there should be a "Odo" reading
    assert (
        car.loc[:,['OdoKM','OdoKM_num','OdoMLS','OdoMLS_num']].astype(float).isna().all(axis=1)\
        & car.Odo.notna()
    ).any() == False
    
    # print new column and source info
    #display(car.loc[:,['Odo','OdoKM','OdoKM_num','OdoMLS','OdoMLS_num']])
    display(pd.DataFrame(
        car.loc[:,['Odo', 'OdoMLS_num']].notna().sum(axis=1)\
        .value_counts()\
        .rename(index={1: 'km', 2: 'miles', 0:'no odo reading'}),
        columns=['nr of cars'])
           )

    # plot odometer
    fig,ax=plt.subplots(figsize=[16,8])
    ax.set_xlabel('lot')
    ax.set_ylabel('odometer (km or mls)')

    car.loc[:,["OdoKM_num"]].plot(marker='s',linestyle='',alpha=1,ax=ax, color='lightgray')
    car.loc[:,["Odo"]].plot(marker='+',color='darkgray',linestyle='',alpha=1,ax=ax)
    car.loc[:,["OdoMLS_num"]].plot(marker='s',linestyle='',alpha=1,ax=ax)


Unnamed: 0,nr of cars
km,9124
no odo reading,287
miles,228


## Model and brand

Enforce some consistency in naming


In [7]:
# Rename to conventional brand name
car.brand_drz.replace({
    "ASTON MARTIN":"ASTON-MARTIN",
    'AUTO UNION':'AUDI',
    'JAGUAR CARS':'JAGUAR',
    "MERCEDES BENZ":"MERCEDES-BENZ",
    "MERCEDES":"MERCEDES-BENZ",
    "MICRO COMPACT CAR SMART":"SMART",
    "MICRO COMPACT CAR":"SMART",
    "LANDROVER": "LAND ROVER",
    "LAND-ROVER": "LAND ROVER",
    "CITRO": "CITROËN",
    "CITROÃÂ\x8bN": "CITROËN",
    "CITROEN": "CITROËN",
    "G.M.C.": "GMC",
    "VOLKWAGEN": "VOLKSWAGEN",
},inplace=True)


# When brand name has a specification that needs to go in the model name.
# E.g. Mercedes <AMG> and Audi <QUATTRO>
def add_model_spec(s,spec):
    '''Adds specification at the end of the model name if not already in name'''
    import re 
    
    # addition should not exist
    if not re.search('(?i)' + spec, s):
        
        # add separator
        if not (s.endswith(';')):
            s += ';'
        # add specification
        s += ' ' + spec
    return s

sel = car.brand_drz == 'QUATTRO'
car.loc[sel,'model_drz'] = car.loc[sel,'model_drz'].apply(lambda s: add_model_spec(s,'quattro'))
car.loc[sel,'brand_drz'] = 'AUDI'

sel = car.brand_drz == 'MERCEDES-AMG'
car.loc[sel,'model_drz'] = car.loc[sel,'model_drz'].apply(lambda s: add_model_spec(s,'amg'))
car.loc[sel,'brand_drz'] = 'MERCEDES-BENZ'

sel = car.brand_drz == "ALPINA"
car.loc[sel,'model_drz'] = car.loc[sel,'model_drz'].apply(lambda s: add_model_spec(s,'alpina'))
car.loc[sel,'brand_drz'] = 'BMW'

sel = car.brand_drz == "BMW 3ER REIHE"
car.loc[sel,'model_drz'] = car.loc[sel,'model_drz'].apply(lambda s: add_model_spec(s,'3er reihe'))
car.loc[sel,'brand_drz'] = 'BMW'

sel = car.brand_drz == 'RANGE ROVER'
car.loc[sel,'model_drz'] = car.loc[sel,'model_drz'].apply(lambda s: add_model_spec(s,'range rover'))
car.loc[sel,'brand_drz'] = 'LAND ROVER'

sel = car.brand_drz == "FORD C MAX"
car.loc[sel,'model_drz'] = car.loc[sel,'model_drz'].apply(lambda s: add_model_spec(s,'c max'))
car.loc[sel,'brand_drz'] = 'FORD'



In [8]:
import html
car["model_drz"] = car.model_drz.fillna('').apply(lambda s: html.unescape(s))


In [9]:
car.loc[:, 'modelspec_drz'] = ''
car.update(
    car.model_drz.str.split(pat=';', expand=True, n=1).fillna('').rename(columns={0: 'model_drz', 1: 'modelspec_drz'})
)

### Concatenate columns with index numbers

In [10]:
def _split_indexnr(c):
    M = re.match(r'^(((rdw)|(nhtsa))_[a-z,_,]+(_cm3)?)((_[0-9]+)+)$', c)
    if M is None:
        return None
    
    return M[0], M[1], M[6]


## Example
# cs = [
#     'rdw_motor_uitvoering_cilinderinhoud_cm3_1',
#     'rdw_motor_uitvoering_cilinderinhoud_cm3_2',
#     'rdw_motor_uitvoering_cilinderinhoud_cm3_3',
#     'rdw_motor_uitvoering_brandstof_emissie_stikstofoxide_type_1_1_1',
#     'rdw_motor_uitvoering_brandstof_brandstofverbruik_stadsrit_3_1_1',
#     'rdw_motor_uitvoering_brandstof_brandstofverbruik_stadsrit_3_1',
#     'nhtsa_motor_uitvoering_brandstof_brandstofverbruik_stadsrit_3_1'
# ]
# _split_indexnr(cs[6])
# pd.DataFrame([_split_indexnr(c) for c in cs], columns=['old', 'new', 'counter'])

In [11]:
def _prog_old(cur, end, last=0):
    
    pct = cur/end
    
    if pct > last/100:
        print(f'{last}%')
        last += 10 # every 10% an update
    elif cur >= end:
        print('done')
    else:
        print('.', end='')
    
    return last

## Example
# last = _prog(11,100,10)
# last = _prog(12,100,last)
# last = _prog(13,100,last)

def _prog(display_id, cur, end, extra_info = 'running', bar_len = 40):
    pct = cur/end
    bar = ''.join(
        ['|'] * int(bar_len*pct) +
        ['-'] * int(bar_len*(1-pct)) 
    ) + f'{pct*100:3.0f}% [{extra_info}]'
    if display_id is None:
        display_id = display({'text/plain': ''}, raw = True, display_id=True)
    display_id.update({'text/plain': bar}, raw = True)
    
    return display_id

## Example
# display_id = None
# display_id = _prog(display_id, 0, 1337)
# display_id = _prog(display_id, 42, 1337)
# display_id = _prog(display_id, 42, 137, '137')

In [12]:
# Get info from dataframe columns
new_names = np.array([i[1] + '_concat' for i in map(_split_indexnr, car.columns) if i is not None])
old_names = np.array([i[0] for i in map(_split_indexnr, car.columns) if i is not None])
counter = np.array([i[2] for i in map(_split_indexnr, car.columns) if i is not None])

if 2 > VERBOSE > 0:
    # initiate progress bar
    display_id = None
for prg, new_name in enumerate(set(new_names)):
    
    # select columns in df
    sel = new_names == new_name
    
    # make dict from columns
    new_dicts = car[old_names[sel]].apply(lambda row: {
        k[1:]: v # {'1_1_1': 'value'}, "[1:]" to trim off leading "_" 
        for k,v in zip(counter[sel], row) 
        if ~((isinstance(v,float)) and (np.isnan(v)))
    }, axis=1)
    
    # add series to new column 
    car[new_name] = new_dicts
    
    # Remove old columns
    car.drop(columns=old_names[sel], inplace=True)
    
    # progress
    if 2 > VERBOSE > 0:
        display_id = _prog(display_id, prg, len(set(new_names)), new_name)
    elif VERBOSE > 1:
        print(f'{counter[sel][0]:7s} .. {counter[sel][-1]:7s} ({sum(sel):3.0f}) -> {new_name:s}')
        
if 2 > VERBOSE > 0:
    display_id = _prog(display_id, prg, len(set(new_names)), f'{len(old_names)} columns merged to {prg} new ones')

||||||||||||||||||||||||||||||||||||||| 99% [1552 columns merged to 195 new ones]

In [13]:
if VERBOSE > 1:
    # Unknown registrations
    # Might be able to fix it by looking at the raw text.
    ixs = car.loc[car.Reg.str.lower() == 'onbekend'].index
    display(car.loc[ixs,['Reg','ForeignReg','Raw_text']])
    for ix in ixs:
        rt = car.loc[ix,'Raw_text']
        if type(rt) == list:
            txt = '</br>&nbsp;&nbsp;&nbsp;&nbsp;'.join(rt)
        else:
            txt = ',</br>&nbsp;&nbsp;&nbsp;&nbsp;'.join(rt.split(','))
        
        txt = txt.replace('kenteken', '<B><font color="red">kenteken</font></B>')
        display({'text/html': f'<b>{ix}</b></br>&nbsp;&nbsp;&nbsp;&nbsp;{txt}'},
                raw=True, metadata={'tags': (TAG_SINGLE, )})

if VERBOSE > 1:
    # FUTURE: Do something with foreign registrations
    # Parsing did not always get it right.
    display({'text/html':
        '<b>Foreign registrations:</b></br>&nbsp;&nbsp;&nbsp;&nbsp;' +  
        '</br>&nbsp;&nbsp;&nbsp;&nbsp;'.join(list(car.ForeignReg.dropna().unique())) +
        '</br>'
    }, raw=True)#
    # pd.DataFrame(car.ForeignReg.value_counts())

if VERBOSE > 1:
    # Steering wheel in center? "M"?
    display(
        car.loc[
            ~car.rdw_basisgegevens_kant_van_het_stuur.isin(['R', 'L']), 
            ['rdw_basisgegevens_kant_van_het_stuur', 'Raw_text']].dropna()
    )

    # De zijde van het voertuig waar het stuurwiel is gemonteerd.
    # Waarden 
    # L     Links
    # R    Rechts
    # M    Midden
    # src: https://www.rdw.nl/-/media/rdw/rdw/pdf/sitecollectiondocuments/over-rdw/naslagwerk/beschrijving-dataset-typegoedkeuring-v10.pdf
    # car.rdw_basisgegevens_kant_van_het_stuur.value_counts()



## Other repairs

In [14]:
# reparse raw text NAP
# Wrong ones have 'isch'

# re pattern
patt = '^Km-stand volgens nap (?P<val>(logisch)|(onlogisch))$'
# loop over wrong ones
for ix in car[car.NAP == 'isch'].index:
    # line by line
    for line in car.loc[ix,'Raw_text']:
        M = re.match(patt,line)
        if M:
            car.loc[ix,"NAP"] = M.group('val')
            
            
# Year of manufacturing unknown
# Mfyear: "onbekend"
car.Mfyear.replace('onbekend','', inplace=True)

## Adhoc repair

When all fails

In [15]:
if OPBOD:
    print('skip')
else:
    # '2603 Afkomstig van JFC HQ Brunssum.'
    ix='2017-5-2603'
    car.loc[ix,"LotNr"] = '2603'
    car.loc[ix, 'jfc'] = True
    # K2000098227 Afkomstig van JFC HQ Brunssum.
    ix='2020-9-8227'
    car.loc[ix,"LotNr"] = '8227'
    car.loc[ix, 'jfc'] = True
    
    # "bouwjaar verklaring noodzakelijk."
    # car.loc[[v=="verklaring noodzakelijk." for v in car.Mfyear]]
    # car[car.Mfyear.str.contains('verklaring noodzakelijk.')==True]
    ixs = ['2020-1-7177']
    for ix in ixs:
        car.loc[ix, 'Mfyear'] = ''

    # fix issus with one lot that has no type
    # print(car.model_drz[~ (car.model_drz.apply(type) == str)])
    # [print(l) for l in eval(car.loc['2017-6-7121','Raw_text'])]
    # car.loc['2017-6-7121','Images']
    ix = '2017-6-7121'
    car.loc[ix,'model_drz'] = 'golf'

    # car.loc[car.brand_drz == 'Kampeerwagen/camper',:]
    # car.loc["2017-5-2408",:]
    ix = '2017-5-2408'
    #car.loc[ix,'brand_drz'] = 'VOLKSWAGEN'
    car.drop(ix,inplace=True) # remove alltogether


    # car.loc[car.brand_drz == 'AUDI A4',:]
    ix = '2018-6-7195'
    car.loc[ix, 'brand_drz'] = 'AUDI'
    car.loc[ix, 'model_drz'] = 'a4'
    car.loc[ix, 'modelspec_drz'] = car.loc[ix,"model_drz"]

    # car.loc[car.brand_drz.str.contains('PANAMERA'), ['brand_drz', 'brand_rdw', 'brand_rdwovi', 'model_drz', 'model_rdw', 'model_rdwovi', 'modelspec_drz']]
    ix = '2019-6-2408'
    car.loc[ix, 'brand_drz'] = 'PORSCHE'
    car.loc[ix, 'model_drz'] = 'panamera'
    car.loc[ix, 'modelspec_drz'] = car.loc[ix,"model_drz"]
    
    # car[car.brand_drz.str.contains('CAYENNE')]
    ix = '2021-06-710206'
    car.loc[ix, 'brand_drz'] = 'PORSCHE'
    car.loc[ix, 'model_drz'] = 'cayenne'
    car.loc[ix, 'modelspec_drz'] = car.loc[ix,"model_drz"]
    
    # car.loc[car.brand_drz == 'MINI COOPER',:]
    ix = '2018-10-2210'
    car.loc[ix,'brand_drz'] = 'MINI'
    car.loc[ix,'model_drz'] = 'cooper'
    car.loc[ix, 'modelspec_drz'] = car.loc[ix,"model_drz"]
    
    # car.loc[car.brand_drz == 'DAIMLERCHRYSLER AG', ['Reg', 'brand_drz', 'rdw_merk', 'rdw_ovi_merk', 'model_drz', 'Raw_text']]
    ixs = ['2019-9-9262', '2019-11-8141']
    for ix in ixs:
        car.loc[ix,'brand_drz'] = 'MERCEDES-BENZ'
        
    # car.loc[car.brand_drz.str.lower().str.contains('jaguar land'), ['brand_drz', 'model_drz', 'modelspec_drz']]
    ixs = ['2021-04-7068', '2021-08-800518']
    for ix in ixs:
        car.loc[ix,'brand_drz'] = 'LAND ROVER'
    

    # car.loc[car.model_drz == 'benz',:]
    # car.loc[ix,'Images']
    # This is a w204 mfyear < 2011
    ix = '2017-5-2618'
    car.loc[ix,'model_drz'] = 'c cdi'

    # car.loc["2018-1-3046","Raw_text"]
    # This is combined lot
    car.drop("2018-1-3046",inplace=True) # remove alltogether

    # car.ForeignReg=='Het voertuig is voorzien van taxi-kentekenplaten. Taxiregistratie kunt u laten be&#235;indigen via de RDW. Vervanging van de blauwe door gele'
    # Taxi
    ixs = ['2017-11-8302', '2017-11-8305', '2018-1-8163']
    regs = ['54-GLL-5','57-XZ-FV','70-TLF-3']
    for ix,reg in zip(ixs,regs):
        car.loc[ix,'taxi_drz'] = True
        car.loc[ix,'Reg'] = reg

    # Typo in registration K1900022009
    # 8-SKL-15 not 8-SLK-15
    ix = '2019-2-2009'
    car.loc[ix,'Reg'] = '8-SKL-15'
    if car.loc[ix,'rdwinfo']['kenteken'][0] == '8SLK15':
        car.loc[ix,'rdwinfo'] = None

    # NAP is provided first and impacts Odometer reading
    ixs = ['2019-9-9106', '2019-9-9249']
    naps = ['logisch', 'onlogisch']
    kms = ['251.571', '', '']
    #display(car.loc[ixs,['NAP', 'OdoKM', 'Raw_text']])
    for ix,nap,km in zip(ixs,naps,kms):
        car.loc[ix,'NAP'] = nap
        car.loc[ix,'OdoKM'] = km
        
    
    # Text in lot was missing a character: "58.83"
    ix = '2020-12-7138'
    car.loc[ix, 'OdoKM'] = '58.683'
    
    # date format is different
    ix = '2020-12-7263'
    car.loc[ix, 'Mfdate'] = car.loc[ix, 'Mfdate'].replace('-','.')
    
    # Typo
    
    # car.loc[car.brand_drz == 'MERCDES-BENZ',:]
    ix = '2018-9-8162'
    car.loc[ix,'brand_drz'] = 'MERCEDES-BENZ'
    
    # car[car.brand_drz.str.lower().str.startswith('volkw')]
    ix = '2021-05-8116'
    car.loc[ix, 'brand_drz'] = 'VOLKSWAGEN'
    
    # Outside looks just like audi 80 estate
    # car.loc[car.brand_drz.str.startswith('AUDI/')].Reg
    ix = '2021-05-8098'
    car.loc[ix, 'brand_drz'] = 'AUDI'
    car.loc[ix, 'model_drz'] = 'audi 80'
    car.loc[ix, 'modelspec_drz'] = 'avant rs2 232 kw audi/porsche'
    
    # 206+ has different front (like 207)
    # car.loc[car.rdw_typegoedkeuringsnummer.str.startswith('e2*2001/116*0374'), car.columns.str.contains('model') + car.columns.str.contains('rdw_typegoedkeuringsnummer')]
    ix = '2021-11-705111'
    car.loc[ix, 'model_drz'] = '206+'
    
    # Date ends with a '.'
    for d in ['Mfdate', 'APKdate']:
        ixs = car.loc[car.loc[:,d].str.endswith('.').fillna(False)].index
        car.update(car.loc[ixs, d].apply(lambda s: s[:-1]))


- - - - 
## Make data type consistent

In [16]:
def string_to_list_images(s):
    '''Convert string to list'''
    if type(s) == str:
        s=s.replace('[',"['",1)
        s=s[::-1].replace(']',"]'",1)[::-1]
        s=eval(s.replace(', ',"' , '"))
        
    return s

def string_to_int_lotnr(s):
    '''Convert string to int'''
    if type(s) == str:
        if s[0] == 'K':
            # "K1800092200"
            s=int(s[-4:])
        else:
            s=int(s)
        
    return s

def string_to_list_rawtext(s):
    '''Convert string to list'''
    if type(s) == str:
        if s[1] == "'":
            s=eval(s)
        else:
            s=string_to_list_images(s)
        
    return s

def parse_suffix(s, suffix):
    _suffix_pattern = f'^\s*([0-9,\.,\,]+)\s*{suffix}'
    if not isinstance(s, str):
        return s
    tokens = re.findall(_suffix_pattern, s)
    if len(tokens) == 0:
        return s
    elif len(tokens) > 1:
        print(tokens)
        raise Exception('fix this. too many results')
        return s
    return tokens[0][0]

def parse_cm(s): 
    return parse_suffix(s, r'((cm)|(CM))')
def parse_kg(s): 
    return parse_suffix(s, r'((kg)|(KG))')

def parse_cm_OLD(s):
    if not isinstance(s, str):
        return s
    tokens = re.findall('^\s*([0-9,\.,\,]+)\s*cm|CM', s)
    if len(tokens) == 0:
        return s
    elif len(tokens) > 1:
        print(tokens)
        raise Exception('fix this. too many results')
        return s
    return tokens[0]

def parse_kg_OLD(s):
    if not isinstance(s, str):
        return s
    tokens = re.findall('^\s*([0-9,\.,\,]+)\s*kg|KG', s)
    if len(tokens) == 0:
        return s
    elif len(tokens) > 1:
        print(tokens)
        raise Exception('fix this. too many results')
        return s
    return tokens[0]

def parse_euro(s):
    if not isinstance(s, str):
        return s
    tokens = re.findall('^€\s*([0-9,\.,\,]+)', s)
    if len(tokens) == 0:
        return s
    elif len(tokens) > 1:
        print(tokens)
        raise Exception('fix this. too many results')
        return s
    return tokens[0]



def parse_jn(s):
    return s.replace({'Ja': True, 'Nee': False})

def str_to_upper(s):
    return s.str.upper()


In [17]:
# Convert to the same data type

car.Images = car.Images.apply(lambda r: string_to_list_images(r))
car.LotNr = car.LotNr.apply(string_to_int_lotnr)
car.Raw_text = car.Raw_text.apply(string_to_list_rawtext)

In [18]:
# parse string .. cm, EUR .. ..kg etc
car.update(
    car.apply({
        'rdw_ovi_afstand_voorzijde_vrtg_tot_hartkoppeling': parse_cm,
        'rdw_ovi_cilinder_inhoud': parse_cm,
        'rdw_ovi_wielbasis': parse_cm,
        'rdw_ovi_breedte': parse_cm,
        'rdw_ovi_lengte': parse_cm,
        
        'rdw_ovi_bpm_bedrag': parse_euro,
        'rdw_ovi_catalogus_prijs': parse_euro,

        'rdw_ovi_massa_bedrijfsklaar': parse_kg,
        'rdw_ovi_massa_ledig_voertuig': parse_kg,
        'rdw_ovi_maximum_massa_geremd': parse_kg,
        'rdw_ovi_maximum_massa_ongeremd': parse_kg,
        'rdw_ovi_maximum_massa_samenstel': parse_kg,
        'rdw_ovi_maximum_massa_voertuig': parse_kg,
        'rdw_ovi_technische_maximum_massa_voertuig': parse_kg,
        'rdw_ovi_maximum_massa_autonoom_geremd': parse_kg,
        'rdw_ovi_maximum_massa_middenas_geremd': parse_kg,
        'rdw_ovi_maximum_massa_oplegger_geremd': parse_kg,
        'rdw_ovi_laadvermogen': parse_kg,

        'rdw_ovi_vermogen_q': lambda s: parse_suffix(s, '(kW/kg)'), #Vermogen gedeeld door massa rijklaar

        'rdw_wam_verzekerd': parse_jn, 
        'rdw_openstaande_terugroepactie_indicator': parse_jn, 
        'rdw_export_indicator': parse_jn, 
        'rdw_taxi_indicator': parse_jn, 
        'rdw_retrofit_roetfilter': parse_jn, 
        'rdw_ovi_wachten_op_keuring': parse_jn, 
        'rdw_ovi_w_a_verzekerd': parse_jn, 

        'rdw_ovi_kleur': str_to_upper,
    })
)


In [19]:
# convert 1/0 to Booleans 
# nan: False
# empty string : maintain
# Note that missing values will become False

for c in ['automatic_drz','cabriolet','rhd_drz','no_road','crewcab','carwrap','d_lic','btw21', 'taxi_drz',
          'benzine','diesel','lpg','electric','hybrid_drz',
          'jfc','locked','no_key','no_cvo','no_igk','no_odo','wo_frame','used_parts', 'legguard', 'key_fixed', 'no_nap', 
          'import','early_reg','maybe_reg','no_inireg','no_nlreg193','no_nlreg194', 'no_nlreg19', 'no_orireg','no_reg','no_regneeded','no_rdw','rdw150','no_vin', 'first_reg_abroad', 
          'right_of_withdrawal', 
          'rdw_taxi_indicator', 'rdw_ovi_under_survey', 
          'disclaim1','disclaim2','disclaim12','disclaim3','disclaim4', 'disclaim5', 'disclaim_cr6', 'disclaim6']:
    if not OPBOD:
        assert c in car.columns, print(f'Column "{c}" not in data')
    else:
        # Debug
        if c not in car.columns:
            print('skip', c)
            continue

    car[c] = car[c].replace({0:False,1:True}).fillna(False)

In [20]:
# Get last query from list of rdw querries
if 'rdwinfo' not in car.columns:
    print('skip')
else:
    last_query = car.rdwinfo.apply(lambda x:pd.to_datetime([i['TimeStamp'] for i in x],format='%Y%m%d').argmax() if type(x) == list else np.NaN)
    for queries,last,idx in zip(car.rdwinfo,last_query,car.index):
        if np.isnan(last):
            continue
        car.loc[idx,'rdwinfo'] = [queries[int(last)]]

### Fill empty

In [21]:
car.Draw = car.Draw.fillna(False)
car.Mfyear = car.Mfyear.replace('',np.NaN).fillna(-1).astype(int)
car.Note = car.Note.astype(str)

In [22]:
# replace "niet geregistreerd" with nan
rep_cols = [c for c in car.columns if c.startswith('rdw_')]
car[rep_cols] = car[rep_cols].replace({
    'Niet geregistreerd': np.NaN,
    'Geen verstrekking in Open Data': np.NaN,
})

In [23]:
# Lists
# NaN: [] (empty list)
cols = [
    'rdw_merk_registratie_datum', 
    'rdw_merkcode',
    'rdw_merk_uitvoering_toegestaan_merk_registratie_datum',
    'rdw_merk_uitvoering_toegestaan_merkcode',
]
# Should exist in dataframe
assert len(set(cols).difference(car.columns)) == 0
## Debug
# print('skip', np.setdiff1d(cols, car.columns))

car.loc[:,cols] = car.loc[:,cols].applymap(lambda x: [] if ((not isinstance(x, list)) and pd.isna(x)) else x)




# Integers
# NaN: -1
cols = [
    'N_images',
    'rdw_aantal_cilinders',
    'rdw_aantal_deuren',
    'rdw_basisgegevens_aantal_deuren_bovengrens',
    'rdw_basisgegevens_aantal_deuren_ondergrens',
    'rdw_aantal_rolstoelplaatsen',
    'rdw_aantal_wielen',
    'rdw_basisgegevens_aantal_wielen',
    'rdw_aantal_zitplaatsen',
    'rdw_basisgegevens_aantal_zitplaatsen_bovengrens',
    'rdw_basisgegevens_aantal_zitplaatsen_ondergrens',
    'rdw_basisgegevens_aantal_aangedreven_assen',
    'rdw_uitvoering_wijzigingsnummer',
    'rdw_eeg_basis_goedkeuringsnummer',
    'rdw_eeg_uitbreiding_goedkeuringsnummer',
    'rdw_ovi_aantal_assen',
    'rdw_ovi_aantal_cilinders',
    'rdw_ovi_aantal_rolstoelplaatsen', #Niet geregistreerd
    'rdw_ovi_aantal_wielen',
    'rdw_ovi_aantal_zitplaatsen', #Niet geregistreerd
    'rdw_ovi_aantal_deuren', 
    'rdw_ovi_afstand_voorzijde_vrtg_tot_hartkoppeling', #Niet geregistreerd ".. cm"
    'rdw_ovi_cilinder_inhoud', #Niet geregistreerd ".. cm^3"
    'rdw_ovi_wielbasis', #..cm
    'rdw_ovi_bpm_bedrag', # eur ..
    'rdw_ovi_catalogus_prijs', #Niet geregistreerd "EUR .."
    'rdw_ovi_massa_bedrijfsklaar', #' ..kg'
    'rdw_ovi_massa_ledig_voertuig', #..kg
    'rdw_ovi_maximum_massa_geremd', #..kg
    'rdw_ovi_maximum_massa_ongeremd', #..kg
    'rdw_ovi_maximum_massa_samenstel', #..kg
    'rdw_ovi_maximum_massa_voertuig', #..kg
    'rdw_ovi_technische_maximum_massa_voertuig', #..kg
    'rdw_ovi_tellerstand_datum', #yyyy
    'rdw_ovi_aantal_staanplaatsen', #0
    'rdw_ovi_maximum_massa_autonoom_geremd', #..kg
    'rdw_ovi_maximum_massa_middenas_geremd', #..kg
    'rdw_ovi_maximum_massa_oplegger_geremd', #..kg
    'rdw_ovi_laadvermogen', #..kg
    'rdw_ovi_breedte', #..cm (not float)
    'rdw_ovi_lengte', #..cm (not float)
    'nhtsa_engine___enginecylinders', 
    'nhtsa_exterior_body__doors', 
    'nhtsa_general___modelyear',
    'nhtsa_mechanical_transmission__transmissionspeeds', 
]

# Should exist in dataframe
assert len(set(cols).difference(car.columns)) == 0
## Debug
# print('skip', np.setdiff1d(cols, car.columns))

car.loc[:,cols] = car.loc[:,cols].fillna(-1).astype(int)




# Strings
# NaN: ''
cols = [
    'APKdate', 
    'BTW', 
    'ForeignReg', 
    'model_drz', 
    'Mfdate', 
    'NAP', 
    'Reg', 
    'Source', 
    'SupInfo',
    'rdw_kenteken',
    'rdw_Reg',
    'rdw_api_gekentekende_voertuigen_assen',
    'rdw_api_gekentekende_voertuigen_brandstof',
    'rdw_api_gekentekende_voertuigen_carrosserie',
    'rdw_api_gekentekende_voertuigen_carrosserie_specifiek',
    'rdw_api_gekentekende_voertuigen_voertuigklasse',
    'rdw_eerste_kleur',
    'rdw_europese_voertuigcategorie',
    'rdw_export_indicator',
    'rdw_handelsbenaming',
    'rdw_inrichting',
    'rdw_merk',
    'rdw_openstaande_terugroepactie_indicator',
    'rdw_plaats_chassisnummer',
    'rdw_retrofit_roetfilter',
    'rdw_tweede_kleur',
    'rdw_type',
    'rdw_typegoedkeuringsnummer',
    'rdw_uitvoering',
    'rdw_variant',
    'rdw_voertuigsoort',
    'rdw_wacht_op_keuren',
    'rdw_wam_verzekerd',
    'rdw_zuinigheidslabel',    
    'rdw_type_gasinstallatie',
    'rdw_TimeStamp_x',
    'rdw_TimeStamp_y',
    'rdw_eu_type_goedkeuringssleutel',
    'rdw_eu_type_goedkeuringssleutel_y',
    'rdw_eeg_uitvoeringscode',
    'rdw_eeg_variantcode',
    'rdw_api_as_gegevens_eeg_uitvoering',
    'rdw_api_basisgegevens_eeg_uitvoering',
    'rdw_api_carrosserie_uitvoering',
    'rdw_api_carrosserie_uitvoering_klasse',
    'rdw_api_carrosserie_uitvoering_nummerieke_code',
    'rdw_api_handelsbenaming_uitvoering',
    'rdw_api_merk_uitvoering_toegestaan',
    'rdw_api_motor_uitvoering',
    'rdw_api_motor_uitvoering_brandstof',
    'rdw_api_plaatsaanduiding_uitvoering',
    'rdw_api_subcategorie_uitvoering',
    'rdw_api_uitvoeringverbruik_per_uitgave',
    'rdw_api_versnellingsbak_uitvoering',
    'rdw_eeg_ece_voertuig_categorie_bij_type',
    'rdw_europese_typegoedkeuring_status',
    'rdw_fabrikant',
    'rdw_landcode_eeg_typegoedkeuring',
    'rdw_eeg_voertuig_cat_toevoeging',
    'rdw_richtlijn_nr_laatste_wijziging',
    'rdw_type_fabrikant',
    'rdw_basisgegevens_status_voertiug_kentekening',
    'rdw_basisgegevens_compleet_voertuig_indicator',
    'rdw_basisgegevens_links_rechts_rijdend',
    'rdw_basisgegevens__24ghz_kortbereik_radar',
    'rdw_basisgegevens_eur_codering_carrosserietype',
    'rdw_basisgegevens_handelsbenaming',
    'rdw_basisgegevens_kant_van_het_stuur',
    'rdw_subcategorie_uitvoering_subcateg_uitvoering_europees',
    'rdw_basisgegevens_tweede_brandstofcode_voertuig',
    'rdw_basisgegevens_eeg_uitvoering_cat_toevoeging',
    'rdw_brandstof_omschrijving',
    'rdw_carrosserietype', 
    'rdw_type_carrosserie_europese_omschrijving',
    'rdw_carrosserie_voertuig_nummer_europese_omschrijving',
    'rdw_aangedreven_as', #"N"
    'rdw_hefas', #"N"
    'rdw_plaatscode_as', #V, A
    'rdw_emissiecode_omschrijving',
    'rdw_milieuklasse_eg_goedkeuring_licht',
    'rdw_vervaldatum_tachograaf', #dd/mm/yyyy
    'rdw_ovi_bijzonderheid_tekst', 
    'rdw_ovi_carrosserie_omschrijving', 
    'rdw_ovi_inrichting_code_omschrijving', 
    'rdw_ovi_voertuigsoort', 
    'rdw_ovi_datum_aanvang_tenaamstelling', # dd-mm-yyyy
    'rdw_ovi_eerste_afgifte_nederland', # dd-mm-yyyy
    'rdw_ovi_eerste_toelatingsdatum', # dd-mm-yyyy
    'rdw_ovi_vervaldatum_apk_keuring', # dd-mm-yyyy
    'rdw_ovi_eigenaren', #x / x
    'rdw_ovi_emissieklasse_diesel', #niet v toep.
    'rdw_ovi_geexporteerd', #nee
    'rdw_ovi_gestolen', #nee
    'rdw_ovi_w_a_verzekerd', #ja, nee
    'rdw_ovi_wachten_op_keuring', #ja nee
    'rdw_ovi_handelsbenaming', 
    'rdw_ovi_merk', 
    'rdw_ovi_type', 
    'rdw_ovi_typegoedkeuring', 
    'rdw_ovi_uitvoering', 
    'rdw_ovi_variant', 
    'rdw_ovi_kleur', 
    'rdw_ovi_materiele_gevolgens', 
    'rdw_ovi_mogelijk_gevaar', 
    'rdw_ovi_omschrijving_defect', 
    'rdw_ovi_omschrijving_herstel', 
    'rdw_ovi_status_terugroepacties', 
    'rdw_ovi_plaats_chassis_omschrijving', 
    'rdw_ovi_tellerstand_oordeel', 
    'rdw_ovi_tellerstand_toelichting', 
    'rdw_ovi_tijd_aanvang_tenaamstelling', #hh:mm
    'rdw_ovi_type_gas_installatie', 
    'rdw_datum_eerste_afgifte_nederland_dt',
    'rdw_datum_eerste_toelating_dt',
    'rdw_datum_tenaamstelling_dt',
    'rdw_vervaldatum_apk_dt',
    'rdw_tellerstandoordeel', #00/05
    'rdw_code_toelichting_tellerstandoordeel',
    'rdw_tenaamstellen_mogelijk', # Ja/Nee
    'rdw_ovi_tenaamstellen_mogelijk', # Ja/Nee
    'rdw_zuinigheidsclassificatie', #A, B,..
    'rdw_europese_uitvoeringcategorie_toevoeging', #e
    'rdw_europese_voertuigcategorie_toevoeging', #e
    'rdw_ovi_sub_cat_omschrijving', #Kampeer..
    'rdw_LotType', #Personen..
    'rdw_ovi_afg_dat_kent', #dd-mm-yyyy
    'rdw_ovi_datum_gdk', #dd-mm-yyyy
    'rdw_merk_registratie_datum_dt', 
    'rdw_basisgegevens_begindatum_restant_voorraad_dt', 
    'rdw_basisgegevens_begindatum_uitvoering_dt', 
    'rdw_basisgegevens_einddatum_restant_voorraad_dt', 
    'rdw_basisgegevens_einddatum_uitvoering_dt', 
    'rdw_basisgegevens_uitvoering_registratie_datum_dt', 
    'rdw_eeg_typegoedkeuringsdatum_dt', 
    'rdw_europese_typegoedkeuring_einddatum_dt', 
    'rdw_europese_typegoedkeuring_status_datum_dt', 
    'rdw_europese_typegoedkeurings_registratie_datum_dt', 
    'rdw_keuringen_vervaldatum_keuring_dt', 
    'rdw_datum_eerste_tenaamstelling_in_nederland_dt', 
    'rdw_registratie_datum_goedkeuring_afschrijvingsmoment_bpm_dt',
    'rdw_api_gebrek_beschrijving',
    'rdw_api_gebrek_constateringen',
    'rdw_ovi_car_vtg_num_eu_omschrijving', #rdw_carrosserie_voertuig_nummer_europese_omschrijving "Gesloten opbouw (03)"
    'rdw_datum_eerste_afgifte_nederland', #rdw_datum_eerste_afgifte_nederland_dt
    'rdw_datum_eerste_toelating', #rdw_datum_eerste_toelating_dt
    'rdw_datum_tenaamstelling', #rdw_datum_tenaamstelling_dt
    'rdw_vervaldatum_apk', #rdw_vervaldatum_apk_dt
    'rdw_ovi_vermogen_q', #0.00 kW/kg
    'OpH',
    'Vin',
    'rdw_carrosserie_uitvoering_klasse_type_carrosserie_europees', 
    'rdw_ovi_opgegeven_max_snelheid', #45 km/h (Diesel)
    'rdw_ovi_voertuigomschrijving',
    'rdw_subcategorie_nederland',
    'nhtsa_TimeStamp',
    'nhtsa_activesafetysystem___tpms',
    'nhtsa_engine___coolingtype', 
    'nhtsa_engine___engineconfiguration', 
    'nhtsa_engine___enginemanufacturer', 
    'nhtsa_engine___enginemodel', 
    'nhtsa_engine___fueltypeprimary', 
    'nhtsa_engine___otherengineinfo', 
    'nhtsa_engine___valvetraindesign', 
    'nhtsa_exterior_body__bodyclass', 
    'nhtsa_general___destinationmarket',
    'nhtsa_general___make',
    'nhtsa_general___manufacturer', 
    'nhtsa_general___model',
    'nhtsa_general___plantcity',
    'nhtsa_general___plantcompanyname',
    'nhtsa_general___plantcountry',
    'nhtsa_general___vehicletype',
    'nhtsa_mechanical_drivetrain__drivetype',
    'nhtsa_mechanical_transmission__transmissionstyle', 
    'nhtsa_passivesafetysystem___otherrestraintsysteminfo', 
    'nhtsa_passivesafetysystem___seatbeltsall', 
    'nhtsa_passivesafetysystem_airbaglocation__airbagloccurtain',
    'nhtsa_passivesafetysystem_airbaglocation__airbaglocfront',
    'nhtsa_passivesafetysystem_airbaglocation__airbaglocside',
]

# Should exist in dataframe
if not OPBOD:
    assert len(set(cols).difference(car.columns)) == 0, print(f'Column "{c}" not in data')
else:
    # Debug
    print('skip', np.setdiff1d(cols, car.columns))
    cols = np.intersect1d(cols, car.columns)

car.loc[:,cols] = car.loc[:,cols].fillna('').astype(str)

In [24]:
# Replace zero price (not sold) with NaN
car.Price = car.Price.replace({0.0:np.NaN})

In [25]:
# is etree element. Need to fix this
car[car.Title.apply(type) != str]

Unnamed: 0,Source,Title,Price,Draw,Raw_text,N_images,Images,lot_counter,Note,LotNr,...,rdw_motor_uitvoering_emissie_koolwaterstof_concat,rdw_toegevoegde_objecten_merk_object_toegevoegd_concat,rdw_motor_uitvoering_brandstof_netto_max_vermogen_ondergrens_concat,rdw_brandstof_uitstoot_deeltjes_licht_concat,rdw_motor_uitvoering_brandstof_vermogen_ondergrens_concat,rdw_carrosserie_uitvoering_nummerieke_code_carrosserie_uitvoering_numeriek_europees_concat,rdw_assen_technisch_toegestane_maximum_aslast_concat,rdw_brandstof_nominaal_continu_maximumvermogen_concat,rdw_assen_wettelijk_toegestane_maximum_aslast_concat,rdw_brandstof_actie_radius_enkel_elektrisch_stad_wltp_concat
2014-10-2000,~/data/satdatsci-pdf/catalogusdrz_1425676510.xml,Kavelnummer: 2000,,False,"[type VANGUISH VOLANTE, automatic, Duits kente...",0,[],,False,2000,...,{},{},{},{},{},{},{},{},{},{}
2014-10-2001,~/data/satdatsci-pdf/catalogusdrz_1425676510.xml,Kavelnummer: 2001,16852.0,False,"[type E 220 CDI, automatic, kenteken 33-NRK-4,...",0,[],,False,2001,...,{},{},{},{},{},{},{},{},{},{}
2014-10-2002,~/data/satdatsci-pdf/catalogusdrz_1425676510.xml,Kavelnummer: 2002,21751.0,False,"[type S 350, automatic, kenteken 25-TGN-1, afg...",0,[],,False,2002,...,{},{},{},{},{},{},{},{},{},{}
2014-10-2003,~/data/satdatsci-pdf/catalogusdrz_1425676510.xml,Kavelnummer: 2003,19000.0,False,"[type E 200 CGI, cabrio, automatic, kenteken 2...",0,[],,False,2003,...,{},{},{'1_1': 135.0},{'1': 0.0017},{},{},"{'1': 1030.0, '2': 1150.0}",{},"{'1': 1030.0, '2': 1150.0}",{}
2014-10-2004,~/data/satdatsci-pdf/catalogusdrz_1425676510.xml,Kavelnummer: 2004,26778.0,False,"[type E350, automatic, Duits kenteken HH1033A,...",0,[],,False,2004,...,{},{},{},{},{},{},{},{},{},{}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-03-8301,~/data/satdatsci-pdf/catalogusdrz_1425658416.xml,Kavelnummer: 8301,1792.0,False,"[type berlingo 1.6 hdi, Brits kenteken YD10FUE...",0,[],,False,8301,...,{},{},{},{},{},{},{},{},{},{}
2015-03-8302,~/data/satdatsci-pdf/catalogusdrz_1425658416.xml,Kavelnummer: 8302,1114.0,False,"[type transit connect 200s lr van 1., kenteken...",0,[],,False,8302,...,{},{},{},{},{},{},{},{},{},{}
2015-03-8305,~/data/satdatsci-pdf/catalogusdrz_1425658416.xml,Kavelnummer: 8305,1357.0,False,"[type l 508 dg, kampeerauto, kenteken XF-61-JN...",0,[],,False,8305,...,{},{},{},{},{},{},{},{},{},{}
2015-03-8311,~/data/satdatsci-pdf/catalogusdrz_1425658416.xml,Kavelnummer: 8311,1414.0,False,"[type partner; 1.4, kenteken 38-LL-DZ, afgelez...",0,[],,False,8311,...,{},{},{},{},{},{},{},{},{},{}


show data types per column

In [26]:
if VERBOSE > 0:
    # print type per column and example values
    df_ = pd.DataFrame(columns = ['Data Type',  'Example values'], index = [car.columns.values])
    df_ = pd.concat([df_], keys=[0]).swaplevel()
    df_.index.set_names(['Column', 'Type counter'], inplace=True)
    for c in car.columns:
        cnt=-1
        for t in car[c].apply(type).unique():
            cnt+=1

            # column name, data type
            df_.loc[(c, cnt), 'Data Type'] = str(t)
            # skip large 
            if c in ['SupInfo','Raw_text','rdwinfo','Images']:
                v = '..skip..'
                df_.loc[(c, cnt), 'Example values'] = v
                continue
            if list in car[c].apply(type).unique():
                v = 'max nr of items: ' +\
                str(car[c].apply(lambda s:len(s) if list==type(s) else 0).max())
                df_.loc[(c, cnt), 'Example values'] = v
                continue
            elif dict in car[c].apply(type).unique():
                if t == dict:
                    v = 'max nr of keys: ' +\
                    str(car[c].apply(lambda s: len(s)).max())
                    df_.loc[(c, cnt), 'Example values'] = v
                    continue

            # values
            v = car[c].unique()
            if len(v) < 10:
                # print all
                df_.loc[(c, cnt), 'Example values'] = ', '.join([f'{vv}' for vv in v])
            else:
                # print first and last
                df_.loc[(c, cnt), 'Example values'] = '{} .. {}'.format(v[0],v[-1])

    
if VERBOSE > 1:
    with pd.option_context('display.max_rows', 999):
        display(df_)

if VERBOSE > 0:
    if df_.reset_index().loc[:,'Type counter'].nunique() > 1:
        print('These columns contain more than one type')

        with pd.option_context("display.max_rows", 999):
            display(
                df_.reset_index()\
                .pivot(columns='Type counter', index='Column', values='Data Type')\
                .dropna(subset=[1])\
                .sort_values(by=[0, 1])\
                .fillna('')
        )

if VERBOSE > 1:
    gb=df_.groupby('Data Type')
    for g in gb.groups:
        with pd.option_context("display.max_rows", 999):
            display(gb.get_group(g).sort_index())


These columns contain more than one type


Type counter,0,1,2,3
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rdwinfo,<class 'float'>,<class 'dict'>,<class 'NoneType'>,<class 'list'>
nhtsa_engine___displacementcc,<class 'float'>,<class 'str'>,,
nhtsa_engine___displacementci,<class 'float'>,<class 'str'>,,
nhtsa_engine___displacementl,<class 'float'>,<class 'str'>,,
nhtsa_engine___enginehp,<class 'float'>,<class 'str'>,,
nhtsa_engine___enginekw,<class 'float'>,<class 'str'>,,
Title,<class 'lxml.etree._ElementUnicodeResult'>,<class 'str'>,,


## Dummies to category (reverse one-hot-encode)

Categorize info from auction by converting from dummies to categories. Convert different boolean fields into one field with string.

In [27]:
# fuel
Fuels = ['lpg','benzine','diesel','hybrid']

# new field
car["fuel_drz"] = str()
# loop over columns
for fuel in np.intersect1d(Fuels,list(car.columns)): # warning: This is case sensitive
    car.loc[car.loc[:,fuel] == True,"fuel_drz"] = fuel


## Date and time operations

Age of car, APK etc.

In [28]:
# Date of auction based on index name
car["now"] = [pd.to_datetime(re.search('([0-9]{4}-[0-9]+)-.*',i)[1],format='%Y-%m') for i in car.index.values]

# APK / Manufacture date / Year
car['apk_drz'] = car.APKdate.apply(lambda t: pd.to_datetime(t,format='%d.%m.%Y') if len(t) != 0 else np.NaN)
car['MF_full_ser'] = car.Mfdate.apply(lambda t: pd.to_datetime(t,format='%d.%m.%Y') if len(t) != 0 else np.NaN)
car['MF_year_ser'] = car.Mfyear.apply(lambda t: pd.to_datetime(t,format='%Y') if t>0 else np.NaN)

# Choose MF year or full date
car["MF_drz"] = car.MF_full_ser.copy()
chooseShort = car.MF_full_ser.isna() & car.MF_year_ser.notna()
car.loc[chooseShort,"MF_drz"] = car.loc[chooseShort,"MF_year_ser"]


- - - -
# Add rdw info into dataframe

### Age of query

Older queries might not have accurate information that was current at auction (e.g. inspection date)

In [29]:
# get time stamp from added columns (new format since Apr 2019)
car['rdw_ser'] = car.rdw_TimeStamp_x.apply(lambda x:pd.to_datetime(x, format='%Y%m%d') if len(x)==8 else np.NaN)

# get time stamp from dict
if 'rdwinfo' in car.columns:
    sel = car['rdw_ser'].isna()
    car.loc[sel,'rdw_ser'] = car.loc[sel,'rdwinfo'].apply(lambda x:pd.to_datetime(x['TimeStamp'],format='%Y%m%d') if type(x)==dict else np.NaN)

In [None]:
if VERBOSE > 0:
    # plot query age
    sel = car['rdw_ser'].isna()
    car['RDW_age'] = np.NaN
    car.loc[~sel, "RDW_age"] = (car.loc[~sel,'rdw_ser'] - car.now).apply(lambda x:x.days)
    car["RDW_age"].plot(marker=',', figsize=[16,2])
    plt.xlabel('lot')
    plt.ylabel('age (days)')
    plt.title('Age of rdw query since auction')

In [31]:
# get time stamp from added columns
car['nhtsa_ser'] = car.nhtsa_TimeStamp.apply(lambda x:pd.to_datetime(x, format='%Y%m%d') if len(x)==8 else np.NaN)

In [None]:
if VERBOSE > 0:
    # plot query age
    sel = car['nhtsa_ser'].isna()
    car['NHTSA_age'] = np.NaN
    car.loc[~sel, "NHTSA_age"] = (car.loc[~sel,'nhtsa_ser'] - car.now).apply(lambda x:x.days)
    car["NHTSA_age"].plot(marker=',', figsize=[16,2])
    plt.xlabel('lot')
    plt.ylabel('age (days)')
    plt.title('Age of nhtsa query since auction')

In [None]:
if VERBOSE > 0:
    # Nr of rdw entries
    df_ = pd.DataFrame(index=car.index, columns=['is_old_style','old_style_rdw', 'rdw', 'nhtsa'], data=np.NaN)
    df_.loc[:, 'is_old_style'] = False
    if 'rdwinfo' in car.columns:
        df_.is_old_style  = car.rdwinfo.apply(lambda x: isinstance(x, dict))
        df_.loc[df_.is_old_style, 'old_style_rdw'] = car.loc[df_.is_old_style, 'rdwinfo'].apply(len)
    else:
        df_.number_of_rdw_entries = 0

    rdw_cols = car.columns.str.startswith('rdw_')
    nhtsa_cols = car.columns.str.startswith('nhtsa_')
    sel = ~df_.is_old_style
    df_.loc[sel, 'rdw'] = car.loc[sel, rdw_cols].notna().sum(axis='columns')
    df_.loc[sel, 'nhtsa'] = car.loc[sel, nhtsa_cols].notna().sum(axis='columns')
    df_.drop(columns=['is_old_style'], inplace=True)

    f=plt.figure(figsize=[16,4])
    ax=f.gca()
    df_.plot(marker='o', linestyle='', secondary_y = False, ax = ax)
    (df_/df_.max(axis=0)).plot(marker='.', linestyle='', secondary_y = True , ax = ax)
    ax.set_xlabel('lot')
    ax.set_title('Number of fields')
    ax.set_ylabel('fraction of max entries')
    ax.legend(title='number of entries')
    print('some old-style (dict) rdw info is retrofitted. In 2021-01 ovi data may have been added.')


### Rename RDW info
`<field name>_rdw(ovi)` indicates that RDW field might already exist in the auction info. The suffix `_rdw` prevents overwriting.


In [34]:
# rename rdw fields
Map = {
    "rdw_aantal_cilinders": "nCyl_rdw",
    "rdw_cilinderinhoud": "cylvol_rdw",
    "rdw_aantal_zitplaatsen": "nSeat_rdw",
    "rdw_basisgegevens_aantal_zitplaatsen_bovengrens": "nSeat2_rdw", 
    "rdw_aantal_deuren" : "nDoor_rdw",
    "rdw_basisgegevens_aantal_deuren_bovengrens": "nDoor2_rdw",
    "rdw_breedte": "width_rdw",
    "rdw_lengte": "length_rdw",
    "rdw_wielbasis": "wheelbase_rdw",
    "rdw_massa_ledig_voertuig":"weight_rdw",
    "rdw_basisgegevens_breedte_voertuig_uitvoering_bovengrens": "width2_rdw",
    "rdw_basisgegevens_lengte_voertuig_uitvoering_bovengrens": "length2_rdw",
    "rdw_basisgegevens_wielbais_bovengrens": "wheelbase2_rdw",
    "rdw_basisgegevens_massa_leeg_voertuig_bovengrens": "weight2_rdw",
    "rdw_basisgegevens_hoogte_voertuig_uitvoering_bovengrens": "height_rdw", 
    "rdw_bruto_bpm" : "bpm_rdw",
    "rdw_catalogusprijs": "newprice_rdw",
    "rdw_datum_tenaamstelling": "reglast_str_rdw",
    "rdw_datum_eerste_afgifte_nederland": "regnl_str_rdw",
    "rdw_datum_eerste_toelating": "regfirst_str_rdw",
    "rdw_vervaldatum_apk":"apk_str_rdw",
    "rdw_inrichting":"body2_rdw",
    "rdw_eerste_kleur":"color_rdw",
    "rdw_merk":"brand_rdw",
    "rdw_handelsbenaming":"model_rdw",
    "rdw_taxi_indicator":"taxi_rdw",
    "rdw_basisgegevens_kant_van_het_stuur":"rhd_rdw",
    "rdw_basisgegevens_max_constructie_snelheid_bovengrens": "maxspeed_rdw",
    "rdw_ovi_private_owners": "private_owners_rdwovi",
    "rdw_ovi_company_owner": "company_owners_rdwovi",
    "rdw_ovi_under_survey": "under_survey_rdwovi", #WOK rdw_ovi_wachten_op_keuring parsed
    "rdw_ovi_merk":"brand_rdwovi",
    "rdw_ovi_handelsbenaming":"model_rdwovi",
    "rdw_ovi_aantal_cilinders": "nCyl_rdwovi",
    "rdw_ovi_aantal_deuren": "nDoor_rdwovi",
    "rdw_ovi_aantal_zitplaatsen": "nSeat_rdwovi",
    "rdw_ovi_bpm_bedrag": "bpm_rdwovi",
    "rdw_ovi_breedte": "width_rdwovi",
    "rdw_ovi_catalogus_prijs": "newprice_rdwovi",
    "rdw_ovi_cilinder_inhoud": "cylvol_rdwovi",
    "rdw_ovi_lengte": "length_rdwovi",
    "rdw_ovi_massa_ledig_voertuig": "weight_rdwovi",
    "rdw_ovi_eerste_toelatingsdatum": "regfirst_str_rdwovi",
    "rdw_nettomaximumvermogen": "power2_rdw",
    "rdw_ovi_maximum_constructie_snelheid": "maxspeed_rdwovi",
    "rdw_zuinigheidslabel": "energylab2_rdw",
    "rdw_wacht_op_keuren": "under_survey_rdw", # not provided in open data!
    "rdw_ovi_kleur": "color_rdwovi",
    "rdw_zuinigheidsclassificatie": "energylab3_rdw",

}

# Should all exist in dataframe
if not OPBOD:
    assert len(np.setdiff1d(list(Map.keys()), car.columns)) == 0, print(f'Columns not in data')
else:
    # Debug
    print('skip', np.setdiff1d(list(Map.keys()), car.columns))

car.rename(columns=Map, inplace=True)


In [35]:
# aggregate rdw fields with index numbers stored in dicts
#   this can be mean, string join, first or whatever
car['power_rdw'] = car.rdw_brandstof_nettomaximumvermogen_concat.apply(
    lambda x: np.mean(list(x.values())) if len(x) > 0 else np.NaN)

car["milage_rdw"] = car.rdw_brandstof_brandstofverbruik_gecombineerd_concat.apply(
    lambda x: np.mean(list(x.values())) if len(x) > 0 else np.NaN)

car['fuel_rdw'] = car.rdw_brandstof_brandstof_omschrijving_concat.apply(
    lambda x:'/'.join(x.values()) if len(x) > 0 else '')

car['body1_rdw'] = car.rdw_carrosserie_type_carrosserie_europese_omschrijving_concat.apply(
    lambda x: list(x.values())[0] if len(x) > 0 else '')

car['nCyl1_rdw'] = car.rdw_motor_uitvoering_aantal_cilinders_concat.apply(
    lambda x: list(x.values())[0] if len(x) > 0 else np.NaN)

car['cylvol1_rdw'] = car.rdw_motor_uitvoering_cilinderinhoud_cm3_concat.apply(
    lambda x: list(x.values())[0] if len(x) > 0 else np.NaN)

car['automatic_rdw'] = car.rdw_versnellingsbak_uitvoering_type_versnellingsbak_concat.apply(
    lambda x: list(x.values())[0] if len(x) > 0 else np.NaN)

car['energylab_rdw'] = car.rdw_uitvoeringverbruik_per_uitgave_verbruikcategorie_uitvoering_concat.apply(
    lambda x: x[max(x.keys())] if len(x) > 0 else '')

car['hybrid_rdw'] = car.rdw_motor_uitvoering_hybride_elektrisch_voertuig_concat.apply(
    lambda x: list(x.values())[0] if len(x) > 0 else np.NaN)

car['nGear_rdw'] = car.rdw_versnellingsbak_uitvoering_aantal_versnellingen_bovengrens_concat.apply(
    lambda x: x[max(x.keys())] if len(x) > 0 else np.NaN)

# Add LPG specification to fuel
is_lpg = car.fuel_rdw.str.lower().str.contains('lpg')
car.loc[is_lpg, 'fuel_rdw'] = car.loc[is_lpg, ['fuel_rdw', 'rdw_type_gasinstallatie']].apply('/'.join, axis='columns')

if VERBOSE > 1:
    display(car.loc[car['fuel_rdw'].drop_duplicates().index, ['fuel_rdw', 'rdw_brandstof_brandstof_omschrijving_concat', 'rdw_type_gasinstallatie']])
    


In [36]:
# add old style dict entries from rdw as new columns
map_old2new = {
    "aantal_cilinders": "nCyl_rdw",
    "cilinderinhoud": "cylvol_rdw",
    "nettomaximumvermogen": "power_rdw",
    "brandstofverbruik_gecombineerd":"milage_rdw",
    "aantal_zitplaatsen": "nSeat_rdw",
    "aantal_deuren" : "nDoor_rdw",
    "breedte": "width_rdw",
    "lengte": "length_rdw",
    "wielbasis": "wheelbase_rdw",
    "massa_ledig_voertuig":"weight_rdw",
    "bruto_bpm" : "bpm_rdw",
    "catalogusprijs": "newprice_rdw",
    "datum_tenaamstelling": "reglast_str_rdw",
    "datum_eerste_afgifte_nederland": "regnl_str_rdw",
    "datum_eerste_toelating": "regfirst_str_rdw",
    "vervaldatum_apk":"apk_str_rdw",
    "type_carrosserie_europese_omschrijving":"body1_rdw",
    "inrichting":"body2_rdw",
    "brandstof_omschrijving":"fuel_rdw",
    "eerste_kleur":"color_rdw",
    "merk":"brand_rdw",
    "handelsbenaming":"model_rdw",
    "taxi_indicator":"taxi_rdw",
}

for rdw_key, new_column in map_old2new.items():
    if 'rdwinfo' not in car.columns:
        print('skip old2new')
        break
    is_old_style = car.rdwinfo.apply(lambda row: rdw_key in row.keys() if isinstance(row, dict) else False)
    car.rdwinfo[is_old_style].apply(lambda x: x[rdw_key][0]) # Take first. Rdw named first key conveniently '0'
    values = car.rdwinfo[is_old_style].apply(lambda x: list(x[rdw_key].values()))

    if VERBOSE > 0:
        display({'text/plain': f'{rdw_key:40s}-> {new_column}'}, raw=True)
        
    if values.apply(len).max() == 1:
        # Column should exist. Hence insert with mask
        car.loc[is_old_style, new_column] = values.apply(lambda x: x[0])

    elif new_column == 'fuel_rdw':
        order = car.rdwinfo[is_old_style].apply(lambda x: list(x['brandstof_volgnummer'].values()))
        ordered_values = [[v[i-1] for i in o] for o, v in zip(order, values)]
        concat_values = [*map('/'.join, ordered_values)]
        tank_type = car.rdwinfo[is_old_style].apply(lambda x: list(x['type_gasinstallatie'].values()) if "type_gasinstallatie" in x else []) # will include CNG
        fuel = pd.Series({i: f + '/' + '/'.join(tank_type.loc[i]) if len(tank_type.loc[i]) > 0 else f for f, i in zip(concat_values, tank_type.index)})
        car.loc[is_old_style, new_column] = fuel

    if VERBOSE > 1:
        try:
            df_ = pd.DataFrame(car.loc[is_old_style, new_column].value_counts().sort_index())
        except TypeError:
            df_ = pd.DataFrame(car.loc[is_old_style, new_column].astype(str).value_counts().sort_index())
        display(df_)




aantal_cilinders                        -> nCyl_rdw

cilinderinhoud                          -> cylvol_rdw

nettomaximumvermogen                    -> power_rdw

brandstofverbruik_gecombineerd          -> milage_rdw

aantal_zitplaatsen                      -> nSeat_rdw

aantal_deuren                           -> nDoor_rdw

breedte                                 -> width_rdw

lengte                                  -> length_rdw

wielbasis                               -> wheelbase_rdw

massa_ledig_voertuig                    -> weight_rdw

bruto_bpm                               -> bpm_rdw

catalogusprijs                          -> newprice_rdw

datum_tenaamstelling                    -> reglast_str_rdw

datum_eerste_afgifte_nederland          -> regnl_str_rdw

datum_eerste_toelating                  -> regfirst_str_rdw

vervaldatum_apk                         -> apk_str_rdw

type_carrosserie_europese_omschrijving  -> body1_rdw

inrichting                              -> body2_rdw

brandstof_omschrijving                  -> fuel_rdw

eerste_kleur                            -> color_rdw

merk                                    -> brand_rdw

handelsbenaming                         -> model_rdw

taxi_indicator                          -> taxi_rdw

## preprocessing of rdw info

In [37]:
map_str2date = {
    "regfirst_str_rdw":"regfirst_rdw",
    "regnl_str_rdw":"regnl_rdw",
    "reglast_str_rdw":"reglast_rdw",
    "apk_str_rdw":"apk_rdw",
}

if VERBOSE > 1:
    gb_ = car[[*map_str2date.keys()]].groupby([*map(lambda x: '{0}-{1:>2s}'.format(*x.split('-')[0:2]), car.index)])
    df_ = pd.concat([
        gb_.first(),
        gb_.first().applymap(type)
    ], axis=1)
    display(df_)

# lookup table when format changed
date_format = pd.DataFrame(
    columns = ['start', 'end', 'pattern'], 
    index = ['pat1', 'pat2', 'pat3'], 
    data = [
        ['1769-1-1', '2019-03-01', '%d/%m/%Y'], #25/11/2008
        #['2019-03-01', '2019-04-01', [lambda x: f'{x:.0f}', '%Y%m%d']],#20021112 convert to string first
        ['2019-03-01', '2019-04-01', '%Y%m%d'], #20140818
        #['2019-04-01', '2100-01-01', [lambda x: f'{x:.0f}', '%Y%m%d']] #2.00902e+07
        ['2019-04-01', '2100-01-01', '%Y%m%d.0'] #2.00902e+07
    ])

query_date = car.rdw_ser.fillna(pd.NaT)

# Loop over columns that need transforming
for rdw_key, new_column in map_str2date.items():

    car[new_column] = pd.NaT

    if VERBOSE > 0:
        print(f'{rdw_key:10s} -> {new_column}')

    # Do per formatting version
    for date_fmt in date_format.iterrows():
        if VERBOSE > 1:
            print(f'{date_fmt[0]}: ')
            for k,v in date_fmt[1].items():
                print(f'\t{k}: {v}')

        # Select rows with formatting
        sel = (query_date >= date_fmt[1].start) & (query_date < date_fmt[1].end)
        old_fmt = car.loc[sel, rdw_key]
        # remove empty string
        old_fmt.drop(old_fmt.index[old_fmt == ''], inplace = True)
        
        if VERBOSE > 1:
            print('Query dates (check with start, end):')
            print(f'\t{query_date[sel].min().strftime("%Y-%m-%d")} .. {query_date[sel].max().strftime("%Y-%m-%d")}')
            print(f'Format in {rdw_key} (check with pattern):')
            print(f'\t{old_fmt[sel][0]} .. {old_fmt[sel][-1]}')

        # Convert to datetime. Optionally apply modifier if list.
        if isinstance(date_fmt[1].pattern, list):
            for fun in date_fmt[1].pattern[:-1]:
                old_fmt = old_fmt.apply(fun)
            date_pat = date_fmt[1].pattern[-1]
        else:
            date_pat = date_fmt[1].pattern

        # Update dataframe
        new_fmt = old_fmt.apply(lambda d: pd.to_datetime(d, format=date_pat))
        new_fmt.name = new_column    
        car[new_column].update(new_fmt)
        if VERBOSE > 1:
            print(f'{car[new_column].dropna().shape[0]} of {car.shape[0]} are modified after applying {date_fmt[0]}')

# remove string values (old columns)
car.drop(columns=map_str2date.keys(), inplace = True)

# Ovi data has format 0d-0m-yyyy
map_str2date = {
    "regfirst_str_rdwovi":"regfirst_rdwovi",
    'rdw_ovi_afg_dat_kent': 'afgiftekenteken_rdwovi',
    'rdw_ovi_datum_gdk': 'goedkeuring_rdwovi',
    
}

for rdw_key, new_column in map_str2date.items():

    car[new_column] = pd.NaT

    if VERBOSE > 0:
        print(f'{rdw_key:10s} -> {new_column}')

    car[new_column].update(
        car[rdw_key].replace('', np.NaN).dropna()\
        .apply(lambda d: pd.to_datetime(d, format="%d-%m-%Y"))
    )
    
# drop old columns
car.drop(columns=map_str2date.keys(), inplace = True)


# Dates have format yyyy-0m-0dThh:mm:ss.000 0d-0m-yyyy
map_str2date = {
    'rdw_datum_eerste_toelating_dt': 'regfirst_2_rdw',
    'rdw_datum_eerste_afgifte_nederland_dt': 'regnl_2_rdw',
    'rdw_datum_tenaamstelling_dt': 'reglast_2_rdw',
    'rdw_vervaldatum_apk_dt': 'apk_2_rdw',
    'rdw_datum_eerste_tenaamstelling_in_nederland_dt': 'regnl_3_rdw', # name change from 'regnl_2_rdw' 
                                                                      # https://groups.google.com/d/msgid/voertuigen-open-data/9438d67a-b3ff-42d3-a6be-f7a370611e0en%40googlegroups.com?utm_medium=email&utm_source=footer
    
}


for rdw_key, new_column in map_str2date.items():

    car[new_column] = pd.NaT

    if VERBOSE > 0:
        print(f'{rdw_key:10s} -> {new_column}')

    car[new_column].update(
        car[rdw_key].replace('', np.NaN).dropna()\
        .apply(lambda d: pd.to_datetime(d, format="%Y-%m-%dT%H:%M:%S"))
    )
    
# drop old columns
car.drop(columns=map_str2date.keys(), inplace = True)

regfirst_str_rdw -> regfirst_rdw
regnl_str_rdw -> regnl_rdw
reglast_str_rdw -> reglast_rdw
apk_str_rdw -> apk_rdw
regfirst_str_rdwovi -> regfirst_rdwovi
rdw_ovi_afg_dat_kent -> afgiftekenteken_rdwovi
rdw_ovi_datum_gdk -> goedkeuring_rdwovi
rdw_datum_eerste_toelating_dt -> regfirst_2_rdw
rdw_datum_eerste_afgifte_nederland_dt -> regnl_2_rdw
rdw_datum_tenaamstelling_dt -> reglast_2_rdw
rdw_vervaldatum_apk_dt -> apk_2_rdw
rdw_datum_eerste_tenaamstelling_in_nederland_dt -> regnl_3_rdw


In [38]:
# Add age if column contains date
age = car[map_str2date.values()].apply(lambda c: car.now - c).add_suffix('_age')
if all(age.columns.isin(car.columns)):
    car.update(age)
else:
    car = pd.concat([car, age], axis=1)


In [39]:
# same data type
# sometimes type is numerical
car.model_rdw = car.model_rdw.astype(str)
car.brand_rdw = car.brand_rdw.astype(str)


In [40]:
# lower case
cols = ['body1_rdw', 'body2_rdw', 'fuel_rdw', 'model_rdw', 'model_rdwovi']
car[cols] = car[cols].apply(lambda x: x.str.lower())

In [41]:
# strip brand from model
def remove_brand(model,brand):
    '''remove trailing brand name from model name'''
    
    import re
    
    if type(model) == str:
        model = re.sub('(?i)^' + brand + '[ ,;]*','',model)
    return model 

# special cases
selJag = car.brand_rdw == 'JAGUAR CARS'
selCit = car.brand_rdw == 'CITROEN' 

# remove brand name
for col in ['model_drz', 'model_rdw', 'model_rdwovi']:
    car[col] = car.apply(lambda row:remove_brand(row[col], row.brand_rdw),axis='columns')

    car.loc[selJag,col] = car.loc[selJag,:].apply(lambda row:remove_brand(row[col],'JAGUAR'), axis='columns')
    car.loc[selCit,col] = car.loc[selCit,:].apply(lambda row:remove_brand(row[col],'CITROËN'), axis='columns')
    car.loc[selCit,col] = car.loc[selCit,:].apply(lambda row:remove_brand(row[col],'citroÃ«n'), axis='columns')


In [42]:
# category to boolean
car.rhd_rdw = car.rhd_rdw.replace({'R': True, 'L': False})
car.automatic_rdw = car.automatic_rdw.replace({'A': True, 'H': False, 'C': True})
car.hybrid_rdw = car.hybrid_rdw.replace({'J': True, 'N': False})
car.taxi_rdw = car.taxi_rdw.replace({'Ja': True, 'Nee': False})
#rdw_hefas 

In [43]:
# consistent with other fields

# cm to mm
car.wheelbase_rdw = car.wheelbase_rdw * 10.0 
car.length_rdw = car.length_rdw * 10.0
car.width_rdw = car.width_rdw * 10.0

car.length_rdwovi = car.length_rdwovi.apply(lambda x: x*10 if x!=-1 else x)
car.width_rdwovi = car.width_rdwovi.apply(lambda x: x*10 if x!=-1 else x)

In [44]:
# Numerical fields
# weight, width, length, height, etc
for fld in [
    'length_rdw',
    'width_rdw',
    'nDoor_rdw',
    'nDoor2_rdw',
    'length_rdwovi',
    'width_rdwovi',
    
]:
    if car[fld].dtype == 'int':
        car[fld] = car[fld].replace({0: -1})
        if VERBOSE > 0:
            print(f'replace 0 with -1 in {fld}')
    elif car[fld].dtype == 'float':
        car[fld] = car[fld].replace({0.0: np.NaN})
        if VERBOSE > 0:
            print(f'replace 0.0 with NaN in {fld}')
    else:
        print(fld, car[fld].dtype)
        raise TypeError

        
if VERBOSE > 1:
    print('These have zeros that possibly need replacing too')
    [print(f'\t{c}') for c in car.columns if (car[c]==0).any()];


replace 0.0 with NaN in length_rdw
replace 0.0 with NaN in width_rdw
replace 0 with -1 in nDoor_rdw
replace 0 with -1 in nDoor2_rdw
replace 0 with -1 in length_rdwovi
replace 0 with -1 in width_rdwovi


### Use auction info or RDW info

In [45]:
sorted(car.columns[car.columns.str.startswith('nhtsa')].to_list())

['nhtsa_TimeStamp',
 'nhtsa_activesafetysystem_911notification__can_aacn',
 'nhtsa_activesafetysystem___abs',
 'nhtsa_activesafetysystem___activesafetysysnote',
 'nhtsa_activesafetysystem___automaticpedestrianalertingsound',
 'nhtsa_activesafetysystem___autoreversesystem',
 'nhtsa_activesafetysystem___edr',
 'nhtsa_activesafetysystem___esc',
 'nhtsa_activesafetysystem___keylessignition',
 'nhtsa_activesafetysystem___saeautomationlevel',
 'nhtsa_activesafetysystem___saeautomationlevel_to',
 'nhtsa_activesafetysystem___tpms',
 'nhtsa_activesafetysystem___tractioncontrol',
 'nhtsa_activesafetysystem_backingupandparking__parkassist',
 'nhtsa_activesafetysystem_backingupandparking__rearautomaticemergencybraking',
 'nhtsa_activesafetysystem_backingupandparking__rearcrosstrafficalert',
 'nhtsa_activesafetysystem_backingupandparking__rearvisibilitysystem',
 'nhtsa_activesafetysystem_forwardcollisionprevention__cib',
 'nhtsa_activesafetysystem_forwardcollisionprevention__dynamicbrakesupport',
 

In [46]:
fldpairs = [
    ['taxi_rdw','taxi_drz','taxi'],
    ['regfirst_rdw','regfirst_rdwovi','MF_drz','MF'],
    ['apk_drz','apk_rdw','apk'],
    ['body1_rdw','body2_rdw','nhtsa_exterior_body__bodyclass', 'bodytype'],
    ['fuel_rdw','fuel_drz','nhtsa_engine___fueltypeprimary', 'fuel'],
    ['brand_drz','brand_rdw', 'brand_rdwovi', 'nhtsa_general___make', 'brand'],
    ['model_drz','model_rdw', 'model_rdwovi', 'nhtsa_general___model', 'model'],
    ['nDoor2_rdw', 'nDoor_rdw', 'nDoor_rdwovi', 'nhtsa_exterior_body__doors', 'nDoor'],
    ['nSeat2_rdw', 'nSeat_rdw', 'nSeat_rdwovi', 'nhtsa_interior_seat__seats', 'nSeat'],
    ['nCyl1_rdw', 'nCyl_rdw', 'nCyl_rdwovi', 'nhtsa_engine___enginecylinders', 'nCyl'],
    ['cylvol1_rdw', 'cylvol_rdw', 'cylvol_rdwovi', 'nhtsa_engine___displacementcc', 'cylvol'],
    ['rhd_drz', 'rhd_rdw', 'rhd'],
    ['wheelbase2_rdw', 'wheelbase_rdw','nhtsa_exterior_dimension__wheelbaselong', 'wheelbase'],
    ['width2_rdw', 'width_rdw', 'width_rdwovi', 'width'],
    ['length2_rdw', 'length_rdw', 'length_rdwovi', 'length'],
    ['weight2_rdw', 'weight_rdw', 'weight_rdwovi', 'nhtsa_exterior_dimension__gcwr', 'weight'],
    ['automatic_drz', 'automatic_rdw', 'nhtsa_mechanical_transmission__transmissionstyle', 'automatic'],
    ['hybrid_rdw', 'hybrid_drz', 'nhtsa_engine___electrificationlevel', 'hybrid'],
    ['bpm_rdw', 'bpm_rdwovi', 'bpm'],
    ['newprice_rdw', 'newprice_rdwovi', 'nhtsa_general___baseprice', 'newprice'],
    ['power_rdw', 'power2_rdw', 'nhtsa_engine___enginehp', 'nhtsa_engine___enginehp_to','power'],
    ['maxspeed_rdw', 'maxspeed_rdwovi', 'nhtsa_engine___topspeedmph', 'maxspeed'],
    ['height_rdw', 'height_rdw', 'height'], # mock compare to trim suffix
    ['nGear_rdw', 'nGear_rdw', 'nhtsa_mechanical_transmission__transmissionspeeds', 'nGear'], # i.d.
    ['private_owners_rdwovi', 'private_owners_rdwovi', 'private_owners'], # i.d.
    ['company_owners_rdwovi', 'company_owners_rdwovi', 'company_owners'], # i.d.    
    ['modelspec_drz','modelspec_drz', 'modelspec'], #i.d.    
    ['energylab3_rdw', 'energylab_rdw', 'energylab2_rdw', 'energylab'],
    ['under_survey_rdwovi', 'under_survey_rdw', 'under_survey'],
    ['color_rdw', 'color_rdwovi', 'color'],
    ['regnl_rdw', 'regnl_2_rdw', 'regnl_3_rdw', 'regnl'],
]


for flds in fldpairs:

    # add result field if not exist
    if OPBOD:
        not_exist = [f for f in flds if f not in car.columns]
        df_ = car.loc[:, np.setdiff1d(flds,not_exist)]
        df_[not_exist] = np.NaN

    else:
        if flds[-1] not in car.columns:
            df_ = car.loc[:, flds[:-1]]
            df_[flds[-1]] = np.NaN
        else:
            df_ = car.loc[:, flds]

    # replace placeholders with NaN
    df_.replace({
        'niet geregistreerd': np.NaN, 
        '': np.NaN,
        -1: np.NaN,
        'nan': np.NaN,
    }, inplace=True)

    # branch off df for display
    df_disp = df_.copy()
    df_disp.fillna('.', inplace=True)

    # select rows that are not all (but one) NaN
    notallna = df_.iloc[:,:-1].notna().sum(axis=1) > 1
    
    # start filling values from left to right
    df_ = df_.bfill(axis=1)
    
    # Last value is result
    choice = df_.iloc[:,0]
    choice.name = f'>{flds[-1]}<'
    car[flds[-1]] = choice # <- UPDATE DATA
    
    # display differences
    
    if VERBOSE > 0:
        # accented are same
        if flds[-1] == 'brand':
            df_.replace({'CITROËN': 'CITROEN'}, inplace=True)
        # add extra info
        if flds[-1] == 'model':
            df_disp = pd.concat([
                car.loc[: ,['brand_rdw', 'brand']].ffill(axis=1).iloc[:,-1], 
                car.modelspec_drz,
                df_disp], axis=1)

        df_disp = pd.concat([df_disp, choice], axis=1)

        nuq = df_.iloc[:,:-1].nunique(axis=1, dropna=True)
        isdiff = (nuq > 1) & notallna


        if all(~isdiff):
            display({'text/html': 
                     f'<b>{flds[-1]}</b>: same in all auctions <b>{", ".join(flds[:-1])}</b>'}, raw=True)
        else:
            if VERBOSE > 1:
                display(df_disp[isdiff])
            else:
                # only current auction
                islast = df_disp.index.str.startswith('-'.join(df_disp.index[-1].split('-')[:-1]))
                if all(~(isdiff & islast)):
                    display({'text/html': 
                             f'<b>{flds[-1]}</b>: same in last auctions <b>{", ".join(flds[:-1])}</b>'}, raw=True)
                else:
                    display(df_disp[isdiff & islast])


Unnamed: 0,body1_rdw,body2_rdw,nhtsa_exterior_body__bodyclass,bodytype,>bodytype<
2022-11-702711,vrachtwagen,open wagen,.,.,vrachtwagen
2022-11-700631,stationwagen,stationwagen,Wagon,.,stationwagen
2022-11-701531,hatchback,hatchback,Hatchback/Liftback/Notchback,.,hatchback


Unnamed: 0,fuel_rdw,fuel_drz,nhtsa_engine___fueltypeprimary,fuel,>fuel<
2022-11-702411,benzine/elektriciteit,benzine,.,.,benzine/elektriciteit
2022-11-702711,lpg/benzine/g2 gasinstallatie,lpg,.,.,lpg/benzine/g2 gasinstallatie
2022-11-704411,benzine/elektriciteit,benzine,.,.,benzine/elektriciteit
2022-11-701531,benzine,benzine,Gasoline,.,benzine
2022-11-701931,diesel,diesel,Gasoline,.,diesel
2022-11-702531,benzine,benzine,Gasoline,.,benzine


Unnamed: 0,brand_drz,brand_rdw,brand_rdwovi,nhtsa_general___make,brand,>brand<
2022-11-701911,JAGUAR,JAGUAR CARS,JAGUAR CARS,.,.,JAGUAR


Unnamed: 0,brand,modelspec_drz,model_drz,model_rdw,model_rdwovi,nhtsa_general___model,model,>model<
2022-11-710011,RENAULT,,scenic,megane scenic,megane scenic,.,.,scenic
2022-11-700631,VOLVO,,v70,v70,v70,V70,.,v70
2022-11-701531,VOLKSWAGEN,,golf,golf,golf,GTI,.,golf
2022-11-702531,PEUGEOT,,208,208,208,505,.,208


Unnamed: 0,nDoor2_rdw,nDoor_rdw,nDoor_rdwovi,nhtsa_exterior_body__doors,nDoor,>nDoor<
2022-11-700011,5.0,4.0,.,.,.,5.0
2022-11-704611,5.0,3.0,.,.,.,5.0


Unnamed: 0,nCyl1_rdw,nCyl_rdw,nCyl_rdwovi,nhtsa_engine___enginecylinders,nCyl,>nCyl<
2022-11-702531,3.0,3.0,3.0,4.0,.,3.0


Unnamed: 0,cylvol1_rdw,cylvol_rdw,cylvol_rdwovi,nhtsa_engine___displacementcc,cylvol,>cylvol<
2022-11-701531,.,1984.0,1984.0,1984.0,.,1984.0
2022-11-701931,1398.0,1398.0,1398.0,1905.0,.,1398.0
2022-11-702531,999.0,999.0,999.0,1905.0,.,999.0


Unnamed: 0,wheelbase2_rdw,wheelbase_rdw,nhtsa_exterior_dimension__wheelbaselong,wheelbase,>wheelbase<
2022-11-260111,2942.0,2940.0,.,.,2942.0
2022-11-260211,3075.0,3080.0,.,.,3075.0
2022-11-700011,2595.0,2600.0,.,.,2595.0
2022-11-700311,2703.0,2700.0,.,.,2703.0
2022-11-700511,2407.0,2410.0,.,.,2407.0
...,...,...,...,...,...
2022-11-702931,2575.0,2580.0,.,.,2575.0
2022-11-703331,3165.0,3170.0,.,.,3165.0
2022-11-704631,2575.0,2580.0,.,.,2575.0
2022-11-704931,2928.0,2930.0,.,.,2928.0


Unnamed: 0,width2_rdw,width_rdw,width_rdwovi,width,>width<
2022-11-260311,1864.0,1860.0,.,.,1864.0
2022-11-700011,1729.0,1730.0,.,.,1729.0
2022-11-701111,2018.0,1970.0,.,.,2018.0
2022-11-703111,1777.0,1780.0,.,.,1777.0
2022-11-703311,1793.0,1790.0,.,.,1793.0
2022-11-704011,1865.0,1870.0,.,.,1865.0
2022-11-704611,1682.0,1680.0,.,.,1682.0
2022-11-704911,1777.0,1780.0,.,.,1777.0
2022-11-705411,1985.0,1990.0,.,.,1985.0
2022-11-707511,1753.0,1750.0,.,.,1753.0


Unnamed: 0,length2_rdw,length_rdw,length_rdwovi,length,>length<
2022-11-260111,4881.0,4880.0,.,.,4881.0
2022-11-260311,4324.0,4320.0,.,.,4324.0
2022-11-700011,4157.0,4160.0,.,.,4157.0
2022-11-700311,4661.0,4660.0,.,.,4661.0
2022-11-700711,4052.0,4050.0,.,.,4052.0
2022-11-700811,3962.0,3960.0,.,.,3962.0
2022-11-701111,5167.0,5050.0,.,.,5167.0
2022-11-701911,4861.0,4860.0,.,.,4861.0
2022-11-702111,3921.0,3920.0,.,.,3921.0
2022-11-702311,3415.0,3420.0,.,.,3415.0


Unnamed: 0,automatic_drz,automatic_rdw,nhtsa_mechanical_transmission__transmissionstyle,automatic,>automatic<
2022-11-700631,False,True,.,.,False
2022-11-701531,True,.,Automatic,.,True
2022-11-702531,False,False,Automatic,.,False
2022-11-704931,True,F,.,.,True


Unnamed: 0,hybrid_rdw,hybrid_drz,nhtsa_engine___electrificationlevel,hybrid,>hybrid<
2022-11-702411,True,False,.,.,True
2022-11-704411,True,False,.,.,True


Unnamed: 0,power_rdw,power2_rdw,nhtsa_engine___enginehp,nhtsa_engine___enginehp_to,power,>power<
2022-11-701531,155.0,.,210.0,.,.,155.0
2022-11-701931,50.0,.,123.0,.,.,50.0
2022-11-702531,50.0,.,123.0,.,.,50.0


Unnamed: 0,nGear_rdw,nGear_rdw.1,nhtsa_mechanical_transmission__transmissionspeeds,nGear,>nGear<
2022-11-702531,5.0,5.0,4.0,.,5.0


Unnamed: 0,energylab3_rdw,energylab_rdw,energylab2_rdw,energylab,>energylab<
2022-11-260311,E,D,.,.,E
2022-11-700411,E,F,.,.,E
2022-11-700511,A,C,.,.,A
2022-11-700611,D,G,.,.,D
2022-11-700811,A,C,.,.,A
2022-11-701011,C,G,.,.,C
2022-11-701711,E,F,.,.,E
2022-11-701911,E,F,.,.,E
2022-11-702011,C,D,.,.,C
2022-11-702311,B,C,.,.,B


Unnamed: 0,color_rdw,color_rdwovi,color,>color<
2022-11-705711,WIT,WIT / ZWART,.,WIT
2022-11-703331,ZWART,ZWART / GEEL,.,ZWART


- - - - 
# calculate extra info

In [47]:
# four wheel drive
car['fwd'] = car.rdw_basisgegevens_aantal_aangedreven_assen.replace({-1: '', 0: False, 1: False})
sel = car.fwd.apply(lambda x: x>1 if type(x)==int else False)
car.loc[sel,'fwd'] = True

In [48]:
car["Age"] = (car["now"] - car["MF"]).apply(lambda x: x.days)
car["APK_age"] = (car["now"] - car["apk"]).apply(lambda x: x.days)
car["import_age"] = (car["regnl"] - car["regfirst_rdw"]).apply(lambda x: x.days)

In [None]:
# plot
if VERBOSE > 1:
    df_ = car[[c for c in car.columns if c.endswith('_age')] + ['Age']]\
    .applymap(lambda x: x.days/365.25 if isinstance(x, pd.Timedelta) else x/365.25)\
    .replace({pd.NaT: np.nan})
    
    for k,s in df_.iteritems():
        plt.figure(figsize=[16,2])
        s.plot(marker=',', linestyle='', alpha=1, figsize=[16,2], ms=2)
        plt.title(k)
        
    plt.ylabel('age (year)')
    
elif VERBOSE > 0:
    car[[c for c in car.columns if c.endswith('_age')] + ['Age']]\
    .applymap(lambda x: x.days/365.25 if isinstance(x, pd.Timedelta) else x/365.25)\
    .replace({pd.NaT: np.nan})\
    .plot(marker='s', linestyle='', alpha=0.4, figsize=[16,8], ms=2)
    plt.legend()   
    plt.xlabel('lot')
    plt.ylabel('age (year)')
    


# subselection and save

### Save data for ML

In [50]:
save_cols = [
    "Price",
    "brand",
    "model",
    "Age",
    "fuel",
    "Odo",
    "APK_age",
    "import_age",
    "bodytype",
    "cylvol",
    "nCyl",
    "power",
    "weight",
    "bpm",
    "newprice",
    "nSeat",
    "nDoor", 
    "color",
    'fwd',
    'maxspeed',
    'length',
    'height',
    'width',
    'automatic',
    'nGear',
    'energylab',
    'private_owners',
    'company_owners',
    'under_survey',
]
map_lowercase = {
    'Price':'price',
    'Age':'age',
    'Odo':'odometer',
    'APK_age':'days_since_inspection_invalid',
    'import_age':'age_at_import',
    'bodytype':'body_type',
    'cylvol':'displacement',
    'nCyl':'number_of_cylinders',
    'bpm':'registration_tax',
    'newprice':'sale_price',
    'nSeat':'number_of_seats',
    'nDoor':'number_of_doors',
    'maxspeed':'top_speed',
    'automatic':'automatic_gearbox',
    'nGear':'number_of_gears',
    'energylab': 'energy_label',
}

if VERBOSE > 1:
    print('Columns >> .. << are saved as car dataset')
    
    ncol = 8
    l = [*car.columns]
    l = sorted(l)
    l = [f'>> {i} <<' if i in save_cols else i for i in l]
    l_padded = np.ceil(len(l) / ncol)*ncol
    l += [''] * int(l_padded - len(l))
    df_ = pd.DataFrame(np.reshape(l, (-1,ncol)), columns = [''] * ncol)
    df_['ix'] = ''
    df_.set_index('ix', inplace=True)
    df_.index.name=''
    with pd.option_context("display.max_rows", 999, "max_colwidth", 32):
        display(df_)

out = car.loc[:,save_cols].rename(columns=map_lowercase)
if VERBOSE > 0:
    with pd.option_context("display.max_columns", 999):
        display(out.tail(), metadata = {'tags': (TAG_SINGLE, )})

# save data
file_name = '../data/cars-for-ml.pkl'
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')
# save
if (SKIPSAVE==False): # and (not(os.path.isfile(file_name))):
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')    


../data/cars-for-ml.pkl


### Save data for image classification

In [51]:
save_cols = [
    "Images",
    "brand",
    "model",
    "modelspec",
    "color",
    "Age",
    "bodytype",
    "cabriolet",
    "nDoor", 
    "length", 
    'height',
    'width',
    "wheelbase",
    "ForeignReg",
    "Reg",
    "taxi",
]
map_lowercase = {
    'Images':'image_urls',
    'modelspec':'model_specification',
    'Age':'age',
    'bodytype':'body_type',
    'cabriolet':'convertible',
    'nDoor':'number_of_doors',
    'ForeignReg':'foreign_registration',
    'Reg':'registration_number'
}

out = car.loc[:,save_cols].rename(columns=map_lowercase)
if VERBOSE > 0:
    with pd.option_context("display.max_columns", 999):
        display(out.tail(), metadata={'tags': (TAG_SINGLE, )})


# save data
file_name = '../data/cars-for-imageclf.pkl'
if OPBOD:
    file_name = file_name.replace('.pkl', '-opbod.pkl')

# save
if (SKIPSAVE==False): # and (not(os.path.isfile(file_name))):
    print(file_name, out.shape)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')    

../data/cars-for-imageclf.pkl (9637, 16)


# Write example table to file
based on: https://stackoverflow.com/a/33869154

In [52]:
def pandas_df_to_markdown_table(df):
    fmt = ['-----' for i in range(len(df.columns))]
    df_fmt = pd.DataFrame([fmt], columns=df.columns)
    df_formatted = pd.concat([df_fmt, df])
    return df_formatted.to_csv(sep="|", index=False)


In [53]:
fn = '../data/cars-for-ml.pkl'
print(f'load {fn}')
out = pd.read_pickle(fn)

file_name = '../assets/example-table-of-ml.md'
example = out.tail(10).copy()
# trim some long fields
example.rdwinfo = '.. rdw info ..'
example.Raw_text = '.. raw text ..'
example.SupInfo = '.. suplm. info. ..'

# convert to md
try:
    table_text = example.reset_index().to_markdown()
except ImportError:
    print('Fallback')
    table_text = pandas_df_to_markdown_table(example.reset_index())

# save
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    with open(file_name,'w') as file:
        file.write(table_text)

    print('A markdown table is available as\n\t{}'.format(file_name))
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
    display(example, metadata={'tags': (TAG_SINGLE, )})
    

load ../data/cars-for-ml.pkl
Fallback
Skip. ../assets/example-table-of-ml.md exists or saving is disabled in settings.


In [54]:
fn = '../data/cars-for-imageclf.pkl'
print(f'load {fn}')
out = pd.read_pickle(fn)

file_name = '../assets/example-table-of-imageclf.md'
example = out.tail(10).copy()
# trim some long fields
example.rdwinfo = '.. rdw info ..'
example.Raw_text = '.. raw text ..'
example.SupInfo = '.. suplm. info. ..'


# convert to md
try:
    table_text = example.reset_index().to_markdown()
except ImportError:
    print('Fallback')
    table_text = pandas_df_to_markdown_table(example.reset_index())

# save
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    with open(file_name,'w') as file:
        file.write(table_text)

    print('A markdown table is available as\n\t{}'.format(file_name))
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
    display(example, metadata={'tags': (TAG_SINGLE, )})
    

load ../data/cars-for-imageclf.pkl
Fallback
Skip. ../assets/example-table-of-imageclf.md exists or saving is disabled in settings.


In [55]:
with pd.option_context('display.max_rows', 999):
    display(car.brand.value_counts().sort_index())

AIXAM                          1
ALFA ROMEO                    67
ASTON-MARTIN                  13
AUDI                         878
AUSTIN-HEALEY                  1
AUVERLAND                      1
BENTLEY                       15
BMW                          772
BUICK                          1
CADILLAC                       8
CHEVROLET                     81
CHRYSLER                      45
CITROËN                      308
DACIA                          9
DAEWOO                        15
DAF                            2
DAIHATSU                      28
DAIMLER                        2
DATSUN                         2
DAX                            1
DODGE                         14
DS                             1
FERRARI                       15
FIAT                         282
FORD                         364
GMC                            2
HONDA                         70
HUMMER                         2
HYMER                          1
HYUNDAI                       93
INFINITI  