# Preprocess car data

In [None]:
import drz_config
# Read the project configuration and expose selected entries as
# module-level names.
cfg = drz_config.read_config()
DATE, VERBOSE, OPBOD, SKIPSAVE = (
    cfg[key] for key in ('DATE', 'VERBOSE', 'OPBOD', 'SKIPSAVE')
)

# Echo the complete configuration when running verbosely.
if VERBOSE > 0:
    print(cfg)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

## Load data

In [None]:
# The OPBOD flag selects which pickle is loaded.
fn = (
    '../../../python-nb/data/cars-from-all-auctions-opbod.pkl'
    if OPBOD
    else '../data/cars-from-all-auctions.pkl'
)
print(fn)
car = pd.read_pickle(fn)

# Show the last rows as a quick sanity check.
if VERBOSE > 0:
    display(car.tail())

### Rename columns
Rename columns to keep track of the source of each field.

In [None]:
# Give these three flags a `_drz` suffix so their source stays visible.
drz_columns = {name: name + '_drz' for name in ('rhd', 'automatic', 'hybrid')}
car.rename(columns=drz_columns, inplace=True)

## Odometer
Convert miles to km and make all values numerical.

In [None]:
def odo_str2float(df):
    '''
    Convert odometer to numerical values (in place).
    Also convert Miles to KMs.

    Reads the text columns ``OdoKM`` and ``OdoMLS`` of `df` and adds three
    columns:

    * ``OdoKM_num``  : kilometre reading as float
    * ``OdoMLS_num`` : mile reading as float
    * ``Odo``        : kilometre reading, or the mile reading converted to
                       km when only the mile reading is available

    Side effect: ``OdoKM`` and ``OdoMLS`` themselves are overwritten. Values
    that are not a non-empty string become the string 'nan', and the
    "unknown" texts listed below are replaced by 'nan'.

    Returns None.
    '''

    ml2km = 1.609344

    # texts that are substituted by 'nan'
    repair_to_nan = ('onbekend',
                     'volgens NAP logisch', 'volgens nap logisch',
                     'volgens NAP onlogisch', 'volgens nap onlogisch')

    def _clean(x):
        # anything that is not a non-empty string counts as missing
        if not (isinstance(x, str) and len(x) > 0):
            return 'nan'
        for txt in repair_to_nan:
            x = x.replace(txt, 'nan')
        return x

    for col in ('OdoKM', 'OdoMLS'):
        # substitute to nan (single pass per column)
        df[col] = df[col].apply(_clean)
        # float; the dots are removed first: '251.571' -> 251571.0
        df[col + '_num'] = df[col].apply(lambda x: x.replace('.', '')).astype(float)

    # convert miles to km where no km reading exists
    df["Odo"] = df["OdoKM_num"].copy()
    chooseMls = df["OdoKM_num"].isna() & df["OdoMLS_num"].notna()
    df.loc[chooseMls, "Odo"] = df.loc[chooseMls, "OdoMLS_num"] * ml2km

In [None]:
# Run the conversion; the new columns are added to `car` in place.
odo_str2float(car)

if VERBOSE > 0:
    # show the result next to the columns it was derived from
    display(car.loc[:,['Odo','OdoKM','OdoKM_num','OdoMLS','OdoMLS_num']])

    # one marker per lot for each of the three numerical columns
    fig, ax = plt.subplots(figsize=[16,8])
    ax.set_xlabel('lot')
    ax.set_ylabel('odometer (km or mls)')

    # (column, plot options) in drawing order
    for column, options in (
        ("OdoKM_num", dict(marker='o', alpha=1)),
        ("Odo", dict(marker='+', color='k', alpha=1)),
        ("OdoMLS_num", dict(marker='o', alpha=.5)),
    ):
        car.loc[:,[column]].plot(linestyle='', ax=ax, **options)


## Model and brand

Enforce some consistency in naming


In [None]:
# Rename to conventional brand name (whole-value matches only).
# The result is assigned back to the column: `replace(..., inplace=True)` on
# `car.ItemBrand` acts on the Series returned by the attribute access, which
# is not guaranteed to write through to `car` (pandas copy-on-write).
car['ItemBrand'] = car['ItemBrand'].replace({
    "ASTON MARTIN":"ASTON-MARTIN",
    'AUTO UNION':'AUDI',
    'JAGUAR CARS':'JAGUAR',
    "MERCEDES BENZ":"MERCEDES-BENZ",
    "MERCEDES":"MERCEDES-BENZ",
    "MICRO COMPACT CAR SMART":"SMART",
    "CITRO": "CITROËN",
})


# When brand name has a specification that needs to go in the model name.
# E.g. Mercedes <AMG> and Audi <QUATTRO>
def add_model_spec(s,spec):
    '''Adds specification at the end of the model name.

    s    : model name (str); extra specifications follow after a ';'
    spec : specification to add (str), e.g. 'quattro' or 'amg'

    Returns `s` unchanged when it already contains `spec` (compared case
    insensitively as literal text), otherwise `s` followed by '; ' and
    `spec`.
    '''
    import re

    # addition should not exist yet.
    # `spec` is escaped because it is plain text, not a pattern: characters
    # such as '.', '+' or '(' must not be interpreted by the regex engine.
    if re.search(re.escape(spec), s, flags=re.IGNORECASE):
        return s

    # add separator (unless already there) and the specification
    separator = '' if s.endswith(';') else ';'
    return s + separator + ' ' + spec

# Lots filed under these brand values get the specification appended to
# their model text and are then filed under the brand in the last column.
for alias, spec, brand in (
    ('QUATTRO', 'quattro', 'AUDI'),
    ('MERCEDES-AMG', 'amg', 'MERCEDES-BENZ'),
):
    sel = car.ItemBrand == alias
    car.loc[sel,'ItemType'] = car.loc[sel,'ItemType'].apply(add_model_spec, args=(spec,))
    car.loc[sel,'ItemBrand'] = brand


In [None]:
# Decode HTML character references in the model text; missing values
# become ''.
import html
car["model_drz"] = car.ItemType.fillna('').map(html.unescape)

# strip brand from model
def remove_brand(model,brand):
    '''Remove a leading brand name from the model name.

    The brand is matched case-insensitively, as literal text, at the start
    of `model`, together with any spaces, commas or semicolons that follow
    it.

    `model` is returned unchanged when it is not a string (e.g. NaN) or
    when `brand` is not a string.
    '''

    import re

    if isinstance(model, str) and isinstance(brand, str):
        # escape the brand: it is plain text, not a regular expression
        model = re.sub('^' + re.escape(brand) + '[ ,;]*', '', model,
                       flags=re.IGNORECASE)
    return model

# Model text per lot with the lot's own brand name stripped from the start.
Type = car.apply(lambda row: remove_brand(row.model_drz,row.ItemBrand),axis='columns')

# special case with accented brand names
# (the model text may spell the brand without the accent)
sel = car.ItemBrand == 'CITROËN'
Type[sel] = Type[sel].map(lambda x: remove_brand(x,'CITROEN'))
# NOTE(review): the brand replace earlier in this file maps 'JAGUAR CARS' to
# 'JAGUAR', so this selection is presumably always empty - confirm.
sel = car.ItemBrand == 'JAGUAR CARS'
Type[sel] = Type[sel].map(lambda x: remove_brand(x,'JAGUAR'))

# split extra specifications (separated by ';') into a list per lot
Type = Type.fillna('').apply(lambda s: s.split(sep=';'))
# add simple model info back and extra specifications in new field
car["model_drz"] = Type.apply(lambda s: s[0])
# everything after the first ';' is joined without separator, then the
# surrounding spaces are stripped
car["modelspec_drz"] = Type.apply(lambda s: ''.join(s[1:]).strip(' '))

### Concatenate columns with index numbers

In [None]:
def _split_indexnr(c):
    """Split an indexed rdw column name into base name and index part.

    Returns None when `c` does not have the form
    ``rdw_<name>[_cm3]_<n>[_<n>...]``. Otherwise returns a 3-tuple:

    * the complete column name,
    * the name without the trailing index numbers (an optional ``_cm3``
      part is kept),
    * the index part including its leading underscore, e.g. ``'_1_1_1'``.
    """
    match = re.match(r'^(rdw_[a-z,_,]+)(_cm3)?((_[0-9]+)+)$', c)
    if match is None:
        return None

    full_name, stem, unit, index_part = match.group(0, 1, 2, 3)
    base_name = stem if unit is None else stem + unit
    return full_name, base_name, index_part


## Example
# cs = [
#     'rdw_motor_uitvoering_cilinderinhoud_cm3_1',
#     'rdw_motor_uitvoering_cilinderinhoud_cm3_2',
#     'rdw_motor_uitvoering_cilinderinhoud_cm3_3',
#     'rdw_motor_uitvoering_brandstof_emissie_stikstofoxide_type_1_1_1',
#     'rdw_motor_uitvoering_brandstof_brandstofverbruik_stadsrit_3_1_1',
#     'rdw_motor_uitvoering_brandstof_brandstofverbruik_stadsrit_3_1'
# ]
# pd.DataFrame([_split_indexnr(c) for c in cs], columns=['old', 'new', 'counter'])

In [None]:
def _prog(cur, end, last=0):
    """Print one step of a text progress indicator.

    cur  : current step
    end  : total number of steps
    last : next percentage to report

    When the fraction ``cur/end`` exceeds `last` percent, `last` is printed
    and the returned value is `last` + 10. Otherwise 'done' is printed when
    ``cur >= end`` and a single '.' (without newline) in all other cases,
    and `last` is returned unchanged.
    """
    threshold_passed = cur/end > last/100

    if threshold_passed:
        print(f'{last}%')
        return last + 10 # every 10% an update

    if cur >= end:
        print('done')
    else:
        print('.', end='')
    return last

## Example
# last = _prog(11,100,10)
# last = _prog(12,100,last)
# last = _prog(13,100,last)

In [None]:
# Get info from dataframe columns.
# Every column name is parsed once; only indexed rdw columns are kept.
parsed = [p for p in map(_split_indexnr, car.columns) if p is not None]
new_names = np.array([p[1] + '_concat' for p in parsed])
old_names = np.array([p[0] for p in parsed])
counter = np.array([p[2] for p in parsed])

# Target columns in first-seen order. Iterating over a `set` of strings has
# no fixed order, which made the order of the added columns arbitrary.
unique_new_names = list(dict.fromkeys(new_names))

if 2 > VERBOSE > 0:
    # initiate progress bar
    last = 0
for prg, new_name in enumerate(unique_new_names):

    # select columns in df
    sel = new_names == new_name

    # make dict from columns
    new_dicts = car[old_names[sel]].apply(lambda row: {
        k[1:]: v # {'1_1_1': 'value'}, "[1:]" to trim off leading "_"
        for k,v in zip(counter[sel], row)
        # leave out missing values. `not` instead of `~`: on a plain Python
        # bool `~` is a bitwise operation that always yields a truthy int.
        if not (isinstance(v, float) and np.isnan(v))
    }, axis=1)

    # add series to new column
    car[new_name] = new_dicts

    # Remove old columns
    car.drop(columns=old_names[sel], inplace=True)

    # progress
    if 2 > VERBOSE > 0:
        last = _prog(prg, len(unique_new_names), last)
    elif VERBOSE > 1:
        print(f'{counter[sel][0]:7s} .. {counter[sel][-1]:7s} ({sum(sel):3.0f}) -> {new_name:s}')
        


In [None]:
if VERBOSE > 1:
    # Unknown registrations
    # Might be able to fix it by looking at the raw text.
    ixs = car.loc[car.Reg.str.lower() == 'onbekend'].index
    display(car.loc[ixs,['Reg','ForeignReg']])
    for ix in ixs:
        print(ix,end='\n\t')
        rt = car.loc[ix,'Raw_text']
        if isinstance(rt, list):
            print('\n\t'.join(rt))
        else:
            print(rt)

if VERBOSE > 1:
    # FUTURE: Do something with foreign registrations
    # Parsing did not always get it right.
    # display() is needed: a bare expression inside an `if` shows nothing.
    display(list(car.ForeignReg.unique()))


if VERBOSE > 1:
    # Steering wheel in center? "M"?
    display(car.loc[car.rdw_basisgegevens_kant_van_het_stuur == 'M', 'Raw_text'])

    # The side of the vehicle where the steering wheel is mounted.
    # Values
    # L     Left
    # R     Right
    # M     Middle
    # src: https://www.rdw.nl/-/media/rdw/rdw/pdf/sitecollectiondocuments/over-rdw/naslagwerk/beschrijving-dataset-typegoedkeuring-v10.pdf
    # car.rdw_basisgegevens_kant_van_het_stuur.value_counts()



## Other repairs

In [None]:
# Re-parse the NAP value from the raw text.
# Wrongly parsed entries hold just 'isch'.

# re pattern
patt = '^Km-stand volgens nap (?P<val>(logisch)|(onlogisch))$'
nap_line = re.compile(patt)

# visit every wrongly parsed lot
for ix in car.index[car.NAP == 'isch']:
    # scan the raw text line by line; the last matching line wins
    for line in car.loc[ix,'Raw_text']:
        M = nap_line.match(line)
        if M is not None:
            car.loc[ix,"NAP"] = M.group('val')

## Adhoc repair

When all else fails

In [None]:
# Manual corrections for individual lots, addressed by index label
# ('<year>-<month>-<lot>'). Skipped for the OPBOD data set.
# NOTE(review): in this file these repairs run after the odometer and model
# columns were derived above, so the fixes to 'OdoKM' and 'ItemType' made
# here do not reach 'Odo' / 'model_drz' unless those steps are re-run -
# confirm the intended cell order.
if OPBOD:
    print('skip')
else:
    # '2603 Afkomstig van JFC HQ Brunssum.'
    ix='2017-5-2603'
    car.loc[ix,"LotNr"] = '2603'
    car.loc[ix, 'jfc'] = True
    # K2000098227 Afkomstig van JFC HQ Brunssum.
    ix='2020-9-8227'
    car.loc[ix,"LotNr"] = '8227'
    car.loc[ix, 'jfc'] = True


    # Manufacture year that could not be used: mark as missing
    # ix = car.Mfyear[car.Mfyear == 'onbekend']
    # car.loc[car.Mfyear == 'onbekend',:]
    # ix = '2018-11-2613'
    ix = ['2017-4-7127', '2017-6-7173', '2018-11-2613']
    # car.loc[ix,"Mfyear"] = -1
    car.loc[ix,'Mfyear'] = np.nan  # np.NaN alias was removed in NumPy 2.0

    # fix issue with one lot that has no type
    # print(car.ItemType[~ (car.ItemType.apply(type) == str)])
    # [print(l) for l in eval(car.loc['2017-6-7121','Raw_text'])]
    # car.loc['2017-6-7121','Images']
    ix = '2017-6-7121'
    car.loc[ix,'ItemType'] = 'golf'

    # car.loc[car.ItemBrand == 'Kampeerwagen/camper',:]
    # car.loc["2017-5-2408",:]
    ix = '2017-5-2408'
    #car.loc[ix,'ItemBrand'] = 'VOLKSWAGEN'
    car.drop(ix,inplace=True) # remove altogether


    # Brand field that also holds the model: move the model to ItemType
    # car.loc[car.ItemBrand == 'AUDI A4',:]
    ix = '2018-6-7195'
    car.loc[ix,'ItemBrand'] = 'AUDI'
    car.loc[ix,'ItemType'] = 'a4; ' + car.loc[ix,"ItemType"]

    # car.loc[car.ItemBrand == 'MERCDES-BENZ',:]
    ix = '2018-9-8162'
    car.loc[ix,'ItemBrand'] = 'MERCEDES-BENZ'

    # car.loc[car.ItemBrand == 'MINI COOPER',:]
    ix = '2018-10-2210'
    car.loc[ix,'ItemBrand'] = 'MINI'
    car.loc[ix,'ItemType'] = 'cooper; ' + car.loc[ix,"ItemType"]

    # car.loc[car.ItemType == 'benz',:]
    # car.loc[ix,'Images']
    # This is a w204 mfyear < 2011
    ix = '2017-5-2618'
    car.loc[ix,'ItemType'] = 'c cdi'

    # car.loc["2018-1-3046","Raw_text"]
    # This is combined lot
    car.drop("2018-1-3046",inplace=True) # remove altogether

    # car.ForeignReg=='Het voertuig is voorzien van taxi-kentekenplaten. Taxiregistratie kunt u laten be&#235;indigen via de RDW. Vervanging van de blauwe door gele'
    # Taxi
    ixs = ['2017-11-8302', '2017-11-8305', '2018-1-8163']
    regs = ['54-GLL-5','57-XZ-FV','70-TLF-3']
    for ix,reg in zip(ixs,regs):
        car.loc[ix,'taxi'] = True
        car.loc[ix,'Reg'] = reg

    # Typo in registration K1900022009
    # 8-SKL-15 not 8-SLK-15
    ix = '2019-2-2009'
    car.loc[ix,'Reg'] = '8-SKL-15'
    # drop the rdw info when it was queried with the wrong registration
    if car.loc[ix,'rdwinfo']['kenteken'][0] == '8SLK15':
        car.loc[ix,'rdwinfo'] = None

    # NAP is provided first and impacts Odometer reading
    ixs = ['2019-9-9106', '2019-9-9249']
    naps = ['logisch', 'onlogisch']
    kms = ['251.571', '']  # one entry per lot in ixs
    #display(car.loc[ixs,['NAP', 'OdoKM', 'Raw_text']])
    for ix,nap,km in zip(ixs,naps,kms):
        car.loc[ix,'NAP'] = nap
        car.loc[ix,'OdoKM'] = km

    # "bouwjaar verklaring noodzakelijk."
    #car.loc[[v=="verklaring noodzakelijk." for v in car.Mfyear]]
    ixs = ['2020-1-7177']
    for ix in ixs:
        car.loc[ix, 'Mfyear'] = ''

    # Text in lot was missing a character: "58.83"
    ix = '2020-12-7138'
    car.loc[ix, 'OdoKM'] = '58.683'

    # date format is different: '-' instead of '.' between the parts
    ix = '2020-12-7263'
    car.loc[ix, 'Mfdate'] = car.loc[ix, 'Mfdate'].replace('-','.')
    

- - - - 
## Make data type consistent

In [None]:
def string_to_list_images(s):
    '''Convert a bracketed, comma separated string to a list of strings.

    "[a.jpg, b.jpg]" -> ['a.jpg', 'b.jpg']. Items are separated by ', ' and
    are returned verbatim. A value that is not a string (e.g. already a
    list) is returned unchanged.

    Raises ValueError when the string is not enclosed in square brackets.
    '''
    if isinstance(s, str):
        text = s.strip()
        if not (text.startswith('[') and text.endswith(']')):
            raise ValueError('expected a string like "[a, b]", got: ' + repr(s))
        # Split the text directly. Quoting the items and passing the result
        # to eval() fails on items that contain a quote and executes any
        # expression that is embedded in the text.
        s = text[1:-1].split(', ')

    return s

def string_to_int_lotnr(s):
    '''Return the lot number as an integer.

    A string starting with 'K' (e.g. "K1800092200") yields its last four
    characters as a number; any other string is converted as a whole.
    Values that are not strings are returned untouched.
    '''
    if type(s) != str:
        return s

    # "K1800092200" -> 2200
    digits = s[-4:] if s[0] == 'K' else s
    return int(digits)

def string_to_list_rawtext(s):
    '''Convert the text form of the raw lot text to a list.

    A string whose second character is a single quote, i.e. one that looks
    like the repr of a list of strings ("['a', 'b']"), is parsed as a Python
    literal. Any other string is handed to string_to_list_images. Values
    that are not strings are returned unchanged.
    '''
    import ast

    if isinstance(s, str):
        # s[1:2] instead of s[1]: no IndexError on strings shorter than two
        # characters
        if s[1:2] == "'":
            # literal_eval accepts literals only; eval() would execute any
            # expression found in the text
            s = ast.literal_eval(s)
        else:
            s = string_to_list_images(s)

    return s

In [None]:
# Give every row the same data type in these three columns.
car["Images"] = car["Images"].apply(string_to_list_images)
car["LotNr"] = car["LotNr"].apply(string_to_int_lotnr)
car["Raw_text"] = car["Raw_text"].apply(string_to_list_rawtext)

In [None]:
# Normalise the flag columns: 0 / 1 become False / True and missing values
# become False.
flag_columns = [
    'automatic_drz','cabriolet','rhd_drz','no_road','taxi','crewcab','carwrap','d_lic',
    'benzine','diesel','lpg','electric','hybrid_drz',
    'jfc','locked','no_key','no_cvo','no_igk','no_odo','wo_frame','used_parts', 'legguard',
    'import','early_reg','maybe_reg','no_inireg','no_nlreg193','no_nlreg194', 'no_nlreg19', 'no_orireg','no_reg','no_regneeded','no_rdw','rdw150','no_vin','btw21',
    'disclaim1','disclaim2','disclaim12','disclaim3','disclaim4', 'disclaim5', 'disclaim_cr6',
]

for c in flag_columns:

    # every flag is expected to be present
    assert c in car.columns
    ## Debug: skip absent columns instead
    # if c not in car.columns:
    #     print('skip', c)
    #     continue

    as_bool = car[c].replace({0:False,1:True})
    car[c] = as_bool.fillna(False)

In [None]:
# Get last query from list of rdw queries
if 'rdwinfo' not in car.columns:
    print('skip')
else:
    # Position of the query with the most recent TimeStamp; NaN when the
    # cell does not hold a list. (np.nan: the np.NaN alias was removed in
    # NumPy 2.0.)
    last_query = car.rdwinfo.apply(lambda x:pd.to_datetime([i['TimeStamp'] for i in x],format='%Y%m%d').argmax() if isinstance(x, list) else np.nan)
    for queries,last,idx in zip(car.rdwinfo,last_query,car.index):
        if np.isnan(last):
            continue
        # NOTE(review): the selected query is wrapped in a one-element list;
        # presumably so that .loc stores that single object in the cell
        # (later cells test rdwinfo with isinstance(x, dict)) - confirm.
        car.loc[idx,'rdwinfo'] = [queries[int(last)]]

### Fill empty

In [None]:
# Missing draw flag counts as False.
car.Draw = car.Draw.fillna(False)
# Empty or missing manufacture year becomes -1 so the column can be integer.
# (np.nan: the np.NaN alias was removed in NumPy 2.0.)
car.Mfyear = car.Mfyear.replace('',np.nan).fillna(-1).astype(int)
# Notes as text (missing values become their string representation).
car.Note = car.Note.astype(str)

In [None]:
# columns that are integers
cols = [
    'N_images',
    'rdw_aantal_cilinders',
    'rdw_aantal_deuren',
    'rdw_basisgegevens_aantal_deuren_bovengrens',
    'rdw_basisgegevens_aantal_deuren_ondergrens',
    'rdw_aantal_rolstoelplaatsen',
    'rdw_aantal_wielen',
    'rdw_basisgegevens_aantal_wielen',
    'rdw_aantal_zitplaatsen',
    'rdw_basisgegevens_aantal_zitplaatsen_bovengrens',
    'rdw_basisgegevens_aantal_zitplaatsen_ondergrens',
    'rdw_basisgegevens_aantal_aangedreven_assen',
    'rdw_uitvoering_wijzigingsnummer',
    'rdw_eeg_basis_goedkeuringsnummer',
    'rdw_eeg_uitbreiding_goedkeuringsnummer',
]

# Should exist in dataframe
assert len(set(cols).difference(car.columns)) == 0
## Debug
# print('skip', np.setdiff1d(cols, car.columns))

# Missing values become -1, then cast to int.
# `car[cols] = ...` replaces the columns, so the integer dtype is kept.
# `car.loc[:, cols] = ...` writes into the existing columns instead and can
# leave them as float.
car[cols] = car[cols].fillna(-1).astype(int)


In [None]:
# columns that are strings and need to be filled with ''
cols = [
    'APKdate',
    'BTW',
    'ForeignReg',
    'ItemType',
    'Mfdate',
    'NAP',
    'Reg',
    'Source',
    'SupInfo',
    'rdw_kenteken',
    'rdw_Reg',
    'rdw_api_gekentekende_voertuigen_assen',
    'rdw_api_gekentekende_voertuigen_brandstof',
    'rdw_api_gekentekende_voertuigen_carrosserie',
    'rdw_api_gekentekende_voertuigen_carrosserie_specifiek',
    'rdw_api_gekentekende_voertuigen_voertuigklasse',
    'rdw_eerste_kleur',
    'rdw_europese_voertuigcategorie',
    'rdw_export_indicator',
    'rdw_handelsbenaming',
    'rdw_inrichting',
    'rdw_merk',
    'rdw_openstaande_terugroepactie_indicator',
    'rdw_plaats_chassisnummer',
    'rdw_retrofit_roetfilter',
    'rdw_taxi_indicator',
    'rdw_tweede_kleur',
    'rdw_type',
    'rdw_typegoedkeuringsnummer',
    'rdw_uitvoering',
    'rdw_variant',
    'rdw_voertuigsoort',
    'rdw_wacht_op_keuren',
    'rdw_wam_verzekerd',
    'rdw_zuinigheidslabel',
    'rdw_type_gasinstallatie',
    'rdw_TimeStamp_x',
    'rdw_TimeStamp_y',
    'rdw_eu_type_goedkeuringssleutel',
    'rdw_eu_type_goedkeuringssleutel_y',
    'rdw_eeg_uitvoeringscode',
    'rdw_eeg_variantcode',
    'rdw_api_as_gegevens_eeg_uitvoering',
    'rdw_api_basisgegevens_eeg_uitvoering',
    'rdw_api_carrosserie_uitvoering',
    'rdw_api_carrosserie_uitvoering_klasse',
    'rdw_api_carrosserie_uitvoering_nummerieke_code',
    'rdw_api_handelsbenaming_uitvoering',
    'rdw_api_merk_uitvoering_toegestaan',
    'rdw_api_motor_uitvoering',
    'rdw_api_motor_uitvoering_brandstof',
    'rdw_api_plaatsaanduiding_uitvoering',
    'rdw_api_subcategorie_uitvoering',
    'rdw_api_uitvoeringverbruik_per_uitgave',
    'rdw_api_versnellingsbak_uitvoering',
    'rdw_eeg_ece_voertuig_categorie_bij_type',
    'rdw_europese_typegoedkeuring_status',
    'rdw_fabrikant',
    'rdw_landcode_eeg_typegoedkeuring',
    'rdw_eeg_voertuig_cat_toevoeging',
    'rdw_richtlijn_nr_laatste_wijziging',
    'rdw_type_fabrikant',
    'rdw_basisgegevens_status_voertiug_kentekening',
    'rdw_basisgegevens_compleet_voertuig_indicator',
    'rdw_basisgegevens_links_rechts_rijdend',
    'rdw_basisgegevens__24ghz_kortbereik_radar',
    'rdw_basisgegevens_eur_codering_carrosserietype',
    'rdw_basisgegevens_handelsbenaming',
    'rdw_basisgegevens_kant_van_het_stuur',
    'rdw_subcategorie_uitvoering_subcateg_uitvoering_europees',
    'rdw_basisgegevens_tweede_brandstofcode_voertuig',
    'rdw_basisgegevens_eeg_uitvoering_cat_toevoeging',
    'rdw_brandstof_omschrijving',
    'rdw_carrosserietype',
    'rdw_type_carrosserie_europese_omschrijving',
    'rdw_carrosserie_voertuig_nummer_europese_omschrijving',
]

# Should exist in dataframe
assert len(set(cols).difference(car.columns)) == 0
## Debug
# print('skip', np.setdiff1d(cols, car.columns))
# cols = np.intersect1d(cols, car.columns)

# Missing values become '', everything else its string form.
# `car[cols] = ...` replaces the columns; `car.loc[:, cols] = ...` would
# write into the existing columns and keep their old dtype where possible.
car[cols] = car[cols].fillna('').astype(str)

In [None]:
# Replace zero price (not sold) with NaN
# (np.nan: the np.NaN alias was removed in NumPy 2.0)
car.Price = car.Price.replace({0.0:np.nan})

Show the data types per column.

In [None]:
if VERBOSE > 0:
    # print type per column and example values
    # One row per (column, data type) pair. The rows are collected in a list
    # and turned into a frame once: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame for every row.
    rows = []
    for c in car.columns:
        # data types present in this column (computed once per column)
        types_in_col = car[c].apply(type).unique()
        for cnt, t in enumerate(types_in_col):

            if c in ['SupInfo','Raw_text','rdwinfo','Images']:
                # skip large
                v = '..skip..'
            elif list in types_in_col:
                v = 'max nr of items: ' +\
                str(car[c].apply(lambda s: len(s) if isinstance(s, list) else 0).max())
            elif (dict in types_in_col) and (t == dict):
                v = 'max nr of keys: ' +\
                str(car[c].apply(lambda s: len(s)).max())
            else:
                # values
                v = car[c].unique()
                if len(v) >= 10:
                    # print first and last
                    v = '{} .. {}'.format(v[0],v[-1])
                # else: keep all unique values

            # column name, data type, counter of the type within the column
            rows.append({'Column': c,
                         'Data Type': str(t),
                         'Type counter': cnt,
                         'Example values': v})

    df_ = pd.DataFrame(rows, columns = ['Column', 'Data Type', 'Type counter',  'Example values'])
    df_.set_index('Column', inplace=True)
    if VERBOSE > 1:
        with pd.option_context('display.max_rows', 999):
            display(df_)
    else:
        display(df_)


In [None]:
if VERBOSE > 0:
    print('These columns contain more than one type')

    # one row per column, one field per type counter (0, 1, ...)
    types_wide = df_.reset_index().pivot(columns='Type counter', index='Column', values='Data Type')
    # keep columns that have a second type and sort by the type names
    mixed = types_wide.dropna(subset=[1]).sort_values(by=[0, 1])
    display(mixed.fillna(''))


if VERBOSE > 1:
    # overview per data type
    gb=df_.groupby('Data Type')
    for g in gb.groups:
        print(g)
        per_type = gb.get_group(g).drop(columns=['Data Type']).sort_index()
        with pd.option_context("display.max_rows", 999):
            display(per_type)


## Dummies to category (reverse one-hot-encode)

Categorize info from the auction by converting dummies to categories: several boolean fields are combined into one string field.

In [None]:
# fuel
Fuels = ['lpg','benzine','diesel','hybrid']
# NOTE(review): 'hybrid' was renamed to 'hybrid_drz' earlier in this file, so
# it is presumably never found among the columns here - confirm.

# new field; stays '' when none of the fuel flags is set
car["fuel_drz"] = ''
# Only fuels that exist as a column are used (the match is case sensitive).
# np.intersect1d returns them sorted; a later fuel overwrites an earlier one.
present_fuels = np.intersect1d(Fuels,list(car.columns))
for fuel in present_fuels:
    is_fuel = car[fuel] == True
    car.loc[is_fuel,"fuel_drz"] = fuel


## Date and time operations

Age of car, APK etc.

In [None]:
# Date of auction based on index name: the leading '<year>-<month>' part
car["now"] = [pd.to_datetime(re.search('([0-9]{4}-[0-9]+)-.*',i)[1],format='%Y-%m') for i in car.index.values]

# APK / Manufacture date / Year
# Empty strings (and years <= 0) give a missing value.
# (np.nan: the np.NaN alias was removed in NumPy 2.0.)
car['apk_drz'] = car.APKdate.apply(lambda t: pd.to_datetime(t,format='%d.%m.%Y') if len(t) != 0 else np.nan)
car['MF_full_ser'] = car.Mfdate.apply(lambda t: pd.to_datetime(t,format='%d.%m.%Y') if len(t) != 0 else np.nan)
car['MF_year_ser'] = car.Mfyear.apply(lambda t: pd.to_datetime(t,format='%Y') if t>0 else np.nan)

# Choose MF year or full date: the full date wins, the year is the fallback
car["MF_drz"] = car.MF_full_ser.copy()
chooseShort = car.MF_full_ser.isna() & car.MF_year_ser.notna()
car.loc[chooseShort,"MF_drz"] = car.loc[chooseShort,"MF_year_ser"]


- - - -
# Add rdw info into dataframe

### Age of query

Older queries might not reflect the information that was current at the time of the auction (e.g. inspection date).

In [None]:
# get time stamp from added columns (new format since Apr 2019)
# Only values of exactly 8 characters are parsed as YYYYMMDD.
# (np.nan: the np.NaN alias was removed in NumPy 2.0.)
car['rdw_ser'] = car.rdw_TimeStamp_x.apply(lambda x:pd.to_datetime(x, format='%Y%m%d') if len(x)==8 else np.nan)

# get time stamp from dict, for lots that have none yet
if 'rdwinfo' in car.columns:
    sel = car['rdw_ser'].isna()
    car.loc[sel,'rdw_ser'] = car.loc[sel,'rdwinfo'].apply(lambda x:pd.to_datetime(x['TimeStamp'],format='%Y%m%d') if isinstance(x, dict) else np.nan)

In [None]:
if VERBOSE > 0:
    # plot query age: days between the rdw query and the auction month
    sel = car['rdw_ser'].isna()
    car['RDW_age'] = np.nan  # np.NaN alias was removed in NumPy 2.0
    car.loc[~sel, "RDW_age"] = (car.loc[~sel,'rdw_ser'] - car.now).apply(lambda x:x.days)
    car["RDW_age"].plot(marker=',', figsize=[16,2])
    plt.xlabel('lot')
    plt.ylabel('age (days)')
    plt.title('Age of rdw query since auction')

In [None]:
if VERBOSE > 0:
    # Nr of rdw entries
    # Helper frame with one row per lot.
    df_ = pd.DataFrame(index=car.index, columns=['old_style', 'number_of_rdw_entries'])
    # old style: the rdw info of the lot is stored as a dict in 'rdwinfo'
    df_.old_style = car.rdwinfo.apply(lambda x: isinstance(x, dict))
    sel = df_.old_style
    # old style: count the keys of the dict
    df_.loc[sel, 'number_of_rdw_entries'] = car.loc[sel, 'rdwinfo'].apply(len)

    # other lots: count the non-missing values in the 'rdw_*' columns
    rdw_cols = car.columns.str.startswith('rdw_')
    sel = ~df_.old_style
    df_.loc[sel, 'number_of_rdw_entries'] = car.loc[sel, rdw_cols].notna().sum(axis='columns')
    df_.plot(marker=',', linestyle='', figsize=[16,4], legend=False)
    plt.xlabel('lot')
    plt.title('Number of rdw fields')
    plt.ylabel('# entries')
    print('some old-style (dict) rdw info is retrofitted. In 2021-01 ovi data may have been added.')

### Rename RDW info
`_<field name>` indicates that RDW field might already exist in the auction info. The leading underscore prevents overwriting.


In [None]:
# rename rdw fields
# Keys are rdw column names, values the short names used from here on.
# A leading underscore marks a field that might already exist in the auction
# info, so the rdw value does not overwrite it.
# NOTE(review): "height", "bpm", "newprice", "maxspeed" and the ovi fields
# have no underscore; presumably they have no auction counterpart - confirm.
Map = {
    "rdw_aantal_cilinders": "_nCyl",
    "rdw_cilinderinhoud": "_cylvol",
    "rdw_aantal_zitplaatsen": "_nSeat",
    "rdw_basisgegevens_aantal_zitplaatsen_bovengrens": "_nSeat2", 
    "rdw_aantal_deuren" : "_nDoor",
    "rdw_basisgegevens_aantal_deuren_bovengrens": "_nDoor2",
    "rdw_breedte": "_width",
    "rdw_lengte": "_length",
    "rdw_wielbasis": "_wheelbase",
    "rdw_massa_ledig_voertuig":"_weight",
    "rdw_basisgegevens_breedte_voertuig_uitvoering_bovengrens": "_width2",
    "rdw_basisgegevens_lengte_voertuig_uitvoering_bovengrens": "_length2",
    "rdw_basisgegevens_wielbais_bovengrens": "_wheelbase2",
    "rdw_basisgegevens_massa_leeg_voertuig_bovengrens": "_weight2",
    "rdw_basisgegevens_hoogte_voertuig_uitvoering_bovengrens": "height", 
    "rdw_bruto_bpm" : "bpm",
    "rdw_catalogusprijs": "newprice",
    "rdw_datum_tenaamstelling": "_reglast_str",
    "rdw_datum_eerste_afgifte_nederland": "_regnl_str",
    "rdw_datum_eerste_toelating": "_regfirst_str",
    "rdw_vervaldatum_apk":"_apk_str",
    "rdw_inrichting":"_body2",
    "rdw_eerste_kleur":"_color",
    "rdw_merk":"_brand",
    "rdw_handelsbenaming":"_model",
    "rdw_taxi_indicator":"_taxi",
    "rdw_basisgegevens_kant_van_het_stuur":"_rhd",
    "rdw_basisgegevens_max_constructie_snelheid_bovengrens": "maxspeed",
    "rdw_ovi_private_owners": "private_owners",
    "rdw_ovi_company_owner": "company_owners",
    "rdw_ovi_under_survey": "under_survey", #WOK
    
}

# Should all exist in dataframe
assert len(np.setdiff1d(list(Map.keys()), car.columns)) == 0

car.rename(columns=Map, inplace=True)


In [None]:
# aggregate rdw fields with index numbers stored in dicts
#   this can be mean, string join, first or whatever
# An empty dict gives a missing value (np.nan or '').
# (np.nan: the np.NaN alias was removed in NumPy 2.0.)
car['power'] = car.rdw_brandstof_nettomaximumvermogen_concat.apply(
    lambda x: np.mean(list(x.values())) if len(x) > 0 else np.nan)

car["milage"] = car.rdw_brandstof_brandstofverbruik_gecombineerd_concat.apply(
    lambda x: np.mean(list(x.values())) if len(x) > 0 else np.nan)

car['_fuel'] = car.rdw_brandstof_brandstof_omschrijving_concat.apply(
    lambda x:'/'.join(x.values()) if len(x) > 0 else '')

car['_body1'] = car.rdw_carrosserie_type_carrosserie_europese_omschrijving_concat.apply(
    lambda x: list(x.values())[0] if len(x) > 0 else '')

car['_nCyl1'] = car.rdw_motor_uitvoering_aantal_cilinders_concat.apply(
    lambda x: list(x.values())[0] if len(x) > 0 else np.nan)

car['_cylvol1'] = car.rdw_motor_uitvoering_cilinderinhoud_cm3_concat.apply(
    lambda x: list(x.values())[0] if len(x) > 0 else np.nan)

car['_automatic'] = car.rdw_versnellingsbak_uitvoering_type_versnellingsbak_concat.apply(
    lambda x: list(x.values())[0] if len(x) > 0 else np.nan)

# NOTE(review): max() over the dict keys compares strings, so '9' sorts after
# '10' - confirm this picks the intended entry.
car['energylab'] = car.rdw_uitvoeringverbruik_per_uitgave_verbruikcategorie_uitvoering_concat.apply(
    lambda x: x[max(x.keys())] if len(x) > 0 else '')

car['_hybrid'] = car.rdw_motor_uitvoering_hybride_elektrisch_voertuig_concat.apply(
    lambda x: list(x.values())[0] if len(x) > 0 else np.nan)

car['nGear'] = car.rdw_versnellingsbak_uitvoering_aantal_versnellingen_bovengrens_concat.apply(
    lambda x: x[max(x.keys())] if len(x) > 0 else np.nan)

# Add LPG specification to fuel: '<fuel>/<type of gas installation>'
is_lpg = car._fuel.str.lower().str.contains('lpg')
car.loc[is_lpg, '_fuel'] = car.loc[is_lpg, ['_fuel', 'rdw_type_gasinstallatie']].apply('/'.join, axis='columns')

if VERBOSE > 1:
    display(car.loc[car['_fuel'].drop_duplicates().index, ['_fuel', 'rdw_brandstof_brandstof_omschrijving_concat', 'rdw_type_gasinstallatie']])
    


In [None]:
# add old style dict entries as new columns
# key in the rdwinfo dict -> column in `car`
map_old2new = {
    "aantal_cilinders": "_nCyl",
    "cilinderinhoud": "_cylvol",
    "nettomaximumvermogen": "power",
    "brandstofverbruik_gecombineerd":"milage",
    "aantal_zitplaatsen": "_nSeat",
    "aantal_deuren" : "_nDoor",
    "breedte": "_width",
    "lengte": "_length",
    "wielbasis": "_wheelbase",
    "massa_ledig_voertuig":"_weight",
    "bruto_bpm" : "bpm",
    "catalogusprijs": "newprice",
    "datum_tenaamstelling": "_reglast_str",
    "datum_eerste_afgifte_nederland": "_regnl_str",
    "datum_eerste_toelating": "_regfirst_str",
    "vervaldatum_apk":"_apk_str",
    "type_carrosserie_europese_omschrijving":"_body1",
    "inrichting":"_body2",
    "brandstof_omschrijving":"_fuel",
    "eerste_kleur":"_color",
    "merk":"_brand",
    "handelsbenaming":"_model",
    "taxi_indicator":"_taxi",
}

for rdw_key, new_column in map_old2new.items():

    # lots whose rdwinfo is a dict that holds this key
    is_old_style = car.rdwinfo.apply(lambda row: rdw_key in row.keys() if isinstance(row, dict) else False)
    # each entry is a dict itself; collect its values as a list
    # (the former unassigned `.apply(lambda x: x[rdw_key][0])` statement
    # only computed a result that was thrown away and has been removed)
    values = car.rdwinfo[is_old_style].apply(lambda x: list(x[rdw_key].values()))

    if VERBOSE > 0:
        print(f'{rdw_key:40s} -> {new_column}')

    if values.apply(len).max() == 1:
        # Column should exist. Hence insert with mask
        car.loc[is_old_style, new_column] = values.apply(lambda x: x[0])

    elif new_column == '_fuel':
        # several fuels per lot: join them in the order given by
        # 'brandstof_volgnummer' and append the gas installation type(s)
        order = car.rdwinfo[is_old_style].apply(lambda x: list(x['brandstof_volgnummer'].values()))
        ordered_values = [[v[i-1] for i in o] for o, v in zip(order, values)]
        concat_values = [*map('/'.join, ordered_values)]
        tank_type = car.rdwinfo[is_old_style].apply(lambda x: list(x['type_gasinstallatie'].values()) if "type_gasinstallatie" in x else []) # will include CNG
        fuel = pd.Series({i: f + '/' + '/'.join(tank_type.loc[i]) if len(tank_type.loc[i]) > 0 else f for f, i in zip(concat_values, tank_type.index)})
        car.loc[is_old_style, new_column] = fuel

    # NOTE(review): any other key with more than one value per lot is not
    # copied at all - confirm that this is intended.

    if VERBOSE > 1:
        try:
            print(car.loc[is_old_style, new_column].value_counts().sort_index())
        except TypeError:
            print(car.loc[is_old_style, new_column].astype(str).value_counts().sort_index())




## preprocessing of rdw info

In [None]:
# column with the date in rdw format -> new datetime column
map_str2date = {
    "_regfirst_str":"regfirst_rdw",
    "_regnl_str":"regnl_rdw",
    "_reglast_str":"reglast_rdw",
    "_apk_str":"apk_rdw",
}

if VERBOSE > 1:
    # first value and its type per auction ('<year>-<month>' of the index)
    gb_ = car[[*map_str2date.keys()]].groupby([*map(lambda x: '{0}-{1:>2s}'.format(*x.split('-')[0:2]), car.index)])
    df_ = pd.concat([
        gb_.first(),
        gb_.first().applymap(type)
    ], axis=1)
    display(df_)

# lookup table when format changed
# `pattern` is either a strptime format, or a list of functions that are
# applied first, followed by the strptime format.
# (built directly: DataFrame.append was removed in pandas 2.0)
date_format = pd.DataFrame(columns=['start', 'end', 'pattern'],
                           index=['pat1', 'pat2', 'pat3'],
                           data=[
                               ['1769-1-1', '2019-03-01', '%d/%m/%Y'], #25/11/2008
                               ['2019-03-01', '2019-04-01', [lambda x: f'{x:.0f}', '%Y%m%d']],#20021112 convert to string first
                               ['2019-04-01', '2100-01-01', [lambda x: f'{x:.0f}', '%Y%m%d']] #2.00902e+07
                           ])
query_date = car.rdw_ser.fillna(pd.NaT)

# Loop over columns that need transforming
for rdw_key, new_column in map_str2date.items():

    car[new_column] = pd.NaT

    if VERBOSE > 0:
        print(f'{rdw_key:10s} -> {new_column}')

    # Do per formatting version
    for fmt_name, fmt in date_format.iterrows():
        if VERBOSE > 1:
            print(f'{fmt_name}: ')
            for k,v in fmt.items():
                print(f'\t{k}: {v}')

        # Select rows with formatting, based on the date of the rdw query
        sel = (query_date >= fmt.start) & (query_date < fmt.end)
        old_fmt = car.loc[sel, rdw_key]

        # only when rows are selected: min()/max() of nothing is NaT, which
        # has no strftime
        if VERBOSE > 1 and sel.any():
            print('Query dates (check with start, end):')
            print(f'\t{query_date[sel].min().strftime("%Y-%m-%d")} .. {query_date[sel].max().strftime("%Y-%m-%d")}')
            print(f'Format in {rdw_key} (check with pattern):')
            # positional access; old_fmt already holds the selected rows only
            print(f'\t{old_fmt.iloc[0]} .. {old_fmt.iloc[-1]}')

        # Convert to datetime. Optionally apply modifier if list.
        if isinstance(fmt.pattern, list):
            for fun in fmt.pattern[:-1]:
                old_fmt = old_fmt.apply(fun)
            date_pat = fmt.pattern[-1]
        else:
            date_pat = fmt.pattern

        # Update dataframe with the values that could be converted.
        # Written through `car.loc`: `car[new_column].update(...)` changes
        # the Series returned by the lookup, which is not guaranteed to
        # write through to `car` (pandas copy-on-write).
        new_fmt = old_fmt.apply(lambda d: pd.to_datetime(d, format=date_pat))
        new_fmt.name = new_column
        converted = new_fmt.dropna()
        car.loc[converted.index, new_column] = converted
        if VERBOSE > 1:
            print(f'{car[new_column].dropna().shape[0]} of {car.shape[0]} are modified after applying {fmt_name}')

# remove string values
car.drop(columns=list(map_str2date.keys()), inplace = True)

In [None]:
# Time between the auction date (`now`) and each rdw date column, stored
# under '<column>_age'.
age = car[map_str2date.values()].apply(lambda c: car.now - c).add_suffix('_age')
age_columns_exist = all(age.columns.isin(car.columns))
if not age_columns_exist:
    # first run: append the new columns
    car = pd.concat([car, age], axis=1)
else:
    # re-run: refresh the existing columns
    car.update(age)


In [None]:
# Model and brand as text in every row (the rdw value is sometimes
# numerical).
for text_column in ('_model', '_brand'):
    car[text_column] = car[text_column].astype(str)


In [None]:
# Lower-case these text columns.
cols = ['_body1','_body2','_fuel','_model']
car[cols] = car[cols].apply(lambda column: column.str.lower())

In [None]:
# Remove the rdw brand name from the start of the rdw model name.
car._model = car.apply(lambda lot: remove_brand(lot['_model'], lot['_brand']), axis=1)

# Special case: for brand 'JAGUAR CARS' also strip a leading 'JAGUAR'.
sel = car._brand == 'JAGUAR CARS'
car.loc[sel,'_model'] = car.loc[sel,:].apply(lambda lot: remove_brand(lot['_model'], 'JAGUAR'), axis=1)

In [None]:
# Translate the rdw category codes into booleans; other values are kept.
code_maps = {
    '_rhd': {'R': True, 'L': False},
    '_automatic': {'A': True, 'H': False, 'C': True},
    '_hybrid': {'J': True, 'N': False},
}
for code_column, code_map in code_maps.items():
    car[code_column] = car[code_column].replace(code_map)
#rdw_hefas 
#rdw_hefas 

In [None]:
# Rescale these dimensions by a factor 10 so they are consistent with
# the other fields.
# NOTE(review): the original comment read "m to cm", which does not match
# a factor of 10 -- confirm the source and target units.
for dim_col in ('_wheelbase', '_length', '_width'):
    car[dim_col] = car[dim_col] * 10.0

In [None]:
# Replace the "not applicable" / "not registered" colour values with
# an empty string.
color_placeholders = {'N.v.t.': '', 'Niet geregistreerd': ''}
car['_color'] = car['_color'].replace(color_placeholders)

In [None]:
# Numerical fields
# weight, width, length, height, etc
#
# A zero in these columns is replaced by the value used elsewhere in this
# notebook for "no value": -1 in integer columns, NaN in float columns.
for fld in [
    '_length',
    '_width',
    '_nDoor',
    '_nDoor2',
]:
    fld_dtype = car[fld].dtype
    # Use the pandas dtype helpers instead of `dtype == 'int'`: the meaning
    # of the string 'int' depends on platform and NumPy version.
    if pd.api.types.is_integer_dtype(fld_dtype):
        car[fld] = car[fld].replace({0: -1})
        if VERBOSE > 0:
            print(f'replace 0 with -1 in {fld}')
    elif pd.api.types.is_float_dtype(fld_dtype):
        # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
        car[fld] = car[fld].replace({0.0: np.nan})
        if VERBOSE > 0:
            print(f'replace 0.0 with NaN in {fld}')
    else:
        # Anything else (e.g. object dtype) means an earlier step went wrong.
        raise TypeError(
            f'{fld}: expected an integer or float column, got {fld_dtype}'
        )


if VERBOSE > 1:
    print('These have zeros that possibly need replacing too')
    print([c for c in car.columns if (car[c]==0).any()])


### Use auction info or RDW info

In [None]:
# make same string values as rdw
# Assign the result back instead of calling replace(..., inplace=True) on
# an attribute-accessed column: that form operates on an intermediate
# Series and does not update `car` when pandas Copy-on-Write is active.
car['_body2'] = car['_body2'].replace({'niet geregistreerd': ''})
car['taxi_drz'] = car['taxi'].replace({False: 'Nee', True: 'Ja'})


In [None]:
# Select a single value if a property exists in multiple fields

# Pairs to compare: [first source, second source, output column]
#    The first source is chosen whenever it holds a value, also when the
#    second source disagrees; the second source is used only when the
#    first is missing.
fldpairs = [
    ['_taxi','taxi_drz','taxi'],
    ['regfirst_rdw','MF_drz','MF'],
    ['apk_drz','apk_rdw','apk'],
    ['_body1','_body2','bodytype'],
    ['_fuel','fuel_drz','fuel'],
    ['ItemBrand','_brand','brand'],
    ['model_drz','_model','model'],
    ['_nDoor2', '_nDoor', 'nDoor'],
    ['_nSeat2', '_nSeat', 'nSeat'],
    ['_nCyl1', '_nCyl', 'nCyl'],
    ['_cylvol1', '_cylvol', 'cylvol'],
    ['rhd_drz', '_rhd', 'rhd'],
    ['_wheelbase2', '_wheelbase', 'wheelbase'],
    ['_width2', '_width', 'width'],
    ['_length2', '_length', 'length'],
    ['_weight2', '_weight', 'weight'],
    ['automatic_drz', '_automatic', 'automatic'],
    ['_hybrid', 'hybrid_drz', 'hybrid'],
]

for flds in fldpairs:
    first, second, target = flds

    # Rows where both sources hold a real value (not NaN, '', 'nan' or -1)
    # and those values differ. Only used for the report printed below.
    isDiff = np.logical_and.reduce((
        ~car[first].isna(),
        ~car[second].isna(),
        car[first] != '',
        car[second] != '',
        car[first] != 'nan',
        car[second] != 'nan',
        car[first] != -1,
        car[second] != -1,
        car[first] != car[second]
    ))
    if '_brand' in flds:
        # RDW can't handle special characters, so 'CITROEN' in the second
        # field is not reported as a difference (the first field is not
        # checked).
        isDiff = np.logical_and.reduce((
            isDiff,
            car[second] != 'CITROEN',
        ))

    has_first = ~car[first].isna()
    choosesecond = np.logical_and(~has_first, ~car[second].isna())

    # Initialise the output column with a "missing" value that fits the
    # dtype of the sources.
    first_dtype = car[first].dtype
    second_dtype = car[second].dtype
    if ('MF' in flds) or ('apk' in flds):
        car[target] = pd.NaT
    elif (first_dtype == float) and (second_dtype == float):
        # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
        car[target] = np.nan
    elif (first_dtype == int) and (second_dtype == int):
        car[target] = -1
    elif (first_dtype == str) and (second_dtype == str):
        # NOTE(review): an object-dtype column does not compare equal to
        # `str`, so text columns presumably end up in the `else` branch
        # below -- confirm which default is intended.
        car[target] = ''
    else:
        car[target] = 'NaN'

    # The first source wins whenever it is present; otherwise fall back to
    # the second source.
    car.loc[has_first, target] = car.loc[has_first, first]
    car.loc[choosesecond, target] = car.loc[choosesecond, second]

    if not isDiff.any():
        if VERBOSE > 0:
            print(f'{flds[-1]}: same in all auctions')
    else:
        if 'model' in flds:
            # show the brand columns next to the differing models
            df_ = car.loc[isDiff,['brand','_brand']+flds]
        else:
            df_ = car.loc[isDiff,flds]

        if VERBOSE == 1:
            # only recent
            df_ = df_.loc[df_.index.str.startswith(DATE)]

        if VERBOSE > 0:
            if df_.shape[0] > 0:
                display(df_)
            else:
                print(f'{flds[-1]}: no difference in {DATE} auction')
            

            



In [None]:
# convert back to boolean
# Assign the result back instead of calling replace(..., inplace=True) on
# an attribute-accessed column: that form operates on an intermediate
# Series and does not update `car` when pandas Copy-on-Write is active.
car['taxi'] = car['taxi'].replace({'Nee': False, 'Ja': True})

- - - - 
# Calculate extra info

In [None]:
# Four wheel drive flag, derived from the number of driven axles:
# -1 becomes '', 0 and 1 become False, and any remaining plain int
# greater than 1 becomes True.
driven_axles = car['rdw_basisgegevens_aantal_aangedreven_assen']
car['fwd'] = driven_axles.replace({-1: '', 0: False, 1: False})

# Only exact ints are compared; every other value type counts as False.
sel = car['fwd'].apply(lambda axles: type(axles) == int and axles > 1)
car.loc[sel, 'fwd'] = True

In [None]:
# Differences between pairs of date columns, stored as a number of days:
# (output column, minuend column, subtrahend column).
for day_col, end_col, start_col in (
    ("Age", "now", "MF"),
    ("APK_age", "now", "apk"),
    ("import_age", "regnl_rdw", "regfirst_rdw"),
):
    elapsed = car[end_col] - car[start_col]
    car[day_col] = elapsed.apply(lambda delta: delta.days)

In [None]:
# plot
# All '*_age' columns plus 'Age', expressed in years: Timedelta cells are
# converted through their day count, every other cell is divided by 365.25
# directly (presumably it already holds a number of days).
if VERBOSE > 0:
    age_cols = [c for c in car.columns if c.endswith('_age')] + ['Age']
    df_ = car[age_cols]\
        .applymap(lambda x: x.days/365.25 if isinstance(x, pd.Timedelta) else x/365.25)\
        .replace({pd.NaT: np.nan})

    if VERBOSE > 1:
        # One figure per column. `items()` replaces `iteritems()`, which was
        # removed in pandas 2.0.
        for k, s in df_.items():
            plt.figure(figsize=[16,2])
            s.plot(marker=',', linestyle='', alpha=1, figsize=[16,2], ms=2)
            plt.title(k)
            # label every figure, not only the last one
            plt.ylabel('age (year)')
    else:
        # All columns in a single figure.
        df_.plot(marker='s', linestyle='', alpha=0.4, figsize=[16,8], ms=2)
        plt.legend()
        plt.xlabel('lot')
        plt.ylabel('age (year)')
    


# Subselection and save

### Save data for ML

In [None]:
# Columns of `car` that make up the ML dataset.
save_cols = [
    "Price",
    "brand",
    "model",
    "Age",
    "fuel",
    "Odo",
    "APK_age",
    "import_age",
    "bodytype",
    "cylvol",
    "nCyl",
    "power",
    "weight",
    "bpm",
    "newprice",
    "nSeat",
    "nDoor", 
    "_color",
    'fwd',
    'maxspeed',
    'length',
    'height',
    'width',
    'automatic',
    'nGear',
    'energylab',
    'private_owners',
    'company_owners',
    'under_survey',
]
# New names for some of the saved columns; columns not listed keep their name.
map_lowercase = {
    'Price':'price',
    'Age':'age',
    'Odo':'odometer',
    'APK_age':'days_since_inspection_invalid',
    'import_age':'age_at_import',
    'bodytype':'body_type',
    'cylvol':'displacement',
    'nCyl':'number_of_cylinders',
    'bpm':'registration_tax',
    'newprice':'sale_price',
    'nSeat':'number_of_seats',
    'nDoor':'number_of_doors',
    '_color':'color',
    'maxspeed':'top_speed',
    'automatic':'automatic_gearbox',
    'nGear':'number_of_gears',
    'energylab': 'energy_label',
}

if VERBOSE > 1:
    print('Columns >> .. << are saved as car dataset')

    # Show every column name of `car`, sorted, in a grid that is `ncol`
    # wide; the names that are saved are wrapped in '>> .. <<'.
    ncol = 8
    marked = [
        f'>> {name} <<' if name in save_cols else name
        for name in sorted(car.columns)
    ]
    # pad with empty cells up to a multiple of ncol
    marked.extend([''] * (-len(marked) % ncol))
    df_ = pd.DataFrame(np.reshape(marked, (-1, ncol)), columns=[''] * ncol)
    # blank index so that no row labels are shown
    df_['ix'] = ''
    df_.set_index('ix', inplace=True)
    df_.index.name = ''
    with pd.option_context("display.max_rows", 999, "max_colwidth", 32):
        display(df_)

out = car[save_cols].rename(columns=map_lowercase)
if VERBOSE > 0:
    with pd.option_context("display.max_columns", 999):
        display(out.tail())

# Destination depends on the OPBOD setting.
file_name = (
    '../../../python-nb/data/cars-for-ml-opbod.pkl'
    if OPBOD
    else '../data/cars-for-ml.pkl'
)

# Write only when saving is enabled and the file does not exist yet.
saving_enabled = (SKIPSAVE == False)
if saving_enabled and not os.path.isfile(file_name):
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')    


### Save data for image classification

In [None]:
# Columns of `car` that make up the image classification dataset.
save_cols = [
    "Images",
    "brand",
    "model",
    "modelspec_drz",
    "_color",
    "Age",
    "bodytype",
    "cabriolet",
    "nDoor",
    "length",
    'height',
    'width',
    "wheelbase",
    "ForeignReg",
    "Reg",
    "taxi",
]
# New names for some of the saved columns; columns not listed keep their name.
map_lowercase = {
    'Images':'image_urls',
    'modelspec_drz':'model_specification',
    '_color':'color',
    'Age':'age',
    'bodytype':'body_type',
    'cabriolet':'convertible',
    'nDoor':'number_of_doors',
    'ForeignReg':'foreign_registration',
    'Reg':'registration_number'
}

out = car[save_cols].rename(columns=map_lowercase)
if VERBOSE > 0:
    with pd.option_context("display.max_columns", 999):
        display(out.tail())

# Destination depends on the OPBOD setting.
file_name = (
    '../../../python-nb/data/cars-for-imageclf-opbod.pkl'
    if OPBOD
    else '../data/cars-for-imageclf.pkl'
)

# Write only when saving is enabled and the file does not exist yet.
saving_enabled = (SKIPSAVE == False)
if saving_enabled and not os.path.isfile(file_name):
    print(file_name)
    out.to_pickle(file_name)
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')    

# Write example table to file
based on: https://stackoverflow.com/a/33869154

In [None]:
def pandas_df_to_markdown_table(df):
    '''
    Render a dataframe as pipe-separated text with a '-----' separator row.

    A single row of '-----' cells (one per column) is placed in front of
    the data, and the result is written as CSV text with '|' as separator
    and without the index. The header line comes from the column names.
    '''
    separator_row = pd.DataFrame([['-----'] * len(df.columns)], columns=df.columns)
    with_separator = pd.concat([separator_row, df])
    return with_separator.to_csv(sep="|", index=False)


In [None]:
# Load the saved ML dataset and write its last 10 rows as a markdown table.
fn = '../data/cars-for-ml.pkl'
print(f'load {fn}')
out = pd.read_pickle(fn)

file_name = '../assets/example-table-of-ml.md'
example = out.tail(10).copy()
# Trim some long fields, but only when the column is present: assigning
# through attribute access to a non-existing column sets a plain attribute
# on the dataframe (with a pandas UserWarning) instead of a column.
for long_col, placeholder in (
    ('rdwinfo', '.. rdw info ..'),
    ('Raw_text', '.. raw text ..'),
    ('SupInfo', '.. suplm. info. ..'),
):
    if long_col in example.columns:
        example[long_col] = placeholder

# convert to md
try:
    table_text = example.reset_index().to_markdown()
except ImportError:
    # to_markdown needs an optional dependency; use the local converter
    print('Fallback')
    table_text = pandas_df_to_markdown_table(example.reset_index())

# save
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    # explicit encoding so non-ASCII text is written the same on every platform
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(table_text)

    print('A markdown table is available as\n\t{}'.format(file_name))
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
    display(example)
    

In [None]:
# Load the saved image classification dataset and write its last 10 rows
# as a markdown table.
fn = '../data/cars-for-imageclf.pkl'
print(f'load {fn}')
out = pd.read_pickle(fn)

file_name = '../assets/example-table-of-imageclf.md'
example = out.tail(10).copy()
# Trim some long fields, but only when the column is present: assigning
# through attribute access to a non-existing column sets a plain attribute
# on the dataframe (with a pandas UserWarning) instead of a column.
for long_col, placeholder in (
    ('rdwinfo', '.. rdw info ..'),
    ('Raw_text', '.. raw text ..'),
    ('SupInfo', '.. suplm. info. ..'),
):
    if long_col in example.columns:
        example[long_col] = placeholder


# convert to md
try:
    table_text = example.reset_index().to_markdown()
except ImportError:
    # to_markdown needs an optional dependency; use the local converter
    print('Fallback')
    table_text = pandas_df_to_markdown_table(example.reset_index())

# save
if (SKIPSAVE==False) and (not(os.path.isfile(file_name))):
    # explicit encoding so non-ASCII text is written the same on every platform
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(table_text)

    print('A markdown table is available as\n\t{}'.format(file_name))
else:
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
    display(example)
    