# Merge all auctions into one

Combines several datasets and does some simple analyses.  


### User variables

In [1]:
# Auction months to load, oldest first.
# format: yyyy-mm; yyyy: year, mm: month (%Y-%m)
# Covers 2017-03 up to and including 2019-05.
AuctionDates = (
    ['2017-{:02d}'.format(month) for month in range(3, 13)]
    + ['2018-{:02d}'.format(month) for month in range(1, 13)]
    + ['2019-{:02d}'.format(month) for month in range(1, 6)]
)

### Import modules 

In [2]:
import ast
import re

import matplotlib.pyplot as plt
import pandas as pd
# apply matplotlib's 'ggplot' style sheet to figures drawn after this point
plt.style.use('ggplot')

### Load all data

Each monthly auction is saved in its own .pkl file.

In [3]:
# Read one pickled data frame per auction month into `data`,
# keyed by the 'yyyy-mm' string from AuctionDates.
print('load data')
data = {}
for AuctionDate in AuctionDates:
    fn = '../data/rdw-data-{:s}.pkl'.format(AuctionDate)
    print(fn)
    df = pd.read_pickle(fn)
    data[AuctionDate] = df

# show the tail of the most recent auction as a quick sanity check
print('\nlast rows of last file')
data[AuctionDates[-1]].tail()

load data
../data/rdw-data-2017-03.pkl
../data/rdw-data-2017-04.pkl
../data/rdw-data-2017-05.pkl
../data/rdw-data-2017-06.pkl
../data/rdw-data-2017-07.pkl
../data/rdw-data-2017-08.pkl
../data/rdw-data-2017-09.pkl
../data/rdw-data-2017-10.pkl
../data/rdw-data-2017-11.pkl
../data/rdw-data-2017-12.pkl
../data/rdw-data-2018-01.pkl
../data/rdw-data-2018-02.pkl
../data/rdw-data-2018-03.pkl
../data/rdw-data-2018-04.pkl
../data/rdw-data-2018-05.pkl
../data/rdw-data-2018-06.pkl
../data/rdw-data-2018-07.pkl
../data/rdw-data-2018-08.pkl
../data/rdw-data-2018-09.pkl
../data/rdw-data-2018-10.pkl
../data/rdw-data-2018-11.pkl
../data/rdw-data-2018-12.pkl
../data/rdw-data-2019-01.pkl
../data/rdw-data-2019-02.pkl
../data/rdw-data-2019-03.pkl
../data/rdw-data-2019-04.pkl
../data/rdw-data-2019-05.pkl

last rows of last file


Unnamed: 0,Source,Title,Price,Draw,Raw_text,N_images,Images,Note,LotNr,LotType,...,rdw_brandstof_uitstoot_deeltjes_licht_1,rdw_brandstof_uitstoot_deeltjes_licht_2,rdw_brandstof_uitstoot_deeltjes_zwaar_1,rdw_brandstof_uitstoot_deeltjes_zwaar_2,rdw_carrosserie_carrosserietype_1,rdw_carrosserie_type_carrosserie_europese_omschrijving_1,rdw_carrosserie_specifiek_carrosserie_voertuig_nummer_code_volgnummer_1,rdw_carrosserie_specifiek_carrosserie_voertuig_nummer_europese_omschrijving_1,rdw_carrosserie_specifiek_carrosseriecode_1,rdw_TimeStamp
2019-5-9601,http://www.domeinenrz.nl/catalogi/verkoop_bij_...,Kavel K1900059601,1117.0,False,"[, Het kan zijn er dat er belangrijke onderdel...",1,[http://www.domeinenrz.nl/catalogi/ufc/static/...,False,K1900059601,Demontage fietsen (Partij ca. 50 stuks),...,,,,,,,,,,
2019-5-9602,http://www.domeinenrz.nl/catalogi/verkoop_bij_...,Kavel K1900059602,0.0,False,"[Type S40/16Q, Uitvoering 21.675 m3/HR @ 500pa...",3,[http://www.domeinenrz.nl/catalogi/ufc/static/...,False,K1900059602,LUCHTBEHANDELINGSKAST,...,,,,,,,,,,
2019-5-9700,http://www.domeinenrz.nl/catalogi/verkoop_bij_...,Kavel K1900059700,3000.0,False,"[Dikte 80 mm, Lengtes van 200 tot 290 cm*, Lic...",3,[http://www.domeinenrz.nl/catalogi/ufc/static/...,False,K1900059700,Partij isolatieplaten (w.o. type sandwich),...,,,,,,,,,,
2019-5-9701,http://www.domeinenrz.nl/catalogi/verkoop_bij_...,Kavel K1900059701,0.0,False,"[Type GG140SoE, Bouwjaar 2009, Aardgas, 140 kW...",5,[http://www.domeinenrz.nl/catalogi/ufc/static/...,False,K1900059701,Generator/ warmtekrachtmodule,...,,,,,,,,,,
2019-5-9702,http://www.domeinenrz.nl/catalogi/verkoop_bij_...,Kavel K1900059702,655.0,False,"[Type J15000 PRO 3, 14,7 kW, Bouwjaar 2017, Af...",2,[http://www.domeinenrz.nl/catalogi/ufc/static/...,False,K1900059702,Watergekoelde airco,...,,,,,,,,,,


### Consolidate format
Older files differ slightly, for example in naming conventions. Here all results are consolidated into one format.

In [4]:
# Images

# For every auction month, show which column holds the image links, the type
# of its first value and a short preview of that value.
print('Fields with images look like this:\n')
print('{:7s} | {:7s} | {:14s} | {:7s}\n{:7s}-+-{:s}-+-{:s}-+-{:s}'.format('Month','Column','Type','1st row','-'*7,'-'*7,'-'*14,'-'*46))
for k in data:
    # Positions of the image column ('Image' or 'Images', case-insensitive).
    # A plain list replaces the former `pd.np.where(...)`: the `pd.np` alias
    # was deprecated in pandas 1.0 and no longer exists in pandas 2.0.
    col_idx = [i for i, c in enumerate(data[k].columns) if c.lower() in ('image', 'images')]
    if not col_idx:
        # report the gap instead of failing with an IndexError
        print('{:7s} | no image column found'.format(k))
        continue
    image_col = data[k].columns[col_idx[0]]
    example = data[k].iloc[0, col_idx[0]]
    print('{:7s} | {:7s} | {}'.format(k,image_col,type(example)),end='')
    if isinstance(example, str):
        # strings are previewed by their first and last 20 characters
        print('  | "{} .. {}"'.format(example[0:20],example[-20:]))
    else:
        print(' | length {} '.format(len(example)))

Fields with images look like this:

Month   | Column  | Type           | 1st row
--------+---------+----------------+-----------------------------------------------
2017-03 | Image   | <class 'str'>  | "['http://www.domeine .. 1024/768/image.jpg']"
2017-04 | Image   | <class 'str'>  | "['http://www.domeine .. 1024/768/image.jpg']"
2017-05 | Image   | <class 'str'>  | "['http://www.domeine .. 1024/768/image.jpg']"
2017-06 | Image   | <class 'str'>  | "['http://www.domeine .. 1024/768/image.jpg']"
2017-07 | Image   | <class 'str'>  | "['http://www.domeine .. 1024/768/image.jpg']"
2017-08 | Image   | <class 'str'>  | "['http://www.domeine .. 1024/768/image.jpg']"
2017-09 | Image   | <class 'str'>  | "['http://www.domeine .. 1024/768/image.jpg']"
2017-10 | Image   | <class 'str'>  | "['http://www.domeine .. 1024/768/image.jpg']"
2017-11 | Images  | <class 'str'>  | "[http://www.domeinen .. /1024/768/image.jpg]"
2017-12 | Images  | <class 'str'>  | "[http://www.domeinen .. /1024/768/image.j

In [5]:
# Older files store the list of image links as a single string.
# Convert those strings to real Python lists.

def _parse_quoted_list(value):
    """Parse a string such as "['image1', 'image2']" into a list.

    Values that are not strings are returned unchanged, so the cell can be
    run more than once.
    """
    if not isinstance(value, str):
        return value
    # ast.literal_eval only accepts Python literals; the eval() used here
    # before would execute any expression present in the scraped text.
    return ast.literal_eval(value)


def _parse_unquoted_list(value):
    """Parse a string such as "[image1, image2]" into a list of strings.

    The surrounding brackets are removed and the remainder is split on commas
    (with optional spaces around them). Values that are not strings are
    returned unchanged, so the cell can be run more than once.
    """
    if not isinstance(value, str):
        return value
    inner = re.sub(r"^\[(.*)\]$", r"\1", value)
    # Splitting directly (instead of inserting quotes and calling eval) also
    # works when an element contains an apostrophe or a backslash.
    return re.split(' *, *', inner)


# convert string representation of list to real list
# "['image1', 'image2']"
for k in ['2017-03',
          '2017-04',
          '2017-05',
          '2017-06',
          '2017-07',
          '2017-08',
          '2017-09',
          '2017-10']:
    # 'Image' is dropped below; skip months that were already converted
    if 'Image' in data[k].columns:
        data[k]['Images'] = data[k]['Image'].apply(_parse_quoted_list)
        data[k] = data[k].drop(columns=['Image'])

# convert string representation of list without quotes to real list
# "[image1, image2]"
for k in ['2017-11',
          '2017-12',
          '2018-01',
          '2018-02',
          '2018-03',
          '2018-04']:
    data[k]['Images'] = data[k]['Images'].apply(_parse_unquoted_list)
    

In [6]:
# Other column names that changed in November 2017:
# maps the name used in the older files to the name used afterwards.
legacy_to_current = {
    'draw': 'Draw',
    'nr': 'LotNr',
    'raw': 'Raw_text',
    'misc': 'SupInfo',
    'footnote': 'Note',
    'jfq': 'jfc',
}

# apply the mapping to every auction from before the change
for k in ('2017-03', '2017-04', '2017-05', '2017-06',
          '2017-07', '2017-08', '2017-09', '2017-10'):
    data[k] = data[k].rename(columns=legacy_to_current)

# Merge all auctions into one data frame

In [7]:
# append data into one data frame
#
# The monthly frames are collected in a list and concatenated once at the end.
# The previous version called DataFrame.append inside the loop, which was
# removed in pandas 2.0 and copied the growing frame on every iteration; it
# also used the `pd.np` alias, which was removed in pandas 2.0 as well.
frames = []
all_columns = []      # columns seen so far, in order of first appearance
seen_columns = set()  # same content as all_columns, for fast membership tests
n_rows = 0
for k in sorted(data):
    current_columns = list(data[k].columns)
    if frames:
        # report how this month's columns differ from everything merged so far
        oldcols = sorted(seen_columns - set(current_columns))
        newcols = sorted(set(current_columns) - seen_columns)
        if oldcols:
            print('fields not present in {:s}'.format(k))
            for c in oldcols:
                print('\t{:s}'.format(c))
        if newcols:
            print('fields new in {:s}'.format(k))
            for c in newcols:
                print('\t{:s}'.format(c))

    for c in current_columns:
        if c not in seen_columns:
            seen_columns.add(c)
            all_columns.append(c)
    frames.append(data[k])
    n_rows += len(data[k])
    # shape the merged frame has after adding this month
    print(k,(n_rows, len(all_columns)))

# sort=False keeps the columns in order of first appearance; the original
# row index is kept because later cells parse it.
df = pd.concat(frames, sort=False)
assert df.shape == (n_rows, len(all_columns)), 'merged frame has an unexpected shape'
            

2017-03 (234, 52)
2017-04 (434, 52)
2017-05 (561, 52)
2017-06 (804, 52)
2017-07 (993, 52)
2017-08 (1238, 52)
2017-09 (1447, 52)
2017-10 (1626, 52)
fields not present in 2017-11
	disclaim3
	disclaim4
	hybrid
	import
	jfc
	no_igk
	no_nlreg193
	taxi
fields new in 2017-11
	N_images
	Source
	early_reg
	locked
	used_parts
	wo_frame
2017-11 (1877, 58)
fields not present in 2017-12
	disclaim3
	disclaim4
	hybrid
	import
	jfc
	lpg
	no_nlreg193
	no_orireg
	taxi
	used_parts
2017-12 (2118, 58)
fields not present in 2018-01
	disclaim3
	disclaim4
	jfc
	locked
	no_nlreg193
	no_orireg
	taxi
	used_parts
fields new in 2018-01
	disclaim_cr6
	import22_btw21
	import27_btw21
2018-01 (2393, 61)
fields not present in 2018-02
	disclaim3
	disclaim4
	disclaim_cr6
	electric
	import22_btw21
	import27_btw21
	jfc
	locked
	lpg
	no_nlreg193
	no_orireg
	no_regneeded
	taxi
	used_parts
2018-02 (2610, 61)
fields not present in 2018-03
	disclaim3
	disclaim4
	import
	import22_btw21
	import27_btw21
	jfc
	locked
	no_igk
	no_nl

# Continue with cars only

In [8]:
# keep only the lots whose LotType is 'Personenauto';
# .copy() gives an independent frame instead of a view on df
car = df.loc[df['LotType'] == 'Personenauto'].copy()

In [9]:
# print lots that are cars per auction
# Each index label is split on '-' into year, month and lot number; for every
# run of labels with the same year and month one row is printed with the
# first and the last lot number of that run.
print('Car lots\nYear | Month | First ... Last index')
# year '1999' marks the state in which no auction row has been printed yet
seen_year, seen_month, seen_lot = '1999', '0', '0'
for label in car.index:
    year, month, lot = label.split('-')
    if (year, month) != (seen_year, seen_month):
        if seen_year == '1999':
            # very first auction: finish the table header
            closing = '-----+-------+---------------------'
        else:
            # finish the row of the previous auction with its last lot
            closing = '.. {:s}'.format(seen_lot)
        opening = '\n{:4s} | {:>5s} |'.format(year, month) + ' {:>5s} .'.format(lot)
        print(closing + opening, end='')
        seen_year, seen_month = year, month
    seen_lot = lot
# finish the row of the final auction
print('.. {:s}'.format(seen_lot), end='')
car.tail()

Car lots
Year | Month | First ... Last index
-----+-------+---------------------
2017 |     3 |  2000 ... 8318
2017 |     4 |  2000 ... 7309
2017 |     5 |  2200 ... 8025
2017 |     6 |  2000 ... 7311
2017 |     7 |  2200 ... 8178
2017 |     8 |  2001 ... 7304
2017 |     9 |  2200 ... 8187
2017 |    10 |  2000 ... 7171
2017 |    11 |  2200 ... 8305
2017 |    12 |  2000 ... 7308
2018 |     1 |  2200 ... 8170
2018 |     2 |  2000 ... 7184
2018 |     3 |  2200 ... 8338
2018 |     4 |  2000 ... 7336
2018 |     5 |  2200 ... 8226
2018 |     6 |  2000 ... 7315
2018 |     7 |  2200 ... 8311
2018 |     8 |  2000 ... 7328
2018 |     9 |  2200 ... 8183
2018 |    10 |  2000 ... 7319
2018 |    11 |  2200 ... 8326
2018 |    12 |  2000 ... 7361
2019 |     1 |  2201 ... 8329
2019 |     2 |  2001 ... 7264
2019 |     3 |  2200 ... 8318
2019 |     4 |  2000 ... 7314
2019 |     5 |  2200 ... 8352

Unnamed: 0,LotNr,Price,Title,LotCat,LotType,ItemBrand,ItemType,Mfdate,Mfyear,APKdate,...,rdw_carrosserie_type_carrosserie_europese_omschrijving_1,rdw_carrosserie_specifiek_carrosserie_voertuig_nummer_code_volgnummer_1,rdw_carrosserie_specifiek_carrosserie_voertuig_nummer_europese_omschrijving_1,rdw_carrosserie_specifiek_carrosseriecode_1,rdw_TimeStamp,d_lic,btw21,rdw_type_gasinstallatie,rdw_brandstof_co2_uitstoot_gewogen_1,rdw_brandstof_co2_uitstoot_gewogen_2
2019-5-8223,K1900058223,2589.0,Kavel K1900058223,Voertuigen en onderdelen,Personenauto,MERCEDES BENZ,300 se,18.03.1988,,,...,Sedan,,,,20190511.0,False,False,,,
2019-5-8305,K1900058305,3099.0,Kavel K1900058305,Voertuigen en onderdelen,Personenauto,MERCEDES-BENZ,vito 109 cdi,02.08.2007,,,...,MPV,,,,20190511.0,False,False,,,
2019-5-8311,K1900058311,4254.0,Kavel K1900058311,Voertuigen en onderdelen,Personenauto,MERCEDES-BENZ,viano cdi 3.0,06.05.2008,,,...,Stationwagen,,,,20190511.0,False,False,,,
2019-5-8312,K1900058312,539.0,Kavel K1900058312,Voertuigen en onderdelen,Personenauto,VOLKSWAGEN,caddy,25.10.2006,,,...,,,,,,False,False,,,
2019-5-8352,K1900058352,1878.0,Kavel K1900058352,Voertuigen en onderdelen,Personenauto,FIAT,ducato; 11 k 2.0 jtd,07.07.2006,,25.05.2019,...,MPV,,,,20190511.0,False,False,,,


# Save file

In [10]:
# write the selected car lots to a pickle file and show where it went
fn = '../data/cars-from-all-auctions.pkl'
print(fn)
car.to_pickle(path=fn)

../data/cars-from-all-auctions.pkl


# Write example results to file
Based on: https://stackoverflow.com/a/33869154

In [11]:
def pandas_df_to_markdown_table(df):
    """Return `df` as pipe-separated text with a markdown separator row.

    A row of '-----' cells is placed in front of the data rows, so that it
    ends up directly below the column names, and the result is serialised
    with '|' as the field separator. The index is not written.
    """
    separator_row = pd.DataFrame([['-----'] * len(df.columns)], columns=df.columns)
    return pd.concat([separator_row, df]).to_csv(sep="|", index=False)


In [12]:
# Write the last ten car lots to a markdown file as an example table.
fn = '../assets/example-table-of-cars.md'
example = car.tail(10).copy()
# trim some long fields
# Item assignment is used instead of attribute assignment (example.name = ...):
# attribute assignment only overwrites a column that already exists and
# otherwise sets a plain Python attribute without touching the table.
placeholders = {
    'rdwinfo': '.. rdw info ..',
    'Raw_text': '.. raw text ..',
    'SupInfo': '.. suplm. info. ..',
}
for column, text in placeholders.items():
    # only overwrite existing columns, so no new column is added to the table
    if column in example.columns:
        example[column] = text
table_text = pandas_df_to_markdown_table(example)
# explicit encoding so the output does not depend on the platform default
with open(fn, 'w', encoding='utf-8') as file:
    file.write(table_text)

print('A markdown table is available as\n\t{}'.format(fn))

A markdown table is available as
	../assets/example-table-of-cars.md
