# Explore auction results

Exploratory data analysis (EDA) of the results of a recent auction.

In [None]:
import drz_config
# Read the run configuration once and expose the settings used in the cells below.
cfg = drz_config.read_config()
DATE, VERBOSE, OPBOD = cfg['DATE'], cfg['VERBOSE'], cfg['OPBOD']

# The month counter is a two-character slice of the URL: the last two
# characters normally, or the two characters before those when OPBOD is set.
month_counter = cfg['URL'][-4:-2] if OPBOD else cfg['URL'][-2:]

# Echo the full configuration only when verbosity is switched on.
if VERBOSE > 0:
    display(cfg)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image

# Load results

Load both the auction results **and** the accompanying RDW data.

In [None]:
def _opbod_variant(path):
    """Return `path` with '.pkl' replaced by '-opbod.pkl' when OPBOD is set, else `path` unchanged."""
    return path.replace('.pkl', '-opbod.pkl') if OPBOD else path


# NOTE(review): read_pickle can execute arbitrary code; only load files you created yourself.

# Auction results
file_name = _opbod_variant(f'../data/drz-data-{DATE}-{month_counter}.pkl')
print(file_name)
drz = pd.read_pickle(file_name)

# RDW data
file_name = _opbod_variant(f'../data/rdw-data-{DATE}-{month_counter}.pkl')
print(file_name)
rdw = pd.read_pickle(file_name)

# rdw column names
#   NOTE: format changed in April 2019: RDW info is appended to auction results as extra columns in dataframe
if pd.to_datetime(DATE, format='%Y-%m') >= pd.to_datetime('2019-04', format='%Y-%m'):
    # `rdwinfo` is only defined for the new layout; it lists every column prefixed with 'rdw_'.
    rdwinfo = [name for name in rdw.columns if name.startswith('rdw_')]

# fix odometer
# One-off manual correction that only runs for the September 2019 data set:
# some lots carry a NAP remark in the OdoKM field instead of a reading.
if pd.to_datetime(DATE, format='%Y-%m') == pd.to_datetime('2019-09', format='%Y-%m'):
    # Rows of rdw whose OdoKM holds one of the three NAP remarks.
    # NOTE(review): the mask is built from rdw but also used to index drz below,
    # which assumes both frames share the same index — confirm.
    sel = (rdw.OdoKM == 'volgens NAP logisch') | (rdw.OdoKM == 'volgens NAP onlogisch') | (rdw.OdoKM == 'volgens nap onlogisch')
    display(drz.loc[sel, ['Title', 'OdoKM', 'NAP']])
    
    # Lot 9106: print the raw text for reference, then set NAP and the odometer
    # reading by hand in both frames.
    idx = '2019-9-9106'
    print('\n'.join(drz.loc[idx,'Raw_text']))
    drz.loc[idx,'NAP'] = 'logisch'
    rdw.loc[idx,'NAP'] = 'logisch'
    drz.loc[idx,'OdoKM'] = '251.571'
    rdw.loc[idx,'OdoKM'] = '251.571'
    
    # Show the corrected row in both frames.
    display(drz.loc[[idx],['Title', 'OdoKM', 'NAP']])
    display(rdw.loc[[idx],['Title', 'OdoKM', 'NAP']])

    # Lot 9258: NAP set to 'onlogisch' and the odometer field blanked.
    idx = '2019-9-9258'
    print('\n'.join(drz.loc[idx,'Raw_text']))
    drz.loc[idx,'NAP'] = 'onlogisch'
    rdw.loc[idx,'NAP'] = 'onlogisch'
    drz.loc[idx,'OdoKM'] = ''
    rdw.loc[idx,'OdoKM'] = ''
    
    # Lot 9249: same treatment as lot 9258.
    idx = '2019-9-9249'
    print('\n'.join(drz.loc[idx,'Raw_text']))
    drz.loc[idx,'NAP'] = 'onlogisch'
    rdw.loc[idx,'NAP'] = 'onlogisch'
    drz.loc[idx,'OdoKM'] = ''
    rdw.loc[idx,'OdoKM'] = ''
    
    
    # Re-display the originally selected rows (mask computed before the fixes).
    display(rdw.loc[sel, ['Title', 'OdoKM', 'NAP']])
    display(drz.loc[sel, ['Title', 'OdoKM', 'NAP']])
    # Sanity check that no NAP remark is left in rdw.OdoKM.
    # NOTE(review): only two of the three variants selected above are checked;
    # the lower-case 'volgens nap onlogisch' is not part of this assertion.
    sel = (rdw.OdoKM == 'volgens NAP logisch') | (rdw.OdoKM == 'volgens NAP onlogisch')
    assert all(~sel)



# Cheap cars with valid inspection

In [None]:
# Passenger cars ("Personenauto") with a price above 0 and below 3000
# whose APK date is neither missing nor an empty string.
apk_missing = drz.APKdate.isnull() | drz.APKdate.apply(lambda x: x == '')
is_cheap = (
    (drz.LotType == "Personenauto")
    & (drz.Price < 3000)
    & (drz.Price > 0)
    & ~apk_missing
)

# Fallback: when nothing qualifies, flag the lot with the lowest positive price
# so the cells below still have a row to show.
if not is_cheap.any():
    positive_prices = drz.Price[drz.Price > 0]
    # Guard against a data set without any positive price (idxmin would raise).
    if not positive_prices.empty:
        # idxmin() returns the index *label*. argmin() returns a position inside
        # the filtered series, which does not correspond to the same row of the
        # full-length mask.
        is_cheap.loc[positive_prices.idxmin()] = True

drz.loc[is_cheap,
        ["Price","ItemType","ItemBrand","Mfdate","APKdate","OdoKM","automatic", "benzine"]].sort_values(by=['benzine', 'automatic', 'OdoKM'], ascending=False)

In [None]:
# Passenger cars listed with a price of exactly 0.
# NOTE(review): presumably these are lots for which no sale price was published — confirm.
is_reserved = (drz.LotType == "Personenauto") & (drz.Price == 0)

reserved_columns = ["Price","ItemType","ItemBrand","Mfdate","APKdate","OdoKM","automatic"]
drz.loc[is_reserved, reserved_columns].sort_values(
    by=['APKdate', 'automatic', 'OdoKM'],
    ascending=False,
)

In [None]:
# Before April 2019 the RDW data sits in a single 'rdwinfo' column; from then on
# it is spread over the columns listed in `rdwinfo`.
uses_single_rdw_column = pd.to_datetime(DATE, format='%Y-%m') < pd.to_datetime('2019-04', format='%Y-%m')
# `rdwinfo` is only evaluated for the new layout, where it is defined.
display(rdw.loc[is_cheap, ["rdwinfo"] if uses_single_rdw_column else rdwinfo])

In [None]:
# Show the RDW details of the first cheap car only.
if pd.to_datetime(DATE, format='%Y-%m') < pd.to_datetime('2019-04', format='%Y-%m'):
    # Select by position with .iloc: the index holds lot labels (strings), so a
    # bare `[0]` is not a valid label lookup and positional fallback is deprecated.
    display(rdw.loc[is_cheap,"rdwinfo"].iloc[0])
else:
    display(rdw.loc[is_cheap,rdwinfo].iloc[0,:])

# Pictures of lots

In [None]:
# Choose the lot whose pictures are shown: the most expensive one.
ix = drz.Price.idxmax()
print('Price: EUR {:g}\n\n'.format(drz.loc[ix,'Price']))

# Alternative selections (uncomment one to override `ix`):
# ix=drz.N_images.idxmax()
# Mfdate = drz.Mfdate.apply(lambda x: pd.to_datetime(x,format='%d.%m.%Y') if len(x)==10 else np.NaN)
# ix=Mfdate.idxmin(); print('year of mf: {:s}\n\n'.format(drz.loc[ix,'Mfdate']))
# ix = '2019-2-2005' # largest km/day
# ix = '2019-2-7345' # largest km

print(ix)
# For every image: print its URL and the lot's source, then render the image.
for u in drz.loc[ix,'Images']:
    print(u, drz.loc[ix,'Source'], sep='\n')
    display(Image(url=u,width=400))
    

# Usage

How many km are driven per year or day?

### Odometer conversion

In [9]:
def odo_str2float(df):
    '''
    Add numeric odometer columns to `df` (modified in place, nothing returned).

    The text columns `OdoKM` and `OdoMLS` are normalised first: every value
    that is not a non-empty string becomes 'nan', and 'onbekend' is replaced
    by 'nan'. Then three columns are written:

    - `OdoKM_num` / `OdoMLS_num`: the text with all '.' removed, as float.
    - `Odo`: `OdoKM_num`, or `OdoMLS_num` converted to km where only the
      miles reading is available.
    '''
    km_per_mile = 1.609344

    def _unknown_to_nan(value):
        # Only genuine, non-empty strings are kept; everything else is unknown.
        if type(value) is str and len(value) > 0:
            return value.replace('onbekend', 'nan')
        return 'nan'

    def _strip_dots(text):
        # '.' is removed before the float conversion (e.g. '251.571' -> '251571').
        return text.replace('.', '')

    # normalise the raw text columns
    df.OdoKM = df.OdoKM.apply(_unknown_to_nan)
    df.OdoMLS = df.OdoMLS.apply(_unknown_to_nan)

    # numeric versions
    df["OdoKM_num"] = df.OdoKM.apply(_strip_dots).astype(float)
    df["OdoMLS_num"] = df.OdoMLS.apply(_strip_dots).astype(float)

    # combined reading in km: fall back to miles where km is missing
    df["Odo"] = df.OdoKM_num.copy()
    only_miles = df.OdoKM_num.isna() & df.OdoMLS_num.notna()
    miles = df.loc[only_miles, "OdoMLS_num"].astype(float)
    df.loc[only_miles, "Odo"] = miles * km_per_mile

In [None]:
# Adds the numeric odometer columns to drz in place.
odo_str2float(drz)

# Raw text next to the derived numeric columns, for a visual check.
odometer_columns = ['Odo','OdoKM','OdoKM_num','OdoMLS','OdoMLS_num']
drz[odometer_columns]

### Dates

In [11]:
# twins
vc = drz.Mfdate.value_counts()
drz.loc[drz.Mfdate.isin([i for i in vc[vc > 1].index if i != '']),
       ['Reg', 'Mfdate', 'ItemBrand', 'ItemType']].sort_values(by='Mfdate')

Unnamed: 0,Reg,Mfdate,ItemBrand,ItemType


In [12]:
# twins
vc = rdw.rdw_typegoedkeuringsnummer.value_counts()
rdw.loc[rdw.rdw_typegoedkeuringsnummer.isin([i for i in vc[vc > 1].index if i != '']),
       ['Reg', 'Mfdate', 'ItemBrand', 'ItemType','rdw_typegoedkeuringsnummer']].sort_values(by='rdw_typegoedkeuringsnummer')

Unnamed: 0,Reg,Mfdate,ItemBrand,ItemType,rdw_typegoedkeuringsnummer
2021-11-1833,DSN-90-F,03.01.2019,PIAGGIO,vespa sprint,e1*168/2013*00138*00
2021-11-1824,DSN-95-X,28.11.2018,PIAGGIO,vespa sprint,e1*168/2013*00138*00
2021-11-1831,DXB-19-B,04.10.2019,PIAGGIO,vespa sprint,e1*168/2013*00138*00
2021-11-1812,DZX-78-D,29.01.2021,PIAGGIO,vespa sprint,e1*168/2013*00138*01
2021-11-1848,FBS-41-B,07.04.2021,PIAGGIO,vespa sprint,e1*168/2013*00138*01
2021-11-1810,FGG-30-V,22.04.2021,PIAGGIO,vespa sprint 50,e1*168/2013*00265*00
2021-11-1832,FGP-47-X,19.05.2021,PIAGGIO,vespa sprint 50,e1*168/2013*00265*00
2021-11-1844,F-698-PG,03.01.2014,PIAGGIO,C38,e3*2002/24*0565*01
2021-11-1801,F-353-GH,01.12.2011,PIAGGIO,c38,e3*2002/24*0565*01
2021-11-7016,NT-556-X,24.10.2012,HYUNDAI,i 10,e4*2001/116*0131*05


In [13]:
# The auction date is taken as the first day of the configured month.
AuctDate = pd.to_datetime(DATE, format = '%Y-%m')


def _parse_mfdate(value):
    """Parse a 'dd.mm.yyyy' Mfdate string; return NaT for anything that is not a usable date."""
    # Check the type first: missing values (None/NaN) are not strings and would
    # make the `in` and `len` checks below raise TypeError.
    if not isinstance(value, str):
        return pd.NaT
    if value in ('onbekend', '') or '-' in value or len(value) < 8:
        return pd.NaT
    return pd.to_datetime(value, format='%d.%m.%Y')


# serial date
# pd.NaT replaces np.NaN (an alias removed in NumPy 2.0); the outer to_datetime
# guarantees a datetime column even when every value is missing.
drz["Mfdate_ser"] = pd.to_datetime(drz.Mfdate.apply(_parse_mfdate))

# age
drz["Age"] = (AuctDate - drz.Mfdate_ser)
drz["Age_year"] = drz.Age.dt.days / 365.25

- - - -
# Continue with cars only

In [14]:
# Independent copies, so the row drops and added columns below leave drz untouched.
out = drz['Price'].copy()
feature_columns = ["ItemBrand","ItemType","Odo","Age_year"]
Features = drz[feature_columns].copy()

In [15]:
# Drop rows
Subsel = drz.LotType == 'Personenauto'
dropIx=drz[~Subsel].index
out.drop(index=dropIx,inplace=True)
Features.drop(index=dropIx,inplace=True)

### One-hot encoding of brand (model is not encoded yet)

In [None]:
# One indicator column per brand. The 'Brand_' prefix combined with the default
# '_' separator yields column names that start with 'Brand__'.
Dummies = pd.get_dummies(Features.ItemBrand, prefix='Brand_')
Features = pd.concat((Features, Dummies), axis='columns')

Features.describe()

# Plotting

In [17]:
def ecdf(data):
    """
    Return the points of an empirical cumulative distribution.

    Returns a tuple `(x, y)`: `x` is `data` sorted ascending (a list) and
    `y` is a numpy array with the fractions 0, 1/n, ..., (n-1)/n for the
    n values in `data`.
    """
    ordered = sorted(data)
    n_values = len(ordered)
    fractions = np.arange(n_values) / n_values
    return ordered, fractions

# # example
# x,y = ecdf(km_p_day)
# plt.plot(x,y)

In [18]:
# km per day
km_p_day = Features.Odo/(Features.Age_year*365.25).fillna(0)
sel = ~(km_p_day.isna() | (km_p_day == np.inf))
km_p_day = km_p_day.loc[sel]

In [None]:
# Plot ecdf and hist
x,y = ecdf(km_p_day)
xD,yD = ecdf(km_p_day[rdw.diesel])
xB,yB = ecdf(km_p_day[rdw.benzine])
fig,axs = plt.subplots(ncols=1,nrows=2, figsize=[8,10],sharex=True)

# ecdf
ax = axs[0]
ax.step(x,y*100)
ax.step(xD,yD*100)
ax.step(xB,yB*100)
ax.plot(np.median(x),50,'+',markersize=20)
ax.text(np.median(x),50,'Median {:.1f} km/day      '.format(np.median(x)),ha='right',va='center');
ax.set_title('Distribution of intensity of usage')
ax.set_ylabel('Fraction of cars (%)')

xl = axs[0].get_xlim()

# hist
ax = axs[1]
ax.hist(x,bins=np.logspace(-1,np.log10(xl[1]),50))
yl = axs[1].get_ylim()
ax.plot(np.array([1,1])*np.median(x),yl,'-',linewidth=3)
ax.set_xlabel('Average travel (km/day)')
ax.set_ylabel('count')

ax.text(
    km_p_day.min(),1,
    ' {} {}\n{}km since {}'.format(*rdw.loc[km_p_day.idxmin(),['ItemBrand','ItemType','OdoKM','Mfdate']].values),
    rotation=90,
    va='bottom',
    ha='center'
)

ax.text(
    km_p_day.max(),1,
    ' {} {}\n{}km since {}'.format(*rdw.loc[km_p_day.idxmax(),['ItemBrand','ItemType','OdoKM','Mfdate']].values),
    rotation=90,
    va='bottom',
    ha='center'
)

ax.set_xscale('log')


In [None]:
def _scatter(xs, ys, xlabel, ylabel, logx=False, logy=False, xlim=None):
    """Dot-plot `ys` against `xs` on the current pyplot figure, with optional log axes and x-limits."""
    plt.plot(xs, ys, '.')
    if logx:
        plt.xscale('log')
    if logy:
        plt.yscale('log')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if xlim is not None:
        plt.xlim(*xlim)


# price against odometer reading
_scatter(Features.Odo, out, 'odo (km converted)', 'Price', logx=True, logy=True)

# price against age
plt.figure()
_scatter(Features.Age_year, out, 'age (years)', 'Price', logy=True, xlim=(0, 50))

# odometer reading against age
plt.figure()
_scatter(Features.Age_year, Features.Odo, 'age (years)', 'odo (km converted)', xlim=(0, 50))

plt.show()



In [None]:
# Leave out the brand indicator columns (names starting with 'Brand__').
non_dummy_columns = [name for name in Features.columns if not name.startswith('Brand__')]
pd.plotting.scatter_matrix(Features[non_dummy_columns], figsize=[4,4])
plt.show()

In [None]:
# reserved price?
gr = [l[:1] for l in np.array([*out.index.str.split('-')])[:,2]]
gb = out.groupby(gr)
for g,_ in gb:
    x,y = ecdf(gb.get_group(g))
    plt.figure(figsize=[4,4])
    plt.step(x,y*len(y),'+:')
    min_x = np.array(x)[np.array(x) > 0].min()
    plt.xlim(left=min_x*0.2, right=min_x*1.8)
    plt.xlabel('price')
    plt.ylabel('cars')
    plt.title(f'lowest priced lots start with {g}')
    # lowest price is 174?
    out.value_counts().sort_index()