<a id='eda_top'>

# EDA and some basic analyses

## EDA  
- [regression](#eda_regress)
- [scatter matrix](#eda_scattermtx)
- [distributions](#eda_dist)
  - [ecdf](#eda_ecdf)
- [dendrogram](#eda_dendro)
- [TSNE](#eda_tsne)

## Basic analyses
The output figures are publishable and are saved to disk.  

- [Analysis of strategical bidding with non-round prices](#eda_digits)  
  Hot plot of occurence of digits in winning prices. 
   
- [Usage intensity](#eda_usage)  
  Above or below average usage (odometer reading) will determine car value. This is a quick analysis of usage intensity. 
  

In [1]:
import sys
import os
import re
import json

In [2]:
with open('../assets/drz-settings-current.json', 'r') as fid:
    cfg = json.load(fid)
print(cfg['AUCTION'])

OPBOD = cfg['AUCTION']['kind'] == 'opbod'
AUCTION_ID = cfg['AUCTION']['id']
DATA_DIR = cfg['FILE_LOCATION']['data_dir']
RESULTS_DIR = cfg['FILE_LOCATION']['report_dir']
VERBOSE = int(cfg['GENERAL']['verbose'])
SAVE_METHOD = cfg['GENERAL']['save_method']


{'kind': 'opbod', 'id': '2025-0501', 'date': '20250509'}


In [3]:
if SAVE_METHOD == 'skip_when_exist':
    do_save = lambda fn: not(os.path.isfile(fn))
elif SAVE_METHOD == 'always_overwrite':
    do_save = lambda _: True
elif SAVE_METHOD == 'skip_save':
    do_save = lambda _: False
else:
    raise NotImplementedError(f'SAVE_METHOD: {SAVE_METHOD} not implemented')

In [4]:
TAG_SINGLE = "nbconvert_instruction:remove_single_output"


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

import seaborn as sns

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.manifold import TSNE


In [6]:
# set figure defaults (needs to be in cell seperate from import sns)
plt.style.use([
    'default',
    f"{cfg['FILE_LOCATION']['app_dir']}/assets/movshon.mplstyle",
    f"{cfg['FILE_LOCATION']['app_dir']}/assets/context-notebook.mplstyle"
])

In [7]:
def diff_month(d1, d2):
    
    '''returns difference in months between two datetime objects'''
    
    return (d1.year - d2.year) * 12 + d1.month - d2.month

def car_name(row):
    
    '''From dataframe row create a sensible name to identify car'''
    
    # index of row
    idx = row.name.strip()
    # translate str to datatime object
    # auct = pd.datetime.strptime(idx[0:-5],'%Y-%m')
    auct = pd.to_datetime('-'.join(idx.split('-')[0:2]) + '-01', format='%Y-%m-%d')
    # Manufacture date and year
    mf = auct - row.age
    if pd.isnull(mf):
        mfy = "'??"
    else:
        mfy = "'" + str(mf.year)[-2:]
    # Inspection date and diff in months
    apk = auct - row.days_since_inspection_invalid
    apk_month = diff_month(auct,apk)
    # fill NaN
    row.fillna({
        'color': '?',
        'brand': '?',
        'model': '?',
        'body_type': '?',
        'number_of_doors': -0,
        'displacement': -0,
        'number_of_cylinders': -0,
        'fuel': '?',
        'age': pd.Timedelta(-999),
        'days_since_inspection_invalid': pd.Timedelta(-999),
        'odometer': -0
    },inplace=True)

    # construct name
    name = '{} {:7} {} {} {} {:g}drs {:g}cc {:g}cyl {} {:.1f}km/day '.format(
        mfy,
        row.color,
        row.brand,
        row.model,
        row.body_type,
        row.number_of_doors,
        row.displacement,
        row.number_of_cylinders,
        row.fuel,
        row.odometer/row.age.days)
    
    # add inspection if any
    if (row.days_since_inspection_invalid.days > 0):
        name += 'inspection invalid'
    else:
        name += 'inspection {:g}m'.format(-apk_month)
        
    return name


In [8]:
fn = f'{DATA_DIR}/cars-for-ml.pkl'
if OPBOD:
    fn = fn.replace('.pkl', '-opbod.pkl')
    
print(f'load {fn}')
df = pd.read_pickle(fn)
if VERBOSE > 1:
    print(df.columns)

# categories
cat_columns = ['brand', 'model', 'fuel', 'body_type','color', 'energy_label', 'fourwd']
if VERBOSE > 1:
    print(list(df[cat_columns]))

# sensible name
df['car_name'] = df.apply(car_name, axis='columns')

if VERBOSE > 0:
    display(df.tail(10), metadata={'tags': (TAG_SINGLE, )})
else:
    print('last lot:',df.index[-1])

load /home/tom/bin/satdatsci/Saturday-Datascience/data/cars-for-ml-opbod.pkl


<a href="#eda_top" id='eda_regress'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Price

In [9]:
def get_nr_nc(n):
    nr = int(np.ceil(n**(1/2)))
    nc = int(np.ceil(n/nr))
    return nr,nc

In [10]:
feat = np.setdiff1d(df.columns, cat_columns + ['price', 'car_name'])
X = df.loc[:,feat].astype('O').fillna(np.nan)
y = df.price.astype('O').fillna(np.nan)
print('features:\n', feat)


if VERBOSE > 0:
    nr,nc = get_nr_nc(len(feat))
    fig,axs = plt.subplots(nrows=nr, ncols=nc, figsize=[2*nc,2*nr], sharey=True)

    for iF,ax in enumerate(axs.ravel()):
        if iF >= len(feat):
            ax.set_visible(False)
            continue

        display({'text/html': feat[iF]}, raw=True, metadata={'tags': (TAG_SINGLE, )})
        ax.plot(X.iloc[:,iF],y,',k')
        ax.set_title(feat[iF])
        if feat[iF] == 'displacement':
            ax.set_xlabel('Engine displacement [cc]')
        elif feat[iF] == 'age':
            ax.set_xlabel('Age [days]')
        elif feat[iF] == 'age_at_import':
            ax.set_xlabel('Age at import [days]')
        elif feat[iF] == 'days_since_inspection_invalid':
            ax.set_xlabel('Inspection invalid [days]')
        elif feat[iF] == 'power':
            #nettomaximumvermogen
            ax.set_xlabel('[kW]')
        elif feat[iF] == 'odometer':
            ax.set_xlabel('[km]')
        elif feat[iF] == 'registration_tax':
            ax.set_xlabel('[EUR]')
        elif feat[iF] == 'sale_price':
            ax.set_xlabel('[EUR]')

    
    
    axs.ravel()[0].set_ylabel('price [EUR]')


features:
 ['age' 'age_at_import' 'automatic_gearbox' 'company_owners'
 'days_since_inspection_invalid' 'displacement' 'height' 'length'
 'number_of_cylinders' 'number_of_doors' 'number_of_gears'
 'number_of_seats' 'odometer' 'original_sale_price' 'power'
 'private_owners' 'registration_tax' 'top_speed' 'under_survey' 'weight'
 'width']


  X = df.loc[:,feat].astype('O').fillna(np.nan)
  y = df.price.astype('O').fillna(np.nan)


---
<a href="#eda_top" id='eda_scattermtx'><font size=+1><center>^^ TOP ^^</center></font></a>


In [11]:
yX = pd.concat([y, X], axis='columns').fillna(np.nan)

# group per car brand
yX['brand'] = df.brand
grouped = yX.groupby('brand')
VW = grouped.get_group('VOLKSWAGEN')
RENAULT = grouped.get_group('RENAULT')
MERC = grouped.get_group('MERCEDES-BENZ')
FORD = grouped.get_group('FORD')
OPEL = grouped.get_group('OPEL')
SMART = grouped.get_group('SMART')

is_diesel = df.fuel == 'diesel' # Future use


In [None]:
if VERBOSE > 1:
    df_ = MERC
    brand = df_.brand.values[0]
    num_cols = [c for c in df_.columns if df_[c].dtype != 'object']
    pd.plotting.scatter_matrix(df_[num_cols].astype(float), grid=True, figsize=[16,16], diagonal='kde')
    plt.suptitle(f'{brand} n={df_.shape[0]:.0f}\nAll numerical features')

<a href="#eda_top" id='eda_dist'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Distributions

In [13]:
def ecdf(data):
    d = data[data.notna()]
    y = np.linspace(0,1,len(d))
    x = sorted(d)
    return x,y

## Simple bar graphs

In [None]:
if VERBOSE > 0:
    # loop over fields
    for data, title_label, xaxis_label in zip(
        [
            yX.number_of_cylinders,
            yX.number_of_doors,
            yX.number_of_gears,
            yX.private_owners,
            yX.company_owners,
        ],
        [
            'Number of cylinders',
            'Number of doors',
            'Number of transmission gears',
            'Number of private owners',
            'Number of company owners',
        ],
        [
            'Cylinders',
            'Doors',
            'Gears',
            'Owners',
            'Owners',
        ]
    ):
        # "histogram"
        cnt = data.value_counts()
        # create figure
        plt.figure(figsize=[2,2])
        # plot bars
        plt.bar(x=cnt.index, height=cnt.values, edgecolor='k', facecolor='None')
        # prettify
        ax = plt.gca()
        ax.set_title(title_label, style='italic')
        ax.set_xlabel(xaxis_label, style='italic')
        ax.set_ylabel('Number of cars', style='italic')
        ax.set_xlim(left=-2, right=cnt.index.max()+1)
        ax.set_xticks(cnt.index)
        ax.xaxis.set_tick_params(which='minor', bottom=False)

<a href="#eda_top" id='eda_ecdf'><font size=+1><center>^^ TOP ^^</center></font></a>


## Ecdf of Length, Width, Height, Weight, Speed, Power etc.

Red is with imputed median

In [None]:
if VERBOSE > 0:
    # loop over fields
    for data, title_label, xaxis_label, bins in zip(
        # data
        [
            yX.length, 
            yX.width,
            yX.height,
            yX.weight,
            yX.top_speed,
            yX.power,
            yX.original_sale_price,
            yX.displacement,
        ],
        # title
        [
            'Length', 
            'Width',
            'Height',
            'Weight',
            'Top speed',
            'Power',
            'Sale price',
            'Engine displacement'
        ],
        # xlabel
        [
            'Length [mm]', 
            'Width [mm]',
            'Height [mm]',
            'Weight [kg]',
            'Speed [km/h]',
            'Power [kW]',
            'Price [EUR]',
            'Volume [cm^3]'
        ],
        # bins: left, right, nr of steps
        [
            (2000, 9000, 36), 
            (1000, 2500, 61),
            (1000, 5000, 81),
            (500, 5000, 46),
            (80, 350, 55), 
            (0, 500, 51), 
            (0, 500000, 51), 
            (0, 10000, 101)
        ],
    ):
        # separate tuple in 3 different values
        left_bin_edge, right_bin_edge, bin_steps = bins

        # prepare x and y from values
        x,y = ecdf(data)
        # store median value
        median_value = data.median()
        # reporting
        print('{}\n\tmin: {}\n\tmid: {}\n\tmax: {}\n\tbin: {} : {} : {} (n={})'.format(
            title_label, 
            x[0], median_value, x[-1], 
            left_bin_edge, (right_bin_edge-left_bin_edge)/(bin_steps-1), right_bin_edge, bin_steps-1
        ))
        # warn if out of bounds
        if (x[0] < left_bin_edge) or (x[-1] > right_bin_edge):
            print('>> CLIPPING <<\n')

        # plot full
        fig,axs=plt.subplots(figsize=[4,4], nrows=2, ncols=1, sharex=True)

        axs[0].step(x, y, markerfacecolor='k', markeredgecolor='none', marker='o', alpha=0.005, ms=16)
        axs[0].step(x, y, color='grey', linestyle='-', linewidth=2)

        axs[1].hist(x, bins=np.linspace(left_bin_edge, right_bin_edge, bin_steps), edgecolor='k', facecolor='None')
        yl=axs[1].get_ylim()
        axs[1].vlines(x=median_value, ymin=0, ymax=yl[1], color='b')
        axs[1].set_ylim(bottom=yl[0], top=yl[1])


        # prepare data with median replacing NaNs
        data = data.fillna(median_value)
        x,y = ecdf(data)

        # plot nan replaced
        axs[0].step(x, y, markerfacecolor='r', markeredgecolor='none', marker='o', alpha=0.01, ms=8)
        axs[0].step(x, y, color='pink', linestyle='-', linewidth=2)

        axs[1].hist(x, bins=np.linspace(left_bin_edge, right_bin_edge, bin_steps), edgecolor='r', facecolor='r', rwidth = 0.25)
        yl=axs[1].get_ylim()
        axs[1].vlines(x=median_value, ymin=0, ymax=yl[1], color='b')
        axs[1].set_ylim(bottom=yl[0], top=yl[1])

        # prettify
        axs[0].set_xlim(left=left_bin_edge, right=right_bin_edge)
        axs[0].set_title('Empirical Cumulative Distribution Function\n' + title_label, style='italic')
        axs[0].set_yticks(np.linspace(0,1.0,5))
        axs[1].set_xlabel(xaxis_label, style='italic')
        axs[0].set_ylabel('Fraction of total', style='italic')
        axs[1].set_ylabel('Number of cars\n(per bin)', style='italic')


## Age at import
zoom into two clusters of data

In [None]:
data = yX.age_at_import.apply(lambda x: x.days)/365.25
data = data[data>0]
x,y = ecdf(data)

if VERBOSE > 0:
    # plot full
    f = plt.figure(figsize=[8,4])
    ax = f.gca()
    ax.step(x, y, markerfacecolor='blue', markeredgecolor='none', marker='o', alpha=0.025, ms=16)
    ax.step(x, y, color='lightblue', linestyle='-', linewidth=2, label='Age at import')
    ax.set_title('Empirical Cumulative Distribution Function\nAge')
    ax.set_xlabel('age [years]')
    ax.set_ylabel('fraction of total')
    ax.set_xticks(range(0,55,5))
    ax.legend()


    # plot old/young
    fig,axs = plt.subplots(figsize=[8,4], ncols=2, nrows=1)
    axs[0].step(x, y, markerfacecolor='blue', markeredgecolor='none', marker='o', alpha=0.125, ms=16)
    axs[0].step(x, y, color='lightblue', linestyle='-', linewidth=2)
    axs[1].step(x, y, markerfacecolor='blue', markeredgecolor='none', marker='o', alpha=0.125, ms=16)
    axs[1].step(x, y, color='lightblue', linestyle='-', linewidth=2)
    axs[0].set_xlim(left=2, right=4)
    axs[1].set_xlim(left=15, right=30)
    axs[1].axvline(17.5, linestyle=':')
    axs[0].set_ylim(bottom=0.2, top=0.55)
    axs[1].set_ylim(bottom=0.9, top=1.02)
    axs[0].set_ylabel('fraction of total')
    axs[0].set_xlabel('age [years]')
    axs[1].set_xlabel('age [years]')
    axs[0].set_title('young')
    axs[1].set_title('old')


data = yX.age.apply(lambda x: x.days)/365.25
x,y = ecdf(data)

if VERBOSE > 0:
    # plot full
    ax.step(x, y, markerfacecolor='k', markeredgecolor='none', marker='o', alpha=0.025, ms=16)
    ax.step(x, y, color='grey', linestyle='-', linewidth=2)


    # plot old/young
    axs[0].step(x, y, markerfacecolor='k', markeredgecolor='none', marker='o', alpha=0.125, ms=16)
    axs[0].step(x, y, color='grey', linestyle='-', linewidth=2)
    axs[1].step(x, y, markerfacecolor='k', markeredgecolor='none', marker='o', alpha=0.125, ms=16)
    axs[1].step(x, y, color='grey', linestyle='-', linewidth=2)


## Price
on log scale

In [None]:
data = df.price
x,y = ecdf(data)

if VERBOSE > 0:
    
    # plot full
    fig,axs=plt.subplots(figsize=[4,4], nrows=2, ncols=1, sharex=True)

    axs[0].step(x, y, markerfacecolor='k', markeredgecolor='none', marker='o', alpha=0.005, ms=16)
    axs[0].step(x, y, color='grey', linestyle='-', linewidth=2)
    # axs[0].plot(median_usage, 0.5, marker='+', color='b', ms=64)
    # axs[0].text(median_usage,0.5,'Median: {:.1f} km/day      '.format(median_usage), ha='right', va='center', color='b', weight='bold')

    axs[1].hist(x, bins=np.logspace(1,6,81), edgecolor='k', facecolor='None')
    yl=axs[1].get_ylim()
    # axs[1].vlines(x=median_usage, ymin=0, ymax=yl[1], color='b')

    axs[0].set_xscale('log')
    axs[0].set_xlim(left=10, right=1000000)
    axs[0].set_title('Empirical Cumulative Distribution Function\nPrice', style='italic')
    axs[0].set_yticks(np.linspace(0,1.0,5))
    axs[1].set_xlabel('Price (EUR)', style='italic')
    axs[0].set_ylabel('Fraction of total', style='italic')
    axs[1].set_ylabel('Number of cars\n(per bin)', style='italic')


<a href="#eda_top" id='eda_dendro'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Dendrogram

This is a form of classification and produce a tree-like visualization

In [18]:
# dendrogram?

In [19]:

np.random.seed(42)

train = OPEL.drop(columns='brand').dropna()
if train.shape[0] < 10:
    train = OPEL.drop(columns=['brand', 'under_survey', 'private_owners', 'company_owners']).dropna()

for c in ['age', 'age_at_import', 'days_since_inspection_invalid']:
    train[c] = train[c].apply(lambda x: x.days)
    
mergings = linkage(train,method='complete')
brand_names = df.loc[train.index,'brand']
model_names = df.loc[train.index,'model']
color_names = df.loc[train.index,'color']
fuel_names = df.loc[train.index,'fuel']
lot_names = df.loc[train.index,:].index
names = df.loc[train.index,"car_name"]
# names = brand_names

level = 120000 # arbitrary!
labels = fcluster(mergings,level,criterion='distance')

if VERBOSE > 0:
    f = plt.figure(figsize=[16,4])
    ax = f.gca()
    dendrogram(mergings,
               labels=names,
               leaf_rotation=90,
               leaf_font_size=8,
               ax=ax
              );
    ax.plot([0,train.shape[0]*10],[level,level],'--')
    ax.xaxis.set_tick_params(which='minor', bottom=False)
    ax.xaxis.set_tick_params(which='major', bottom=True)

for l in range(1,labels.max()+1):
    print('\ncluster {}\n'.format(l))
    idx = train[labels == l].index
    for i in idx:
        print('\t{}'.format(df.loc[i,"car_name"]))
    

  ivl.append(labels[int(i - n)])



cluster 1

	'02 GRIJS   OPEL astra-g-cc Hatchback 4drs 1995cc 4cyl Diesel 57.7km/day inspection invalid
	'07 ZWART   OPEL astra Hatchback 4drs 1598cc 4cyl Benzine 76.0km/day inspection 3m
	'07 BLAUW   OPEL corsa MPV 4drs 1248cc 4cyl Diesel 66.6km/day inspection invalid

cluster 2

	'07 BLAUW   OPEL zafira MPV 4drs 1796cc 4cyl Benzine/LPG/G3 gasinstallatie 66.0km/day inspection invalid
	'01 GROEN   OPEL corsa-c Hatchback 4drs 1199cc 4cyl Benzine 40.0km/day inspection invalid
	'04 ZWART   OPEL astra Hatchback 4drs 1364cc 4cyl Benzine 48.0km/day inspection invalid
	'02 GRIJS   OPEL zafira-a Stationwagen 4drs 1598cc 4cyl Benzine 39.2km/day inspection invalid
	'05 ZWART   OPEL meriva-a MPV 4drs 1598cc 4cyl Benzine 47.0km/day inspection invalid
	'02 BLAUW   OPEL vectra-c-cc Hatchback 4drs 1796cc 4cyl Benzine 48.9km/day inspection invalid
	'05 ZWART   OPEL zafira-a Stationwagen 4drs 1598cc 4cyl Benzine 44.8km/day inspection 2m
	'11 GRIJS   OPEL astra sports tourer MPV 4drs 1248cc 4cyl Diesel

<a href="#eda_top" id='eda_tsne'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# tSNE

see if there are clusters

#### Train

In [20]:
# remove categories
train = df.drop(columns=['brand','model','fuel','body_type','color','car_name', 'energy_label', 'fourwd']).dropna()
if train.shape[0] < 100:
    train = df.drop(columns=['brand','model','fuel','body_type','color','car_name', 'energy_label', 'fourwd', 'under_survey', 'private_owners', 'company_owners']).dropna()
for c in ['age', 'age_at_import', 'days_since_inspection_invalid']:
    train[c] = train[c].apply(lambda x: x.days)
display(train.info())
display(train.describe())

model = TSNE(learning_rate = 100,verbose=1)
transformed = model.fit_transform(train)


<class 'pandas.core.frame.DataFrame'>
Index: 771 entries, 2021-09-1000 to 2024-11-1275
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   price                          771 non-null    float64
 1   age                            771 non-null    int64  
 2   odometer                       771 non-null    object 
 3   days_since_inspection_invalid  771 non-null    int64  
 4   age_at_import                  771 non-null    int64  
 5   displacement                   771 non-null    object 
 6   number_of_cylinders            771 non-null    float64
 7   power                          771 non-null    float64
 8   weight                         771 non-null    float64
 9   registration_tax               771 non-null    float64
 10  original_sale_price            771 non-null    float64
 11  number_of_seats                771 non-null    float64
 12  number_of_doors                771 

None

Unnamed: 0,price,age,days_since_inspection_invalid,age_at_import,number_of_cylinders,power,weight,registration_tax,original_sale_price,number_of_seats,number_of_doors,top_speed,length,height,width,number_of_gears,private_owners,company_owners
count,771.0,771.0,771.0,771.0,771.0,771.0,771.0,771.0,771.0,771.0,771.0,771.0,771.0,771.0,771.0,771.0,771.0,771.0
mean,3287.990921,5656.73022,95.631647,566.649805,4.088197,89.276005,1231.854734,5533.63035,19381.806744,4.97406,3.684825,189.533074,4256.661479,1511.632944,1747.857328,5.425422,3.415045,0.45655
std,7373.172691,1876.771466,376.374971,1202.916284,0.804212,44.608905,297.325965,5540.473415,27546.919515,0.654139,0.896177,25.40738,415.796058,102.642559,85.338077,0.857709,2.101077,0.899707
min,174.0,386.0,-1075.0,0.0,2.0,37.0,695.0,0.0,0.0,2.0,0.0,140.0,3410.0,1199.0,1475.0,1.0,0.0,0.0
25%,388.5,4265.0,-168.0,0.0,4.0,59.0,1025.0,2267.0,0.0,5.0,4.0,171.0,3990.0,1446.0,1690.0,5.0,2.0,0.0
50%,920.0,6030.0,5.0,0.0,4.0,79.0,1195.0,4160.0,11640.0,5.0,4.0,185.0,4280.0,1482.0,1748.0,5.0,3.0,0.0
75%,3125.0,7054.5,332.5,224.5,4.0,103.0,1375.0,6423.5,30728.0,5.0,4.0,203.0,4554.0,1543.0,1801.0,6.0,5.0,1.0
max,120010.0,9466.0,2099.0,7309.0,8.0,460.0,2645.0,52089.0,309722.0,9.0,5.0,329.0,5400.0,2095.0,2060.0,9.0,14.0,7.0


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 771 samples in 0.000s...
[t-SNE] Computed neighbors for 771 samples in 0.071s...
[t-SNE] Computed conditional probabilities for sample 771 / 771
[t-SNE] Mean sigma: 9002.422209
[t-SNE] KL divergence after 250 iterations with early exaggeration: 53.273979
[t-SNE] KL divergence after 1000 iterations: 0.316362


#### plot

In [None]:
if VERBOSE > 0:

    # Labeling

    # year
    year_bins = np.arange(
        df.loc[train.index,"age"].apply(lambda x: x.days).min(),
        df.loc[train.index,"age"].apply(lambda x: x.days).max(),365)
    year_labels = [i for i in range(len(year_bins)-1)]

    # price
    price_bins = np.logspace(1, 6, 11)
    price_labels = [i for i in range(len(price_bins)-1)]

    # color, fuel, brand
    labels = df.loc[train.index,'color']
    labels = df.loc[train.index,'fuel']
    labels = df.loc[train.index,'brand']
    labels = pd.factorize(labels)[0]

    labels = pd.cut(train.age,bins=year_bins,labels=year_labels)
    labels = pd.cut(train.price,bins=price_bins,labels=price_labels)

    # name labels
    names = df.loc[train.index,'model']
    # names = train.index

    # plot
    xs = transformed[:,0]
    ys = transformed[:,1]
    plt.figure(figsize=[16,16])
    plt.set_cmap('hot')
    plt.scatter(xs,ys,c=labels,marker='s')
    cm=plt.get_cmap()
    # text labels
    for x,y,t,l in zip(xs,ys,names,labels):
        c = cm(l/max(labels))
        plt.text(x,y,t,color='k',ha='center',va='bottom', alpha=0.5)


- - - - 
# Results

The results here below surpass eda and are beter made suitable for publication


<a href="#eda_top" id='eda_digits'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Last digits of price

In [22]:
# 2d hist
price = df.price.replace(-1, np.nan).dropna()
last0_digit = price.apply(lambda x: f'{x:.0f}').apply(lambda x:x[-1]).astype(int)
last1_digit = price.apply(lambda x: f'{x:.0f}').apply(lambda x:x[-2]).astype(int)
cnt,b0,b1 = np.histogram2d(last1_digit, last0_digit, bins=[10,10],range=[(-0.5,9.5),(-0.5,9.5)])
_xx, _yy = np.meshgrid(b0[1:]-.5, b1[1:]-.5)
x, y = _xx.ravel(), _yy.ravel()

# normalize to expected value if homogenous
norm_cnt = np.log10(cnt/(cnt.ravel().sum()/100))

# sort reversed
idx = cnt.ravel().argsort()[::-1]

digits = pd.DataFrame(data={
    'occurrence':cnt.ravel().astype(int),
    'normalized':norm_cnt.ravel()[idx],
    'last-1':y.astype(int),
    'last-0':x.astype(int),
    'last 2 digits':['{:02.0f}'.format(f) for f in (y*10 + x)]
}).set_index('last 2 digits')
digits.sort_values(by='occurrence', ascending=False, inplace=True)

if VERBOSE > 1:
    display(digits)

digits_max = digits.iloc[0,[c in ['last-1','last-0'] for c in digits.columns]]
digits_min = digits.iloc[-1,[c in ['last-1','last-0'] for c in digits.columns]]

In [None]:
if VERBOSE > 0:
    display(digits)
    # plot
    from mpl_toolkits.mplot3d import Axes3D

    width = depth = .25
    bottom = np.zeros_like(cnt.ravel())

    fig = plt.figure(figsize=(8, 4))

    ax1 = fig.add_subplot(121, projection='3d')
    ax1.bar3d(x,y,bottom,width,depth,cnt.ravel())

    ax2 = fig.add_subplot(122)
    ax2.imshow(norm_cnt,vmax=+1,vmin=-1,
               cmap='RdBu',
               origin='lower')

    plt.hlines(digits_min[0],-0.5,9.5,color='b')
    plt.vlines(digits_min[1],-0.5,9.5,color='b')
    plt.hlines(digits_max[0],-0.5,9.5,color='r')
    plt.vlines(digits_max[1],-0.5,9.5,color='r')

    ax1.set_xlabel('Last digit')
    ax2.set_xlabel('Last digit')
    ax1.set_ylabel('One but last digit')
    ax2.set_ylabel('One but last digit')
    ax1.set_zlabel('Count')
    ax1.set_zlim(top=cnt.max()/2)
    ax2.set_title('Occurence of price ending with digits\nEUR xxx{:1.0f}{:1.0f} occurs {:.0f} times\nEUR xxx{:1.0f}{:1.0f} occurs {:.0f} times'.format(*digits_min,digits.iloc[-1].occurrence,
                                                                                                      *digits_max,digits.iloc[0].occurrence));
    ax1.xaxis.set_tick_params(which='minor', bottom=False)
    ax1.yaxis.set_tick_params(which='minor', left=False)
    ax2.xaxis.set_tick_params(which='minor', bottom=False)
    ax2.yaxis.set_tick_params(which='minor', left=False)

In [24]:
# plot for publication
fig,ax = plt.subplots(figsize=(4, 4))
imgplot = ax.imshow(norm_cnt,vmax=1,vmin=-1,
           cmap='RdBu',
           origin='lower')

# least and most occuring pairs
for i,d in pd.concat([digits.iloc[:5,:], digits.iloc[-5:,:]]).iterrows():
    x=d['last-0']
    y=d['last-1']
    label = '{:g}x'.format(d.occurrence)
    ax.text(x,y,label,
            color='lightgray',
            ha='center',va='center',
            bbox={'alpha':0,'color':'k'})
    
# labels
ax.set_xticks(range(0,10))
ax.set_yticks(range(0,10))
ax.set_title('Occurence of last two digits in price', style='italic')
ax.set_xlabel('Last digit', style='italic')
ax.set_ylabel('One but last digit', style='italic')
ax.xaxis.set_tick_params(which='minor', bottom=False)
ax.yaxis.set_tick_params(which='minor', left=False)


# colorbar
cbar = fig.colorbar(imgplot, ticks=[-1,0,+1], fraction=.045, aspect=20)
cbar.ax.set_yticklabels([
    'Below average\n{:.3f}x'.format(cnt.ravel().sum()/1000), 
    'Average ({:.2f}x)'.format(cnt.ravel().sum()/100), 
    '{:.1f}x\nAbove average'.format(cnt.ravel().sum()/10), 
])

# save
file_name = f'{RESULTS_DIR}/last-two-digits.png'
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
    
if True | do_save(file_name): # always save
    print(file_name)
    with plt.style.context(f"{cfg['FILE_LOCATION']['app_dir']}/assets/context-paper.mplstyle"):
        plt.savefig(file_name, bbox_inches='tight', transparent=False)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

/home/tom/bin/satdatsci/Saturday-Datascience/results/last-two-digits-opbod.png


<a href="#eda_top" id='eda_usage'><font size=+1><center>^^ TOP ^^</center></font></a>

---

# Usage intensity

## Odometer
Distance travelled

In [25]:
data = yX.odometer/1000
x,y = ecdf(data)

# plot full
plt.figure(figsize=[4,2])
plt.step(x, y, markerfacecolor='k', markeredgecolor='none', marker='o', alpha=1/256, ms=8)
plt.step(x, y, color='grey', linestyle='-', linewidth=2)
plt.title('Empirical Cumulative Distribution Function\nOdometer', style='italic')
plt.xlabel('Distance [km x1000]', style='italic')
plt.ylabel('Fraction of total', style='italic')

plt.yticks(np.linspace(0,1.0,5))

print('median odo meter:', np.median(x))

# save
file_name = f'{RESULTS_DIR}/odometer-ecdf.png'
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
    
if True | do_save(file_name): # always save
    print(file_name)
    with plt.style.context(f"{cfg['FILE_LOCATION']['app_dir']}/assets/context-paper.mplstyle"):
        plt.savefig(file_name, bbox_inches='tight', transparent=False)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

median odo meter: 244.659
/home/tom/bin/satdatsci/Saturday-Datascience/results/odometer-ecdf-opbod.png


## Car age

In [26]:
data = yX.age.apply(lambda x: x.days)/365.25
data = data[data>0]
x,y = ecdf(data)

fig,axs = plt.subplots(figsize=[8,8], ncols=2, nrows=2)
gs = axs[0,0].get_gridspec()
axbig = fig.add_subplot(gs[0,:])
for ax in axs[0,:]:
    ax.remove()
axs = [axbig, axs[1,:]]

# plot full
ax = axs[0]
ax.step(x, y, markerfacecolor='k', markeredgecolor='none', marker='o', alpha=1/128, ms=8)
ax.step(x, y, color='grey', linestyle='-', linewidth=2)
ax.set_xlabel('Age (years)', style='italic')
ax.set_ylabel('Fraction of total', style='italic')
ax.set_title('Empirical Cumulative Distribution Function\nAge', style='italic')
ax.set_yticks(np.linspace(0,1.0,5))

# plot old/young
axs[1][0].step(x, y, markerfacecolor='k', markeredgecolor='none', marker='o', alpha=1/128, ms=8)
axs[1][0].step(x, y, color='grey', linestyle='-', linewidth=2)
axs[1][1].step(x, y, markerfacecolor='k', markeredgecolor='none', marker='o', alpha=1/64, ms=8)
axs[1][1].step(x, y, color='grey', linestyle='-', linewidth=2)
axs[1][0].set_yticks(np.linspace(0,1.0,5))
axs[1][1].set_yticks(np.linspace(0,1.0,21))
axs[1][0].set_xlim(left=0, right=20)
axs[1][1].set_xlim(left=20, right=60)
axs[1][1].set_ylim(bottom=0.88, top=1.02)
axs[1][0].set_ylabel('Fraction of total', style='italic')
axs[1][0].set_xlabel('Age (years)', style='italic')
axs[1][1].set_xlabel('Age (years)', style='italic')
axs[1][0].set_title('Young', style='italic')
axs[1][1].set_title('Old', style='italic')


# save
file_name = f'{RESULTS_DIR}/age-ecdf.png'
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
    
if True | do_save(file_name): # always save
    print(file_name)
    with plt.style.context(f"{cfg['FILE_LOCATION']['app_dir']}/assets/context-paper.mplstyle"):
        plt.savefig(file_name, bbox_inches='tight', transparent=False)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

/home/tom/bin/satdatsci/Saturday-Datascience/results/age-ecdf-opbod.png


## Usage intensity

$usage=\frac{odometer}{age}$

In [27]:
usage = df.odometer/df.age.apply(lambda x: x.days)
median_usage = usage.median()

In [28]:
plt.figure(figsize=[4,4])
plt.plot(
    df.age.apply(lambda x: x.days)/365.25,
    df.odometer.astype('Float64')/1000, 
    markerfacecolor='k', markeredgecolor='none', linestyle='None', marker='o', ms=4 ,alpha=1/16)
plt.xlabel('Age (years)', style='italic')
plt.ylabel('Odometer (km x1000) ', style='italic')
xl = plt.xlim()
yl = plt.ylim()
xx = np.array(xl)*365.25
for p_d in [10, 20,40,80, 160]:
    yy = xx * p_d
    plt.plot(xx/365.25,yy/1000,':k')

    if p_d < 50:
        x = 20000
        y = p_d*x    
    else:
        y = 850000
        x = y/p_d
    if p_d == 10:
        txt_pat = ' {} km/day'
    else:
        txt_pat = ' {}'
    plt.text(x/365.25, y/1000, txt_pat.format(p_d), va='top', fontsize=16)

p_d = median_usage
yy = xx * p_d
plt.plot(xx/365.25,yy/1000,'-b')
plt.text(900000/median_usage/365.25,900000/1000,'Median: {:.1f} km/day'.format(median_usage), ha='center', va='bottom', color='b', fontweight='bold')
plt.xlim(xl)
plt.ylim(yl)
plt.xticks(range(0,65,10))


# save
file_name = f'{RESULTS_DIR}/usage-regression.png'
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
    
if True | do_save(file_name): # always save
    print(file_name)
    with plt.style.context(f"{cfg['FILE_LOCATION']['app_dir']}/assets/context-paper.mplstyle"):
        plt.savefig(file_name, bbox_inches='tight', transparent=False)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')


/home/tom/bin/satdatsci/Saturday-Datascience/results/usage-regression-opbod.png


In [29]:
# Fuel types
display(pd.DataFrame(df.fuel.value_counts()), metadata={'tags': (TAG_SINGLE, )})

plt.figure(figsize=[8,8])
for k,df_ in df.groupby('fuel'):
    if 'lpg' in k:
        clr = 'darkgreen'
    elif 'benzine' in k:
        clr = 'darkblue'
    elif 'diesel' in k:
        clr = 'darkred'
    else:
        clr = 'k'
    plt.plot(df_.age.apply(lambda x: x.days),df_.odometer.astype('Float64'), markerfacecolor=clr, markeredgecolor='none', linestyle='None', marker='o', ms=4 ,alpha=1/16)
plt.xlabel('Age (days)', style='italic')
plt.ylabel('Odometer (km)', style='italic')
plt.xscale('log')
plt.yscale('log')
plt.gca().set_aspect('equal')

xl = plt.xlim()
yl = plt.ylim()
p_d = median_usage
xx = np.array(xl)
yy = xx * p_d
plt.plot(xx,yy,'-b')
plt.text(xx[1],xx[1] * p_d,'Median: {:.1f} km/day'.format(median_usage), ha='right', va='bottom', color='b', fontweight='bold')
for angle,offset in zip([0.5, 1, 1.5, 2], [1,1,1,1]):

    yy = xx ** angle * median_usage * offset

    # label
    plt.text(xx[0],xx[0] ** angle * median_usage * offset,
             '  $y \propto x^{{{:g}}}$  '.format(angle,median_usage), ha='left', va='center',
             color='k')
    
    if angle == 1:
        continue
    # line
    plt.plot(xx,yy,':',color='k')


plt.xlim(xl)
plt.ylim(yl)

# save
file_name = f'{RESULTS_DIR}/usage-regression-loglog.png'
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
    
if True | do_save(file_name): # always save
    print(file_name)
    with plt.style.context(f"{cfg['FILE_LOCATION']['app_dir']}/assets/context-paper.mplstyle"):
        plt.savefig(file_name, bbox_inches='tight', transparent=False)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

/home/tom/bin/satdatsci/Saturday-Datascience/results/usage-regression-loglog-opbod.png


In [30]:
data = usage.astype('O').fillna(np.nan)
x,y = ecdf(data)

# plot full
fig,axs=plt.subplots(figsize=[4,4], nrows=2, ncols=1, sharex=True)

axs[0].step(x, y, markerfacecolor='k', markeredgecolor='none', marker='o', alpha=0.005, ms=16)
axs[0].step(x, y, color='grey', linestyle='-', linewidth=2)
axs[0].plot(median_usage, 0.5, marker='+', markeredgecolor='b', ms=64)
axs[0].text(median_usage,0.5,'Median: {:.1f} km/day      '.format(median_usage), ha='right', va='center', color='b', weight='bold')

axs[1].hist(x, bins=np.logspace(-1,3,81), edgecolor='k', facecolor='None')
yl=axs[1].get_ylim()
axs[1].vlines(x=median_usage, ymin=0, ymax=yl[1], color='b')

axs[0].set_xscale('log')
axs[0].set_xlim(left=0.1, right=1000)
axs[0].set_title('Empirical Cumulative Distribution Function\nUsage per day', style='italic')
axs[0].set_yticks(np.linspace(0,1.0,5))
axs[1].set_xlabel('Usage (km/day)', style='italic')
axs[0].set_ylabel('Fraction of total', style='italic')
axs[1].set_ylabel('Number of cars\n(per bin)', style='italic')

# save
file_name = f'{RESULTS_DIR}/usage-dist.png'
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
    
if True | do_save(file_name): # always save
    print(file_name)
    with plt.style.context(f"{cfg['FILE_LOCATION']['app_dir']}/assets/context-paper.mplstyle"):
        plt.savefig(file_name, bbox_inches='tight', transparent=False)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')
    

  data = usage.astype('O').fillna(np.nan)


/home/tom/bin/satdatsci/Saturday-Datascience/results/usage-dist-opbod.png


In [31]:
locname = pd.read_csv(f"{cfg['FILE_LOCATION']['code_dir']}/assets/lotnr-location-lookuptable.csv", sep=';')
# add 5xxx
locname = pd.concat([
    locname, 
    pd.DataFrame(data={
        'Lotnr low': 5000, 
        'Lotnr high': 5999, 
        'location name': 'Veld 500'
    }, index=[locname.index[-1] + 1])
])

lot_counter = yX.index.str.split('-',expand=True).to_frame().loc[:,2].str.split('').apply(lambda l: ''.join(l[:5])).astype(int)
lot_location = lot_counter.apply(lambda l: locname.apply(
    lambda r: (r['location name'] if ((l >= r['Lotnr low']) and (l <= r['Lotnr high'])) else np.nan),
    axis=1)).bfill(axis=1).iloc[:,0]
lot_location.index = yX.index

# Materieel Logistiekcommando Land/Matlogco
# Koninklijke Landmacht / IVORIA
# Ministerie van Defensie
# Kamp Nieuw Milligen
# Meervelderweg 19
# 3888 NH Uddel
lot_location.loc['2022-07-980027':'2022-07-982727'] = 'Kamp Nieuw Milligen, Uddel'

assert lot_location.isna().any() == False, 'Some locations could not be determined'

locname = locname.merge(lot_location.value_counts(), how='outer', left_on='location name', right_index=True)\
.rename(columns={'count': 'n'}).fillna(0).astype({'n':int})
display(locname)

Unnamed: 0,Lotnr low,Lotnr high,location name,n
9,7000,7024,ABI 1/ ABI 2,0
13,8000,8024,ABI 1/ ABI 2,0
17,9000,9024,ABI 1/ ABI 2,0
11,7050,7099,ABI 4,0
15,8050,8099,ABI 4,0
19,9050,9099,ABI 4,0
2,2000,2199,Hal A,0
3,2200,2399,Hal B,2
4,2400,2599,Hal C,0
5,2600,2799,Hal D,0


In [None]:
if VERBOSE > 0:
    # combine. These alternate every auction
    lot_location.loc[lot_location.isin(['Veld 700', 'Veld 800'])] = 'Veld 700/Veld 800'
    
    # Is there a reserve price?
    flat = []
    # count minimum price per auction 
    #    (nan is dropped)
    for e in [*yX.groupby(lot_location).price.apply(
        lambda x: [x.min()] * sum(x==x.min())
    )]:
        flat+=e
    # Plot
    df_ = pd.Series(flat).value_counts().sort_index()
    plt.figure(figsize=(16,4))
    sns.barplot(y=df_.values, x=df_.index.astype(int), orient='vertical', color='w')
    #plt.plot(df_, 'P:')
    #plt.gca().set_xscale('log')
    plt.xlabel('Price [Eur]')
    plt.ylabel('Occurences')
    plt.gca().xaxis.set_tick_params(which='minor', bottom=False)
    plt.gca().yaxis.set_tick_params(which='minor', left=False)

    
    # Price range per auction
    plt.figure()
    # group per auction and first digit
    gb = yX.groupby([*map(lambda x: '-'.join(x.split('-')[:2]), yX.index)])
    df_ = pd.DataFrame(data=yX.loc[:,['price', 'automatic_gearbox']])
    df_['date']=[*map(lambda x: '-'.join(x.split('-')[:2]), yX.index)]
    df_['first_nr_lot'] = [*map(lambda x: x.split('-')[-1][0], yX.index)]
    sns.boxplot(y=df_.price, x=df_.first_nr_lot, hue=df_.date)
    plt.gca().set_yscale('log')
    plt.gca().xaxis.set_tick_params(which='minor', bottom=False)



In [None]:
# Inflation

# combine. These alternate every auction
lot_location.loc[lot_location.isin(['Veld 700', 'Veld 800'])] = 'Veld 700/Veld 800'

#group = yX.index.map(lambda x: int(x.split('-')[2][0])).values
#group[np.isin(group, [7,8])] = 78
group = lot_location
month_s = yX.index.map(lambda x: '{0:4.0f}-{1:02.0f}'.format(*[int(xx) for xx in x.split('-')[:2]])).values
month = pd.to_datetime(month_s, format='%Y-%m')

# yX.price.groupby([group, month]).median().plot()

vc = pd.Series(group).value_counts()
vc_large = vc#.head(4)


f, axs = plt.subplots(vc_large.shape[0], 1, figsize=[8,vc_large.shape[0]*2], sharex=True)
for i,g in enumerate(vc_large.index):
    sel = group == g
#     print(i, g, sum(sel), month[sel].min(), month[sel].max())
    axs[i].plot(month[sel], yX.loc[sel, 'price'], ',')
    sns.lineplot(y=yX.loc[sel, 'price'], x=month[sel], ax= axs[i], estimator=np.mean, errorbar=('ci', 100))
#     sns.boxplot(y=yX.loc[sel, 'price'], x=month[sel], color='gray', ax= axs[i])
    axs[i].set_yscale('log')
    axs[i].set_title(g)
#     axs[i].set_xlim(left=month.min(), right=month.max())