<a id='clf_top'></a>

# Classify pictures from auction lots

The auction lots are accompanied by pictures. This notebook performs classification analyses on these pictures.

## Classification models
1. [Simple binary classification model](#clf_model_1)
2. [Random forest](#clf_model_2)
3. [Multiclass](#clf_model_3)

## Results
1. [Model performance](#clf_performance)
    1. [Confusion matrix](#clf_performance_confmtx)  
    Models can miss or falsely identify items. Performance can be summarized in a confusion matrix:
    
    |      | predicted |     |
    |-----:|-----------|-----|
    | **real** | TP        | FN  |
    |      | FP        | TN  |  
    
    Correct: **TP**: True positive, **TN**: True negative  
    Wrong: **FN**: False negative, **FP**: False positive
    
    2. [ROC](#clf_performance_roc)  
    Receiver operating characteristic (ROC) curve
    

In [1]:
# Read the shared notebook settings.
import drz_config
cfg = drz_config.read_config()
VERBOSE = cfg['VERBOSE']  # verbosity level; compared against 0 and 1 in the cells below
OPBOD = cfg['OPBOD']  # switches input locations and output file names to the '-opbod' variants
SKIPSAVE = cfg['SKIPSAVE']  # when False, figures are written to ../results

if VERBOSE > 0:
    display(cfg)
    
# When True, the per-image mask step of the image-loading loop is skipped.
SKIP_MASK = True
# Tag that is attached to display() metadata; presumably consumed by an nbconvert step -- TODO confirm.
TAG_SINGLE = "nbconvert_instruction:remove_single_output"

{'settings_fn': '../code/assets/drz-auction-settings.ini',
 'DATE': '2021-10',
 'VERBOSE': 1,
 'OPBOD': False,
 'URL': 'http://verkoop.domeinenrz.nl/verkoop_bij_inschrijving_2021-0010',
 'EXTEND_URL': False,
 'CLOSEDDATA': True,
 'closed_data_fields': '*',
 'SKIPSAVE': False}

In [2]:
USE_HIGH_RES = False # True: resize images to 512x341 instead of 128x85. This is beta! Memory issues may occur
PREDICT_CLASS = 'AUDI' # Test classifier on this class (brand)

In [3]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import re

import seaborn as sns
from PIL import Image

from os.path import isfile


In [4]:
# Stack matplotlib's default style with the two project style sheets.
plt.style.use(['default', '../assets/movshon.mplstyle', '../assets/context-notebook.mplstyle'])

In [5]:
# file name pattern for images
if OPBOD:
    def image_file(lot, idx):
        '''Return the path of picture number `idx` of auction lot `lot` (OPBOD data set).'''
        return '../../../python-nb/data/images/{:s}-{:02.0f}.jpg'.format(lot, idx)
else:
    def image_file(lot, idx):
        '''
        Return the path of picture number `idx` of auction lot `lot`.

        Pictures are stored in one directory per year (the first four
        characters of the lot id).
        '''
        # HACK: for lots of 2021-06 (two auctions in that month) the last two
        # characters of the third id field are moved in front of the rest of it.
        if lot.startswith('2021-06-'):
            fields = lot.split('-')
            lot = '-'.join([fields[0], fields[2][-2:], fields[2][:-2]])
        year_dir = lot[:4]
        return r'/home/tom/data/satdatsci-images/{:s}/{:s}-{:02.0f}.jpg'.format(year_dir, lot, idx)
# example: image_file(df.index[-1], 0)

# lot number to date
def idx2date(idx):
    '''
    Convert a lot id to the month of its auction.

    Parameters
    ----------
    idx : str
        Lot id that starts with '<4-digit year>-<1 or 2 digit month>-<4 digits>',
        e.g. '2019-7-2212'.

    Returns
    -------
    pandas.Timestamp
        First day of the auction month.

    Raises
    ------
    ValueError
        If `idx` does not start with the expected pattern.
    '''
    M = re.match(r'([0-9]{4}-[0-9]{1,2})-[0-9]{4}', idx)
    if M is None:
        # Without this guard a malformed id surfaces as "'NoneType' object is not subscriptable".
        raise ValueError('Cannot parse auction date from lot id {!r}'.format(idx))
    return pd.to_datetime(M[1], format='%Y-%m')

# Load data

In [6]:
# Load the pickled table of cars; its index holds the lot ids.
# NOTE(review): pd.read_pickle can execute arbitrary code -- only load files from a trusted source.
if OPBOD:
    fn = '../../../python-nb/data/cars-for-imageclf-opbod.pkl'
else:
    fn = '../data/cars-for-imageclf.pkl'

print(fn)
df = pd.read_pickle(fn)
print(df.shape)

# add date
df.loc[:,'auction_date'] = df.reset_index()['index'].apply(idx2date).values

if not OPBOD:
    # drop obs with no images stored
    df.drop(index=df.index[df.auction_date < pd.to_datetime('2019-02-01')], inplace=True)

# file location to front view pictures
df.loc[:,'front_image'] = df.reset_index().loc[:,'index'].apply(lambda l:image_file(l,0)).values

## HACK ALERT
# For 2021-07 .. 2021-09 the stored file names deviate from the lot id. Lots that end
# with the month number ('07') lose that suffix in the file name; lots that end with
# '1' + last month digit ('17') get that suffix moved in front of the four-digit part.
# (2021-10 is deliberately not included: its fix-up was disabled.)
for month in ('07', '08', '09'):
    in_month = df.index.str.startswith('2021-' + month)
    second = '1' + month[1]
    fixes = (
        (month, month + r'-([0-9]{2}\.jpg)$', r'-\g<1>'),
        (second, month + r'-([0-9]{4})' + second + r'-([0-9]{2}\.jpg)$', second + r'-\g<1>-\g<2>'),
    )
    for suffix, pattern, repl in fixes:
        sel = in_month & df.index.str.endswith(suffix)
        # bind pattern/repl as defaults so the lambda does not depend on the loop variables
        df.loc[sel,'front_image'] = df.loc[sel,'front_image'].apply(
            lambda x, pattern=pattern, repl=repl: re.sub(pattern, repl, x))

if not OPBOD:
    # no front image
    df.drop(index='2019-7-2212', inplace=True)
    df.drop(index='2020-10-7232', inplace=True) # fiat
    
    # front image is not first
    df.loc['2019-9-9152','front_image'] = image_file('2019-9-9152',1)
    df.loc['2020-10-7235','front_image'] = image_file('2020-10-7235',1) # smart

# print result
print(df.shape)
display(df.tail(), metadata={'tags': (TAG_SINGLE, )})


../data/cars-for-imageclf.pkl
(7218, 16)
(4159, 18)


## Create openML-like dataset structure

In [7]:
# Meta data of the data set: every field starts as an empty string,
# except 'tag', which is a list holding one empty string.
details = dict.fromkeys(
    ['id', 'name', 'version', 'format', 'creator', 'collection_date', 'upload_date', 'licence', 'url', 'file_id', 'default_target_attribute', 'version_label', 'tag', 'visibility', 'original_data_url', 'paper_url', 'status', 'processing_date', 'md5_checksum'],
    '',
)
details['tag'] = ['']

# Empty openML-like container; the fields are filled in by the cells below.
drz = {
    'data': np.array([]),
    'target': np.array([]),
    'feature_names': [],
    'DESCR': '',
    'details': details,
    'categories': {},
    'url': '',
}

In [8]:
# Choose a categorical feature
feature_to_classify = 'brand'
# feature_to_classify = 'body_type'

# meta data
drz['details'].update(dict(
    id='[no id]', 
    name='Dienst Roerende Zaken - {}'.format(feature_to_classify), 
    creator='TvG', 
    original_data_url='https://github.com/r5atom/Saturday-Datascience'))

# target: labels of the chosen feature, one per row of df
drz['target'] = df[feature_to_classify].values

### Store pixels values in data set

In [9]:
def load_mask(fn, nchar_header=7, nbytes_per_char=(2, 1), VERBOSE=False):
    '''
    Read a gzip-compressed mask file and return its content as lists of integers.

    The decompressed stream is a sequence of chunks. Every chunk has two parts:
    a header of `nchar_header` unsigned little-endian integers, followed by the
    mask values. The last four header values are read as a region of interest
    `roi`; the number of mask values is `(roi[2] - roi[0]) * (roi[3] - roi[1])`.

    Parameters
    ----------
    fn : str
        Path of the gzip-compressed file.
    nchar_header : int
        Number of integers in each header.
    nbytes_per_char : sequence of two ints
        Bytes per integer in the header part and in the mask part.
    VERBOSE : bool
        Print progress information.

    Returns
    -------
    list of list of int
        Alternating headers and mask values: `[header0, mask0, header1, mask1, ...]`.
        An empty stream yields an empty list.

    Raises
    ------
    ValueError
        If the stream ends in the middle of a part (truncated or corrupt file).
        Previously this situation resulted in an endless loop.
    '''
    import gzip
    
    def _get_chars(uz, pos, nchar, nbytes_per_char):
        # decode `nchar` unsigned little-endian integers that start at byte `pos`;
        # slice the buffer directly instead of copying its remainder for every integer
        return [
            int.from_bytes(
                uz[pos + c*nbytes_per_char:pos + (c+1)*nbytes_per_char],
                byteorder='little', signed=False)
            for c in range(nchar)
        ]

    # read zip
    with open(fn, 'rb') as fid:
        z = fid.read()

    # decompress (unzipped)
    uz = gzip.decompress(z)
    if VERBOSE: print(f'zip: {len(z)} unzip: {len(uz)}')

    # parse parts
    nparts = 2 #len(nbytes_per_char)
    pos = 0
    data = []
    while pos < len(uz):
        if VERBOSE: print(f'chunk {int(len(data)/2)}')

        for part in range(nparts):

            # determine where part ends
            if part == 0:
                nchar = nchar_header
            else:
                # size of the mask part follows from the header that was just read
                roi = data[-1][-4:]
                nchar = (roi[2] - roi[0]) * (roi[3] - roi[1])

            if VERBOSE:print(f'part: {part} from: {pos} char: {nchar} nb/char: {nbytes_per_char[part]}')

            nbytes = nchar*nbytes_per_char[part]
            if nchar < 0 or pos + nbytes > len(uz):
                raise ValueError(
                    f'{fn}: corrupt or truncated mask data at byte {pos} '
                    f'(part {part} needs {nbytes} bytes, {len(uz) - pos} left)')

            data += [_get_chars(uz, pos, nchar, nbytes_per_char[part])]

            # update position for next part
            pos += nbytes

        if VERBOSE:print(f'pos: {pos}/{len(uz)}')
            
    return data

def mask_arrays2mask(mask_headers, tiny_masks):
    '''
    Expand ROI-sized masks into full-size boolean masks.

    Each header is read as `[class_id, n_rows, n_cols, roi0, roi1, roi2, roi3]`.
    The full mask size is taken from the first header. Each entry of
    `tiny_masks` holds the `(roi2 - roi0) * (roi3 - roi1)` values of one mask;
    its non-zero values are placed at row offset `roi0` and column offset `roi1`.

    Returns
    -------
    masks : bool array, shape (n_rows, n_cols, n_masks)
    rois : int32 array, shape (n_masks, 4)
    class_ids : int32 array, shape (n_masks,)
    '''
    n_masks = len(mask_headers)
    full_shape = (mask_headers[0][1], mask_headers[0][2], n_masks)
    masks = np.zeros(full_shape, dtype='bool')
    rois = np.zeros((n_masks, 4), dtype='int32')
    class_ids = np.zeros((n_masks), dtype='int32')

    for k in range(n_masks):
        flat_values = tiny_masks[k]
        header = mask_headers[k]
        roi = header[3:]

        # bring the flat values back to the shape of the region of interest
        patch = np.array(flat_values).reshape(roi[2] - roi[0], roi[3] - roi[1])

        # shift the set pixels from ROI coordinates to full-image coordinates
        rows, cols = np.nonzero(patch)
        masks[rows + roi[0], cols + roi[1], k] = True

        rois[k] = roi
        class_ids[k] = header[0]

    return masks, rois, class_ids


In [10]:
# Load the front image of every lot, crop it to the target aspect ratio, convert it to
# gray scale, resize it to `image_size` (width, height) and store the pixels as one row
# of drz['data'].
if USE_HIGH_RES:
    image_size = (512, 341)   
else:
    image_size = (128, 85) # aspect 1.5
n_feat = image_size[0] * image_size[1]
n_obs = df.shape[0]
aspect_ratio = image_size[0] / image_size[1]
# empty data array
drz['data'] = np.empty((n_obs, n_feat), int)

# load data into data field
if VERBOSE > 0:
    print(f'loading {df.shape[0]} images')
for i,idx in enumerate(df.index):

    # load pixel data
    fn = df.loc[idx,'front_image']
    im = Image.open(fn)

    # check dimensions of original and crop if needed
    # (log ratio of the original aspect ratio and the target aspect ratio; 0 is a perfect match)
    ar_mismatch = np.log10(im.size[0]/im.size[1] / aspect_ratio)
    OK = -0.005 < ar_mismatch < 0.002

    if VERBOSE > 0:
        if i==0:
            # progress report that is updated in place while the images are loaded
            output = '<hl></br>'
            disp_id = display({'text/html': output}, raw=True, display_id=True)
    
    if VERBOSE>1 or (VERBOSE>0 and (i==0 or i==n_obs-1)): 
        output += f'<p>{i} {idx}: '
        output += f'{im.filename} {im.mode} {im.size} {im.format} {ar_mismatch:+.4f} >'
    
    if not OK:
        if ar_mismatch < 0:
            # second dimension too large: crop from top and bottom
            crop_extend = im.size[1] - (im.size[0]/aspect_ratio)
            crop_top = np.ceil(crop_extend/2)
            crop_bot = crop_extend - crop_top
            box = [0, crop_top, im.size[0], im.size[1]-crop_bot]
            im = im.crop(box=box)
        else: 
            # first dimension too large: crop from left
            crop_extend = im.size[0]-(im.size[1]*aspect_ratio)
            box = [crop_extend, 0, im.size[0], im.size[1]]
            im = im.crop(box=box)
        if VERBOSE>1:
            output2 = output.split('</br>')[-1]
            output2 += '<b>crop original</b> {:+.0f}px'.format(
                crop_extend
            )
            display({'text/html': '<font size=2>' + output2 + '</font>'}, raw=True)

    # reduce size and make gray scale
    im = im.convert('L')
    im = im.resize(image_size)
    if VERBOSE>1 or (VERBOSE>0 and (i==0 or i==n_obs-1)):
        output += f'{im.mode} {im.size} {im.format}</p>'
    elif VERBOSE>0:
        output += '. '
        
    # mask
    if SKIP_MASK == False:
        # load data
        mask_fn = fn.replace('.jpg', '-masks.gzip')
        if isfile(mask_fn):
            mask_data = load_mask(mask_fn)
            mask_headers = mask_data[0::2]
            tiny_masks = mask_data[1::2]
            # convert to mask image
            mask, _, i_class = mask_arrays2mask(mask_headers, tiny_masks)
            car_mask = Image.fromarray(np.array((mask[:,:, i_class==3].sum(axis=2))*255, dtype='uint8'))
            npix = sum(np.array(car_mask.getdata())>0)
            if npix == 0:
                # truck
                car_mask = Image.fromarray(np.array((mask[:,:, i_class==8].sum(axis=2))*255, dtype='uint8'))

            # reshape
            car_mask = car_mask.resize(image_size)
            npix = sum(np.array(car_mask.getdata())>0)

            # apply
            if npix == 0:
                # a bare `raise` has no active exception here; report what went wrong instead
                raise ValueError(f'No mask pixels of class 3 or 8 found in {mask_fn}')
            im = Image.composite(im, Image.new("L", im.size, 0), car_mask)

            if VERBOSE>1 or (VERBOSE>0 and (i==0 or i==n_obs-1)):
                output += f'{npix}'
            elif VERBOSE>0:
                output += '.'

        else:
            raise FileNotFoundError(f'Mask file not found: {mask_fn}')
    else:
        if VERBOSE>1:
            output += '.</br>'
            
    if VERBOSE > 0:
        # keep the report short: first and last four lines only
        outputlines = output.split('</br>')
        if len(outputlines) > 8:
            output = '</br>'.join(outputlines[0:4] + ['<center>...</center>'] + outputlines[-4::])
        disp_id.update({'text/html': '<font size=2>' + output + '</font>'}, raw=True)


    # add to data
    drz['data'][i,:] = np.array(im.getdata())
    


loading 4159 images


In [11]:
# add feature names
# number of digits used to write a pixel index along each image dimension
max_digits = list(np.ceil(np.log10(image_size)).astype(int))
pat = 'front-{{:0{:.0f}.0f}}-{{:0{:.0f}.0f}}'.format(*max_digits)
# one name per pixel; the first (horizontal) index runs fastest
drz['feature_names'] = [
    pat.format(k % image_size[0], k // image_size[0])
    for k in range(image_size[0] * image_size[1])
]

### Show an observation

In [None]:
# Show the last observation as an image and as a 3d scatter of its pixel values.
i = -1
obs = drz['data'][i]
# rows of the image are the second entry of image_size, columns the first
obs_image = obs.reshape(image_size[1],image_size[0])
plt.figure(figsize=[d/64 for d in image_size])
plt.imshow(obs_image, cmap='Greys_r')
plt.title('pixel location')
plt.xlabel('horizontal (left: {})'.format(image_size[0]))
plt.ylabel('vertical (bottom: {})'.format(image_size[1]))
plt.text(0, 0, drz['target'][i], ha='left', va='top', fontsize=8, color='b')

# 3d plot
# recover the pixel coordinates from the feature names
Ms = [re.match('front-([0-9]{{{:g}}})-([0-9]{{{:g}}})'.format(*max_digits),n) for n in drz['feature_names']]
loci = [int(M[1]) for M in Ms]
locj = [int(M[2]) for M in Ms]

from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import

fig = plt.figure(figsize=[4,4])
# Figure.gca() no longer accepts keyword arguments in recent Matplotlib versions;
# add_subplot(projection='3d') creates the 3d axes in old and new versions alike.
ax = fig.add_subplot(projection='3d')
ax.view_init(60, 10)
ax.scatter3D(locj, loci, drz['data'][i], marker='s', c=drz['data'][i], cmap='PRGn')
plt.show()

# Predicting classes

In [None]:
# histogram of occurrence (prevalence) of classes
class_counts = pd.Series(drz['target']).value_counts()
class_counts.plot(kind='bar', figsize=[16,2])
plt.show()

In [14]:
# function to plot exemplars

def plot_observations(observations, titles=None, margin=10, image_size=(128,85)):
    '''
    Displays images in one large figure.
    
    Note that the observations are one vector per observation as it is fed to the classifier with every pixel a feature.

    Parameters
    ----------
    observations : numpy array, shape (n_tiles, image_size[0]*image_size[1])
        One flattened image per row.
    titles : sequence of str, str or None
        One title per tile. A single string is used as the title of the first
        tile. Defaults to the tile index.
    margin : int
        Space in pixels around every tile.
    image_size : tuple
        (width, height) of a tile in pixels.

    Raises
    ------
    ValueError
        If `observations` is empty.
    '''
    
    # number of tiles
    n_tiles = len(observations)
    if n_tiles == 0:
        raise ValueError('plot_observations needs at least one observation')
    
    if titles is None:
        titles = [str(i) for i in range(n_tiles)]
    elif isinstance(titles, str):
        # a bare string would otherwise be consumed one character per tile
        titles = [titles]
    
    # dimension of output row/columns
    nc = np.floor(np.sqrt(n_tiles)).astype(int) # largest in tile -> floor
    nr = np.ceil(n_tiles/nc).astype(int)
    # empty figure, filled with the brightest value so the margins contrast with the tiles
    im_rect = np.ones([image_size[1]*nr + margin*(nr+1), image_size[0]*nc + margin*(nc+1)])*observations.max()

    # fill rectangle
    title_coori = []
    title_coorj = []
    for ir in range(nr):
        for ic in range(nc):
            i = ir*nc + ic # tile index
            if i >= n_tiles:
                # last row can be incomplete
                break
                
            # coordinates in rectangle
            offsetij = (ir*image_size[1] + margin*(ir+1), 
                        ic*image_size[0] + margin*(ic+1))
            coor_i = [b + offsetij[0] for b in [0, image_size[1]]]
            coor_j = [b + offsetij[1] for b in [0, image_size[0]]]
            # put image
            im_rect[
                coor_i[0]:coor_i[1],
                coor_j[0]:coor_j[1]
            ] = observations[i].reshape(image_size[1], image_size[0])
            # titles are anchored at the upper left corner of their tile
            title_coori += [coor_i[0]]
            title_coorj += [coor_j[0]]

    # prettify
    dpi = 96
    fig = plt.figure(figsize=[nc*image_size[0]/dpi*2, nr*image_size[1]/dpi*2], dpi=dpi)
    ax = fig.gca()
    ax.axis('off')
    # place
    ax.imshow(im_rect, cmap='Greys_r')

    # titles
    for y,x,t in zip(title_coori, title_coorj, titles):
        ax.text(x, y, t, ha='left', va='top')


In [None]:
# Plot observations 
plot_observations(np.array([drz['data'].var(axis=0)]), ['variance within images'], image_size=image_size)

# Show the front images of the most recent auction.
# lot id without its last field
auction_dates = df.reset_index()['index'].apply(lambda x:'-'.join(x.split('-')[0:-1])).values
# Select on the parsed auction date instead of max() of the strings above: months in
# lot ids are not always zero padded (e.g. '2019-9-9152'), and as strings '2021-9'
# would rank above '2021-10'.
sel = (df.auction_date == df.auction_date.max()).values

plot_observations(drz['data'][sel], image_size=image_size)
# display(df[sel])

In [16]:
# empty dict to store models: one entry per model name, filled with the estimator, its parameters and its scores
models = dict()

<H1><a href="#clf_top">^</a></H1><a id='clf_model_1'></a>

# Model: Simple binary classifier

## Prepare input

In [17]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, f1_score, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict

# key under which this model is stored in `models`; also used for the file name of its figure
model_name = 'binary classifier with gradient descent'

# observations (one row of pixel values per image) and the labels of the chosen feature
X, y = drz['data'], drz['target']
print(X.shape)
print(y.shape)

(4159, 10880)
(4159,)


## Fit

In [18]:
# start an empty result record for this model
models[model_name] = {}

# split train/test set: 30% of the observations are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)
print(X_train.shape, X_test.shape, sep='\n')

n_test = X_test.shape[0]


(2911, 10880)
(1248, 10880)


## Binary classifier: Stochastic gradient descent

In [19]:
# class: True where an observation belongs to the class under test, False otherwise
predict_class = PREDICT_CLASS
y_binary = y == predict_class
y_train_binary = y_train == predict_class
y_test_binary = y_test == predict_class

In [20]:
# create classifier
# fit_intercept=False: no intercept is estimated for this model
clf = SGDClassifier(random_state=42, verbose=VERBOSE>1, fit_intercept=False)
models[model_name].update({'model':clf})

# fit
clf.fit(X_train, y_train_binary)
y_pred = clf.predict(X_test)

models[model_name].update({'n':y.shape[0]})
models[model_name].update({'n features':X.shape[1]})
models[model_name].update({'classes':[predict_class]})

# parameters
# betas[0] is the intercept, betas[1:] holds one coefficient per pixel
betas = [clf.intercept_[0], *clf.coef_[0]] # index 0 because of two classes
models[model_name].update({'betas':betas})

# scoring

# accuracy
# NOTE(review): 'acc' (and 'f1' below) are scored on all observations, which include the training split.
models[model_name].update({'acc':clf.score(X,y_binary)})
models[model_name].update({'test acc':clf.score(X_test,y_test_binary)})
# NOTE(review): the cross-validation is run on the test split, not on the training split -- confirm this is intended.
cv_results = cross_val_score(clf, X_test, y_test_binary, cv=5, scoring='accuracy')
models[model_name].update({'cv acc':cv_results})
# f1
models[model_name].update({'f1':f1_score(y_binary, clf.predict(X))})
models[model_name].update({'test f1':f1_score(y_test_binary, y_pred)})
cv_results = cross_val_score(clf, X_test, y_test_binary, cv=5, scoring='f1')
models[model_name].update({'cv f1':cv_results})

# sanity check to see if score is accuracy
# (y_train_pred is not used further in this cell)
y_train_pred = clf.predict(X_train)
assert sum(y_pred == y_test_binary) / n_test == clf.score(X_test,y_test_binary)
assert sum(y_pred == y_test_binary) / n_test == models[model_name]['test acc']

In [21]:
# plot classifier
# Show the coefficients (betas without the intercept) as an image with the layout of the input pictures.
plt.figure(figsize=[16,4])
# NOTE(review): 'vlag' is presumably a colormap registered by the seaborn import -- confirm.
plt.imshow(np.array(betas[1:]).reshape([image_size[1], image_size[0]]), cmap='vlag')
plt.axis('off')
plt.title('classifier {}'.format(predict_class))
cbar = plt.colorbar()
cbar.set_label('Coefficient', rotation=270)
# mark the intercept on the colorbar
cbar.ax.plot(0, models[model_name]['betas'][0], 'xb')
cbar.ax.text(0, models[model_name]['betas'][0], '  intercept>  ', style='italic', color='b', va='center', ha='right')
# stats
# text is anchored relative to the upper axis limits, i.e. to the right of the image
xy=[plt.gca().get_xlim()[1], plt.gca().get_ylim()[1]]
plt.text(xy[0]*1.5,xy[1], 'f1 = {:.2f}, f1$_{{cv{:g}}}$ = {:.2f} (+/-{:.2f})'.format(
    models[model_name]['f1'],
    models[model_name]['cv f1'].shape[0],
    np.mean(models[model_name]['cv f1']),
    np.std(models[model_name]['cv f1']),
) + '\n' +
         'train (n = {})'.format(y_train_binary.shape[0]) + '\n' +
         'test (n = {}, f1 = {:.2f})'.format(
             y_test.shape[0],
             models[model_name]['test f1'],
         ), style='italic', va='top', ha='left')


# Save
# file name is derived from the model name; OPBOD results get their own suffix
file_name = '../results/{}.png'.format(model_name.replace(' ','_'))
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
# NOTE: the check whether the file already exists is disabled, so an existing file is overwritten
if (SKIPSAVE==False): #and (not(os.path.isfile(file_name))):
    print(file_name)
    with plt.style.context('../assets/context-paper.mplstyle'):
        plt.savefig(file_name, bbox_inches='tight', transparent=True)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')    

../results/binary_classifier_with_gradient_descent.png


In [22]:
# Cross validation
# cross_validate(clf, X_test, y_test_binary, cv=8, scoring=['accuracy', 'balanced_accuracy', 'f1'], VERBOSE=3, n_jobs=4)

<H1><a href="#clf_top">^</a></H1><a id='clf_model_2'></a>

# Model: random forest

## Prepare input

In [23]:
from sklearn.ensemble import RandomForestClassifier

# key under which this model is stored in `models`; also used for the file name of its figure
model_name = 'random forest'

# observations (one row of pixel values per image) and the labels of the chosen feature
X, y = drz['data'], drz['target']
print(X.shape)
print(y.shape)

(4159, 10880)
(4159,)


## Fit

In [24]:
# start an empty result record for this model
models[model_name] = {}

# split train/test set: 30% of the observations are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)
print(X_train.shape, X_test.shape, sep='\n')

n_test = X_test.shape[0]


(2911, 10880)
(1248, 10880)


## Binary classifier: Random Forest Classifier

In [None]:
# class: True where an observation belongs to the class under test, False otherwise
predict_class = PREDICT_CLASS
y_binary = y == predict_class
y_train_binary = y_train == predict_class
y_test_binary = y_test == predict_class

# prevalence of the class under test
if VERBOSE > 1:
    plt.hist(y_binary.astype('int'), bins=[-0.5,0.5,1.5])

In [26]:
# create classifier
clf = RandomForestClassifier(random_state=42, verbose=VERBOSE>1)
models[model_name].update({'model':clf})

# fit
clf.fit(X_train, y_train_binary)
y_pred = clf.predict(X_test)

models[model_name].update({'n':y.shape[0]})
models[model_name].update({'n features':X.shape[1]})
models[model_name].update({'classes':[predict_class]})

# parameters
# First entry is nan because a random forest has no intercept; the second entry holds
# the importance of every feature. (np.nan instead of the alias np.NaN, which was
# removed in NumPy 2.0.)
betas = [np.nan, clf.feature_importances_]
models[model_name].update({'betas':betas})

# scoring

# accuracy
# NOTE(review): 'acc' (and 'f1' below) are scored on all observations, which include the training split.
models[model_name].update({'acc':clf.score(X,y_binary)})
models[model_name].update({'test acc':clf.score(X_test,y_test_binary)})
# NOTE(review): the cross-validation is run on the test split, not on the training split -- confirm this is intended.
cv_results = cross_val_score(clf, X_test, y_test_binary, cv=5, scoring='accuracy')
models[model_name].update({'cv acc':cv_results})
# f1
models[model_name].update({'f1':f1_score(y_binary, clf.predict(X))})
models[model_name].update({'test f1':f1_score(y_test_binary, y_pred)})
cv_results = cross_val_score(clf, X_test, y_test_binary, cv=5, scoring='f1')
models[model_name].update({'cv f1':cv_results})

# sanity check to see if score is accuracy
y_train_pred = clf.predict(X_train)
assert sum(y_pred == y_test_binary) / n_test == clf.score(X_test,y_test_binary)
assert sum(y_pred == y_test_binary) / n_test == models[model_name]['test acc']
# `n_features_` was removed from scikit-learn; prefer `n_features_in_` and fall back
# to the old attribute only for versions that do not have the new one yet.
n_features_fitted = getattr(clf, 'n_features_in_', None)
if n_features_fitted is None:
    n_features_fitted = clf.n_features_
assert n_features_fitted == X.shape[1]


In [27]:
# plot classifier
# Show the feature importances (betas without the leading nan) as an image with the
# layout of the input pictures.
plt.figure(figsize=[16,4])
plt.imshow(np.array(betas[1:]).reshape([image_size[1], image_size[0]]), cmap='Reds')
plt.axis('off')
plt.title('classifier {}'.format(predict_class))
cbar = plt.colorbar()
# a random forest has feature importances, not coefficients
cbar.set_label('Feature importance', rotation=270)
# stats
# text is anchored relative to the upper axis limits, i.e. to the right of the image
xy=[plt.gca().get_xlim()[1], plt.gca().get_ylim()[1]]
plt.text(xy[0]*1.5,xy[1], 'f1 = {:.2f}, f1$_{{cv{:g}}}$ = {:.2f} (+/-{:.2f})'.format(
    models[model_name]['f1'],
    models[model_name]['cv f1'].shape[0],
    np.mean(models[model_name]['cv f1']),
    np.std(models[model_name]['cv f1']),
) + '\n' +
         'train (n = {})'.format(y_train_binary.shape[0]) + '\n' +
         'test (n = {}, f1 = {:.2f})'.format(
             y_test.shape[0],
             models[model_name]['test f1'],
         ), style='italic', va='top', ha='left')


# Save
# file name is derived from the model name; OPBOD results get their own suffix
file_name = '../results/{}.png'.format(model_name.replace(' ','_'))
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
if (SKIPSAVE==False): #and (not(os.path.isfile(file_name))):
    print(file_name)
    with plt.style.context('../assets/context-paper.mplstyle'):
        plt.savefig(file_name, bbox_inches='tight', transparent=True)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')    

../results/random_forest.png


<H1><a href="#clf_top">^</a></H1><a id='clf_model_3'></a>

# Model: multiclass 

## Prepare input

In [28]:
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone

# encode the class labels as integers; encoding_labels.classes_ maps the integers back to the labels
encoding_labels = LabelEncoder()
drz['encoded_target'] = encoding_labels.fit_transform(drz['target'])

# key under which this model is stored in `models`; also used for the file names of its figures
model_name = 'gradient descent multiclass'

# observations (one row of pixel values per image) and the integer-encoded labels
X, y = drz['data'], drz['encoded_target']
print(X.shape)
print(y.shape)

(4159, 10880)
(4159,)


## Fit

In [29]:
# start an empty result record for this model
models[model_name] = {}

# split train/test set
# The split is stratified on the class, so classes with a single observation are left out.
clss, cnt = np.unique(y, return_counts=True)
sel = np.isin(y, clss[cnt > 1])
X_train, X_test, y_train, y_test = train_test_split(
    X[sel], y[sel],
    test_size=0.3,
    random_state=42,
    stratify=y[sel],
)
print(X_train.shape, X_test.shape, sep='\n')

n_test = X_test.shape[0]


(2902, 10880)
(1245, 10880)


In [30]:
# create classifier: Copy from earlier model
# (clone gives an unfitted estimator with the same parameters as the binary model)
clf = clone(models['binary classifier with gradient descent']['model'])

models[model_name].update({'model':clf})

# fit
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

models[model_name].update({'n':y.shape[0]})
models[model_name].update({'n features':X.shape[1]})
models[model_name].update({'classes':encoding_labels.classes_})

# parameters
# one row per fitted class: intercept in the first column, followed by one coefficient per pixel
betas = np.concatenate([np.matrix(clf.intercept_).T, clf.coef_], axis=1)
models[model_name].update({'betas':betas})

# scoring

# accuracy
# NOTE(review): 'acc' (and 'f1' below) are scored on all observations, which include the training
# split and the classes that were left out of the split.
models[model_name].update({'acc':clf.score(X,y)})
models[model_name].update({'test acc':clf.score(X_test,y_test)})
# NOTE(review): the cross-validation is run on the test split, not on the training split -- confirm this is intended.
cv_results = cross_val_score(clf, X_test, y_test, cv=5, scoring='accuracy')
models[model_name].update({'cv acc':cv_results})
# f1
# average=None: one f1 score per class
models[model_name].update({'f1':f1_score(y, clf.predict(X), average=None)})
models[model_name].update({'test f1':f1_score(y_test, y_pred, average=None)})
cv_results = cross_val_score(clf, X_test, y_test, cv=5, scoring='f1_micro') # micro takes into account class imbalance
models[model_name].update({'cv f1':cv_results})

# sanity check to see if score is accuracy
assert sum(y_pred == y_test) / n_test == clf.score(X_test,y_test)
assert sum(y_pred == y_test) / n_test == models[model_name]['test acc']
# macro f1 is the unweighted mean of the per-class f1 scores
assert f1_score(y_test, y_pred, average=None).mean() == f1_score(y_test, y_pred, average='macro')



In [None]:
# Number of observations per class in all data ('orig'), in the training split and in the test split.
# Rows are addressed with the integer-encoded class labels.
hist_ = pd.DataFrame(columns=['orig','train','test','class_name'])
hist_.class_name=models[model_name]['classes']
clss,cnt = np.unique(y, return_counts=True)
hist_.loc[clss,'orig'] = cnt
clss,cnt = np.unique(y_train, return_counts=True)
hist_.loc[clss,'train'] = cnt
clss,cnt = np.unique(y_test, return_counts=True)
hist_.loc[clss,'test'] = cnt

# absolute counts (log scale): stacked train/test bars with the count in all data as marker
hist_.set_index('class_name').loc[:,['train', 'test']].plot(kind='bar', stacked=True, figsize=[8,4])
plt.plot(hist_.set_index('class_name').loc[:,['orig']],'P')
plt.yscale('log')
# same bars as fraction of the count in all data
hist_.set_index('class_name').loc[:,['train', 'test']].div(hist_.set_index('class_name').orig, axis=0).plot(kind='bar', stacked=True, figsize=[8,4])


In [32]:
# Coefficients of every fitted class, shown as images and titled with the class name
plot_observations(clf.coef_, models[model_name]['classes'][clf.classes_], image_size=image_size)
file_name = '../results/{}.png'.format(model_name.replace(' ','_'))
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
if (SKIPSAVE==False): #and (not(os.path.isfile(file_name))):
    print(file_name)
    with plt.style.context('../assets/context-paper.mplstyle'):
        plt.savefig(file_name, bbox_inches='tight', transparent=True)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')    

# Variance of the coefficients over classes, per pixel.
# The title is passed as a list with one entry per tile; as a bare string it is
# consumed character by character and the tile is titled 'v'.
plot_observations(np.array([clf.coef_.var(axis=0)]), ['variance within coefficients'], image_size=image_size)
# Save
file_name = '../results/variance-within-{}.png'.format(model_name.replace(' ','_'))
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
if (SKIPSAVE==False): #and (not(os.path.isfile(file_name))):
    print(file_name)
    with plt.style.context('../assets/context-paper.mplstyle'):
        plt.savefig(file_name, bbox_inches='tight', transparent=True)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')    

../results/gradient_descent_multiclass.png
../results/variance-within-gradient_descent_multiclass.png


<H1><a href="#clf_top">^</a></H1><a id='clf_performance'></a>

# Model performances

In [33]:
# plot f1
# Compare the cross-validated f1 scores of all models in one swarm plot.
k = 'cv f1'

# counter for x-offset
c=0

# figure
fig = plt.figure(figsize=[1,2])
ax = fig.gca()
# NOTE(review): both vectors start with a None placeholder that is also passed to swarmplot -- confirm it is ignored as intended.
xs = ys = [None]

# loop over all models
for name,res in models.items():

    c+=1 # x-offset

    score = res[k]
    
    # add scores and offset to vectors
    ys = np.concatenate([ys,score])
    xs = np.concatenate([xs,np.ones_like(score) * c])

# actual plotting
sns.swarmplot(x=xs, y=ys, ax=ax)
# prettify
# tick labels rely on the models being plotted in dict order
ax.set_xticklabels(models.keys(), rotation=45, va='top', ha='right', style='italic')
ax.set_ylim(bottom=0, top=1)
ax.set_title('Classifier performance\n', style='italic')
ax.set_ylabel('F1 score', style='italic')

# save
file_name = '../results/model-performance-classification.png'
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
if (SKIPSAVE==False): #and (not(os.path.isfile(file_name))):
    print(file_name)
    with plt.style.context('../assets/context-paper.mplstyle'):
        plt.savefig(file_name, bbox_inches='tight', transparent=True)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')    

../results/model-performance-classification.png


<H1><a href="#clf_top">^</a></H1><a id='clf_performance_confmtx'></a>

### Confusion matrix
https://en.wikipedia.org/wiki/Precision_and_recall

In [34]:
# plot confusion matrix
def plot_confusion_matrix(mtx, class_name):
    '''
    Present a 2x2 confusion matrix as an annotated image.

    Parameters
    ----------
    mtx : 2x2 array, indexed as mtx[real, predicted] with index 1 being the
        positive class (mtx[1,1] are the true positives).
    class_name : str
        Name of the positive class; used for the tick labels.

    The cell counts, the row/column sums and a block of summary statistics
    (F1, accuracy, balanced accuracy, precision, recall, specificity) are
    written into the current figure. A statistic whose denominator is zero is
    reported as nan instead of triggering a division warning.
    '''

    def _ratio(numerator, denominator):
        # guarded division: nan when there is nothing to divide by
        return numerator / denominator if denominator else float('nan')

    # results
    n_tp = mtx[1,1] # hit
    n_tn = mtx[0,0] # corr reject
    n_fp = mtx[0,1] # false alarm, Type I err
    n_fn = mtx[1,0] # miss, Type II err
    n_cp = sum(mtx[1,:]) # condition pos
    n_cn = sum(mtx[0,:]) # condition neg
    n_pp = sum(mtx[:,1]) # prediction pos
    n_pn = sum(mtx[:,0]) # prediction negative
    n_corr = n_tp + n_tn
    n_wrong = n_fp + n_fn

    plt.figure(figsize=[4,4])
    # rotate axis to get true pos upper left
    plt.xlim(left=1.5, right=-0.5)
    plt.ylim(top=1.5, bottom=-0.5)
    # prettify
    plt.xlabel('Predicted', style='italic')
    plt.ylabel('Real', style='italic')
    plt.title('Confusion matrix\n(n={})'.format(sum(mtx.ravel())), style='italic')
    plt.xticks(ticks=[0,1], labels=['not a ' + class_name, class_name])
    plt.yticks(ticks=[0,1], labels=['not a ' + class_name, class_name])

    # plot image
    im = plt.imshow(mtx, cmap='Blues_r')

    # add values
    for ir, pred in enumerate(mtx):
        for ic, n in enumerate(pred):
            plt.text(ic, ir, str(n), va='center', ha='center', weight='bold')

    # add summed values
    plt.text(-0.5, -0.5, n_corr, va='top', ha='left', style='italic')
    plt.text(-0.5, 1.5, n_wrong, va='bottom', ha='left', style='italic')
    plt.text(0, -0.5, n_pn, va='bottom', ha='center', style='italic')
    plt.text(1, -0.5, n_pp, va='bottom', ha='center', style='italic')
    plt.text(-0.5, 0, n_cn, va='center', ha='right', style='italic')
    plt.text(-0.5, 1, n_cp, va='center', ha='right', style='italic')

    # other stats
    recall = _ratio(n_tp, n_cp) # true pos rate
    specificity = _ratio(n_tn, n_cn) # true neg rate
    prec = _ratio(n_tp, n_pp)
    # F1 is the harmonic mean of precision and recall (not of recall and
    # specificity); written in count form so it is 0 rather than undefined
    # when there are no true positives but some errors.
    F1 = _ratio(2 * n_tp, 2 * n_tp + n_fp + n_fn)
    accu = _ratio(n_corr, n_corr + n_wrong)
    bacc = (recall + specificity)/2

    # summary block, anchored just outside the axes
    xy=[plt.gca().get_xlim()[1], plt.gca().get_ylim()[1]]
    plt.text(xy[0]*1.2,xy[1],'\n'.join([
        'F1: {:.3f}', 
        'accuracy:{:.1%}',
        'balanced acc.: {:.1%}',
        'precision: {:.1%}',
        'recall: {:.1%}',
        'specificity: {:.1%}'
    ]).format(F1, accu, bacc, prec, recall, specificity),
             va='top', ha='left', style='italic'
            )

In [35]:
# Confusion matrix for every class of every model, evaluated on the held-out
# 30% split. Only one figure per model is written to disk.
for model_name, model in models.items():
    is_multiclass = (model_name == 'gradient descent multiclass')
    clf = model['model']
    # the multiclass model is evaluated against the 'encoded_target' labels
    y = drz['encoded_target'] if is_multiclass else drz['target']
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state=42)
    y_pred = clf.predict(X_test)

    for predict_class_index, predict_class in enumerate(model['classes']):
        # reduce the problem to "this class" versus "everything else"
        if is_multiclass:
            y_test_binary = y_test == predict_class_index
            y_pred_binary = y_pred == predict_class_index
        else:
            y_pred_binary = y_pred
            y_test_binary = y_test == predict_class

        conf_mtx = confusion_matrix(y_test_binary, y_pred_binary)
        if conf_mtx.shape != (2,2):
            # not possible to plot conf mtx
            print(conf_mtx)
            continue
        display(pd.DataFrame(conf_mtx, index=['real neg','real pos'], columns=['pred neg','pred pos']))

        # Sanity checks on the matrix layout:
        #   [0,0] true negatives    [0,1] false positives
        #   [1,0] false negatives   [1,1] true positives
        assert sum(conf_mtx[:,1]) == sum(y_pred_binary)
        assert sum(conf_mtx[1,:]) == sum(y_test_binary)
        assert conf_mtx[0,0] == sum(~y_pred_binary & ~y_test_binary) # correct negative

        # actual plotting
        plot_confusion_matrix(conf_mtx, predict_class)

        # For the multiclass model, keep only the figure of the class that the
        # binary gradient-descent classifier uses; discard all others.
        if is_multiclass and not (predict_class == models['binary classifier with gradient descent']['classes'][0]):
            # skip saving
            plt.close()
            continue

        # Save
        file_name = '../results/confusion-matrix-{}.png'.format(model_name.replace(' ','_'))
        if OPBOD:
            file_name = file_name.replace('.png', '-opbod.png')
        saving_enabled = (SKIPSAVE==False) #and (not(os.path.isfile(file_name)))
        if saving_enabled:
            print(file_name)
            with plt.style.context('../assets/context-paper.mplstyle'):
                plt.savefig(file_name, bbox_inches='tight', transparent=True)
        plt.show()
        if not saving_enabled:
            print(f'Skip. {file_name} exists or saving is disabled in settings.')

Unnamed: 0,pred neg,pred pos
real neg,1037,96
real pos,74,41


../results/confusion-matrix-binary_classifier_with_gradient_descent.png


Unnamed: 0,pred neg,pred pos
real neg,1130,3
real pos,101,14


../results/confusion-matrix-random_forest.png


Unnamed: 0,pred neg,pred pos
real neg,1242,0
real pos,2,4


Unnamed: 0,pred neg,pred pos
real neg,1246,0
real pos,1,1


Unnamed: 0,pred neg,pred pos
real neg,1129,4
real pos,50,65


../results/confusion-matrix-gradient_descent_multiclass.png


Unnamed: 0,pred neg,pred pos
real neg,1247,0
real pos,1,0


  prec = n_tp/n_pp


Unnamed: 0,pred neg,pred pos
real neg,1246,0
real pos,1,1


Unnamed: 0,pred neg,pred pos
real neg,1099,52
real pos,15,82


[[1248]]


Unnamed: 0,pred neg,pred pos
real neg,1247,0
real pos,1,0


  prec = n_tp/n_pp


Unnamed: 0,pred neg,pred pos
real neg,1241,2
real pos,0,5


[[1248]]


Unnamed: 0,pred neg,pred pos
real neg,1247,0
real pos,0,1


Unnamed: 0,pred neg,pred pos
real neg,1191,22
real pos,7,28


Unnamed: 0,pred neg,pred pos
real neg,1246,0
real pos,1,1


Unnamed: 0,pred neg,pred pos
real neg,1242,0
real pos,2,4


[[1248]]


Unnamed: 0,pred neg,pred pos
real neg,1244,0
real pos,4,0


[[1248]]


  prec = n_tp/n_pp


Unnamed: 0,pred neg,pred pos
real neg,1247,0
real pos,1,0


[[1248]]


  prec = n_tp/n_pp


Unnamed: 0,pred neg,pred pos
real neg,1245,0
real pos,0,3


Unnamed: 0,pred neg,pred pos
real neg,1191,13
real pos,9,35


Unnamed: 0,pred neg,pred pos
real neg,1192,6
real pos,16,34


Unnamed: 0,pred neg,pred pos
real neg,1247,0
real pos,1,0


  prec = n_tp/n_pp


Unnamed: 0,pred neg,pred pos
real neg,1239,0
real pos,4,5


Unnamed: 0,pred neg,pred pos
real neg,1233,3
real pos,3,9


Unnamed: 0,pred neg,pred pos
real neg,1246,1
real pos,1,0


Unnamed: 0,pred neg,pred pos
real neg,1241,0
real pos,0,7


[[1248]]


Unnamed: 0,pred neg,pred pos
real neg,1244,1
real pos,0,3


Unnamed: 0,pred neg,pred pos
real neg,1233,3
real pos,4,8


[[1248]]


Unnamed: 0,pred neg,pred pos
real neg,1246,0
real pos,1,1


Unnamed: 0,pred neg,pred pos
real neg,1224,2
real pos,5,17


Unnamed: 0,pred neg,pred pos
real neg,1244,0
real pos,3,1


[[1248]]
[[1248]]


Unnamed: 0,pred neg,pred pos
real neg,1247,0
real pos,0,1


Unnamed: 0,pred neg,pred pos
real neg,1237,1
real pos,2,8


Unnamed: 0,pred neg,pred pos
real neg,1072,38
real pos,31,107


Unnamed: 0,pred neg,pred pos
real neg,1247,0
real pos,0,1


Unnamed: 0,pred neg,pred pos
real neg,1232,0
real pos,15,1


Unnamed: 0,pred neg,pred pos
real neg,1242,0
real pos,5,1


Unnamed: 0,pred neg,pred pos
real neg,1231,0
real pos,5,12


[[1248]]


Unnamed: 0,pred neg,pred pos
real neg,1109,66
real pos,16,57


Unnamed: 0,pred neg,pred pos
real neg,1164,11
real pos,22,51


Unnamed: 0,pred neg,pred pos
real neg,1236,2
real pos,0,10


[[1248]]
[[1248]]


Unnamed: 0,pred neg,pred pos
real neg,1173,28
real pos,8,39


Unnamed: 0,pred neg,pred pos
real neg,1247,0
real pos,0,1


Unnamed: 0,pred neg,pred pos
real neg,1245,0
real pos,2,1


Unnamed: 0,pred neg,pred pos
real neg,1243,0
real pos,5,0


  prec = n_tp/n_pp


Unnamed: 0,pred neg,pred pos
real neg,1188,16
real pos,12,32


Unnamed: 0,pred neg,pred pos
real neg,1234,2
real pos,2,10


Unnamed: 0,pred neg,pred pos
real neg,1245,0
real pos,1,2


[[1248]]


Unnamed: 0,pred neg,pred pos
real neg,1247,0
real pos,0,1


Unnamed: 0,pred neg,pred pos
real neg,1232,2
real pos,5,9


Unnamed: 0,pred neg,pred pos
real neg,1214,0
real pos,22,12


[[1248]]


Unnamed: 0,pred neg,pred pos
real neg,1246,0
real pos,0,2


Unnamed: 0,pred neg,pred pos
real neg,911,94
real pos,48,195


[[1248]]


Unnamed: 0,pred neg,pred pos
real neg,1202,0
real pos,35,11


In [None]:
# plot classification errors: show the test images that each model got
# right (true positives) and wrong (false positives / false negatives)
for model_name, model in models.items():
    clf = model['model']
    if model_name == 'gradient descent multiclass':
        y = drz['encoded_target']
    else:
        y = drz['target']
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state=42)
    y_pred = clf.predict(X_test)

    if model_name == 'gradient descent multiclass':
        # evaluate the multiclass model on the class of the binary classifier
        predict_class = models['binary classifier with gradient descent']['classes'][0]
        # [0][0]: scalar index of the first match (same as in the ROC cell),
        # not a 1-element array that is then broadcast in the comparison
        predict_class_index = np.where(model['classes'] == predict_class)[0][0]
        y_test_binary = y_test == predict_class_index
        y_pred_binary = y_pred == predict_class_index
    else:
        y_test_binary = y_test == model['classes'][0]
        y_pred_binary = y_pred


    # boolean masks over the test set
    tp_test = y_pred_binary & y_test_binary
    fp_test = y_pred_binary & ~y_test_binary
    fn_test = ~y_pred_binary & y_test_binary

    if any(tp_test):
        plot_observations(X_test[tp_test], titles=y_test[tp_test], image_size=image_size)
        plt.title('true positive\n' + model_name)

    if any(fp_test):
        plot_observations(X_test[fp_test], titles=y_test[fp_test], image_size=image_size)
        plt.title('false positive\n' + model_name)

    if any(fn_test):
        plot_observations(X_test[fn_test], titles=y_test[fn_test], image_size=image_size)
        plt.title('false negative\n' + model_name)


<H1><a href="#clf_top">^</a></H1><a id='clf_performance_roc'>

### ROC [c]urve

In [37]:
# plot ROC curve
def plot_roc_curve(fpr, fnr, auc):
    '''
    Open a new square figure and draw a ROC curve in it.

    fpr : values for the x-axis ('False positive rate')
    fnr : values for the y-axis. Despite the name, these are drawn on the
          'True positive rate' axis.
    auc : precomputed area under the ROC curve; only printed next to the plot.
    '''
    plt.figure(figsize=[8,8])

    # diagonal as chance-level reference, then the curve itself as open squares
    plt.plot([0,1], [0,1], '--k')
    marker_style = dict(
        marker='s',
        markeredgecolor=(0, 0, 0, 1),
        markerfacecolor=(1, 1, 1, .5),
        linestyle='None',
        ms=4,
    )
    plt.plot(fpr, fnr, **marker_style)

    # unit square with ticks at 0, 0.5 and 1
    plt.axis('equal')
    plt.xlim(0,1)
    plt.ylim(0,1)
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(ticks=np.linspace(0,1,3))

    # labels
    plt.xlabel('False positive rate', style='italic')
    plt.ylabel('True positive rate', style='italic')
    plt.title('ROC curve', style='italic')

    # stats, anchored just right of the upper right corner
    x_right = plt.gca().get_xlim()[1]
    y_top = plt.gca().get_ylim()[1]
    auc_text = 'AUC test set: {:.3f}'.format(auc)
    plt.text(x_right*1.05, y_top, auc_text, va='top', ha='left', style='italic')



In [38]:
# plot ROC

# Collect ROC points, AUC and the evaluated class for every model.
# NOTE: `fnr` / `fnrs` hold the second value returned by `roc_curve`; it is
# drawn on the 'True positive rate' axis of the plot.
fprs, fnrs, aucs, clss = [], [], [], []
for model_name, model in models.items():
    print(model_name)
    is_multiclass = (model_name == 'gradient descent multiclass')
    clf = model['model']
    y = drz['encoded_target'] if is_multiclass else drz['target']
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state=42)
    y_pred = clf.predict(X_test)

    # reduce to a binary problem for a single class
    if is_multiclass:
        # use the class of the binary classifier, looked up by its index
        predict_class = models['binary classifier with gradient descent']['classes'][0]
        predict_class_index = np.where(model['classes'] == predict_class)[0][0]
        y_test_binary = y_test == predict_class_index
        y_pred_binary = y_pred == predict_class_index
    else:
        predict_class = model['classes'][0]
        y_test_binary = y_test == predict_class
        y_pred_binary = y_pred

    # Scores at all thresholds. 'cross_val_score' uses threshold 0
    uses_decision_function = model_name in (
        'binary classifier with gradient descent',
        'gradient descent multiclass',
    )
    score_method = "decision_function" if uses_decision_function else "predict_proba"
    clf_scores = cross_val_predict(clf, X_test, y_test_binary, cv=8, method=score_method, n_jobs=4)
    if not uses_decision_function:
        # keep only the second probability column
        clf_scores = clf_scores[:,1]

    fpr, fnr, thresholds = roc_curve(y_test_binary, clf_scores)
    auc = roc_auc_score(y_test_binary, clf_scores)
    fprs.append(fpr)
    fnrs.append(fnr)
    aucs.append(auc)
    clss.append(predict_class)

# the first model creates the figure ...
plot_roc_curve(fprs[0], fnrs[0], aucs[0])
# ... the remaining models are added to it
for fpr, fnr in zip(fprs[1:], fnrs[1:]):
    plt.plot(fpr, fnr, marker='s', linestyle='None', ms=4, alpha=0.5)
# legend: the first entry belongs to the dashed diagonal
labels = []
for m, c, a in zip(models, clss, aucs):
    labels.append('{} ({}) {:.2f}'.format(m, c, a))
plt.legend(['random clf'] + labels, loc='lower left', bbox_to_anchor=[1, 0])


# save
file_name = '../results/ROC-curves.png'
if OPBOD:
    file_name = file_name.replace('.png', '-opbod.png')
if (SKIPSAVE==False): #and (not(os.path.isfile(file_name))):
    print(file_name)
    with plt.style.context('../assets/context-paper.mplstyle'):
        plt.savefig(file_name, bbox_inches='tight', transparent=True)
else:
    plt.show()
    print(f'Skip. {file_name} exists or saving is disabled in settings.')

binary classifier with gradient descent
random forest
gradient descent multiclass
../results/ROC-curves.png


In [39]:
# Deliberate hard stop: this always raises, so that "Run All" does not continue
# into the sandbox cells below.
assert False, 'Stop running. Below is sandboxing.'

AssertionError: Stop running. Below is sandboxing.

A dataset fetched from OpenML has the following structure:

```
from sklearn.datasets import fetch_openml
#mnist = fetch_openml('mnist_784')
iris = fetch_openml('iris')

# choose dataset
openml_dataset = iris
openml_dataset.keys()



dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])
```
- - - 
The dictionary has the following keys:

```
for k in openml_dataset:
    print(k, type(openml_dataset[k]))



data <class 'numpy.ndarray'>
target <class 'numpy.ndarray'>
feature_names <class 'list'>
DESCR <class 'str'>
details <class 'dict'>
categories <class 'dict'>
url <class 'str'>
```
- - - 
The fields contain the following
```
# create table
head = ['field', 'type', 'length']
width = [32, 32, 8]
pat = ['{{:>{}s}} | {{:{}s}} | {{:>{}s}}'.format(*width),
       '{{:>{}s}}-+-{{:{}s}}-+-{{:<{}s}}'.format(*width),
       '{{:>{}s}} | {{:{}s}} | {{:{}.0f}}'.format(*width),
      ]
sep = pat[1].format(''.join(['-']*width[0]), ''.join(['-']*width[1]), ''.join(['-']*width[2]))


# header
print(openml_dataset['details']['id'], openml_dataset['details']['name'], openml_dataset['details']['version'], openml_dataset['details']['original_data_url'], )
print(pat[0].format(*head))
print(sep)
# values
for k in openml_dataset:
    print(pat[2].format(k, str(type(openml_dataset[k])), len(openml_dataset[k])))

# deeper dict
k = 'details'
print(sep)
print(pat[0].format('{}[field]'.format(k), 'type', 'length'))
print(sep)
for kk in openml_dataset[k]:
    print(pat[2].format(kk, str(type(openml_dataset[k][kk])), len(openml_dataset[k][kk])))



61 iris 1 https://archive.ics.uci.edu/ml/datasets/Iris
                           field | type                             |   length
---------------------------------+----------------------------------+---------
                            data | <class 'numpy.ndarray'>          |      150
                          target | <class 'numpy.ndarray'>          |      150
                   feature_names | <class 'list'>                   |        4
                           DESCR | <class 'str'>                    |      932
                         details | <class 'dict'>                   |       19
                      categories | <class 'dict'>                   |        0
                             url | <class 'str'>                    |       27
---------------------------------+----------------------------------+---------
                  details[field] | type                             |   length
---------------------------------+----------------------------------+---------
                              id | <class 'str'>                    |        2
                            name | <class 'str'>                    |        4
                         version | <class 'str'>                    |        1
                          format | <class 'str'>                    |        4
                         creator | <class 'str'>                    |       11
                 collection_date | <class 'str'>                    |        4
                     upload_date | <class 'str'>                    |       19
                         licence | <class 'str'>                    |        6
                             url | <class 'str'>                    |       52
                         file_id | <class 'str'>                    |        2
        default_target_attribute | <class 'str'>                    |        5
                   version_label | <class 'str'>                    |        1
                             tag | <class 'list'>                   |       11
                      visibility | <class 'str'>                    |        6
               original_data_url | <class 'str'>                    |       44
                       paper_url | <class 'str'>                    |       63
                          status | <class 'str'>                    |        6
                 processing_date | <class 'str'>                    |       19
                    md5_checksum | <class 'str'>                    |       32
```
- - -
The shapes of _data_ and _target_ are:
```
print(openml_dataset['data'].shape)
print(openml_dataset['target'].shape)



(150, 4)
(150,)
```

In [None]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784')
iris = fetch_openml('iris')

In [None]:
openml_dataset = drz
openml_dataset.keys()

In [None]:
iris['categories']

In [None]:
# create table: print the fields of `openml_dataset` as a fixed-width text table
head = ['field', 'type', 'length']
width = [32, 32, 8]
# row templates: [0] text row, [1] separator row, [2] row ending in a number
pat = [
    '{{:>{}s}} | {{:{}s}} | {{:>{}s}}'.format(*width),
    '{{:>{}s}}-+-{{:{}s}}-+-{{:<{}s}}'.format(*width),
    '{{:>{}s}} | {{:{}s}} | {{:{}.0f}}'.format(*width),
]
sep = pat[1].format(*['-' * w for w in width])


# header
print(
    openml_dataset['details']['id'],
    openml_dataset['details']['name'],
    openml_dataset['details']['version'],
    openml_dataset['details']['original_data_url'],
)
print(pat[0].format(*head))
print(sep)
# values: one row per top-level field
for k in openml_dataset:
    value = openml_dataset[k]
    print(pat[2].format(k, str(type(value)), len(value)))

# deeper dict: same table for the entries of the 'details' field
k = 'details'
print(sep)
print(pat[0].format('{}[field]'.format(k), 'type', 'length'))
print(sep)
for kk in openml_dataset[k]:
    value = openml_dataset[k][kk]
    print(pat[2].format(kk, str(type(value)), len(value)))


In [None]:
print(openml_dataset['data'].shape)
print(openml_dataset['target'].shape)


In [None]:
openml_dataset['details'].keys()

In [None]:
mnist['feature_names']

In [None]:
def im2magn(im):
    '''
    Log-magnitude spectrum of a 2D image, rescaled to the range [0, 1].

    The 2D FFT magnitude is converted to 20*log(|F|) and min-max normalised
    using the smallest and largest finite value. Bins with zero magnitude
    (log gives -inf) end up as 0.

    Returns an array with the same shape as the spectrum. When the spectrum
    has no finite value, or all finite values are equal, an all-zero array
    is returned.
    '''
    fft = np.fft.fft2(im)
    # log(0) = -inf is expected for empty frequency bins; handled below
    with np.errstate(divide='ignore'):
        magnitude_spectrum = 20*np.log(np.abs(fft))

    finite = magnitude_spectrum[np.isfinite(magnitude_spectrum)]
    if finite.size == 0:
        return np.zeros_like(magnitude_spectrum)
    lowest, highest = finite.min(), finite.max()
    if highest == lowest:
        # flat spectrum: nothing to scale
        return np.zeros_like(magnitude_spectrum)

    # min-max normalisation; the parentheses matter, without them the
    # expression reduces to magnitude_spectrum / highest
    norm = (magnitude_spectrum - lowest) / (highest - lowest)

    # clip, this also maps -inf to 0
    norm[norm<0] = 0
    norm[norm>1] = 1

    return norm

In [None]:
fn1 = '/home/tom/data/satdatsci-images/2021/2021-16-8083-00.jpg'
fn2 = '/home/tom/data/satdatsci-images/2021/2021-16-8082-00.jpg'

In [None]:
im = Image.open(fn2)
im = im.convert('L')
im

In [None]:
# im=np.reshape(drz['data'][-1],(85,-1))

fft = np.fft.fft2(im)
real = np.real(fft)
# NOTE(review): for a complex ndarray np.real presumably returns a view, in
# which case this also zeroes the real part of fft[0,0] - confirm intended
real[0,0] =0
imag = np.imag(fft)
fshift = fft.copy()
# move the zero-frequency component to the centre of the spectrum
fshift = np.fft.fftshift(fshift)
#fshift = np.fft.ifftshift(fshift)
magnitude_spectrum = 20*np.log(np.abs(fshift))
# smallest and largest value that is not +/-inf
mm = [np.unique(magnitude_spectrum[~np.isinf(magnitude_spectrum)])[i] for i in [0,-1]]
# min-max normalisation; without the parentheses the expression reduces to
# magnitude_spectrum / mm[1]
norm = (magnitude_spectrum - mm[0]) / (mm[1] - mm[0])

# clip to [0, 1]
norm[norm<0] = 0
norm[norm>1] = 1

In [None]:
#plt.imshow(np.real(fft))
#plt.imshow(np.real(real))
plt.imshow(norm)

In [None]:
[np.unique(np.ravel(magnitude_spectrum))[i] for i in [0,1,-2,-1]]

In [None]:
norm.max()


In [None]:
plt.imshow(np.real(np.fft.ifft2(fft))*-1)

In [None]:
plt.imshow(np.real(np.fft.ifft2(real))*-1)