# Model Evaluation and Prediction

This notebook evaluates the models we trained and uses them to make predictions:
1. CNN
2. ArcFace
3. TF-IDF
4. ArcFace + TF-IDF

Reference: https://www.kaggle.com/ragnar123/unsupervised-baseline-arcface

In [41]:
import re
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB3
from sklearn import metrics
#from tensorflow.keras import backend as K
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
import util

## Parameters

In [29]:
# Input data: the CSV read by get_dataset_csv() and the directory that is
# prefixed to every image file name.
TRAIN_CSV_PATH = '../../data/train.csv'
TRAIN_IMG_DIR = '../../data/train_images/'
# Locations of the trained model files (ArcFace weights and the plain CNN model).
ARCFACE_MODEL_PATH = './trained/arcface_best_epoch_512_42.h5'
CNN_MODEL_PATH = './trained/CNN.h5'
# Number of classes passed to the ArcFace margin head in load_model().
N_CLASSES = 11014
# Image size handed to util.decode_image and used as the spatial input shape
# of the ArcFace model.
IMAGE_SIZE = [512,512]
# Batch size of the tf.data image pipeline built by get_dataset_img().
BATCH_SIZE = 8

# Let tf.data pick the parallelism / prefetch buffer size on its own.
AUTO = tf.data.experimental.AUTOTUNE

## Function Definition

In [30]:
def get_dataset_csv():
    '''
    Load the training table and attach the ground-truth matches.

    Reads TRAIN_CSV_PATH and adds a `matches` column that holds, for every
    row, the space-separated `posting_id`s sharing that row's `label_group`.

    Returns:
        (train, img_paths): the augmented DataFrame and a Series of image
        paths built by prefixing TRAIN_IMG_DIR to the `image` column.
    '''
    records = pd.read_csv(TRAIN_CSV_PATH)

    # label_group -> array of the distinct posting ids in that group
    ids_by_group = records.groupby(['label_group'])['posting_id'].unique().to_dict()
    records['matches'] = records['label_group'].map(ids_by_group).apply(' '.join)

    image_files = TRAIN_IMG_DIR + records['image']
    return records, image_files

In [31]:
def read_image(image):
    '''
    Load a single image file and decode it.

    `image` is the path of the file; its raw bytes are passed to
    `util.decode_image` together with IMAGE_SIZE.
    '''
    raw_bytes = tf.io.read_file(image)
    return util.decode_image(raw_bytes, IMAGE_SIZE)

# Input pipeline that turns image paths into batches of decoded images
def get_dataset_img(image):
    '''
    Build the batched image dataset.

    `image` is the collection of image paths. Each path is decoded with
    `read_image` (parallelism chosen by AUTO), grouped into batches of
    BATCH_SIZE and prefetched.
    '''
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

In [32]:
def load_model(which='arcface'):
    '''
    Load a pretrained image model.

    Args:
        which: 'arcface' (default) or 'cnn'.

    Returns:
        For 'cnn', the Keras model stored at CNN_MODEL_PATH.
        For 'arcface', a model that maps an image batch to the globally
        average-pooled EfficientNetB3 features, with weights restored from
        ARCFACE_MODEL_PATH.

    Raises:
        AssertionError: if `which` is neither 'arcface' nor 'cnn'.
    '''
    assert which in ('arcface', 'cnn'), 'which==\'arcface\' or \'cnn\''

    if which == 'cnn':
        print('Reminder: image size = (224,224)')
        # Use the shared constant instead of repeating the path literal.
        return tf.keras.models.load_model(CNN_MODEL_PATH)

    # Rebuild the full image+label graph (backbone -> pooling -> ArcFace
    # margin -> softmax) before calling load_weights, then keep only the
    # part needed to produce embeddings.
    inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
    label = tf.keras.layers.Input(shape = (), name = 'inp2')
    features = EfficientNetB3(weights = 'imagenet', include_top = False)(inp)
    embedding = tf.keras.layers.GlobalAveragePooling2D()(features)

    margin = util.ArcMarginProduct(
        n_classes = N_CLASSES,
        s = 30,
        m = 0.5,
        name='head/arc_margin',
        dtype='float32'
    )
    logits = margin([embedding, label])
    output = tf.keras.layers.Softmax(dtype='float32')(logits)

    model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
    model.load_weights(ARCFACE_MODEL_PATH)

    # Expose the pooled features directly instead of picking a layer by
    # position (`model.layers[-4]`), which silently depends on how Keras
    # happens to order the layers of the graph.
    return tf.keras.models.Model(inputs = inp, outputs = embedding)

In [33]:
def embed_images(img_paths, model):
    '''
    Compute an embedding for every image.

    Args:
        img_paths: image file paths, turned into a dataset by get_dataset_img.
        model: model whose `predict` output is used as the embedding.

    Returns:
        The result of `model.predict` on the image dataset.
    '''
    return model.predict(get_dataset_img(img_paths))

In [56]:
def embed_titles(df, max_features = 15500):
    '''
    Vectorise the `title` column with a binary TF-IDF model.

    Args:
        df: DataFrame with a `title` column.
        max_features: vocabulary size limit passed to TfidfVectorizer.

    Returns:
        The matrix produced by fitting and transforming the titles
        (English stop words removed).
    '''
    vectorizer = TfidfVectorizer(
        stop_words = 'english',
        binary = True,
        max_features = max_features,
    )
    return vectorizer.fit_transform(df['title'])

In [64]:
def _matches_within(distances, indices, posting_ids, threshold):
    '''
    For every row, return the posting ids of the neighbours whose distance
    is strictly below `threshold`.
    '''
    return [posting_ids[idx_row[dist_row < threshold]]
            for dist_row, idx_row in zip(distances, indices)]


def nearest_neighbor(df,
                     embeddings,
                     thresholds = np.arange(0.3,0.6,0.01),
                     n_neighbors = 50):
    '''
    Find the best cosine-distance threshold and predict matches with it.

    1. Runs a k-nearest-neighbour search of `embeddings` against itself.
    2. For every threshold, treats the neighbours closer than the threshold
       as predicted matches and scores them with `util.f1_score` against
       `df['matches']`.
    3. Re-creates the predictions with the best threshold (highest mean F1,
       smallest threshold on ties).

    Args:
        df: DataFrame with `posting_id` and `matches` columns, row-aligned
            with `embeddings`.
        embeddings: one embedding per row of `df`.
        thresholds: candidate distance thresholds to evaluate.
        n_neighbors: neighbours retrieved per row; capped at the number of
            rows so that small inputs do not make the search fail.

    Returns:
        (df, best_threshold, best_f1, best_preds) where `best_preds` holds,
        per row, the array of predicted posting ids. `df` is modified in
        place: its `pred_matches` and `f1` columns describe the best
        threshold.

    Raises:
        ValueError: if `thresholds` is empty.
    '''
    thresholds = list(thresholds)
    if not thresholds:
        raise ValueError('thresholds must contain at least one value')

    n_samples = embeddings.shape[0]
    knn = NearestNeighbors(n_neighbors = min(n_neighbors, n_samples), metric='cosine')
    knn.fit(embeddings)
    distances, indices = knn.kneighbors(embeddings)

    # Pull the ids out once; the neighbour indices are positional.
    posting_ids = df['posting_id'].to_numpy()

    scores = []
    for threshold in thresholds:
        matched = _matches_within(distances, indices, posting_ids, threshold)
        df['pred_matches'] = [' '.join(ids) for ids in matched]
        df['f1'] = util.f1_score(df['matches'], df['pred_matches'])
        score = df['f1'].mean()
        print(f'Threshold:{threshold} - F1: {score}')
        scores.append(score)

    df_thresholds = pd.DataFrame({'threshold':thresholds,'f1':scores})
    best_f1 = df_thresholds.f1.max()
    best_threshold = df_thresholds[df_thresholds.f1==best_f1].threshold.min()
    print('-'*100)
    print(f'The best threshold is {best_threshold} with an f1 of {best_f1}')

    best_preds = _matches_within(distances, indices, posting_ids, best_threshold)

    # The sweep leaves the columns at the LAST threshold tried; rewrite them
    # so the returned frame agrees with the returned threshold and predictions.
    df['pred_matches'] = [' '.join(ids) for ids in best_preds]
    df['f1'] = util.f1_score(df['matches'], df['pred_matches'])

    return df, best_threshold, best_f1, best_preds

## Evaluate and Predict

In [37]:
# Load the training table (with the ground-truth `matches` column) and the
# path of every image.
df, img_paths = get_dataset_csv()

In [38]:
# Build the ArcFace embedding model (load_model also accepts 'cnn').
model = load_model('arcface')

In [39]:
# Embed every training image with the loaded model.
# 00:06:00 -- presumably the observed run time of this cell; TODO confirm
image_embeddings = embed_images(img_paths, model)

Image embeddings shape is (34250, 1536)


In [76]:
# Sweep the distance thresholds on the image embeddings; keep the best
# threshold, its mean F1 and the per-row predictions.
df, img_threshold, img_f1, img_preds = nearest_neighbor(
    df,
    image_embeddings,
    thresholds = np.arange(0.3,0.6,0.01),
)

Threshold:0.3 - F1: 0.807931808369746
Threshold:0.31 - F1: 0.815814757157693
Threshold:0.32 - F1: 0.8227750264643898
Threshold:0.33 - F1: 0.8296851575414907
Threshold:0.34 - F1: 0.836653371502557
Threshold:0.35000000000000003 - F1: 0.8435490339690283
Threshold:0.36000000000000004 - F1: 0.8497741344366871
Threshold:0.37000000000000005 - F1: 0.8565898347407793
Threshold:0.38000000000000006 - F1: 0.8629129501399706
Threshold:0.39000000000000007 - F1: 0.868926199591246
Threshold:0.4000000000000001 - F1: 0.8743222944555005
Threshold:0.4100000000000001 - F1: 0.8795463257153483
Threshold:0.4200000000000001 - F1: 0.8842590815368498
Threshold:0.4300000000000001 - F1: 0.8891689378633886
Threshold:0.4400000000000001 - F1: 0.8929264811939516
Threshold:0.4500000000000001 - F1: 0.8967264739132388
Threshold:0.46000000000000013 - F1: 0.9002619797067279
Threshold:0.47000000000000014 - F1: 0.9035653413378164
Threshold:0.48000000000000015 - F1: 0.9062124738966897
Threshold:0.49000000000000016 - F1: 0.908

In [None]:
# TF-IDF embeddings of the posting titles.
title_embeddings = embed_titles(df)

In [66]:
# Same threshold sweep as for the images, this time on the title embeddings.
df, text_threshold, title_f1, title_preds = nearest_neighbor(
    df,
    title_embeddings,
    thresholds = np.arange(0.3,0.6,0.01),
)

Threshold:0.3 - F1: 0.630776547251398
Threshold:0.31 - F1: 0.6352270544040849
Threshold:0.32 - F1: 0.6385704191694083
Threshold:0.33 - F1: 0.6425737224521486
Threshold:0.34 - F1: 0.6461641669803598
Threshold:0.35000000000000003 - F1: 0.6496528451842519
Threshold:0.36000000000000004 - F1: 0.653089439006744
Threshold:0.37000000000000005 - F1: 0.6557794983689451
Threshold:0.38000000000000006 - F1: 0.6579680043339512
Threshold:0.39000000000000007 - F1: 0.6598584657763932
Threshold:0.4000000000000001 - F1: 0.6616317472597671
Threshold:0.4100000000000001 - F1: 0.6635153361547791
Threshold:0.4200000000000001 - F1: 0.6651819166264529
Threshold:0.4300000000000001 - F1: 0.6659392906886799
Threshold:0.4400000000000001 - F1: 0.6660580341009082
Threshold:0.4500000000000001 - F1: 0.6656979203004432
Threshold:0.46000000000000013 - F1: 0.6644144768688384
Threshold:0.47000000000000014 - F1: 0.6638981909620011
Threshold:0.48000000000000015 - F1: 0.6619719121488122
Threshold:0.49000000000000016 - F1: 0.6

In [69]:
def combine_predictions(row):
    '''
    Merge a row's image-based and title-based predictions.

    Takes the union of `row['pred_img']` and `row['pred_title']` (sorted,
    duplicates removed) and returns it as one space-separated string.
    '''
    merged = np.concatenate((row['pred_img'], row['pred_title']), axis=None)
    return ' '.join(np.unique(merged))

In [91]:
# Store the predictions of both models on the frame, then take their union
# row by row with combine_predictions.
df['pred_img'] = img_preds
df['pred_title'] = title_preds
final_pred = df.apply(combine_predictions, axis=1)

In [92]:
# Score the combined predictions against the ground truth and report the mean.
# Note: this overwrites the `f1` column written earlier by nearest_neighbor.
df['f1'] = util.f1_score(df['matches'], final_pred)
score = df['f1'].mean()
print(f'ArcFace+tfidf f1: {score}')

ArcFace+tfidf f1: 0.8397749437764472
