In [1]:
from utils import pickle_load
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import pandas as pd
import os
from glob import glob
from evaluate import compute_sim, evaluate, l2norm_numpy
from evaluate import dict2list

def l2norm(embs):
    return {k: v/np.linalg.norm(v) for k, v in embs.items()}

infer_dirs = glob('infer/*')[:]
# infer_dirs = infer_dirs[1:]
infer_dirs = [x for x in infer_dirs if 'bk' not in x and 'eval' not in x]
weights = [1.0] * len(infer_dirs)
# weights = [1.0, 0.5, 0.5]

print(f"Ensemble of {len(infer_dirs)} models")

submit_file = f'submission.csv'
if os.path.exists('D:/whale_data/train_images-384-384/train_images-384-384'):
    train_img_dir = 'D:/whale_data/train_images-384-384/train_images-384-384'
else:
    train_img_dir = '/Users/macbook/works/train_images-384-384'

norm=True
method = 'cat'

def get_emb(infer_dirs, subset):
    li = []
    for infer_dir in infer_dirs:
        if isinstance(subset, str):
            embs = pickle_load(f"{infer_dir}/{subset}_emb.pkl")
        else:
            embs = {}
            for s in subset:
                p = f"{infer_dir}/{s}_emb.pkl"
                if os.path.exists(p):
                    embs = {**embs, **pickle_load(p)}
                elif s != "val":
                    raise FileNotFoundError(p)

        li.append(embs)

    if len(li) == 1:
        return li[0]

    print(f'Ensemble from {len(infer_dirs)} models {infer_dirs}')
    li0 = li[0]
    di = {}
    for k in li0.keys():
        di[k] = []
        for i in range(len(li)):
            e = li[i][k]
            di[k].append(e)

    return di

train_df = pd.read_csv('data/train_kfold.csv')


Ensemble of 5 models


In [2]:
from sklearn.preprocessing import normalize, QuantileTransformer

def add_border(img, color):
    bordersize = 14
    return cv2.copyMakeBorder(
        img,
        top=bordersize,
        bottom=bordersize,
        left=bordersize,
        right=bordersize,
        borderType=cv2.BORDER_CONSTANT,
        value=color
    )

In [6]:
nens = len(infer_dirs)
def load(infer_dirs, subset):
    embs = get_emb(infer_dirs, subset)
    k, v = dict2list(embs)

    # for i in range(nens):
    #     v[:, i, :] = normalize(v[:, i, :])
    # v = v.reshape(-1, 512 * nens)
    return k, v

In [8]:
# assert False
from sklearn.neighbors import NearestNeighbors
n_neighbors = 100
knn = NearestNeighbors(n_neighbors=n_neighbors,metric='cosine')

train_k, train_v = load(infer_dirs, 'train')
test_k, test_v = load(infer_dirs, 'test')

Ensemble from 5 models ['infer\\b5_cutout_800_lb0.832', 'infer\\b5_pseudo_cutout_lb0.836', 'infer\\b5_pseudo_lb0.832', 'infer\\b5_pseudo_lb0.833', 'infer\\b6_pseudo_lb0.83x']
Ensemble from 5 models ['infer\\b5_cutout_800_lb0.832', 'infer\\b5_pseudo_cutout_lb0.836', 'infer\\b5_pseudo_lb0.832', 'infer\\b5_pseudo_lb0.833', 'infer\\b6_pseudo_lb0.83x']


In [13]:
from tqdm.auto import tqdm

distances_list = []
idxs_list = []
for i in tqdm(range(nens)):
    traini, testi = train_v[:, i, :], test_v[:, i, :]
    f = QuantileTransformer(output_distribution="normal")
    f.fit(testi)
    traini = f.transform(traini)
    testi = f.transform(testi)
    knn.fit(traini)
    d, idx = knn.kneighbors(testi, n_neighbors, return_distance=True)
    distances_list.append(d)
    idxs_list.append(idx)

100%|██████████| 5/5 [05:50<00:00, 70.00s/it]


In [35]:
distances = np.stack(distances_list)
idxs = np.stack(idxs_list)

In [None]:
from tqdm import tqdm

test_df = []
train_k = np.asarray(train_k)
img2id = dict(zip(train_df.image, train_df.individual_id))

for i in tqdm(range(len(test_k))):
    dist, idx = distances[i], idxs[i]
    for d, id in zip(dist, idx):
        img_id = train_k[id]
        tar = img2id[img_id]
        test_df.append([test_k[i], tar, d])

    
cols = ['image', 'target', 'distances']
test_df = pd.DataFrame(test_df, columns=cols)
raw_test_df = test_df.copy()
test_df['confidence'] = 1-test_df['distances']
test_df = test_df.groupby(cols[:2]).confidence.max().reset_index()
test_df = test_df.sort_values('confidence',ascending=False).reset_index(drop=True)
test_df.to_csv('test_neighbors.csv')
test_df.head(3)

100%|██████████| 27956/27956 [00:03<00:00, 7072.05it/s]


Unnamed: 0,image,target,confidence
0,dd5f7eb1cbe207.jpg,84a261c0e5cf,0.999468
1,7a785b700b0339.jpg,c93996835aa8,0.999455
2,fe41a5bf4593bc.jpg,84a261c0e5cf,0.999355


In [None]:
def accum(x):
    # if len(x) > 3 and np.max(x) < 0.51:
    #     return np.max(x) - 0.002 * len(x)
    r = np.sum([v ** (8 * (i + 1)) for i, v in enumerate(x)])
    return r

raw_test_df['confidence'] = 1 - raw_test_df['distances']
test_df = raw_test_df.groupby(['image','target']).confidence.agg(accum).reset_index()
test_df = test_df.sort_values(['confidence'], ascending=False).reset_index(drop=True)
test_df.head(3)

Unnamed: 0,image,target,confidence
0,fb4f83e1a357ed.jpg,6a3af6e0c55c,24.20088
1,5acba135e9222c.jpg,6a3af6e0c55c,23.92304
2,bcd9dfc85d2d7a.jpg,6a3af6e0c55c,23.779226


In [None]:
xt = test_df.groupby('image').confidence.max().reset_index()
for thr in np.arange(0.0, 0.55, 0.0001):
    r = len(xt[xt['confidence'] <= thr]) / len(xt)
    if abs(r - 0.22) < 0.005:
        print(f"THR: {thr}: {r}")
        break

THR: 0.0039000000000000003: 0.21548147088281586


In [None]:
predictions = {}
sample_list = ['938b7e931166', '5bf17305f073', '7593d2aee842', '7362d7a01d00','956562ff2888']

for i,row in tqdm(test_df.iterrows(), total=len(test_df)):
    if row.image in predictions:
        if len(predictions[row.image])==5:
            continue
        predictions[row.image].append(row.target)
    elif row.confidence>0.0:
        predictions[row.image] = [row.target,'new_individual']
    else:
        predictions[row.image] = ['new_individual',row.target]

c = 0
for x in tqdm(predictions):
    if len(predictions[x])<5:
        c+=1
        remaining = [y for y in sample_list if y not in predictions]
        predictions[x] = predictions[x]+remaining
        predictions[x] = predictions[x][:5]
    predictions[x] = ' '.join(predictions[x])

print(c)
predictions = pd.Series(predictions).reset_index()
predictions.columns = ['image','predictions']
predictions.to_csv('submissionv2.csv',index=False)
predictions.head()

100%|██████████| 1207833/1207833 [01:00<00:00, 19947.78it/s]
100%|██████████| 27956/27956 [00:00<00:00, 1330745.32it/s]

1104





Unnamed: 0,image,predictions
0,fb4f83e1a357ed.jpg,6a3af6e0c55c new_individual 938b7e931166 5bf17...
1,5acba135e9222c.jpg,6a3af6e0c55c new_individual 938b7e931166 5bf17...
2,bcd9dfc85d2d7a.jpg,6a3af6e0c55c new_individual 938b7e931166 5bf17...
3,78773f7edcf992.jpg,6a3af6e0c55c new_individual 938b7e931166 5bf17...
4,0454ac14150afc.jpg,6a3af6e0c55c new_individual 938b7e931166 5bf17...


In [None]:
v1 = pd.read_csv('submission.csv')
v2 = predictions

m = v1.merge(v2, on='image', how='left')
(m.predictions_x == m.predictions_y).mean()

0.6054871941622549