In [1]:
from utils import pickle_load
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import pandas as pd
import os
from glob import glob
from evaluate import compute_sim, evaluate, l2norm_numpy
from evaluate import dict2list

def l2norm(embs):
    """Return a new dict with every value of ``embs`` divided by its own norm.

    The norm is ``np.linalg.norm`` called with default arguments. Values whose
    norm is zero are not special-cased.
    """
    scaled = {}
    for key, vec in embs.items():
        scaled[key] = vec / np.linalg.norm(vec)
    return scaled

# Model output directories to ensemble. glob() returns paths in arbitrary,
# platform-dependent order, so sort them: the order of `infer_dirs` decides the
# order in which per-model embeddings are stacked further down.
infer_dirs = sorted(glob('infer/*'))
# infer_dirs = infer_dirs[1:]
# Skip any path containing 'bk' or 'eval'.
infer_dirs = [x for x in infer_dirs if 'bk' not in x and 'eval' not in x]
# One weight per model (uniform by default).
weights = [1.0] * len(infer_dirs)
# weights = [1.0, 0.5, 0.5]

print(f"Ensemble of {len(infer_dirs)} models")

submit_file = 'submission.csv'
# Use the Windows data directory when it exists, otherwise the macOS one.
if os.path.exists('D:/whale_data/train_images-384-384/train_images-384-384'):
    train_img_dir = 'D:/whale_data/train_images-384-384/train_images-384-384'
else:
    train_img_dir = '/Users/macbook/works/train_images-384-384'

norm = True
method = 'cat'

def get_emb(infer_dirs, subset):
    """Load the ``{subset}_emb.pkl`` embedding dicts from each model directory.

    Args:
        infer_dirs: iterable of directory paths, one per model.
        subset: either one subset name (str), or an iterable of subset names
            whose pickles are merged into one dict per model (later subsets
            overwrite earlier ones on key collisions).

    Returns:
        With exactly one model, that model's dict unchanged. Otherwise a dict
        mapping every key of the first model to the list of the values found
        under that key in each model, in ``infer_dirs`` order.

    Raises:
        FileNotFoundError: a pickle for a subset other than ``"val"`` is
            missing (only checked when ``subset`` is not a str).
        KeyError: a key of the first model is absent from a later model.
    """
    per_model = []
    for model_dir in infer_dirs:
        if isinstance(subset, str):
            merged = pickle_load(f"{model_dir}/{subset}_emb.pkl")
        else:
            merged = {}
            for name in subset:
                path = f"{model_dir}/{name}_emb.pkl"
                if not os.path.exists(path):
                    # Only the "val" pickle is allowed to be missing.
                    if name != "val":
                        raise FileNotFoundError(path)
                    continue
                merged = {**merged, **pickle_load(path)}

        per_model.append(merged)

    if len(per_model) == 1:
        return per_model[0]

    print(f'Ensemble from {len(infer_dirs)} models {infer_dirs}')
    first = per_model[0]
    return {key: [model[key] for model in per_model] for key in first.keys()}

# Training table; a later cell reads its `image` and `individual_id` columns.
train_df = pd.read_csv(filepath_or_buffer='data/train_kfold.csv')


Ensemble of 4 models


In [2]:
from sklearn.preprocessing import normalize, QuantileTransformer

def add_border(img, color, bordersize=14):
    """Surround ``img`` with a constant-colour border via ``cv2.copyMakeBorder``.

    Args:
        img: image passed straight through to ``cv2.copyMakeBorder``.
        color: border colour, passed as the ``value`` argument.
        bordersize: border width applied to all four sides. Defaults to 14,
            the value that used to be hard-coded.

    Returns:
        Whatever ``cv2.copyMakeBorder`` returns for the padded image.
    """
    return cv2.copyMakeBorder(
        img,
        top=bordersize,
        bottom=bordersize,
        left=bordersize,
        right=bordersize,
        borderType=cv2.BORDER_CONSTANT,
        value=color,
    )

In [3]:
def load(infer_dirs, subset):
    """Load per-model embeddings and concatenate them into one row per sample.

    Each model's embeddings are normalised with ``normalize`` before the
    per-model blocks are laid side by side.

    Args:
        infer_dirs: model directories, forwarded to ``get_emb``.
        subset: subset name(s), forwarded to ``get_emb``.

    Returns:
        ``(k, v)`` where ``k`` is the key list from ``dict2list`` and ``v`` is
        a 2-D array of shape ``(n_samples, n_models * emb_dim)``.
    """
    nens = len(infer_dirs)
    embs = get_emb(infer_dirs, subset)
    # NOTE(review): assumes dict2list returns (keys, array-like of the dict
    # values in matching order) -- confirm against evaluate.dict2list.
    k, v = dict2list(embs)
    v = np.asarray(v)

    # With a single model get_emb returns one vector per key rather than a
    # list of vectors, so add the missing model axis.
    if v.ndim == 2:
        v = v[:, np.newaxis, :]

    for i in range(nens):
        v[:, i, :] = normalize(v[:, i, :])

    # Take the embedding width from the data instead of assuming 512.
    v = v.reshape(v.shape[0], v.shape[1] * v.shape[2])
    return k, v

In [14]:
# assert False
from sklearn.neighbors import NearestNeighbors
n_neighbors = 100
knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')

train_k, train_v = load(infer_dirs, 'train')
test_k, test_v = load(infer_dirs, 'test')
quantile = True
# Fixed random_state: QuantileTransformer draws random numbers when it
# subsamples the data to estimate quantiles, so without a seed the transformed
# embeddings (and the neighbours found below) can change between runs.
f = QuantileTransformer(output_distribution="normal", random_state=0)
if quantile:
    # The transformer is fitted on the test embeddings only, then applied to
    # both sets so they share one mapping.
    f.fit(test_v)
    train_v = f.transform(train_v)
    test_v = f.transform(test_v)
knn.fit(train_v)

Ensemble from 4 models ['infer\\b5_pseudo_cutout_lb0.836', 'infer\\b5_pseudo_lb0.833', 'infer\\b5_pseudo_m0.6_lb0.842', 'infer\\b6_pseudo_lb0.83x']
Ensemble from 4 models ['infer\\b5_pseudo_cutout_lb0.836', 'infer\\b5_pseudo_lb0.833', 'infer\\b5_pseudo_m0.6_lb0.842', 'infer\\b6_pseudo_lb0.83x']


NearestNeighbors(metric='cosine', n_neighbors=100)

In [15]:
# For every test embedding: cosine distances to, and row indices of, its
# n_neighbors nearest training embeddings.
distances, idxs = knn.kneighbors(
    test_v,
    n_neighbors=n_neighbors,
    return_distance=True,
)

In [16]:
from tqdm import tqdm

train_k = np.asarray(train_k)
# Training image name -> individual id.
img2id = dict(zip(train_df.image, train_df.individual_id))

# One [test image, neighbour's individual id, distance] row per neighbour.
# The rows go into their own list (not `test_df`), and the index variable is
# not called `id`, so the builtin `id` is not shadowed for the rest of the
# notebook.
neighbor_rows = []
for i in tqdm(range(len(test_k))):
    for d, train_idx in zip(distances[i], idxs[i]):
        neighbor_rows.append([test_k[i], img2id[train_k[train_idx]], d])

cols = ['image', 'target', 'distances']
raw_test_df = pd.DataFrame(neighbor_rows, columns=cols)
test_df = raw_test_df.copy()
test_df['confidence'] = 1 - test_df['distances']
# Keep the best confidence per (image, target) pair, highest first.
test_df = test_df.groupby(cols[:2]).confidence.max().reset_index()
test_df = test_df.sort_values('confidence', ascending=False).reset_index(drop=True)
test_df.to_csv('test_neighbors.csv')
test_df.head(3)

100%|██████████| 27956/27956 [00:04<00:00, 6482.07it/s]


Unnamed: 0,image,target,confidence
0,fe41a5bf4593bc.jpg,84a261c0e5cf,0.999504
1,dd5f7eb1cbe207.jpg,84a261c0e5cf,0.999496
2,7a785b700b0339.jpg,c93996835aa8,0.999414


In [17]:
def accum(x):
    """Sum the items of ``x`` after raising the i-th (0-based) item to ``8 * (i + 1)``."""
    # Disabled alternative kept from the original for reference:
    # if len(x) > 3 and np.max(x) < 0.51:
    #     return np.max(x) - 0.002 * len(x)
    terms = []
    for pos, val in enumerate(x, start=1):
        terms.append(val ** (8 * pos))
    return np.sum(terms)

# Confidence is one minus the cosine distance (stored on raw_test_df too).
raw_test_df['confidence'] = 1 - raw_test_df['distances']
# Best confidence per (image, target) pair, ordered from most to least confident.
best_per_pair = raw_test_df.groupby(['image', 'target'])['confidence'].max()
test_df = (
    best_per_pair
    .reset_index()
    .sort_values(['confidence'], ascending=False)
    .reset_index(drop=True)
)
test_df.head(3)

Unnamed: 0,image,target,confidence
0,fe41a5bf4593bc.jpg,84a261c0e5cf,0.999504
1,dd5f7eb1cbe207.jpg,84a261c0e5cf,0.999496
2,7a785b700b0339.jpg,c93996835aa8,0.999414


In [18]:
# Top confidence per test image.
xt = test_df.groupby('image')['confidence'].max().reset_index()
n_images = len(xt)
# Scan thresholds upwards and report the first one at which the share of
# images whose top confidence is <= thr lies within 0.005 of 0.22.
for thr in np.arange(0.0, 0.55, 0.0001):
    n_below = int((xt['confidence'] <= thr).sum())
    r = n_below / n_images
    if abs(r - 0.22) >= 0.005:
        continue
    print(f"THR: {thr}: {r}")
    break

THR: 0.49110000000000004: 0.21505222492488196


In [19]:
predictions = {}
# Fallback ids used to pad any image that ends up with fewer than 5 predictions.
sample_list = ['938b7e931166', '5bf17305f073', '7593d2aee842', '7362d7a01d00','956562ff2888']

# test_df is sorted by confidence (descending), so the first row seen for an
# image is its best match. 'new_individual' goes second when that best match
# is confident (> 0.5) and first otherwise; later rows fill up to 5 slots.
# itertuples is used instead of iterrows: same attribute access, much faster.
for row in tqdm(test_df.itertuples(index=False), total=len(test_df)):
    if row.image in predictions:
        if len(predictions[row.image]) < 5:
            predictions[row.image].append(row.target)
    elif row.confidence > .5:
        predictions[row.image] = [row.target, 'new_individual']
    else:
        predictions[row.image] = ['new_individual', row.target]

c = 0  # number of images that needed padding
for x in tqdm(predictions):
    if len(predictions[x]) < 5:
        c += 1
        # Pad only with ids this image does not already list. The membership
        # test has to be against predictions[x]; testing against the
        # `predictions` dict (keyed by image name) never filters anything and
        # can produce duplicate ids in one row.
        remaining = [y for y in sample_list if y not in predictions[x]]
        predictions[x] = (predictions[x] + remaining)[:5]
    predictions[x] = ' '.join(predictions[x])
predictions = pd.Series(predictions).reset_index()
predictions.columns = ['image', 'predictions']
submit_file = 'submission_mean.csv'
if quantile:
    submit_file = 'submission_quantile.csv'
predictions.to_csv(submit_file, index=False)
predictions.head()

100%|██████████| 1183022/1183022 [01:00<00:00, 19482.32it/s]
100%|██████████| 27956/27956 [00:00<00:00, 965673.69it/s]

1115





Unnamed: 0,image,predictions
0,fe41a5bf4593bc.jpg,84a261c0e5cf new_individual cf0aca801a93 a8e1f...
1,dd5f7eb1cbe207.jpg,84a261c0e5cf new_individual cf0aca801a93 a8e1f...
2,7a785b700b0339.jpg,c93996835aa8 new_individual d36d5a07500f 39af3...
3,8de09ac45aa1df.jpg,7fdeba948ee8 new_individual 0f35764e14aa 5da93...
4,e9abb76a5bed89.jpg,35f898e6595e new_individual c737ccb75e16 524b4...


In [22]:
# v1 = pd.read_csv('C:/Users/msi/Downloads/effnetv1_b5_avg_768_m0.5_drop0.3_cutout_full_pseudo_submission.csv')
# v2 = pd.read_csv('C:/Users/msi/Downloads/effnetv1_b5_avg_768_full_pseudo_m0.5_lseps0.01_submission.csv')
# Compare the top-1 prediction of the two submissions written by the cell
# above (one file per run, depending on `quantile`).
v1 = pd.read_csv('submission_quantile.csv')
v2 = pd.read_csv('submission_mean.csv')
# `n` must be passed by keyword: in pandas >= 2.0 every argument of
# Series.str.split other than `pat` is keyword-only, so `.str.split(' ', 1)`
# raises a TypeError there.
v1['top1'] = v1['predictions'].str.split(' ', n=1).str[0]
v2['top1'] = v2['predictions'].str.split(' ', n=1).str[0]
m = v1.merge(v2, on='image', how='left')
# Share of images on which both submissions agree on the first prediction.
(m.top1_x == m.top1_y).mean()
# (m.predictions_x == m.predictions_y).mean()

0.9932393761625411