# Import Libraries

In [None]:
import os
import math
import random
import time

import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from glob import glob
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import optuna

from pandas_profiling import ProfileReport

# Configuration

In [None]:
# Notebook-wide settings; 'seed' is the value handed to seed_everything().
CFG = dict(seed=1337)

# Helper Functions

In [None]:
def seed_everything(seed: int = 1337):
    """Seed the random sources used by this notebook.

    Writes ``seed`` to the ``PYTHONHASHSEED`` environment variable and seeds
    both the standard-library ``random`` module and NumPy's global generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [None]:
# Apply the configured seed before any randomised step runs.
seed_everything(seed=CFG['seed'])

# Load Train Data

In [None]:
# Merged metadata table; later cells read its 'image_id' and 'label' columns.
train = pd.read_csv(filepath_or_buffer='../input/cassava-leaf-disease-merged/merged.csv')

In [None]:
# Assign every row to one of five validation folds, stratified on 'label'.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=CFG['seed'])

train['fold'] = -1
fold_column = train.columns.get_loc('fold')
for fold, (_, val_idx) in enumerate(skf.split(np.arange(train.shape[0]), train['label'].values)):
    # val_idx is an array of row *positions*. `.at` only addresses a single
    # scalar cell, so write the whole fold through the positional indexer.
    train.iloc[val_idx, fold_column] = fold

In [None]:
# Key the table by image id so rows can be fetched with train.loc[image_id].
train = train.set_index(keys='image_id')
train.head(n=5)

# Load predictions

## Align image_id

In [None]:
def align_image_ids(train: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Reorder a prediction table so its rows follow ``train.index``.

    Args:
        train: Frame whose index holds the image ids in the desired order.
        df: Predictions with an ``image_id`` column and the columns
            ``logits0`` .. ``logits4``. Other columns are ignored.

    Returns:
        A new frame with the columns ``image_id``, ``logits0`` .. ``logits4``
        and one row per entry of ``train.index``, in that order.

    Raises:
        KeyError: If an id from ``train.index`` does not occur in ``df``.
        ValueError: If a requested id occurs more than once in ``df``.
    """
    logit_columns = [f'logits{i}' for i in range(5)]
    wanted_ids = list(train.index)

    # A single label-based lookup replaces the per-row Python loop.
    aligned = df.set_index('image_id').loc[wanted_ids, logit_columns]

    # Duplicate ids in df would silently yield extra rows and shift every
    # following prediction away from its label.
    if len(aligned) != len(wanted_ids):
        raise ValueError("df contains duplicate 'image_id' values for the requested ids")

    aligned = aligned.reset_index(drop=True)
    aligned.insert(0, 'image_id', wanted_ids)
    return aligned

In [None]:
def get_predictions(preds):
    """Return the five ``logits*`` columns of ``preds`` as a single array.

    Column ``i`` of the result holds ``preds['logits{i}']``.
    """
    per_class = [preds[f'logits{class_idx}'] for class_idx in range(5)]
    return np.transpose(np.array(per_class))

In [None]:
def calc_p(preds, a0, a1, a2, a3, a4, a5, a6):
    """Blend seven prediction arrays with scalar weights and pick the top class.

    ``preds[i]`` is scaled by ``a{i}``, the scaled arrays are summed, and the
    arg-max along axis 1 of that sum is returned.
    """
    coefficients = (a0, a1, a2, a3, a4, a5, a6)
    blended = coefficients[0] * preds[0]
    for position in range(1, len(coefficients)):
        blended = blended + coefficients[position] * preds[position]
    return blended.argmax(1)

In [None]:
cv_preds = sorted(glob(os.path.join('../input/leaf-disease-validation', '*.csv')))[:-1]
cv_preds

In [None]:
cv_dfs = [align_image_ids(train, pd.read_csv(path)) for path in cv_preds]

In [None]:
preds = [
    get_predictions(cv_dfs[0]),
    get_predictions(cv_dfs[1]),
    get_predictions(cv_dfs[2]),
    get_predictions(cv_dfs[3]),
    get_predictions(cv_dfs[4]),
    get_predictions(cv_dfs[5]),
    get_predictions(cv_dfs[6]),  # corrected one
]

In [None]:
labels = np.asarray([train.loc[image_id]['label'] for image_id in cv_dfs[0]['image_id'].values])

In [None]:
# # cutting 2019 data
# preds = [pred[:-4940, ...] for pred in preds]
# labels = labels[:-4940, ...]

In [None]:
# weights = [0.64438387, 0.06787352, 0.21374317, 0.92894338, 0.30073056, 0.25681572]
# logits = weights[0] * preds[0] + weights[1] * preds[1] + weights[2] * preds[2] + weights[3] * preds[3] + weights[4] * preds[4] + weights[5] * preds[5]

# train['pseudo'] = logits.argmax(1)
# train.to_csv('pseudo_label_train.csv')
# train

# Tuning Ensemble Weights

In [None]:
from heapq import heappush, heappop, heappushpop

CAPACITY_INCREMENT = 1000


class _Simplex:
	def __init__(self, pointIndices, testCoords, contentFractions, objectiveScore, opportunityCost, contentFraction, difference):
		self.pointIndices = pointIndices
		self.testCoords = testCoords
		self.contentFractions = contentFractions
		self.contentFraction = contentFraction
		self.__objectiveScore = objectiveScore
		self.__opportunityCost = opportunityCost
		self.update(difference)

	def update(self, difference):
		self.acquisitionValue = -(self.__objectiveScore + (self.__opportunityCost * difference))
		self.difference = difference

	def __eq__(self, other):
		return self.acquisitionValue == other.acquisitionValue

	def __lt__(self, other):
		return self.acquisitionValue < other.acquisitionValue

    
class SimpleTuner:
    """Maximise an objective over a simplex-shaped domain by subdivision.

    The corner points are evaluated first. After that, every step pops the
    best-ranked simplex from a heap, evaluates the objective at that
    simplex's test point, and pushes the sub-simplices obtained by swapping
    each vertex in turn for the new point.
    """

    def __init__(self, cornerPoints, objectiveFunction, exploration_preference=0.15):
        """Store the domain and the objective; nothing is evaluated here.

        Args:
            cornerPoints: Corner coordinates of the search domain. Every
                evaluated point is stored in a row of width
                ``len(cornerPoints)`` (coordinates plus one objective slot),
                so each corner needs ``len(cornerPoints) - 1`` coordinates.
            objectiveFunction: Callable that receives a coordinate array and
                returns the score to maximise.
            exploration_preference: Scale of the content-fraction term used
                when ranking simplices.
        """
        self.__cornerPoints = cornerPoints
        self.__numberOfVertices = len(cornerPoints)
        self.queue = []
        self.capacity = self.__numberOfVertices + CAPACITY_INCREMENT
        # Row layout: [coord_0, ..., coord_{n-2}, objective value].
        self.testPoints = np.empty((self.capacity, self.__numberOfVertices))
        self.objective = objectiveFunction
        self.iterations = 0
        self.maxValue = None
        self.minValue = None
        self.bestCoords = []
        self.opportunityCostFactor = exploration_preference #/ self.__numberOfVertices

    def optimize(self, maxSteps=10):
        """Perform ``maxSteps`` objective evaluations.

        The step counter lives on the instance, so repeated calls continue
        the same search.
        """
        for step in tqdm(range(maxSteps)):
            if self.queue:
                targetSimplex = self.__getNextSimplex()
                newPointIndex = self.__testCoords(targetSimplex.testCoords)
                for i in range(self.__numberOfVertices):
                    # Swap vertex i for the new point, build the child
                    # simplex (which copies the index array), then restore.
                    tempIndex = targetSimplex.pointIndices[i]
                    targetSimplex.pointIndices[i] = newPointIndex
                    newContentFraction = targetSimplex.contentFraction * targetSimplex.contentFractions[i]
                    newSimplex = self.__makeSimplex(targetSimplex.pointIndices, newContentFraction)
                    heappush(self.queue, newSimplex)
                    targetSimplex.pointIndices[i] = tempIndex
            else:
                # Still evaluating corners. Build a fresh array with one
                # extra slot for the objective value instead of appending to
                # the caller's list: mutating it made the corners one element
                # longer on every run, so reusing the same vertex list failed.
                corner = self.__cornerPoints[self.iterations]
                testPoint = np.append(np.asarray(corner, dtype=np.float64), 0.0)
                self.__testCoords(testPoint)
                if self.iterations == (self.__numberOfVertices - 1):
                    initialSimplex = self.__makeSimplex(np.arange(self.__numberOfVertices, dtype=np.intp), 1)
                    heappush(self.queue, initialSimplex)
            self.iterations += 1

    def get_best(self):
        """Return ``(best objective value, coordinates of the best point)``."""
        return (self.maxValue, self.bestCoords[0:-1])

    def __getNextSimplex(self):
        """Pop the best simplex after refreshing stale acquisition values."""
        targetSimplex = heappop(self.queue)
        currentDifference = self.maxValue - self.minValue
        while currentDifference > targetSimplex.difference:
            targetSimplex.update(currentDifference)
            # if greater than because heapq is in ascending order
            # (the emptiness check avoids indexing an exhausted heap)
            if self.queue and targetSimplex.acquisitionValue > self.queue[0].acquisitionValue:
                targetSimplex = heappushpop(self.queue, targetSimplex)
        return targetSimplex

    def __testCoords(self, testCoords):
        """Evaluate the objective at ``testCoords`` and record the point.

        The last element of ``testCoords`` is overwritten with the objective
        value. Returns the row index of the stored point.
        """
        objectiveValue = self.objective(testCoords[0:-1])
        if self.maxValue is None or objectiveValue > self.maxValue:
            self.maxValue = objectiveValue
            self.bestCoords = testCoords
            if self.minValue is None:
                self.minValue = objectiveValue
        elif objectiveValue < self.minValue:
            self.minValue = objectiveValue
        testCoords[-1] = objectiveValue
        if self.capacity == self.iterations:
            # Grow by copying into a new buffer. In-place ndarray.resize
            # raises ValueError when anything else still references the
            # array.
            self.capacity += CAPACITY_INCREMENT
            grown = np.empty((self.capacity, self.__numberOfVertices))
            grown[:self.iterations] = self.testPoints[:self.iterations]
            self.testPoints = grown
        newPointIndex = self.iterations
        self.testPoints[newPointIndex] = testCoords
        return newPointIndex

    def __makeSimplex(self, pointIndices, contentFraction):
        """Build the heap entry for the simplex spanned by ``pointIndices``."""
        vertexMatrix = self.testPoints[pointIndices]
        coordMatrix = vertexMatrix[:, 0:-1]
        barycenterLocation = np.sum(vertexMatrix, axis=0) / self.__numberOfVertices

        # Barycentric weights of the test point: each vertex's share of the
        # summed vertex-to-barycenter distances.
        differences = coordMatrix - barycenterLocation[0:-1]
        distances = np.sqrt(np.sum(differences * differences, axis=1))
        totalDistance = np.sum(distances)
        barycentricTestCoords = distances / totalDistance

        euclideanTestCoords = vertexMatrix.T.dot(barycentricTestCoords)

        vertexValues = vertexMatrix[:, -1]

        # Inverse-distance-weighted interpolation of the vertex scores at
        # the test point.
        testpointDifferences = coordMatrix - euclideanTestCoords[0:-1]
        testPointDistances = np.sqrt(np.sum(testpointDifferences * testpointDifferences, axis=1))

        inverseDistances = 1 / testPointDistances
        inverseSum = np.sum(inverseDistances)
        interpolatedValue = inverseDistances.dot(vertexValues) / inverseSum

        currentDifference = self.maxValue - self.minValue
        opportunityCost = self.opportunityCostFactor * math.log(contentFraction, self.__numberOfVertices)

        return _Simplex(pointIndices.copy(), euclideanTestCoords, barycentricTestCoords, interpolatedValue, opportunityCost, contentFraction, currentDifference)

In [None]:
def f(weights):
    """Return the predicted class per sample for a weighted-average ensemble.

    ``weights`` is rescaled to sum to one and used to average the global
    ``preds`` along axis 0; the arg-max of each resulting row is collected
    into a list.
    """
    total = np.sum(weights)
    blended = np.average(preds, axis=0, weights=weights / total)
    return list(map(np.argmax, blended))


def acc_function(weights):
    """Return the share of samples where ``f(weights)`` equals the global ``labels``."""
    predicted = f(weights)
    hits = np.sum([guess == truth for guess, truth in zip(predicted, labels)])
    return hits / len(predicted)

In [None]:
# SimpleTuner searches a simplex, so tuning n weights needs n + 1 corner
# points: the origin followed by one unit vector per model. Deriving them
# from len(preds) keeps the weight vector the same length as the list that
# f() averages over; a shorter hard-coded list makes np.average reject the
# weights.
# NOTE(review): the origin corner sums to zero, so f() divides by zero there
# and scores NaN-weighted predictions - confirm this is acceptable.
n_models = len(preds)
optimization_domain_vertices = [[0] * n_models] + [
    [int(col == hot) for col in range(n_models)] for hot in reversed(range(n_models))
]

number_of_iterations = 5000
exploration = 0.01 # optional, default 0.15

In [None]:
tuner = SimpleTuner(optimization_domain_vertices, acc_function, exploration_preference=exploration)
tuner.optimize(number_of_iterations)

best_objective_value, best_weights = tuner.get_best()

In [None]:
print(f'Best objective value = {best_objective_value:.6f}')
print(f'Optimum weights = {best_weights}')
print(f'Ensembled Accuracy (same as best objective value) = {acc_function(best_weights):.6f}')

# Tuning with Optuna

In [None]:
# Lower and upper bound shared by every sampled ensemble weight.
r_min = .0
r_max = 1.

# You can increase iteration number.
iteration = 5000

# Turn off Optuna's default log handler.
optuna.logging.disable_default_handler()

In [None]:
def objective(trial):
    """Optuna objective: accuracy of the seven-model weighted ensemble.

    Samples one weight per model (parameters ``a`` .. ``g``) from
    ``[r_min, r_max]``, blends the global ``preds`` with ``calc_p`` and
    scores the result against the global ``labels``.

    Args:
        trial: The Optuna trial used to sample the weights.

    Returns:
        The value of ``accuracy_score`` for the sampled weights.
    """
    # suggest_float samples the same uniform range as the deprecated
    # suggest_uniform. Keeping the values in a dict also avoids a local
    # named `f`, which shadowed the module-level function f().
    sampled = {name: trial.suggest_float(name, r_min, r_max) for name in 'abcdefg'}

    score = accuracy_score(calc_p(preds, *sampled.values()), labels)

    summary = ', '.join(f'{name}:{value:.6f}' for name, value in sampled.items())
    print(f'{summary}, score:{score:.6f}')
    return score

In [None]:
%%time
SEED: int = 1337

study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(objective, n_trials=iteration, n_jobs=4)

In [None]:
print(f'cv score : {study.best_value:.6f}')

In [None]:
study.best_params

In [None]:
plt.plot([trial.params['a'] for trial in study.trials], label='a')
plt.plot([trial.params['b'] for trial in study.trials], label='b')
plt.plot([trial.params['c'] for trial in study.trials], label='c')
plt.plot([trial.params['d'] for trial in study.trials], label='d')
plt.plot([trial.params['e'] for trial in study.trials], label='e')
plt.plot([trial.params['f'] for trial in study.trials], label='f')
plt.plot([trial.params['g'] for trial in study.trials], label='g')
plt.legend()
plt.grid()
plt.show()

In [None]:
plt.plot([trial.value for trial in study.trials])
plt.grid()
plt.show()

In [None]:
from optuna.visualization import plot_optimization_history

# Optuna's own view of the objective value across trials.
plot_optimization_history(study=study)

In [None]:
# from optuna.visualization import plot_param_importances

# plot_param_importances(study)

In [None]:
# from optuna.visualization import plot_contour

# plot_contour(study)

# Metrics

In [None]:
params = study.best_params

# Best weights in model order (parameters 'a' .. 'g'), rounded to 8 decimals.
weights = [round(params[param_name], 8) for param_name in 'abcdefg']
weights

In [None]:
# weights = best_weights

# weights = [0.28008428, 0.08930099, 0.19287446, 0.13415098, 0.2855688]
# logits = weights[0] * preds[0] + weights[1] * preds[1] + weights[2] * preds[2] + weights[3] * preds[3] + weights[4] * preds[4]

# weights = [0.34618164, 0.19092364, 0.38515934, 0.91232422, 0.00026995, 0.70023081]
# weights = [0.64438387, 0.06787352, 0.21374317, 0.92894338, 0.30073056, 0.25681572]
# logits = weights[0] * preds[0] + weights[1] * preds[1] + weights[2] * preds[2] + weights[3] * preds[3] + weights[4] * preds[4] + weights[5] * preds[5]

# Weighted sum of the seven models' logits, accumulated in model order.
logits = sum(
    (weights[model_idx] * preds[model_idx] for model_idx in range(1, 7)),
    weights[0] * preds[0],
)

# weights = [0.32264375, 0.19517635, 0.10858799, 0.33353971]
# weights = [0.22155271, 0.1881944, 0.38943474, 0.1644162]
# logits = weights[0] * preds[0] + weights[1] * preds[1] + weights[2] * preds[2] + weights[3] * preds[3]

# weights = [0.33547759, 0.30181755, 0.28914882]
# logits = weights[0] * preds[0] + weights[1] * preds[1] + weights[2] * preds[2]

# weights = [0.15093364, 0.15979473, 0.38676311, 0.26914147]
# logits = weights[0] * preds[0] + weights[1] * preds[1] + weights[2] * preds[2] + weights[3] * preds[3]

In [None]:
# logits = np.mean(preds, axis=0)

In [None]:
# Share of samples whose arg-max class equals the label, in percent.
n_correct = np.sum(labels == logits.argmax(1))
valid_acc = n_correct / len(logits) * 100.
print(f'[*] valid top-1 acc : {valid_acc:.4f}')

In [None]:
cm = confusion_matrix(labels, logits.argmax(1))
cm

In [None]:
for i, val in enumerate(cm):
    print(f'[+] Class {i} | top-1 acc : {val[i] / sum(val) * 100.:.4f}')

In [None]:
for i, val in enumerate(cm[:-1]):
    print(f'[+] Class {i} | possibility to mistake for healthy : {val[4] / val[i] * 100.:.4f}')

In [None]:
print(classification_report(labels, logits.argmax(1), digits=6))

# Public vs Private

In [None]:
n_iters: int = 100000
ratio: float = 0.31

n_samples: int = labels.shape[0]
n_pub_samples: int = int(ratio * n_samples)

In [None]:
p = (logits.argmax(1) == labels)
p

In [None]:
from sklearn.model_selection import train_test_split

pub, priv = [], []
indexes = np.arange(n_samples)
for i in tqdm(range(n_iters)):
    np.random.shuffle(indexes)
    pub_idx = indexes[:n_pub_samples]
    priv_idx = indexes[n_pub_samples:]

    pub_score = np.sum(p[pub_idx]) / pub_idx.shape[0]
    priv_score = np.sum(p[priv_idx]) / priv_idx.shape[0]

    pub.append(pub_score)
    priv.append(priv_score)

In [None]:
np.mean(priv), np.median(priv)

In [None]:
# sns.distplot is deprecated; a KDE curve plus a rug plot draws the same
# picture (hist=False, rug=True) with supported functions.
for scores, color, group in ((pub, 'blue', 'public'), (priv, 'red', 'private')):
    sns.kdeplot(scores, color=color, label=group)
    sns.rugplot(scores, color=color)

plt.title('public vs private')
plt.xlabel('score')
plt.ylabel('density')  # the y-axis of a KDE is a density, not a sample count
plt.legend(prop={'size': 12}, title='group')
plt.show()

# EOF