# hard voting

In [1]:
import numpy as np
from sklearn import metrics
from typing import Dict

In [2]:
# Seed NumPy's legacy global RNG; hard_voting() draws from it (np.random.choice) to break vote ties.
np.random.seed(42)

In [3]:
# Labels for the test split, loaded from disk; dfs() passes them to evaluate() as the reference labels.
# Presumably a 1-D array of integer class ids, one per test sample — TODO confirm.
labels = np.load("outputs/animals_10-test_labels.npy")

In [4]:
# Saved test-prediction files, keyed by model name.
# NOTE(review): the trailing number on each entry is presumably that model's own test accuracy (%) — confirm.
preds_paths = {  # already sorted by performance, descending
    "vit": "outputs/vit_aug/vit_b_16-animals_10_aug-250103-1502-test_preds.npy",  # 98.61
    "densenet": "outputs/densenet_aug/densenet201-animals_10_aug-250103-1500-test_preds.npy",  # 98.44
    "resnet": "outputs/resnet_aug/resnet152-animals_10_aug-250103-1619-test_preds.npy",  # 98.28
    "vgg": "outputs/vgg_aug/vgg19_bn-animals_10_aug-250103-1622-test_preds.npy",  # 97.83
    "googlenet": "outputs/googlenet_aug/googlenet-animals_10_aug-250103-1617-test_preds.npy",  # 95.83
    "alexnet": "outputs/alexnet_aug/alexnet-animals_10_aug-250103-1501-test_preds.npy",  # 91.67
}

In [5]:
# Load every model's saved prediction array, keeping the key order of preds_paths.
preds_dict = {name: np.load(path) for name, path in preds_paths.items()}

In [6]:
def evaluate(labels: np.ndarray, preds: np.ndarray) -> Dict[str, float]:
    """Score predictions against reference labels.

    Args:
        labels: Reference class ids, one per sample.
        preds: Predicted class ids, aligned with ``labels``.

    Returns:
        ``{"acc": ..., "f1": ...}`` with accuracy and macro-averaged F1, each
        rounded to 4 decimals and returned as a plain Python ``float``.
    """
    acc = round(float(metrics.accuracy_score(labels, preds)), 4)
    # No explicit ``labels=`` argument: the former hard-coded range(6) limited the
    # macro average to class ids 0-5, so any higher class id was silently left out
    # of the F1. scikit-learn's default averages over every class id that occurs
    # in ``labels`` or ``preds``.
    macro_f1 = round(float(metrics.f1_score(labels, preds, average="macro")), 4)
    return {"acc": acc, "f1": macro_f1}

In [7]:
def hard_voting(preds_all: np.ndarray) -> np.ndarray:
    """Combine several classifiers' predictions by majority vote.

    Args:
        preds_all: Class ids indexed as ``[classifier, sample]``. Values must be
            non-negative integers, as required by ``np.bincount``.

    Returns:
        One winning class id per sample. When several classes share the highest
        vote count, one of them is picked with ``np.random.choice`` (NumPy's
        global RNG).
    """

    def _winner(ballot: np.ndarray):
        # Votes received by each class id for this sample.
        tally = np.bincount(ballot)
        # Every class id whose vote count equals the maximum.
        leaders = np.flatnonzero(tally == tally.max())
        # np.random.choice is invoked for every sample, tied or not.
        return np.random.choice(leaders)

    n_samples = preds_all.shape[1]
    return np.array([_winner(preds_all[:, col]) for col in range(n_samples)])

In [8]:
def dfs(n: int, begin: int = 0, k: int = 0):
    """Enumerate every size-``n`` combination of models and print its hard-voting score.

    Operates on module-level state: candidates come from ``keys`` / ``values``
    (``num`` entries), the partial selection lives in ``select_names`` /
    ``select_values``, and ``vis`` flags the indices currently selected. Each
    level undoes its own changes after the recursive call returns.

    Args:
        n: Number of models per combination.
        begin: First candidate index this level may pick.
        k: Number of models already selected.
    """
    if k == n:
        # Selection complete: stack the chosen prediction arrays, vote, and report.
        votes = hard_voting(np.array(select_values))
        print(select_names, evaluate(labels, votes))
        return

    for idx in range(begin, num):
        if not vis[idx]:
            # Choose candidate idx.
            vis[idx] = True
            select_names.append(keys[idx])
            select_values.append(values[idx])
            # Only later indices are explored, so each combination is visited once.
            dfs(n, idx + 1, k + 1)
            # Undo the choice before trying the next candidate.
            select_values.pop()
            select_names.pop()
            vis[idx] = False

In [9]:
def grid_search(maxn: int, minn: int = 3) -> None:
    """Run ``dfs`` for every ensemble size from ``minn`` to ``maxn`` inclusive.

    Args:
        maxn: Largest ensemble size to evaluate.
        minn: Smallest ensemble size to evaluate. Defaults to 3, the lower
            bound that was previously hard-coded.

    Raises:
        ValueError: If ``minn`` is smaller than 1. ``dfs(0)`` would hand an
            empty selection to ``hard_voting``, which indexes ``shape[1]``.
    """
    if minn < 1:
        raise ValueError(f"minn must be at least 1, got {minn}")
    for n in range(minn, maxn + 1):
        dfs(n)

In [10]:
# Build the module-level state that dfs() reads, then score every ensemble of 3..num models.
keys = list(preds_dict)
values = [preds_dict[name] for name in keys]
num = len(keys)
vis = [False for _ in range(num)]
select_names, select_values = [], []
grid_search(maxn=num)

['vit', 'densenet', 'resnet'] {'acc': 0.9867, 'f1': 0.9867}
['vit', 'densenet', 'vgg'] {'acc': 0.985, 'f1': 0.985}
['vit', 'densenet', 'googlenet'] {'acc': 0.985, 'f1': 0.985}
['vit', 'densenet', 'alexnet'] {'acc': 0.9822, 'f1': 0.9822}
['vit', 'resnet', 'vgg'] {'acc': 0.9856, 'f1': 0.9856}
['vit', 'resnet', 'googlenet'] {'acc': 0.985, 'f1': 0.985}
['vit', 'resnet', 'alexnet'] {'acc': 0.9833, 'f1': 0.9833}
['vit', 'vgg', 'googlenet'] {'acc': 0.9817, 'f1': 0.9817}
['vit', 'vgg', 'alexnet'] {'acc': 0.9806, 'f1': 0.9806}
['vit', 'googlenet', 'alexnet'] {'acc': 0.975, 'f1': 0.975}
['densenet', 'resnet', 'vgg'] {'acc': 0.9844, 'f1': 0.9844}
['densenet', 'resnet', 'googlenet'] {'acc': 0.9828, 'f1': 0.9828}
['densenet', 'resnet', 'alexnet'] {'acc': 0.9817, 'f1': 0.9817}
['densenet', 'vgg', 'googlenet'] {'acc': 0.9817, 'f1': 0.9817}
['densenet', 'vgg', 'alexnet'] {'acc': 0.9817, 'f1': 0.9817}
['densenet', 'googlenet', 'alexnet'] {'acc': 0.9733, 'f1': 0.9733}
['resnet', 'vgg', 'googlenet'] {'ac

实验结果表明，用多个表现都尽可能好的分类器做 hard voting，往往会带来更好的表现  
表现明显更差的分类器往往会拉低最好分类器的表现；但并非总是如此——如果它能带来新的、有价值的不同视角，也可能提高表现  
实验中组合的最好结果是：  
vit + densenet + resnet，98.67  
vit + densenet + resnet + vgg，98.67  

基于上述结果，三到四个分类器的组合已经足够。另外，考虑到我们的 fusion 方法已经用到 densenet 和 vit，下面尝试将 fusion 方法与其他方法进行 voting 组合：  

In [11]:
# Saved test-prediction files for the second round of voting, keyed by model name.
# NOTE(review): the trailing number on each entry is presumably that model's own test accuracy (%) — confirm.
preds_paths = {  # already sorted by performance, descending
    "glip": "outputs/glip/glip-animals_10_aug-250104-1413-test_preds.npy",  # 99.06
    "point_fusion": "outputs/point_fusion/point_fusion-animals_10_aug-250103-1954-test_preds.npy",  # 98.67
    "vit": "outputs/vit_aug/vit_b_16-animals_10_aug-250103-1502-test_preds.npy",  # 98.61
    "decision_fusion": "outputs/decision_fusion/decision_fusion-animals_10_aug-250105-1338-test_preds.npy",  # 98.61
    "wfcg": "outputs/wfcg/wfcg-animals_10_aug-250105-1523-test_preds.npy",  # 98.50
    "densenet": "outputs/densenet_aug/densenet201-animals_10_aug-250103-1500-test_preds.npy",  # 98.44
    "resnet": "outputs/resnet_aug/resnet152-animals_10_aug-250103-1619-test_preds.npy",  # 98.28
    "mvc": "outputs/mvc/mvc-animals_10_aug-250105-2233-test_preds.npy",  # 98.17
    "vgg": "outputs/vgg_aug/vgg19_bn-animals_10_aug-250103-1622-test_preds.npy",  # 97.83
}

In [12]:
# Load the predictions for the current preds_paths and rebuild the module-level state dfs() reads.
preds_dict = {name: np.load(path) for name, path in preds_paths.items()}
keys = list(preds_dict)
values = [preds_dict[name] for name in keys]
num = len(keys)
vis = [False for _ in range(num)]
select_names, select_values = [], []
grid_search(maxn=num)

['glip', 'point_fusion', 'vit'] {'acc': 0.9883, 'f1': 0.9883}
['glip', 'point_fusion', 'decision_fusion'] {'acc': 0.9883, 'f1': 0.9883}
['glip', 'point_fusion', 'wfcg'] {'acc': 0.9883, 'f1': 0.9883}
['glip', 'point_fusion', 'densenet'] {'acc': 0.9906, 'f1': 0.9906}
['glip', 'point_fusion', 'resnet'] {'acc': 0.9889, 'f1': 0.9889}
['glip', 'point_fusion', 'mvc'] {'acc': 0.9872, 'f1': 0.9872}
['glip', 'point_fusion', 'vgg'] {'acc': 0.9894, 'f1': 0.9895}
['glip', 'vit', 'decision_fusion'] {'acc': 0.9878, 'f1': 0.9878}
['glip', 'vit', 'wfcg'] {'acc': 0.9883, 'f1': 0.9883}
['glip', 'vit', 'densenet'] {'acc': 0.99, 'f1': 0.99}
['glip', 'vit', 'resnet'] {'acc': 0.9883, 'f1': 0.9883}
['glip', 'vit', 'mvc'] {'acc': 0.9861, 'f1': 0.9861}
['glip', 'vit', 'vgg'] {'acc': 0.9883, 'f1': 0.9883}
['glip', 'decision_fusion', 'wfcg'] {'acc': 0.9878, 'f1': 0.9878}
['glip', 'decision_fusion', 'densenet'] {'acc': 0.99, 'f1': 0.99}
['glip', 'decision_fusion', 'resnet'] {'acc': 0.9883, 'f1': 0.9883}
['glip', '

实验结果表明，某些组合中确实可能因为 voting 提高了表现。  
但是始终没有超过 glip 的表现，与 glip 的组合往往也只是拉低表现，最多依然没有超过 99.06  
这表明在我们的分类器范围内，约有 0.94% 的样本极难分类，几乎所有分类器都会分错。我们会在混淆矩阵中进一步分析它  
想要进一步提高表现，只能尝试扩大数据增强规模 或 选用更加强大的 backbone  