In [None]:
# Install the Infomap Python bindings into the running kernel's environment.
# (The recorded output of this cell shows infomap 2.6.0 being built and installed.)
%pip install infomap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting infomap
  Downloading infomap-2.6.0.tar.gz (261 kB)
[K     |████████████████████████████████| 261 kB 4.3 MB/s 
[?25hBuilding wheels for collected packages: infomap
  Building wheel for infomap (setup.py) ... [?25l[?25hdone
  Created wheel for infomap: filename=infomap-2.6.0-cp37-cp37m-linux_x86_64.whl size=5607166 sha256=da50380618f89137cb09adb12091ce77ae8c6040e0120dcf8ecf2337b66e88e0
  Stored in directory: /root/.cache/pip/wheels/37/53/74/637a39ce6617c0f779fa1f955b30726fd5d32a94bd4960f73d
Successfully built infomap
Installing collected packages: infomap
Successfully installed infomap-2.6.0


In [None]:
# Install the OpenMP development package from apt, then install/upgrade the
# `faiss` and `faiss-gpu` packages with pip.
# NOTE(review): presumably libomp is required by the faiss build used here — confirm.
!apt install libomp-dev
!python -m pip install --upgrade faiss faiss-gpu

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libomp5
Suggested packages:
  libomp-doc
The following NEW packages will be installed:
  libomp-dev libomp5
0 upgraded, 2 newly installed, 0 to remove and 20 not upgraded.
Need to get 239 kB of archives.
After this operation, 804 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp5 amd64 5.0.1-1 [234 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp-dev amd64 5.0.1-1 [5,088 B]
Fetched 239 kB in 2s (159 kB/s)
Selecting previously unselected package libomp5:amd64.
(Reading database ... 155676 files and directories currently installed.)
Preparing to unpack .../libomp5_5.0.1-1_amd64.deb ...
Unpacking libomp5:amd64 (5.0.1-1) ...
Se

In [None]:
# Install the faiss wheels used by the kNN cell below.
# NOTE(review): faiss-cpu and faiss-gpu both appear to provide the same `faiss`
# module; the kNN code below calls faiss.StandardGpuResources(), so the GPU build
# is the one it needs — confirm whether installing both is intentional.
!pip install faiss-cpu
!pip install faiss-gpu
# Alternative install command kept for reference:
#python3 -m pip install --upgrade faiss faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
[K     |████████████████████████████████| 8.6 MB 3.9 MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.2


In [None]:
#faiss_knn
# -*- coding: utf-8 -*-

import numpy as np
import faiss
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


def l2norm(vec):
    """Scale every row of ``vec`` to unit L2 norm, in place.

    Args:
        vec: 2-D floating-point array; it is modified in place.

    Returns:
        The same array object, with each non-zero row divided by its L2 norm.
        All-zero rows are left as zeros instead of being turned into NaN by a
        0/0 division.
    """
    norms = np.linalg.norm(vec, axis=1, keepdims=True)
    # A zero row has norm 0; dividing by 1 instead keeps it at zero rather
    # than producing NaN values that would corrupt similarity search.
    norms[norms == 0] = 1
    vec /= norms
    return vec


def load_feat(feat_path, feat_dim=256):
    """Load a feature matrix as float32.

    Args:
        feat_path: Path to the feature file. A path ending in ``.npy`` is read
            with ``np.load``; anything else is read as a raw float32 dump.
        feat_dim: Number of columns used to reshape a raw float32 dump.
            Ignored for ``.npy`` files, which keep their stored shape.

    Returns:
        A float32 ``numpy.ndarray``; raw dumps are reshaped to ``(-1, feat_dim)``.
    """
    # Test the file suffix, not a substring: a directory such as
    # "run.npy_cache/feat.bin" must still be read as a raw binary dump.
    if str(feat_path).endswith('.npy'):
        return np.load(feat_path).astype(np.float32)
    feat = np.fromfile(feat_path, dtype=np.float32)
    return feat.reshape(-1, feat_dim)


def faiss_knn(feat_path, knn_path, feat_dim, k=256):
    """Build an inner-product kNN graph with faiss and cache it to disk.

    Features are loaded with ``load_feat``, L2-normalised with ``l2norm`` and
    indexed in a ``faiss.IndexFlatIP``. Each sample is queried for ``k + 1``
    neighbours so that the sample itself can be removed, leaving ``k``.

    Args:
        feat_path: Path of the feature file, passed to ``load_feat``.
        knn_path: Destination of the compressed ``.npz`` cache. The array is
            stored under the key ``data`` with shape ``(N, 2, k)``:
            ``[:, 0, :]`` holds neighbour indices, ``[:, 1, :]`` similarities.
        feat_dim: Feature dimensionality, used for loading raw dumps and for
            creating the index.
        k: Number of neighbours kept per sample.

    Returns:
        Tuple ``(nbrs, sims)``, each of shape ``(N, k)``.
    """
    feat = load_feat(feat_path, feat_dim)
    print('features shape:', feat.shape)
    feat = l2norm(feat)

    index = faiss.IndexFlatIP(feat_dim)

    # Use a single GPU when this faiss build has GPU support and a device is
    # visible; otherwise keep the CPU index instead of crashing.
    if hasattr(faiss, 'StandardGpuResources') and faiss.get_num_gpus() > 0:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)

    # use all gpus
    # index = faiss.index_cpu_to_all_gpus(index)

    index.add(feat)
    batch_size = 200000

    # Collect per-batch results and concatenate once; stacking inside the loop
    # would re-copy everything accumulated so far on every iteration.
    sim_chunks, nbr_chunks = [], []
    for start in tqdm(range(0, feat.shape[0], batch_size)):
        sim, nbr = index.search(feat[start:start + batch_size], k + 1)
        sim_chunks.append(sim)
        nbr_chunks.append(nbr)
    if sim_chunks:
        sims = np.concatenate(sim_chunks, axis=0)
        nbrs = np.concatenate(nbr_chunks, axis=0)
    else:
        sims = np.empty((0, k + 1), dtype=np.float32)
        nbrs = np.empty((0, k + 1), dtype=np.int64)

    # Remove each sample from its own neighbour list. Column 0 is dropped
    # afterwards, so rows whose first hit is not the sample itself must first
    # be shifted right by one up to (and over) the self entry.
    n_rows, width = nbrs.shape
    misplaced = np.flatnonzero(nbrs[:, 0] != np.arange(n_rows))
    for i in misplaced:
        hits = np.flatnonzero(nbrs[i, 1:] == i)
        # Absolute column of the self entry. If the sample is not among its
        # own results, sacrifice the last (least similar) column instead.
        pos = hits[0] + 1 if hits.size else width - 1
        nbrs[i, 1:pos + 1] = nbrs[i, :pos].copy()
        sims[i, 1:pos + 1] = sims[i, :pos].copy()
    sims = sims[:, 1:]
    nbrs = nbrs[:, 1:]

    # Same on-disk layout as before: one (N, 2, k) array mixing indices and
    # similarities (NumPy promotes the uint32/float32 pair to float64).
    data = np.stack([nbrs.astype(np.uint32), sims.astype(np.float32)], axis=1)
    np.savez_compressed(knn_path, data=data)
    return nbrs, sims

In [None]:
#config

# Width of the sliding window used by the outlier detection step.
window_size = 20
# Number of nearest neighbours kept per sample.
topK = 256

# MS1M feat_dim=256, CASIA and VGG feat_dim=512
feat_dim = 256
dataset = 'MS1M'
test_name = 'part1_test'

# Cache file for the kNN graph and the raw feature dump it is built from.
knn_path = './data/{}/knns/{}_faiss_top{}.npz'.format(dataset, test_name, topK)
feat_path = './data/{}/features/{}.bin'.format(dataset, test_name)
#label_path = './data/{}/labels/{}.meta'.format(dataset, test_name)
# Output file for predicted labels; follows test_name like the paths above so
# that switching the test split does not overwrite another split's result.
result_path = './result/{}/{}_top{}_winds{}.npy'.format(dataset, test_name, topK, window_size)

In [None]:
# -*- coding: utf-8 -*-

import os
from time import time
import numpy as np
from tqdm import tqdm
import infomap
#from utlis.faiss_knn import faiss_knn
#from configs import config
import warnings
warnings.filterwarnings('ignore')


def outlier_detect(delta_p, window_size):
    """Locate, per row, the column where a sliding window deviates most.

    For every start column ``j`` (scanned right to left) the mean of the
    window ``delta_p[:, j:j+window_size]`` is compared with the mean of the
    whole tail ``delta_p[:, j:]``, scaled by the tail's standard deviation.
    The score is stored at the window centre ``j + (window_size + 1) // 2``.

    Args:
        delta_p: 2-D array with one row per sample.
        window_size: Number of columns in the test window.

    Returns:
        1-D integer array holding, for each row, the column index with the
        highest score (0 when no window fits in the row).
    """
    omega = window_size
    z = np.zeros_like(delta_p, dtype=np.float32)
    for j in tqdm(range(delta_p.shape[1]-omega, -1, -1)):
        mu_test = np.mean(delta_p[:, j:j+omega], axis=1)
        mu_ref = np.mean(delta_p[:, j:], axis=1)
        sigma_ref = np.std(delta_p[:, j:], axis=1)
        q = j + (omega+1)//2
        # A zero sigma means the tail is constant, so the window (a subset of
        # the tail) cannot deviate from it. Score it 0 instead of letting
        # 0/0 = NaN win the argmax below.
        has_spread = sigma_ref > 0
        safe_sigma = np.where(has_spread, sigma_ref, 1.0)
        z[:, q] = np.where(has_spread, np.abs(mu_test - mu_ref) / safe_sigma, 0.0)
    q_star = np.argmax(z, axis=1)
    return q_star
    


class FaceMap():
    """Cluster samples by running Infomap on a kNN similarity graph.

    Configuration is read from the notebook-level names ``window_size``,
    ``topK``, ``knn_path``, ``feat_path``, ``feat_dim`` and ``result_path``.
    Typical use: construct, call ``adjust_transition_prob()`` (or
    ``transition_prob_by_threshold()``) to build the links, then
    ``face_cluster()`` to run Infomap and save the predicted labels.
    """

    def __init__(self):
        """Read the configuration, create output directories and load the kNN graph."""
        self.omega = window_size
        self.topK = topK
        self.knn_path = knn_path
        #self.label_path = label_path
        self.feat_path = feat_path
        self.feat_dim = feat_dim
        self.result_path = result_path
        # `or '.'` keeps makedirs from failing on a bare file name with no
        # directory component.
        os.makedirs(os.path.dirname(self.knn_path) or '.', exist_ok=True)
        os.makedirs(os.path.dirname(self.result_path) or '.', exist_ok=True)
        self._load_knn()
        # Start of the timer reported at the end of face_cluster().
        self.t = time()

    def _load_knn(self):
        """Load the cached kNN graph, or build it with ``faiss_knn`` if missing.

        Sets ``self.nbrs`` (int32 neighbour indices) and ``self.sims``
        (float32 similarities), each truncated to ``self.topK`` columns when
        read from the cache.
        """
        t0 = time()
        if os.path.exists(self.knn_path):
            # The context manager closes the underlying zip file once the
            # array has been read into memory.
            with np.load(self.knn_path) as archive:
                knn = np.asarray(archive['data'])
            self.nbrs = knn[:, 0, :self.topK].astype(np.int32)
            self.sims = knn[:, 1, :self.topK].astype(np.float32)
        else:
            nbrs, sims = faiss_knn(self.feat_path, self.knn_path, self.feat_dim, self.topK)
            # Use the same dtypes as the cached branch so later code sees
            # identical arrays whichever path was taken.
            self.nbrs = np.asarray(nbrs).astype(np.int32)
            self.sims = np.asarray(sims).astype(np.float32)
        print('time cost of load knn: {:.2f}s'.format(time() - t0))

    def transition_prob_by_threshold(self, th=0.62):
        """Build graph links from neighbours whose similarity is at least ``th``.

        For each sample the neighbour list is scanned in order and scanning
        stops at the first similarity below ``th``. Samples with no accepted
        neighbour are recorded in ``self.single``.

        Args:
            th: Minimum similarity for a neighbour to become a link.

        Sets ``self.links``, ``self.weights`` and ``self.single``.
        """
        single, links, weights = [], [], []
        for i in tqdm(range(self.nbrs.shape[0])):
            c = 0
            for j, nbr in enumerate(self.nbrs[i]):
                if self.sims[i, j] >= th:
                    c += 1
                    links.append((i, nbr))
                    weights.append(self.sims[i, j])
                else:
                    break
            if c == 0:
                single.append(i)
        self.links = np.array(links, dtype=np.uint32)
        self.weights = np.array(weights, dtype=np.float32)
        self.single = np.array(single, dtype=np.uint32)

    def adjust_transition_prob(self):
        """Build graph links using a per-sample cut-off found by ``outlier_detect``.

        Similarities are normalised per row into ``p``; the differences
        between consecutive columns are passed to ``outlier_detect`` and each
        sample keeps the neighbours up to and including the returned column.
        Self links are skipped. Samples that end up without any link are
        recorded in ``self.single``.

        Sets ``self.links``, ``self.weights`` and ``self.single``.
        """
        p = self.sims / np.sum(self.sims, axis=1, keepdims=True)
        t0 = time()
        delta_p = p[:, :-1] - p[:, 1:]
        q = outlier_detect(delta_p, self.omega)
        print('time cost of outlier_detect: {:.2f}s'.format(time() - t0))

        single, links, weights = [], [], []
        for i, k in enumerate(q):
            count = 0
            for idx, j in enumerate(self.nbrs[i, :k+1]):
                if i == j:
                    # A sample may still appear in its own neighbour list.
                    continue
                count += 1
                links.append((i, j))
                weights.append(p[i, idx])
            if count == 0:
                single.append(i)
        self.links = np.array(links, dtype=np.uint32)
        self.weights = np.array(weights, dtype=np.float32)
        self.single = np.array(single, dtype=np.uint32)

    def face_cluster(self):
        """Run Infomap on the links and save one predicted label per sample.

        Requires ``self.links``, ``self.weights`` and ``self.single`` to have
        been set by one of the transition-probability methods. ``self.links``
        and ``self.weights`` are deleted once handed to Infomap. The mapping
        sample index -> label is kept in ``self.idx2lb`` and written to
        ``self.result_path`` as an array with one entry per row of
        ``self.nbrs`` (``-1`` for any sample left without a label).
        """
        info = infomap.Infomap("--two-level", flow_model='undirected')
        for (i, j), sim in tqdm(zip(self.links, self.weights), total=len(self.weights)):
            _ = info.addLink(i, j, sim)
        del self.links
        del self.weights

        info.run(seed=100)

        lb2idx = {}
        self.idx2lb = {}
        for node in info.iterTree():
            module = node.moduleIndex()
            lb2idx.setdefault(module, []).append(node.physicalId)

        # NOTE(review): the slicing below assumes iterTree() yields non-leaf
        # entries first — two extra entries for module 0 and one for every
        # other module — confirm against the installed infomap version.
        for k, v in lb2idx.items():
            members = v[2:] if k == 0 else v[1:]
            lb2idx[k] = members
            for u in members:
                self.idx2lb[u] = k

        # Samples without any link each get a fresh label of their own.
        lb_len = len(lb2idx)
        for k in self.single:
            if k in self.idx2lb:
                continue
            self.idx2lb[k] = lb_len
            lb2idx[lb_len] = [k]
            lb_len += 1
        print('time cost of FaceMap: {:.2f}s'.format(time() - self.t))

        # np.save needs the array as well as the path. Build the label vector
        # (float array defaulting to -1, as in the previously commented-out
        # code) and write it out.
        pred_labels = np.full(self.nbrs.shape[0], -1.0)
        for idx, lb in self.idx2lb.items():
            pred_labels[idx] = lb
        np.save(self.result_path, pred_labels)

In [None]:
# Build the clusterer; its constructor loads the cached kNN graph or computes it.
face_map = FaceMap()
# Derive the graph links and weights from the per-sample neighbour similarities.
face_map.adjust_transition_prob()
# Run Infomap on those links to assign a cluster label to each sample.
face_map.face_cluster()

To prepare the data directory, follow the instructions at:
https://github.com/yl-1993/learn-to-cluster/blob/master/DATASET.md
