# In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
import collections
import numpy as np
import copy
# Load the breast-cancer dataset bundled with scikit-learn; its `.data` and
# `.target` attributes are what make_datasets() reads.
cancer = load_breast_cancer()
# NOTE: the commented-out lines below are an earlier, hard-coded two-class
# version of the split (100 training rows per class, 30 rows swapped between
# the two classes as label noise). It appears to have been superseded by
# make_datasets(), which handles any number of classes.
# X, y = cancer.data, cancer.target
# print(cancer.target.dtype)
# print(np.sum(y==0))
# print(np.sum(y==1))

# class_0 = X[y==0]
# class_1 = X[y==1]

# class_0 = np.concatenate((class_0, np.zeros((class_0.shape[0], 1))), axis=1)
# class_1 = np.concatenate((class_1, np.ones((class_1.shape[0], 1))), axis=1)

# np.random.shuffle(class_0)
# np.random.shuffle(class_1)

# train_0 = class_0[0:100]
# train_1 = class_1[0:100]
# noise_data_0 = class_0[100:]
# noise_data_1 = class_1[100:]
# noise_size = 30
# noise_data_0, noise_data_1 = np.concatenate((noise_data_1[:noise_size], noise_data_0[noise_size:]),axis=0), np.concatenate((noise_data_0[:noise_size], noise_data_1[noise_size:]),axis=0)

def make_datasets(raw_data, train_size=100, noise_size=30):
  """Split a labelled dataset into clean training sets and noisy pools, per class.

  Every row gets its ground-truth label appended as the last column, so the
  label travels with the row through later shuffling and re-assignment.

  For each class (in sorted label order):
    * the first ``train_size`` rows become the clean training set;
    * the next ``noise_size`` rows are held out and pooled across classes;
    * the remaining rows form the start of that class's noisy pool.

  The pooled hold-out rows are shuffled with ``np.random.shuffle`` (global
  NumPy RNG) and dealt back in consecutive chunks of ``noise_size`` rows, one
  chunk per class. Because the pool is shuffled before being dealt, a chunk
  may contain rows whose true label matches the pool it lands in as well as
  rows from other classes. If a class has fewer than ``train_size +
  noise_size`` rows, the pool is shorter and later classes receive fewer
  (possibly zero) injected rows.

  Args:
    raw_data: object exposing ``data`` (2-D feature array) and ``target``
      (1-D label array), e.g. a scikit-learn ``Bunch``.
    train_size: number of rows per class reserved as clean training data.
    noise_size: number of rows per class held out and redistributed as noise.

  Returns:
    ``(train_data, noise_data, gt_labels)`` where the first two are dicts
    mapping label to a 2-D array (features plus label column) and
    ``gt_labels`` is the sorted list of distinct labels.
  """
  X, y = raw_data.data, raw_data.target
  gt_labels = np.unique(y).tolist()
  train_data = {}
  noise_data = {}
  held_out = []
  for label in gt_labels:
    features = X[y == label]
    label_column = label * np.ones((features.shape[0], 1))
    labelled = np.concatenate((features, label_column), axis=1)
    train_data[label] = labelled[:train_size]
    remainder = labelled[train_size:]
    held_out.append(remainder[:noise_size])
    noise_data[label] = remainder[noise_size:]

  # Nothing to pool when the dataset has no classes at all.
  if not held_out:
    return train_data, noise_data, gt_labels

  noisy_data = np.concatenate(held_out, axis=0)
  np.random.shuffle(noisy_data)
  for cnt, label in enumerate(noise_data):
    chunk = noisy_data[cnt * noise_size:(cnt + 1) * noise_size]
    noise_data[label] = np.concatenate((noise_data[label], chunk), axis=0)
  return train_data, noise_data, gt_labels

def default_array():
  """Factory returning a fresh, empty 1-D float array (used as a defaultdict default)."""
  empty = np.array([])
  return empty

def _append_rows(existing, rows):
  """Return ``rows`` stacked below ``existing``; ``None`` means nothing collected yet."""
  if existing is None:
    return rows
  return np.concatenate((existing, rows), axis=0)


def data_cleaning(train_data, noise_data, gt_labels, clf, epoch, contamination=0.2):
  """Iteratively clean noisy per-class pools with an outlier detector.

  In every epoch, and for every class:
    1. ``clf`` is fitted on that class's training features (all columns but
       the last, which holds the true label);
    2. ``clf.predict`` is run on that class's noisy pool; rows predicted
       ``-1`` are treated as suspected mislabels;
    3. suspected rows are "sent to a human annotator", assumed to be 100%
       accurate, i.e. they are re-filed under their true label (last column).
       Each such row adds 1 to the annotation ``cost``;
    4. rows predicted ``1`` stay in the pool.
  After all classes are processed, the rows annotated in this epoch are
  appended to the matching training sets, and the per-class label purity of
  (remaining pool + all annotated rows so far) is printed.

  Detection metrics treat "row is truly mislabelled" as the positive class:
  a false positive is a correctly-labelled row that was flagged, a false
  negative is a mislabelled row that was not flagged. Precision/recall are
  printed as ``-1`` when their denominator is zero.

  Args:
    train_data: dict label -> 2-D array (features + label column). Entries
      are rebound in place as the training sets grow.
    noise_data: dict label -> 2-D array of the same layout. Entries are
      rebound in place as flagged rows are removed.
    gt_labels: iterable of all class labels.
    clf: detector exposing ``fit(X)`` and ``predict(X)`` returning ``1`` /
      ``-1`` per row.
    epoch: number of cleaning rounds.
    contamination: currently unused; kept for interface compatibility. The
      detector's own configuration is what takes effect.

  Returns:
    None. Results are reported via ``print`` and via the mutated dicts.
  """
  cleaned_data = {}  # label -> every annotated row so far (all epochs)
  cost = 0
  for e in range(epoch):
    print(f'now epoch: {e}...')
    # Rows annotated during this epoch only. Extending the training sets with
    # the cumulative ``cleaned_data`` would re-add earlier epochs' rows and
    # duplicate them in the training data.
    newly_labeled = {}
    # Run anomaly detection on each pool using the current (possibly already
    # augmented) training set; flagged rows are assumed to be annotated by a
    # human with 100% accuracy.
    for key, val in train_data.items():
      noise_data_ = noise_data[key]
      if noise_data_.shape[0] == 0:
        # Pool already exhausted; there is nothing to predict on.
        continue
      clf.fit(val[:, :-1])
      labels = clf.predict(noise_data_[:, :-1])
      gt = noise_data_[:, -1] != key  # truly mislabelled
      pred = labels == -1             # flagged by the detector
      tp = np.sum(gt & pred)
      fp = np.sum((~gt) & pred)
      fn = np.sum(gt & (~pred))
      tn = np.sum((~gt) & (~pred))
      percesion = -1 if tp + fp == 0 else tp / (tp + fp)
      recall = -1 if tp + fn == 0 else tp / (tp + fn)
      print(f'anomal data, class {key}: tp:{tp}, fp:{fp}, fn:{fn}, tn:{tn}, precision:{percesion:.2f}, recall:{recall:.2f}')
      annotation_data = noise_data_[pred]
      anno = annotation_data[:, -1]
      cost += anno.shape[0]
      # File every annotated row under its true label.
      for i in gt_labels:
        rows = annotation_data[anno == i]
        cleaned_data[i] = _append_rows(cleaned_data.get(i), rows)
        newly_labeled[i] = _append_rows(newly_labeled.get(i), rows)
      noise_data[key] = noise_data_[labels == 1]
    # Augment the training sets with the rows annotated in this epoch.
    for key, rows in newly_labeled.items():
      train_data[key] = np.concatenate((train_data[key], rows), axis=0)
    # Report per-class data quality after this round of cleaning.
    for key, val in cleaned_data.items():
      tmp = np.concatenate((noise_data[key], val), axis=0)
      tp = np.sum(tmp[:, -1] == key)
      print(f'cleaned data, class: {key}, tp:{tp}, total: {tmp.shape[0]}, precision: {tp / tmp.shape[0]:.2f}')
  print(f'cost: {cost}')

# Outlier fraction handed to all three detectors below. data_cleaning() does
# not forward its own `contamination` argument to the classifier, so this
# module-level value is the one that actually takes effect.
contamination = 0.10
# novelty=True is needed so that LOF can predict on rows it was not fitted on,
# which is how data_cleaning() uses it (fit on training rows, predict on the pool).
LOF_clf = LocalOutlierFactor(n_neighbors=20, contamination=contamination, novelty=True)
EE_clf = EllipticEnvelope(contamination=contamination)
IF_clf = IsolationForest(n_estimators=100, max_samples='auto', contamination=contamination)

def compute_percesion(data):
  """Print the label purity of each class's array.

  ``data`` maps a class label to a 2-D array whose last column holds each
  row's true label. For every entry, prints the number of rows whose true
  label equals the key, the total row count, and their ratio.
  """
  for label in data:
    rows = data[label]
    total = rows.shape[0]
    matching = (rows[:, -1] == label).sum()
    print(f'class: {label}, tp: {matching}, total: {total}, precision: {matching / total:.2f}')

train_data, noise_data, gt_labels = make_datasets(cancer)

# Run the identical two-epoch cleaning experiment once per detector.
# Each run gets shallow copies of the dicts: data_cleaning() rebinds dict
# entries rather than editing arrays in place, so the originals stay intact
# and every detector starts from the same split.
for clf_name, detector in (('LOF_clf', LOF_clf), ('EE_clf', EE_clf), ('IF_clf', IF_clf)):
  print(clf_name)
  compute_percesion(noise_data)
  data_cleaning(train_data.copy(), noise_data.copy(), gt_labels.copy(), detector, 2)
  print('-----------------------------------')