## Isolation Forest（孤立森林）

Isolation Forest 是一個非監督式的異常檢測演算法，其核心原理是不斷隨機選取特徵與切分點，將資料切分成子群，再依據每一個資料點被孤立時所在的深度，判斷其是否為異常值。

Isolation Forest 與一般需要計算密度、距離的演算法不同，它是透過特徵切分來孤立資料點：異常資料點的特徵與大多數正常資料不同，因此很容易在淺層就被區分出來。只要計算樣本在每一棵樹中的深度，就可以判斷該樣本是否為異常值！
（通常計算效率高也可以處理大量資料）

如果某次選取的特徵剛好沒有鑑別力，使得正常樣本的深度很淺而被誤判為異常值，該怎麼辦？這個問題可以透過建立多棵樹來避免：每一棵樹都會隨機選擇特徵以及 threshold，再綜合多棵樹的結果（類似計算平均深度），就不容易誤判了！

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from tqdm import tqdm
import logging
import tensorflow as tf
import joblib  # Import joblib for saving the model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.pipeline import Pipeline


In [None]:

# Configure logging: INFO level, using the root logger for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()


# 2. Data preprocessing
input_path = '../dataSets'

# Load the training CSV and drop the `lettr` column
# (presumably the letter label — TODO confirm against the dataset description).
train_data = pd.read_csv(os.path.join(input_path, "training.csv")).drop(columns=["lettr"])

# Hold out part of the training data as a validation set.
VALIDATE_SIZE = 0.3
RANDOM_SEED = 42
X_train, X_valid = train_test_split(
    train_data,
    test_size=VALIDATE_SIZE,
    random_state=RANDOM_SEED,
)

print(f"""Shape of the datasets:
    training (rows, cols) = {X_train.shape}
    validate (rows, cols) = {X_valid.shape}""")


# Preprocessing pipeline: Normalizer followed by MinMaxScaler.
pipeline = Pipeline(steps=[
    ('normalizer', Normalizer()),
    ('scaler', MinMaxScaler()),
])

# Fit on the training split only, then apply the fitted pipeline to both splits.
x_train = pipeline.fit(X_train).transform(X_train)
x_valid = pipeline.transform(X_valid)



Shape of the datasets:
    training (rows, cols) = (2940, 16)
    validate (rows, cols) = (1260, 16)


In [3]:

# def calculate_isolation_loss(data, model):
#     # 對每個樣本給定一個異常分數
#     anomaly_scores = model.decision_function(data) 
#     isolation_errors = np.mean(np.power(anomaly_scores, 2)) # mse
#     return isolation_errors



# 目標函數
def objective(params):
    """Hyperopt objective: fit an Isolation Forest and score it on the validation split.

    The model is fitted on the module-level ``x_train`` and evaluated on the
    module-level ``x_valid``.

    Args:
        params: Dict of sampled hyperparameters. Only ``n_estimators`` and
            ``contamination`` are read; ``max_samples`` is fixed to ``'auto'``,
            so a ``max_samples`` entry in ``params`` has no effect on the model.

    Returns:
        Dict with ``loss`` (mean of ``decision_function`` over ``x_valid``),
        ``status`` (``STATUS_OK``) and ``params`` (the input dict, unchanged).

    Side effects:
        When the loss is lower than the best seen so far, records it on
        ``objective.best_loss`` / ``objective.best_params``, dumps the fitted
        model to ``best_forest/best_model_weights.pkl`` (overwriting any
        previous file) and prints the new best parameters and loss.
    """
    # Build the Isolation Forest model.
    model = IsolationForest(
        n_estimators=int(params['n_estimators']),
        # `max_samples` is intentionally not taken from `params`.
        max_samples='auto',
        contamination=params['contamination'],
        random_state=42,
    )
    model.fit(x_train)

    # Per-sample anomaly scores on the validation split.
    anomaly_scores = model.decision_function(x_valid)

    # The loss is the plain mean of the anomaly scores.
    # NOTE(review): per the scikit-learn docs, decision_function scores are
    # shifted by a contamination-dependent offset, so minimising their mean
    # may mostly reward larger `contamination` — confirm this metric is intended.
    val_loss = np.mean(anomaly_scores)

    # Fall back to +inf when no best loss has been recorded yet, so the
    # function does not depend on a separate initialisation statement.
    best_so_far = getattr(objective, 'best_loss', float('inf'))

    # Persist the model and parameters only when the loss improves.
    if val_loss < best_so_far:
        objective.best_loss = val_loss
        objective.best_params = params

        # Save the fitted model with joblib.
        model_dir = 'best_forest'
        os.makedirs(model_dir, exist_ok=True)
        model_path = os.path.join(model_dir, 'best_model_weights.pkl')
        joblib.dump(model, model_path)

        # Report the new best hyperparameters.
        print("Best hyperparameters: ", params)
        print(f"Best loss: {val_loss}")

    return {'loss': val_loss, 'status': STATUS_OK, 'params': params}


# Initialise the running-best loss that `objective` compares against.
objective.best_loss = float('inf')

# Hyperparameter search space.
space = dict(
    # Number of trees.
    n_estimators=hp.choice('n_estimators', [100, 200, 300, 1000, 2000, 3000]),
    # Fraction of samples drawn per tree.
    max_samples=hp.uniform('max_samples', 0.5, 1.0),
    # Expected proportion of anomalous samples.
    contamination=hp.uniform('contamination', 0.01, 0.1),
)

# # 超參數搜尋空間
# space = {
#     'n_estimators': hp.choice('n_estimators', [100]),               # 樹的數量
#     'max_samples': hp.uniform('max_samples', 0.5, 1.0),             # 隨機樣本數量
#     'contamination': hp.uniform('contamination', 0.01, 0.1)         # 異常樣本的比例
# }

# Run the hyperparameter search with Hyperopt (TPE, 10 evaluations).
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
    verbose=0,
    show_progressbar=True,
)


# Report the best trial, reading the params/loss that `objective` returned.
best_params, best_loss = (
    trials.best_trial['result'][key] for key in ('params', 'loss')
)
logger.info("Best hyperparameters: %s", best_params)
logger.info("Best loss: %f", best_loss)



INFO:hyperopt.tpe:build_posterior_wrapper took 0.000000 seconds
INFO:hyperopt.tpe:TPE using 0 trials
INFO:hyperopt.tpe:build_posterior_wrapper took 0.000536 seconds
INFO:hyperopt.tpe:TPE using 1/1 trials with best loss 0.052843
INFO:hyperopt.tpe:build_posterior_wrapper took 0.000000 seconds
INFO:hyperopt.tpe:TPE using 2/2 trials with best loss 0.049191


Best hyperparameters:  {'contamination': 0.07826188124409683, 'max_samples': 0.6945413344448359, 'n_estimators': 1000}
Best loss: 0.05284319155483954
Best hyperparameters:  {'contamination': 0.09540391905903309, 'max_samples': 0.7895819578647452, 'n_estimators': 100}
Best loss: 0.04919099034564791


INFO:hyperopt.tpe:build_posterior_wrapper took 0.001000 seconds
INFO:hyperopt.tpe:TPE using 3/3 trials with best loss 0.049191
INFO:hyperopt.tpe:build_posterior_wrapper took 0.000000 seconds
INFO:hyperopt.tpe:TPE using 4/4 trials with best loss 0.049191
INFO:hyperopt.tpe:build_posterior_wrapper took 0.001000 seconds
INFO:hyperopt.tpe:TPE using 5/5 trials with best loss 0.049191
INFO:hyperopt.tpe:build_posterior_wrapper took 0.000000 seconds
INFO:hyperopt.tpe:TPE using 6/6 trials with best loss 0.049191
INFO:hyperopt.tpe:build_posterior_wrapper took 0.001001 seconds
INFO:hyperopt.tpe:TPE using 7/7 trials with best loss 0.048760


Best hyperparameters:  {'contamination': 0.09765716209726158, 'max_samples': 0.9008788036187498, 'n_estimators': 200}
Best loss: 0.04875991769112048


INFO:hyperopt.tpe:build_posterior_wrapper took 0.000000 seconds
INFO:hyperopt.tpe:TPE using 8/8 trials with best loss 0.048760
INFO:hyperopt.tpe:build_posterior_wrapper took 0.001002 seconds
INFO:hyperopt.tpe:TPE using 9/9 trials with best loss 0.048437


Best hyperparameters:  {'contamination': 0.09492990215205609, 'max_samples': 0.8157492680691261, 'n_estimators': 3000}
Best loss: 0.04843745293005307


INFO:root:Best hyperparameters: {'contamination': 0.09492990215205609, 'max_samples': 0.8157492680691261, 'n_estimators': 3000}
INFO:root:Best loss: 0.048437


In [4]:
# Load the test set and run it through the already-fitted preprocessing pipeline.
test_csv_path = os.path.join(input_path, "test_X.csv")
test_data = pd.read_csv(test_csv_path)
x_test = pipeline.transform(test_data)

# Reload the best model saved during the hyperparameter search.
best_model_path = 'best_forest/best_model_weights.pkl'
best_model = joblib.load(best_model_path)

# Per-sample anomaly scores for the test set.
anomaly_scores = best_model.decision_function(x_test)

# Mean anomaly score over the test set.
test_loss = np.mean(anomaly_scores)
print(f"test_X.csv")
print(f"test_loss: {test_loss}")

# Write one row per test sample: its row index and its anomaly score.
results = pd.DataFrame(
    {
        'id': test_data.index,
        'outliers': anomaly_scores,
    }
)
results.to_csv("best_forest/anomaly_detection_results.csv", index=False)

test_X.csv
test_loss: 0.03097978191847972


In [5]:
# import pandas as pd
# import os
# # 讀取 CSV 檔案
# input_path = 'best_forest/anomaly_detection_results_val.csv'  # 根據實際情況修改此路徑
# result = pd.read_csv(input_path)
# result_cout = len(result)
# print(f"Total data count for letters: {result_cout}")


# # 目標字母ID
# target_ids = ['B', 'E', 'K', 'N', 'X', 'Z']

# # 1. 找出 id 屬於 ['B', 'E', 'K', 'N', 'X', 'Z'] 且 outliers = -1 的情況
# outliers_negative = result[(result['id'].isin(target_ids)) & (result['outliers'] == -1)]

# # 2. 找出 id 不是 ['B', 'E', 'K', 'N', 'X', 'Z'] 且 outliers = 1 的情況
# outliers_positive = result[~result['id'].isin(target_ids) & (result['outliers'] == 1)]

# # 3. 統計符合上述條件的數量
# total_outliers = len(outliers_negative) + len(outliers_positive)

# # 顯示結果
# print(f"Count of outliers where id in ['B', 'E', 'K', 'N', 'X', 'Z'] and outliers = -1: {len(outliers_negative)}")
# print(f"Count of outliers where id not in ['B', 'E', 'K', 'N', 'X', 'Z'] and outliers = 1: {len(outliers_positive)}")
# print(f"Total count: {total_outliers}")