In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import os
from itertools import chain
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# import transform_fns as trans
import pdm_functions as fns
import tensor_vstack as pfns
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from scipy.stats import gaussian_kde
from sklearn.utils.class_weight import compute_class_weight

import pickle
from tqdm.notebook import tqdm


In [4]:
# Load the pre-built dataset splits. weights_only=True restricts unpickling to
# tensors and plain containers instead of arbitrary Python objects.
data_stacked = torch.load('data/spec_datasets.pt', weights_only=True)

# Let -1 infer the number of samples rather than hard-coding 2600, so the cell
# keeps working if the stored split is regenerated with a different size.
X_train = data_stacked['X_train'].view(-1, 36, 2048)
y_train = data_stacked['y_train']
assert X_train.shape[0] == y_train.shape[0], "X_train and y_train disagree on sample count"
X_train.shape, y_train.shape

(torch.Size([2600, 36, 2048]), torch.Size([2600]))

In [6]:
# Reshape the test and dev splits to (samples, 36, 2048). -1 infers the sample
# count from the stored tensor instead of relying on hard-coded 37 / 370.
X_test = data_stacked['X_test'].view(-1, 36, 2048)
y_test = data_stacked['y_test']
# A bare `X_test.shape, y_test.shape` in the middle of a cell is never displayed,
# so check the invariant explicitly instead.
assert X_test.shape[0] == y_test.shape[0], "X_test and y_test disagree on sample count"

X_dev = data_stacked['X_dev'].view(-1, 36, 2048)
y_dev = data_stacked['y_dev']
assert X_dev.shape[0] == y_dev.shape[0], "X_dev and y_dev disagree on sample count"
X_dev.shape, y_dev.shape

(torch.Size([370, 36, 2048]), torch.Size([370]))

In [None]:
# df_expanded.drop(columns=['asset_id', 'created_at'],inplace=True)

In [9]:
# Isolation Forest is randomized; pin the seed so re-running the cell gives the same flags.
RANDOM_STATE = 42

results = []

# Convert the tensor once up front instead of calling .numpy() on every iteration.
# Each X_train[i] is already (36, 2048), so no extra reshape is needed.
X_train_np = X_train.numpy()

# Fit one Isolation Forest per sample: the 36 rows of a sample are the observations,
# the 2048 columns are the features, and each row is scored against the others.
for sample in X_train_np:
    model = IsolationForest(random_state=RANDOM_STATE)
    model.fit(sample)
    preds = model.predict(sample)  # -1 = outlier row, 1 = inlier row
    results.append(preds)

In [None]:
# NOTE(review): x_data_scaled / y_data_scaled / z_data_scaled are displayed here but are
# not assigned in any cell visible in this notebook — presumably they come from a removed
# or external cell. Confirm, since a fresh "Restart & Run All" would raise NameError here.
x_data_scaled, y_data_scaled, z_data_scaled

In [None]:
# NOTE(review): data_stack is displayed here but is not assigned in any cell visible in
# this notebook — presumably defined in a removed or external cell; confirm before relying on it.
data_stack

In [6]:
# Model setup
RANDOM_STATE = 42       # Isolation Forest is randomized; pin the seed for reproducible flags
contamination = 0.001   # assumed share of outliers in the data

# Single stand-alone model. It is only constructed here and has to be fitted
# before predict() is called on it.
model = IsolationForest(contamination=contamination, random_state=RANDOM_STATE)

# Feature matrix for every axis combination (columns concatenated side by side).
# Built once and reused for both fitting and prediction.
feature_sets = {
    'x': x_data_scaled,
    'y': y_data_scaled,
    'z': z_data_scaled,
    'xy': np.hstack([x_data_scaled, y_data_scaled]),
    'yz': np.hstack([y_data_scaled, z_data_scaled]),
    'xz': np.hstack([x_data_scaled, z_data_scaled]),
    'xyz': np.hstack([x_data_scaled, y_data_scaled, z_data_scaled]),
}

# One independent Isolation Forest per axis combination, fitted on its own features.
models = {
    key: IsolationForest(contamination=contamination, random_state=RANDOM_STATE).fit(features)
    for key, features in feature_sets.items()
}

# Predictions on the same data each model was fitted on: -1 = outlier, 1 = inlier.
predictions = {
    key: fitted.predict(feature_sets[key])
    for key, fitted in models.items()
}


In [8]:
# Turn raw predictions into 0/1 flags per model: 1 where the prediction is -1
# (outlier), 0 where it is 1 (normal).
anomalies = {
    name: np.equal(labels, -1).astype(int)
    for name, labels in predictions.items()
}

In [None]:
# Plotting helpers
def plot_1d(ax, data, anomalies, title, xlabel, ylabel):
    """Scatter ``data`` against its sample index, colored by anomaly flag.

    Args:
        ax: Matplotlib axes to draw on.
        data: Values to plot on the y axis; the x axis is ``range(len(data))``.
        anomalies: Per-point flags used for coloring, expected to be 0 (normal)
            or 1 (anomaly).
        title: Axes title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
    """
    # Pin the color scale to [0, 1]. With auto-scaling, a panel whose flags are
    # all identical is normalized to a single color, so an all-anomaly panel
    # would look exactly like an all-normal one.
    scatter = ax.scatter(range(len(data)), data, c=anomalies, cmap='coolwarm',
                         vmin=0, vmax=1, edgecolor='k')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    # Build one legend entry per class actually present, instead of labelling
    # every point 'Anomaly'. Fall back to matplotlib's own labels if the flags
    # are not the expected small set of discrete values.
    handles, default_labels = scatter.legend_elements()
    class_names = {0: 'Normal', 1: 'Anomaly'}
    flag_values = np.unique(anomalies)
    if len(handles) == len(flag_values):
        labels = [class_names.get(int(v), str(v)) for v in flag_values]
    else:
        labels = default_labels
    # A fixed location avoids the slow "best" placement search on large scatters.
    ax.legend(handles, labels, loc='upper right')

def plot_2d(ax, x_data, y_data, anomalies, title, xlabel, ylabel):
    """Scatter ``x_data`` against ``y_data``, colored by anomaly flag, with a colorbar.

    Args:
        ax: Matplotlib axes to draw on.
        x_data: Values for the x axis.
        y_data: Values for the y axis.
        anomalies: Per-point flags used for coloring, expected to be 0 (normal)
            or 1 (anomaly).
        title: Axes title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
    """
    # Pin the color scale to [0, 1] so a panel with only one class present is
    # not auto-normalized to a single, ambiguous color.
    scatter = ax.scatter(x_data, y_data, c=anomalies, cmap='coolwarm',
                         vmin=0, vmax=1, edgecolor='k')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    # Build one legend entry per class actually present, instead of labelling
    # every point 'Anomaly'. Fall back to matplotlib's own labels if the flags
    # are not the expected small set of discrete values.
    handles, default_labels = scatter.legend_elements()
    class_names = {0: 'Normal', 1: 'Anomaly'}
    flag_values = np.unique(anomalies)
    if len(handles) == len(flag_values):
        labels = [class_names.get(int(v), str(v)) for v in flag_values]
    else:
        labels = default_labels
    # A fixed location avoids the slow "best" placement search on large scatters.
    ax.legend(handles, labels, loc='upper right')

    # Attach the colorbar to the figure that owns `ax`, not to whichever figure
    # happens to be current in pyplot.
    ax.figure.colorbar(scatter, ax=ax, label='Anomaly')

def plot_3d(ax, x_data, y_data, z_data, anomalies):
    """Draw a 3D scatter of the three axes, colored by anomaly flag, with a colorbar.

    Axis labels ('X Axis', 'Y Axis', 'Z Axis') and the title ('XYZ Space') are fixed.

    Args:
        ax: Matplotlib 3D axes to draw on (must provide ``scatter3D`` / ``set_zlabel``).
        x_data: Values for the x axis.
        y_data: Values for the y axis.
        z_data: Values for the z axis.
        anomalies: Per-point flags used for coloring, expected to be 0 (normal)
            or 1 (anomaly).
    """
    # Pin the color scale to [0, 1] so a panel with only one class present is
    # not auto-normalized to a single, ambiguous color.
    scatter = ax.scatter3D(x_data, y_data, z_data, c=anomalies, cmap='coolwarm',
                           vmin=0, vmax=1, edgecolor='k')
    ax.set_xlabel('X Axis')
    ax.set_ylabel('Y Axis')
    ax.set_zlabel('Z Axis')
    ax.set_title('XYZ Space')

    # Build one legend entry per class actually present, instead of labelling
    # every point 'Anomaly'. Fall back to matplotlib's own labels if the flags
    # are not the expected small set of discrete values.
    handles, default_labels = scatter.legend_elements()
    class_names = {0: 'Normal', 1: 'Anomaly'}
    flag_values = np.unique(anomalies)
    if len(handles) == len(flag_values):
        labels = [class_names.get(int(v), str(v)) for v in flag_values]
    else:
        labels = default_labels
    # A fixed location avoids the slow "best" placement search on large scatters.
    ax.legend(handles, labels, loc='upper right')

    # Attach the colorbar to the figure that owns `ax`, not to whichever figure
    # happens to be current in pyplot.
    ax.figure.colorbar(scatter, ax=ax, label='Anomaly')

# Assemble the overview figure: a 2x4 grid holding three 1D panels (slots 1-3),
# three 2D panels (slots 4-6) and one 3D panel (slot 7).
fig = plt.figure(figsize=(20, 15))

# 1D panels: (grid slot, values, anomalies key, title, y-axis label)
panels_1d = (
    (1, x_data_scaled, 'x', 'X Axis Data', 'X Axis'),
    (2, y_data_scaled, 'y', 'Y Axis Data', 'Y Axis'),
    (3, z_data_scaled, 'z', 'Z Axis Data', 'Z Axis'),
)
for slot, values, key, panel_title, axis_label in panels_1d:
    ax = fig.add_subplot(2, 4, slot)
    plot_1d(ax, values, anomalies[key], panel_title, 'Index', axis_label)

# 2D panels: (grid slot, horizontal values, vertical values, anomalies key,
#             title, x-axis label, y-axis label)
panels_2d = (
    (4, x_data_scaled, y_data_scaled, 'xy', 'XY Plane', 'X Axis', 'Y Axis'),
    (5, y_data_scaled, z_data_scaled, 'yz', 'YZ Plane', 'Y Axis', 'Z Axis'),
    (6, x_data_scaled, z_data_scaled, 'xz', 'XZ Plane', 'X Axis', 'Z Axis'),
)
for slot, horizontal, vertical, key, panel_title, h_label, v_label in panels_2d:
    ax = fig.add_subplot(2, 4, slot)
    plot_2d(ax, horizontal, vertical, anomalies[key], panel_title, h_label, v_label)

# 3D panel: all three axes together, flagged by the model fitted on x, y and z jointly.
ax = fig.add_subplot(2, 4, 7, projection='3d')
plot_3d(ax, x_data_scaled, y_data_scaled, z_data_scaled, anomalies['xyz'])

fig.tight_layout()
plt.show()

  plt.tight_layout()
  fig.canvas.print_figure(bytes_io, **kw)


In [None]:
# Outlier prediction with the single stand-alone model.
# NOTE(review): in the visible cells `model` is only constructed, never fitted on
# `data_stack`, so calling predict() directly fails on a fresh kernel. Fit it here on
# the same matrix it scores so this cell is self-contained — confirm this matches the
# intended training data.
predictions = model.fit(data_stack).predict(data_stack)

# Boolean mask: True where the model returned -1 (outlier), False where it returned 1 (normal).
# Note that this rebinds `predictions` and `anomalies`, which earlier cells use as
# per-axis dictionaries.
anomalies = (predictions == -1)

In [None]:
# 3D scatter of the Isolation Forest result: normal points in blue, anomalies in red.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

axes_data = (x_data_scaled, y_data_scaled, z_data_scaled)

# Normal points: rows where the boolean anomaly mask is False.
ax.scatter(*(values[~anomalies] for values in axes_data), c='blue', label='Normal')

# Anomalous points: rows where the mask is True.
ax.scatter(*(values[anomalies] for values in axes_data), c='red', label='Anomaly')

# Axis labels, title and legend.
ax.set(
    xlabel='X Axis',
    ylabel='Y Axis',
    zlabel='Z Axis',
    title='Isolation Forest Anomaly Detection',
)
ax.legend()

plt.show()