In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

# JSON 파일 읽기
df=pd.read_json('data/spectrum.json').sort_values('created_at')
df['created_at'] = pd.to_datetime(df['created_at'], unit='s', utc=True) + pd.DateOffset(hours=9)
# spectrum 열을 리스트로 변환하고 각 값을 float로 변환
df['spectrum_x_amp'] = df['spectrum_x_amp'].apply(lambda x: [float(i) for i in x.split(',')])
df['spectrum_y_amp'] = df['spectrum_y_amp'].apply(lambda x: [float(i) for i in x.split(',')])
df['spectrum_z_amp'] = df['spectrum_z_amp'].apply(lambda x: [float(i) for i in x.split(',')])

In [2]:
# 빈 리스트를 준비
expanded_rows = []

# 각 행을 처리
for _, row in df.iterrows():
    x_list = row['spectrum_x_amp']
    y_list = row['spectrum_y_amp']
    z_list = row['spectrum_z_amp']
    
    # 모든 리스트의 길이가 같다고 가정
    for i in range(len(x_list)):
        expanded_rows.append({
            'asset_id': row['asset_id'],
            'created_at': row['created_at'],
            'spectrum_x_amp': x_list[i],
            'spectrum_y_amp': y_list[i],
            'spectrum_z_amp': z_list[i]
        })

# 데이터프레임으로 변환
df_expanded = pd.DataFrame(expanded_rows)

# 결과 확인
print(df_expanded.head())

                               asset_id                created_at  \
0  55285839-9b78-48d8-9f4e-573190ace016 2024-01-01 00:37:34+00:00   
1  55285839-9b78-48d8-9f4e-573190ace016 2024-01-01 00:37:34+00:00   
2  55285839-9b78-48d8-9f4e-573190ace016 2024-01-01 00:37:34+00:00   
3  55285839-9b78-48d8-9f4e-573190ace016 2024-01-01 00:37:34+00:00   
4  55285839-9b78-48d8-9f4e-573190ace016 2024-01-01 00:37:34+00:00   

   spectrum_x_amp  spectrum_y_amp  spectrum_z_amp  
0        0.000136        0.000158        0.000022  
1        0.000262        0.000263        0.000111  
2        0.000426        0.000355        0.000135  
3        0.000392        0.000121        0.000149  
4        0.000238        0.000450        0.000267  


In [None]:
# df_expanded.drop(columns=['asset_id', 'created_at'],inplace=True)

In [3]:
x_data = df_expanded['spectrum_x_amp'].values.reshape(-1,1)
y_data = df_expanded['spectrum_y_amp'].values.reshape(-1,1)
z_data = df_expanded['spectrum_z_amp'].values.reshape(-1,1)

In [4]:
scaler = MinMaxScaler()
x_data_scaled = scaler.fit_transform(x_data)
y_data_scaled = scaler.fit_transform(y_data)
z_data_scaled = scaler.fit_transform(z_data)

In [None]:
x_data_scaled, y_data_scaled, z_data_scaled

In [5]:
# 데이터를 수평으로 결합
data_stack = np.hstack([x_data_scaled, y_data_scaled, z_data_scaled])

In [None]:
data_stack

In [6]:
# 모델 초기화
contamination = 0.001
model = IsolationForest(contamination=contamination)  # contamination은 데이터에서 이상치 비율을 추정

# Isolation Forest 모델 초기화 및 학습
models = {
    'x': IsolationForest(contamination=contamination),
    'y': IsolationForest(contamination=contamination),
    'z': IsolationForest(contamination=contamination),
    'xy': IsolationForest(contamination=contamination),
    'yz': IsolationForest(contamination=contamination),
    'xz': IsolationForest(contamination=contamination),
    'xyz': IsolationForest(contamination=contamination)
}

# 모델 학습
models['x'].fit(x_data_scaled)
models['y'].fit(y_data_scaled)
models['z'].fit(z_data_scaled)
models['xy'].fit(np.hstack([x_data_scaled, y_data_scaled]))
models['yz'].fit(np.hstack([y_data_scaled, z_data_scaled]))
models['xz'].fit(np.hstack([x_data_scaled, z_data_scaled]))
models['xyz'].fit(np.hstack([x_data_scaled, y_data_scaled, z_data_scaled]))

# 이상치 예측
predictions = {
    'x': models['x'].predict(x_data_scaled),
    'y': models['y'].predict(y_data_scaled),
    'z': models['z'].predict(z_data_scaled),
    'xy': models['xy'].predict(np.hstack([x_data_scaled, y_data_scaled])),
    'yz': models['yz'].predict(np.hstack([y_data_scaled, z_data_scaled])),
    'xz': models['xz'].predict(np.hstack([x_data_scaled, z_data_scaled])),
    'xyz': models['xyz'].predict(np.hstack([x_data_scaled, y_data_scaled, z_data_scaled]))
}


In [8]:
# 예측 결과 -1은 이상치, 1은 정상 데이터
anomalies = {key: (pred == -1).astype(int) for key, pred in predictions.items()}

In [None]:
# 플로팅 함수 정의
def plot_1d(ax, data, anomalies, title, xlabel, ylabel):
    ax.scatter(range(len(data)), data, c=anomalies, cmap='coolwarm', label='Anomaly', edgecolor='k')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()

def plot_2d(ax, x_data, y_data, anomalies, title, xlabel, ylabel):
    scatter = ax.scatter(x_data, y_data, c=anomalies, cmap='coolwarm', label='Anomaly', edgecolor='k')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    plt.colorbar(scatter, ax=ax, label='Anomaly')

def plot_3d(ax, x_data, y_data, z_data, anomalies):
    scatter = ax.scatter3D(x_data, y_data, z_data, c=anomalies, cmap='coolwarm', label='Anomaly', edgecolor='k')
    ax.set_xlabel('X Axis')
    ax.set_ylabel('Y Axis')
    ax.set_zlabel('Z Axis')
    ax.set_title('XYZ Space')
    ax.legend()
    plt.colorbar(scatter, ax=ax, label='Anomaly')

# 플롯 생성
fig = plt.figure(figsize=(20, 15))

# 1D 플롯
ax = plt.subplot(2, 4, 1)
plot_1d(ax, x_data_scaled, anomalies['x'], 'X Axis Data', 'Index', 'X Axis')

ax = plt.subplot(2, 4, 2)
plot_1d(ax, y_data_scaled, anomalies['y'], 'Y Axis Data', 'Index', 'Y Axis')

ax = plt.subplot(2, 4, 3)
plot_1d(ax, z_data_scaled, anomalies['z'], 'Z Axis Data', 'Index', 'Z Axis')

# 2D 플롯
ax = plt.subplot(2, 4, 4)
plot_2d(ax, x_data_scaled, y_data_scaled, anomalies['xy'], 'XY Plane', 'X Axis', 'Y Axis')

ax = plt.subplot(2, 4, 5)
plot_2d(ax, y_data_scaled, z_data_scaled, anomalies['yz'], 'YZ Plane', 'Y Axis', 'Z Axis')

ax = plt.subplot(2, 4, 6)
plot_2d(ax, x_data_scaled, z_data_scaled, anomalies['xz'], 'XZ Plane', 'X Axis', 'Z Axis')

# 3D 플롯
ax = plt.subplot(2, 4, 7, projection='3d')
plot_3d(ax, x_data_scaled, y_data_scaled, z_data_scaled, anomalies['xyz'])

plt.tight_layout()
plt.show()

  plt.tight_layout()
  fig.canvas.print_figure(bytes_io, **kw)


In [None]:
# 이상치 예측
predictions = model.predict(data_stack)

# 예측 결과 -1은 이상치, 1은 정상 데이터
anomalies = (predictions == -1)

In [None]:
# 3D 플로팅
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# 정상 데이터
ax.scatter(x_data_scaled[~anomalies], y_data_scaled[~anomalies], z_data_scaled[~anomalies], c='blue', label='Normal')

# 이상치
ax.scatter(x_data_scaled[anomalies], y_data_scaled[anomalies], z_data_scaled[anomalies], c='red', label='Anomaly')

# 레이블 및 타이틀 설정
ax.set_xlabel('X Axis')
ax.set_ylabel('Y Axis')
ax.set_zlabel('Z Axis')
ax.set_title('Isolation Forest Anomaly Detection')
ax.legend()

plt.show()