In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalAveragePooling1D, Dense
from sklearn.model_selection import train_test_split

In [2]:
# 1. 데이터 로드 및 전처리
# rms_x, rms_y, ..., vel_peak2peak_z 38개 피처

df = pd.read_csv('55285839-9b78-48d8-9f4e-573190ace016_data.csv') # 데이터 로드
X = df.iloc[:, 4:42].values  # 피처만 추출
y = df.iloc[:, 44:45].values # imbalance 44
y = np.where(np.isnan(y), 1, y) # nan을 1(정상)으로
y = np.where(y != 1, 0, y) # 1이 아니면 0(비정상)으로

# 피처 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

y

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [3]:
# 2. 1D-CNN 모델 구축
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_scaled.shape[1], 1)))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))  # 잠재 벡터 크기
model.add(Dense(1, activation='sigmoid'))  # 정상/비정상 분류를 위한 출력층 (있을 경우)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [4]:
# 3. 잠재 벡터 추출
extractor = Sequential(model.layers[:-1])  # 마지막 레이어를 제외하고 잠재 벡터를 추출하는 모델 생성
latent_vectors = extractor.predict(X_scaled)



In [5]:
# 4. 군집화
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(latent_vectors)

# 5. 군집 결과 확인
df['cluster'] = kmeans.labels_
print(df[['created_at', 'cluster']])

# 정상(0), 비정상(1)

      created_at  cluster
0     1704037054        0
1     1704044261        0
2     1704051464        0
3     1704058671        0
4     1704065874        0
...          ...      ...
2835  1724568616        0
2836  1724575824        0
2837  1724583026        0
2838  1724590235        0
2839  1724597437        0

[2840 rows x 2 columns]


  super()._check_params_vs_input(X, default_n_init=10)


In [12]:
df[df['cluster'] == 1]

Unnamed: 0,asset_id,created_at,created_at_datetime,temperature,rms_x,rms_y,rms_z,rms_xyz,vel_rms_x,vel_rms_y,...,vel_peak2peak_x,vel_peak2peak_y,vel_peak2peak_z,time,imbalance_health,misalignment_health,looseness_health,bearing_health,asset_health,cluster
16,55285839-9b78-48d8-9f4e-573190ace016,1704152339,2024-01-02 08:38:59,16.0,0.149122,0.190456,0.120783,0.270368,0.877801,0.853366,...,5.75759,5.57709,4.26231,2024-01-03,,,,,1.000000,1
17,55285839-9b78-48d8-9f4e-573190ace016,1704159548,2024-01-02 10:39:08,21.0,0.223930,0.235981,0.149175,0.357889,1.428040,1.138620,...,9.50220,7.77415,4.43434,2024-01-03,,,,,1.000000,1
19,55285839-9b78-48d8-9f4e-573190ace016,1704173958,2024-01-02 14:39:18,26.0,0.220175,0.226797,0.131103,0.342202,1.412050,1.224570,...,8.89010,8.10443,4.40064,2024-01-03,,,,,1.000000,1
20,55285839-9b78-48d8-9f4e-573190ace016,1704181161,2024-01-02 16:39:21,27.0,0.233231,0.249498,0.162325,0.378147,1.316060,1.092420,...,8.44642,6.47384,5.29463,2024-01-03,,,,,1.000000,1
32,55285839-9b78-48d8-9f4e-573190ace016,1704267632,2024-01-03 16:40:32,24.0,0.253442,0.229400,0.150004,0.373307,1.208150,1.278480,...,9.01112,7.82503,5.12223,2024-01-04,,,,,1.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2750,55285839-9b78-48d8-9f4e-573190ace016,1723956140,2024-08-18 13:42:20,39.0,0.186857,0.348881,0.162665,0.427894,0.716945,0.803257,...,5.13138,5.98466,3.84684,2024-08-19,1.0,1.0,1.0,1.0,1.000000,1
2751,55285839-9b78-48d8-9f4e-573190ace016,1723963343,2024-08-18 15:42:23,40.0,0.178800,0.329450,0.171311,0.412134,1.039540,0.959725,...,6.09286,6.55454,5.05450,2024-08-19,1.0,1.0,1.0,1.0,1.000000,1
2802,55285839-9b78-48d8-9f4e-573190ace016,1724330832,2024-08-22 21:47:12,38.0,0.182306,0.320357,0.164498,0.403638,0.970890,0.802556,...,6.20782,5.49682,3.82732,2024-08-23,1.0,1.0,1.0,1.0,0.977088,1
2806,55285839-9b78-48d8-9f4e-573190ace016,1724359655,2024-08-23 05:47:35,35.0,0.179574,0.299458,0.164737,0.386083,0.852224,0.764971,...,5.65891,4.77972,4.38385,2024-08-24,1.0,1.0,1.0,1.0,0.993823,1


In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 날짜별로 cluster가 1인 값(비정상)이 하나라도 있는지 확인
anomaly_by_date = df.groupby(['time'])['cluster'].apply(lambda x: 1 in x.values).reset_index(name='is_anomaly')

# 날짜별로 health가 0(비정상)이 아닌 값이 있는지 확인
# x.isna().any() or 
health_by_date = df.groupby(['time'])['imbalance_health'].apply(lambda x: x.isna().any() or any(x < 1)).reset_index(name='is_not_healthy')

print(anomaly_by_date) # False: 정상 판단, True: 비정상 판단
print(health_by_date) # False: 실제정상, True: 실제 비정상

# 두 결과를 병합하여 평가 데이터셋 생성
evaluation_df = pd.merge(anomaly_by_date, health_by_date, on=['time'])

# 실제 값과 예측 값
y_true = evaluation_df['is_not_healthy']
y_pred = evaluation_df['is_anomaly']

# 정확도, 정밀도, 재현율, F1 스코어 계산
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


           time  is_anomaly
0    2024-01-02       False
1    2024-01-03        True
2    2024-01-04        True
3    2024-01-05        True
4    2024-01-06        True
..          ...         ...
232  2024-08-22       False
233  2024-08-23        True
234  2024-08-24        True
235  2024-08-25        True
236  2024-08-26       False

[237 rows x 2 columns]
           time  is_not_healthy
0    2024-01-02            True
1    2024-01-03            True
2    2024-01-04            True
3    2024-01-05            True
4    2024-01-06            True
..          ...             ...
232  2024-08-22           False
233  2024-08-23           False
234  2024-08-24           False
235  2024-08-25           False
236  2024-08-26           False

[237 rows x 2 columns]
Accuracy: 0.32
Precision: 0.17
Recall: 0.66
F1 Score: 0.27


In [30]:
evaluation_df[evaluation_df['is_anomaly'] == True] #비정상 판단

Unnamed: 0,time,is_anomaly,is_not_healthy
1,2024-01-03,True,True
2,2024-01-04,True,True
3,2024-01-05,True,True
4,2024-01-06,True,True
6,2024-01-08,True,True
...,...,...,...
228,2024-08-18,True,False
229,2024-08-19,True,False
233,2024-08-23,True,False
234,2024-08-24,True,False


In [31]:
len(evaluation_df[evaluation_df['is_not_healthy'] == True]) #실제 비정상개수

44