In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn.inspection import permutation_importance
from tqdm import tqdm
from sklearn.ensemble import IsolationForest
from datetime import datetime, timedelta

In [2]:
# Directory that holds the exported CSV files (relative to the working directory).
data_dir = "data/PdM/"

# Load the sensor-signal table and the health-status history table.
# The literal text \N in the files is parsed as a missing value (NaN).
df_sigdata = pd.read_csv(data_dir + "ics_asset_sigdata.csv", na_values='\\N')
df_status_hist = pd.read_csv(data_dir + "ics_asset_status_hist.csv", na_values='\\N')

# Relabel the columns positionally: whatever header the CSV carried is replaced
# by these 42 names, in order. pandas raises if the file's column count differs.
# NOTE(review): this assumes the CSV column order matches this list exactly — confirm against the export.
df_sigdata.columns = ['asset_id', 'created_at',	'temperature', 'voltage',
                      'rms_x', 'rms_y', 'rms_z', 'rms_xyz', 'vel_rms_x', 'vel_rms_y', 'vel_rms_z', 'vel_rms_xyz',
                      'skewness_x', 'skewness_y', 'skewness_z', 'vel_skewness_x', 'vel_skewness_y', 'vel_skewness_z',
                      'kurtosis_x', 'kurtosis_y', 'kurtosis_z', 'vel_kurtosis_x', 'vel_kurtosis_y', 'vel_kurtosis_z',
                      'crest_factor_x', 'crest_factor_y', 'crest_factor_z', 'vel_crest_factor_x', 'vel_crest_factor_y', 'vel_crest_factor_z',
                      'peak_x', 'peak_y', 'peak_z', 'vel_peak_x', 'vel_peak_y', 'vel_peak_z',
                      'peak2peak_x', 'peak2peak_y', 'peak2peak_z', 'vel_peak2peak_x', 'vel_peak2peak_y', 'vel_peak2peak_z']
# Same positional relabel for the status history (8 names).
df_status_hist.columns = ['asset_id', 'time', 'imbalance_health', 'misalignment_health', 'looseness_health', 'bearing_health', 'asset_health', 'CRT_DT']

In [3]:
# Convert created_at from epoch seconds to datetime and shift it by +9 hours.
# NOTE(review): the +9h shift presumably moves UTC to Korea Standard Time — confirm.
KST_OFFSET = pd.Timedelta(hours=9)

# Guard on the dtype so that re-running this cell does not add the offset a
# second time to a column that has already been converted.
if not pd.api.types.is_datetime64_any_dtype(df_sigdata['created_at']):
    df_sigdata['created_at'] = pd.to_datetime(df_sigdata['created_at'], unit='s') + KST_OFFSET

# Convert the status-history time column to datetime (a no-op when it already is one).
df_status_hist['time'] = pd.to_datetime(df_status_hist['time'])

In [4]:
# Keep only data from 2024-01-10 through 2024-07-31 (both days inclusive).
# The upper bound is written as an exclusive "< 2024-08-01": comparing a
# timestamp with "<= 2024-07-31" means "<= 2024-07-31 00:00:00" and would
# discard every time-stamped signal row recorded during 31 July.
PERIOD_START = pd.Timestamp('2024-01-10')
PERIOD_END_EXCLUSIVE = pd.Timestamp('2024-08-01')

sig_in_period = (df_sigdata['created_at'] >= PERIOD_START) & (df_sigdata['created_at'] < PERIOD_END_EXCLUSIVE)
df_sigdata = df_sigdata[sig_in_period].reset_index(drop=True)

status_in_period = (df_status_hist['time'] >= PERIOD_START) & (df_status_hist['time'] < PERIOD_END_EXCLUSIVE)
df_status_hist = (
    df_status_hist[status_in_period]
    .reset_index(drop=True)
    # errors='ignore' keeps the cell re-runnable after CRT_DT has already been dropped.
    .drop(columns='CRT_DT', errors='ignore')
)

In [5]:
# Per-asset results: one DataFrame per asset_id, each with an added 'anomaly' column.
modeling_list = []
proceed = 0  # 1-based progress counter; stays 0 when there are no assets

# Compute the asset list once instead of calling unique() again on every iteration.
asset_ids = df_sigdata['asset_id'].unique()
n_assets = len(asset_ids)

# Fit a separate Isolation Forest on each asset's own signal rows.
for proceed, asset_id in enumerate(asset_ids, start=1):
    print(f"Processing asset_id: {asset_id}  진행도: {proceed}/{n_assets}")

    # Rows belonging to this asset (copied so the new column does not touch df_sigdata).
    asset_data = df_sigdata[df_sigdata['asset_id'] == asset_id].copy()

    # Columns excluded from training: timestamp, identifier and temperature.
    X = asset_data.drop(columns=['created_at', 'asset_id', 'temperature'])

    # Fixed random_state keeps the fitted forest reproducible between runs.
    model = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)
    model.fit(X)

    # Store the model's prediction for each training row; cell 10 treats -1 as anomalous.
    asset_data['anomaly'] = model.predict(X)

    modeling_list.append(asset_data)


Processing asset_id: 02dc4105-ca5e-4770-a6fc-d1fdff11fc1c  진행도: 1/47
Processing asset_id: 09e5b7b7-43d6-456a-96e5-3fb35a13210b  진행도: 2/47
Processing asset_id: 0d3cfbf4-d417-4e83-9ab7-8d49643c74bf  진행도: 3/47
Processing asset_id: 0f5e48b9-459c-4338-adb2-03de5debeb02  진행도: 4/47
Processing asset_id: 12b5eba2-5d2a-4d96-806e-6d8a37bb34db  진행도: 5/47
Processing asset_id: 195394c0-792d-4cf8-b021-aa27e0539554  진행도: 6/47
Processing asset_id: 22fca627-7792-4e6d-8651-fc7b1f9b1f3e  진행도: 7/47
Processing asset_id: 351d2688-2c91-4963-8db0-53fb4fce18ce  진행도: 8/47
Processing asset_id: 352ddf13-01d8-45bd-b8a9-22f9968a0e11  진행도: 9/47
Processing asset_id: 41a2ab5e-ef77-4480-9466-37b667a18a18  진행도: 10/47
Processing asset_id: 435996ea-6625-4c91-b7d5-297b7742cb24  진행도: 11/47
Processing asset_id: 471e8e21-9649-4405-8da7-ad8900ad0b49  진행도: 12/47
Processing asset_id: 4b63fc16-6b90-4823-9622-c8533e7102cc  진행도: 13/47
Processing asset_id: 4caeb80e-ad80-46d4-bb0f-778eaa849daf  진행도: 14/47
Processing asset_id: 51333266

In [6]:
# Stack the per-asset frames row-wise into a single DataFrame (original row labels are kept).
final_result = pd.concat(modeling_list, axis=0)

In [7]:
# Health-score columns taken from the status history.
health_columns = ['imbalance_health', 'misalignment_health', 'looseness_health', 'bearing_health', 'asset_health']

# Attach the health scores to every signal row by matching on (asset_id, calendar day).
final_result = (
    final_result
    # Remove health columns left by an earlier run of this cell; otherwise a
    # second merge would create *_x / *_y duplicates and break later lookups.
    .drop(columns=health_columns, errors='ignore')
    # Temporary join key: created_at truncated to midnight of the same day.
    .assign(time=lambda frame: pd.to_datetime(frame['created_at']).dt.normalize())
    # Left join: signal rows without a status record keep NaN health values.
    .merge(
        df_status_hist[['asset_id', 'time'] + health_columns],
        on=['asset_id', 'time'],
        how='left',
    )
    .drop(columns=['time'])
)
print(final_result)

                                   asset_id          created_at  temperature  \
0      02dc4105-ca5e-4770-a6fc-d1fdff11fc1c 2024-01-10 01:51:24           14   
1      02dc4105-ca5e-4770-a6fc-d1fdff11fc1c 2024-01-10 03:51:33           13   
2      02dc4105-ca5e-4770-a6fc-d1fdff11fc1c 2024-01-10 05:51:36           14   
3      02dc4105-ca5e-4770-a6fc-d1fdff11fc1c 2024-01-10 07:51:46           12   
4      02dc4105-ca5e-4770-a6fc-d1fdff11fc1c 2024-01-10 09:51:49           13   
...                                     ...                 ...          ...   
96857  ff153d23-b944-4394-b1a1-be523e79febb 2024-04-13 23:57:55           25   
96858  ff153d23-b944-4394-b1a1-be523e79febb 2024-04-14 00:02:02           26   
96859  ff153d23-b944-4394-b1a1-be523e79febb 2024-04-14 00:06:08           26   
96860  ff153d23-b944-4394-b1a1-be523e79febb 2024-04-14 00:10:16           26   
96861  ff153d23-b944-4394-b1a1-be523e79febb 2024-04-14 00:14:24           26   

       voltage     rms_x     rms_y     

In [8]:
# Number of missing values in each column of the merged result.
final_result.isna().sum(axis=0)

asset_id                   0
created_at                 0
temperature                0
voltage                    0
rms_x                      0
rms_y                      0
rms_z                      0
rms_xyz                    0
vel_rms_x                  0
vel_rms_y                  0
vel_rms_z                  0
vel_rms_xyz                0
skewness_x                 0
skewness_y                 0
skewness_z                 0
vel_skewness_x             0
vel_skewness_y             0
vel_skewness_z             0
kurtosis_x                 0
kurtosis_y                 0
kurtosis_z                 0
vel_kurtosis_x             0
vel_kurtosis_y             0
vel_kurtosis_z             0
crest_factor_x             0
crest_factor_y             0
crest_factor_z             0
vel_crest_factor_x         0
vel_crest_factor_y         0
vel_crest_factor_z         0
peak_x                     0
peak_y                     0
peak_z                     0
vel_peak_x                 0
vel_peak_y    

In [9]:
# Evaluation: compare the model's anomaly flags with the asset_health labels

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# An asset-day counts as "not healthy" when any asset_health value that day is below this.
HEALTH_THRESHOLD = 0.9

# Calendar day of each signal row; used together with asset_id as the grouping key.
final_result['date'] = final_result['created_at'].dt.date
day_keys = [final_result['asset_id'], final_result['date']]

# Prediction per asset-day: True when at least one row that day was flagged -1.
anomaly_by_date = (
    final_result['anomaly'].eq(-1)
    .groupby(day_keys)
    .any()
    .reset_index(name='is_anomaly')
)

# Label per asset-day: True when at least one asset_health value is below the threshold.
# NOTE(review): rows with missing asset_health compare as False, so days without
# a status record are scored as healthy — confirm this is intended.
health_by_date = (
    final_result['asset_health'].lt(HEALTH_THRESHOLD)
    .groupby(day_keys)
    .any()
    .reset_index(name='is_not_healthy')
)

# Join prediction and label into one evaluation table keyed by (asset_id, date).
evaluation_df = pd.merge(anomaly_by_date, health_by_date, on=['asset_id', 'date'])

# Ground truth and prediction.
y_true = evaluation_df['is_not_healthy']
y_pred = evaluation_df['is_anomaly']

# Accuracy, precision, recall and F1, with True ("not healthy" / "anomaly") as the positive class.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.83
Precision: 0.13
Recall: 0.14
F1 Score: 0.14


In [11]:
# One row per (asset_id, date). True = abnormal, False = normal, for both
# is_anomaly (model prediction) and is_not_healthy (label).
evaluation_df

Unnamed: 0,asset_id,date,is_anomaly,is_not_healthy
0,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10,True,False
1,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-11,True,False
2,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-12,False,False
3,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-13,False,False
4,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-14,False,False
...,...,...,...,...
7906,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-10,False,False
7907,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-11,True,True
7908,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-12,False,False
7909,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-13,True,False


In [12]:
# Asset-days whose label marks the asset as not healthy.
evaluation_df.loc[evaluation_df['is_not_healthy']]

Unnamed: 0,asset_id,date,is_anomaly,is_not_healthy
392,0d3cfbf4-d417-4e83-9ab7-8d49643c74bf,2024-01-10,False,True
393,0d3cfbf4-d417-4e83-9ab7-8d49643c74bf,2024-01-11,True,True
395,0d3cfbf4-d417-4e83-9ab7-8d49643c74bf,2024-01-13,False,True
396,0d3cfbf4-d417-4e83-9ab7-8d49643c74bf,2024-01-14,False,True
397,0d3cfbf4-d417-4e83-9ab7-8d49643c74bf,2024-01-15,True,True
...,...,...,...,...
7896,ff153d23-b944-4394-b1a1-be523e79febb,2024-03-31,False,True
7900,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-04,False,True
7901,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-05,False,True
7905,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-09,False,True


In [13]:
# Asset-days on which the model flagged at least one anomaly.
evaluation_df.loc[evaluation_df['is_anomaly']]

Unnamed: 0,asset_id,date,is_anomaly,is_not_healthy
0,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10,True,False
1,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-11,True,False
5,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-15,True,False
9,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-19,True,False
14,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-24,True,False
...,...,...,...,...
7886,ff153d23-b944-4394-b1a1-be523e79febb,2024-03-21,True,True
7888,ff153d23-b944-4394-b1a1-be523e79febb,2024-03-23,True,False
7890,ff153d23-b944-4394-b1a1-be523e79febb,2024-03-25,True,True
7907,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-11,True,True
