In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

#import shap
#from tqdm import tqdm
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
from datetime import datetime, timedelta

In [2]:
data_dir = "data/PdM/"

df_mst = pd.read_csv(data_dir + "ics_asset_mst.csv", na_values='\\N')
df_sigdata = pd.read_csv(data_dir + "ics_asset_sigdata.csv", na_values='\\N')
df_status_hist = pd.read_csv(data_dir + "ics_asset_status_hist.csv", na_values='\\N')

df_mst.columns = ['ASSET_ID', 'ASSET_NAME', 'SENSOR_NUMBER']
df_sigdata.columns = ['asset_id', 'created_at',	'temperature', 'voltage',
                      'rms_x', 'rms_y', 'rms_z', 'rms_xyz', 'vel_rms_x', 'vel_rms_y', 'vel_rms_z', 'vel_rms_xyz',
                      'skewness_x', 'skewness_y', 'skewness_z', 'vel_skewness_x', 'vel_skewness_y', 'vel_skewness_z',
                      'kurtosis_x', 'kurtosis_y', 'kurtosis_z', 'vel_kurtosis_x', 'vel_kurtosis_y', 'vel_kurtosis_z',
                      'crest_factor_x', 'crest_factor_y', 'crest_factor_z', 'vel_crest_factor_x', 'vel_crest_factor_y', 'vel_crest_factor_z',
                      'peak_x', 'peak_y', 'peak_z', 'vel_peak_x', 'vel_peak_y', 'vel_peak_z',
                      'peak2peak_x', 'peak2peak_y', 'peak2peak_z', 'vel_peak2peak_x', 'vel_peak2peak_y', 'vel_peak2peak_z']
df_status_hist.columns = ['asset_id', 'time', 'imbalance_health', 'misalignment_health', 'looseness_health', 'bearing_health', 'asset_health', 'CRT_DT']

In [3]:
# created_at을 datetime으로 변환
df_sigdata['created_at'] = pd.to_datetime(df_sigdata['created_at'], unit='s') + pd.Timedelta(hours=9)
# status_hist의 time을 datetime으로 변환
df_status_hist['time'] = pd.to_datetime(df_status_hist['time'])

In [4]:
# 1월 10일부터 7월 31일까지의 데이터만 사용
df_sigdata = df_sigdata[(df_sigdata['created_at'] >= '2024-01-10') & (df_sigdata['created_at'] <= '2024-07-31')].reset_index(drop=True)
df_status_hist = df_status_hist[(df_status_hist['time'] >= '2024-01-10') & (df_status_hist['time'] <= '2024-07-31')].reset_index(drop=True).drop('CRT_DT', axis=1)

In [5]:
df_sigdata

Unnamed: 0,asset_id,created_at,temperature,voltage,rms_x,rms_y,rms_z,rms_xyz,vel_rms_x,vel_rms_y,...,peak_z,vel_peak_x,vel_peak_y,vel_peak_z,peak2peak_x,peak2peak_y,peak2peak_z,vel_peak2peak_x,vel_peak2peak_y,vel_peak2peak_z
0,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 01:51:24,14,3.03610,0.040108,0.023774,0.027209,0.053984,0.420678,0.326804,...,0.102623,1.359420,1.082990,1.15136,0.228516,0.169922,0.187500,2.48051,2.12872,2.16012
1,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 03:51:33,13,3.03610,0.041760,0.032989,0.036067,0.064289,0.419369,0.434728,...,0.124080,1.234560,1.671750,1.02504,0.281250,0.216797,0.228516,2.42803,3.18777,1.97753
2,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 05:51:36,14,3.03717,0.016262,0.016420,0.020824,0.031108,0.244851,0.252537,...,0.084699,0.820916,1.018130,1.05911,0.128906,0.134766,0.158203,1.53343,1.74055,2.11214
3,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 07:51:46,12,3.03610,0.014130,0.013664,0.018916,0.027280,0.227452,0.210682,...,0.066130,0.806396,0.664531,1.11374,0.117188,0.117188,0.123047,1.32377,1.31428,2.03676
4,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 09:51:49,13,3.03610,0.013966,0.013929,0.019151,0.027493,0.189314,0.204104,...,0.084322,0.532177,0.599794,1.11316,0.105469,0.117188,0.146484,1.05098,1.11015,1.94272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96857,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-13 23:57:55,25,2.66424,0.017096,0.017468,0.025699,0.035466,0.311741,0.310432,...,0.091305,0.994007,1.021720,1.42772,0.123047,0.140625,0.175781,1.96028,1.90967,2.71046
96858,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:02:02,26,2.63522,0.013915,0.015449,0.024680,0.032271,0.277927,0.266314,...,0.091060,1.011380,0.825312,1.13278,0.123047,0.111328,0.175781,1.88001,1.51558,2.17786
96859,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:06:08,26,2.61803,0.014689,0.016384,0.025625,0.033777,0.311608,0.264719,...,0.109045,0.964852,0.844137,1.08841,0.123047,0.123047,0.205078,1.88867,1.52986,2.11678
96860,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:10:16,26,2.59761,0.015783,0.017464,0.028054,0.036621,0.285147,0.285888,...,0.126984,1.203480,0.804413,2.00890,0.123047,0.123047,0.222656,2.19879,1.54369,3.85447


In [6]:
df_status_hist

Unnamed: 0,asset_id,time,imbalance_health,misalignment_health,looseness_health,bearing_health,asset_health
0,351d2688-2c91-4963-8db0-53fb4fce18ce,2024-01-10,1.0,1.0,1.0,1.0,0.986692
1,dd2a077e-75cb-4db0-8c20-b5afc01a04df,2024-01-10,1.0,1.0,1.0,1.0,0.899360
2,54508486-0b79-4fae-9aa1-507980f65a6e,2024-01-10,,,,,
3,4caeb80e-ad80-46d4-bb0f-778eaa849daf,2024-01-10,1.0,1.0,1.0,1.0,1.000000
4,5398c2fd-1393-4d0c-a4a3-68806f1d1d43,2024-01-10,1.0,1.0,1.0,1.0,0.985291
...,...,...,...,...,...,...,...
9536,d94a2922-ca24-41cc-a141-6bcef4ed79f4,2024-07-30,,,,,
9537,d94a2922-ca24-41cc-a141-6bcef4ed79f4,2024-07-29,,,,,
9538,5ef02fb5-fdb9-4043-9752-0f1bc5ac763f,2024-07-30,,,,,
9539,5ef02fb5-fdb9-4043-9752-0f1bc5ac763f,2024-07-31,,,,,


In [7]:
# 각 status_hist의 time에 해당하는 sigdata 데이터를 필터링
#for index, row in tqdm(df_sigdata.iterrows(), total=len(df_sigdata), desc="Processing periods"):

    # 특성 데이터 선택 (시간 정보는 제외)
X = df_sigdata.drop(columns=['created_at', 'asset_id', 'voltage'])

    # Isolation Forest 모델 적용
model = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
model.fit(X)

    # # 배경 데이터에서 KMeans 클러스터링을 통해 요약
    # background_data = shap.kmeans(X, 100)
    
    # # SHAP 값을 사용하여 특성 중요도 분석
    # explainer = shap.KernelExplainer(model.decision_function, background_data)
    # shap_values = explainer.shap_values(X)
    
    # # SHAP summary plot
    # shap.summary_plot(shap_values, X)
    # shap.summary_plot(shap_values, X, plot_type='bar')
    # 이상값 탐지 결과 추가
df_sigdata.loc[:, 'anomaly'] = model.predict(X)


In [8]:
df_sigdata

Unnamed: 0,asset_id,created_at,temperature,voltage,rms_x,rms_y,rms_z,rms_xyz,vel_rms_x,vel_rms_y,...,vel_peak_x,vel_peak_y,vel_peak_z,peak2peak_x,peak2peak_y,peak2peak_z,vel_peak2peak_x,vel_peak2peak_y,vel_peak2peak_z,anomaly
0,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 01:51:24,14,3.03610,0.040108,0.023774,0.027209,0.053984,0.420678,0.326804,...,1.359420,1.082990,1.15136,0.228516,0.169922,0.187500,2.48051,2.12872,2.16012,1
1,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 03:51:33,13,3.03610,0.041760,0.032989,0.036067,0.064289,0.419369,0.434728,...,1.234560,1.671750,1.02504,0.281250,0.216797,0.228516,2.42803,3.18777,1.97753,1
2,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 05:51:36,14,3.03717,0.016262,0.016420,0.020824,0.031108,0.244851,0.252537,...,0.820916,1.018130,1.05911,0.128906,0.134766,0.158203,1.53343,1.74055,2.11214,1
3,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 07:51:46,12,3.03610,0.014130,0.013664,0.018916,0.027280,0.227452,0.210682,...,0.806396,0.664531,1.11374,0.117188,0.117188,0.123047,1.32377,1.31428,2.03676,1
4,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 09:51:49,13,3.03610,0.013966,0.013929,0.019151,0.027493,0.189314,0.204104,...,0.532177,0.599794,1.11316,0.105469,0.117188,0.146484,1.05098,1.11015,1.94272,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96857,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-13 23:57:55,25,2.66424,0.017096,0.017468,0.025699,0.035466,0.311741,0.310432,...,0.994007,1.021720,1.42772,0.123047,0.140625,0.175781,1.96028,1.90967,2.71046,1
96858,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:02:02,26,2.63522,0.013915,0.015449,0.024680,0.032271,0.277927,0.266314,...,1.011380,0.825312,1.13278,0.123047,0.111328,0.175781,1.88001,1.51558,2.17786,1
96859,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:06:08,26,2.61803,0.014689,0.016384,0.025625,0.033777,0.311608,0.264719,...,0.964852,0.844137,1.08841,0.123047,0.123047,0.205078,1.88867,1.52986,2.11678,1
96860,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:10:16,26,2.59761,0.015783,0.017464,0.028054,0.036621,0.285147,0.285888,...,1.203480,0.804413,2.00890,0.123047,0.123047,0.222656,2.19879,1.54369,3.85447,1


In [9]:
final_result = df_sigdata.copy()

In [10]:
# 전체 결과 데이터
final_result.drop_duplicates(subset=['asset_id', 'created_at'], keep='first', inplace=True)
final_result

Unnamed: 0,asset_id,created_at,temperature,voltage,rms_x,rms_y,rms_z,rms_xyz,vel_rms_x,vel_rms_y,...,vel_peak_x,vel_peak_y,vel_peak_z,peak2peak_x,peak2peak_y,peak2peak_z,vel_peak2peak_x,vel_peak2peak_y,vel_peak2peak_z,anomaly
0,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 01:51:24,14,3.03610,0.040108,0.023774,0.027209,0.053984,0.420678,0.326804,...,1.359420,1.082990,1.15136,0.228516,0.169922,0.187500,2.48051,2.12872,2.16012,1
1,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 03:51:33,13,3.03610,0.041760,0.032989,0.036067,0.064289,0.419369,0.434728,...,1.234560,1.671750,1.02504,0.281250,0.216797,0.228516,2.42803,3.18777,1.97753,1
2,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 05:51:36,14,3.03717,0.016262,0.016420,0.020824,0.031108,0.244851,0.252537,...,0.820916,1.018130,1.05911,0.128906,0.134766,0.158203,1.53343,1.74055,2.11214,1
3,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 07:51:46,12,3.03610,0.014130,0.013664,0.018916,0.027280,0.227452,0.210682,...,0.806396,0.664531,1.11374,0.117188,0.117188,0.123047,1.32377,1.31428,2.03676,1
4,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 09:51:49,13,3.03610,0.013966,0.013929,0.019151,0.027493,0.189314,0.204104,...,0.532177,0.599794,1.11316,0.105469,0.117188,0.146484,1.05098,1.11015,1.94272,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96857,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-13 23:57:55,25,2.66424,0.017096,0.017468,0.025699,0.035466,0.311741,0.310432,...,0.994007,1.021720,1.42772,0.123047,0.140625,0.175781,1.96028,1.90967,2.71046,1
96858,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:02:02,26,2.63522,0.013915,0.015449,0.024680,0.032271,0.277927,0.266314,...,1.011380,0.825312,1.13278,0.123047,0.111328,0.175781,1.88001,1.51558,2.17786,1
96859,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:06:08,26,2.61803,0.014689,0.016384,0.025625,0.033777,0.311608,0.264719,...,0.964852,0.844137,1.08841,0.123047,0.123047,0.205078,1.88867,1.52986,2.11678,1
96860,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:10:16,26,2.59761,0.015783,0.017464,0.028054,0.036621,0.285147,0.285888,...,1.203480,0.804413,2.00890,0.123047,0.123047,0.222656,2.19879,1.54369,3.85447,1


In [11]:
# 비정상 데이터만
anomaly_df = final_result[final_result['anomaly'] == -1]
anomaly_df

Unnamed: 0,asset_id,created_at,temperature,voltage,rms_x,rms_y,rms_z,rms_xyz,vel_rms_x,vel_rms_y,...,vel_peak_x,vel_peak_y,vel_peak_z,peak2peak_x,peak2peak_y,peak2peak_z,vel_peak2peak_x,vel_peak2peak_y,vel_peak2peak_z,anomaly
1298,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-04-27 08:11:33,25,3.07049,0.015776,0.016927,0.021572,0.031635,0.270121,0.278902,...,1.071700,1.49371,1.466830,0.164062,0.257812,0.275391,1.87545,2.72566,2.69851,-1
1310,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-04-28 08:12:45,25,3.07049,0.014656,0.016789,0.020402,0.030214,0.258940,0.312971,...,1.035230,1.48844,1.614720,0.134766,0.228516,0.263672,1.91106,2.60561,2.45716,-1
1607,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-05-23 02:43:12,26,3.07801,0.013821,0.016404,0.020284,0.029522,0.252643,0.298262,...,0.814886,1.05039,0.931748,0.140625,0.257812,0.240234,1.51668,1.86403,1.82292,-1
1702,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-05-31 00:52:51,27,3.07801,0.016481,0.022111,0.025691,0.037691,0.304568,0.336990,...,1.098020,1.90094,1.686000,0.152344,0.298828,0.357422,2.08689,3.27950,3.20081,-1
3425,09e5b7b7-43d6-456a-96e5-3fb35a13210b,2024-04-12 16:52:15,31,3.05114,0.254514,0.133654,0.071642,0.296266,1.477430,2.672710,...,8.385070,8.69479,3.694320,5.982420,1.130860,0.908203,16.51290,15.82930,7.25322,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96840,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-12 15:52:13,34,2.72120,0.842451,0.747189,0.365572,1.183920,2.458230,1.859510,...,8.418380,7.14682,5.483550,4.939450,4.500000,2.466800,16.31180,13.42300,10.15090,-1
96841,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-12 17:52:21,33,2.72335,0.904140,0.762085,0.438895,1.261300,2.752400,1.982610,...,8.359240,6.92619,6.627670,5.238280,5.226560,3.070310,16.23050,13.04540,12.65250,-1
96846,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-13 03:52:51,30,2.72443,0.092837,0.100908,0.072751,0.155223,1.413690,0.830293,...,11.328800,8.31221,6.239970,2.121090,2.625000,1.406250,20.06440,13.76510,11.15330,-1
96849,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-13 09:53:10,36,2.73410,0.763290,1.707100,0.861609,2.058920,1.745340,3.283270,...,6.568160,9.94061,6.006670,5.185550,9.855470,4.951170,12.87760,18.69220,11.83950,-1


In [12]:
# final_data 전처리 후 csv로 확인

In [13]:
final_result.loc[:, 'time'] = pd.to_datetime(final_result['created_at']).dt.date

In [14]:
final_result['time'] = final_result['time'].astype('datetime64[ns]')

In [15]:
final_result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96862 entries, 0 to 96861
Data columns (total 44 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   asset_id            96862 non-null  object        
 1   created_at          96862 non-null  datetime64[ns]
 2   temperature         96862 non-null  int64         
 3   voltage             96862 non-null  float64       
 4   rms_x               96862 non-null  float64       
 5   rms_y               96862 non-null  float64       
 6   rms_z               96862 non-null  float64       
 7   rms_xyz             96862 non-null  float64       
 8   vel_rms_x           96862 non-null  float64       
 9   vel_rms_y           96862 non-null  float64       
 10  vel_rms_z           96862 non-null  float64       
 11  vel_rms_xyz         96862 non-null  float64       
 12  skewness_x          96862 non-null  float64       
 13  skewness_y          96862 non-null  float64   

In [16]:
df_status_hist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9541 entries, 0 to 9540
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   asset_id             9541 non-null   object        
 1   time                 9541 non-null   datetime64[ns]
 2   imbalance_health     6766 non-null   float64       
 3   misalignment_health  6766 non-null   float64       
 4   looseness_health     6766 non-null   float64       
 5   bearing_health       6766 non-null   float64       
 6   asset_health         6766 non-null   float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 521.9+ KB


In [17]:
final_result = pd.merge(
    final_result,
    df_status_hist[['asset_id', 'time', 'imbalance_health', 'misalignment_health', 'looseness_health', 'bearing_health', 'asset_health']],
    on=['asset_id', 'time'],
    how='left'
)

In [18]:
final_result.drop(columns=['time'], inplace=True)

In [19]:
print(final_result)

                                   asset_id          created_at  temperature  \
0      02dc4105-ca5e-4770-a6fc-d1fdff11fc1c 2024-01-10 01:51:24           14   
1      02dc4105-ca5e-4770-a6fc-d1fdff11fc1c 2024-01-10 03:51:33           13   
2      02dc4105-ca5e-4770-a6fc-d1fdff11fc1c 2024-01-10 05:51:36           14   
3      02dc4105-ca5e-4770-a6fc-d1fdff11fc1c 2024-01-10 07:51:46           12   
4      02dc4105-ca5e-4770-a6fc-d1fdff11fc1c 2024-01-10 09:51:49           13   
...                                     ...                 ...          ...   
96857  ff153d23-b944-4394-b1a1-be523e79febb 2024-04-13 23:57:55           25   
96858  ff153d23-b944-4394-b1a1-be523e79febb 2024-04-14 00:02:02           26   
96859  ff153d23-b944-4394-b1a1-be523e79febb 2024-04-14 00:06:08           26   
96860  ff153d23-b944-4394-b1a1-be523e79febb 2024-04-14 00:10:16           26   
96861  ff153d23-b944-4394-b1a1-be523e79febb 2024-04-14 00:14:24           26   

       voltage     rms_x     rms_y     

In [20]:
final_result.isna().sum()

asset_id                   0
created_at                 0
temperature                0
voltage                    0
rms_x                      0
rms_y                      0
rms_z                      0
rms_xyz                    0
vel_rms_x                  0
vel_rms_y                  0
vel_rms_z                  0
vel_rms_xyz                0
skewness_x                 0
skewness_y                 0
skewness_z                 0
vel_skewness_x             0
vel_skewness_y             0
vel_skewness_z             0
kurtosis_x                 0
kurtosis_y                 0
kurtosis_z                 0
vel_kurtosis_x             0
vel_kurtosis_y             0
vel_kurtosis_z             0
crest_factor_x             0
crest_factor_y             0
crest_factor_z             0
vel_crest_factor_x         0
vel_crest_factor_y         0
vel_crest_factor_z         0
peak_x                     0
peak_y                     0
peak_z                     0
vel_peak_x                 0
vel_peak_y    

In [21]:
# csv파일로 저장
#final_result.to_csv('modeling.csv', index=False)

In [22]:
columns_to_drop = [
    'temperature', 'voltage', 'rms_x', 'rms_y', 'rms_z', 'rms_xyz',
    'vel_rms_x', 'vel_rms_y', 'vel_rms_z', 'vel_rms_xyz',
    'skewness_x', 'skewness_y', 'skewness_z',
    'vel_skewness_x', 'vel_skewness_y', 'vel_skewness_z',
    'kurtosis_x', 'kurtosis_y', 'kurtosis_z',
    'vel_kurtosis_x', 'vel_kurtosis_y', 'vel_kurtosis_z',
    'crest_factor_x', 'crest_factor_y', 'crest_factor_z',
    'vel_crest_factor_x', 'vel_crest_factor_y', 'vel_crest_factor_z',
    'peak_x', 'peak_y', 'peak_z', 'vel_peak_x', 'vel_peak_y', 'vel_peak_z',
    'peak2peak_x', 'peak2peak_y', 'peak2peak_z',
    'vel_peak2peak_x', 'vel_peak2peak_y', 'vel_peak2peak_z'
]

In [23]:
final_result.drop(columns=columns_to_drop, inplace=True)

In [24]:
final_result

Unnamed: 0,asset_id,created_at,anomaly,imbalance_health,misalignment_health,looseness_health,bearing_health,asset_health
0,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 01:51:24,1,1.0,1.0,1.0,1.0,1.000000
1,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 03:51:33,1,1.0,1.0,1.0,1.0,1.000000
2,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 05:51:36,1,1.0,1.0,1.0,1.0,1.000000
3,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 07:51:46,1,1.0,1.0,1.0,1.0,1.000000
4,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 09:51:49,1,1.0,1.0,1.0,1.0,1.000000
...,...,...,...,...,...,...,...,...
96857,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-13 23:57:55,1,1.0,1.0,1.0,1.0,0.984976
96858,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:02:02,1,,,,,
96859,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:06:08,1,,,,,
96860,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:10:16,1,,,,,


In [25]:
final_result

Unnamed: 0,asset_id,created_at,anomaly,imbalance_health,misalignment_health,looseness_health,bearing_health,asset_health
0,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 01:51:24,1,1.0,1.0,1.0,1.0,1.000000
1,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 03:51:33,1,1.0,1.0,1.0,1.0,1.000000
2,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 05:51:36,1,1.0,1.0,1.0,1.0,1.000000
3,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 07:51:46,1,1.0,1.0,1.0,1.0,1.000000
4,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10 09:51:49,1,1.0,1.0,1.0,1.0,1.000000
...,...,...,...,...,...,...,...,...
96857,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-13 23:57:55,1,1.0,1.0,1.0,1.0,0.984976
96858,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:02:02,1,,,,,
96859,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:06:08,1,,,,,
96860,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-14 00:10:16,1,,,,,


In [26]:
# csv파일로 저장
#final_result.to_csv('modeling_status.csv', index=False)

In [27]:
# 평가

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 날짜별로 anomaly가 -1인 값이 하나라도 있는지 확인
final_result['date'] = final_result['created_at'].dt.date
anomaly_by_date = final_result.groupby(['asset_id', 'date'])['anomaly'].apply(lambda x: -1 in x.values).reset_index(name='is_anomaly')

# 날짜별로 asset_health가 1이 아닌 값이 있는지 확인
health_by_date = final_result.groupby(['asset_id', 'date'])['imbalance_health'].apply(lambda x: x.isna().any() or (x != 1).any() ).reset_index(name='is_not_healthy')

# 두 결과를 병합하여 평가 데이터셋 생성
evaluation_df = pd.merge(anomaly_by_date, health_by_date, on=['asset_id', 'date'])

# 실제 값과 예측 값
y_true = evaluation_df['is_not_healthy']
y_pred = evaluation_df['is_anomaly']

# 정확도, 정밀도, 재현율, F1 스코어 계산
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.65
Precision: 0.18
Recall: 0.22
F1 Score: 0.20


In [30]:
# True=비정상 데이터, False=정상 데이터
evaluation_df

Unnamed: 0,asset_id,date,is_anomaly,is_not_healthy
0,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-10,False,False
1,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-11,False,False
2,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-12,False,False
3,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-13,False,False
4,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-14,False,False
...,...,...,...,...
7906,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-10,True,False
7907,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-11,True,True
7908,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-12,True,False
7909,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-13,True,False


In [31]:
evaluation_df[evaluation_df['is_not_healthy'] == True]

Unnamed: 0,asset_id,date,is_anomaly,is_not_healthy
6,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-01-16,False,True
30,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-02-09,False,True
31,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-02-10,False,True
32,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-02-11,False,True
33,02dc4105-ca5e-4770-a6fc-d1fdff11fc1c,2024-02-12,False,True
...,...,...,...,...
7896,ff153d23-b944-4394-b1a1-be523e79febb,2024-03-31,True,True
7900,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-04,True,True
7901,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-05,True,True
7907,ff153d23-b944-4394-b1a1-be523e79febb,2024-04-11,True,True
