In [4]:
import pandas as pd

In [5]:
# Folder (relative to this notebook) that holds both source CSV files.
folder_path = '../../data/'

# Read the two CSV files from the data folder.
new_data = pd.read_csv(f'{folder_path}AI4I-PMDI.csv')
old_data = pd.read_csv(f'{folder_path}ai4i2020.csv')

## preprocess

In [6]:
# 1. Column-name mapping: drop the trailing " (unit)" part from each listed
#    column. Only the columns listed here are renamed.
col_map = {
    name: name.rsplit(' (', 1)[0]
    for name in (
        'Air temperature (K)',
        'Process temperature (K)',
        'Rotational speed (rpm)',
        'Torque (Nm)',
        'Tool wear (min)',
    )
}
# rename() only touches the columns present in col_map; work on an explicit copy.
df = new_data.rename(columns=col_map)
df = df.copy()

# 2. Add the failure flag columns, all initialised to 0.
df = df.assign(**dict.fromkeys(['TWF', 'HDF', 'PWF', 'OSF', 'RNF'], 0))

# 3. Parse the Diagnostic column into one-hot failure flags.
# Keys are the failure labels searched for in the Diagnostic text,
# values are the flag columns they switch on.
failure_map = {
    'Tool Wear Failure': 'TWF',
    'Heat Dissipation Failure': 'HDF',
    'Power Failure': 'PWF',
    'Overstrain Failure': 'OSF',
    'Random Failure': 'RNF',
}


def parse_failure(diag):
    """Translate one Diagnostic value into a dict of 0/1 failure flags.

    A flag is set when its label occurs anywhere in the text, so a single
    value can set several flags whatever separator is used between labels.
    Null values, blank strings and the exact text 'No Failure' yield all zeros.

    Returns a dict keyed by the flag names in ``failure_map`` (in that order).
    """
    # Missing value: nothing to parse.
    if pd.isnull(diag):
        return dict.fromkeys(failure_map.values(), 0)
    # Blank text or the explicit "no failure" marker.
    if diag.strip() == '' or diag == 'No Failure':
        return dict.fromkeys(failure_map.values(), 0)
    # Substring match per label, which allows several failures in one row.
    return {flag: int(label in diag) for label, flag in failure_map.items()}

# Expand the per-row flag dicts into a frame with a single constructor call.
# (`.apply(pd.Series)` would build one Series object per row, which is very
# slow on large tables.) Passing `columns=` also keeps the expected columns
# present when the table is empty.
failure_cols = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
failures = pd.DataFrame(
    df['Diagnostic'].map(parse_failure).tolist(),
    index=df.index,
    columns=failure_cols,
)
for col in failure_cols:
    df[col] = failures[col]

# 4. Add the "Machine failure" column: 1 when any individual failure flag is set.
df['Machine failure'] = df[failure_cols].max(axis=1)

# Note: this is another name for the same frame, not a copy.
processed_new_data = df


## auto-encoder

In [7]:
import glob
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from tensorflow import keras
from tensorflow.keras import layers


In [8]:
# Root folder of the pre-imputed tables.
read_file_path = '../../instance/imputed_data_on_system'

# Collect every CSV one level below the root (e.g. the linear, ffill and
# rolling sub-folders). glob returns matches in file-system order, so sort
# them to make the processing order reproducible across machines.
all_csv_paths = sorted(glob.glob(os.path.join(read_file_path, '*', '*.csv')))

print(f'共找到 {len(all_csv_paths)} 個檔案')
for path in all_csv_paths:
    print(path)

# Read every table, keeping (file name, DataFrame) pairs together.
dfs = [
    (os.path.basename(csv_path), pd.read_csv(csv_path))
    for csv_path in all_csv_paths
]


共找到 12 個檔案
../../instance/imputed_data_on_system\ffill\PMDI_imputed_ffill_knn_on_system.csv
../../instance/imputed_data_on_system\ffill\PMDI_imputed_ffill_mean_on_system.csv
../../instance/imputed_data_on_system\ffill\PMDI_imputed_ffill_median_on_system.csv
../../instance/imputed_data_on_system\ffill\PMDI_imputed_ffill_rolling_on_system.csv
../../instance/imputed_data_on_system\linear\PMDI_imputed_linear_knn_on_system.csv
../../instance/imputed_data_on_system\linear\PMDI_imputed_linear_mean_on_system.csv
../../instance/imputed_data_on_system\linear\PMDI_imputed_linear_median_on_system.csv
../../instance/imputed_data_on_system\linear\PMDI_imputed_linear_rolling_on_system.csv
../../instance/imputed_data_on_system\rolling\PMDI_imputed_rolling_knn_on_system.csv
../../instance/imputed_data_on_system\rolling\PMDI_imputed_rolling_mean_on_system.csv
../../instance/imputed_data_on_system\rolling\PMDI_imputed_rolling_median_on_system.csv
../../instance/imputed_data_on_system\rolling\PMDI_imputed

In [9]:
# --- Columns to impute ---
# 'Tool wear' is not part of this list.
cols = ['Air temperature', 'Process temperature', 'Rotational speed', 'Torque']

In [10]:
# Folder that the autoencoder-imputed CSV files are written into.
root_out_dir = '../../instance/auto-encoder'

In [None]:
def ae_impute_scaled(scaled, model):
    """Fill the NaN cells of a standardised 2-D array with model reconstructions.

    All rows that contain at least one NaN are sent to ``model`` in a single
    batch, with the missing cells temporarily set to 0. Only the cells that
    were NaN are overwritten with the prediction; observed values are kept.

    Parameters
    ----------
    scaled : array-like of shape (n_rows, n_features)
        Standardised values, may contain NaN.
    model : object exposing ``predict(x, verbose=0)``
        Trained autoencoder whose output has the same shape as its input.

    Returns
    -------
    numpy.ndarray
        Float copy of ``scaled`` with the NaN cells replaced.
    """
    filled = np.array(scaled, dtype=float)  # np.array copies, input stays untouched
    nan_mask = np.isnan(filled)
    incomplete = nan_mask.any(axis=1)
    if not incomplete.any():
        return filled  # nothing to impute
    # One batched predict call instead of one call per row.
    model_input = np.where(nan_mask[incomplete], 0.0, filled[incomplete])
    pred = model.predict(model_input, verbose=0)
    filled[incomplete] = np.where(nan_mask[incomplete], pred, filled[incomplete])
    return filled


# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(root_out_dir, exist_ok=True)

# Train one autoencoder per pre-imputed table and use it to fill the gaps
# that are still present in `processed_new_data`.
for path, df in dfs:
    print(f"正在處理：{path}")

    # 2. Standardise the already-imputed table; it is the training set.
    scaler = StandardScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df[cols]),
        columns=cols
    )
    train_data = df_scaled.values

    # 3. Build the autoencoder (input -> 8 -> 4 -> 8 -> input).
    input_dim = train_data.shape[1]
    input_layer = keras.Input(shape=(input_dim,))
    encoded = layers.Dense(8, activation='relu')(input_layer)
    encoded = layers.Dense(4, activation='relu')(encoded)
    decoded = layers.Dense(8, activation='relu')(encoded)
    decoded = layers.Dense(input_dim)(decoded)

    autoencoder = keras.Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer='adam', loss='mse')

    # 4. Train it to reconstruct its own input.
    autoencoder.fit(train_data, train_data, epochs=100, batch_size=64, verbose=1)

    # 5. Impute the data that still has missing values, in the scaled space
    #    defined by this table's scaler.
    df_scaled_orig = pd.DataFrame(
        scaler.transform(processed_new_data[cols]),
        columns=cols
    )
    df_imputed_scaled = pd.DataFrame(
        ae_impute_scaled(df_scaled_orig.to_numpy(), autoencoder),
        columns=cols
    )

    # 6. Undo the standardisation.
    df_imputed = pd.DataFrame(
        scaler.inverse_transform(df_imputed_scaled),
        columns=cols
    )

    # 7. Write imputed values back only where the original value is missing.
    df_result = processed_new_data.copy()
    for col in cols:
        df_result[col] = np.where(processed_new_data[col].isna(), df_imputed[col], processed_new_data[col])

    # 'Tool wear' is taken from the pre-imputed table, not from the autoencoder.
    df_result['Tool wear'] = df['Tool wear']

    # Save next to the other results, tagging the file name with "_autoencoder".
    new_filename = os.path.basename(path).replace('.csv', '_autoencoder.csv')
    out_path = os.path.join(root_out_dir, new_filename)
    df_result.to_csv(out_path, index=False)
    print(f'儲存 {out_path}')

正在處理：PMDI_imputed_ffill_knn_on_system.csv
Epoch 1/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.9697  
Epoch 2/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5351  
Epoch 3/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 954us/step - loss: 0.2504
Epoch 4/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 959us/step - loss: 0.1088
Epoch 5/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0801
Epoch 6/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0680  
Epoch 7/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0611
Epoch 8/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0495
Epoch 9/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0415
Epoch 10/100
[1m157/157[0m 

KeyboardInterrupt: 