In [1]:
import pandas as pd

In [2]:
# Directory that holds the raw CSV exports, relative to this notebook.
folder_path = '../../raw_data/'

# `new_data` (the PMDI variant) is the frame preprocessed in the next section;
# `old_data` is loaded alongside it but is not referenced again in the cells shown here.
old_data = pd.read_csv(folder_path + 'ai4i2020.csv')
new_data = pd.read_csv(folder_path + 'AI4I-PMDI.csv')

## preprocess

In [3]:
# 1. Column-name mapping: only the listed columns are renamed (the unit suffix is dropped).
col_map = {
    'Air temperature (K)': 'Air temperature',
    'Process temperature (K)': 'Process temperature',
    'Rotational speed (rpm)': 'Rotational speed',
    'Torque (Nm)': 'Torque',
    'Tool wear (min)': 'Tool wear',
}

# 2. Rename on a fresh frame and add the five failure-flag columns, all initialised to 0.
#    `rename` and `assign` both return new objects, so `new_data` itself is left untouched.
df = new_data.rename(columns=col_map).assign(TWF=0, HDF=0, PWF=0, OSF=0, RNF=0)

# 3. 解析 Diagnostic 欄位，自動 one-hot
# 3. Parse the Diagnostic column into one-hot failure flags.
#    Keys are the labels searched for in the text, values are the flag columns they set.
failure_map = {
    'Tool Wear Failure': 'TWF',
    'Heat Dissipation Failure': 'HDF',
    'Power Failure': 'PWF',
    'Overstrain Failure': 'OSF',
    'Random Failure': 'RNF'
}

def parse_failure(diag):
    """Translate one Diagnostic cell into a dict of 0/1 failure flags.

    Matching is by substring, so a cell may list several failures with any
    separator (semicolon, comma, space, ...) and each one found is flagged.

    Parameters
    ----------
    diag : object
        Raw cell value. Missing values, blank strings and 'No Failure' yield
        all-zero flags. Non-string values are converted with ``str`` before
        matching instead of raising ``AttributeError`` on ``.strip()``.

    Returns
    -------
    dict
        One entry per value of ``failure_map`` (TWF, HDF, PWF, OSF, RNF),
        each 0 or 1.
    """
    flags = dict.fromkeys(failure_map.values(), 0)

    if isinstance(diag, str):
        text = diag
    elif pd.isnull(diag):
        return flags
    else:
        # Mixed-type columns can hand us numbers; treat them as text rather than crash.
        text = str(diag)

    text = text.strip()
    if not text or text == 'No Failure':
        return flags

    # A single row may report more than one failure.
    for label, flag_col in failure_map.items():
        if label in text:
            flags[flag_col] = 1
    return flags

# Expand the per-row flag dicts into a frame with a single DataFrame construction.
# (`Series.apply(pd.Series)` would build one Series object per row, which is far slower.)
failure_cols = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
failures = pd.DataFrame(
    df['Diagnostic'].map(parse_failure).tolist(),
    index=df.index,        # keep row alignment with df
    columns=failure_cols,  # fixed column set/order, even when there are no rows
)
for col in failure_cols:
    df[col] = failures[col]

# 4. Machine failure is 1 when any individual failure flag is 1.
df['Machine failure'] = df[failure_cols].max(axis=1)

# Name under which the later imputation cells read the preprocessed frame.
processed_new_data = df


In [8]:
import os
import glob

read_file_path = '../../instance/on_system/'

# Collect every CSV exactly one folder below the root (e.g. linear/, ffill/, rolling/).
# glob returns matches in an arbitrary, OS-dependent order, so sort them to make the
# processing order (and the listing printed below) reproducible across machines.
all_csv_paths = sorted(glob.glob(os.path.join(read_file_path, '*', '*.csv')))

print(f'共找到 {len(all_csv_paths)} 個檔案')
for path in all_csv_paths:
    print(path)

# Load every table and keep its base file name next to it: (file name, DataFrame).
# A comprehension is used so the global `df` from the preprocess cell is not rebound here.
dfs = [
    (os.path.basename(csv_path), pd.read_csv(csv_path))
    for csv_path in all_csv_paths
]


共找到 12 個檔案
../../instance/on_system\ffill\PMDI_imputed_ffill_knn_on_system.csv
../../instance/on_system\ffill\PMDI_imputed_ffill_mean_on_system.csv
../../instance/on_system\ffill\PMDI_imputed_ffill_median_on_system.csv
../../instance/on_system\ffill\PMDI_imputed_ffill_rolling_on_system.csv
../../instance/on_system\linear\PMDI_imputed_linear_knn_on_system.csv
../../instance/on_system\linear\PMDI_imputed_linear_mean_on_system.csv
../../instance/on_system\linear\PMDI_imputed_linear_median_on_system.csv
../../instance/on_system\linear\PMDI_imputed_linear_rolling_on_system.csv
../../instance/on_system\rolling\PMDI_imputed_rolling_knn_on_system.csv
../../instance/on_system\rolling\PMDI_imputed_rolling_mean_on_system.csv
../../instance/on_system\rolling\PMDI_imputed_rolling_median_on_system.csv
../../instance/on_system\rolling\PMDI_imputed_rolling_rolling_on_system.csv


In [18]:
# --- Columns to impute: the numeric sensor columns used by the GAIN and auto-encoder cells ---
cols = ['Air temperature', 'Process temperature', 'Rotational speed', 'Torque', 'Tool wear']

## GAIN

In [10]:
# Output directory for the GAIN artefacts (saved generator model and scaler).
# NOTE: the auto-encoder section further down rebinds this same name, so the value
# in effect depends on which of the two cells was executed last.
root_out_dir = '../../instance/GAIN'

In [23]:
import numpy as np
import pandas as pd
import pickle

import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from keras.saving import register_keras_serializable

# ---------------------------------------------------
# Prerequisites defined in earlier cells:
# - dfs: list of (file name, DataFrame) tuples, one per imputed CSV that was loaded
# - cols: numeric columns to impute, e.g. ['Air temperature','Process temperature','Rotational speed', 'Torque','Tool wear']
# - processed_new_data: the original DataFrame (still containing NaN), same shape as df_scaled_orig; used at the end to decide which cells receive the imputed values
# - root_out_dir: output directory path
# ---------------------------------------------------

# ----- GAIN 網路定義 -----
@register_keras_serializable()
class GainGenerator(keras.Model):
    """GAIN generator: reconstructs every feature from the observed values and the mask.

    The network is a three-layer MLP (two ReLU hidden layers, linear output).
    It is registered with Keras and exposes its constructor arguments through
    ``get_config`` so that the ``.keras`` file written by ``generator.save(...)``
    can be rebuilt by ``keras.models.load_model`` as long as this class
    definition has been executed/imported first.

    Parameters
    ----------
    input_dim : int
        Number of features D to reconstruct (width of the output layer).
    hidden_dim : int, default 64
        Width of the two hidden layers.
    **kwargs
        Forwarded to ``keras.Model`` (e.g. ``name``); needed so that
        ``from_config`` can pass back the base-class entries of the config.
    """

    def __init__(self, input_dim, hidden_dim=64, **kwargs):
        super().__init__(**kwargs)
        # Stored so get_config() can report them when the model is saved.
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # Input to the first layer is [x_obs (missing cells filled with 0), mask].
        self.dense1 = layers.Dense(hidden_dim, activation='relu')
        self.dense2 = layers.Dense(hidden_dim, activation='relu')
        # Linear output: one real-valued reconstruction per feature.
        self.out_layer = layers.Dense(input_dim, activation=None)

    def call(self, inputs):
        """Return the reconstruction for ``inputs``.

        ``inputs`` is the concatenation of x_obs and the mask along axis 1,
        i.e. shape (batch, 2 * D) as built by the training loop.
        """
        h = self.dense1(inputs)
        h = self.dense2(h)
        x_tilde = self.out_layer(h)
        return x_tilde

    def get_config(self):
        """Return the base config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            'input_dim': self.input_dim,
            'hidden_dim': self.hidden_dim,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the generator from a dict produced by ``get_config``."""
        return cls(**config)



class GainDiscriminator(keras.Model):
    """GAIN discriminator: scores each feature of an imputed row.

    Takes the completed vector ``x_hat`` together with the hint vector and
    returns, through a sigmoid output layer, one value in [0, 1] per feature.
    """

    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        # Two ReLU hidden layers over the concatenated [x_hat, hint] input.
        self.dense1 = layers.Dense(hidden_dim, activation='relu')
        self.dense2 = layers.Dense(hidden_dim, activation='relu')
        # One sigmoid unit per feature.
        self.out_layer = layers.Dense(input_dim, activation='sigmoid')

    def call(self, x_hat, h):
        """Return per-feature probabilities for ``x_hat`` given hint ``h``."""
        features = tf.concat([x_hat, h], axis=1)
        hidden = self.dense2(self.dense1(features))
        return self.out_layer(hidden)


# Helper that draws the hint matrix for the discriminator (hint_rate is tunable).
def sample_hint(m, hint_rate=0.9):
    """Randomly hide part of the mask to form the discriminator hint.

    m: mask matrix; per the training loop, 1 marks an observed cell and 0 a missing one.
    hint_rate: threshold on a uniform [0, 1) draw; cells whose draw is below it keep
        their mask value, every other cell is set to 0.
    Returns a tensor with the same shape as ``m``.
    """
    # One independent uniform draw per cell decides whether the mask value is revealed.
    reveal = tf.random.uniform(shape=tf.shape(m), minval=0., maxval=1.) < hint_rate
    return tf.where(reveal, m, tf.zeros_like(m))


# Loss objects shared by the GAIN training loop:
# - bce_loss takes probabilities (from_logits=False) because the discriminator ends in a sigmoid;
# - mse_loss is the reconstruction term evaluated on observed cells (mask = 1).
bce_loss = keras.losses.BinaryCrossentropy(from_logits=False)
mse_loss = keras.losses.MeanSquaredError()

# ---------------------------------------------------
# 主要流程：跑 GAIN 補值
# ---------------------------------------------------
# The original cell skipped steps 8-12 with an unconditional `continue`, which left
# them unreachable. False reproduces that behaviour: train, then save only the
# generator and the scaler. Set to True to also impute with the trained generator
# and write the result to `<file>_GAIN.csv`.
RUN_GAIN_IMPUTATION = False

for path, df in dfs:
    print(f"正在處理檔案：{path}")
    # Only the ffill + mean table is used to train the generator that gets saved.
    if "PMDI_imputed_ffill_mean" not in path:
        continue

    # 1. Standardise the columns to impute.
    scaler = StandardScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df[cols]),
        columns=cols
    )

    # 2. Training arrays: the mask is 1 where a value is present and 0 where it is NaN;
    #    NaN cells are zero-filled so the matrix can be fed to the network.
    data_matrix = df_scaled.values.astype(np.float32)  # shape = (N, D)
    m_mask = (~np.isnan(data_matrix)).astype(np.float32)
    X_obs = np.nan_to_num(data_matrix, nan=0.0).astype(np.float32)

    N, D = X_obs.shape

    # 3. Networks.
    generator = GainGenerator(input_dim=D, hidden_dim=64)
    discriminator = GainDiscriminator(input_dim=D, hidden_dim=64)

    # 4. Optimizers.
    g_optimizer = keras.optimizers.Adam(learning_rate=1e-3)
    d_optimizer = keras.optimizers.Adam(learning_rate=1e-3)

    # 5. Training hyper-parameters.
    BATCH_SIZE = 128
    EPOCHS = 50
    HINT_RATE = 0.9
    ALPHA = 10.0  # weight of the reconstruction loss (tunable)

    # 6. Shuffled mini-batches of (values, mask).
    dataset = tf.data.Dataset.from_tensor_slices((X_obs, m_mask))
    dataset = dataset.shuffle(buffer_size=N).batch(BATCH_SIZE)

    # 7. Training loop.
    for epoch in range(EPOCHS):
        for x_batch, m_batch in dataset:
            # 7.1 Hint for the discriminator.
            h_batch = sample_hint(m_batch, hint_rate=HINT_RATE)

            # 7.2 One forward pass recorded on two tapes, one per network.
            with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
                # Generator sees [values, mask] and reconstructs every feature.
                generator_input = tf.concat([x_batch, m_batch], axis=1)
                x_tilde = generator(generator_input)
                # Completed vector: observed cells kept, missing cells taken from G.
                x_hat = m_batch * x_batch + (1.0 - m_batch) * x_tilde

                d_prob = discriminator(x_hat, h_batch)

                # Discriminator loss: predict the mask itself (1 = observed, 0 = imputed)
                # at every position. The previous form multiplied both target and
                # prediction by the hint; the hint is non-zero only where the mask is 1,
                # so the discriminator was never shown a "missing" label and its loss
                # collapsed to 0.
                d_loss = bce_loss(m_batch, d_prob)

                # Generator adversarial loss: on missing cells (1 - M) push D towards 1.
                g_adv_loss = bce_loss((1.0 - m_batch), d_prob * (1.0 - m_batch))

                # Generator reconstruction loss: MSE on observed cells (M = 1) only.
                g_rec_loss = mse_loss(x_batch * m_batch, x_tilde * m_batch)

                # Total generator loss = adversarial + ALPHA * reconstruction.
                g_loss = g_adv_loss + ALPHA * g_rec_loss

            # 7.3 Gradients and parameter updates.
            gradients_of_generator = gen_tape.gradient(g_loss, generator.trainable_variables)
            gradients_of_discriminator = disc_tape.gradient(d_loss, discriminator.trainable_variables)
            g_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
            d_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

        # Report the last batch's losses every 10 epochs.
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{EPOCHS}  |  D Loss: {d_loss.numpy():.4f}  |  G Loss: {g_loss.numpy():.4f}")

    # Save the trained generator. Its input is the concatenation of x_obs and the mask,
    # hence D * 2 input features.
    generator_input_shape = (None, D * 2)
    generator.build(input_shape=generator_input_shape)
    generator.summary()  # confirms the model has been built

    # Make sure the target folder exists before writing into it.
    os.makedirs(root_out_dir, exist_ok=True)

    model_path = os.path.join(root_out_dir, 'gain_generator_model_with_toolwear.keras')
    generator.save(model_path)
    print(f"✅ 模型已完整儲存至：{model_path}")

    scaler_path = os.path.join(root_out_dir, 'gain_scaler_with_toolwear.pkl')
    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f)
    print(f"✅ 標準化器已儲存至：{scaler_path}")

    if not RUN_GAIN_IMPUTATION:
        continue

    # 8. Impute the whole table in one pass. The generator takes a single
    #    concatenated [values, mask] tensor, exactly as during training.
    X_obs_tensor = tf.convert_to_tensor(X_obs, dtype=tf.float32)
    M_mask_tensor = tf.convert_to_tensor(m_mask, dtype=tf.float32)

    x_tilde_full = generator(tf.concat([X_obs_tensor, M_mask_tensor], axis=1))  # (N, D)
    x_hat_full = M_mask_tensor * X_obs_tensor + (1.0 - M_mask_tensor) * x_tilde_full
    x_hat_full_np = x_hat_full.numpy()

    # 9. Imputed frame, still in standardised units.
    df_imputed_scaled = pd.DataFrame(x_hat_full_np, columns=cols)

    # 10. Back to the original units.
    df_imputed = pd.DataFrame(
        scaler.inverse_transform(df_imputed_scaled[cols]),
        columns=cols
    )

    # 11. Copy imputed values into processed_new_data, overwriting only its NaN cells.
    #     NOTE(review): this relies on `df` and `processed_new_data` having the same
    #     rows in the same order - confirm before enabling.
    df_result = processed_new_data.copy()
    for col in cols:
        is_na = processed_new_data[col].isna()
        df_result.loc[is_na, col] = df_imputed.loc[is_na, col]

    # 12. Write the final table next to the model.
    file_name = os.path.basename(path)
    new_filename = file_name.replace('.csv', '_GAIN.csv')
    out_path = os.path.join(root_out_dir, new_filename)
    df_result.to_csv(out_path, index=False)
    print(f"已儲存至：{out_path}\n")

正在處理檔案：PMDI_imputed_ffill_knn_on_system.csv
正在處理檔案：PMDI_imputed_ffill_mean_on_system.csv
Epoch 10/50  |  D Loss: 0.0000  |  G Loss: 0.0032
Epoch 20/50  |  D Loss: 0.0000  |  G Loss: 0.0014
Epoch 30/50  |  D Loss: 0.0000  |  G Loss: 0.0005
Epoch 40/50  |  D Loss: 0.0000  |  G Loss: 0.0006
Epoch 50/50  |  D Loss: 0.0000  |  G Loss: 0.0004


✅ 模型已完整儲存至：../../instance/GAIN\gain_generator_model_with_toolwear.keras
✅ 標準化器已儲存至：../../instance/GAIN\gain_scaler_with_toolwear.pkl
正在處理檔案：PMDI_imputed_ffill_median_on_system.csv
正在處理檔案：PMDI_imputed_ffill_rolling_on_system.csv
正在處理檔案：PMDI_imputed_linear_knn_on_system.csv
正在處理檔案：PMDI_imputed_linear_mean_on_system.csv
正在處理檔案：PMDI_imputed_linear_median_on_system.csv
正在處理檔案：PMDI_imputed_linear_rolling_on_system.csv
正在處理檔案：PMDI_imputed_rolling_knn_on_system.csv
正在處理檔案：PMDI_imputed_rolling_mean_on_system.csv
正在處理檔案：PMDI_imputed_rolling_median_on_system.csv
正在處理檔案：PMDI_imputed_rolling_rolling_on_system.csv


In [16]:
# Print the layer summary of the `generator` object left in the kernel by the GAIN
# training cell; that cell must have been run first for the name to exist.
generator.summary()


## auto-encoder

In [4]:
import glob
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from tensorflow import keras
from tensorflow.keras import layers


In [12]:
# Output directory for the auto-encoder results.
# NOTE: this rebinds the same `root_out_dir` name that the GAIN section sets, so the
# order in which cells are executed decides where each section writes its files.
root_out_dir = '../../instance/auto-encoder'

In [None]:
# Train one autoencoder per imputed table and use it to fill the NaN cells of
# processed_new_data.

def ae_impute_row(row, model, scaler):
    """Impute the NaN entries of a single standardised row with ``model``.

    Kept for callers that impute one row at a time; the loop below uses the
    batched helper instead. ``scaler`` is accepted for signature compatibility
    and is not used.

    Returns the row as a float array with its NaN entries replaced by the
    model's prediction for a zero-filled copy of the row.
    """
    row = row.values.astype(float)
    nan_idx = np.isnan(row)
    if not np.any(nan_idx):
        return row  # nothing to impute
    row_filled = row.copy()
    row_filled[nan_idx] = 0  # placeholder value fed to the network
    row_filled = row_filled.reshape(1, -1)
    pred = model.predict(row_filled, verbose=0)[0]
    # Only the NaN positions take the predicted value.
    row[nan_idx] = pred[nan_idx]
    return row


def _ae_impute_frame(scaled_frame, model):
    """Fill the NaN cells of a standardised frame using one batched ``model.predict``.

    Rows without NaN are returned unchanged. Rows with NaN are zero-filled, passed
    through the model together, and only their NaN cells take the predicted value.
    This replaces one ``predict`` call per incomplete row.
    """
    values = scaled_frame.to_numpy(dtype=float)
    nan_mask = np.isnan(values)
    incomplete = nan_mask.any(axis=1)

    imputed = values.copy()
    if incomplete.any():
        model_input = np.where(nan_mask[incomplete], 0.0, values[incomplete])
        reconstructed = model.predict(model_input, verbose=0)
        imputed[incomplete] = np.where(nan_mask[incomplete], reconstructed, values[incomplete])
    return pd.DataFrame(imputed, columns=scaled_frame.columns, index=scaled_frame.index)


for path, df in dfs:
    print(f"正在處理：{path}")
    # 2. Standardise the already-imputed table; it is the training data.
    scaler = StandardScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df[cols]),
        columns=cols
    )

    train_data = df_scaled.values

    # 3. Autoencoder: input_dim -> 8 -> 4 -> 8 -> input_dim, linear output.
    input_dim = train_data.shape[1]
    input_layer = keras.Input(shape=(input_dim,))
    encoded = layers.Dense(8, activation='relu')(input_layer)
    encoded = layers.Dense(4, activation='relu')(encoded)
    decoded = layers.Dense(8, activation='relu')(encoded)
    decoded = layers.Dense(input_dim)(decoded)

    autoencoder = keras.Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer='adam', loss='mse')

    # 4. Train it to reproduce its own input.
    autoencoder.fit(train_data, train_data, epochs=100, batch_size=64, verbose=1)

    # 5. Impute the original data (which still has NaN), scaled with the same scaler.
    df_scaled_orig = pd.DataFrame(
        scaler.transform(processed_new_data[cols]),
        columns=cols
    )
    df_imputed_scaled = _ae_impute_frame(df_scaled_orig, autoencoder)

    # 6. Back to the original units.
    df_imputed = pd.DataFrame(
        scaler.inverse_transform(df_imputed_scaled),
        columns=cols
    )

    # 7. Write imputed values back, touching only the cells that were NaN.
    df_result = processed_new_data.copy()
    for col in cols:
        df_result[col] = np.where(processed_new_data[col].isna(), df_imputed[col], processed_new_data[col])

    # 'Tool wear' is then taken from the loaded table rather than from the autoencoder.
    df_result['Tool wear'] = df['Tool wear']

    # Save as <original name>_autoencoder.csv directly under root_out_dir,
    # creating the folder first so to_csv cannot fail on a missing directory.
    file_name = os.path.basename(path)
    new_filename = file_name.replace('.csv', '_autoencoder.csv')
    os.makedirs(root_out_dir, exist_ok=True)
    out_path = os.path.join(root_out_dir, new_filename)
    df_result.to_csv(out_path, index=False)
    print(f'儲存 {out_path}')