In [18]:
# %%
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import ParameterGrid

# %%
# 从 p_ftd_code.py 引入 P-FTD 各步骤函数
from p_ftd_code import (
    calculate_volatility,
    add_padding,
    perform_fft,
    apply_threshold,
    inverse_fft,
    remove_padding
)

# %%
# P-FTD 去噪封装
def p_ftd_denoise(data, params):
    N, m, eps = params
    D = data.shape[1]
    out = np.zeros_like(data, dtype=float)
    for i in range(D):
        series = data[:, i]
        s1, s2 = calculate_volatility(series, N)
        padded = add_padding(series, s1, s2, m)
        fft_data = perform_fft(padded)
        filtered = apply_threshold(fft_data, eps)
        den_padded = inverse_fft(filtered)
        den = remove_padding(den_padded, len(series), m)
        out[:, i] = den
    return out

# %%
# --- 数据预处理 ---
def preprocess(df, lookback=20, target_col='close'):
    features = [col for col in df.columns if col != target_col]  # 特征列（排除目标列）
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(df[features])  # 特征归一化
    y = df[target_col].values  # 目标列原始值
    
    X, y_processed = [], []
    for i in range(lookback, len(df)):
        X.append(X_scaled[i - lookback:i, :])  # 特征数据：(样本数, lookback, 特征数)
        y_processed.append(y[i])  # 目标数据：(样本数,)
    
    X = np.array(X, dtype=np.float32)
    y_processed = np.array(y_processed, dtype=np.float32)
    print(f"预处理后数据形状: X={X.shape}, y={y_processed.shape}")
    
    # 增加数据检查
    if len(X) == 0:
        print(f"警告: 生成的样本数为0，数据长度={len(df)}, lookback={lookback}")
    
    return X, y_processed, scaler

# %%
# --- 模型构建 ---
def build_lstm_model(units, input_shape):
    m = Sequential([
        InputLayer(input_shape=input_shape),  # 显式定义输入层
        LSTM(units, return_sequences=False),  # return_sequences=False 输出单个向量
        Dense(1)
    ])
    m.compile(optimizer='adam', loss='mse')
    return m

# %%
# --- 加载数据 ---
features = ['open', 'high', 'low', 'volume']  # 特征列（4个）
target_col = 'close'  # 目标列
df = pd.read_csv('AAPL.csv', parse_dates=['date']).set_index('date')
data = df[features + [target_col]]  # 合并特征和目标列
total = len(data)

# 打印数据基本信息
print(f"数据集总长度: {total}")
print(f"数据集时间范围: {df.index.min()} 到 {df.index.max()}")

# %%
# 全局 scaler 用于反归一化目标列
ref_scaler = MinMaxScaler()
ref_scaler.fit(data[[target_col]])
ref_min, ref_max = ref_scaler.data_min_[0], ref_scaler.data_max_[0]
print(f"全局缩放器: min={ref_min}, max={ref_max}")

# %%
# 参数网格
param_grid = {
    'p_ftd': [(20, 20, 0.1), (20, 20, 0.2), (40, 40, 0.2), (60, 60, 0.2)],
    'lookback': [10, 20]  # 排除lookback=60避免测试集不足
}

results = []

# ----------------------
# 训练原始数据的 LSTM 模型
# ----------------------
print("\n训练原始数据的 LSTM 模型...")
for lookback in param_grid['lookback']:
    print(f"\n=== Lookback = {lookback} ===")
    
    # 统一数据分割逻辑，使用固定的分割比例
    train_size = int(total * 0.7)
    val_size = int(total * 0.15)
    test_size = total - train_size - val_size
    
    # 确保测试集长度足够
    if test_size < lookback + 1:
        print(f"警告: lookback={lookback} 时测试集长度不足，跳过此参数")
        continue
    
    print(f"数据分割: 训练集={train_size}, 验证集={val_size}, 测试集={test_size}")
    
    train_i, val_i = train_size, train_size + val_size
    train_raw, val_raw, test_raw = data.iloc[:train_i], data.iloc[train_i:val_i], data.iloc[val_i:]
    
    # 预处理
    print("预处理训练数据...")
    X_tr_raw, y_tr_raw, sc_raw = preprocess(train_raw, lookback=lookback)
    print("预处理验证数据...")
    X_val_raw, y_val_raw, _ = preprocess(val_raw, lookback=lookback)
    print("预处理测试数据...")
    X_te_raw, y_te_raw, _ = preprocess(test_raw, lookback=lookback)
    
    # 检查数据有效性
    if len(X_tr_raw) == 0 or len(X_val_raw) == 0 or len(X_te_raw) == 0:
        print(f"警告: 存在空数据集，跳过此参数")
        continue
    
    # 构建模型
    mdl_raw = build_lstm_model(20, (lookback, len(features)))
    es = tf.keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
    
    # 训练模型，使用验证集进行早停
    print(f"开始训练 LSTM 模型 (lookback={lookback})...")
    history = mdl_raw.fit(X_tr_raw, y_tr_raw, 
               epochs=100, batch_size=32,
               validation_data=(X_val_raw, y_val_raw),
               callbacks=[es], verbose=0)
    
    # 评估模型
    train_loss = mdl_raw.evaluate(X_tr_raw, y_tr_raw, verbose=0)
    val_loss = mdl_raw.evaluate(X_val_raw, y_val_raw, verbose=0)
    test_loss = mdl_raw.evaluate(X_te_raw, y_te_raw, verbose=0)
    
    print(f"训练集损失: {train_loss:.4f}, 验证集损失: {val_loss:.4f}, 测试集损失: {test_loss:.4f}")
    
    # 预测并反归一化
    y_pred_raw = mdl_raw.predict(X_te_raw).flatten()
    y_p_raw = y_pred_raw * (ref_max - ref_min) + ref_min  # 反归一化预测值
    y_true_raw = ref_scaler.inverse_transform(y_te_raw.reshape(-1, 1)).flatten()  # 反归一化真实值
    
    # 计算指标
    mae_raw = np.mean(np.abs(y_p_raw - y_true_raw))
    rmse_raw = np.sqrt(np.mean((y_p_raw - y_true_raw)**2))
    mape_raw = np.mean(np.abs((y_p_raw - y_true_raw)/y_true_raw)) * 100
    
    results.append({
        'model': 'LSTM',
        'lookback': lookback,
        'MAE': mae_raw,
        'RMSE': rmse_raw,
        'MAPE': mape_raw,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'test_loss': test_loss
    })
    
    print(f"LSTM 模型 (lookback={lookback}) 训练完成")

# --------------------------
# 训练 PFTD + LSTM 模型
# --------------------------
print("\n训练 PFTD + LSTM 模型...")
for params in ParameterGrid(param_grid):
    lookback = params['lookback']
    p_ftd_params = params['p_ftd']
    
    print(f"\n=== Lookback = {lookback}, P-FTD 参数 = {p_ftd_params} ===")
    
    # 使用与原始LSTM相同的数据分割逻辑
    train_size = int(total * 0.7)
    val_size = int(total * 0.15)
    test_size = total - train_size - val_size
    
    if test_size < lookback + 1:
        print(f"警告: lookback={lookback} 时测试集长度不足，跳过此参数")
        continue
    
    print(f"数据分割: 训练集={train_size}, 验证集={val_size}, 测试集={test_size}")
    
    train_i, val_i = train_size, train_size + val_size
    
    # 对特征列降噪
    print("应用 P-FTD 降噪...")
    denoised_features = p_ftd_denoise(data[features].values, p_ftd_params)
    denoised_df = pd.DataFrame(denoised_features, index=data.index, columns=features)
    comb = pd.concat([denoised_df, data[target_col]], axis=1)  # 合并降噪特征和原始目标
    
    # 分割数据
    train, val, test = comb.iloc[:train_i], comb.iloc[train_i:val_i], comb.iloc[val_i:]
    
    # 预处理
    print("预处理训练数据...")
    X_tr, y_tr, sc = preprocess(train, lookback=lookback)
    print("预处理验证数据...")
    X_val, y_val, _ = preprocess(val, lookback=lookback)
    print("预处理测试数据...")
    X_te, y_te, _ = preprocess(test, lookback=lookback)
    
    # 检查数据有效性
    if len(X_tr) == 0 or len(X_val) == 0 or len(X_te) == 0:
        print(f"警告: 存在空数据集，跳过此参数")
        continue
    
    # 构建模型
    mdl = build_lstm_model(20, (lookback, len(features)))
    es = tf.keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
    
    # 训练模型，使用验证集进行早停
    print(f"开始训练 PFTD_LSTM 模型 (lookback={lookback}, p_ftd={p_ftd_params})...")
    history = mdl.fit(X_tr, y_tr, 
           epochs=100, batch_size=32,
           validation_data=(X_val, y_val),
           callbacks=[es], verbose=0)
    
    # 评估模型
    train_loss = mdl.evaluate(X_tr, y_tr, verbose=0)
    val_loss = mdl.evaluate(X_val, y_val, verbose=0)
    test_loss = mdl.evaluate(X_te, y_te, verbose=0)
    
    print(f"训练集损失: {train_loss:.4f}, 验证集损失: {val_loss:.4f}, 测试集损失: {test_loss:.4f}")
    
    # 预测并反归一化
    y_pred = mdl.predict(X_te).flatten()
    y_p = y_pred * (ref_max - ref_min) + ref_min
    y_true = ref_scaler.inverse_transform(y_te.reshape(-1, 1)).flatten()
    
    # 计算指标
    mae = np.mean(np.abs(y_p - y_true))
    rmse = np.sqrt(np.mean((y_p - y_true)**2))
    mape = np.mean(np.abs((y_p - y_true)/y_true)) * 100
    
    results.append({
        'model': 'PFTD_LSTM',
        'lookback': lookback,
        'p_ftd': p_ftd_params,
        'MAE': mae,
        'RMSE': rmse,
        'MAPE': mape,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'test_loss': test_loss
    })
    
    print(f"PFTD_LSTM 模型 (lookback={lookback}, p_ftd={p_ftd_params}) 训练完成")

# ----------------------
# 结果汇总与对比
# ----------------------
if results:
    res_df = pd.DataFrame(results)
    
    # 保存完整结果到CSV
    res_df.to_csv('model_comparison_results.csv', index=False)
    print("\n完整结果已保存到 model_comparison_results.csv")
    
    # 提取最佳模型
    best_lstm = res_df[res_df['model'] == 'LSTM'].sort_values('MAE').iloc[0]
    best_pftd_lstm = res_df[res_df['model'] == 'PFTD_LSTM'].sort_values('MAE').iloc[0]
    
    # 生成对比表格
    comparison = pd.DataFrame({
        '模型': ['LSTM', 'PFTD_LSTM'],
        'MAE': [best_lstm['MAE'], best_pftd_lstm['MAE']],
        'RMSE': [best_lstm['RMSE'], best_pftd_lstm['RMSE']],
        'MAPE': [best_lstm['MAPE'], best_pftd_lstm['MAPE']],
        '训练损失': [best_lstm['train_loss'], best_pftd_lstm['train_loss']],
        '验证损失': [best_lstm['val_loss'], best_pftd_lstm['val_loss']],
        '测试损失': [best_lstm['test_loss'], best_pftd_lstm['test_loss']],
        '参数': [
            f"lookback={best_lstm['lookback']}",
            f"lookback={best_pftd_lstm['lookback']}, p_ftd={best_pftd_lstm['p_ftd']}"
        ]
    })
    comparison['MAE 改善(%)'] = [
        0,
        ((best_lstm['MAE'] - best_pftd_lstm['MAE']) / best_lstm['MAE'] * 100).round(2)
    ]
    comparison['RMSE 改善(%)'] = [
        0,
        ((best_lstm['RMSE'] - best_pftd_lstm['RMSE']) / best_lstm['RMSE'] * 100).round(2)
    ]
    comparison['MAPE 改善(%)'] = [
        0,
        ((best_lstm['MAPE'] - best_pftd_lstm['MAPE']) / best_lstm['MAPE'] * 100).round(2)
    ]
    
    print("\n=== LSTM vs PFTD_LSTM 性能对比 ===")
    print(comparison.to_string(index=False))
    
    # 打印完整结果摘要
    print("\n=== 所有模型结果摘要 ===")
    print(res_df[['model', 'lookback', 'p_ftd', 'MAE', 'RMSE', 'MAPE']].to_string(index=False))
else:
    print("未生成有效结果，请检查数据路径或参数设置")

数据集总长度: 5285
数据集时间范围: 2004-01-02 00:00:00 到 2024-12-31 00:00:00
全局缩放器: min=21.28, max=702.1

训练原始数据的 LSTM 模型...

=== Lookback = 10 ===
数据分割: 训练集=3699, 验证集=792, 测试集=794
预处理训练数据...
预处理后数据形状: X=(3689, 10, 4), y=(3689,)
预处理验证数据...
预处理后数据形状: X=(782, 10, 4), y=(782,)
预处理测试数据...
预处理后数据形状: X=(784, 10, 4), y=(784,)
开始训练 LSTM 模型 (lookback=10)...




训练集损失: 25687.1680, 验证集损失: 1526.7623, 测试集损失: 13900.1592
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
LSTM 模型 (lookback=10) 训练完成

=== Lookback = 20 ===
数据分割: 训练集=3699, 验证集=792, 测试集=794
预处理训练数据...
预处理后数据形状: X=(3679, 20, 4), y=(3679,)
预处理验证数据...
预处理后数据形状: X=(772, 20, 4), y=(772,)
预处理测试数据...
预处理后数据形状: X=(774, 20, 4), y=(774,)
开始训练 LSTM 模型 (lookback=20)...




训练集损失: 31253.9688, 验证集损失: 1154.4137, 测试集损失: 8943.1260
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
LSTM 模型 (lookback=20) 训练完成

训练 PFTD + LSTM 模型...

=== Lookback = 10, P-FTD 参数 = (20, 20, 0.1) ===
数据分割: 训练集=3699, 验证集=792, 测试集=794
应用 P-FTD 降噪...
预处理训练数据...
预处理后数据形状: X=(3689, 10, 4), y=(3689,)
预处理验证数据...
预处理后数据形状: X=(782, 10, 4), y=(782,)
预处理测试数据...
预处理后数据形状: X=(784, 10, 4), y=(784,)
开始训练 PFTD_LSTM 模型 (lookback=10, p_ftd=(20, 20, 0.1))...




训练集损失: 25541.3086, 验证集损失: 1153.7734, 测试集损失: 13236.2920
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
PFTD_LSTM 模型 (lookback=10, p_ftd=(20, 20, 0.1)) 训练完成

=== Lookback = 10, P-FTD 参数 = (20, 20, 0.2) ===
数据分割: 训练集=3699, 验证集=792, 测试集=794
应用 P-FTD 降噪...
预处理训练数据...
预处理后数据形状: X=(3689, 10, 4), y=(3689,)
预处理验证数据...
预处理后数据形状: X=(782, 10, 4), y=(782,)
预处理测试数据...
预处理后数据形状: X=(784, 10, 4), y=(784,)
开始训练 PFTD_LSTM 模型 (lookback=10, p_ftd=(20, 20, 0.2))...




训练集损失: 24461.4453, 验证集损失: 1164.7823, 测试集损失: 14025.1621
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
PFTD_LSTM 模型 (lookback=10, p_ftd=(20, 20, 0.2)) 训练完成

=== Lookback = 10, P-FTD 参数 = (40, 40, 0.2) ===
数据分割: 训练集=3699, 验证集=792, 测试集=794
应用 P-FTD 降噪...
预处理训练数据...
预处理后数据形状: X=(3689, 10, 4), y=(3689,)
预处理验证数据...
预处理后数据形状: X=(782, 10, 4), y=(782,)
预处理测试数据...
预处理后数据形状: X=(784, 10, 4), y=(784,)
开始训练 PFTD_LSTM 模型 (lookback=10, p_ftd=(40, 40, 0.2))...




训练集损失: 25614.2188, 验证集损失: 1011.8828, 测试集损失: 12839.4463
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
PFTD_LSTM 模型 (lookback=10, p_ftd=(40, 40, 0.2)) 训练完成

=== Lookback = 10, P-FTD 参数 = (60, 60, 0.2) ===
数据分割: 训练集=3699, 验证集=792, 测试集=794
应用 P-FTD 降噪...
预处理训练数据...
预处理后数据形状: X=(3689, 10, 4), y=(3689,)
预处理验证数据...
预处理后数据形状: X=(782, 10, 4), y=(782,)
预处理测试数据...
预处理后数据形状: X=(784, 10, 4), y=(784,)
开始训练 PFTD_LSTM 模型 (lookback=10, p_ftd=(60, 60, 0.2))...




训练集损失: 24463.2617, 验证集损失: 1135.7742, 测试集损失: 13674.8379
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
PFTD_LSTM 模型 (lookback=10, p_ftd=(60, 60, 0.2)) 训练完成

=== Lookback = 20, P-FTD 参数 = (20, 20, 0.1) ===
数据分割: 训练集=3699, 验证集=792, 测试集=794
应用 P-FTD 降噪...
预处理训练数据...
预处理后数据形状: X=(3679, 20, 4), y=(3679,)
预处理验证数据...
预处理后数据形状: X=(772, 20, 4), y=(772,)
预处理测试数据...
预处理后数据形状: X=(774, 20, 4), y=(774,)
开始训练 PFTD_LSTM 模型 (lookback=20, p_ftd=(20, 20, 0.1))...




训练集损失: 29352.7637, 验证集损失: 866.8389, 测试集损失: 9653.7529
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
PFTD_LSTM 模型 (lookback=20, p_ftd=(20, 20, 0.1)) 训练完成

=== Lookback = 20, P-FTD 参数 = (20, 20, 0.2) ===
数据分割: 训练集=3699, 验证集=792, 测试集=794
应用 P-FTD 降噪...
预处理训练数据...
预处理后数据形状: X=(3679, 20, 4), y=(3679,)
预处理验证数据...
预处理后数据形状: X=(772, 20, 4), y=(772,)
预处理测试数据...
预处理后数据形状: X=(774, 20, 4), y=(774,)
开始训练 PFTD_LSTM 模型 (lookback=20, p_ftd=(20, 20, 0.2))...




训练集损失: 32262.8242, 验证集损失: 1508.6013, 测试集损失: 8918.3027
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
PFTD_LSTM 模型 (lookback=20, p_ftd=(20, 20, 0.2)) 训练完成

=== Lookback = 20, P-FTD 参数 = (40, 40, 0.2) ===
数据分割: 训练集=3699, 验证集=792, 测试集=794
应用 P-FTD 降噪...
预处理训练数据...
预处理后数据形状: X=(3679, 20, 4), y=(3679,)
预处理验证数据...
预处理后数据形状: X=(772, 20, 4), y=(772,)
预处理测试数据...
预处理后数据形状: X=(774, 20, 4), y=(774,)
开始训练 PFTD_LSTM 模型 (lookback=20, p_ftd=(40, 40, 0.2))...




训练集损失: 28784.3379, 验证集损失: 778.1198, 测试集损失: 9628.7412
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
PFTD_LSTM 模型 (lookback=20, p_ftd=(40, 40, 0.2)) 训练完成

=== Lookback = 20, P-FTD 参数 = (60, 60, 0.2) ===
数据分割: 训练集=3699, 验证集=792, 测试集=794
应用 P-FTD 降噪...
预处理训练数据...
预处理后数据形状: X=(3679, 20, 4), y=(3679,)
预处理验证数据...
预处理后数据形状: X=(772, 20, 4), y=(772,)
预处理测试数据...
预处理后数据形状: X=(774, 20, 4), y=(774,)
开始训练 PFTD_LSTM 模型 (lookback=20, p_ftd=(60, 60, 0.2))...




训练集损失: 27932.4336, 验证集损失: 824.4484, 测试集损失: 10479.3896
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
PFTD_LSTM 模型 (lookback=20, p_ftd=(60, 60, 0.2)) 训练完成

完整结果已保存到 model_comparison_results.csv

=== LSTM vs PFTD_LSTM 性能对比 ===
       模型          MAE         RMSE       MAPE         训练损失        验证损失        测试损失                               参数  MAE 改善(%)  RMSE 改善(%)  MAPE 改善(%)
     LSTM 63225.692741 64383.858432 212.041124 31253.968750 1154.413696 8943.125977                      lookback=20       0.00        0.00         0.0
PFTD_LSTM 62864.491691 64294.439731 218.613812 32262.824219 1508.601318 8918.302734 lookback=20, p_ftd=(20, 20, 0.2)       0.57        0.14        -3.1

=== 所有模型结果摘要 ===
    model  lookback         p_ftd          MAE         RMSE       MAPE
     LSTM        10           NaN 79379.005048 80267.950827 256.027126
     LSTM        20           NaN 63225.692741 64383.858432 212.041124
PFTD_LSTM        10 (20, 20, 0.1) 77480.200225 78327.719470 2