In [None]:
import warnings

import pandas as pd
import polars as pl

from data_process import data_preprocessing
from feature_selector import LGBMFeatureSelector
from optimizer import LGBMRollingClassifier
from backtesting import backtesting

warnings.filterwarnings("ignore")

In [None]:
# 1. Data preprocessing
# Run the project's preprocessing step, convert its result to a pandas DataFrame
# and use the 'date' column as the index.
dataset = data_preprocessing().to_pandas().set_index('date')

In [None]:
# 2. Build features and labels

# Row-over-row percentage change of the close; reused later when building the backtest frame.
returns = dataset['close'].pct_change()

# Label is 1 when the close 90 rows ahead is more than 10% above the current close, else 0.
# Rows without a forward close (the final 90) are removed by dropna() before the comparison,
# so index alignment on assignment leaves their target as NaN instead of a spurious 0.
dataset['target'] = (
    dataset['close']
    .shift(-90)
    .div(dataset['close'])
    .sub(1)
    .dropna()
    .gt(0.1)
    .astype(int)
)

def split_data(dataset, split_date):
    """Split a labelled frame into train and test parts by comparing its index to ``split_date``.

    Args:
        dataset: DataFrame that contains a 'target' column; every other column is
            treated as a feature.
        split_date: Boundary value compared against ``dataset.index``. Rows with
            ``index < split_date`` go to the training part, rows with
            ``index >= split_date`` go to the test part.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Side effects:
        Prints the index range and the row count of both parts.
    """
    # Evaluate each boundary comparison once and reuse it for features and labels.
    before_split = dataset.index < split_date
    from_split = dataset.index >= split_date

    # Features are everything except the label column.
    feature_frame = dataset.drop(columns=['target'])
    label_series = dataset['target']

    X_train, X_test = feature_frame.loc[before_split], feature_frame.loc[from_split]
    y_train, y_test = label_series.loc[before_split], label_series.loc[from_split]

    # Sanity report of the split, emitted as four lines.
    report = (
        f"训练集日期范围: {X_train.index.min()} 至 {X_train.index.max()}",
        f"测试集日期范围: {X_test.index.min()} 至 {X_test.index.max()}",
        f"训练集样本数: {len(X_train)}",
        f"测试集样本数: {len(X_test)}",
    )
    print("\n".join(report))

    return X_train, X_test, y_train, y_test

# Build the modelling frame without rebinding `dataset`: the label code earlier in this
# cell reads dataset['close'], so dropping that column from `dataset` itself makes the
# cell raise a KeyError when it is executed a second time.
model_frame = dataset.drop(columns=['close']).dropna()
x_train, x_test, y_train, y_test = split_data(model_frame, '2023-03-01')

# Purge the look-ahead overlap: `target` is built from the close 90 rows ahead, so up to
# the last 90 training rows are labelled with prices from on/after the split date, i.e.
# from the test period. Drop them so the model is only fitted on labels that were
# knowable before the split.
# NOTE(review): 90 must stay equal to the shift(-90) used when building `target`.
x_train, y_train = x_train.iloc[:-90], y_train.iloc[:-90]
print(f"剔除前视重叠标签后训练集样本数: {len(x_train)}")

In [None]:
# 3. Feature selection
# Keep only int64/float64/bool columns; object-typed columns (e.g. symbol, period) are excluded.
x_train_num = x_train.select_dtypes(include=['int64', 'float64', 'bool'])

# NOTE(review): presumably ranks columns by LightGBM gain importance and keeps 30 of them —
# confirm against LGBMFeatureSelector.
features = LGBMFeatureSelector(
    model_type='clf',
    itype='gain',
    n=30,
    random_state=42,
).transform(x_train_num, y_train)


In [None]:
# Show the selector output; its `.columns` are used later to pick the matching x_test columns.
features

In [None]:
# 4. Model training and evaluation

# Original note: use optuna's in-memory mode to avoid database-file errors.
# NOTE(review): no storage setting is visible in this call — presumably it is handled
# inside LGBMRollingClassifier; confirm.
model = LGBMRollingClassifier().optimizer(
    X=features,
    y=y_train,
)

In [None]:
# 5. Signal generation and backtest

# Model predictions on the held-out rows, restricted to the selected feature columns.
predictions = pd.DataFrame(
    data=model.predict(x_test.loc[:, features.columns.tolist()]),
    index=x_test.index,
    columns=['y_pred'],
)

# Next-row return, named 'return' up front so no in-place rename is needed after the merge.
next_return = returns.shift(-1).rename('return')

# pd.merge defaults to an inner join, so only index values present on both sides are kept.
signals = pd.merge(predictions, next_return, left_index=True, right_index=True)

# Move the index into a regular column, convert with the dedicated pandas converter
# (instead of the generic interchange-protocol entry point) and tag every row with the symbol.
# NOTE(review): the symbol is hard-coded; presumably the dataset holds only this instrument — confirm.
df = pl.from_pandas(signals.reset_index()).with_columns(pl.lit('1101.TW').alias('symbol'))

backtesting(df=df)

In [None]:
# Inspect the next-row return series that is merged with the predictions in the backtest cell.
returns.shift(-1)