## 建立模型

In [33]:
from typing import Any, Dict, List, Tuple
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

# Apply the "ggplot" style sheet to every matplotlib figure drawn afterwards.
plt.style.use("ggplot")

import warnings

# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the modelling libraries; consider narrowing the filter when debugging.
warnings.filterwarnings("ignore")

In [5]:
# Load the feature table, using the parsed "date" column as the index.
features = pd.read_csv(
    filepath_or_buffer="features.csv",
    index_col="date",
    parse_dates=True,
)
# Preview the first five columns.
features.iloc[:, 0:5]

Unnamed: 0_level_0,EMA_10_DIFF,EMA_50_DIFF,EMA_200_DIFF,TREND_RATIO_10_50,TREND_RATIO_10_200
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-04-05,1.571723,0.073200,-0.499952,-0.020399,-0.183081
2015-04-06,0.357594,-0.129906,-0.545783,-0.018525,-0.180488
2015-04-07,-0.127789,-0.215479,-0.563358,-0.018201,-0.179408
2015-04-08,-1.587825,-0.526950,-0.638926,-0.022351,-0.182854
2015-04-09,-1.543859,-0.559070,-0.645962,-0.026226,-0.186154
...,...,...,...,...,...
2025-02-02,-883.748999,-44.299810,148.518003,0.029274,0.226198
2025-02-03,-47.350033,103.180391,184.019767,0.027721,0.222913
2025-02-04,-681.213948,-39.438509,147.028510,0.021239,0.212569
2025-02-05,-785.788684,-87.161509,133.064296,0.014183,0.201209


In [6]:
def format_label_ratio(label_counts: dict) -> None:
    """Print the percentage share of each label.

    Args:
        label_counts: Mapping from label to its number of occurrences.

    Returns:
        None. One line per label is written to standard output, in the
        mapping's iteration order, showing the percentage (two decimals)
        and the raw ``count/total`` fraction.
    """
    total = sum(label_counts.values())
    for label in label_counts:
        count = label_counts[label]
        # Divide first, then scale, so the value is a percentage.
        ratio = count / total * 100
        print(f"标签 {label}: {ratio:.2f}% ({count}/{total})")

In [7]:
# Registry of candidate classifiers, keyed by the name used to select one.
# Every estimator uses class_weight="balanced" so minority classes are
# re-weighted automatically.
models: Dict[str, Any] = {
    # Logistic regression: baseline model.
    "logistic_regression": LogisticRegression(class_weight="balanced"),
    # Decision tree: baseline model.
    "decision_tree": DecisionTreeClassifier(
        # Split-quality criterion. "gini" is the library default and is usually
        # a good, fast choice; "entropy" can be compared via cross-validation.
        criterion="gini",
        # Maximum tree depth. Deeper trees fit more but overfit more easily.
        # The library default is None (unlimited); 3-10 is a typical search range.
        max_depth=3,
        # Minimum samples needed to split an internal node (library default 2).
        # Larger values curb overfitting; 5-10 is a sensible starting range.
        min_samples_split=2,
        # Minimum samples required at a leaf (library default 1). Larger values
        # curb overfitting in the same way as min_samples_split.
        min_samples_leaf=1,
        # Number of features considered per split. None uses all features;
        # "sqrt" or "log2" reduce it, which can improve generalisation.
        max_features=None,
        # Class re-weighting for imbalanced labels (library default None).
        # "balanced" derives weights from class frequencies; explicit weights
        # per class may be supplied instead.
        class_weight="balanced",
        # Fixed seed so results are reproducible across runs and comparisons.
        random_state=42,
        # Minimum impurity decrease required for a split (library default 0).
        # Small positive values such as 0.001 or 0.01 prune weak branches.
        min_impurity_decrease=0,
    ),
    # Random forest.
    "random_forest": RandomForestClassifier(
        # Number of trees (library default 100). More trees cost more compute;
        # try increasing towards 500-1000 while watching validation scores.
        n_estimators=100,
        # Maximum depth of each tree (library default None, unlimited).
        # Larger values give more complex trees that overfit more easily.
        max_depth=10,
        # Minimum samples needed to split an internal node (library default 2).
        # Larger values restrict tree growth and reduce overfitting.
        min_samples_split=2,
        # Minimum samples required at a leaf (library default 1). Larger values
        # restrict tree growth and reduce overfitting.
        min_samples_leaf=1,
        # Features considered per split (library default "sqrt"). The more
        # features each tree sees, the more correlated the trees become.
        max_features="sqrt",
        # Train each tree on a bootstrap sample (drawn with replacement).
        # False trains every tree on the full sample.
        bootstrap=True,
        # Class re-weighting for imbalanced labels (library default None).
        class_weight="balanced",
        # Fixed seed for reproducibility.
        random_state=42,
        # Number of parallel jobs; -1 uses all available cores.
        n_jobs=-1,
    ),
    # Gradient boosting framework.
    "lightgbm": LGBMClassifier(
        # Number of boosting iterations / trees (library default 100). More
        # trees fit more but overfit more easily.
        n_estimators=100,
        # Shrinkage applied to each tree's contribution (library default 0.1).
        # Lower rates need more trees but tend to generalise better.
        learning_rate=0.1,
        # Maximum number of leaves per tree (library default 31). More leaves
        # mean a more complex tree.
        num_leaves=31,
        # Maximum depth of each tree (library default -1, unlimited).
        max_depth=5,
        # Minimum samples per leaf (library default 20). Larger values avoid
        # overly fine splits.
        min_child_samples=20,
        # Fraction of rows sampled for each tree (1 uses all rows); 0.7-1.0 is
        # a reasonable range to test.
        # NOTE(review): in LightGBM row subsampling reportedly only takes
        # effect when subsample_freq > 0 — confirm before tuning this value.
        subsample=1,
        # Fraction of features sampled for each tree (1 uses all features);
        # 0.7-1.0 is a reasonable range to test.
        colsample_bytree=1,
        # L1 regularisation strength (library default 0); try 0.05-0.2.
        reg_alpha=0.1,
        # L2 regularisation strength (library default 0); try 0.05-0.2.
        reg_lambda=0.1,
        # Class re-weighting for imbalanced labels (library default None).
        class_weight="balanced",
        # Fixed seed for reproducibility.
        random_state=42,
    ),
}

In [43]:
# Prepare the feature matrix and the target variable for the sample period.
start_date = "2016-01-01"
end_date = "2024-12-31"
X = features[start_date:end_date].drop(columns="target")
y = features[start_date:end_date]["target"]

# Pick the classifier by name and fail early on an unknown key.
model_name = "lightgbm"
if model_name not in models:
    raise ValueError(
        f"Invalid model name: {model_name}. Valid options are: {list(models.keys())}"
    )
model = models[model_name]

# Build the pipeline: the scaler is fitted together with the classifier, so it
# only ever sees the training rows passed to pipeline.fit().
pipeline = Pipeline(
    [
        ("scaler", MinMaxScaler()),  # feature scaling
        ("classifier", model),  # classification model
    ]
)

# Walk-forward split: a training window capped at `train_window` rows followed
# by a `test_window`-row test block.
train_window = 365 * 2
test_window = 10

# Use as many test blocks as fit after reserving one full training window.
n_splits = (len(X) - train_window) // test_window
tscv = TimeSeriesSplit(n_splits, max_train_size=train_window, test_size=test_window)
print(f"总测试轮数: {n_splits}")

# Out-of-sample labels, predictions and class probabilities across all folds.
all_y_test: List[float] = []
all_y_pred: List[float] = []
all_probabilities: List[pd.DataFrame] = []

# Rolling fit / predict loop; the same pipeline object is refitted each fold.
for i, (train_index, test_index) in enumerate(tscv.split(X, y), 1):
    print(f"\n------ 滚动窗口 {i} ------")
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    print(
        f"训练窗口: {X_train.index.min():%Y-%m-%d} --> {X_train.index.max():%Y-%m-%d} ({len(X_train)})"
    )
    # Fix: this line reports the test window; it was previously labelled as
    # the training window as well.
    print(
        f"测试窗口: {X_test.index.min():%Y-%m-%d} --> {X_test.index.max():%Y-%m-%d} ({len(X_test)})"
    )

    # Report the class balance of this fold's training set.
    format_label_ratio(y_train.value_counts().to_dict())

    # Fit the scaler and classifier on the training window only.
    pipeline.fit(X_train, y_train)

    # Predict labels for the test block.
    y_pred = pipeline.predict(X_test)
    print(f"预测结果: {str(y_pred)}")

    # Accumulate the out-of-sample results.
    all_y_test.extend(y_test)
    all_y_pred.extend(y_pred)

    # Class probabilities are best-effort: on failure the error is printed and
    # an empty frame is stored so the list stays aligned with the folds.
    try:
        y_prob = pipeline.predict_proba(X_test)
        y_prob_df = pd.DataFrame(
            y_prob,
            columns=pipeline.named_steps["classifier"].classes_,
            index=y_test.index,
        )
        all_probabilities.append(y_prob_df)
    except Exception as e:
        print(e)
        all_probabilities.append(pd.DataFrame())

# Overall evaluation on the concatenated out-of-sample predictions.
print("\n------ 整体评估 ------")
print(f"Accuracy: {accuracy_score(all_y_test, all_y_pred):.1%}")
# Computed outside the f-string: reusing double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
weighted_precision = precision_score(all_y_test, all_y_pred, average="weighted")
print(f"Precision: {weighted_precision:.1%}")
print(classification_report(all_y_test, all_y_pred))

总测试轮数: 255

------ 滚动窗口 1 ------
训练窗口: 2016-01-09 --> 2018-01-07 (730)
训练窗口: 2018-01-08 --> 2018-01-17 (10)
标签 1.0: 76.44% (558/730)
标签 0.0: 21.78% (159/730)
标签 -1.0: 1.78% (13/730)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4392
[LightGBM] [Info] Number of data points in the train set: 730, number of used features: 18
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
预测结果: [ 1.  1.  1.  1.  1.  1.  1.  0.  0. -1.]

------ 滚动窗口 2 ------
训练窗口: 2016-01-19 --> 2018-01-17 (730)
训练窗口: 2018-01-18 --> 2018-01-27 (10)
标签 1.0: 75.21% (549/730)
标签 0.0: 23.01% (168/730)
标签 -1.0: 1.78% (13/730)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000301 seconds.
You can set `force_col_wise=true` to remove th

In [44]:
# Stack the per-fold probability frames row-wise into one table.
predict_prob_df = pd.concat(objs=all_probabilities, axis="index")
predict_prob_df

Unnamed: 0_level_0,-1.0,0.0,1.0
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-08,0.000689,0.158657,0.840654
2018-01-09,0.000349,0.039870,0.959781
2018-01-10,0.000904,0.315968,0.683128
2018-01-11,0.000736,0.097119,0.902145
2018-01-12,0.000205,0.027044,0.972750
...,...,...,...
2024-12-27,0.000840,0.997454,0.001706
2024-12-28,0.003706,0.985716,0.010578
2024-12-29,0.001185,0.995691,0.003124
2024-12-30,0.002583,0.992183,0.005234


## 可视化分析

In [45]:
# Load the Bitcoin price history, drop the adjusted close, lower-case the
# column names and call the index "date".
file_path = "~/quant-research/data/yahoo/Bitcoin.csv"
btcusd = (
    pd.read_csv(file_path, index_col=0, parse_dates=True)
    .drop(columns="Adj Close")
    .rename(columns=str.lower)
    .rename_axis("date")
)

# Class label -> descriptive probability column name.
prob_column_names = {1: "bullish_prob", 0: "neutral_prob", -1: "bearish_prob"}

# Attach the predicted probabilities by date, keep only rows without missing
# values, then give the probability columns their descriptive names.
joined = btcusd.join(predict_prob_df, on="date", how="left")
df = joined.dropna().rename(columns=prob_column_names)

df

Unnamed: 0_level_0,open,high,low,close,volume,bearish_prob,neutral_prob,bullish_prob
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-08,16476.199219,16537.900391,14208.200195,15170.099609,18413899776,0.000689,0.158657,0.840654
2018-01-09,15123.700195,15497.500000,14424.000000,14595.400391,16659999744,0.000349,0.039870,0.959781
2018-01-10,14588.500000,14973.299805,13691.200195,14973.299805,18500800512,0.000904,0.315968,0.683128
2018-01-11,14968.200195,15018.799805,13105.900391,13405.799805,16534099968,0.000736,0.097119,0.902145
2018-01-12,13453.900391,14229.900391,13158.099609,13980.599609,12065699840,0.000205,0.027044,0.972750
...,...,...,...,...,...,...,...,...
2024-12-27,95704.976562,97294.843750,93310.742188,94164.859375,52419934565,0.000840,0.997454,0.001706
2024-12-28,94160.187500,95525.898438,94014.289062,95163.929688,24107436185,0.003706,0.985716,0.010578
2024-12-29,95174.054688,95174.875000,92881.789062,93530.226562,29635885267,0.001185,0.995691,0.003124
2024-12-30,93527.195312,94903.320312,91317.132812,92643.210938,56188003691,0.002583,0.992183,0.005234


In [46]:
def find_probability_periods(
    df: pd.DataFrame, probability_threshold: float = 0.7
) -> List[Tuple[datetime, datetime, str]]:
    """
    Find consecutive runs of rows where one regime probability exceeds a threshold.

    A row is labelled "up", "down" or "neutral" when its 'bullish_prob',
    'bearish_prob' or 'neutral_prob' value is strictly greater than
    ``probability_threshold``. The columns are checked in that order, so "up"
    wins when several exceed the threshold. Rows where none does carry no
    label and terminate the current run.

    Args:
        df: Frame with the columns 'bullish_prob', 'neutral_prob' and
            'bearish_prob'. Its index labels are used as period boundaries.
        probability_threshold: Probability threshold, 0.7 by default.

    Returns:
        A list of (start, end, state) tuples in index order. ``start`` is the
        index label of the first row of a run. ``end`` is the label of the row
        that ended the run (the first row with a different or missing label),
        or the last index label of ``df`` when the run reaches the end.
        An empty frame yields an empty list.
    """
    periods = []
    if df.empty:
        return periods

    start_date = None
    current_state = None

    # Iterate the three columns in lockstep instead of building a Series per
    # row with iterrows().
    rows = zip(df.index, df["bullish_prob"], df["bearish_prob"], df["neutral_prob"])
    for date, up_prob, down_prob, neutral_prob in rows:
        if up_prob > probability_threshold:
            state = "up"
        elif down_prob > probability_threshold:
            state = "down"
        elif neutral_prob > probability_threshold:
            state = "neutral"
        else:
            state = None

        # Same label as the previous row (including "no label"): run continues.
        if state == current_state:
            continue

        # The label changed: close the open run, if any. Test the state against
        # None rather than the start label's truthiness, so a run that starts
        # at a falsy index label (e.g. 0) is not silently dropped.
        if current_state is not None:
            periods.append((start_date, date, current_state))

        start_date = date if state is not None else None
        current_state = state

    # Close the run that is still open at the end of the frame.
    if current_state is not None:
        periods.append((start_date, df.index[-1], current_state))

    return periods

In [47]:
# Threshold above which a regime probability counts as a signal.
probability_threshold = 0.7
periods = find_probability_periods(df, probability_threshold)

# Start from an empty figure and draw the closing price as a blue line.
fig = go.Figure()
close_trace = go.Scatter(
    x=df.index,
    y=df["close"],
    name="收盘价",
    line=dict(color="blue"),
)
fig.add_trace(close_trace)

# Background fill per regime; a state missing from the table maps to None.
band_colors = {
    "up": "rgba(144, 238, 144, 0.5)",  # light green
    "down": "rgba(240, 128, 128, 0.5)",  # light coral
    "neutral": "rgba(211, 211, 211, 0.5)",  # light grey
}

# Shade every detected period behind the price line.
# NOTE: the loop variables rebind the module-level start_date / end_date names.
for start_date, end_date, state in periods:
    color = band_colors.get(state)
    fig.add_vrect(
        x0=start_date,
        x1=end_date,
        fillcolor=color,
        opacity=0.7,
        layer="below",
        line_width=0,
    )

# Title, size and theme of the final chart.
fig.update_layout(
    title="股票收盘价与概率阈值分析",
    width=1200,
    height=800,
    template="plotly_white",
)

fig.show()