# 教程 2: 特征工程 / Tutorial 2: Feature Engineering

本教程将教你如何创建技术指标特征，这是机器学习模型预测价格的关键步骤。

This tutorial teaches you how to create technical-indicator features, a key step in predicting prices with machine learning models.

## 1. 导入库 / Import Libraries

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add the parent directory to the import search path so project packages
# located one level up can be imported.
sys.path.append('..')

# Configure matplotlib for Chinese text: list fonts that can render Chinese
# characters, and turn off the Unicode minus sign so negative tick labels do
# not show up as missing glyphs with those fonts.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Arial Unicode MS'],
    'axes.unicode_minus': False,
})

from utils.data_processor import DataProcessor
from utils.binance_client import BinanceUtility

print("库导入成功！/ Libraries imported successfully!")

## 2. 加载数据 / Load Data

我们使用教程 1 中保存的数据；如果本地文件不存在，则直接从币安获取。

We use the data saved in Tutorial 1; if the local file is missing, we fetch it directly from Binance.

In [None]:
# Prefer the locally stored CSV; fall back to the Binance client only when
# that file is missing.
data_path = '../data/BTCUSDT_hist.csv'
if not os.path.exists(data_path):
    # Method 2: request BTCUSDT history from Binance
    # (interval '1h', start '6 months ago UTC').
    print("本地数据不存在，从币安获取... / Local data not found, fetching from Binance...")
    client = BinanceUtility()
    df = client.fetch_historical_data('BTCUSDT', '1h', '6 months ago UTC')
else:
    # Method 1: read the local file and parse the timestamp column into
    # pandas datetimes.
    df = pd.read_csv(data_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    print(f"从本地加载数据 / Loaded data from local: {df.shape}")

print("\n前 5 行数据 / First 5 rows:")
df.head()

## 3. 什么是技术指标？/ What are Technical Indicators?

技术指标是基于历史价格和成交量的数学计算结果，用于分析市场趋势和预测未来走势。

Technical indicators are mathematical calculations based on historical price and volume data, used to analyze market trends and predict future movements.

**常用技术指标 / Common Technical Indicators:**
- **SMA (简单移动平均线)**: 计算过去 N 个周期的平均价格，平滑价格波动
  - Simple Moving Average: Average price over past N periods, smooths price fluctuations
- **RSI (相对强弱指数)**: 衡量价格变动的速度和变化，范围 0-100
  - Relative Strength Index: Measures speed and change of price movements, range 0-100
- **ROC (变动率)**: 价格变动的百分比，反映价格变化速度
  - Rate of Change: Percentage change in price, reflects speed of price change
- **Volatility (波动率)**: 价格变动的标准差，衡量市场波动程度
  - Volatility: Standard deviation of price changes, measures how strongly prices fluctuate

## 4. 创建技术指标 / Create Technical Indicators

In [None]:
# Let the project's DataProcessor derive the technical-indicator columns
# from the raw price frame.
processor = DataProcessor()
df_features = processor.add_technical_indicators(df)

# Report the resulting shape and the full column list.
feature_shape = df_features.shape
feature_columns = list(df_features.columns)
print(f"添加特征后的数据形状 / Data shape after adding features: {feature_shape}")
print(f"\n所有列 / All columns: {feature_columns}")

# The trailing expression is what the notebook renders as the cell output.
print("\n前 10 行特征数据 / First 10 rows with features:")
df_features.head(10)

## 5. 可视化移动平均线 / Visualize Moving Averages

In [None]:
# Index by timestamp and keep only the most recent 500 rows for plotting.
df_plot = df_features.set_index('timestamp').tail(500)

# (column, legend label, opacity) for each curve, in drawing order.
price_curves = [
    ('close', '收盘价 / Close Price', 0.8),
    ('sma_7', 'SMA 7', 0.7),
    ('sma_25', 'SMA 25', 0.7),
]

plt.figure(figsize=(14, 6))
for column, legend_label, opacity in price_curves:
    plt.plot(df_plot.index, df_plot[column], label=legend_label, linewidth=1.5, alpha=opacity)

plt.title('BTC/USDT 价格与移动平均线 / Price with Moving Averages', fontsize=14)
plt.xlabel('时间 / Time', fontsize=12)
plt.ylabel('价格 / Price (USDT)', fontsize=12)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Reading guide printed beneath the chart.
observations = (
    "\n观察点 / Observations:",
    "- SMA 7 反映短期趋势 / SMA 7 reflects short-term trend",
    "- SMA 25 反映中长期趋势 / SMA 25 reflects medium-to-long-term trend",
    "- 当短期 MA 上穿长期 MA 时，可能形成金叉（买入信号）/ When short MA crosses above long MA, golden cross may form (buy signal)",
)
print(*observations, sep="\n")

## 6. 可视化 RSI 指标 / Visualize RSI Indicator

In [None]:
# Two stacked panels sharing the time axis: closing price on top, RSI below.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
time_axis = df_plot.index

# Top panel: closing price.
ax1.plot(time_axis, df_plot['close'], label='收盘价 / Close', linewidth=1, color='blue')
ax1.set_title('价格与 RSI 指标 / Price and RSI Indicator', fontsize=14)
ax1.set_ylabel('价格 / Price', fontsize=12)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Bottom panel: RSI curve, with the threshold levels drawn as dashed lines
# and the zones beyond them lightly shaded.
overbought_level, oversold_level = 70, 30
ax2.plot(time_axis, df_plot['rsi_14'], label='RSI 14', linewidth=1.5, color='purple')
ax2.axhline(y=overbought_level, color='r', linestyle='--', linewidth=1.5, label='超买区 / Overbought (70)')
ax2.axhline(y=oversold_level, color='g', linestyle='--', linewidth=1.5, label='超卖区 / Oversold (30)')
ax2.fill_between(time_axis, overbought_level, 100, alpha=0.1, color='red')
ax2.fill_between(time_axis, 0, oversold_level, alpha=0.1, color='green')

ax2.set_ylim([0, 100])
ax2.set_xlabel('时间 / Time', fontsize=12)
ax2.set_ylabel('RSI', fontsize=12)
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Short legend explaining how to read the RSI zones.
rsi_notes = (
    "\nRSI 指标说明 / RSI Indicator Explanation:",
    "- RSI > 70: 超买状态，价格可能回调 / Overbought, price may pull back",
    "- RSI < 30: 超卖状态，价格可能反弹 / Oversold, price may rebound",
    "- RSI 在 30-70 之间：正常波动范围 / Normal range between 30-70",
)
print(*rsi_notes, sep="\n")

## 7. 可视化波动率 / Visualize Volatility

In [None]:
# Two stacked panels sharing the time axis: period-over-period returns on
# top, the 'volatility' feature column below.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10), sharex=True)

# Price returns: percentage change of the close from one row to the next.
# df_plot is a slice of another frame (it was built with `.tail(500)`), so
# writing a column into it in place can trigger pandas'
# SettingWithCopyWarning. `assign` returns a new frame that carries the extra
# 'returns' column, which avoids the chained-assignment ambiguity.
df_plot = df_plot.assign(returns=df_plot['close'].pct_change())
ax1.plot(df_plot.index, df_plot['returns'], label='收益率 / Returns', linewidth=1, alpha=0.7)
ax1.axhline(y=0, color='r', linestyle='--', alpha=0.5)  # zero-return reference line
ax1.set_ylabel('收益率 / Returns', fontsize=12)
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.set_title('收益率与波动率 / Returns and Volatility', fontsize=14)

# Volatility: line plus a filled area between the curve and zero.
ax2.plot(df_plot.index, df_plot['volatility'], label='波动率 / Volatility', linewidth=1.5, color='orange')
ax2.fill_between(df_plot.index, df_plot['volatility'], alpha=0.3, color='orange')
ax2.set_ylabel('波动率 / Volatility', fontsize=12)
ax2.set_xlabel('时间 / Time', fontsize=12)
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics over the plotted window.
# NOTE(review): values are printed with two decimals; if 'volatility' is on
# the scale of fractional returns this may round to 0.00 - confirm the scale
# produced by DataProcessor.
print("\n波动率说明 / Volatility Explanation:")
print(f"平均波动率 / Average Volatility: {df_plot['volatility'].mean():.2f}")
print(f"最大波动率 / Max Volatility: {df_plot['volatility'].max():.2f}")
print("高波动率通常伴随大的价格变动，风险也更高 / High volatility usually comes with large price movements and higher risk")

## 8. 特征相关性分析 / Feature Correlation Analysis

了解特征之间的相关性有助于选择最好的特征组合。

Understanding the correlations between features helps us choose the best combination of features.

In [None]:
# Numeric columns that take part in the correlation analysis: the raw
# price/volume columns followed by the derived indicator columns.
raw_cols = ['open', 'high', 'low', 'close', 'volume']
indicator_cols = ['sma_7', 'sma_25', 'rsi_14', 'roc', 'volatility']
feature_cols = raw_cols + indicator_cols
corr_matrix = df_features[feature_cols].corr()

# Heatmap appearance: annotated square cells, diverging palette centred on 0.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('特征相关性矩阵 / Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Reading guide for the heatmap.
correlation_notes = (
    "\n相关性分析说明 / Correlation Analysis Explanation:",
    "- 红色: 正相关（一个增加，另一个也增加）/ Red: Positive correlation",
    "- 蓝色: 负相关（一个增加，另一个减少）/ Blue: Negative correlation",
    "- 数值越接近 1 或 -1，相关性越强 / Closer to 1 or -1 means stronger correlation",
    "- 高度相关的特征可能导致模型过拟合 / Highly correlated features may cause overfitting",
)
print(*correlation_notes, sep="\n")

## 9. 准备特征和标签 / Prepare Features and Labels

机器学习模型需要：
- **X (特征)**: 用于预测的信息
- **y (标签)**: 我们要预测的目标

Machine learning models need:
- **X (Features)**: Information used for prediction
- **y (Labels)**: The target we want to predict

In [None]:
# Split the feature frame into the model inputs X and the target labels y
# using the project's DataProcessor.
X, y = processor.prepare_features_labels(df_features)

print(f"特征矩阵形状 / Feature matrix shape X: {X.shape}")
print(f"标签向量形状 / Label vector shape y: {y.shape}")

print("\n前 5 行特征 / First 5 rows of features:")
print(X.head())

print("\n前 20 个标签 / First 20 labels:")
print(y.head(20).values)

# Class balance: count each label value once and reuse the counts for both
# the absolute figures and the percentages.
n_labels = len(y)
n_up = (y == 1).sum()
n_down = (y == 0).sum()
print("\n标签分布 / Label distribution:")
print(f"上涨 (UP/1): {n_up} / {n_labels} ({n_up/n_labels*100:.1f}%)")
print(f"下跌 (DOWN/0): {n_down} / {n_labels} ({n_down/n_labels*100:.1f}%)")

## 10. 特征重要性预览 / Feature Importance Preview

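让我们看看每个特征与目标的关系。注意：相关系数只能衡量线性关系，因此这只是一个粗略的预览，并不等同于模型给出的特征重要性。

Let's look at the relationship between each feature and the target. Note that a correlation coefficient only captures linear relationships, so this is a rough preview rather than a model-based feature importance.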

In [None]:
# Correlation of every feature column with the target labels.
feature_importance = {name: X[name].corr(y) for name in X.columns}

# Tabulate the results and order them by absolute correlation, strongest first.
importance_df = pd.DataFrame(
    list(feature_importance.items()),
    columns=['Feature', 'Correlation with Target'],
).sort_values('Correlation with Target', key=abs, ascending=False)

# Horizontal bars: green for positive correlation, red otherwise.
target_corr = importance_df['Correlation with Target']
colors = ['green' if value > 0 else 'red' for value in target_corr]

plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], target_corr, color=colors, alpha=0.7)
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)  # zero reference
plt.title('特征与目标的相关性 / Feature Correlation with Target', fontsize=14)
plt.xlabel('与目标的相关性 / Correlation with Target', fontsize=12)
plt.ylabel('特征 / Feature', fontsize=12)
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

# Reading guide for the bar chart.
importance_notes = (
    "\n特征重要性说明 / Feature Importance Explanation:",
    "- 绿色条: 正相关（与价格上涨相关）/ Green: Positive correlation",
    "- 红色条: 负相关（与价格下跌相关）/ Red: Negative correlation",
    "- 条越长，该特征对预测越重要 / Longer bars mean more important for prediction",
)
print(*importance_notes, sep="\n")

## 11. 保存特征数据 / Save Feature Data

In [None]:
# Persist the engineered data: the full feature frame as CSV, plus the
# feature matrix and label vector as NumPy arrays.
features_path = '../data/BTCUSDT_features.csv'

# The data directory may not exist yet (for example when the price history
# was fetched from Binance because no local CSV was found), so create it
# before writing; exist_ok makes this a no-op when it is already there.
os.makedirs(os.path.dirname(features_path), exist_ok=True)

# Save the data with features (row index is not written to the file).
df_features.to_csv(features_path, index=False)
print(f"特征数据已保存至 / Feature data saved to: {features_path}")

# Save the feature and label arrays; numpy is already imported at the top of
# the notebook, so it is not re-imported here.
np.save('../data/X_features.npy', X.values)
np.save('../data/y_labels.npy', y.values)
print("特征矩阵已保存至 / Feature matrix saved to: ../data/X_features.npy")
print("标签向量已保存至 / Label vector saved to: ../data/y_labels.npy")

## 总结 / Summary

在本教程中，我们学习了：

1. **技术指标的概念**: 理解了 SMA、RSI、ROC、Volatility 等指标的原理
2. **特征创建**: 使用 DataProcessor 自动创建技术指标特征
3. **特征可视化**: 通过图表理解每个特征的含义和作用
4. **特征分析**: 分析特征之间的相关性和与目标的关系
5. **数据准备**: 将数据转换为机器学习模型可用的格式 (X, y)

In this tutorial, we learned:

1. **Concept of Technical Indicators**: Understood the principles behind SMA, RSI, ROC, and volatility
2. **Feature Creation**: Used DataProcessor to automatically create technical-indicator features
3. **Feature Visualization**: Used charts to understand the meaning and purpose of each feature
4. **Feature Analysis**: Analyzed the correlations among features and between features and the target
5. **Data Preparation**: Converted the data into the (X, y) format that machine learning models expect

下一步，我们将训练机器学习模型！/ Next, we'll train machine learning models!