## 交叉验证

In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier

# Generate synthetic dataset
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)

# Define model
model = RandomForestClassifier(random_state=42)

# KFold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores_kfold = cross_val_score(model, X, y, cv=kf)

# StratifiedKFold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores_skfold = cross_val_score(model, X, y, cv=skf)

# TimeSeriesSplit Cross-Validation
tscv = TimeSeriesSplit(n_splits=5)
scores_tscv = cross_val_score(model, X, y, cv=tscv)

# Print results
print("KFold Scores:", scores_kfold)
print("StratifiedKFold Scores:", scores_skfold)
print("TimeSeriesSplit Scores:", scores_tscv)


KFold Scores: [0.88  0.915 0.92  0.93  0.905]
StratifiedKFold Scores: [0.88  0.915 0.9   0.945 0.895]
TimeSeriesSplit Scores: [0.87349398 0.90963855 0.88554217 0.87349398 0.91566265]


## 超参数调优

In [5]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV

# Generate synthetic dataset
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)

# Define model
model = RandomForestClassifier(random_state=42)

# Define parameter grid
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}

# GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X, y)

# RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=5, random_state=42, cv=5)
random_search.fit(X, y)

# HalvingGridSearchCV
halving_grid_search = HalvingGridSearchCV(estimator=model, param_grid=param_grid, cv=5, factor=2, random_state=42)
halving_grid_search.fit(X, y)

# HalvingRandomSearchCV
halving_random_search = HalvingRandomSearchCV(estimator=model, param_distributions=param_grid, cv=5, factor=2, random_state=42)
halving_random_search.fit(X, y)

# Print best parameters
print("Best parameters from GridSearchCV:", grid_search.best_params_)
print("Best parameters from RandomizedSearchCV:", random_search.best_params_)
print("Best parameters from HalvingGridSearchCV:", halving_grid_search.best_params_)
print("Best parameters from HalvingRandomSearchCV:", halving_random_search.best_params_)




Best parameters from GridSearchCV: {'max_depth': None, 'n_estimators': 200}
Best parameters from RandomizedSearchCV: {'n_estimators': 200, 'max_depth': 10}
Best parameters from HalvingGridSearchCV: {'max_depth': 20, 'n_estimators': 50}
Best parameters from HalvingRandomSearchCV: {'n_estimators': 100, 'max_depth': 10}


## 模型评估指标

In [4]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    mean_squared_error, mean_absolute_error, r2_score,
    silhouette_score, adjusted_rand_score
)
import numpy as np

# Generate predictions
model.fit(X, y)
y_pred = model.predict(X)
y_proba = model.predict_proba(X)[:, 1]  # Probabilities for ROC AUC

# Classification Metrics
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
roc_auc = roc_auc_score(y, y_proba)

# Print classification metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

# Regression Metrics
y_regression = y + np.random.randn(len(y))  # Simulate continuous target
mse = mean_squared_error(y, y_regression)
mae = mean_absolute_error(y, y_regression)
r2 = r2_score(y, y_regression)

# Print regression metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)

# Clustering Metrics
labels_true = y  # Assume true labels
labels_pred = y_pred  # Assume predicted labels
silhouette = silhouette_score(X, labels_pred)
ari = adjusted_rand_score(labels_true, labels_pred)

# Print clustering metrics
print("Silhouette Score:", silhouette)
print("Adjusted Rand Index:", ari)


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC AUC: 1.0
Mean Squared Error: 1.0530152478596555
Mean Absolute Error: 0.8181060788054686
R2 Score: -3.2120778397499823
Silhouette Score: 0.11500836462407422
Adjusted Rand Index: 1.0


以下是 **模型评估与优化** 各个部分的代码示例，并分块详细解释其原理：

---

### **1. 交叉验证**
#### **原理**
1. **KFold**:
   - 将数据随机分为 \(k\) 折，逐次用 \(k-1\) 折作为训练集，剩余 1 折作为测试集。
   - **适用场景**: 数据量适中，无明显类别不平衡。

2. **StratifiedKFold**:
   - 在每折中保持类别比例一致。
   - **适用场景**: 分类任务中类别不平衡。

3. **TimeSeriesSplit**:
   - 适用于时间序列数据，确保训练集早于测试集。
   - **适用场景**: 时间序列建模。

---

### **2. 超参数调优**
#### **原理**
1. **GridSearchCV**:
   - 遍历所有参数组合，使用交叉验证选择最佳参数。
   - **优点**: 保证找到全局最优参数。
   - **缺点**: 参数组合多时计算量大。

2. **RandomizedSearchCV**:
   - 随机选择部分参数组合进行搜索。
   - **优点**: 快速，适合大规模参数空间。
   - **缺点**: 可能漏掉最佳参数。

3. **HalvingGridSearchCV**:
   - 使用逐步增大的数据集和参数子集进行搜索。
   - **优点**: 快速优化参数。

4. **HalvingRandomSearchCV**:

   - 与 HalvingGridSearchCV 类似，但随机选择参数组合。
   - 适合大规模参数搜索。

---

### **3. 模型评估指标**
#### **原理**
1. **分类指标**:
   - `accuracy_score`: 分类准确率。
   - `precision_score`: 正类中预测正确的比例。
   - `recall_score`: 所有正类中预测正确的比例。
   - `f1_score`: 精确率和召回率的调和平均值。
   - `roc_auc_score`: 评估模型的分类能力，尤其是概率输出。

2. **回归指标**:
   - `mean_squared_error`: 均方误差，用于评估回归预测的偏差。
   - `mean_absolute_error`: 平均绝对误差，衡量预测的平均绝对偏差。
   - `r2_score`: 判定系数，表示模型的拟合优度。

3. **聚类指标**:
   - `silhouette_score`: 衡量聚类的紧密性和分离性。
   - `adjusted_rand_score`: 衡量聚类结果与真实标签的一致性。

