In [None]:
# -*- coding: utf-8 -*-
# Auto-merged: capstone_group4.py + RF.py (imports unified; functionality preserved)

from __future__ import annotations
import numpy as np
import pandas as pd
from typing import List, Dict, Union, Optional
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import time; time.sleep(0.5)


# ---- capstone_groupXX.ipynb ----

# MLB 投手薪资等级分类（Decision Tree）Notebook 模板

逐格运行，便于调试与提交草稿。

In [None]:
# Notebook configuration.
# Load IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported project scripts are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# pandas display settings: show up to 100 columns and let printed output use
# up to 160 characters of width before wrapping.
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)

## 1. 导入依赖与脚本函数

In [None]:
    # NOTE(review): this cell starts mid-statement. The opening
    # `from <module> import (` line that these names and the closing `)` belong
    # to is not present, so the cell does not parse as written. It needs to be
    # restored before the notebook can run; the module name cannot be
    # determined from this cell — TODO confirm against the original script.
    # The names listed here are the project helpers called by the cells below.
    make_salary_tiers, infer_feature_columns,
    train_decision_tree_classifier,                
    train_decision_tree_classifier_simple,         
    plot_confusion, plot_tree_structure,          
    export_tree_rules, top_k_features_by_importance
)

## 2. 读取数据（修改路径即可）

In [None]:
# Location of the dataset CSV; edit this single constant to read from
# somewhere else.
DATA_PATH = "mlb_salaries_and_pitching_stats_2011-2024.csv"

# Read the whole file into the working frame used by the cells below.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Echo the row count next to the full (rows, columns) shape.
(df.shape[0], df.shape)

## 3. 快速窥视数据结构

In [None]:
# Column names of df in sorted order, returned as a plain list.
sorted(df.columns)

In [None]:
# Preview the first three rows of the frame.
df.iloc[:3]

In [None]:
# Show the dtype pandas assigned to each column of df.
df.dtypes

## 4. 生成薪资分层（tiers）

In [None]:
# Build the classification target: the project helper is asked for 4 tiers
# derived from the "Total Cash" column, and its result replaces df.
# NOTE(review): df is rebound to the helper's output, so re-running this cell
# feeds the already-processed frame back in; whether that is harmless depends
# on make_salary_tiers, which is not visible here — TODO confirm.
df = make_salary_tiers(df, salary_col="Total Cash", n_tiers=4)
# Row count per value of the "salary_tier" column, with missing values counted too.
df["salary_tier"].value_counts(dropna=False)

## 5. EDA：缺失与分布概览

In [None]:
# Fraction of missing values per column, largest first; show the top 20.
(
    df.isna()
    .mean()
    .sort_values(ascending=False)
    .iloc[:20]
)

In [None]:
# Summary statistics for every column (numeric and non-numeric), transposed so
# each original column becomes a row; show the first 20 of those rows.
(
    df.describe(include="all")
    .transpose()
    .iloc[:20]
)

In [None]:
# Histogram of the "Total Cash" column using 30 bins.
total_cash = df["Total Cash"]
total_cash.plot.hist(bins=30, title="Total Cash Distribution")
plt.show()

## 6. 选择特征列（自动 + 手动微调）

In [None]:
# Ask the project helper for the candidate feature columns, naming
# "salary_tier" as the target column.
# NOTE(review): infer_feature_columns is not defined in the visible part of
# this notebook; its selection rules must be checked at its definition.
feature_cols = infer_feature_columns(df, target_col="salary_tier")
# Display the resulting list for a manual sanity check.
feature_cols

In [None]:
# Manual tweak of the inferred list (here: leave out "Year").
# Extend the exclusion list below to drop further columns.
feature_cols = list(
    filter(lambda column_name: column_name not in ["Year"], feature_cols)
)
feature_cols

## 7. 训练 & 调参（决策树）

In [None]:
# Run the project's training helper on the selected features. Its return value
# is unpacked into the fitted model, a (X_test, y_test, y_pred) triple and a
# summary mapping whose entries are printed below.
fit_outputs = train_decision_tree_classifier_simple(
    df,
    feature_cols,
    target_col="salary_tier",
    base_max_depth=6,
    base_min_samples_leaf=10,
    search_alphas=20,
)
model, (X_test, y_test, y_pred), summary = fit_outputs

# One line with the constraints entry and the chosen ccp_alpha ...
print(summary["constraints"], "chosen_alpha:", summary["chosen_ccp_alpha"])
# ... then one "label value" line per score entry, in the original order ...
for caption, summary_key in (
    ("CV f1_macro:", "cv_best_score"),
    ("Holdout Acc:", "holdout_accuracy"),
    ("Holdout F1_macro:", "holdout_f1_macro"),
):
    print(caption, summary[summary_key])
# ... and finally the classification report entry.
print(summary["classification_report"])

## 8. 评估指标与可视化

In [None]:
# Print the entries of the training summary, one "label value" line each.
# The labels are runtime output and are kept exactly as in the original.
metric_labels = {
    "约束(限深/叶子)：": "constraints",
    "选择的 ccp_alpha：": "chosen_ccp_alpha",
    "CV f1_macro：": "cv_best_score",
    "Holdout Accuracy：": "holdout_accuracy",
    "Holdout F1_macro：": "holdout_f1_macro",
}
for metric_label, summary_key in metric_labels.items():
    print(metric_label, summary[summary_key])

# The classification report entry is printed without a label.
print(summary["classification_report"])

In [None]:
# Plot the confusion matrix for the y_test / y_pred pair produced by the
# training cell, using the project helper.
plot_confusion(y_test, y_pred)  # original note: can be switched to normalize='true' (helper signature not visible here — TODO confirm)

In [None]:
# Draw the fitted tree via the project helper, labelling with feature_cols.
# max_depth_to_plot=5 presumably caps how many levels are drawn — the helper
# is not visible here, TODO confirm.
plot_tree_structure(model, feature_cols, max_depth_to_plot=5)

## 9. 特征重要性（可选）

In [None]:
# Pull the estimator stored under the "clf" step of the pipeline and read its
# per-feature importance scores.
clf = model.named_steps["clf"]
importances = clf.feature_importances_

# Pair every feature name with its score and order by descending importance.
# NOTE(review): assumes the scores line up one-to-one with feature_cols, i.e.
# that earlier pipeline steps neither add, drop nor reorder columns — TODO confirm.
imp_df = (
    pd.DataFrame({"feature": feature_cols, "importance": importances})
    .sort_values(by="importance", ascending=False)
)

# Show the 15 highest-ranked features.
imp_df.iloc[:15]

In [None]:
# Horizontal bar chart of the first 15 rows of imp_df.
top_importances = imp_df.head(15)
ax = top_importances.plot.barh(
    x="feature",
    y="importance",
    title="Top Feature Importances",
)
# Flip the y axis so the first row of the table is drawn at the top.
ax.invert_yaxis()

## 10. 保存模型（可选）

In [None]:
# Persist the fitted pipeline to disk so it can be reloaded later without
# retraining.
# joblib is not imported by the notebook's import cell, so on a fresh kernel
# this cell used to fail with NameError; import it here so the cell runs on
# its own.
import joblib

# Writes the file into the current working directory; the cell output is the
# value returned by joblib.dump.
joblib.dump(model, "salary_tier_tree_pipeline.joblib")

## 11. 推理示例（可选）

In [None]:
# Draw five rows reproducibly (fixed seed) and predict their tiers with the
# fitted model.
sample = df.sample(n=5, random_state=42)
pred = model.predict(sample[feature_cols])

# Use the "Player" column when the frame has one; otherwise fall back to a
# column of None values of matching length.
player_column = sample.get("Player", pd.Series([None] * len(sample)))

# Side-by-side table of player, actual tier and predicted tier.
sample_out = pd.DataFrame(
    {
        "Player": player_column,
        "TrueTier": sample["salary_tier"].values,
        "PredTier": pred,
    }
)
sample_out

# ---- RF.ipynb ----

In [None]:
# Random-forest section: read the dataset into its own frame (mlb_df), separate
# from the df used by the decision-tree cells.
mlb_df = pd.read_csv("mlb_salaries_and_pitching_stats_2011-2024.csv")
# NOTE(review): random_forest is not defined or imported anywhere in the
# visible part of this notebook — it must be brought into scope before this
# cell can run; its behaviour cannot be described from here.
random_forest(mlb_df)
print("\nAnalysis complete.")

# ---- capstone_group4.ipynb ----

In [None]:
# --- Main execution ---
# Entry point of the linear-regression section: set the plot style, load the
# dataset, and run the regression analysis when loading produced a frame.
# NOTE(review): load_and_inspect_data and perform_linear_regression_analysis
# are not defined in the visible part of this notebook — confirm they are in scope.
if __name__ == '__main__':
    # Chart style applied to the figures drawn after this point
    plt.style.use('seaborn-v0_8-whitegrid')

    # Dataset location, relative to the working directory. This is the same
    # file name the other cells of this notebook read; it replaces a
    # machine-specific absolute path that only resolved on one user's computer.
    file_path = 'mlb_salaries_and_pitching_stats_2011-2024.csv'

    # Step 1 - load and inspect the data
    main_df = load_and_inspect_data(file_path)

    # Step 2 - run the regression analysis only if a frame came back
    # (the None check suggests the loader returns None on failure)
    if main_df is not None:
        perform_linear_regression_analysis(main_df)
        print("\nAnalysis complete.")