In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, RidgeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor, StackingRegressor, StackingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import make_scorer
from sklearn.base import RegressorMixin, ClassifierMixin, BaseEstimator
from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor


### Data Collection

In [None]:
import requests

# URLs of the files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/exercise/module6_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/exercise/module6_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the downloaded bytes are written to (binary mode).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled server.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_data_url, 'module6_exercise_train.csv')
download_file(test_data_url, 'module6_exercise_test.csv')

In [None]:
data_train = pd.read_csv('module6_exercise_train.csv', index_col='index')
data_test = pd.read_csv('module6_exercise_test.csv', index_col='index')

### Data Analysis

In [None]:
data_train

In [None]:
data_test

In [None]:
data_train.describe()

In [None]:
data_train.isnull().sum()

In [None]:
data_test.isnull().sum()

In [None]:
# Histogram (with KDE overlay) of the target column
plt.figure(figsize=(10, 6))
target_ax = sns.histplot(data_train['end_of_day_return'], bins=50, kde=True)
target_ax.set_title('Distribution of End of Day Return')
target_ax.set_xlabel('End of Day Return')
target_ax.set_ylabel('Frequency')
plt.show()

### Model Building and Evaluation

In [None]:
# Split target and features without mutating `data_train`, so this cell can be
# re-run safely (`pop` removes the column and raises KeyError the second time).
y = data_train['end_of_day_return'].copy()
X = data_train.drop(columns='end_of_day_return')

In [None]:
def weighted_accuracy(y_true, y_pred):
    """Sign accuracy weighted by the magnitude of the true values.

    Each sample contributes ``|y_true|`` to the score when the sign of the
    prediction matches the sign of the true value; the total is normalised by
    the sum of ``|y_true|``.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values with the same number of elements.

    Returns:
        float: Weighted accuracy in ``[0, 1]``. Returns ``0.0`` when every true
        value is zero (no weight to distribute) instead of dividing by zero.

    Raises:
        ValueError: If the two inputs do not have the same number of elements.
    """
    # Work on flat NumPy arrays: comparison is positional (pandas would align
    # on index labels) and an (n, 1) prediction cannot broadcast to (n, n).
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )

    weights = np.abs(y_true)
    total_weight = weights.sum()
    if total_weight == 0:
        return 0.0

    # Correct predictions are those whose sign matches the true sign
    correct_predictions = np.sign(y_true) == np.sign(y_pred)

    return float(np.sum(weights * correct_predictions) / total_weight)

In [None]:
# Function to plot the evaluation results
def plot_results(mse_train, mse_test, w_acc_train, w_acc_test):
    """Plot per-fold MSE and weighted_accuracy, one panel per metric.

    Each argument is a sequence of per-fold scores. Every panel shows the
    train and test curves plus a shaded band spanning the min..max of each
    series.
    """
    panels = (
        ("MSE", mse_train, mse_test),
        ("weighted_accuracy", w_acc_train, w_acc_test),
    )

    plt.figure(figsize=(12, 6))

    for position, (metric, train_scores, test_scores) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(train_scores, label=f"Train {metric}", marker='o')
        plt.plot(test_scores, label=f"Test {metric}", marker='o')

        # Shade the range covered by each series across folds
        for series, shade in ((train_scores, 'blue'), (test_scores, 'orange')):
            plt.fill_between(range(len(series)), np.min(series), np.max(series),
                             color=shade, alpha=0.1)

        plt.title(f"{metric} over Folds")
        plt.xlabel("Fold")
        plt.ylabel(metric)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

def plot_multi_model_results(results):
    """Bar-chart comparison of several models' cross-validation scores.

    ``results`` maps a model name to a dict holding the per-fold score lists
    ``'mse_train'``, ``'mse_test'``, ``'w_acc_train'`` and ``'w_acc_test'``.
    The top axes shows MSE, the bottom one weighted_accuracy. Bars are the
    fold means; the error bars reach from the fold minimum to the fold maximum.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 20))

    train_color = 'skyblue'
    test_color = 'lightgreen'
    width = 0.35
    positions = np.arange(len(results))

    # Titles, axis labels and grid for both panels
    panel_specs = (
        (ax1, 'Mean Squared Error (MSE) Comparison', 'MSE'),
        (ax2, 'weighted_accuracy Comparison', 'weighted_accuracy'),
    )
    for axis, heading, metric in panel_specs:
        axis.set_title(heading, fontsize=16)
        axis.set_ylabel(metric, fontsize=12)
        axis.set_xlabel('Models', fontsize=12)
        axis.grid(True, linestyle='--', alpha=0.7)

    def draw_pair(axis, center, train_scores, test_scores, with_label):
        """Draw the train/test bars for one model, then their min..max whiskers."""
        groups = (
            (center - width/2, train_scores, 'Train', train_color),
            (center + width/2, test_scores, 'Test', test_color),
        )
        for xpos, values, tag, color in groups:
            # Only the first model carries legend labels to avoid duplicates
            axis.bar(xpos, np.mean(values), width, label=tag if with_label else "",
                     color=color, alpha=0.7)
        for xpos, values, _tag, _color in groups:
            avg = np.mean(values)
            axis.errorbar(xpos, avg,
                          yerr=[[avg - np.min(values)], [np.max(values) - avg]],
                          fmt='none', ecolor='black', capsize=5)

    for i, scores in enumerate(results.values()):
        is_first = i == 0
        draw_pair(ax1, positions[i], scores['mse_train'], scores['mse_test'], is_first)
        draw_pair(ax2, positions[i], scores['w_acc_train'], scores['w_acc_test'], is_first)

    for axis in (ax1, ax2):
        axis.set_xticks(positions)
        axis.set_xticklabels(results.keys(), rotation=45, ha='right')

    ax1.legend(loc='upper left')
    ax2.legend(loc='upper left')

    plt.tight_layout()
    plt.show()

#### Simple Baseline

In [None]:
# Function to handle train-test evaluation in a fold
def train_and_evaluate(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training fold and score it on both folds.

    Returns:
        tuple: ``(mse_train, mse_test, w_acc_train, w_acc_test)``.
    """
    # Fit on the training fold (the passed-in estimator is fitted in place)
    model.fit(X_train, y_train)

    # Predictions on the train set, then on the test set
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    # Mean squared error for each fold
    mse_scores = (
        mean_squared_error(y_train, pred_train),
        mean_squared_error(y_test, pred_test),
    )

    # Magnitude-weighted sign accuracy for each fold
    w_acc_scores = (
        weighted_accuracy(y_train, pred_train),
        weighted_accuracy(y_test, pred_test),
    )

    return (*mse_scores, *w_acc_scores)


def run_multi_model_cv(X, y, models, n_splits=5):
    """Cross-validate several models and report the best by weighted accuracy.

    Args:
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Target Series (rows are selected with ``.iloc``).
        models: Dict mapping a display name to an estimator. Each estimator is
            refitted in place on every fold.
        n_splits: Number of ``KFold`` splits.

    Returns:
        dict: ``{name: {'mse_train': [...], 'mse_test': [...],
        'w_acc_train': [...], 'w_acc_test': [...]}}`` with one entry per fold.

    Side effects:
        Prints the model with the highest mean test weighted accuracy together
        with its minimum and maximum fold scores. If no model yields a
        comparable score (no models given, or every mean is NaN), a short
        notice is printed instead of failing on the formatted output.
    """
    fold = KFold(n_splits=n_splits)
    metric_names = ('mse_train', 'mse_test', 'w_acc_train', 'w_acc_test')
    results = {name: {metric: [] for metric in metric_names} for name in models.keys()}

    for train_index, test_index in fold.split(X, y):
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y.iloc[train_index].copy(), y.iloc[test_index].copy()

        for name, model in models.items():
            fold_scores = train_and_evaluate(X_train, X_test, y_train, y_test, model)
            # train_and_evaluate returns the scores in the order of metric_names
            for metric, value in zip(metric_names, fold_scores):
                results[name][metric].append(value)

    # Find the model with the best mean w_acc test score
    best_model = None
    best_mean_w_acc = float('-inf')
    for name, result in results.items():
        w_acc_test_scores = result['w_acc_test']
        mean_w_acc_test = sum(w_acc_test_scores) / len(w_acc_test_scores)
        # NaN never compares greater, so models with NaN scores are skipped
        if mean_w_acc_test > best_mean_w_acc:
            best_mean_w_acc = mean_w_acc_test
            best_model = name

    if best_model is None:
        print("No model produced a valid w_acc test score; nothing to compare.")
        return results

    best_scores = results[best_model]['w_acc_test']

    # Print the best mean w_acc test score, min, max, and the associated model
    print(f"Best mean w_acc test score: {best_mean_w_acc:.4f} by model: {best_model}")
    print(f"Min w_acc test score: {min(best_scores):.4f}, Max w_acc test score: {max(best_scores):.4f}")
    return results


In [None]:
# Step 1: Run cross-validation
results = run_multi_model_cv(X, y, {"RandomForestRegressor": RandomForestRegressor(n_jobs=-1)})

In [None]:
# Step 2: Plot the results
baseline_scores = results["RandomForestRegressor"]
plot_results(
    baseline_scores["mse_train"],
    baseline_scores["mse_test"],
    baseline_scores["w_acc_train"],
    baseline_scores["w_acc_test"],
)

#### Different models

In [None]:
# Put these in your import section
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor       # 若已导入可忽略
from lightgbm import LGBMRegressor     # 若已导入可忽略
from sklearn.tree import DecisionTreeRegressor  # 若想要真正的“单棵决策树”

# Candidate regressors, all with default hyper-parameters (tree ensembles run on all cores)
models = {
    'Ridge': Ridge(),
    'DecisionTree': DecisionTreeRegressor(),          # if you want a single tree
    'RandomForest': RandomForestRegressor(n_jobs=-1),
    'ExtraTrees': ExtraTreesRegressor(n_jobs=-1),
    'LGBM': LGBMRegressor(),
    'XGB': XGBRegressor(),
    'KNN Regressor': KNeighborsRegressor(),
}


In [None]:
# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)

In [None]:
# Plot MSE results for regression models
plot_multi_model_results(results)

#### Handling the weighted_accuracy objective properly
Should we create different classes (e.g. treat the sign of the return as a classification target)? Or use a custom loss?

Create, compare, and optimize different models

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin, clone

class ClassifierAsRegressor(BaseEstimator, RegressorMixin):
    """Regressor that learns sign(y) with a classifier.

    The wrapped classifier is trained on the binary label ``y > 0``. Its
    estimate ``p = Pr(up)`` is mapped back to a signed value:

        y_hat = (2p - 1) * m

    where ``m`` is a scale of ``|y|`` on the training set: the median when
    ``scale_mode == 'median'`` (default), the mean for any other value.

    Parameters:
        clf: Unfitted classifier; a clone of it is fitted in ``fit``.
        scale_mode: ``'median'`` or anything else (mean), see above.
    """
    def __init__(self, clf, scale_mode='median'):
        self.clf = clf
        self.scale_mode = scale_mode
        self._m = None            # scale of |y|, set by fit
        self._clf_fitted = None   # fitted clone of `clf`, set by fit

    def fit(self, X, y):
        """Fit a clone of ``clf`` on the labels ``y > 0`` and store the |y| scale."""
        y = np.asarray(y)
        z = (y > 0).astype(int)  # binary label: 1 = up, 0 = down or flat
        abs_y = np.abs(y)
        self._m = np.median(abs_y) if self.scale_mode == 'median' else np.mean(abs_y)
        self._clf_fitted = clone(self.clf).fit(X, z)
        return self

    def predict(self, X):
        """Return ``(2p - 1) * m`` where ``p`` is the estimated probability of "up"."""
        fitted = self._clf_fitted
        if fitted is None:
            # NotFittedError subclasses both ValueError and AttributeError
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This ClassifierAsRegressor instance is not fitted yet. Call 'fit' first."
            )

        if hasattr(fitted, "predict_proba"):
            proba = fitted.predict_proba(X)
            # Locate the column of class 1 instead of assuming two columns:
            # if training only saw one class there is a single column.
            classes = list(getattr(fitted, "classes_", [0, 1]))
            if 1 in classes:
                p = proba[:, classes.index(1)]
            else:
                p = np.zeros(proba.shape[0])
        elif hasattr(fitted, "decision_function"):
            # Squash the margin into (0, 1) with a sigmoid; the tanh form is
            # mathematically identical to 1 / (1 + exp(-margin)) but cannot overflow.
            margin = np.asarray(fitted.decision_function(X))
            p = 0.5 * (1.0 + np.tanh(0.5 * margin))
        else:
            # Only hard labels available: degrade to p in {0, 1}, i.e. y_hat = +/- m.
            # (The previous mapping 0.5 + 0.5 * label sent "down" to p = 0.5, so
            # negative predictions came out as 0 instead of -m.)
            z_hat = np.asarray(fitted.predict(X))
            p = (z_hat > 0).astype(float)
        return (2.0 * p - 1.0) * self._m


In [None]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, TimeSeriesSplit, cross_val_score
import pandas as pd

# -- Your data --
# X and y were prepared earlier (target column: 'end_of_day_return')

# Model pool (default hyper-parameters)
models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'RandomForest': RandomForestRegressor(n_jobs=-1),
    'ExtraTrees': ExtraTreesRegressor(n_jobs=-1),
    'LGBM': LGBMRegressor(),
    'XGB': XGBRegressor(),
    'KNN': KNeighborsRegressor(),
    'SVR': SVR(),

    # Classification -> regression wrapper. XGBClassifier would be another
    # candidate, but start with a single one.
    'Cls2Reg_LGBM': ClassifierAsRegressor(LGBMClassifier()),
}

# Scorer used by the model comparison and the hyper-parameter searches below.
# It must exist before its first use, otherwise a fresh "Restart & Run All"
# fails with NameError on WA_SCORER.
WA_SCORER = make_scorer(weighted_accuracy, greater_is_better=True)

# Cross-validation: for a strictly leak-free time-ordered evaluation, switch to TimeSeriesSplit
# cv = TimeSeriesSplit(n_splits=5)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def compare_models(X, y, models, cv, scorer):
    """Cross-validate every model in ``models`` and tabulate the scores.

    Returns a DataFrame with one row per model (columns ``model``,
    ``cv_mean_wa``, ``cv_std_wa``, ``cv_min_wa``, ``cv_max_wa``, ``n_splits``),
    sorted by ``cv_mean_wa`` in descending order with a fresh integer index.
    """
    def summarize(name, mdl):
        # One cross_val_score run per model, reduced to summary statistics
        fold_scores = cross_val_score(mdl, X, y, cv=cv, scoring=scorer, n_jobs=-1)
        return {
            "model": name,
            "cv_mean_wa": fold_scores.mean(),
            "cv_std_wa": fold_scores.std(),
            "cv_min_wa": fold_scores.min(),
            "cv_max_wa": fold_scores.max(),
            "n_splits": cv.get_n_splits()
        }

    table = pd.DataFrame([summarize(name, mdl) for name, mdl in models.items()])
    ranked = table.sort_values("cv_mean_wa", ascending=False)
    return ranked.reset_index(drop=True)

df_cmp = compare_models(X, y, models, cv=cv, scorer=WA_SCORER)
df_cmp


In [None]:
# === Visualise the cross_val_score comparison ===
plt.figure(figsize=(10, 6))
cmp_ax = plt.gca()
cmp_ax.barh(df_cmp['model'], df_cmp['cv_mean_wa'], xerr=df_cmp['cv_std_wa'],
            color='skyblue', edgecolor='black', alpha=0.7)
cmp_ax.set_xlabel("Weighted Accuracy (mean ± std)")
cmp_ax.set_ylabel("Model")
cmp_ax.set_title("Model Comparison by Weighted Accuracy (5-Fold CV)")
cmp_ax.invert_yaxis()  # put the best model at the top
cmp_ax.grid(axis='x', linestyle='--', alpha=0.7)
# Annotate each bar with its mean score
for row, mean_wa in enumerate(df_cmp['cv_mean_wa']):
    cmp_ax.text(mean_wa + 0.001, row, f"{mean_wa:.3f}", va='center')
plt.show()


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Settings shared by both randomized searches
search_kwargs = dict(scoring=WA_SCORER, cv=cv, n_jobs=-1, random_state=42, verbose=1)

# Example 1: ExtraTrees (often raises w-acc while keeping the variance small)
et = ExtraTreesRegressor(n_jobs=-1, random_state=42)
et_grid = dict(
    n_estimators=[300, 600, 900],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
et_search = RandomizedSearchCV(et, et_grid, n_iter=15, **search_kwargs)
et_search.fit(X, y)
et_best = et_search.best_estimator_
print("Best ET weighted_accuracy (CV):", et_search.best_score_)
print(et_search.best_params_)

# Example 2: LGBM (stronger but overfits more easily; random search over a small range)
lgbm = LGBMRegressor(random_state=42)
lgbm_grid = dict(
    n_estimators=[400, 800, 1200],
    learning_rate=[0.03, 0.05, 0.1],
    num_leaves=[15, 31, 63],
    subsample=[0.7, 0.9, 1.0],
    colsample_bytree=[0.7, 0.9, 1.0],
    reg_alpha=[0.0, 1.0, 2.0],
    reg_lambda=[0.0, 1.0, 2.0],
)
lgbm_search = RandomizedSearchCV(lgbm, lgbm_grid, n_iter=20, **search_kwargs)
lgbm_search.fit(X, y)
lgbm_best = lgbm_search.best_estimator_
print("Best LGBM weighted_accuracy (CV):", lgbm_search.best_score_)
print(lgbm_search.best_params_)


# ExtraTrees (ET) looks promising, so I decided to tune it further

In [None]:
from sklearn.metrics import make_scorer
# Scorer wrapping weighted_accuracy (higher is better) and the CV splitter.
# Cells above also reference WA_SCORER and cv, so make sure a definition has
# been executed before running them.
WA_SCORER = make_scorer(weighted_accuracy, greater_is_better=True)
cv = KFold(shuffle=True, random_state=42, n_splits=5)  # same split as before


In [None]:
import itertools, random
from sklearn.ensemble import ExtraTreesRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score

def score_et_configs(config_list, X, y, cv, scorer):
    """Cross-validate an ExtraTreesRegressor for each hyper-parameter config.

    Args:
        config_list: List of dicts of ``ExtraTreesRegressor`` keyword
            arguments (``random_state=42`` and ``n_jobs=-1`` are added here).
        X, y: Training features and target.
        cv: Cross-validation splitter passed to ``cross_val_score``.
        scorer: Scoring callable/name passed to ``cross_val_score``.

    Returns:
        DataFrame with one row per config (its parameters plus ``wa_mean``,
        ``wa_std``, ``wa_min``, ``wa_max``), sorted by ``wa_mean`` descending.
        An empty ``config_list`` yields an empty DataFrame.

    Side effects:
        Prints one progress line per evaluated config.
    """
    rows = []
    total = len(config_list)
    for i, cfg in enumerate(config_list, 1):
        # OOB scoring needs bootstrap=True; it is not enabled here so that
        # every config is judged by the same cross-validation.
        mdl = ExtraTreesRegressor(
            random_state=42, n_jobs=-1, **cfg
        )
        # Use the scorer passed by the caller (previously the global WA_SCORER
        # was used and the `scorer` argument was silently ignored).
        scores = cross_val_score(mdl, X, y, cv=cv, scoring=scorer, n_jobs=-1)
        mean_, std_, mn_, mx_ = scores.mean(), scores.std(), scores.min(), scores.max()
        print(f"[{i:02d}/{total}] cfg={cfg} -> "
              f"w_acc(mean)={mean_:.4f}  std={std_:.4f}  range=[{mn_:.4f}, {mx_:.4f}]")
        rows.append({**cfg, "wa_mean": mean_, "wa_std": std_, "wa_min": mn_, "wa_max": mx_})

    if not rows:
        # Nothing to rank; sort_values would raise KeyError on the missing column
        return pd.DataFrame(rows)

    df = pd.DataFrame(rows).sort_values("wa_mean", ascending=False).reset_index(drop=True)
    return df


In [None]:
# Search space (the full Cartesian product is too large, so random combinations are sampled)
space = dict(
    n_estimators=[600, 900, 1200, 1500],
    max_depth=[None, 12, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4, 6],
    max_features=["sqrt", "log2", 0.5, 0.8, 1.0],
    criterion=["squared_error", "absolute_error"],
    # True + OOB would also work but duplicates the CV evaluation;
    # kept off for speed and consistency
    bootstrap=[False],
)

# Draw N random candidates (e.g. 24); raise N to 36/48 for a wider search
N = 24
keys = list(space.keys())
rng = random.Random(42)
# One rng.choice per key, in key order, for each candidate
cands = [{k: rng.choice(space[k]) for k in keys} for _ in range(N)]

df_et = score_et_configs(cands, X, y, cv, WA_SCORER)
df_et.head(10)


## I noticed that the parameter set [01/24] already performed very well, so I decided to stop the training early. The final selected model is ExtraTrees with the following parameters:
{
    'n_estimators': 600,
    'max_depth': None,
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    'max_features': 'log2',
    'criterion': 'squared_error',
    'bootstrap': False
}

### Submission:

In [None]:
# Reload both CSV files from disk, using the 'index' column as the row index
data_train = pd.read_csv('module6_exercise_train.csv', index_col='index')
X_test = pd.read_csv('module6_exercise_test.csv', index_col='index')
# pop() removes the target column from data_train in place and returns it
y_train = data_train.pop('end_of_day_return')
# What remains in data_train is copied as the training features
X_train = data_train.copy()

In [None]:
# Train on complete data (X_train, y_train) and predict on X_test
# 2) Final model: the selected ExtraTrees hyper-parameters
final_et_params = {
    'n_estimators': 600,
    'max_depth': None,
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    'max_features': 'log2',
    'criterion': 'squared_error',
    'bootstrap': False,
}
best_model = ExtraTreesRegressor(n_jobs=-1, random_state=42, **final_et_params)

# 3) Fit on the full training set
best_model.fit(X_train, y_train)

In [None]:

# Predict on the test features and pair each prediction with its row index
test_predictions = best_model.predict(X_test)
submission = pd.DataFrame({
    'index': X_test.index,
    'end_of_day_return': test_predictions,
})

# The index is already a regular column, so the DataFrame's own index is not written
submission.to_csv('submission.csv', sep=',', index=False)

In [None]:
# Offer the file as a browser download when running on Google Colab.
# Outside Colab the `google.colab` package is not installed; the file is
# already on local disk, so report that instead of failing the run.
try:
    from google.colab import files
except ImportError:
    print("Not running on Google Colab; 'submission.csv' is in the working directory.")
else:
    files.download('submission.csv')
