In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mytest1logistic/customer_churn.csv
/kaggle/input/houkongtest/stock_train_data.csv
/kaggle/input/houkongtest/stock_test_data.csv


In [2]:
# 区域1：库导入========================================================
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import (accuracy_score, precision_score, 
                            recall_score, f1_score, fbeta_score)
import joblib

In [3]:
# Section 2: path configuration ======================================
# The dataset and output directories are declared once so that pointing the
# notebook at another dataset or output location takes a single edit.
INPUT_DIR = "/kaggle/input/houkongtest"  # directory holding the input CSV files
WORKING_DIR = "/kaggle/working"          # directory where results are written

TRAIN_PATH = f"{INPUT_DIR}/stock_train_data.csv"  # labeled training set
TEST_PATH = f"{INPUT_DIR}/stock_test_data.csv"    # features-only test set
OUTPUT_PATH = f"{WORKING_DIR}/predictions.csv"    # prediction results file

In [4]:
# Section 3: data loading and preprocessing ==========================
# Read the labeled training set from disk.
train_data = pd.read_csv(TRAIN_PATH)

# Target: the 'close' column.
y = train_data['close']

# Features: every remaining column except the row id, the target and the date.
X = train_data.drop(columns=['id', 'close', 'date'])

In [5]:
# Split the feature columns by dtype (adjust to match the actual data).
numeric_features = X.select_dtypes(include=['number']).columns        # numeric columns
categorical_features = X.select_dtypes(include=['object']).columns    # object (string) columns

# Per-type transformers: standardize numeric columns, one-hot encode
# categorical ones (categories unseen during fit are ignored at transform time).
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features)

# Combined preprocessing stage applied to the raw feature frame.
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [6]:
# Hyper-parameter grid explored for the MLP regressor.
mlp_param_grid = {
    'hidden_layer_sizes': [(64,), (128,64)],  # hidden layer layouts
    'activation': ['relu', 'tanh'],            # activation functions
    'alpha': [0.0001, 0.001],                 # L2 regularization strength
    'learning_rate_init': [0.001, 0.005],     # initial learning rate
    'batch_size': [32, 64],                   # mini-batch size
    'solver': ['adam']                        # optimizer fixed to Adam
}

# Grid search over the MLP, scored by negative MSE with 3-fold CV
# (few folds to keep the search fast), using all available cores.
mlp_search = GridSearchCV(
    estimator=MLPRegressor(random_state=42, early_stopping=True, max_iter=4000),
    param_grid=mlp_param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1,
)

# Full pipeline: preprocessing followed by the grid-searched regressor.
# NOTE(review): because the search is a step inside the pipeline, the
# preprocessor is fitted on all of X_train before the search's internal
# CV splits the data.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', mlp_search),
])

# Hold out 20% of the labeled data for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Fit the preprocessing step and run the grid search on the training split.
pipeline.fit(X_train, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] END activation=relu, alpha=0.0001, batch_size=32, hidden_layer_sizes=(64,), learning_rate_init=0.005, solver=adam; total time=   0.4s
[CV] END activation=relu, alpha=0.0001, batch_size=32, hidden_layer_sizes=(64,), learning_rate_init=0.001, solver=adam; total time=   1.0s
[CV] END activation=relu, alpha=0.0001, batch_size=32, hidden_layer_sizes=(64,), learning_rate_init=0.005, solver=adam; total time=   0.4s
[CV] END activation=relu, alpha=0.0001, batch_size=32, hidden_layer_sizes=(64,), learning_rate_init=0.001, solver=adam; total time=   1.4s
[CV] END activation=relu, alpha=0.0001, batch_size=32, hidden_layer_sizes=(64,), learning_rate_init=0.005, solver=adam; total time=   0.4s
[CV] END activation=relu, alpha=0.0001, batch_size=32, hidden_layer_sizes=(64,), learning_rate_init=0.001, solver=adam; total time=   1.1s
[CV] END activation=relu, alpha=0.0001, batch_size=32, hidden_layer_sizes=(128, 64), learning_rate_init=0

In [7]:
# Pull the fitted grid search out of the pipeline and read its results.
grid_search = pipeline.named_steps['regressor']
best_params = grid_search.best_params_   # best hyper-parameter combination
best_score = -grid_search.best_score_    # scoring is negative MSE, so negate to get MSE

# Predict on the held-out validation split and report the error.
val_pred = pipeline.predict(X_val)
val_mse = mean_squared_error(y_val, val_pred)
print(f"最佳参数：{best_params}")
print("验证集MSE:", val_mse)

最佳参数：{'activation': 'tanh', 'alpha': 0.001, 'batch_size': 32, 'hidden_layer_sizes': (128, 64), 'learning_rate_init': 0.005, 'solver': 'adam'}
验证集MSE: 0.41832061615072313


In [8]:
# Load the features-only test set.
test_data = pd.read_csv(TEST_PATH)

# Assign submission ids as a running sequence starting at FIRST_TEST_ID.
# NOTE(review): 1501 presumably continues the numbering after the training
# rows — confirm against the data / submission format.
FIRST_TEST_ID = 1501
test_data['id'] = test_data.index + FIRST_TEST_ID
ids = test_data['id']

# Generate predictions. The preprocessing step selects the feature columns
# by name, so the extra 'id' column in test_data is not fed to the model.
predictions = pipeline.predict(test_data)

# Build the results frame: one row per test sample with its predicted close.
output_df = pd.DataFrame({
    'id': ids,
    'close': predictions
})

# Export the predictions.
output_df.to_csv(OUTPUT_PATH, index=False, header=True)

# Save the bare tuned MLP (kept for backward compatibility). It has no
# preprocessing attached, so it only accepts features that were already
# transformed by the pipeline's preprocessor.
best_model = pipeline.named_steps['regressor'].best_estimator_
joblib.dump(best_model, '/kaggle/working/best_model.pkl')

# Save the complete fitted pipeline (preprocessing + tuned model); this is the
# artifact that can predict directly from raw feature frames after loading.
joblib.dump(pipeline, '/kaggle/working/best_pipeline.pkl')

print(f"预测结果已保存至：{OUTPUT_PATH}")

预测结果已保存至：/kaggle/working/predictions.csv
