In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Section 1: library imports==========================================
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import (accuracy_score, precision_score, 
                            recall_score, f1_score, fbeta_score)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Section 2: path configuration ======================================
# Labelled training set (features plus the target column).
TRAIN_PATH = "/kaggle/input/houkongtest/stock_train_data.csv"
# Test set containing features only.
TEST_PATH = "/kaggle/input/houkongtest/stock_test_data.csv"
# Destination of the prediction file written at the end of the notebook.
OUTPUT_PATH = "/kaggle/working/predictions.csv"

In [None]:
# Section 3: data loading and preprocessing ==========================
# Read the labelled training set. The 'close' column is the regression
# target; 'id' and 'date' are left out of the feature matrix together with it.
train_data = pd.read_csv(TRAIN_PATH)
y = train_data['close']
X = train_data.drop(columns=['id', 'close', 'date'])

In [None]:
# Group the feature columns by dtype (adjust the selection to the real data):
# numeric columns on one side, object-dtype columns on the other.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include='object').columns

# Preprocessing stage: standardise the numeric columns and one-hot encode the
# object-dtype columns. handle_unknown="ignore" lets the encoder accept
# categories at transform time that were not present during fitting.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Full modelling pipeline: preprocessing followed by a k-NN regressor.
# NOTE: the final step is named 'classifier' although it holds a regressor;
# the name is kept because the parameter-grid keys below are derived from it.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier',  KNeighborsRegressor(n_neighbors=5))
])

# Hold out 20% of the labelled rows as a validation split (fixed seed so the
# split is reproducible).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Hyper-parameter grid explored by the search.
# NOTE(review): 'algorithm' only selects the neighbour-search strategy, so it
# is not expected to change the scores while doubling the number of fits —
# consider dropping it from the grid.
param_grid = {
    'classifier__n_neighbors': [3,5,7,9,15],  # candidate values of k
    'classifier__weights': ['uniform', 'distance'],  # equal vs. distance-based neighbour weighting
    'classifier__p': [1, 2],  # distance metric: 1 = Manhattan, 2 = Euclidean
    'classifier__algorithm': ['auto', 'kd_tree']  # neighbour-search algorithm
}

# 5-fold cross-validation repeated twice on the training split.
cv_strategy = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv_strategy)

# With GridSearchCV's default refit behaviour, best_estimator_ is the winning
# configuration already refitted on all of X_train.
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Previously the untuned baseline (k=5) was refitted here, which silently
# discarded the search result for every later `pipeline.predict(...)` call.
# Bind `pipeline` to the tuned, already-fitted estimator instead so that all
# downstream use of `pipeline` goes through the selected model.
pipeline = best_model

In [None]:
# Score the tuned model (the grid-search winner held in `best_model`) on the
# hold-out split, so the reported metrics describe the estimator that is
# persisted to disk rather than the untuned baseline pipeline.
val_pred = best_model.predict(X_val)
print("验证集MSE:", mean_squared_error(y_val, val_pred))
print("验证集R²:", r2_score(y_val, val_pred))

In [None]:
# Load the unlabelled test set.
test_data = pd.read_csv(TEST_PATH)

# Keep the ids shipped with the file when there are any; only synthesise
# 1-based ids from the row position when the column is missing. Overwriting
# an existing 'id' column could mislabel rows in the submission.
if 'id' not in test_data.columns:
    test_data['id'] = test_data.index + 1
ids = test_data['id']

# Predict with the tuned estimator — the same object that is saved below — so
# the submission file and the persisted model agree.
# NOTE(review): the frame still carries non-feature columns such as 'id';
# this relies on the ColumnTransformer selecting its inputs by column name —
# confirm for the installed scikit-learn version.
predictions = best_model.predict(test_data)

# Assemble the submission frame, with predictions rounded to one decimal place.
output_df = pd.DataFrame({
    'id': ids,
    'close': predictions
})

output_df['close'] = output_df['close'].round(1)

# Write the predictions, with a header row and without the index.
output_df.to_csv(OUTPUT_PATH, index=False, header=True)

# Persist the complete tuned pipeline (preprocessing + model).
joblib.dump(grid_search.best_estimator_, '/kaggle/working/best_model.pkl')

print(f"预测结果已保存至：{OUTPUT_PATH}")