In [9]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 加载数据
data = pd.read_csv('./data/BostonHousing.csv').to_numpy()

X = data[:, :-1]
y = data[:, -1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1024)

#训练
grid = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid={
        'n_neighbors': [3, 5, 7, 11, 15, 19, 23, 31],
        'weights': ['uniform', 'distance'],
        'p': [1, 2],
    },
    cv=5,
    scoring='neg_mean_squared_error',  # mean_squared_error 结果一定是正数，数字越大越表示偏差越大，加上负号后，越大表示偏差越小
)
grid.fit(X_train, y_train)

# 交叉验证/训练结果
print(f'训练集的最佳模型: {grid.best_estimator_}')
print(f'训练集的最佳参数: {grid.best_params_}')    # 
print(f'训练集的最佳分数: {grid.best_score_}')     # 训练集中模型的准确率

# 执行预测
y_pred = grid.best_estimator_.predict(X_test)
score = mean_squared_error(y_test, y_pred)
print(f'测试集实际分数: {score}')
print(f'测试集预测值：{y_pred[:5]}')
print(f'测试集真实值：{y_test[:5]}')

训练集的最佳模型: KNeighborsRegressor(n_neighbors=3, p=1, weights='distance')
训练集的最佳参数: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
训练集的最佳分数: -17.34527681022333
测试集实际分数: 13.213022379750711
测试集预测值：[13.21526866 18.34721635  7.22269645 31.53599858 28.90566135]
测试集真实值：[14.3 19.6  5.  28.4 29.6]


In [5]:
model = LinearRegression()
model.fit(X_train,y_train)
# 决定系数,最大是1，最小可以负数
model.score(X_test,y_test)

0.7259630925033402