## scikit-learn 中的回归问题

In [1]:
import numpy as np
import matplotlib.pyplot as plt 
from sklearn import datasets

In [2]:
# Load the Boston housing dataset.
# NOTE: datasets.load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on an older scikit-learn release.
boston = datasets.load_boston()

# Data cleaning: keep only the samples whose target value is below 50.0.
# The mask is computed once from the raw targets and applied to both arrays.
keep = boston.target < 50.0

# Multivariate linear regression, so every feature column is kept.
X = boston.data[keep]
y = boston.target[keep]

In [3]:
# Show the feature-matrix and target-vector shapes after filtering, one per line.
print(X.shape, y.shape, sep="\n")

(490, 13)
(490,)


In [4]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

In [5]:
# Print the shape of each partition: train features, train targets, test features, test targets.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(392, 13)
(392,)
(98, 13)
(98,)


## scikit-learn 中的线性回归

In [6]:
from sklearn.linear_model import LinearRegression
# Linear regression model created with scikit-learn's default constructor arguments.
linear_reg = LinearRegression()

In [7]:
# Fit on the training split; fit() returns the estimator, so its repr is displayed as the cell output.
linear_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# Fitted coefficients, one per feature column of X_train.
linear_reg.coef_

array([-1.20354261e-01,  3.64423279e-02, -3.61493155e-02,  5.12978140e-02,
       -1.15775825e+01,  3.42740062e+00, -2.32311760e-02, -1.19487594e+00,
        2.60101728e-01, -1.40219119e-02, -8.35430488e-01,  7.80472852e-03,
       -3.80923751e-01])

In [9]:
# Fitted intercept (the constant term of the linear model).
linear_reg.intercept_

34.117399723229845

In [10]:
# Evaluate on the held-out test split; for scikit-learn regressors, score() returns R^2.
linear_reg.score(X_test, y_test)

0.8129794056212809

## scikit-learn 中的 kNN Regression

In [11]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline kNN regressor with k=5. fit() returns the estimator itself, so the
# construction and the training call can be chained into one assignment.
knn_reg = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

# Test-set score of the baseline, for comparison with linear_reg.
knn_reg.score(X_test, y_test)

0.5865412198300899

In [12]:
# With these settings knn_reg clearly performs worse than linear_reg on the test set.
# kNN has several hyperparameters, though, so next we use a grid search to look for the best knn_reg model.

In [22]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: with uniform weighting only k is varied; with distance
# weighting the Minkowski exponent p is varied as well.
param_grid = [
    {'weights': ['uniform'], 'n_neighbors': list(range(1, 11))},
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

# Search every combination with 3-fold cross-validation, using the training split only.
knn_reg = KNeighborsRegressor()
grid_search = GridSearchCV(knn_reg, param_grid, cv=3)
grid_search.fit(X_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [23]:
# Hyperparameter combination that achieved the best mean cross-validation score.
grid_search.best_params_

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [24]:
# Mean cross-validated score of the best combination. With scoring=None the estimator's
# own score() is used, which is R^2 for a regressor.
grid_search.best_score_

0.6340477954176972

In [28]:
# The cross-validated score is still well below linear_reg's test score. However, it is
# computed on validation folds of the training data, not on the test set, so it is not the
# final evaluation. Take the best estimator found by the search and score it on the test split.
knn_best_reg = grid_search.best_estimator_
knn_best_reg.score(X_test, y_test)

0.7044357727037996

In [None]:
# Even the best knn_reg found still scores below linear_reg on the test set (0.704 vs 0.813),
# which suggests linear regression is the better fit for the Boston housing problem.
# Note: with scoring=None, GridSearchCV ranks candidates with the estimator's own score() method,
# which for a regressor is R^2 -- the same metric used above. best_score_ (0.634) differs from the
# test score because it is the mean over 3 cross-validation folds of the training data. The grid
# only covers the listed n_neighbors / weights / p values, so the selected knn_reg may still not
# be the best possible kNN model.