### 多元线性回归算法

In [1]:
import numpy as np
from sklearn.metrics import r2_score

In [53]:
class LinearRegression(object):
    def __init__(self):
        """初始化LinearRegression模型"""
        self.coef_ = None
        self.interception_ = None
        self._theta = None
        
    def fit_normal(self, X_train, y_train):
        """根据训练数据集X_train，y_train训练LinearRegression模型"""
        assert X_train.shape[0] == y_train.shape[0], 'The size of X_train must be equal to the size of y_train'
        
        """根据正规方程解来计算"""
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
        
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        
        return self
    
    def predict(self, X_predict):
        """给定待测数据集X_predict，返回表示X_predict的结果向量"""
        assert self.interception_ is not None and self.coef_ is not None, "Must fit before predict."
        assert X_predict.shape[1] == len(self.coef_), "The feature number must be equal to X_train."
        
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)
    
    def score(self, X_test, y_test):
        """根据测试数据集X_test和y_test确定当前模型的准确度."""
        y_predict = self.predict(X_test)
        return r2_score(y_predict, y_test)
        
        
    def __repr__(self):
        return "Linear Regression \n"

#### 实现多元线性回归模型

In [54]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [55]:
boston = datasets.load_boston()

X = boston.data
y = boston.target

X = X[y < 50.0]
y = y[y < 50.0]

In [56]:
print(X.shape, y.shape)

(490, 13) (490,)


In [67]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
print(X_train.shape, y_train.shape)

(392, 13) (392,)


In [68]:
reg = LinearRegression()
reg.fit_normal(X_train, y_train)

Linear Regression 

In [69]:
reg.coef_

array([-1.20354261e-01,  3.64423279e-02, -3.61493155e-02,  5.12978140e-02,
       -1.15775825e+01,  3.42740062e+00, -2.32311760e-02, -1.19487594e+00,
        2.60101728e-01, -1.40219119e-02, -8.35430488e-01,  7.80472852e-03,
       -3.80923751e-01])

In [70]:
reg.interception_

34.11739972320051

In [71]:
reg.score(X_test, y_test)

0.765296777448203

In [73]:
# sklearn算法模型实现
from sklearn import linear_model

model = linear_model.LinearRegression()
model.fit(X_train, y_train)              # 使用梯度下降法进行计算

print(model.coef_)
print('theta_1 = ',model.intercept_)

[-1.20354261e-01  3.64423279e-02 -3.61493155e-02  5.12978140e-02
 -1.15775825e+01  3.42740062e+00 -2.32311760e-02 -1.19487594e+00
  2.60101728e-01 -1.40219119e-02 -8.35430488e-01  7.80472852e-03
 -3.80923751e-01]
theta_1 =  34.1173997232295


In [74]:
model.score(X_test, y_test)

0.812979405621281

#### kNN Regression

In [76]:
from sklearn.neighbors import KNeighborsRegressor

knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train, y_train)
knn_reg.score(X_test, y_test)

0.5865412198300899

In [79]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        "weights": ["uniform"],
        "n_neighbors": [i for i in range(1, 11)]
    },
    {
        "weights": ["distance"],
        "n_neighbors": [i for i in range(1, 11)],
        "p": [i for i in range(1, 6)]
    }
]

knn_reg = KNeighborsRegressor()
grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    8.2s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [80]:
grid_search.best_params_

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [81]:
grid_search.best_score_

0.6340477954176972

In [82]:
grid_search.best_estimator_.score(X_test, y_test)

0.7044357727037996

### 更多关于线性回归模型的讨论