## 定义采用向量化的梯度下降方法

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# 定义计算损失函数值的方法
def J(theta, X_b, y):
    """Return the sum-of-squared-errors loss of a linear model.

    Computes ``np.sum((y - X_b.dot(theta)) ** 2)``.

    Args:
        theta: Parameter vector; must be compatible with ``X_b.dot(theta)``.
        X_b: Design matrix (callers in this file prepend a column of ones).
        y: Target values, compatible with the result of ``X_b.dot(theta)``.

    Returns:
        The summed squared residual, or ``float('inf')`` when the arithmetic
        raises an overflow / floating-point error (e.g. while the descent is
        diverging and NumPy is configured to raise on overflow).

    Raises:
        Any non-arithmetic error (for example ``ValueError`` on mismatched
        shapes) is propagated instead of being silently reported as ``inf``.

    NOTE(review): ``dJ`` in this file divides by ``len(X_b)``, i.e. it is the
    gradient of ``J / len(X_b)``. The descent direction is unaffected, but the
    convergence threshold used by ``gradient_descent`` is compared against
    this unnormalized sum.
    """
    try:
        return np.sum((y - X_b.dot(theta)) ** 2)
    except (OverflowError, FloatingPointError):
        # Only arithmetic blow-ups are treated as "infinitely bad" loss;
        # programming errors must stay visible.
        return float('inf')

In [3]:
# 定义计算导数值的方法
def dJ(theta, X_b, y):
    """Return the gradient ``2 / m * X_b.T.dot(X_b.dot(theta) - y)``.

    ``m`` is ``len(X_b)``. This is the vectorized form of computing, for
    every column ``i`` of ``X_b``, the dot product of the residual with
    ``X_b[:, i]`` and scaling by ``2 / m``.

    Args:
        theta: Current parameter vector.
        X_b: Design matrix.
        y: Target values.

    Returns:
        An array with one partial derivative per entry of ``theta``.
    """
    # Prediction error of the current parameters on every sample.
    residual = X_b.dot(theta) - y
    # Project the residual onto each feature column, then scale.
    return X_b.T.dot(residual) * 2 / len(X_b)

In [4]:
# 梯度下降的过程包装成方法
def gradient_descent(X_b, y, init_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Minimize the loss ``J`` by batch gradient descent.

    Starting from ``init_theta``, repeatedly moves against the gradient
    returned by ``dJ`` with step size ``eta``. Stops when the absolute change
    in ``J`` between two consecutive positions drops below ``epsilon`` or
    when the iteration budget is used up. Prints the number of update steps
    actually performed.

    Args:
        X_b: Design matrix passed through to ``J`` and ``dJ``.
        y: Target values passed through to ``J`` and ``dJ``.
        init_theta: Starting parameter vector; it is not modified.
        eta: Learning rate (step size).
        n_iters: Maximum number of update steps; may be a float such as
            ``1e4`` (at most ``floor(n_iters)`` steps are taken).
        epsilon: Convergence threshold on the change in loss.

    Returns:
        The parameter vector after the last update step.
    """
    theta = init_theta
    # Loss at the current position; reused as the "previous" loss after each
    # step so J is evaluated once per iteration instead of twice.
    previous_loss = J(theta, X_b, y)
    steps = 0
    while steps + 1 <= n_iters:
        # Move against the gradient at the current point.
        theta = theta - eta * dJ(theta, X_b, y)
        steps += 1
        current_loss = J(theta, X_b, y)
        # Stop once the loss has essentially stopped changing.
        if abs(current_loss - previous_loss) < epsilon:
            break
        previous_loss = current_loss
    # `steps` counts performed updates, so exhausting the budget reports
    # n_iters rather than n_iters + 1.
    print('total steps:', steps)
    return theta

## 使用波士顿房价数据进行测试

In [5]:
from sklearn import datasets

In [6]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2; this cell only runs on older scikit-learn versions.
boston = datasets.load_boston()
X, y = boston.data, boston.target
# Keep only samples whose target is below 50.0. Both right-hand sides are
# evaluated against the unfiltered y before either name is rebound.
X, y = X[y < 50.0], y[y < 50.0]

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)
# Show the shape of each split, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(392, 13)
(392,)
(98, 13)
(98,)


In [8]:
# 数据标准化（均值方差归一化）
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_standard, X_test_standard = map(
    standardScaler.transform, (X_train, X_test)
)

In [9]:
# Prepend a column of ones (intercept term) to the standardized training set.
X_b = np.concatenate(
    (np.ones((len(X_train_standard), 1)), X_train_standard), axis=1
)
# Start from the zero vector: one coefficient per column of X_b.
init_theta = np.zeros(X_b.shape[1])
eta = 0.01

boston_theta = gradient_descent(X_b, y_train, init_theta, eta)
boston_theta.shape

total steps: 6824


(14,)

In [10]:
# Build the test design matrix the same way as the training one: a leading
# column of ones followed by the standardized test features.
X_predict = np.concatenate(
    (np.ones((len(X_test_standard), 1)), X_test_standard), axis=1
)
X_predict.shape

(98, 14)

In [12]:
# Linear-model prediction: design matrix times the learned coefficients.
y_predict = np.dot(X_predict, boston_theta)
y_predict.shape

(98,)

In [16]:
from sklearn.metrics import r2_score
# R^2 of the gradient-descent predictions on the held-out test set.
r2_score(y_true=y_test, y_pred=y_predict)

0.8129798083983443

In [17]:
# 这个分数和之前用线性回归正规方程得到的值差不多。

In [18]:
# 相对于线性回归的正规方程解法，当特征数量巨大时梯度下降法能体现出明显的性能优势：正规方程需要计算 (X^T X)^{-1}，其开销大约随特征数 n 按 O(n^3) 增长，而梯度下降每次迭代只需做矩阵与向量的乘法；向量化实现则进一步避免了 Python 层面的逐元素循环。