In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
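# n_iters is the number of full passes (epochs) over the training set; with the default of 5 every sample is visited five times.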
def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):
    """Train the linear regression model on X_train, y_train with stochastic gradient descent.

    Parameters
    ----------
    X_train : 2-D array, one row per sample.
    y_train : 1-D array of targets with the same number of entries as X_train has rows.
    n_iters : number of full passes (epochs) over the training set; must be >= 1.
    t0, t1 : constants of the decaying learning-rate schedule; update number ``t``
        (counted from 0 across all epochs) uses the step size ``t0 / (t + t1)``.

    Returns
    -------
    self, with ``intercept_`` and ``coef_`` set from the fitted parameter vector.

    Raises
    ------
    AssertionError
        If X_train and y_train have different numbers of samples, or n_iters < 1.
    """
    assert X_train.shape[0] == y_train.shape[0], \
        "the size of X_train must be equal to the size of y_train"
    assert n_iters >= 1

    def dJ_sgd(theta, X_b_i, y_i):
        # Gradient of the squared error of ONE sample (row X_b_i, target y_i).
        return X_b_i * (X_b_i.dot(theta) - y_i) * 2.

    # t0 and t1 are passed explicitly for clarity; if sgd did not declare them,
    # learning_rate would still resolve them from fit_sgd's scope via closure.
    def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):

        def learning_rate(t):
            return t0 / (t + t1)

        theta = initial_theta
        # Number of samples: needed for the permutation and the global step counter.
        m = len(X_b)

        # Every epoch visits all samples once in a fresh random order. Drawing
        # independent random indices instead could leave some samples unvisited.
        for cur_iter in range(n_iters):
            indexes = np.random.permutation(m)
            X_b_new = X_b[indexes]
            y_new = y[indexes]
            for i in range(m):
                gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                # The step size keeps decaying across epochs, hence cur_iter * m + i.
                theta = theta - learning_rate(cur_iter * m + i) * gradient

        return theta

    # Prepend a column of ones so that theta[0] acts as the intercept.
    X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
    initial_theta = np.zeros(X_b.shape[1])

    # NOTE(review): the full vector is also kept as _theta on the assumption that
    # other methods of the class may read it — confirm against the class.
    self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)
    self.intercept_ = self._theta[0]
    self.coef_ = self._theta[1:]

    return self
                

## 使用我们自己的SGD

In [3]:
# Synthetic regression data: y = 4x + 3 plus Gaussian noise (std 3).
m = 100000

# A dedicated, seeded generator makes the dataset identical on every
# Restart & Run All without touching NumPy's global random state.
rng = np.random.RandomState(666)

x = rng.normal(size=m)
X = x.reshape(-1, 1)  # column matrix: one feature per sample
y = 4. * x + 3 + rng.normal(0, 3, size=m)

In [4]:
from mymodule.LinearRegression  import LinearRegression

In [5]:
# Fresh instance of the project's own LinearRegression (imported from mymodule).
reg_sgd = LinearRegression()

In [6]:
# Train with SGD on the synthetic data, making two passes over all samples.
reg_sgd.fit_sgd(X,y,n_iters=2)

LinearRegression()

In [7]:
# Fitted coefficient; the synthetic data were generated with slope 4.
reg_sgd.coef_

array([ 4.00419755])

In [8]:
# Fitted intercept; the synthetic data were generated with intercept 3.
reg_sgd.intercept_

3.0183394805517363

## 真实使用我们自己的SGD

In [14]:
from  sklearn  import datasets

# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed version still provides it before re-running.
boston  = datasets.load_boston()

In [23]:
# Inspect which fields the loaded dataset object exposes.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [78]:
# Keep only the samples whose target is strictly below 50.0
# (presumably values at 50.0 are capped/censored — verify against the dataset description).
below_cap = boston.target < 50.0

X = boston.data[below_cap]
y = boston.target[below_cap]

In [79]:
# Split into training and test sets with the project's own train_test_split;
# the fixed seed (666) keeps the split the same between runs.
from mymodule.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,seed = 666)

In [80]:
# Feature standardisation: the scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits.
from sklearn.preprocessing import StandardScaler

standrdScaler = StandardScaler().fit(X_train, y_train)

X_train_standard = standrdScaler.transform(X_train)
X_test_standard = standrdScaler.transform(X_test)

In [81]:
# Fit the project's LinearRegression with SGD on the standardised training data
# (2 passes), timing the fit, then evaluate on the standardised test data.
# NOTE(review): score presumably returns R^2 — confirm in mymodule.LinearRegression.
reg_sgd_boston = LinearRegression()
%time reg_sgd_boston.fit_sgd(X_train_standard,y_train,n_iters=2)
reg_sgd_boston.score(X_test_standard,y_test)

CPU times: user 9.06 ms, sys: 3.33 ms, total: 12.4 ms
Wall time: 9.7 ms


0.78651716204682975

In [102]:
# Re-fit the same model object with 50 passes over the training data and re-score it.
%time reg_sgd_boston.fit_sgd(X_train_standard,y_train,n_iters=50)
reg_sgd_boston.score(X_test_standard,y_test)

CPU times: user 132 ms, sys: 2.72 ms, total: 135 ms
Wall time: 133 ms


0.8074538468965633

In [105]:
# Re-fit the same model object with 100 passes over the training data and re-score it.
%time reg_sgd_boston.fit_sgd(X_train_standard,y_train,n_iters=100)
reg_sgd_boston.score(X_test_standard,y_test)

CPU times: user 266 ms, sys: 5.4 ms, total: 271 ms
Wall time: 279 ms


0.81086710549749141

### 使用scikit-learn进行随机梯度下降法

In [107]:
from sklearn.linear_model import SGDRegressor

In [133]:
# scikit-learn's SGDRegressor with default settings, on the same standardised data.
# Note: this rebinds reg_sgd, which earlier held the project's LinearRegression instance.
reg_sgd = SGDRegressor()
%time reg_sgd.fit(X_train_standard,y_train)
reg_sgd.score(X_test_standard,y_test)

CPU times: user 1.7 ms, sys: 1.17 ms, total: 2.88 ms
Wall time: 1.71 ms




0.80691391167110316

In [135]:
# Same estimator, constructed with n_iter = 100.
# NOTE(review): n_iter was deprecated in scikit-learn 0.19 in favour of max_iter and
# removed in later releases — confirm against the installed version before re-running.
reg_sgd = SGDRegressor(n_iter = 100)
%time reg_sgd.fit(X_train_standard,y_train)
reg_sgd.score(X_test_standard,y_test)

CPU times: user 9.06 ms, sys: 3.04 ms, total: 12.1 ms
Wall time: 8.27 ms




0.81329574351286338