In [None]:
import torch
torch.cuda.is_available(), torch.cuda.device_count()

## 构建两层NN
- $h = W_1X$ 
- $h\_relu = max(0, h)$
- $y_{hat} = W_2h\_relu$

<br>
### 实践中发现：
1. randn效果比rand好，最后的loss会收敛到更小----
    - 如果xy也这么生成，效果会perfect，因为服从高斯分布
+ lr影响非常之大，动不动就爆炸或消失
+ 复杂数据，要做数据缩放才能收敛；然后我发现loss震荡，把hidden_neuron调到500就好了，让其尽情过拟合吧
+ 自己电脑GPU更慢，可能是网络本身简单，交互都很费事了

### 1.numpy版

In [13]:
import numpy as np
import pandas as pd

X = np.array([[1,1],[2,1],[1,2],[2,2]])
y = np.array([0,1,1,0,]).reshape(-1,1)

N, D_in, D_out = X.shape[0], X.shape[1], y.shape[1] #N ：样本数，D_in：特征维度
H = 500 #隐藏层神经元数
print(N, D_in, D_out)

w1 = np.random.rand(D_in, H) #w1 = np.zeros((D_in, H))
w2 = np.random.rand(H, D_out) #w2 = np.zeros((H, D_out))

learing_rate = 1e-2
EPOCH = 500

for i in range(EPOCH):
    # Forword pass
    h = X.dot(w1)# N*D_in D_in*H -> N*H
    h_relu = np.maximum(0, h)# N*H...max函数是选某维度的max
    y_pred = h_relu.dot(w2)# N*H H*D_out -> N*D_out
    
    # Compute loss（MSE）
    loss = np.square(y_pred - y).mean()#sum成标量

    # Backward pass,compute gradient；即loss对参数求偏导，
    # 因为是标量对向量/矩阵求，所以维度和向量一样就好。最后在纸上写出来
    grad_y_pred = 2.0 * (y_pred - y) # N*D_out；因为是对y_pred求导，所以是y_pred-y
    grad_w2 = h_relu.T.dot(grad_y_pred)#H*D_out <- H*N N*D_out
    grad_h_relu = grad_y_pred.dot(w2.T) #N*H <- N*D_out D_out*H
    grad_h = grad_h_relu.copy() #浅拷贝，空间不同，但子对象相同。一般来说不能用赋值，否则可能改变被赋值对象的值
    grad_h[h<0] = 0 #N*H；值大于0时grad就是1，所以保持原值
    grad_w1 =X.T.dot(grad_h) #D_in*H <- D_in*H  N*H

    #update weights of w1&w2
    w1 -= learing_rate * grad_w1
    w2 -= learing_rate * grad_w2

    if i%5==0:
        print(loss, grad_h_relu[0][0],grad_h[0][0],grad_w1[0][0])
        

4 2 1
139345.04711526804 286.061484500087 286.061484500087 2720.124235949752
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0 0.0
0.5 0.0 0.0

In [5]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer


DATA = load_boston()

# N, D_in, H, D_out = 64, 1000, 100, 10
# X = np.random.randn(N, D_in)
# y = np.random.randn(N, D_out)

X = np.array([[1,1],[2,1],[1,2],[2,2]])
y = np.array([0,1,1,0,]).reshape(-1,1)
X = DATA.data
y = DATA.target.reshape(-1,1)
# display(pd.DataFrame(X))
# print(pd.DataFrame(X).info())

X = Normalizer().fit_transform(X)
# display(pd.DataFrame(X))
N, D_in, D_out = X.shape[0], X.shape[1], y.shape[1] #N ：样本数，D_in：特征维度
H = 500 #隐藏层神经元数
print(N, D_in, D_out)
w1 = np.zeros((D_in, H))

w2 = np.zeros((H, D_out))
# w1 = np.random.randn(D_in, H) 
# w2 = np.random.randn(H, D_out)

learing_rate = 1e-5
EPOCH = 50000

def forward(X):
    global h,h_relu,y_pred
    h = X.dot(w1)# N*D_in D_in*H -> N*H
    h_relu = np.maximum(0, h)# N*H...max函数是选某维度的max
    y_pred = h_relu.dot(w2)# N*H H*D_out -> N*D_out
    

for i in range(EPOCH):
    # Forword pass
    forward(X)
    # Compute loss（MSE）
    loss = np.square(y_pred - y).mean()#sum成标量

    # Backward pass,compute gradient；即loss对参数求偏导，
    # 因为是标量对向量/矩阵求，所以维度和向量一样就好。最后在纸上写出来
    grad_y_pred = 2.0 * (y_pred - y) # N*D_out；因为是对y_pred求导，所以是y_pred-y
    grad_w2 = h_relu.T.dot(grad_y_pred)#H*D_out <- H*N N*D_out
    grad_h_relu = grad_y_pred.dot(w2.T) #N*H <- N*D_out D_out*H
    grad_h = grad_h_relu.copy() #浅拷贝，空间不同，但子对象相同。一般来说不能用赋值，否则可能改变被赋值对象的值
    grad_h[h<0] = 0#N*H；值大于0时grad就是1，所以保持原值
    grad_w1 =X.T.dot(grad_h) #D_in*H <- D_in*H  N*H

    #update weights of w1&w2
    w1 -= learing_rate * grad_w1
    w2 -= learing_rate * grad_w2

    if i%5==0:
        print(loss,grad_h_relu[0][0],grad_h[0][0],grad_w1[0][0],w1[0][0])
        

506 13 1
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1469169960474 0.0 0.0 0.0 0.0
592.1

KeyboardInterrupt: 

### 2.Tensor版，相较numpy改动之处：
- np.random.randn -> torch.randn
- .dot -> .mm
- relu：np.maximum -> .clamp(min=0)
    - Clamp all elements in input into the range [ min, max ] and return a resulting tensor:
- loss
- transpose：.t()
- copy -> clone 

还可以放GPU上：
- .cuda()相当于to.device('cuda')

In [None]:
import numpy as np
import torch
from time import time
N, D_in, H, D_out = 64, 1000, 100, 10
X = torch.randn(N, D_in).cuda() #放GPU上，本机更慢，可能是交互的原因；
y = torch.randn(N, D_out).cuda()

w1 = torch.randn(D_in, H).cuda() #w1 = np.zeros((D_in, H))
w2 = torch.randn(H, D_out).cuda() #w2 = np.zeros((H, D_out))

learing_rate = 1e-6
EPOCH = 1000

def forward(X):
    global h,h_relu,y_pred
    h = X.mm(w1)# N*D_in D_in*H -> N*H
    h_relu = h.clamp(min=0)# N*H...max函数是选某维度的max
    y_pred = h_relu.mm(w2)# N*H H*D_out -> N*D_out

a = time()
for i in range(EPOCH):
    # Forword pass
    forward(X)
    # Compute loss（MSE）
    loss = (y_pred - y).pow(2).sum().item()#item成标量

    # Backward pass,compute gradient；即loss对参数求偏导，
    # 因为是标量对向量/矩阵求，所以维度和向量一样就好。最后在纸上写出来
    grad_y_pred = 2.0 * (y_pred - y) # N*D_out；因为是对y_pred求导，所以是y_pred-y
    grad_w2 = h_relu.t().mm(grad_y_pred)#H*D_out <- H*N N*D_out
    grad_h_relu = grad_y_pred.mm(w2.t()) #N*H <- N*D_out D_out*H
    grad_h = grad_h_relu.clone() #深拷贝，空间不同，但子对象相同。一般来说不能用赋值，否则可能改变被赋值对象的值
    grad_h[h<0] = 0#N*H；值大于0时grad就是1，所以保持原值
    grad_w1 =X.t().mm(grad_h) #D_in*H <- D_in*H  N*H

    #update weights of w1&w2
    w1 -= learing_rate * grad_w1
    w2 -= learing_rate * grad_w2

    if i%1000==0:
        print(loss,grad_h_relu[0][0],grad_h[0][0],grad_w1[0][0])
print('训练耗时：',time()-a)

Pytorch的autograd

In [20]:
x = torch.tensor(1., requires_grad=True)
w = torch.tensor(2., requires_grad=True)
b = torch.tensor(3., requires_grad=True)

y = w*x+b # y = 2*1+3
y.backward()

print(w.grad) #x=1
print(x.grad) #w=2
print(b.grad) #1
print(y.grad) #None

tensor(1.)
tensor(2.)
tensor(1.)
None


### 3.PyTorch 来backward
1. 先简化一波forword...
+ backward
    - loss不能变成item，还要保持tensor格式
    - 把需要修改计算grad的tensor参数 加上requires_grad=True
        - 变量默认不计算grad，节约内存
    - w1.grad.zero_() 因为参数的grad默认是累加的

+ with torch.no_grad() 不让计算图再保存一次grad，以节约内存
    - 学习一下利用文档：其实讲的挺清楚的
    - Context-manager that disabled gradient calculation.<br>Disabling gradient calculation is useful for inference, when you are sure that you will not call Tensor.backward(). <br>It will reduce memory consumption for computations that would otherwise have requires_grad=True.<br>In this mode, the result of every computation will have requires_grad=False, even when the inputs have requires_grad=True.
    - 目的 
        1. 节约内存 
        2. 能进行zeros_（因为requires_grad=True这样的叶子张量不允许inplace操作）
+ 不能放cuda上？
    - 原因：.cuda()只能把requires_grad=True的变量放到GPU上，而计算w需要用到xy的grad（None），这样就会得到None的grad
    - 方法：device=device

In [None]:
import numpy as np
import torch
from time import time
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

N, D_in, H, D_out = 64, 1000, 100, 10
X = torch.randn(N, D_in, device=device) #放GPU上，本机更慢，可能是交互的原因；
y = torch.randn(N, D_out, device=device)

w1 = torch.randn(D_in, H, requires_grad=True, device=device) #w1 = np.zeros((D_in, H))
w2 = torch.randn(H, D_out, requires_grad=True, device=device) #w2 = np.zeros((H, D_out))

learing_rate = 1e-6
EPOCH = 2000

a = time()
for i in range(EPOCH):
    # Forword pass
    y_pred = X.mm(w1).clamp(min=0).mm(w2)
    # Compute loss（MSE）
    loss = (y_pred - y).pow(2).sum()#保留tensor的type
    # Backward pass,compute gradient；
    loss.backward()
    # update weights of w1&w2
    with torch.no_grad():
        w1 -= learing_rate * w1.grad
        w2 -= learing_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()
    
    if i%1000==0:
        print(loss,w1.grad[0][0])
print('训练耗时：',time()-a)


### 4.PyTorch封装模型
改进：
- model封装参数与forward
- 参数初始化（提升性能）
- loss_fn直接定义
- update param

In [None]:
import torch
import torch.nn as nn
from time import time
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

N, D_in, H, D_out = 64, 1000, 100, 10
X = torch.randn(N, D_in, device=device) #放GPU上，本机更慢，可能是交互的原因；
y = torch.randn(N, D_out, device=device)

model = nn.Sequential(
    nn.Linear(D_in, H), #D_in -> H的线性变换。另：与之前不同的地方在于bias=True
    nn.ReLU(),
    nn.Linear(H, D_out),
).to(device)

torch.nn.init.normal_(model[0].weight)
torch.nn.init.normal_(model[2].weight)

loss_fn = nn.MSELoss(reduction='sum')
learing_rate = 1e-6
EPOCH = 5000

a = time()
for i in range(EPOCH):
    # Forword pass
    y_pred = model(X)
    
    # Compute loss（MSE）
    loss = loss_fn(y_pred, y)
    
    # Backward pass,compute gradient；
    model.zero_grad()
    loss.backward()
    
    # update parameters
    with torch.no_grad():
        for param in model.parameters(): #model中的每个参数
            param -= learing_rate * param.grad
    
    if i%1000==0:
        print(loss, w1.grad[0][0])
print('训练耗时：',time()-a)


In [25]:
# 输出模型
model = nn.Sequential(
    nn.Linear(D_in, H), #D_in -> H 的线性变换。另：与之前不同的地方在于bias=True
    nn.ReLU(),
    nn.Linear(H, D_out),
).to(device)

print(model)
print(model[0])
print(model[0].weight.shape, model[0].bias.shape)
model[0].weight.mean(), model[0].weight.std() #发现这里随机初始化的weight都是非常小的数（应该是想近似0）

Sequential(
  (0): Linear(in_features=1000, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=10, bias=True)
)
Linear(in_features=1000, out_features=100, bias=True)
torch.Size([100, 1000]) torch.Size([100])


(tensor(7.7564e-05, grad_fn=<MeanBackward0>),
 tensor(0.0183, grad_fn=<StdBackward0>))

#### 进一步改进optimizer
定义好优化器把grad清零和更新都搞定

这里又发现：
+ Adam，用正态分布ini反而很慢？？？t，不如直接不init（也就是上面看到的近似0的model中的init）
+ SGD，用正态分布init更快

In [19]:
import torch
import torch.nn as nn
from time import time
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

N, D_in, H, D_out = 64, 1000, 100, 10
X = torch.randn(N, D_in, device=device) #放GPU上，本机更慢，可能是交互的原因；
y = torch.randn(N, D_out, device=device)

model = nn.Sequential(
    nn.Linear(D_in, H, bias=True), #D_in -> H 的线性变换。另：与之前不同的地方在于默认bias=True
    nn.ReLU(),
    nn.Linear(H, D_out, bias=True),
).to(device)
# torch.nn.init.normal_(model[0].weight)
# torch.nn.init.normal_(model[2].weight)

loss_fn = nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) #Adma一般设-3 -4，高级优化器一般刚开始调大一点，后面会自动优化
EPOCH = 5000

a = time()
for i in range(EPOCH):
    # Forword pass
    y_pred = model(X)
    
    # Compute loss（MSE）
    loss = loss_fn(y_pred, y)
    
    # Backward pass,compute gradient；
    optimizer.zero_grad()
    loss.backward()
    
    # update parameters
    optimizer.step()
    if loss<1e-3:
        print(i, loss.item())
        break
    if i%100==0:
        print(loss)

print('训练耗时：',time()-a)

tensor(743.4058, grad_fn=<MseLossBackward>)
tensor(0.0064, grad_fn=<MseLossBackward>)
117 0.0009817966492846608
训练耗时： 0.42057013511657715


### 5.自定义网络 继承nn.Module
上面定义的model只是sequential模型，更复杂的就需要自己定义一个class，步骤：
+ 定义class
    1. architecture
    2. forward
+ model = Model(param)，后面不需要调用forward，直接y_pred = model(X)
我手残，在self.linear1定义完打了逗号，这样python会把他和self.linear2看成一个tuple。。。

In [15]:
import torch
import torch.nn as nn
from time import time
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

N, D_in, H, D_out = 64, 1000, 100, 10
X = torch.randn(N, D_in, device=device) #放GPU上，本机更慢，可能是交互的原因；
y = torch.randn(N, D_out, device=device)

class TwoLayerNN(nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNN, self).__init__()
        # define the model architecture
        self.linear1 = nn.Linear(D_in, H, bias=True)
        self.linear2 = nn.Linear(H, D_out, bias=True)
        
    def forward(self, x):
        h_relu = self.linear1(x).clamp(min=0)        
        y_pred = self.linear2(h_relu)
        return y_pred

model = TwoLayerNN(D_in, H, D_out)
loss_fn = nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) #Adma一般设-3 -4，高级优化器一般刚开始调大一点，后面会自动优化
EPOCH = 5000

a = time()
for i in range(EPOCH):
    # Forword pass
    y_pred = model(X)
    
    # Compute loss（MSE）
    loss = loss_fn(y_pred, y)
    
    # Backward pass,compute gradient；
    optimizer.zero_grad()
    loss.backward()
    
    # update parameters
    optimizer.step()
    if loss<1e-3:
        print(i, loss.item())
        break
    if i%100==0:
        print(loss, w1.grad[0][0])

print('训练耗时：',time()-a)

NameError: name 'w1' is not defined