## 自作活性化関数

活性化関数reluを自作して組み込む

In [1]:
import torch
from torch.autograd import Variable

#自作ReLUクラスの定義
class MyReLU(torch.autograd.Function):
    
    #forwardの活性化関数とbackwardの計算のみ記述すれば良い
    def forward(self, input):
        
        #値の記憶
        self.save_for_backward(input)
        
        #ReLUの定義部分
        #x.clamp(min=0) <=> max(x, 0)
        return input.clamp(min=0)

    #backpropagationの記述
    #勾配情報を返せば良い
    def backward(self, grad_output):

        #記憶したTensorの呼び出し
        input, = self.saved_tensors
        
        #参照渡しにならないようコピー
        grad_input = grad_output.clone()
        
        #input<0 => 0  else input
        grad_input[input < 0] = 0
        return grad_input

#dtypeの明示：gpuでは不要
dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

#x, yはダミーデータ。値は不変なのでrequires_grad=Falseとして勾配更新を行わない
#.type(dtype)でtypeをFloatTensorに変更
# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Construct an instance of our MyReLU class to use in our network
    relu = MyReLU()

    # Forward pass: compute predicted y using operations on Variables; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.data[0])

    # backwordの定義をしたことで、.backword()で誤差逆伝播される
    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()

(0, 35607508.0)
(1, 33786520.0)
(2, 38106872.0)
(3, 40630612.0)
(4, 35113508.0)
(5, 22701976.0)
(6, 11372140.0)
(7, 5106512.5)
(8, 2488791.0)
(9, 1466749.625)
(10, 1027165.8125)
(11, 798103.375)
(12, 652784.1875)
(13, 547596.9375)
(14, 465699.6875)
(15, 399460.78125)
(16, 344915.34375)
(17, 299438.25)
(18, 261203.703125)
(19, 228839.421875)
(20, 201336.609375)
(21, 177820.671875)
(22, 157591.546875)
(23, 140135.0)
(24, 124991.828125)
(25, 111814.8046875)
(26, 100304.390625)
(27, 90209.578125)
(28, 81324.21875)
(29, 73480.0546875)
(30, 66540.6640625)
(31, 60377.44140625)
(32, 54890.4921875)
(33, 49994.25)
(34, 45620.7109375)
(35, 41699.91796875)
(36, 38179.19921875)
(37, 35007.72265625)
(38, 32146.546875)
(39, 29557.373046875)
(40, 27211.181640625)
(41, 25081.1328125)
(42, 23144.708984375)
(43, 21380.4453125)
(44, 19772.072265625)
(45, 18303.18359375)
(46, 16959.3671875)
(47, 15728.7275390625)
(48, 14600.7958984375)
(49, 13564.828125)
(50, 12612.8095703125)
(51, 11737.228515625)
(52, 10

pytorchの組み込み関数を利用すると下のように書ける

In [2]:
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(size_average=False)

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algoriths. The first argument to the Adam constructor tells the
# optimizer which Variables it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.data[0])

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable weights
    # of the model)
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

(0, 735.7218627929688)
(1, 717.9724731445312)
(2, 700.707763671875)
(3, 683.9345703125)
(4, 667.6505737304688)
(5, 651.7515869140625)
(6, 636.2648315429688)
(7, 621.271484375)
(8, 606.6609497070312)
(9, 592.4224853515625)
(10, 578.5313720703125)
(11, 565.0155639648438)
(12, 551.8464965820312)
(13, 539.0783081054688)
(14, 526.6420288085938)
(15, 514.5726928710938)
(16, 502.8233337402344)
(17, 491.3594665527344)
(18, 480.2715759277344)
(19, 469.4911193847656)
(20, 459.0304870605469)
(21, 448.85211181640625)
(22, 438.9756774902344)
(23, 429.3240661621094)
(24, 419.8954772949219)
(25, 410.70684814453125)
(26, 401.758056640625)
(27, 393.0690612792969)
(28, 384.5974426269531)
(29, 376.336181640625)
(30, 368.238037109375)
(31, 360.315185546875)
(32, 352.5635070800781)
(33, 344.94683837890625)
(34, 337.47174072265625)
(35, 330.1490478515625)
(36, 322.9789733886719)
(37, 315.9537048339844)
(38, 309.0765686035156)
(39, 302.3336486816406)
(40, 295.72760009765625)
(41, 289.2817687988281)
(42, 282.

## 自作部分を盛り込んだ2層のネットワーク構造

In [3]:
import torch
from torch.autograd import Variable


class TwoLayerNet(torch.nn.Module):
         
    #__init__に引数をとることで、外部からモデルレイヤーの層数の定義が可能
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    #入出力はVariableにより行われる。よって、Variable内の関数を用いることで内部的に自由に演算ができる
    def forward(self, x):
        """
        In the forward function we accept a Variable of input data and we must return
        a Variable of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Variables.
        """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred

#層の入出力サイズの定義
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# 入力データ作成
# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# ネットワーク定義
# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Loss関数・optimizerの設定
# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

#トレーニング
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.data[0])

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

(0, 673.3890991210938)
(1, 626.3378295898438)
(2, 585.1387329101562)
(3, 548.5601196289062)
(4, 516.125)
(5, 486.9334411621094)
(6, 460.1074523925781)
(7, 435.20806884765625)
(8, 411.993896484375)
(9, 390.364990234375)
(10, 369.9311218261719)
(11, 350.693603515625)
(12, 332.644775390625)
(13, 315.6683044433594)
(14, 299.4958801269531)
(15, 284.13885498046875)
(16, 269.4665832519531)
(17, 255.46742248535156)
(18, 242.08212280273438)
(19, 229.3905487060547)
(20, 217.2423553466797)
(21, 205.62908935546875)
(22, 194.5345458984375)
(23, 183.86534118652344)
(24, 173.65843200683594)
(25, 163.9386444091797)
(26, 154.6805419921875)
(27, 145.88661193847656)
(28, 137.50808715820312)
(29, 129.55323791503906)
(30, 122.01587677001953)
(31, 114.8779296875)
(32, 108.1331558227539)
(33, 101.77472686767578)
(34, 95.77304077148438)
(35, 90.10315704345703)
(36, 84.76476287841797)
(37, 79.74002838134766)
(38, 75.00753021240234)
(39, 70.55945587158203)
(40, 66.36567687988281)
(41, 62.424842834472656)
(42, 5

## ダイナミックなネットワーク構造

ループごとに構造を変化させるようなネットワークの構築

In [1]:
import random
import torch
from torch.autograd import Variable


class DynamicNet(torch.nn.Module):
    
    #層の定義
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we construct three nn.Linear instances that we will use
        in the forward pass.
        """
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    #フォワードステップではランダムに中間層を0~3に変更する
    #middle_linearはHxHなので層が変化しても問題ない
    #同じモジュールを繰り返し使ってもOK
    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 0, 1, 2, or 3
        and reuse the middle_linear Module that many times to compute hidden layer
        representations.

        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same Module many
        times when defining a computational graph. This is a big improvement from Lua
        Torch, where each Module could be used only once.
        """
        h_relu = self.input_linear(x).clamp(min=0)
        for _ in range(random.randint(0, 3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.data[0])

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

(0, 621.9844970703125)
(1, 664.5276489257812)
(2, 614.6412963867188)
(3, 623.8514404296875)
(4, 471.3734130859375)
(5, 618.7210083007812)
(6, 616.1188354492188)
(7, 600.1791381835938)
(8, 285.7338562011719)
(9, 606.0966796875)
(10, 226.6687469482422)
(11, 194.56753540039062)
(12, 560.6582641601562)
(13, 613.7176513671875)
(14, 586.7184448242188)
(15, 610.3147583007812)
(16, 607.3052978515625)
(17, 563.4048461914062)
(18, 551.1160278320312)
(19, 460.69970703125)
(20, 86.80523681640625)
(21, 575.3087158203125)
(22, 83.90589141845703)
(23, 74.13724517822266)
(24, 458.6404113769531)
(25, 48.97284698486328)
(26, 425.5200500488281)
(27, 31.413944244384766)
(28, 489.8101501464844)
(29, 304.1451721191406)
(30, 338.4856872558594)
(31, 24.448328018188477)
(32, 24.79906463623047)
(33, 265.9589538574219)
(34, 337.88983154296875)
(35, 200.704345703125)
(36, 201.17539978027344)
(37, 31.095848083496094)
(38, 233.8021697998047)
(39, 34.362056732177734)
(40, 173.941162109375)
(41, 129.57888793945312)
(

In [107]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss()

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Forward pass: compute predicted y by passing x to the model.
y_pred = model(x)

# Compute and print loss.
loss = loss_fn(y_pred, y)
print(t, loss.data[0])

# Before the backward pass, use the optimizer object to zero all of the
# gradients for the variables it will update (which are the learnable weights
# of the model)
optimizer.zero_grad()

# Backward pass: compute gradient of the loss with respect to model
# parameters
loss.backward()

# Calling the step function on an Optimizer makes an update to its
# parameters
optimizer.step()

(499, 0.976788341999054)


In [112]:
print (y.size())
print (y_pred.size())

torch.Size([64, 10])
torch.Size([64, 10])


In [3]:
def lossCustomized(output, target):
    #バッチサイズの指定
    batch_size = prediction.size(0)
    
    #1バッチごとのロス関数の定義
    gt = torch.ones(batch_size).type_as(prediction)
    
    #平均とる
    loss = (gt - prediction).sum() / float(batch_size)
    return loss

loss = lossOne(X, y)

TypeError: type() takes exactly 2 arguments (1 given)

In [None]:
def lossOne(prediction):
    batch_size = prediction.size(0)
    gt = torch.ones(batch_size).type_as(prediction)
    loss = (gt - prediction).sum() / float(batch_size)
    return loss