In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import torch
import torch.optim as optim
import torch.nn as nn
from torchviz import make_dot


In [2]:
# Ground-truth parameters of the synthetic line y = true_b + true_w * x.
true_b = 1
true_w = 2
# Number of data points to generate.
N = 100

# Seed NumPy's global RNG so the feature and noise draws are repeatable.
np.random.seed(42)
# Features: N uniform draws from np.random.rand, stored as an (N, 1) column.
x = np.random.rand(N, 1)
print(x.shape)
# Gaussian noise, scaled by -0.1.
epsilon = np.random.randn(N, 1) * -0.1
# Noisy targets built from the true line plus the noise term.
y = true_b + true_w * x + epsilon

(100, 1)


In [3]:
# Shuffle the row indices in place, then cut them 80/20 into train/validation.
idx = np.arange(N)
np.random.shuffle(idx)
train_idx, val_idx = idx[:int(N * 0.8)], idx[int(N * 0.8):]
# Index features and targets with the same positions so (x, y) pairs stay aligned.
x_train, x_val = x[train_idx], x[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

In [4]:
# Initialise the intercept b and the slope w with draws from np.random.randn.
np.random.seed(42)
b = np.random.randn(1)
w = np.random.randn(1)
# Learning rate (step size) used by every update.
lr = 0.03
# Number of passes over the training set.
n_epochs = 1000
print('b, w after initial')
print(b, w)
# Batch gradient descent: each update uses all training points at once.
for _ in range(n_epochs):
    # Forward pass: predictions of the current line.
    yhat = w * x_train + b
    # Residuals between predictions and targets.
    error = yhat - y_train
    # Mean squared error (computed for reference; the update below does not read it).
    loss = np.mean(error ** 2)
    # Gradients of the MSE with respect to b and w.
    b_grad = 2 * np.mean(error)
    w_grad = 2 * np.mean(x_train * error)
    # Step each parameter against its gradient.
    b = b - lr * b_grad
    w = w - lr * w_grad
print('b, w after update')
print(b, w)


b, w after initial
[0.49671415] [-0.1382643]
b, w after update
[0.9847612] [2.01479102]


In [5]:
# Cross-check: fit the same training data with scikit-learn's LinearRegression
# and print its intercept and slope next to the gradient-descent estimates.
linear = LinearRegression().fit(x_train, y_train)
print(linear.intercept_, linear.coef_[0])

[0.97645925] [2.03103553]


In [6]:
# Tensors of increasing rank: 0-d scalar, 1-d vector, 2-d matrix, 3-d tensor.
scaler = torch.tensor(9.0)
vector = torch.tensor([2, 5, 6])
matrix = torch.ones((2, 3), dtype=torch.float32)
tensor = torch.randn((2, 3, 4), dtype=torch.float32)
# One print call with a newline separator shows each tensor on its own line(s).
print(scaler, vector, matrix, tensor, sep='\n')

tensor(9.)
tensor([2, 5, 6])
tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor([[[ 0.2567, -2.3657, -0.2604, -1.5373],
         [-0.1766,  1.2146, -0.2465,  0.0507],
         [-0.8677, -2.3320, -0.0992,  0.3452]],

        [[ 0.6445, -0.4182,  0.1710,  0.2919],
         [ 1.5615, -0.4156, -0.7768,  0.1102],
         [-1.5864,  0.5006, -0.2633,  1.1923]]])


In [7]:
# .size() and .shape are printed side by side; both report the same torch.Size.
print(tensor.size(), tensor.shape)

torch.Size([2, 3, 4]) torch.Size([2, 3, 4])


In [8]:
# For the 0-d tensor both .size() and .shape report an empty torch.Size.
print(scaler.size(), scaler.shape)

torch.Size([]) torch.Size([])


In [9]:
# view() returns a tensor over the same data as `matrix`, so writing through
# the reshaped view also changes `matrix` (compare the two printouts).
same_x = matrix.view(1, -1)
same_x[0, 2] = 300
print(same_x, matrix, sep='\n')

tensor([[  1.,   1., 300.,   1.,   1.,   1.]])
tensor([[  1.,   1., 300.],
        [  1.,   1.,   1.]])


In [10]:
# clone() makes an independent copy of the reshaped data, so the edit below
# shows up only in `diff_matrix`; detach() keeps the copy out of any autograd graph.
diff_matrix = matrix.view(1, 6).detach().clone()
diff_matrix[0, 1] = 400
print(matrix, diff_matrix, sep='\n')

tensor([[  1.,   1., 300.],
        [  1.,   1.,   1.]])
tensor([[  1., 400., 300.,   1.,   1.,   1.]])


In [11]:
# Wrap the NumPy training features as a tensor; the dtype carries over
# (float64 array -> float64 tensor), as the displayed pair shows.
x_train_tensor = torch.as_tensor(x_train)
x_train_tensor.dtype, x_train.dtype

(torch.float64, dtype('float64'))

In [12]:
# Cast the training tensor to 32-bit floats and display the resulting dtype.
float_tensor = x_train_tensor.to(torch.float32)
float_tensor.dtype

torch.float32

In [13]:
# Pick the compute device. PyTorch's device string for NVIDIA GPUs is 'cuda';
# 'gpu' is not a recognised device type, so it would make every later
# .to(device) / device=device call raise as soon as CUDA is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [14]:
# Despite its name, this tensor is placed on whichever `device` was selected.
cpu_tensor = torch.as_tensor(x_train, device=device)
# Display the first row.
cpu_tensor[0]


tensor([0.7713], dtype=torch.float64)

In [15]:
# Convert the NumPy training arrays to float32 tensors and place them on `device`.
x_train_tensor = torch.as_tensor(x_train).to(torch.float32).to(device)
y_train_tensor = torch.as_tensor(y_train).to(torch.float32).to(device)
# Show the Python type of the array, of the tensor, and the tensor's torch type string.
print(type(x_train), type(x_train_tensor), x_train_tensor.type())

<class 'numpy.ndarray'> <class 'torch.Tensor'> torch.FloatTensor


In [16]:
# Move the tensor to the CPU, then convert it to a NumPy array.
back_to_numpy = x_train_tensor.cpu().numpy()

In [17]:
# Create the trainable parameters b and w directly on the target device, with
# gradient tracking switched on at construction time.
torch.manual_seed(42)
b = torch.randn(1, dtype=torch.float, device=device, requires_grad=True)
w = torch.randn(1, dtype=torch.float, device=device, requires_grad=True)
print(b, w)

tensor([0.3367], requires_grad=True) tensor([0.1288], requires_grad=True)


In [18]:
# Forward pass and residuals.
yhat = w * x_train_tensor + b
error = yhat - y_train_tensor
# Mean squared error, then backpropagate to populate b.grad and w.grad.
loss = error.pow(2).mean()
loss.backward()

In [19]:
# Tensors computed from b or w (yhat, error) and the parameters themselves track gradients ...
print(yhat.requires_grad, error.requires_grad, b.requires_grad, w.requires_grad)
# ... while the plain data tensors do not.
print(x_train_tensor.requires_grad, y_train_tensor.requires_grad)

True True True True
False False


In [20]:
# Gradients of the loss with respect to each parameter, as stored on .grad.
print(b.grad)
print(w.grad)

tensor([-3.0770])
tensor([-1.8105])


In [21]:
# zero_() resets each gradient in place and returns it, so the prints show zeros.
print(b.grad.zero_())
print(w.grad.zero_())

tensor([0.])
tensor([0.])


In [22]:
# Gradient descent where autograd computes the gradients and the parameter
# update is written by hand.
lr = 0.1
torch.manual_seed(42)
b = torch.randn(1, requires_grad=True, dtype=torch.float32, device=device)
w = torch.randn(1, requires_grad=True, dtype=torch.float32, device=device)
epochs = 1000
for _ in range(epochs):
    # Forward pass and MSE loss. No graph visualisation is built here: calling
    # make_dot(yhat) on every epoch only produced a rendering that was thrown
    # away; to inspect the graph, call it once outside the loop instead.
    yhat = b + w * x_train_tensor
    error = yhat - y_train_tensor
    loss = (error ** 2).mean()
    # Backpropagate to fill w.grad and b.grad.
    loss.backward()
    # Update in place with gradient tracking disabled so the step itself is
    # not recorded in the autograd graph.
    with torch.no_grad():
        w -= lr * w.grad
        b -= lr * b.grad
    # Gradients accumulate across backward() calls, so clear them each epoch.
    w.grad.zero_()
    b.grad.zero_()
print(b, w)

tensor([0.9765], requires_grad=True) tensor([2.0310], requires_grad=True)


In [23]:
# Same regression, with torch.optim.SGD performing the parameter updates.
lr = 0.1
epochs = 1000
# Seed before drawing the initial parameters so the run is reproducible on a
# fresh kernel, matching the other training cells.
torch.manual_seed(42)
b = torch.randn(1, requires_grad=True, device=device, dtype=torch.float32)
w = torch.randn(1, requires_grad=True, device=device, dtype=torch.float32)
optimizer = torch.optim.SGD([b, w], lr=lr)
for _ in range(epochs):
    # Forward pass and MSE loss.
    yhat = b + w * x_train_tensor
    error = yhat - y_train_tensor
    loss = (error ** 2).mean()
    # Backpropagate, apply one SGD step, then clear the accumulated gradients.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
print(b, w)

tensor([0.9765], requires_grad=True) tensor([2.0310], requires_grad=True)


In [24]:
# Same regression, with nn.MSELoss computing the loss and SGD doing the updates.
# Seed before drawing the initial parameters so the run is reproducible on a
# fresh kernel, matching the other training cells.
torch.manual_seed(42)
b = torch.randn(1, requires_grad=True, dtype=torch.float32, device=device)
w = torch.randn(1, requires_grad=True, dtype=torch.float32, device=device)
lr = 0.1
optimizer = torch.optim.SGD([b, w], lr=lr)
loss_fn = nn.MSELoss(reduction='mean')
epochs = 1000
for _ in range(epochs):
    # Forward pass and loss.
    yhat = b + w * x_train_tensor
    loss = loss_fn(yhat, y_train_tensor)
    # Backpropagate, apply one SGD step, then clear the accumulated gradients.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
print(b, w)
# Three ways to pull the final loss out of the graph: as a NumPy value,
# as a Python float via item(), and via tolist().
print(loss.detach().cpu().numpy())
print(loss.item(), loss.tolist())

tensor([0.9765], requires_grad=True) tensor([2.0310], requires_grad=True)
0.008044658
0.008044658228754997 0.008044658228754997


In [44]:
class ManualLinearRegression(nn.Module):
    """Single-feature linear model ``y = b + w * x`` with learnable ``b`` and ``w``.

    Args:
        device: Optional device on which to create the parameters. When left
            as ``None`` the parameters are created on PyTorch's default device
            and can be moved afterwards with ``.to(...)``. Taking the device as
            an argument keeps the class independent of any notebook global.
    """

    def __init__(self, device=None):
        super().__init__()
        # nn.Parameter registers the tensor with the module (so it appears in
        # parameters() and state_dict()) and enables gradient tracking itself,
        # which makes an explicit requires_grad=True on the tensor redundant.
        self.b = nn.Parameter(torch.randn(1, dtype=torch.float32, device=device))
        self.w = nn.Parameter(torch.randn(1, dtype=torch.float32, device=device))

    def forward(self, x):
        """Return the model's prediction ``b + w * x`` for input tensor ``x``."""
        return self.b + self.w * x
# Train the nn.Module version of the model with MSE loss and SGD.
torch.manual_seed(42)
model = ManualLinearRegression().to(device)
lr = 0.1
loss_fn = nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
epochs = 1000
# Put the model in training mode once; nothing in the loop switches it back.
model.train()
for _ in range(epochs):
    # Call the module itself rather than model.forward(...): __call__ runs
    # forward() together with any registered hooks.
    predictions = model(x_train_tensor)
    loss = loss_fn(predictions, y_train_tensor)
    # Backpropagate, apply one SGD step, then clear the accumulated gradients.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
# Display the learned parameters.
model.state_dict()

OrderedDict([('b', tensor([0.9765])), ('w', tensor([2.0310]))])