<a href="https://colab.research.google.com/github/probml/pyprobml/blob/master/book1/mlp/pytorch_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Introduction to PyTorch

We show some simple examples of how to use PyTorch. 
See the [official tutorials](https://pytorch.org/tutorials) for more info.

**Make sure you select 'GPU' from the 'Runtime' tab at the top of this page.**



In [1]:
import sklearn
import scipy
import scipy.optimize
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import itertools
import time
from functools import partial

import os

import numpy as np
import numpy as onp # original numpy (not hidden by jax)
from scipy.special import logsumexp
np.set_printoptions(precision=3)




In [2]:


import torch
import torchvision
# Report the installed PyTorch build and, when present, the GPU it will use.
print("torch version {}".format(torch.__version__))
if not torch.cuda.is_available():
  print("Torch cannot find GPU")
else:
  print(torch.cuda.get_device_name(0))
  print("current device {}".format(torch.cuda.current_device()))

def set_seed(seed):
  """Seed the NumPy global RNG and the PyTorch CPU / CUDA RNGs with `seed`.

  Args:
    seed: integer seed passed unchanged to every generator.
  """
  # Same order as before: numpy first, then torch CPU, then every CUDA device.
  for seeder in (onp.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seeder(seed)

# Pick the first GPU when CUDA is usable, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
#torch.backends.cudnn.benchmark = True

torch version 1.7.0+cu101
Tesla T4
current device 0


# Automatic differentiation <a class="anchor" id="AD"></a>



In [None]:
## Compute gradient of loss "by hand" using numpy


def BCE_with_logits(logits, targets):
    """Mean binary cross-entropy computed directly from logits.

    Uses log p(y=1) = -log(1 + exp(-a)) and log p(y=0) = -log(1 + exp(a)),
    both evaluated with logsumexp so large |a| does not overflow.

    Args:
        logits: array-like with N entries, shape (N,) or (N, 1).
        targets: array-like of N binary labels, shape (N,) or (N, 1).

    Returns:
        Scalar: the negative log-likelihood averaged over the N examples.

    Raises:
        ValueError: if `targets` does not contain exactly N entries.
    """
    logits = np.asarray(logits)
    N = logits.shape[0]
    logits = logits.reshape(N, 1)
    # Flatten targets to (N,). A column vector (N, 1) would otherwise broadcast
    # against the (N,) log-probabilities into an (N, N) matrix and silently
    # produce a wrong loss.
    targets = np.asarray(targets).reshape(N)
    logits_plus = np.hstack([np.zeros((N,1)), logits]) # e^0=1
    logits_minus = np.hstack([np.zeros((N,1)), -logits])
    logp1 = -logsumexp(logits_minus, axis=1)
    logp0 = -logsumexp(logits_plus, axis=1)
    logprobs = logp1 * targets + logp0 * (1-targets)
    return -np.sum(logprobs)/N

# Compute using numpy
def sigmoid(x):
    """Logistic function, evaluated via the identity sigma(x) = (tanh(x/2) + 1) / 2."""
    shifted_tanh = np.tanh(x / 2.) + 1
    return 0.5 * shifted_tanh

def predict_logit(weights, inputs):
    """Return the linear scores `inputs . weights` for a whole batch at once."""
    # A single dot product handles every row of `inputs`; no per-example loop needed.
    scores = np.dot(inputs, weights)
    return scores

def predict_prob(weights, inputs):
    """Return sigmoid(inputs . weights), the predicted probability of class 1."""
    logits = predict_logit(weights, inputs)
    return sigmoid(logits)

def NLL(weights, batch):
    """Average negative log-likelihood of a logistic-regression model on a batch.

    Args:
        weights: parameter vector fed to `predict_logit`.
        batch: pair `(X, y)` of inputs and binary labels.

    Returns:
        The value of `BCE_with_logits` on the batch logits and labels.
    """
    features, labels = batch
    return BCE_with_logits(predict_logit(weights, features), labels)

def NLL_grad(weights, batch):
    """Closed-form gradient of the average NLL with respect to `weights`.

    The gradient is (1/N) * sum_i (mu_i - y_i) * x_i, i.e. X^T (mu - y) / N,
    where mu = sigmoid(X w).

    Args:
        weights: parameter vector fed to `predict_prob`.
        batch: pair `(X, y)`; `X.shape[0]` is used as the number of examples N,
            and `y` is expected to line up elementwise with the predictions.

    Returns:
        Gradient array with one entry per feature (column of `X`).
    """
    X, y = batch
    N = X.shape[0]
    mu = predict_prob(weights, X)
    # X^T (mu - y) weights each row of X by its residual and sums them.
    # This is the same quantity as summing diag(mu - y) @ X over rows, but
    # without materialising a dense N x N diagonal matrix.
    g = np.dot(X.T, mu - y)/N
    return g

# Build a small synthetic logistic-regression problem and evaluate the
# hand-written loss and gradient on it.
# The order of the RNG calls below determines the sampled values.
np.random.seed(0)
N = 100
D = 5
X = np.random.randn(N, D)
# Parameters are scaled by 10, giving large-magnitude logits.
w = 10*np.random.randn(D)
mu = predict_prob(w, X)
# Sample binary labels from the model's own predicted probabilities.
y = np.random.binomial(n=1, p=mu, size=N)

# Evaluate on the same data that was just generated (no separate test split).
X_test = X
y_test = y

y_pred = predict_prob(w, X_test)
loss = NLL(w, (X_test, y_test))
# Analytic gradient computed with numpy.
grad_np = NLL_grad(w, (X_test, y_test))
print("params {}".format(w))
#print("pred {}".format(y_pred))
print("loss {}".format(loss))
print("grad {}".format(grad_np))

params [ 3.827 -0.342 10.963 -2.342 -3.475]
loss 0.05501843790657687
grad [-0.014  0.003  0.008  0.008  0.014]




With PyTorch, we just compute the objective, call `backward()` on it, and then look up `variable.grad`. However, we have to set `requires_grad=True` on the variable before computing the objective, so that PyTorch knows to record the operations applied to it on its tape.

In [None]:
# Copy the numpy parameters and data into float32 tensors on the chosen device.
w_torch = torch.Tensor(w.reshape(D, 1)).to(device)
# Gradient tracking has to be enabled before the forward pass is run.
w_torch.requires_grad_()
x_test_tensor = torch.Tensor(X_test).to(device)
y_test_tensor = torch.Tensor(y_test).to(device)

# Forward pass: (N, D) @ (D, 1) -> (N, 1), then drop the trailing axis.
y_pred = torch.sigmoid(x_test_tensor @ w_torch).squeeze(1)
criterion = torch.nn.BCELoss(reduction='mean')
loss_torch = criterion(y_pred, y_test_tensor)

# Backward pass fills in w_torch.grad; compare it with the numpy gradient.
loss_torch.backward()
grad_torch = w_torch.grad[:,0].cpu().numpy()
assert np.allclose(grad_np, grad_torch)

print("params {}".format(w_torch))
#print("pred {}".format(y_pred))
print("loss {}".format(loss_torch))
print("grad {}".format(grad_torch))

params tensor([[ 3.8273],
        [-0.3424],
        [10.9635],
        [-2.3422],
        [-3.4745]], device='cuda:0', requires_grad=True)
loss 0.055018432438373566
grad [-0.014  0.003  0.008  0.008  0.014]


# DataLoader

In [9]:
# Put data into PyTorch format.
import torch
from torch.utils.data import DataLoader, TensorDataset

# Synthetic regression data: 40 examples with 10 features each.
N = 40
D = 10
np.random.seed(0)
X_train = np.random.randn(N, D)
y_train = np.random.randn(N)

x_train_tensor = torch.Tensor(X_train)
print(y_train.shape)
# Targets become a column vector, so each batch of labels has shape (batch, 1).
yy = y_train.reshape(N, 1)
print(yy.shape)
y_train_tensor = torch.Tensor(yy)
dataset = TensorDataset(x_train_tensor, y_train_tensor)

# 40 examples in batches of 7 give five full batches plus a final batch of 5.
dataloader = DataLoader(dataset, batch_size=7, shuffle=False)
print('{} examples divided into {} batches of size {}'.format(
    len(dataloader.dataset), len(dataloader), dataloader.batch_size))

# Walk the loader once and show the shape of every minibatch.
for i_batch, batch in enumerate(dataloader):
  X, y = batch
  print(X.shape)
  print(y.shape)
 

(40,)
(40, 1)
40 examples divided into 6 batches of size 7
torch.Size([7, 10])
torch.Size([7, 1])
torch.Size([7, 10])
torch.Size([7, 1])
torch.Size([7, 10])
torch.Size([7, 1])
torch.Size([7, 10])
torch.Size([7, 1])
torch.Size([7, 10])
torch.Size([7, 1])
torch.Size([5, 10])
torch.Size([5, 1])


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

# Download MNIST (training split) and wrap it in a shuffling DataLoader.
mnist_transform = transforms.Compose([
    transforms.ToTensor(), # first, convert image to PyTorch tensor
    transforms.Normalize((0.1307,), (0.3081,)) # normalize inputs
])
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../mnist_data',
                   download=True,
                   train=True,
                   transform=mnist_transform),
    batch_size=100,
    shuffle=True)


# Peek at the first five minibatches (batch_id 0..4).
# The DataLoader already yields plain tensors; the deprecated
# torch.autograd.Variable wrapper is no longer needed, so it is dropped.
for batch_id, (data, label) in enumerate(train_loader):
    target = label  # kept bound under this name, as in the original cell
    print(batch_id)
    print(data.shape)
    if batch_id > 3: break

0
torch.Size([100, 1, 28, 28])
1
torch.Size([100, 1, 28, 28])
2
torch.Size([100, 1, 28, 28])
3
torch.Size([100, 1, 28, 28])
4
torch.Size([100, 1, 28, 28])


# Batch optimization for logistic regression

We will use L-BFGS from PyTorch (`torch.optim.LBFGS`) to fit a logistic regression model, and compare the result to sklearn.

In [14]:
# Fit the model to a dataset, so we have an "interesting" parameter vector to use.

import sklearn.datasets
from sklearn.model_selection import train_test_split

# Load iris and turn it into a binary problem: Virginica (class 2) vs. the rest.
iris = sklearn.datasets.load_iris()
X = iris["data"]
# Use the builtin `int`: the `np.int` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError. It was only ever an alias
# for the builtin, so the resulting dtype is unchanged.
y = (iris["target"] == 2).astype(int)  # 1 if Iris-Virginica, else 0
N, D = X.shape # 150, 4


# Hold out a third of the data; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

from sklearn.linear_model import LogisticRegression

# A very large C makes the regularization penalty negligible, so the fit
# approximates the unregularized MLE.
# The intercept is left out to keep the comparison with the PyTorch model simple.
log_reg = LogisticRegression(solver="lbfgs", C=1e5, fit_intercept=False).fit(X_train, y_train)
w_mle_sklearn = log_reg.coef_.ravel()  # flatten the coefficient array to 1-D
print(w_mle_sklearn)

[-4.414 -9.111  6.539 12.686]


In [15]:
# Convert the training split into float tensors for PyTorch.

N, D = X_train.shape
x_train_tensor = torch.Tensor(X_train)
# Labels become a column vector, matching the (N, 1) output of a 1-unit linear layer.
yy = y_train.reshape(N, 1)
y_train_tensor = torch.Tensor(yy)


In [16]:
# Define model and loss.

class Model(torch.nn.Module):
    """Bias-free logistic regression: sigmoid of a single linear unit.

    Args:
        in_features: number of input features. When omitted, the notebook-level
            global `D` is used, so the existing `Model()` call keeps working.
    """
    def __init__(self, in_features=None):
        super(Model, self).__init__()
        if in_features is None:
            in_features = D  # fall back to the global feature count
        self.linear = torch.nn.Linear(in_features, 1, bias=False)

    def forward(self, x):
        """Return the predicted probability of class 1, one column per input row."""
        y_pred = torch.sigmoid(self.linear(x))
        return y_pred
    
# Seed the RNGs before constructing the model so its initial weights are reproducible.
set_seed(0)
model = Model() 
# The model's forward() already applies a sigmoid, so the loss takes probabilities.
criterion = torch.nn.BCELoss(reduction='mean')

In [17]:
# Full-batch fit of the logistic regression model with L-BFGS.
optimizer = torch.optim.LBFGS(model.parameters(), history_size=10)
    
def closure():
    """Recompute the full-batch loss and its gradients for the optimizer.

    Passed to `optimizer.step`; clears old gradients, runs the forward pass on
    the whole training set, backpropagates, and returns the loss tensor.
    """
    optimizer.zero_grad()
    y_pred = model(x_train_tensor)
    loss = criterion(y_pred, y_train_tensor)
    #print('loss:', loss.item())
    loss.backward()
    return loss

# Run a fixed number of outer optimizer steps; `loss` holds the value returned by the last one.
max_iter = 10
for i in range(max_iter):
    loss = optimizer.step(closure)

# The model has a single parameter: the (1, D) weight of the bias-free linear layer.
params = list(model.parameters())
w_torch_bfgs = params[0][0].detach().numpy() #(D,) vector
print("parameters from sklearn {}".format(w_mle_sklearn))
print("parameters from torch-bfgs {}".format(w_torch_bfgs))

parameters from sklearn [-4.414 -9.111  6.539 12.686]
parameters from torch-bfgs [-4.415 -9.114  6.54  12.691]
