In [None]:
import pandas as pd
import numpy as np
import os
# List every file that Kaggle mounted under the input directory, one full
# path per line, in the order os.walk visits them.
for path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

# Simple Linear Regression in PyTorch
Here we demonstrate how to use PyTorch's automatic differentiation features.

`theta` is our variable containing learnable parameters for the regression coefficients. Notice that when we instantiate `theta`, we set `requires_grad` to `True`. This signifies that we want to keep track of all the gradients of these parameters.

In [None]:
# Seed the global RNG so the synthetic data, the initial weights and the
# printed losses are the same on every Restart & Run All.
torch.manual_seed(0)

# Synthetic, noise-free regression problem: y is an exact linear function of X.
X = torch.rand(1000, 2)
w_true = 10 * torch.rand(2, 1)
y = X @ w_true

# Learnable regression coefficients. requires_grad=True tells autograd to
# record operations on theta so that loss.backward() can fill in theta.grad.
theta = torch.rand(2, 1, requires_grad=True)

# reduction='sum' yields the summed squared error, i.e. the same quantity as
# (y - y_pred).square().sum(); alpha and tol below are tuned for that scale.
loss_fn = torch.nn.MSELoss(reduction='sum')
alpha = 0.001  # learning rate
tol = 1e-4     # stop as soon as the loss drops below this value

for i in range(100):
    # Forward pass and loss.
    y_pred = X @ theta
    loss = loss_fn(y_pred, y)

    # Backward pass: populates theta.grad with d(loss)/d(theta).
    loss.backward()

    # Gradient-descent step. The update itself must not be recorded by
    # autograd; torch.no_grad() is the recommended replacement for the older
    # `theta.data -= alpha * theta.grad.data` idiom and has the same effect.
    with torch.no_grad():
        theta -= alpha * theta.grad
    # Gradients accumulate across backward() calls, so reset them every step.
    theta.grad.zero_()

    if i % 10 == 0:
        print('Iter: {:2d} | Loss: {:.3f}'.format(i, loss.item()))
    # Compare plain Python floats rather than a tensor with a float.
    if loss.item() < tol:
        print('Stopping after', i, 'iterations')
        break
print("True weights:")
print(w_true)
print("Learned weights:")
print(theta)

The key steps in each iteration are:
* evaluate the model to get `y_pred`
* compute the squared error loss between `y_pred` and `y` (here summed over all examples rather than averaged)
* call `loss.backward()`, which tells PyTorch to compute the gradients of your parameters, which will now be accessible via `theta.grad`
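* update theta with its gradients (note that this update must not itself be tracked by autograd, so it is done via `theta.data` or, preferably, inside a `with torch.no_grad():` block)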
* zero out the gradients

# Logistic Regression with PyTorch
Now that we have seen the basics of gradient descent with PyTorch, we can try to do gradient descent for logistic regression.

To set up the model, we just need a parameter matrix `theta` with one column per class. Our loss function will be `torch.nn.CrossEntropyLoss`, which computes the log-softmax and negative log-likelihood steps in one function. This lets us avoid writing any of the logistic-function steps ourselves, such as computing $e^{-\theta^\top x}$ and taking the log of these terms, since they all happen inside `torch.nn.CrossEntropyLoss`.

In [None]:
def sample(X, y, batch_size):
    """Draw a random minibatch of paired examples.

    Row indices are drawn uniformly at random *with replacement*, so the
    same example may appear more than once in a batch.

    Args:
        X: indexable collection of inputs (first dimension = examples).
        y: targets aligned with ``X`` along the first dimension.
        batch_size: number of examples to draw.

    Returns:
        Tuple ``(X_batch, y_batch)`` selected with the same indices.
    """
    n_examples = len(X)
    picks = torch.randint(low=0, high=n_examples, size=(batch_size,))
    X_batch = X[picks]
    y_batch = y[picks]
    return X_batch, y_batch

Instead of doing the full multi-class classification, we'll just do 0/1 classification for this demo.

In [None]:
# Load the MNIST training CSV: one 'label' column, every other column is a
# feature of the image.
df = pd.read_csv('/kaggle/input/mnist-in-csv/mnist_train.csv')
y = df['label'].to_numpy()
X = df.drop(columns=['label']).to_numpy()

# Keep only the examples labelled 0 or 1 and convert them to torch tensors;
# the inputs are cast to float32, the labels keep their integer dtype.
X_th = torch.from_numpy(X[y <= 1]).to(torch.float32)
y_th = torch.from_numpy(y[y <= 1])

In [None]:
def show_digits(X, y):
    """Plot the first example of each digit 0-9 on a 2x5 grid.

    Args:
        X: array of flattened images, one row per example; each row must
            hold 28 * 28 = 784 values.
        y: array of integer labels aligned with the rows of ``X``.

    Digits that do not occur in ``y`` get an empty panel instead of raising
    an ``IndexError``. Nothing is returned; the figure is left for the
    notebook's inline backend to display.
    """
    # Use the explicit Axes objects rather than re-selecting panels through
    # the pyplot state machine with plt.subplot().
    _, axes = plt.subplots(2, 5)
    for d, ax in enumerate(axes.flat):
        ax.set_xticks([])
        ax.set_yticks([])
        # Locate the first matching row directly instead of copying every
        # row of that digit just to take the first one.
        hits = np.flatnonzero(y == d)
        if hits.size == 0:
            ax.axis('off')  # no example of this digit in y
            continue
        ax.imshow(X[hits[0]].reshape(28, 28), cmap='gray')

In [None]:
# Preview one example image per digit 0-9 from the full (unfiltered) arrays.
show_digits(X, y)

In [None]:
max_iters = 100
batch_size = 100
alpha = 0.001  # learning rate
loss_fn = torch.nn.CrossEntropyLoss()
# One weight column per class (0 and 1), one row per input feature
# (784 = 28 * 28 pixels).
theta = torch.rand(784, 2, requires_grad=True)
losses = []  # per-iteration minibatch loss
accs = []    # per-iteration minibatch accuracy

for i in range(max_iters):
    # Forward pass on a random minibatch.
    X_batch, y_batch = sample(X_th, y_th, batch_size)
    logits = X_batch @ theta

    # CrossEntropyLoss applies log-softmax + negative log-likelihood itself.
    loss = loss_fn(logits, y_batch)
    loss.backward()

    # Gradient-descent step. The update must not be recorded by autograd;
    # torch.no_grad() is the recommended replacement for the older
    # `theta.data -= alpha * theta.grad.data` idiom.
    with torch.no_grad():
        theta -= alpha * theta.grad
    # Gradients accumulate across backward() calls, so reset them every step.
    theta.grad.zero_()

    # Accuracy is taken from the logits that produced `loss`, so both logged
    # metrics describe the same parameters and no second forward pass (with
    # a needless autograd graph) is required.
    y_pred = logits.detach().argmax(dim=1)
    # .float().mean() avoids integer-tensor division, which truncates to
    # 0 or 1 on older PyTorch releases.
    batch_acc = (y_pred == y_batch).float().mean()
    losses.append(loss.item())
    accs.append(batch_acc.item())
    if i % 10 == 0:
        print("Iter {} | Last 10 avg loss: {:.3f} | acc: {:.3f}".format(
            i, np.mean(losses[-10:]), np.mean(accs[-10:])
        ))

As in the linear regression case, our gradient descent procedure breaks into a few crucial steps:
* evaluating the model on our minibatch of examples
* computing the loss function between our model output and the true labels
* calling `loss.backward()`
* updating our weights with their gradients
* zeroing out the gradient