# LET'S GET READY TO RUMBLE!!!

In [1]:
from IPython.display import HTML
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
import os
import torch
import torch.functional as F
import torch.nn as nn
import torchvision
import torchvision.transforms as tvtransforms
import torchvision.utils as vutils

In [2]:
# Batch size
batch_size = 64
# number of epochs
epochs = 10
# learning rate
alpha = 0.0002
# beta1
beta1 = 0.5
# beta2
beta2 = 0.999
# Z vector size
nz = 100

# Channels, H, W
img_shape = (1, 28, 28)
# Take advantage of the computer's GPU, if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
os.makedirs("data/mnist", exist_ok=True)
dataloader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST(
        "data/mnist",
        train=True,
        download=True,
        transform=tvtransforms.Compose(
            [tvtransforms.ToTensor(), tvtransforms.Normalize([0.5], [0.5])]
        ),
    ),
    batch_size=batch_size,
    drop_last=True,
    num_workers=2,
    shuffle=True,
)

First, we need to create our loss functions. From the first paper, the discriminator loss from Algorithm 1, which we want to maximize, is:

$\mathcal{L}_d = \frac{1}{m}\sum_{i=1}^m\left(\log(D(x^{(i)})) + \log(1 - D(G(z^{(i)})))\right)$.

The generator loss, which we want to minimize, is

$\mathcal{L}_g = \frac{1}{m}\sum_{i=1}^m\log(1 - D(G(z^{(i)})))$.

Unfortunately, this generator loss function can be troublesome in practice. An alternative the authors suggest is

$\mathcal{L}_g = \frac{1}{m}\sum_{i=1}^m\log(D(G(z^{(i)})))$.

This function then needs to be maximized.

In [4]:
# Some handy functions:
# torch.mean()
# torch.log()

def discriminator_loss(D_of_real, D_of_fake):
    # TODO: implment loss for discriminator
    # What sign should be applied in order to maximize it?
    raise NotImplementedError()

def generator_loss(D_of_fake):
    # TODO: implment loss for generator
    # What sign should be applied in order to maximize or minimize it?
    raise NotImplementedError()

## IN THE G CORNER...
A simple generator can be constructed from sequenctial, fully connected layers.

In [5]:
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        
        # Calculate the size of the final output vector
        flattened_out_size = 1
        for dim in img_shape:
            flattened_output_size *= dim

        # A sequence of layers that are applied to an input
        self.model = nn.Sequential(
            # Layer 1
            # TODO: what should the first network input size be?
            nn.Linear(???, 128),
            nn.LeakyReLU(0.2, inplace=True),
            
            # Layer 2
            # Fully connected layer with input size 128, output size 256
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2, inplace=True),
            
            # Layer 3
            # TODO: add a fully connected layer with input size 256 and output 512
            # TODO: add batchnorm
            # TODO: add activation
            
            # Layer 4
            # TODO: add a fully connected layer with input size 512 and output 1024
            # TODO: add batchnorm
            # TODO: add activation
            
            # Final Layer
            # Map to desired image size
            nn.Linear(1024, flattened_output_size)
            nn.Tanh(),
        )
        
    def forward(self, z):
        # img is flat and needs to be reshaped
        img = self.model(z)
        # reshape to (batch_size, channels, H, W)
        img = img.view(img.size(0), *img_shape)
        return img

SyntaxError: invalid syntax (<ipython-input-5-b8e266dadc31>, line 14)

## AND IN THE D CORNER...
The descriminator can also be constructed from sequenctial linear layers. We need to down scale the input to a single value for classification.

In [None]:
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        
        self.model = nn.Sequential(
            nn.Linear(int(np.prod(img_shape)), 512),
            nn.LeakyReLU(0.2, inplace=True),

            # TODO: add fully connected layer with input size 512 and output size 256
            # TODO: add batch norm
            # TODO: add activation
            
            nn.Linear(256, 1),
            nn.Sigmoid()
        )
    
    def forward(self, img):
        # Flatten the image and feed into model
        return self.model(img.view(img.size(0), -1))

In [None]:
# Create generator and move it to the GPU, if available
gen_model = Generator().to(device)
print(gen_model)

# Create discriminator and move it to the GPU, if available
disc_model = Discriminator().to(device)
print(disc_model)

# Initialize optimizers
gen_opt = torch.optim.Adam(gen_model.parameters(), lr=alpha, betas=(beta1, beta2))
disc_opt = torch.optim.Adam(disc_model.parameters(), lr=alpha, betas=(beta1, beta2))

# FIGHT!

In [None]:
g_losses = []
d_losses = []
fixed_img_list = []
# A fixed set of z vectors to periodically generate images from
fixed_z_vecs = torch.randn(64, nz, device=device)
iters = 0

for epoch in range(epochs):
    for i, (real_imgs, _) in enumerate(dataloader, 0):
        # Move the images to the GPU memory, if possible
        real_imgs = real_imgs.to(device)
        
        # Run D on real images
        disc_model.zero_grad()
        #
        # TODO: compute D(x)
        #
        D_real = ???
               
        # Run D on generated fake images
        z_sample = torch.randn(batch_size, nz, device=device)
        #
        # TODO: compute G(z)
        #
        fake_imgs = ???
        D_fake = disc_model(fake_imgs.detach())

        # Compute D's loss and update D's weights
        disc_err = discriminator_loss(D_real, D_fake)
        disc_err.backward()
        disc_opt.step()
        
        # Update G by maximizing log(D(G(z)))
        gen_model.zero_grad()
        D_fake = disc_model(fake_imgs)
        gen_err = generator_loss(D_fake)
        gen_err.backward()
        gen_opt.step()
        
        if i % 100 == 0:
            print("[{}/{}][{}/{}]".format(epoch, epochs, i, len(dataloader)) )
        if (iters % 500 == 0) or ((epoch == epochs - 1) and (i == len(dataloader) - 1)):
            # Periodically capture generated samples using the same z vectors
            with torch.no_grad():
                fake = gen_model(fixed_z_vecs).detach().cpu()
            fixed_img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        g_losses.append(gen_err.item())
        d_losses.append(disc_err.item())
        iters += 1

In [None]:
plt.figure(figsize=(10, 5))
plt.title("Generator and Discriminator Loss During Training")
plt.plot(g_losses,label="G")
plt.plot(d_losses,label="D")
plt.xlabel("iterations")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# Create slide show from fixed z images
fig = plt.figure(figsize=(8, 8))
plt.axis("off")
ims = [
    [plt.imshow(np.transpose(i, (1, 2, 0)), animated=True)]
    for i in fixed_img_list
]
ani = animation.ArtistAnimation(fig, ims, interval=1000, repeat_delay=1000, blit=True)

HTML(ani.to_jshtml())

## Questions
1. Does making the generator more complex (adding layers) improve the generated images? Not for me
2. Does making the discriminator more complex or smarter force the generator to improve as well? It improves it a little bit, but not much
3. Can you get the original generator loss function (i.e. log(1 - D(G(z)))) to work? Perhaps transition to it after some number of epochs. 
4. Bonus: Can you refactor the models to use convolution instead of fully connected layers?

## Links
* [Link](https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html): pytorch tutorial for DCGAN using a face dataset
* [Link](https://junyanz.github.io/CycleGAN/): the CycleGAN website