In [None]:
import os

# Restrict this process to a single GPU. The variable is exported here, in the
# first cell, ahead of the cell that imports torch.
gpu_id = 0
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

In [None]:
from torch import distributions
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn import cluster, datasets, mixture
from sklearn.preprocessing import StandardScaler

In [None]:
# Select the CUDA device that the `.cuda()` calls in later cells will use.
torch.cuda.set_device(0)

# Seed both random number generators the notebook relies on. NumPy is used for
# the beta draws in the mixup cell (and by make_moons when no random_state is
# given); torch is used for the initial network weights. Previously only NumPy
# was seeded, so the model initialisation changed from run to run.
np.random.seed(123)
torch.manual_seed(123)

## Get two moons data

In [None]:
# Sample a two-moons dataset and standardise its two features.
n_samples = 2000
noise = 0.1
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=noise)
X, y = noisy_moons[0], noisy_moons[1]
X = StandardScaler().fit_transform(X)

# Scatter the points: label 0 in red, every other label in blue.
xlim = [-3, 3]
ylim = [-3, 3]
colors = np.where(y == 0, 'red', 'blue').tolist()
plt.scatter(*X.T, s=10, color=colors)
plt.xlim(xlim)
plt.ylim(ylim)

## Define classifier network

In [None]:
class twomoons_classifier(nn.Module):
    """Fully connected binary classifier for 2-D points.

    The network is a stack of four linear layers (2 -> 64 -> 128 -> 64 -> 1)
    with a ReLU after each of the first three. The last layer has no
    activation, so the module returns one raw logit per input row.
    """

    def __init__(self):
        super().__init__()
        widths = (2, 64, 128, 64, 1)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # The loop adds a ReLU after every Linear; the output layer must not have one.
        layers.pop()
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch x of shape [B, 2] to logits of shape [B, 1]."""
        return self.network(x)

## Quick question: which loss are we going to train with?
Risk minimisation usually means cross-entropy for classification and mean squared error for regression, and both turn out to be equivalent to maximum likelihood estimation. Here we train the classification task with an MSE loss (on the sigmoid of the network output) so that exactly the same loss can be reused afterwards to train the mixup network.

## Train first with noise = 0.02

In [None]:
# Sample a nearly noise-free two-moons dataset and standardise its two features.
# `noise` set here is also the value read by the training cell that follows.
n_samples = 2000
noise = 0.02
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=noise)
X, y = noisy_moons[0], noisy_moons[1]
X = StandardScaler().fit_transform(X)

# Scatter the points: label 0 in red, every other label in blue.
xlim = [-3, 3]
ylim = [-3, 3]
colors = np.where(y == 0, 'red', 'blue').tolist()
plt.scatter(*X.T, s=10, color=colors)
plt.xlim(xlim)
plt.ylim(ylim)

In [None]:
"""Setup model"""
model = twomoons_classifier().cuda()
num_iters = 10000
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train model: each step draws a fresh mini-batch of 128 two-moons points at the
# current `noise` level and minimises the MSE between sigmoid(logit) and the label.
train_loss = []
for step in range(num_iters):  # named `step` so the builtin `iter` is not shadowed
    X, y = datasets.make_moons(n_samples=128, noise=noise)
    X = X.astype(np.float32)
    targets = torch.FloatTensor(y).cuda()

    # backward() adds to the existing .grad buffers, so they have to be cleared
    # before every step. Without this each update used the sum of the gradients
    # of all previous iterations instead of the current mini-batch gradient.
    optimizer.zero_grad()
    logits = model(torch.FloatTensor(X).cuda())
    # squeeze(1) drops the trailing singleton dimension so the prediction
    # matches the 1-D target tensor.
    loss = F.mse_loss(F.sigmoid(logits).squeeze(1), targets)
    loss.backward()
    optimizer.step()

    train_loss.append(loss.item())
    if step % 1000 == 0:
        print('Iter {} Loss: {:.3f}'.format(step, loss.item()))

# Plot losses
plt.plot(train_loss)

## Now train with noise = 0.3

In [None]:
# Sample a heavily noised two-moons dataset and standardise its two features.
# `noise` set here is also the value read by the training cells that follow.
n_samples = 2000
noise = 0.3
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=noise)
X, y = noisy_moons[0], noisy_moons[1]
X = StandardScaler().fit_transform(X)

# Scatter the points: label 0 in red, every other label in blue.
xlim = [-3, 3]
ylim = [-3, 3]
colors = np.where(y == 0, 'red', 'blue').tolist()
plt.scatter(*X.T, s=10, color=colors)
plt.xlim(xlim)
plt.ylim(ylim)

In [None]:
"""Setup model"""
model = twomoons_classifier().cuda()
num_iters = 10000
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train model (baseline without mixup): each step draws a fresh mini-batch of
# 128 two-moons points at the current `noise` level and minimises the MSE
# between sigmoid(logit) and the label.
train_loss_no_mixup = []
for step in range(num_iters):  # named `step` so the builtin `iter` is not shadowed
    X, y = datasets.make_moons(n_samples=128, noise=noise)
    X = X.astype(np.float32)
    targets = torch.FloatTensor(y).cuda()

    # backward() adds to the existing .grad buffers, so they have to be cleared
    # before every step. Without this each update used the sum of the gradients
    # of all previous iterations instead of the current mini-batch gradient.
    optimizer.zero_grad()
    logits = model(torch.FloatTensor(X).cuda())
    # squeeze(1) drops the trailing singleton dimension so the prediction
    # matches the 1-D target tensor.
    loss = F.mse_loss(F.sigmoid(logits).squeeze(1), targets)
    loss.backward()
    optimizer.step()

    train_loss_no_mixup.append(loss.item())
    if step % 1000 == 0:
        print('Iter {} Loss: {:.3f}'.format(step, loss.item()))

# Plot losses
plt.plot(train_loss_no_mixup)

## Now train with noise = 0.3 and mixup. Note: the loss here is MSE rather than cross-entropy!

In [None]:
"""Setup model"""
model_mixup = twomoons_classifier().cuda()
num_iters = 10000
learning_rate = 1e-4
optimizer = torch.optim.Adam(model_mixup.parameters(), lr=learning_rate)
# Specific to mixup: both parameters of the Beta distribution that the mixing
# coefficient is drawn from.
alpha = 0.4

# Train with mixup: every step blends two independently drawn mini-batches (and
# their labels) with a single coefficient `lam`, then minimises the MSE between
# sigmoid(logit) and the blended, now fractional, label.
train_loss_mixup = []
for step in range(num_iters):  # named `step` so the builtin `iter` is not shadowed
    # Draw twice.
    X1, y1 = datasets.make_moons(n_samples=128, noise=noise)
    X2, y2 = datasets.make_moons(n_samples=128, noise=noise)
    X1 = X1.astype(np.float32)
    X2 = X2.astype(np.float32)
    # Draw from beta distribution; one coefficient is shared by the whole batch.
    lam = np.random.beta(alpha, alpha)
    X = lam * X1 + (1 - lam) * X2
    y = lam * y1 + (1 - lam) * y2
    targets = torch.FloatTensor(y).cuda()

    # backward() adds to the existing .grad buffers, so they have to be cleared
    # before every step. Without this each update used the sum of the gradients
    # of all previous iterations instead of the current mini-batch gradient.
    optimizer.zero_grad()
    logits = model_mixup(torch.FloatTensor(X).cuda())
    # squeeze(1) drops the trailing singleton dimension so the prediction
    # matches the 1-D target tensor.
    loss = F.mse_loss(F.sigmoid(logits).squeeze(1), targets)
    loss.backward()
    optimizer.step()

    train_loss_mixup.append(loss.item())
    if step % 1000 == 0:
        print('Iter {} Loss: {:.3f}'.format(step, loss.item()))

## Compare two training losses

In [None]:
# Overlay the per-iteration training losses of the two runs on one figure.
plt.figure()
for losses, colour, tag in (
    (train_loss_mixup, 'red', 'With mixup'),
    (train_loss_no_mixup, 'blue', 'No mixup'),
):
    plt.plot(losses, alpha=0.5, color=colour, label=tag)
plt.legend()
plt.show()