In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import numpy as np
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
import torch.utils.data as data
from torch.utils.data.sampler import Sampler
import math
import copy
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

!pip install faiss-gpu
import faiss

torch.manual_seed(0)

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[0mInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


<torch._C.Generator at 0x78d1a4c1b270>

The model used in this analysis is a simple cnn model with 2 convolutional layer and two fully connected layers. The model is split into features, classifier and top_layer to mimic the architecture used in the original paper. See [here](https://github.com/facebookresearch/deepcluster/tree/master/models)

In [2]:
class SimpleCnn(nn.Module):
    def __init__(self, k=10):
        super(SimpleCnn, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),  # Change input channels to 3
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.classifier = nn.Sequential(
            nn.Linear(4*4*128, 512),
            nn.ReLU(),
            nn.Linear(512, 64),
            nn.ReLU()
        )
        self.top_layer = nn.Linear(64, k)
        self._initialize_weights()

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        if self.top_layer:
            out = self.top_layer(out)
        return out

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)


In [16]:
def train_supervised(model, device, train_loader, epoch):
    model.train()
    torch.set_grad_enabled(True)

    optimizer = torch.optim.SGD(
        filter(lambda x: x.requires_grad, model.parameters()),
        lr=0.05,
        momentum=0.9,
        weight_decay=10**(-5)
    )

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    for e in range(epoch):

      epoch_loss = 0.0

      for batch_idx, (data, target) in enumerate(train_loader):
          data, target = data.to(device), target.to(device)
          optimizer.zero_grad()
          output = model(data)
          loss = criterion(output, target)
          loss.backward()
          optimizer.step()
          epoch_loss += output.shape[0] * loss.item()

      print("Epoch: " + str(e))
      print(epoch_loss / len(train_loader.dataset))


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)

            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [3]:
# choose device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

# Define the transformations for CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))
])

# Load the CIFAR-10 dataset
cifar10_train = datasets.CIFAR10('../data', train=True, download=True, transform=transform)
cifar10_test = datasets.CIFAR10('../data', train=False, download=True, transform=transform)

# Split the training data into unsupervised pretrain (45k) and supervised train (5k)
unsupervised_pretrain, supervised_train = torch.utils.data.random_split(cifar10_train, [45000, 5000])

# Create data loaders
train_loader_unsupervised = DataLoader(unsupervised_pretrain, batch_size=64, shuffle=False, num_workers=4)
train_loader_supervised = DataLoader(supervised_train, batch_size=64, shuffle=False, num_workers=4)
test_loader = DataLoader(cifar10_test, batch_size=64, shuffle=True, num_workers=4)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:05<00:00, 29797985.07it/s]


Extracting ../data/cifar-10-python.tar.gz to ../data
Files already downloaded and verified


In [5]:
def cluster_assign(images_lists, dataset):
    """Creates a dataset from clustering, with clusters as labels.
    Args:
        images_lists (list of list): for each cluster, the list of image indexes
                                    belonging to this cluster
        dataset (list): initial dataset
    Returns:
        ReassignedDataset(torch.utils.data.Dataset): a dataset with clusters as
                                                     labels
    """
    assert images_lists is not None
    pseudolabels = []
    image_indexes = []
    for cluster, images in enumerate(images_lists):
        image_indexes.extend(images)
        pseudolabels.extend([cluster] * len(images))

    t = transforms.Compose([
               transforms.ToTensor(),
               transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))]
           )

    return ReassignedDataset(image_indexes, pseudolabels, dataset, t)

In [6]:
class Dataset(data.Dataset):
    """A dataset where the new images labels are given in argument. This assigns
    each image withits "pseudolabel"
    Args:
        image_indexes (list): list of data indexes
        pseudolabels (list): list of labels for each data
        dataset (list): list of tuples with paths to images
        transform (callable, optional): a function/transform that takes in
                                        an PIL image and returns a
                                        transformed version
    """

    def __init__(self, image_indexes, pseudolabels, dataset, transform=None):
        self.imgs = self.make_dataset(image_indexes, pseudolabels, dataset)
        self.transform = transform

    def make_dataset(self, image_indexes, pseudolabels, dataset):
        label_to_idx = {label: idx for idx, label in enumerate(set(pseudolabels))}
        images = []
        for j, idx in enumerate(image_indexes):
            path = dataset[idx][0]
            pseudolabel = label_to_idx[pseudolabels[j]]
            images.append((path, pseudolabel))
        return images

    def __getitem__(self, index):
        """
        Args:
            index (int): index of data
        Returns:
            tuple: (image, pseudolabel) where pseudolabel is the cluster of index datapoint
        """
        img, pseudolabel = self.imgs[index]
        return img, pseudolabel

    def __len__(self):
        return len(self.imgs)

In [7]:
class Sampler(Sampler):
    """Samples elements uniformely accross pseudolabels.
        Args:
            N (int): size of returned iterator.
            images_lists: dict of key (target), value (list of data with this target)
    """

    def __init__(self, N, images_lists):
        self.N = N
        self.images_lists = images_lists
        self.indexes = self.generate_indexes_epoch()

    def generate_indexes_epoch(self):
        nmb_non_empty_clusters = 0
        for i in range(len(self.images_lists)):
            if len(self.images_lists[i]) != 0:
                nmb_non_empty_clusters += 1

        size_per_pseudolabel = int(self.N / nmb_non_empty_clusters) + 1
        res = np.array([])

        for i in range(len(self.images_lists)):
            # skip empty clusters
            if len(self.images_lists[i]) == 0:
                continue
            indexes = np.random.choice(
                self.images_lists[i],
                size_per_pseudolabel,
                replace=(len(self.images_lists[i]) <= size_per_pseudolabel)
            )
            res = np.concatenate((res, indexes))

        np.random.shuffle(res)
        res = list(res.astype('int'))
        if len(res) >= self.N:
            return res[:self.N]
        res += res[: (self.N - len(res))]
        return res

    def __iter__(self):
        return iter(self.indexes)

    def __len__(self):
        return len(self.indexes)

In [8]:
class ComputeAverage(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def learning_rate_decay(optimizer, t, lr_0):
    for param_group in optimizer.param_groups:
        lr = lr_0 / np.sqrt(1 + lr_0 * param_group['weight_decay'] * t)
        param_group['lr'] = lr


In [9]:
def compute_features(dataloader, model, N, get_labels=False):

    model.eval()
    labels = []

    # discard the label information in the dataloader
    for i, (input_tensor, label) in enumerate(dataloader):
        input_var = torch.autograd.Variable(input_tensor.cuda(), requires_grad=False)
        aux = model(input_var).data.cpu().numpy()

        if i == 0:
            features = np.zeros((N, aux.shape[1]), dtype='float32')

        aux = aux.astype('float32')
        if i < len(dataloader) - 1:
            features[i * 64: (i + 1) * 64] = aux
        else:
            # special treatment for final batch
            features[i * 64:] = aux

        # measure elapsed time

        labels.append(label.numpy())

    labels = np.concatenate(labels)

    if get_labels:
      return features, labels

    else:
      return features


In [10]:
def train(loader, model, crit, opt, epoch):
    """Training of the CNN.
        Args:
            loader (torch.utils.data.DataLoader): Data loader
            model (nn.Module): CNN
            crit (torch.nn): loss
            opt (torch.optim.SGD): optimizer for every parameters with True
                                   requires_grad in model except top layer
            epoch (int)
    """
    losses = AverageMeter()
    # switch to train mode
    model.train()

    # create an optimizer for the last fc layer
    optimizer_tl = torch.optim.SGD(
        model.top_layer.parameters(),
        lr=0.01,
        weight_decay=10**-5,
    )

    for i, (input_tensor, target) in enumerate(loader):

        target = target.cuda()
        input_var = torch.autograd.Variable(input_tensor.cuda())
        target_var = torch.autograd.Variable(target)

        output = model(input_var)
        loss = crit(output, target_var)

        # record loss
        losses.update(loss.data, input_tensor.size(0))

        # compute gradient and do SGD step
        opt.zero_grad()
        optimizer_tl.zero_grad()
        loss.backward()
        opt.step()
        optimizer_tl.step()

    return losses.avg

In [11]:
def DeepCluster(model, device, train_loader, epoch, k):

    fd = int(model.top_layer.weight.size()[1])
    model.top_layer = None

    model = model.to(device)


    optimizer = torch.optim.SGD(
        filter(lambda x: x.requires_grad, model.parameters()),
        lr=0.05,
        momentum=0.9,
        weight_decay=10**(-5)
    )

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    #cluster_step


    for e in range(epoch):

      model.top_layer = None
      model.classifier = nn.Sequential(*list(model.classifier.children())[:-1])

      features = compute_features(train_loader, model, len(unsupervised_pretrain))

       # only 64 dims, so no PCA
      pipeline = Pipeline([('scaling', StandardScaler())])

      post_scale = pipeline.fit_transform(features)
      post_norm = normalize(post_scale, norm="l2")

      n_data, d = post_norm.shape

      # faiss implementation of k-means
      clus = faiss.Clustering(d, k)
      clus.seed = np.random.randint(1234)

      clus.niter = 20
      clus.max_points_per_centroid = 60000

      res = faiss.StandardGpuResources()
      flat_config = faiss.GpuIndexFlatConfig()
      flat_config.useFloat16 = False
      flat_config.device = 0
      index = faiss.GpuIndexFlatL2(res, d, flat_config)

      #get new cluster labels
      clus.train(post_norm, index)
      _, I = index.search(post_norm, 1)

      labels = np.squeeze(I)

      unique, counts = np.unique(labels, return_counts=True)

      print("Epoch: " + str(e))

      print("Overview of cluster assignments:")
      print(dict(zip(unique, counts)))

      images_lists = [[] for i in range(k)]
      for i in range(len(unsupervised_pretrain)):
            images_lists[int(labels[i])].append(i)


      # create new dataset from pseudolabels
      train_dataset = cluster_assign(images_lists, unsupervised_pretrain)

      #print(len(train_dataset))
      #print(images_lists)

      # sample images from uniform distribution over classes
      sampler = UnifLabelSampler(int(1 * len(train_dataset)),
                                   images_lists)


      train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=64,
            num_workers=4,
            sampler=sampler,
        )

      # reset last layer
      mlp = list(model.classifier.children())
      mlp.append(nn.ReLU(inplace=True).cuda())
      model.classifier = nn.Sequential(*mlp)
      model.top_layer = nn.Linear(fd, k)
      model.top_layer.weight.data.normal_(0, 0.01)
      model.top_layer.bias.data.zero_()
      model.top_layer.cuda()



      # train step
      torch.set_grad_enabled(True)
      loss = train(train_dataloader, model, criterion, optimizer, e)
      print(loss.cpu().numpy())


In [12]:
def linear_model(model_base, train_loader, test_loader):

  model = copy.deepcopy(model_base)
  model.to(device)
  model.top_layer = None
  model.classifier = nn.Sequential(*list(model.classifier.children())[:-1])
  features,labels = compute_features(train_loader, model, len(supervised_train), get_labels=True)

  clf = make_pipeline(StandardScaler(),LinearSVC(random_state=0, tol=1e-5, max_iter =10000))
  clf.fit(features, labels)

  x_test = []
  y_true = []

  torch.set_grad_enabled(False)
  for idx, (pics, labels) in enumerate(test_loader):
    pics = pics.to(device)

    model.eval()
    features_test = model(pics)
    x_test.append(features_test.cpu().numpy())
    y_true.append(labels)

  x_test = np.concatenate(x_test)
  y_true = np.concatenate(y_true)

  y_pred = clf.predict(x_test)

  print("Test Accuracy: " + str(accuracy_score(y_true, y_pred)))

In [13]:
simpleCNN = SimpleCnn()
simpleCNN = simpleCNN.to(device)
DeepCluster(simpleCNN, device, train_loader_unsupervised, 5, 10)

Epoch Nr: 0
Overview of cluster assignments:
{0: 4132, 1: 4081, 2: 6232, 3: 3545, 4: 3907, 5: 5203, 6: 4414, 7: 5124, 8: 4764, 9: 3598}
0.824921
Epoch Nr: 1
Overview of cluster assignments:
{0: 4314, 1: 3248, 2: 4593, 3: 4675, 4: 5030, 5: 4864, 6: 5834, 7: 3722, 8: 4210, 9: 4510}
0.4759902
Epoch Nr: 2
Overview of cluster assignments:
{0: 3862, 1: 5260, 2: 3405, 3: 4047, 4: 3522, 5: 4066, 6: 5144, 7: 4792, 8: 6101, 9: 4801}
0.4394555
Epoch Nr: 3
Overview of cluster assignments:
{0: 3335, 1: 5426, 2: 3666, 3: 4564, 4: 3229, 5: 4958, 6: 4328, 7: 6547, 8: 4644, 9: 4303}
0.4360714
Epoch Nr: 4
Overview of cluster assignments:
{0: 3515, 1: 3467, 2: 5267, 3: 3051, 4: 6169, 5: 5160, 6: 2871, 7: 3970, 8: 4316, 9: 7214}
0.40906253


In [14]:
random_CNN = SimpleCnn()

In [17]:
trainCNN = SimpleCnn()
trainCNN = trainCNN.to(device)
train_supervised(trainCNN, device, train_loader_unsupervised, 5)

Epoch: 0
1.5247264789157444
Epoch: 1
1.0250105587853326
Epoch: 2
0.8021779124789767
Epoch: 3
0.6645423480365011
Epoch: 4
0.5722854996667968


In [19]:
# cnn trained self supervised
linear_model(simpleCNN, train_loader_supervised, test_loader)

Test Accuracy: 0.3536


In [20]:
# cnn trained supervised
linear_model(trainCNN, train_loader_supervised, test_loader)

Test Accuracy: 0.7409
