In [13]:
# Installs
!pip install pyclustertend
!conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch -y
!pip install torchvision prettytable
!pip install h5py tqdm
!pip install lightning

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 22.9.0
  latest version: 23.3.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.

Retrieving notices: ...working... done
Collecting lightning
  Downloading lightning-2.0.1-py3-none-any.whl (1.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 29.8 MB/s eta 0:00:00
Collecting croniter<1.4.0,>=1.3.0
  Downloading croniter-1.3.8-py2.py3-none-any.whl (18 kB)
Collecting lightning-utilities<2.0,>=0.7.0
  Using cached lightning_utilities-0.8.0-py3-none-any.whl (20 kB)
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.0.1-py3-none-any.whl (716 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m716.4/716.4 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic<3.0
  Downloading pydantic-1.10.7-cp310-

In [1]:
# Imports cell
import sys
import math, random, numpy as np
import json
import datetime
from collections import Counter
from pathlib import Path
from matplotlib import pyplot as plt
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
from torchvision import models, transforms as T
from torchvision.utils import make_grid

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.decomposition import IncrementalPCA
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.cluster import normalized_mutual_info_score
from tqdm import tqdm
from data import SignalDatasetV2
import torchvision.datasets as datasets
from torch.utils.data import ConcatDataset
from prettytable import PrettyTable
from scipy.sparse import data
import pickle

import SSLUtils as utls
from SSLConstants import *

  warn(f"Failed to load image Python extension: {e}")


In [2]:
# Setup GPU
#
# Selects the compute device once. This cell defines `device` and
# `pin_memory`, which the main cell passes on to the clustering and
# training helpers.

if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device = torch.device('cuda')
    torch.backends.cudnn.benchmark = False
    pin_memory = True
    # Only query the CUDA device index when CUDA is actually present;
    # torch.cuda.current_device() raises on a machine without CUDA.
    print("Current device: ", torch.cuda.current_device())
else:
    device = torch.device('cpu')
    pin_memory = False
    print("Current device: ", device)

print("CUDA available: ", torch.cuda.is_available())

Current device:  0
Current device:  True


In [12]:
# Main cell
#
# Sweeps over (number of clusters, PCA dimensions). For every combination a
# fresh network is built, pseudo-labels are obtained from utls.cluster and the
# network is trained on them with utls.train_epoch. The pseudo-labels are
# regenerated every `num_learning_iter` epochs. After each epoch a checkpoint
# and the accumulated stats are written to ../results/.

pca_dimensions = list(range(20, 21))
num_clusters = list(range(24, 25, 1))
nmi_list = []

raw_dataset = SignalDatasetV2(window=IMG_SIZE, stride=IMG_STRIDE,
                              limit=LIMIT_IMAGES, dataset_path=DATASET_PATH, three_channels=False)
num_learning_iter = 5

nn_input = raw_dataset

for n in num_clusters:
    for p in pca_dimensions:

        d_t_string = datetime.datetime.now().strftime("%H_%M_%S_%d_%m_%Y")

        # Shared file-name stem so the three output paths cannot drift apart.
        run_stem = (
            f"{num_learning_iter}e_kmeans_c{n:02d}pca{p:02d}"
            f"e{MAX_EPOCHS}RN18_{d_t_string}"
        )
        CURRENT_MODEL_PATH = '../results/models/' + run_stem + '.pt'
        CURRENT_STAT_PATH = '../results/stats/' + run_stem + '_stats.json'
        KMEANS_MODEL_PATH = '../results/models/' + run_stem + '.pkl'

        # Create the output directories up front; otherwise the first save
        # would fail only after a full training epoch has already been spent.
        for out_path in (CURRENT_MODEL_PATH, CURRENT_STAT_PATH, KMEANS_MODEL_PATH):
            Path(out_path).parent.mkdir(parents=True, exist_ok=True)

        print("\n Cluster: ")
        print(n)
        print("\n PCA number: ")
        print(p)

        utls.set_all_seed(42)

        if not USE_VGG:
            # Using ResNet18 with an n-way classification head
            model = models.resnet18()
            model.fc = nn.Linear(512, n)
            # Adapt the first convolution so the network accepts one-channel images
            model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        else:
            # Using VGG11 with a reduced classifier and an n-way output
            # NOTE(review): the first VGG convolution is left unchanged while the
            # datasets are built with three_channels=False — confirm that this
            # branch receives inputs with the channel count VGG11 expects.
            model = models.vgg11()
            model.classifier[3] = nn.Linear(4096, 512)
            model.classifier[6] = nn.Linear(512, n)

        model.to(device)

        # Optimizer for NN
        #optimizer = optim.SGD(model.parameters(), lr=0.1)
        optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)

        if LOAD_SAVED_MODEL:
            # map_location lets a checkpoint saved on a GPU be restored on the
            # CPU fallback chosen in the device-setup cell.
            checkpoint = torch.load(SAVED_MODEL_PATH, map_location=device)
            model.load_state_dict(checkpoint['model_state_dict'])

            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            LOADED_EPOCH = checkpoint['epoch'] + 1 # Add 1 because of 0-indexing
            loss = checkpoint['loss']
            print("saved model loaded")

        print(next(model.parameters()).device)

        # Principal Component Analysis
        pca = IncrementalPCA(n_components=p, batch_size=256, whiten=True)

        # mini-batch K-Means
        kmeans = MiniBatchKMeans(n_clusters=n, batch_size=256,
                                init_size=3*n)

        scaler = torch.cuda.amp.GradScaler()

        all_stats = []
        # Check if existing model is used in order to load the stat data.
        if LOAD_SAVED_MODEL:
            with open(SAVED_STAT_PATH, 'r') as f:
                all_stats = json.load(f)

        # Get initial labels
        # NOTE(review): epoch offset 0 satisfies the re-clustering condition
        # below, so clustering runs a second time right away. The call is kept
        # because utls.cluster may update pca/kmeans state — confirm whether
        # this initial call is needed.
        pseudo_labels = utls.cluster(pca, kmeans, model, nn_input, device, pin_memory)

        # LOADED_EPOCH is only assigned in this cell when LOAD_SAVED_MODEL is set;
        # otherwise it is presumably provided by `from SSLConstants import *`
        # — TODO confirm.
        for epoch_offset in range(MAX_EPOCHS-LOADED_EPOCH+num_learning_iter):
            epoch = LOADED_EPOCH + epoch_offset

            print("\n Epoch:")
            print(epoch)

            if epoch_offset % num_learning_iter == 0:
                # generate labels
                pseudo_labels = utls.cluster(pca, kmeans, model, nn_input, device, pin_memory)

            # make new dataset with labels matched to images (Spectrum data)
            labeled_dataset = SignalDatasetV2(
                window=IMG_SIZE, stride=IMG_STRIDE, labels=pseudo_labels[0],
                limit=LIMIT_IMAGES, dataset_path=DATASET_PATH, three_channels=False
            )

            # Commands for usage of the MNIST dataset
            # nn_input.targets = pseudo_labels

            # train for one epoch
            stats = utls.train_epoch(model, optimizer, labeled_dataset, device, pin_memory)
            all_stats.append(stats)

            # Save the current model state.
            torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': stats['running_loss'],
                    }, CURRENT_MODEL_PATH)

            # Save the stat data in .json file.
            with open(CURRENT_STAT_PATH, 'w') as f:
                json.dump(all_stats, f, indent=2)
                



 Cluster: 
24

 PCA number: 
20
cuda:0


extracting features: 100%|██████████| 313/313 [00:05<00:00, 52.60batch/s]



 Epoch:
0


extracting features: 100%|██████████| 313/313 [00:06<00:00, 51.95batch/s]





training:  45%|████▍     | 140/313 [00:07<00:09, 19.02batch/s, loss=0.752, lr=0.001]


KeyboardInterrupt: 