In [5]:
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [19]:
from PIL import Image
import pandas as pd
import os
from scipy.io import loadmat
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
from torchvision import transforms
import numpy as np
import cv2
import torch
import torchvision.datasets as datasets
import moco.builder
import moco.loader

In [18]:
# CIFAR-10 dataset (try train_simclr.py code)

datadir = './data'
classes = 10
epoch_size = 50000
# Normalization for CIFAR
normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                                 std=[0.2023, 0.1994, 0.2010])

augmentation = [
    transforms.RandomResizedCrop(32, scale=(0.2, 1.0)),
    transforms.RandomApply([
        transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)  # not strengthened
    ], p=0.8),
    transforms.RandomGrayscale(p=0.2),
    # transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=0.5),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    normalize
]

train_dataset = datasets.CIFAR10(root=datadir, train=True, download=True,
                                 transform=moco.loader.TwoCropsTransform(
                                     transforms.Compose(augmentation)))

print(f'train_dataset.data: {type(train_dataset.data)}')


train_labels = np.array(train_dataset.targets)
print(f'train_labels: {train_labels}')
num_classes = classes
train_idx = np.array(
    [np.where(train_labels == i)[0][:int(epoch_size / num_classes)] for i in range(0, num_classes)]).flatten()
train_dataset.targets = train_labels[train_idx]
train_dataset.data = train_dataset.data[train_idx]
print(f'train_idx: {train_idx}')
print(f'train_idx.size: {train_idx.size}')
print(f'train_dataset.targets: {train_dataset.targets}')
print(f'train_dataset.targets.size: {train_dataset.targets.size}')
print(f'train_dataset.data.size: {train_dataset.data.size}')
print(f'train_dataset.data.shape: {train_dataset.data.shape}')

Files already downloaded and verified
train_dataset.data: <class 'numpy.ndarray'>
train_labels: [6 9 9 ... 9 1 1]
train_idx: [   29    30    35 ... 49963 49971 49997]
train_idx.size: 50000
train_dataset.targets: [0 0 0 ... 9 9 9]
train_dataset.targets.size: 50000
train_dataset.data.size: 153600000
train_dataset.data.shape: (50000, 32, 32, 3)


In [53]:
# CiFAR-10 (try linear_probe_simclr.py code)

datadir = './data'
classes = 10

# For CIFAR
normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                                 std=[0.2023, 0.1994, 0.2010])

augmentation = [
    transforms.RandomResizedCrop(32),
    transforms.RandomApply([
        transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)  # not strengthened
    ], p=0.8),
    transforms.RandomGrayscale(p=0.2),
    # transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=0.5),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    normalize
]

num_classes = classes

train_dataset = datasets.CIFAR10(root=datadir, train=True, download=True,
                                 transform=transforms.Compose(augmentation))

# print(f'train_dataset.targets: {train_dataset.targets.shape}')
# print(f'train_dataset.data: {train_dataset.data.shape}')
train_labels = np.array(train_dataset.targets)
# print(train_labels.size)
# print(train_dataset.targets)
train_idx = np.array(
    [np.where(train_labels == i)[0] for i in range(0, num_classes)]).flatten()
print(f'train_idx.size: {train_idx.size}')
print(f'train_idx: {train_idx}')
train_dataset.targets = train_labels[train_idx]
train_dataset.data = train_dataset.data[train_idx]
print(f'train_dataset.data: {train_dataset.data.shape}')
print(f'train_dataset.data: {type(train_dataset.data)}')
# print(f'train_dataset.data: {train_dataset.data}')
print(f'train_dataset.targets: {train_dataset.targets.shape}')
print(f'train_dataset.targets: {type(train_dataset.targets)}')
print(f'train_dataset.targets: {train_dataset.targets.size}')
# print(f'train_dataset.targets: {train_dataset.targets}')

Files already downloaded and verified
train_idx.size: 50000
train_idx: [   29    30    35 ... 49963 49971 49997]
train_dataset.data: (50000, 32, 32, 3)
train_dataset.data: <class 'numpy.ndarray'>
train_dataset.targets: (50000,)
train_dataset.targets: <class 'numpy.ndarray'>
train_dataset.targets: 50000


In [61]:
# 2023-3-12 custom dataset created by Allen LIN

class fingerprintDataset(Dataset):
    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.targets = self.img_labels.iloc[:, 1] # label of the dataset
        self.target_transform = target_transform
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[0, 0])
        image = cv2.imread(img_path)
        self.data = np.empty((len(self.img_labels), *image.shape), dtype=np.uint8)
        for i in range(len(self.img_labels)):
            self.data[i] = image
    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        image = cv2.imread(img_path)
        print(image.shape)
        label = self.img_labels.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label
    def __len__(self):
        return len(self.img_labels)

In [62]:
# SOCOFing fingerprint dataset (try linear_probe_simclr.py code)

img_dir = os.path.join("../", "kaggle_fingerprint", "SOCOFing", "All")
annotations_file = os.path.join("../", "kaggle_fingerprint", "kaggle_fingerprint_annotations.csv")
classes = 600
epoch_size = 55270

# For CIFAR
normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                                 std=[0.2023, 0.1994, 0.2010])

augmentation = [
    transforms.ToPILImage(), # to PIL format
    transforms.RandomResizedCrop(32),
    transforms.RandomApply([
        transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)  # not strengthened
    ], p=0.8),
    transforms.RandomGrayscale(p=0.2),
    # transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=0.5),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    normalize
]

train_dataset = fingerprintDataset(annotations_file, img_dir, transform=transforms.Compose(augmentation))

img, label = train_dataset[0]
print(img.shape)

# train_labels = np.array(train_dataset.targets)

# train_idx = np.array(
#     [np.where(train_labels == i)[0] for i in range(0, num_classes+1)]).flatten()

# train_idx = np.array(
#         [np.where(train_labels == i)[0][:int(epoch_size / num_classes)+1] for i in range(0, num_classes+1)], dtype=object).flatten()

# train_idx = np.hstack(train_idx)
# print(f'train_idx.size: {train_idx.size}')
# # print(f'train_idx: {train_idx}')
# train_dataset.targets = train_labels[train_idx]
# train_dataset.data = train_dataset.data[train_idx]
# print(f'train_dataset.data: {train_dataset.data.shape}')
# print(f'train_dataset.data: {type(train_dataset.data)}')
# # print(f'train_dataset.data: {train_dataset.data}')
# print(f'train_dataset.targets: {train_dataset.targets.shape}')
# print(f'train_dataset.targets: {type(train_dataset.targets)}')
# print(f'train_dataset.targets: {train_dataset.targets.size}')
# # print(f'train_dataset.targets: {train_dataset.targets}')

(103, 96, 3)
torch.Size([3, 32, 32])
