In [1]:
import torch
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cuda


# Loading STL-10 Dataset

In [3]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
from PIL import Image

# Define transformations
# transform = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
# ])

# Preprocessing pipeline using CLIP's input size and per-channel statistics.
# NOTE(review): the loaders built from this transform are also fed to the DINO
# ViT, which presumably expects ImageNet statistics -- confirm this is intended.
CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]   # CLIP's mean for RGB channels
CLIP_STD = [0.26862954, 0.26130258, 0.27577711]   # CLIP's std for RGB channels

clip_transform = transforms.Compose([
    # Resize the shorter side to 256 while maintaining aspect ratio
    transforms.Resize(256),
    # Crop the center to 224x224 (the size CLIP expects)
    transforms.CenterCrop(224),
    # Convert the image to a PyTorch tensor
    transforms.ToTensor(),
    # Normalize each channel with the statistics above
    transforms.Normalize(mean=CLIP_MEAN, std=CLIP_STD),
])

# Load STL-10 dataset (train and test sets)
train_dataset = datasets.STL10(root='/kaggle/working', split='train', download=True, transform=clip_transform)
test_dataset = datasets.STL10(root='/kaggle/working', split='test', download=True, transform=clip_transform)

# Split the train set 80/20 into train and validation subsets.
# The split is seeded: features, labels and caption subjects are cached to disk
# by later cells, and an unseeded split would yield a different partition on
# every kernel restart, so those caches would no longer line up with
# train_loader / val_loader. Caches produced before the seed was introduced
# came from an unseeded split and should be regenerated.
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    train_dataset, [train_size, val_size], generator=split_generator
)

batch_size = 256

# Create data loaders for each set. All three share the same settings.
# Shuffling stays off so iteration order is fixed: the DINO features and the
# caption subjects are both produced by iterating train_loader, and a fixed
# order keeps them row-aligned.
loader_kwargs = dict(batch_size=batch_size, shuffle=False, num_workers=4)
train_loader = DataLoader(train_dataset, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)


Downloading http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz to /kaggle/working/stl10_binary.tar.gz


100%|██████████| 2640397119/2640397119 [03:01<00:00, 14578035.22it/s]


Extracting /kaggle/working/stl10_binary.tar.gz to /kaggle/working
Files already downloaded and verified


# Loading a pre-trained ViT trained with DINO

In [4]:
import torch
from torchvision import transforms
from PIL import Image
import torch.hub

# Fetch the DINO pre-trained model from the Facebook AI repository via torch.hub
DINO_REPO = 'facebookresearch/dino:main'
DINO_ARCH = 'dino_vits16'
dino_model = torch.hub.load(DINO_REPO, DINO_ARCH)

# Switch to evaluation mode; as the cell's last expression this also displays the model
dino_model.eval()


Downloading: "https://github.com/facebookresearch/dino/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dino/dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dino_deitsmall16_pretrain.pth
100%|██████████| 82.7M/82.7M [00:00<00:00, 216MB/s] 


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
  (head): Identity()
)

In [5]:
import torch
from sklearn.cluster import KMeans
import numpy as np
from tqdm import tqdm  # For progress bar
import json

def extract_dino_features_and_labels(dino_model, dataloader):
    """Run a model over a data loader and collect unit-norm features plus labels.

    The model is moved to the notebook-level ``device`` and put in eval mode.
    Each batch is forwarded without gradient tracking, and every output row is
    divided by its norm along the last dimension.

    Args:
        dino_model: Model called as ``dino_model(images)``.
        dataloader: Iterable yielding ``(images, labels)`` batches, where
            ``labels`` supports ``.numpy()``.

    Returns:
        Tuple ``(features, labels)``: ``features`` is the per-batch outputs
        concatenated along dim 0 on the CPU, and ``labels`` is the per-batch
        label arrays joined with ``np.hstack``.
    """
    dino_model.to(device)
    dino_model.eval()

    feature_chunks, label_chunks = [], []
    progress = tqdm(dataloader, desc="Extracting DINO features")
    with torch.no_grad():
        for batch_images, batch_labels in progress:
            # Forward pass on the target device
            embeddings = dino_model(batch_images.to(device))
            # Scale each row to unit length
            unit_embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)
            # Keep results on the CPU so accelerator memory is not accumulated
            feature_chunks.append(unit_embeddings.cpu())
            label_chunks.append(batch_labels.numpy())

    # Stitch the per-batch pieces into single containers
    return torch.cat(feature_chunks, dim=0), np.hstack(label_chunks)


# Processing training set

In [6]:
import os
import torch

# Path to the processed data file.
# NOTE: the cache is read from this input path but written to the current
# working directory below, so it is only picked up once the saved file has
# been made available under the input path.
processed_data_path = '/kaggle/input/stl-10-processed/train_data.pt'

# Check if the file exists
if os.path.exists(processed_data_path):
    # Load the preprocessed data
    checkpoint = torch.load(processed_data_path)
    train_features = checkpoint['train_features']
    # Labels are saved as a tensor; convert back to a NumPy array so this
    # branch yields the same type as a fresh extraction.
    train_labels = torch.as_tensor(checkpoint['train_labels']).cpu().numpy()
    print("Loaded train_features and train_labels from train_data.pt.")
else:
    # Extract features and labels using the DINO model
    train_features, train_labels = extract_dino_features_and_labels(dino_model, train_loader)

    # Save the data for future use
    torch.save({
        'train_features': train_features,
        'train_labels': torch.tensor(train_labels)
    }, 'train_data.pt')
    print("Extracted features and labels and saved to train_data.pt.")

Extracting DINO features: 100%|██████████| 16/16 [00:10<00:00,  1.55it/s]

Extracted features and labels and saved to train_data.pt.





# Processing test set

In [7]:
import os
import torch

# Path to the processed data file.
# NOTE: the cache is read from this input path but written to the current
# working directory below, so it is only picked up once the saved file has
# been made available under the input path.
processed_data_path = '/kaggle/input/stl-10-processed/test_data.pt'

# Check if the file exists
if os.path.exists(processed_data_path):
    # Load the preprocessed data
    checkpoint = torch.load(processed_data_path)
    test_features = checkpoint['test_features']
    # Labels are saved as a tensor; convert back to a NumPy array so this
    # branch yields the same type as a fresh extraction.
    test_labels = torch.as_tensor(checkpoint['test_labels']).cpu().numpy()
    print("Loaded test_features and test_labels from test_data.pt.")
else:
    # Extract features and labels using the DINO model
    test_features, test_labels = extract_dino_features_and_labels(dino_model, test_loader)

    # Save the data for future use
    torch.save({
        'test_features': test_features,
        'test_labels': torch.tensor(test_labels)
    }, 'test_data.pt')
    print("Extracted test_features and test_labels and saved to test_data.pt.")


Extracting DINO features: 100%|██████████| 32/32 [00:17<00:00,  1.81it/s]


Extracted test_features and test_features and saved to test_data.pt.


# Processing val set

In [8]:
import os
import torch

# Path to the processed data file.
# NOTE: the cache is read from this input path but written to the current
# working directory below, so it is only picked up once the saved file has
# been made available under the input path.
processed_data_path = '/kaggle/input/stl-10-processed/val_data.pt'

# Check if the file exists
if os.path.exists(processed_data_path):
    # Load the preprocessed data
    checkpoint = torch.load(processed_data_path)
    val_features = checkpoint['val_features']
    # Labels are saved as a tensor; convert back to a NumPy array so this
    # branch yields the same type as a fresh extraction.
    val_labels = torch.as_tensor(checkpoint['val_labels']).cpu().numpy()
    print("Loaded val_features and val_labels from val_data.pt.")
else:
    # Extract features and labels using the DINO model
    val_features, val_labels = extract_dino_features_and_labels(dino_model, val_loader)

    # Save the data for future use
    torch.save({
        'val_features': val_features,
        'val_labels': torch.tensor(val_labels)
    }, 'val_data.pt')
    print("Extracted features and labels and saved to val_data.pt.")


Extracting DINO features: 100%|██████████| 4/4 [00:03<00:00,  1.29it/s]

Extracted features and labels and saved to val_data.pt.





# Functions to generate caption subjects corresponding to images

In [9]:
import torch
# from transformers import processor
from torchvision import transforms

import torch
import numpy as np
from tqdm import tqdm  # For progress bar
from transformers import BlipProcessor, BlipForConditionalGeneration
import spacy
from torchvision import transforms
from sklearn.metrics import accuracy_score
from sklearn.cluster import DBSCAN
from collections import Counter


# Load the BLIP captioning model with its processor, and the spaCy English pipeline
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
blip_model = blip_model.to(device)
nlp = spacy.load("en_core_web_sm")


def generate_captions(images, model, processor, device,
                      mean=(0.48145466, 0.4578275, 0.40821073),
                      std=(0.26862954, 0.26130258, 0.27577711)):
    """Generate captions for a batch of normalized images using BLIP.

    The batch is first mapped back to the [0, 1] pixel range by inverting the
    per-channel normalization, so the images handed to the processor are the
    original pictures rather than normalized values.

    Args:
        images: Float tensor of shape (batch, channels, height, width) that was
            normalized as ``(x - mean) / std`` per channel.
        model: Captioning model exposing ``generate(**inputs, max_new_tokens=...)``.
        processor: Processor that is callable on PIL images and exposes ``decode``.
        device: Device the processed inputs are moved to before generation.
        mean: Per-channel mean used when the images were normalized. Defaults
            to the CLIP statistics used by this notebook's transform.
        std: Per-channel std used when the images were normalized. Defaults to
            the CLIP statistics used by this notebook's transform.

    Returns:
        List with one decoded caption string per image.
    """
    # Invert the normalization: x = x_norm * std + mean, broadcast over (C, 1, 1).
    mean_t = torch.tensor(mean, dtype=images.dtype, device=images.device).view(-1, 1, 1)
    std_t = torch.tensor(std, dtype=images.dtype, device=images.device).view(-1, 1, 1)
    # Clamp to [0, 1] so the byte conversion inside ToPILImage cannot wrap around.
    images = (images * std_t + mean_t).clamp(0.0, 1.0)

    # Convert every image of the batch to PIL
    to_pil = transforms.ToPILImage()
    images = [to_pil(img) for img in images]

    # Prepare the inputs using the processor for the entire batch
    inputs = processor(images=images, return_tensors="pt", padding=True).to(device)

    # Generate captions for the batch
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=10)

    # Decode captions for all images in the batch
    captions = [processor.decode(output, skip_special_tokens=True) for output in outputs]
    return captions

def extract_subjects(captions, nlp):
    """Pick one subject word out of every caption.

    For each caption the first token whose dependency label is ``nsubj`` or
    ``nsubjpass`` is used. When that yields the placeholder ``"outlier"``
    (which is also the value used when no such token exists), the first token
    labelled ``ROOT`` is used instead; if there is none either, the result
    stays ``"outlier"``.

    Args:
        captions: Iterable of caption strings.
        nlp: Callable turning a caption into an iterable of tokens exposing
            ``dep_`` and ``text``.

    Returns:
        List of subject strings, one per caption, in input order.
    """
    subject_deps = ("nsubj", "nsubjpass")
    found = []
    for text in captions:
        parsed = nlp(text)
        # Primary choice: the first grammatical subject
        head = next((tok.text for tok in parsed if tok.dep_ in subject_deps), "outlier")
        if head == "outlier":
            # Backup choice: the first root of the parse
            head = next((tok.text for tok in parsed if tok.dep_ == "ROOT"), "outlier")
        found.append(head)
    return found


def extract_subjects_batch(images):
    """Caption a batch of images with BLIP and return one subject per caption.

    Uses the notebook-level ``blip_model``, ``blip_processor``, ``device`` and
    ``nlp`` objects.
    """
    return extract_subjects(
        generate_captions(images, blip_model, blip_processor, device),
        nlp,
    )

def extract_pseudolabels(data_loader):
    """Collect the caption-derived subject of every image in a data loader.

    Args:
        data_loader: Iterable yielding ``(images, labels)`` batches; the labels
            are ignored.

    Returns:
        Flat list of subject strings in the loader's iteration order.
    """
    collected = []
    batches = tqdm(data_loader, desc="Extracting features and labels")
    with torch.no_grad():
        for batch_images, _ in batches:
            # Move the batch to the target device, then caption it and extract subjects
            collected += extract_subjects_batch(batch_images.to(device))
    return collected
    

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

# Generating subjects and storing them

In [10]:
import os
import json
from collections import Counter

# Location of the previously generated subjects, if any.
# The cache is read from this input path but written to the working directory below.
subjects_path = '/kaggle/input/stl-10-processed/subjects.json'

if not os.path.exists(subjects_path):
    # No cached file: caption the training images and derive the subjects
    subjects = extract_pseudolabels(train_loader)

    # Persist the subjects as JSON for later runs
    with open('subjects.json', 'w') as file:
        json.dump(subjects, file)
    print("subjects saved to subjects.json.")
else:
    # Cached file found: read the subjects back and show their frequencies
    with open(subjects_path, 'r') as file:
        subjects = json.load(file)
    print("subjects loaded from subjects.json:", Counter(subjects))

  self.pid = os.fork()
  self.pid = os.fork()
Extracting features and labels: 100%|██████████| 16/16 [04:19<00:00, 16.22s/it]

subjects saved to subjects.json.



