# Import Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torchvision.models import resnet18
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans 
from tqdm import tqdm  
import os
from PIL import Image
from torch.utils.data import Dataset

# Data Loading and Augmentation

In [2]:
class SimCLRTransform:
    def __init__(self, size=96):
        self.transform = transforms.Compose([
            transforms.RandomResizedCrop(size=size),
            transforms.RandomHorizontalFlip(),
            transforms.RandomApply([transforms.ColorJitter(0.8, 0.8, 0.8, 0.2)], p=0.8),
            transforms.RandomGrayscale(p=0.2),
            transforms.GaussianBlur(kernel_size=int(0.1 * size)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __call__(self, x):
        return self.transform(x), self.transform(x)

In [3]:
class UnlabeledImageDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_files = [os.path.join(root_dir, f) for f in os.listdir(root_dir) if f.endswith(('.png', '.jpg', '.jpeg'))]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        image_path = self.image_files[idx]
        image = Image.open(image_path).convert('RGB')  

        if self.transform:
            image = self.transform(image)

        return image, -1 

train_transform = SimCLRTransform(size=96)

train_dataset = UnlabeledImageDataset(
    root_dir='/kaggle/input/stl10/unlabeled_images',
    transform=train_transform
)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4, drop_last=True)

# Neural Network

In [4]:
class SimCLR(nn.Module):
    def __init__(self, base_encoder, projection_dim=128):
        super(SimCLR, self).__init__()
        self.encoder = base_encoder(pretrained=False)
        self.feature_dim = self.encoder.fc.in_features
        self.encoder.fc = nn.Identity()  # Remove the final classification layer

        # Projection head
        self.projection = nn.Sequential(
            nn.Linear(self.feature_dim, self.feature_dim),
            nn.ReLU(),
            nn.Linear(self.feature_dim, projection_dim)
        )

    def forward(self, x):
        h = self.encoder(x)
        z = self.projection(h)
        return h, z

# Loss Function

In [5]:
class NTXentLoss(nn.Module):
    def __init__(self, temperature=0.5):
        super(NTXentLoss, self).__init__()
        self.temperature = temperature
        self.criterion = nn.CrossEntropyLoss(reduction="sum")

    def forward(self, z_i, z_j):
        N = z_i.size(0)
        z = torch.cat((z_i, z_j), dim=0)
        sim = torch.matmul(z, z.T) / self.temperature

        sim_i_j = torch.diag(sim, N)
        sim_j_i = torch.diag(sim, -N)

        positive_samples = torch.cat((sim_i_j, sim_j_i), dim=0).reshape(2 * N, 1)
        negative_samples = sim[~torch.eye(2 * N, dtype=bool)].reshape(2 * N, -1)

        labels = torch.zeros(2 * N).to(positive_samples.device).long()
        logits = torch.cat((positive_samples, negative_samples), dim=1)
        loss = self.criterion(logits, labels)
        return loss / (2 * N)

# Model Training 

In [6]:
def train_simclr(model, train_loader, optimizer, criterion, epochs=25, device='cuda'):
    model.to(device)
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        train_loader_tqdm = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{epochs}]", leave=False)

        for (x_i, x_j), _ in train_loader_tqdm:
            x_i, x_j = x_i.to(device), x_j.to(device)

            optimizer.zero_grad()
            h_i, z_i = model(x_i)
            h_j, z_j = model(x_j)
            loss = criterion(z_i, z_j)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            train_loader_tqdm.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.3f}")

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SimCLR(base_encoder=resnet18, projection_dim=128).to(device)
optimizer = optim.Adam(model.parameters(), lr=3e-4)
criterion = NTXentLoss(temperature=0.5)



In [8]:
train_simclr(model, train_loader, optimizer, criterion, epochs=25, device=device)
torch.save(model.state_dict(), '/kaggle/working/simclr_stl10.pth')

                                                                          

Epoch [1/25], Loss: 4.331


                                                                          

Epoch [2/25], Loss: 3.075


                                                                          

Epoch [3/25], Loss: 2.588


                                                                          

Epoch [4/25], Loss: 2.345


                                                                          

Epoch [5/25], Loss: 2.188


                                                                          

Epoch [6/25], Loss: 2.063


                                                                          

Epoch [7/25], Loss: 1.976


                                                                          

Epoch [8/25], Loss: 1.914


                                                                          

Epoch [9/25], Loss: 1.858


                                                                           

Epoch [10/25], Loss: 1.806


                                                                           

Epoch [11/25], Loss: 1.761


                                                                          

Epoch [12/25], Loss: 1.725


                                                                          

Epoch [13/25], Loss: 1.685


                                                                           

Epoch [14/25], Loss: 1.651


                                                                           

Epoch [15/25], Loss: 1.625


                                                                           

Epoch [16/25], Loss: 1.604


                                                                           

Epoch [17/25], Loss: 1.574


                                                                           

Epoch [18/25], Loss: 1.554


                                                                           

Epoch [19/25], Loss: 1.537


                                                                           

Epoch [20/25], Loss: 1.521


                                                                           

Epoch [21/25], Loss: 1.506


                                                                          

Epoch [22/25], Loss: 1.490


                                                                          

Epoch [23/25], Loss: 1.472


                                                                           

Epoch [24/25], Loss: 1.455


                                                                           

Epoch [25/25], Loss: 1.446




# Extract Embeddings

In [9]:
def extract_embeddings(model, dataloader, device='cuda'):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for (x_i, x_j), _ in dataloader:
            x_i = x_i.to(device)
            h_i, _ = model(x_i) 
            embeddings.append(h_i.cpu())
    return torch.cat(embeddings, dim=0)

embeddings = extract_embeddings(model, train_loader, device=device)
print(f"Extracted embeddings shape: {embeddings.shape}")

Extracted embeddings shape: torch.Size([99968, 512])


# Determine the Number of Clusters

**Silhouette Score**

In [10]:
sample_size = 20000  
indices = np.random.choice(embeddings.shape[0], size=sample_size, replace=False)
sampled_embeddings = embeddings.numpy()[indices]

silhouette_scores = []
k_values = range(1, 20) 
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(sampled_embeddings)
    score = silhouette_score(sampled_embeddings, labels)
    silhouette_scores.append(score)

# Plot the Silhouette Scores
plt.plot(k_values, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal k (Subsampled Data)')
plt.show()



ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

**Visualize Clusters**

In [None]:
optimal_k = 10  
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings.numpy())

plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=cluster_labels, cmap='viridis', s=1)
plt.title(f"Clusters in SimCLR Embeddings (k={optimal_k})")
plt.colorbar()
plt.show()