In [1]:
import torch
import torchvision.models as models
import numpy as np
import pandas as pd
# Example: Using ResNet50 as the CNN backbone
backbone = models.resnet50(pretrained=True)
# Replace the final fully connected layer with an identity transform
backbone.fc = torch.nn.Identity()



In [2]:
class ProjectionHead(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim=512, output_dim=128):
        super(ProjectionHead, self).__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Example: Creating a projection head
# Assuming the output features of your backbone has 2048 dimensions
projection_head = ProjectionHead(input_dim=2048)


In [3]:
class NTXentLoss(torch.nn.Module):
    def __init__(self, temperature, device):
        super(NTXentLoss, self).__init__()
        self.temperature = temperature
        self.device = device
        self.criterion = torch.nn.CrossEntropyLoss().to(device)

    def forward(self, z_i, z_j):
        N, Z = z_i.size()  # Batch size and feature dimension

        # Concatenate the positive pairs
        z = torch.cat((z_i, z_j), dim=0)

        # Calculate cosine similarity
        sim = torch.mm(z, z.T) / self.temperature
        sim_i_j = torch.diag(sim, N)
        sim_j_i = torch.diag(sim, -N)

        # Create positive and negative masks
        positive_mask = torch.cat((sim_j_i, sim_i_j), dim=0).reshape(2 * N, 1)
        negative_mask = sim > -1e6  # Mask to remove self-similarity

        labels = torch.from_numpy(np.array([range(N), range(N)])).view(2 * N).to(self.device)
        loss = self.criterion(sim, labels)

        return loss

# Example: Creating the NT-Xent Loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
contrastive_loss = NTXentLoss(temperature=0.5, device=device)


In [4]:
class SimCLR(torch.nn.Module):
    def __init__(self, backbone, projection_head):
        super(SimCLR, self).__init__()
        self.backbone = backbone
        self.projection_head = projection_head

    def forward(self, x_i, x_j):
        h_i = self.backbone(x_i)
        h_j = self.backbone(x_j)

        z_i = self.projection_head(h_i)
        z_j = self.projection_head(h_j)

        return z_i, z_j

# Instantiate the SimCLR model
simclr_model = SimCLR(backbone, projection_head)


In [5]:
# Add a linear layer for classification (after training SimCLR model)
class Classifier(torch.nn.Module):
    def __init__(self, feature_dim, num_classes):
        super(Classifier, self).__init__()
        self.fc = torch.nn.Linear(feature_dim, num_classes)

    def forward(self, x):
        x = self.fc(x)
        return x


num_classes = 10  # Set the number of classes in UrbanSound8K
feature_dim= 2048
classifier = Classifier(feature_dim, num_classes).to(device)

In [6]:
from google.colab import files

files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"pranked03","key":"0b86bc00fa5a20d3b6f50f967c14a1a0"}'}

In [7]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [8]:
! chmod 600 ~/.kaggle/kaggle.json

In [9]:
!kaggle datasets download -d pranked03/urbansound8k-mel-spectrogram-images

Downloading urbansound8k-mel-spectrogram-images.zip to /content
100% 897M/897M [00:40<00:00, 24.7MB/s]
100% 897M/897M [00:40<00:00, 23.2MB/s]


In [10]:
! unzip urbansound8k-mel-spectrogram-images.zip -d images

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: images/archive/fold4/151877-5-1-0.png  
  inflating: images/archive/fold4/154758-5-0-0.png  
  inflating: images/archive/fold4/154758-5-0-1.png  
  inflating: images/archive/fold4/154758-5-0-10.png  
  inflating: images/archive/fold4/154758-5-0-11.png  
  inflating: images/archive/fold4/154758-5-0-12.png  
  inflating: images/archive/fold4/154758-5-0-13.png  
  inflating: images/archive/fold4/154758-5-0-14.png  
  inflating: images/archive/fold4/154758-5-0-15.png  
  inflating: images/archive/fold4/154758-5-0-16.png  
  inflating: images/archive/fold4/154758-5-0-17.png  
  inflating: images/archive/fold4/154758-5-0-18.png  
  inflating: images/archive/fold4/154758-5-0-19.png  
  inflating: images/archive/fold4/154758-5-0-2.png  
  inflating: images/archive/fold4/154758-5-0-20.png  
  inflating: images/archive/fold4/154758-5-0-21.png  
  inflating: images/archive/fold4/154758-5-0-3.png  
  inflating: images/ar

In [6]:
import torchvision.transforms as transforms

def get_simclr_transformations(size, s=1):
    """Return a set of data transformations for SimCLR.

    Args:
    - size (int): Size of the square crop.
    - s (float): Strength of color jitter, typically between 0.5 and 1.5.

    Returns:
    - A torchvision transforms module.
    """
    color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s)

    data_transforms = transforms.Compose([
        transforms.RandomResizedCrop(size=size),
        transforms.RandomHorizontalFlip(),
        color_jitter,
        transforms.RandomGrayscale(p=0.2),
        transforms.ToTensor(),
    ])
    return data_transforms

# Example usage
transform = get_simclr_transformations(size=224)


In [7]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os

class UrbanSoundDataset(Dataset):
    def __init__(self, root_dir, fold, csv_file, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.fold = fold
        self.annotations = pd.read_csv(csv_file)
        # Filter the annotations for the current fold
        self.current_fold_annotations = self.annotations[self.annotations['fold'] == self.fold]

    def __len__(self):
        return len(self.current_fold_annotations)

    def __getitem__(self, idx):
        img_filename = self.current_fold_annotations.iloc[idx]['slice_file_name']
        img_path = os.path.join(self.root_dir, f'fold{self.fold}', img_filename)
        image = Image.open(img_path).convert('RGB')
        label = self.current_fold_annotations.iloc[idx]['classID']

        # Apply the transformation twice to get two augmented versions of the same image
        xi = self.transform(image)
        xj = self.transform(image)

        return xi, xj, label


# Instantiate the dataset
dataset = UrbanSoundDataset(root_dir='./images/archive/', fold=1, csv_file="./images/archive/UrbanSound8K.csv", transform=transform)

# DataLoader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
num_epochs = 15
num_folds = 10
root_dir = './images/archive/'
feature_dim = 2048
num_classes = 10
batch_size = 32
csv_file = "./images/archive/UrbanSound8K.csv"
#base_lr = 0.3 * (batch_size / 256)  # Adjust batch_size according to your setup
base_lr = 0.001
weight_decay = 1e-6

# Training and validation loop
for fold in range(num_folds):
    print(f"Starting fold {fold+1}")

    # Setup training and validation data loaders
    train_dataset = UrbanSoundDataset(root_dir=root_dir, fold=fold+1, csv_file=csv_file, transform=transform)
    val_dataset = UrbanSoundDataset(root_dir=root_dir, fold=fold+1, csv_file=csv_file, transform=transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    # Initialize model, optimizer, and loss for each fold
    simclr_model = SimCLR(backbone, projection_head).to(device)
    classifier = Classifier(feature_dim, num_classes).to(device)
    optimizer = torch.optim.Adam(list(simclr_model.parameters()) + list(classifier.parameters()), lr=0.001)#, weight_decay=weight_decay)
    contrastive_loss = NTXentLoss(temperature=0.5, device=device)
    #scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0)

    # Training loop for the current fold
    for epoch in range(num_epochs):
        simclr_model.train()
        classifier.train()
        train_loss = 0
        """
        if epoch < 10:
            lr_scale = min(1., float(epoch + 1) / 10.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * base_lr"""

        for (xi, xj, labels) in train_loader:
            xi, xj, labels = xi.to(device), xj.to(device), labels.to(device)

            # Forward pass
            zi, zj = simclr_model(xi, xj)

            # Compute contrastive loss
            features = simclr_model.backbone(xi)  # Get features from one of the augmented images
            classifier_output = classifier(features)

            loss_contrastive = contrastive_loss(zi, zj)
            loss_classifier = torch.nn.functional.cross_entropy(classifier_output, labels)

            loss = loss_contrastive + loss_classifier

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #scheduler.step()

            train_loss += loss.item()

        # Optional: Print the current learning rate
        #current_lr = scheduler.get_last_lr()[0]
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss/len(train_loader)}")
        #print(f"Epoch [{epoch+1}/{num_epochs}], Current LR: {current_lr}")

        # Validation step
        simclr_model.eval()
        classifier.eval()
        val_accuracy = 0
        total = 0
        correct = 0
        with torch.no_grad():
            val_loss = 0
            for (xi, xj, labels) in val_loader:
                xi, xj, labels = xi.to(device), xj.to(device), labels.to(device)
                zi, zj = simclr_model(xi, xj)
                loss_contrastive = contrastive_loss(zi, zj)

                features = simclr_model.backbone(xi)
                outputs = classifier(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                val_loss += loss.item()

            print(f"Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss/len(val_loader)}")
            accuracy = 100 * correct / total
            print(f"Fold {fold+1}, Validation Accuracy: {accuracy}%")

    # Save model after each fold
    torch.save({'simclr_model': simclr_model.state_dict(),
                'classifier': classifier.state_dict()},
               f'simclr_classifier_urbansound8k_fold{fold+1}.pth')


Starting fold 1
Epoch [1/15], Training Loss: 7.642238582883563
Epoch [1/15], Validation Loss: 4.853926658630371
Fold 1, Validation Accuracy: 11.56930126002291%
Epoch [2/15], Training Loss: 5.635459618909018
Epoch [2/15], Validation Loss: 3.995060682296753
Fold 1, Validation Accuracy: 25.65864833906071%
Epoch [3/15], Training Loss: 5.3305262838091165
Epoch [3/15], Validation Loss: 5.523021697998047
Fold 1, Validation Accuracy: 28.98052691867125%
Epoch [4/15], Training Loss: 4.758390307426453
Epoch [4/15], Validation Loss: 4.173895835876465
Fold 1, Validation Accuracy: 26.345933562428407%
Epoch [5/15], Training Loss: 4.634980235780988
Epoch [5/15], Validation Loss: 4.515435218811035
Fold 1, Validation Accuracy: 43.4135166093929%
Epoch [6/15], Training Loss: 4.302809877055032
Epoch [6/15], Validation Loss: 3.054222583770752
Fold 1, Validation Accuracy: 41.92439862542955%
Epoch [7/15], Training Loss: 4.167481677872794
Epoch [7/15], Validation Loss: 5.043528079986572
Fold 1, Validation Accu