In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import datasets, transforms, models
from PIL import Image, ImageOps
import pandas as pd
from torch.cuda.amp import GradScaler, autocast


# Run on the GPU when CUDA reports one, otherwise on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

# Per-channel statistics handed to Normalize() in every pipeline below.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Augmentation chain for training images: random crop/flip first, conversion
# to a tensor, then further random colour/geometry perturbations, and finally
# normalisation.
_train_steps = [
    transforms.RandomResizedCrop(32),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor(),
    transforms.RandomGrayscale(p=0.1),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
    transforms.GaussianBlur(3, sigma=(0.1, 2.0)),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.Normalize(mean, std),
]


def _eval_pipeline():
    """Build the deterministic resize -> tensor -> normalise pipeline."""
    return transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])


# One pipeline per split; 'val' and 'test' are separate but identical objects.
data_transforms = {
    'train': transforms.Compose(_train_steps),
    'val': _eval_pipeline(),
    'test': _eval_pipeline(),
}

class TestDataset(Dataset):
    """Unlabelled image dataset that yields ``(image, filename)`` pairs.

    Args:
        root_dir: Directory whose regular files are treated as images.
        transform: Optional callable applied to each RGB ``PIL.Image``.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # os.listdir() returns entries in arbitrary order and also lists
        # sub-directories; sort and keep only regular files so the sample
        # order is reproducible and every entry can actually be opened.
        self.image_files = sorted(
            name for name in os.listdir(root_dir)
            if os.path.isfile(os.path.join(root_dir, name))
        )

    def __len__(self):
        """Return the number of image files found in ``root_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return it together with its filename."""
        file_name = self.image_files[idx]
        img_path = os.path.join(self.root_dir, file_name)
        # The context manager closes the underlying file once the converted
        # copy has been made.
        with Image.open(img_path) as raw:
            image = raw.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, file_name  # Return filename along with the image

data_dir = "/kaggle/input/iith-dl-contest-2024"
train_root = os.path.join(data_dir, 'train', 'train')

# Two ImageFolder views over the same directory: one applies the training
# augmentations, the other the deterministic validation pipeline.
full_train_dataset = datasets.ImageFolder(train_root, transform=data_transforms['train'])
full_val_dataset = datasets.ImageFolder(train_root, transform=data_transforms['val'])
test_dataset = TestDataset(os.path.join(data_dir, 'test', 'test'), transform=data_transforms['test'])

# 80/20 train/validation split.
train_size = int(0.8 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_split = random_split(full_train_dataset, [train_size, val_size])
# Both subsets returned by random_split wrap the SAME dataset object, so
# changing `.dataset.transform` on one of them would also strip the
# augmentation from the training subset. Re-wrap the validation indices around
# the validation view instead.
val_dataset = torch.utils.data.Subset(full_val_dataset, val_split.indices)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)
dataloaders = {'train': train_loader, 'val': val_loader, 'test': test_loader}

# Define EfficientNetV2-XL architecture
class Swish(nn.Module):
    """Activation that multiplies the input by its own sigmoid."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate

class SEBlock(nn.Module):
    """Channel gating: pool each channel to a scalar, pass the vector through
    a two-layer bottleneck ending in a sigmoid, and rescale the input.

    Args:
        channels: Number of channels of the incoming feature map.
        se_ratio: Bottleneck width as a fraction of ``channels`` (at least
            one hidden unit is always kept).
    """

    def __init__(self, channels, se_ratio=0.25):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        bottleneck = max(1, int(channels * se_ratio))
        squeeze = nn.Linear(channels, bottleneck, bias=False)
        excite = nn.Linear(bottleneck, channels, bias=False)
        self.fc = nn.Sequential(squeeze, nn.ReLU(inplace=True), excite, nn.Sigmoid())

    def forward(self, x):
        n, c, _h, _w = x.size()
        pooled = self.avg_pool(x).view(n, c)
        gates = self.fc(pooled).view(n, c, 1, 1)
        # Broadcasting the (n, c, 1, 1) gates scales every spatial position.
        return x * gates

class MBConvBlock(nn.Module):
    """Inverted-bottleneck block: 1x1 expand -> depthwise conv -> SE -> 1x1 project.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by the projection convolution.
        kernel_size: Kernel size of the depthwise convolution (padding is
            ``kernel_size // 2``).
        stride: Stride of the depthwise convolution.
        expand_ratio: Multiplier applied to ``in_channels`` for the expanded width.
        se_ratio: Bottleneck ratio forwarded to the squeeze-and-excitation block.
        drop_rate: Dropout probability applied just before the projection.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, expand_ratio, se_ratio, drop_rate):
        super(MBConvBlock, self).__init__()
        self.stride = stride
        self.drop_rate = drop_rate

        # Expansion phase
        expanded_channels = expand_ratio * in_channels
        self.expand_conv = nn.Conv2d(in_channels, expanded_channels, kernel_size=1, bias=False)
        self.expand_bn = nn.BatchNorm2d(expanded_channels)
        self.expand_activation = Swish()

        # Depthwise convolution phase (groups == channels: one filter per channel)
        self.depthwise_conv = nn.Conv2d(
            expanded_channels,
            expanded_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size // 2),
            groups=expanded_channels,
            bias=False,
        )
        self.depthwise_bn = nn.BatchNorm2d(expanded_channels)
        self.depthwise_activation = Swish()

        # Squeeze and Excitation phase. This is the only SE module the block
        # owns; it operates on the expanded feature map used in forward().
        self.se_block = SEBlock(expanded_channels, se_ratio)

        # Projection phase (no activation after the final batch norm)
        self.project_conv = nn.Conv2d(expanded_channels, out_channels, kernel_size=1, bias=False)
        self.project_bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        """Apply the block; add a skip connection when shapes allow it."""
        out = self.expand_activation(self.expand_bn(self.expand_conv(x)))
        out = self.depthwise_activation(self.depthwise_bn(self.depthwise_conv(out)))
        out = self.se_block(out)
        # Functional dropout is only active while the module is in training mode.
        out = F.dropout(out, self.drop_rate, self.training)
        out = self.project_bn(self.project_conv(out))
        # Residual connection only when the block keeps resolution and width.
        if self.stride == 1 and x.size() == out.size():
            out = torch.add(x, out)
        return out

class EfficientNetV2_XL(nn.Module):
    """Small EfficientNet-style classifier: stem conv -> MBConv stages -> 1x1 head -> linear.

    Args:
        num_classes: Size of the output logit vector.
        width_multiplier: Scales the output channels of every MBConv stage.
        depth_multiplier: Scales the number of repeated blocks per stage
            (rounded up, at least one block per stage).
        dropout_rate: Dropout probability used before the classifier and
            passed to every MBConv block as its ``drop_rate``.
    """

    def __init__(self, num_classes=1000, width_multiplier=1.0, depth_multiplier=1.0, dropout_rate=0.2):
        super(EfficientNetV2_XL, self).__init__()
        import math  # local import: only needed to round the repeat count up

        self.dropout_rate = dropout_rate
        self.conv_stem = nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn_stem = nn.BatchNorm2d(32)
        self.stem_activation = Swish()

        # Each row: [num_repeats, out_channels, kernel_size, stride, expand_ratio, se_ratio]
        block_settings = [
            [1, 16, 3, 1, 1, 0.25],  # MBConv1 block
            [2, 24, 3, 2, 4, 0.25],  # MBConv2 block
#             [2, 40, 5, 2, 4, 0.25],  # MBConv3 block
#             [3, 80, 3, 2, 4, 0.25],  # MBConv4 block
#             [3, 112, 5, 1, 4, 0.25],  # MBConv5 block
#             [4, 192, 5, 2, 4, 0.25],  # MBConv6 block
#             [1, 320, 3, 1, 1, 0.25]   # MBConv7 block
        ]

        self.blocks = nn.ModuleList([])
        in_channels = 32
        # Unpack in the same column order as the table above; reading the rows
        # in any other order turns the repeat count into a kernel size and the
        # kernel size into a stride.
        for num_repeats, out_channels, kernel_size, stride, expand_ratio, se_ratio in block_settings:
            out_channels = int(out_channels * width_multiplier)
            num_repeats = max(1, math.ceil(num_repeats * depth_multiplier))
            for repeat_idx in range(num_repeats):
                # Only the first block of a stage changes resolution; the
                # remaining repeats keep it so their skip connections apply.
                block_stride = stride if repeat_idx == 0 else 1
                self.blocks.append(
                    MBConvBlock(in_channels, out_channels, kernel_size, block_stride,
                                expand_ratio, se_ratio, dropout_rate)
                )
                in_channels = out_channels

        # Final convolution
        self.conv_head = nn.Conv2d(in_channels, 1280, kernel_size=1, bias=False)
        self.bn_head = nn.BatchNorm2d(1280)
        self.head_activation = Swish()

        # Classifier
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(self.dropout_rate)
        self.fc = nn.Linear(1280, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.stem_activation(self.bn_stem(self.conv_stem(x)))
        for block in self.blocks:
            x = block(x)
        x = self.head_activation(self.bn_head(self.conv_head(x)))
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)  # flatten (batch, 1280, 1, 1) -> (batch, 1280)
        x = self.dropout(x)
        x = self.fc(x)
        return x

# Define diverse architectures for ensemble
num_classes = len(full_train_dataset.classes)
models_list = [
    EfficientNetV2_XL(num_classes=num_classes).to(device),
    models.resnet50(num_classes=num_classes).to(device),
    models.resnext101_32x8d(num_classes=num_classes).to(device),
]

# The optimizer, the training call and the inference loop below all operate on
# a single network named `model`; bind it explicitly to the first candidate.
model = models_list[0]

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Cut the learning rate by 10x after 10 epochs without validation-loss improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10, factor=0.1)



def train_model(model, criterion, optimizer, scheduler, dataloaders, num_epochs=10):
    """Train and validate ``model`` with mixed precision, one pass of each per epoch.

    Args:
        model: Network to optimise; batches are moved to the device its
            parameters live on.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer built over ``model``'s parameters.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        dataloaders: Mapping with ``'train'`` and ``'val'`` loaders that yield
            ``(inputs, labels)`` batches.
        num_epochs: Number of train+validation passes.

    Returns:
        The same ``model`` object after training.
    """
    scaler = GradScaler()  # for mixed precision training
    # Follow the model instead of relying on a module-level device global.
    model_device = next(model.parameters()).device

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # train(False) is equivalent to eval()

            running_loss = 0.0
            correct_predictions = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(model_device)
                labels = labels.to(model_device)

                if is_train:
                    optimizer.zero_grad()

                with torch.set_grad_enabled(is_train):
                    with autocast():
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                    preds = outputs.argmax(dim=1)

                    if is_train:
                        scaler.scale(loss).backward()
                        # Gradients are still multiplied by the loss scale
                        # here; unscale them first so max_norm is compared
                        # against their true magnitude.
                        scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
                        scaler.step(optimizer)
                        scaler.update()

                # Weight by batch size so the epoch loss is a per-sample mean.
                running_loss += loss.item() * inputs.size(0)
                correct_predictions += (preds == labels).sum().item()

            num_samples = len(dataloaders[phase].dataset)
            epoch_loss = running_loss / num_samples
            epoch_acc = correct_predictions / num_samples

            print(f'Epoch {epoch}/{num_epochs - 1}, {phase} Loss: {epoch_loss:.4f}, {phase} Acc: {epoch_acc:.4f}')

            if phase == 'val':
                scheduler.step(epoch_loss)

    return model
model = train_model(model, loss_fn, optimizer, scheduler, dataloaders, num_epochs=40)

model.eval()
# Predicted indices are positions in the ImageFolder's class list, so that
# list is the lookup table from index to category name.
class_names = full_train_dataset.classes
predictions = []

with torch.no_grad():
    for inputs, filenames in dataloaders['test']:
        inputs = inputs.to(device)
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)
        for filename, prediction in zip(filenames, predicted.cpu().tolist()):
            predictions.append((filename, class_names[prediction]))

# One row per test file, indexed by filename.
df = pd.DataFrame(predictions, columns=['ID', 'Category'])
df.set_index('ID', inplace=True)
df.to_csv('submission6test.csv')

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import datasets, transforms
from PIL import Image, ImageOps
import pandas as pd
from torch.cuda.amp import GradScaler, autocast


# Run on the GPU when CUDA reports one, otherwise on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

# Per-channel statistics handed to Normalize() in every pipeline below.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Augmentation chain for training images: random crop/flip first, conversion
# to a tensor, then further random colour/geometry perturbations, and finally
# normalisation.
_train_steps = [
    transforms.RandomResizedCrop(32),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor(),
    transforms.RandomGrayscale(p=0.1),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
    transforms.GaussianBlur(3, sigma=(0.1, 2.0)),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.Normalize(mean, std),
]


def _eval_pipeline():
    """Build the deterministic resize -> tensor -> normalise pipeline."""
    return transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])


# One pipeline per split; 'val' and 'test' are separate but identical objects.
data_transforms = {
    'train': transforms.Compose(_train_steps),
    'val': _eval_pipeline(),
    'test': _eval_pipeline(),
}

class TestDataset(Dataset):
    """Unlabelled image dataset that yields ``(image, filename)`` pairs.

    Args:
        root_dir: Directory whose regular files are treated as images.
        transform: Optional callable applied to each RGB ``PIL.Image``.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # os.listdir() returns entries in arbitrary order and also lists
        # sub-directories; sort and keep only regular files so the sample
        # order is reproducible and every entry can actually be opened.
        self.image_files = sorted(
            name for name in os.listdir(root_dir)
            if os.path.isfile(os.path.join(root_dir, name))
        )

    def __len__(self):
        """Return the number of image files found in ``root_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return it together with its filename."""
        file_name = self.image_files[idx]
        img_path = os.path.join(self.root_dir, file_name)
        # The context manager closes the underlying file once the converted
        # copy has been made.
        with Image.open(img_path) as raw:
            image = raw.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, file_name  # Return filename along with the image

data_dir = "/kaggle/input/iith-dl-contest-2024"
train_root = os.path.join(data_dir, 'train', 'train')

# Two ImageFolder views over the same directory: one applies the training
# augmentations, the other the deterministic validation pipeline.
full_train_dataset = datasets.ImageFolder(train_root, transform=data_transforms['train'])
full_val_dataset = datasets.ImageFolder(train_root, transform=data_transforms['val'])
test_dataset = TestDataset(os.path.join(data_dir, 'test', 'test'), transform=data_transforms['test'])

# 90/10 train/validation split.
train_size = int(0.9 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_split = random_split(full_train_dataset, [train_size, val_size])
# Both subsets returned by random_split wrap the SAME dataset object, so
# changing `.dataset.transform` on one of them would also strip the
# augmentation from the training subset. Re-wrap the validation indices around
# the validation view instead.
val_dataset = torch.utils.data.Subset(full_val_dataset, val_split.indices)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=4)
dataloaders = {'train': train_loader, 'val': val_loader, 'test': test_loader}

# Define EfficientNetV2-XL architecture
class Swish(nn.Module):
    """Activation that multiplies the input by its own sigmoid."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate

class SEBlock(nn.Module):
    """Channel gating: pool each channel to a scalar, pass the vector through
    a two-layer bottleneck ending in a sigmoid, and rescale the input.

    Args:
        channels: Number of channels of the incoming feature map.
        se_ratio: Bottleneck width as a fraction of ``channels`` (at least
            one hidden unit is always kept).
    """

    def __init__(self, channels, se_ratio=0.25):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        bottleneck = max(1, int(channels * se_ratio))
        squeeze = nn.Linear(channels, bottleneck, bias=False)
        excite = nn.Linear(bottleneck, channels, bias=False)
        self.fc = nn.Sequential(squeeze, nn.ReLU(inplace=True), excite, nn.Sigmoid())

    def forward(self, x):
        n, c, _h, _w = x.size()
        pooled = self.avg_pool(x).view(n, c)
        gates = self.fc(pooled).view(n, c, 1, 1)
        # Broadcasting the (n, c, 1, 1) gates scales every spatial position.
        return x * gates

class MBConvBlock(nn.Module):
    """Inverted-bottleneck block: 1x1 expand -> depthwise conv -> SE -> 1x1 project.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by the projection convolution.
        kernel_size: Kernel size of the depthwise convolution (padding is
            ``kernel_size // 2``).
        stride: Stride of the depthwise convolution.
        expand_ratio: Multiplier applied to ``in_channels`` for the expanded width.
        se_ratio: Bottleneck ratio forwarded to the squeeze-and-excitation block.
        drop_rate: Dropout probability applied just before the projection.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, expand_ratio, se_ratio, drop_rate):
        super(MBConvBlock, self).__init__()
        self.stride = stride
        self.drop_rate = drop_rate

        # Expansion phase
        expanded_channels = expand_ratio * in_channels
        self.expand_conv = nn.Conv2d(in_channels, expanded_channels, kernel_size=1, bias=False)
        self.expand_bn = nn.BatchNorm2d(expanded_channels)
        self.expand_activation = Swish()

        # Depthwise convolution phase (groups == channels: one filter per channel)
        self.depthwise_conv = nn.Conv2d(
            expanded_channels,
            expanded_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size // 2),
            groups=expanded_channels,
            bias=False,
        )
        self.depthwise_bn = nn.BatchNorm2d(expanded_channels)
        self.depthwise_activation = Swish()

        # Squeeze and Excitation phase. This is the only SE module the block
        # owns; it operates on the expanded feature map used in forward().
        self.se_block = SEBlock(expanded_channels, se_ratio)

        # Projection phase (no activation after the final batch norm)
        self.project_conv = nn.Conv2d(expanded_channels, out_channels, kernel_size=1, bias=False)
        self.project_bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        """Apply the block; add a skip connection when shapes allow it."""
        out = self.expand_activation(self.expand_bn(self.expand_conv(x)))
        out = self.depthwise_activation(self.depthwise_bn(self.depthwise_conv(out)))
        out = self.se_block(out)
        # Functional dropout is only active while the module is in training mode.
        out = F.dropout(out, self.drop_rate, self.training)
        out = self.project_bn(self.project_conv(out))
        # Residual connection only when the block keeps resolution and width.
        if self.stride == 1 and x.size() == out.size():
            out = torch.add(x, out)
        return out

class EfficientNetV2_XL(nn.Module):
    """Small EfficientNet-style classifier: stem conv -> MBConv stages -> 1x1 head -> linear.

    Args:
        num_classes: Size of the output logit vector.
        width_multiplier: Scales the output channels of every MBConv stage.
        depth_multiplier: Scales the number of repeated blocks per stage
            (rounded up, at least one block per stage).
        dropout_rate: Dropout probability used before the classifier and
            passed to every MBConv block as its ``drop_rate``.
    """

    def __init__(self, num_classes=1000, width_multiplier=1.0, depth_multiplier=1.0, dropout_rate=0.2):
        super(EfficientNetV2_XL, self).__init__()
        import math  # local import: only needed to round the repeat count up

        self.dropout_rate = dropout_rate
        self.conv_stem = nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn_stem = nn.BatchNorm2d(32)
        self.stem_activation = Swish()

        # Each row: [num_repeats, out_channels, kernel_size, stride, expand_ratio, se_ratio]
        block_settings = [
            [1, 16, 3, 1, 1, 0.25],  # MBConv1 block
#             [2, 24, 3, 2, 4, 0.25],  # MBConv2 block
#             [2, 40, 5, 2, 4, 0.25],  # MBConv3 block
#             [3, 80, 3, 2, 4, 0.25],  # MBConv4 block
#             [3, 112, 5, 1, 4, 0.25],  # MBConv5 block
#             [4, 192, 5, 2, 4, 0.25],  # MBConv6 block
#             [1, 320, 3, 1, 1, 0.25]   # MBConv7 block
        ]

        self.blocks = nn.ModuleList([])
        in_channels = 32
        # Unpack in the same column order as the table above; reading the rows
        # in any other order turns the repeat count into a kernel size and the
        # kernel size into a stride.
        for num_repeats, out_channels, kernel_size, stride, expand_ratio, se_ratio in block_settings:
            out_channels = int(out_channels * width_multiplier)
            num_repeats = max(1, math.ceil(num_repeats * depth_multiplier))
            for repeat_idx in range(num_repeats):
                # Only the first block of a stage changes resolution; the
                # remaining repeats keep it so their skip connections apply.
                block_stride = stride if repeat_idx == 0 else 1
                self.blocks.append(
                    MBConvBlock(in_channels, out_channels, kernel_size, block_stride,
                                expand_ratio, se_ratio, dropout_rate)
                )
                in_channels = out_channels

        # Final convolution
        self.conv_head = nn.Conv2d(in_channels, 1280, kernel_size=1, bias=False)
        self.bn_head = nn.BatchNorm2d(1280)
        self.head_activation = Swish()

        # Classifier
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(self.dropout_rate)
        self.fc = nn.Linear(1280, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.stem_activation(self.bn_stem(self.conv_stem(x)))
        for block in self.blocks:
            x = block(x)
        x = self.head_activation(self.bn_head(self.conv_head(x)))
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)  # flatten (batch, 1280, 1, 1) -> (batch, 1280)
        x = self.dropout(x)
        x = self.fc(x)
        return x

# Define diverse architectures for ensemble
# This cell's import line only pulls `datasets` and `transforms` from
# torchvision, so bring `models` into scope here.
from torchvision import models

num_classes = len(full_train_dataset.classes)
models_list = [
    EfficientNetV2_XL(num_classes=num_classes).to(device),
    models.resnet50(num_classes=num_classes).to(device),
    models.resnext101_32x8d(num_classes=num_classes).to(device),
]

# The optimizer, the training call and the inference loop below all operate on
# a single network named `model`; bind it explicitly to the first candidate.
model = models_list[0]

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Cut the learning rate by 10x after 3 epochs without validation-loss improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.1)



def train_model(model, criterion, optimizer, scheduler, dataloaders, num_epochs=10):
    """Train and validate ``model`` with mixed precision, one pass of each per epoch.

    Args:
        model: Network to optimise; batches are moved to the device its
            parameters live on.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer built over ``model``'s parameters.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        dataloaders: Mapping with ``'train'`` and ``'val'`` loaders that yield
            ``(inputs, labels)`` batches.
        num_epochs: Number of train+validation passes.

    Returns:
        The same ``model`` object after training.
    """
    scaler = GradScaler()  # for mixed precision training
    # Follow the model instead of relying on a module-level device global.
    model_device = next(model.parameters()).device

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # train(False) is equivalent to eval()

            running_loss = 0.0
            correct_predictions = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(model_device)
                labels = labels.to(model_device)

                if is_train:
                    optimizer.zero_grad()

                with torch.set_grad_enabled(is_train):
                    with autocast():
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                    preds = outputs.argmax(dim=1)

                    if is_train:
                        scaler.scale(loss).backward()
                        # Gradients are still multiplied by the loss scale
                        # here; unscale them first so max_norm is compared
                        # against their true magnitude.
                        scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
                        scaler.step(optimizer)
                        scaler.update()

                # Weight by batch size so the epoch loss is a per-sample mean.
                running_loss += loss.item() * inputs.size(0)
                correct_predictions += (preds == labels).sum().item()

            num_samples = len(dataloaders[phase].dataset)
            epoch_loss = running_loss / num_samples
            epoch_acc = correct_predictions / num_samples

            print(f'Epoch {epoch}/{num_epochs - 1}, {phase} Loss: {epoch_loss:.4f}, {phase} Acc: {epoch_acc:.4f}')

            if phase == 'val':
                scheduler.step(epoch_loss)

    return model
model = train_model(model, loss_fn, optimizer, scheduler, dataloaders, num_epochs=15 )

model.eval()
# Predicted indices are positions in the ImageFolder's class list, so that
# list is the lookup table from index to category name.
class_names = full_train_dataset.classes
predictions = []

with torch.no_grad():
    for inputs, filenames in dataloaders['test']:
        inputs = inputs.to(device)
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)
        for filename, prediction in zip(filenames, predicted.cpu().tolist()):
            predictions.append((filename, class_names[prediction]))

# One row per test file, indexed by filename.
df = pd.DataFrame(predictions, columns=['ID', 'Category'])
df.set_index('ID', inplace=True)
df.to_csv('submission6test.csv')