In [None]:
### Only do it for the first time
!unzip /kaggle/input/dogs-vs-cats/train.zip 

In [1]:
# All imports
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

from PIL import Image
import os
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# All constants here
DIR = '/kaggle/working/train/'

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
print(device)

cuda


In [5]:
print(f"Number of images: {len(os.listdir(DIR))}")

Number of images: 25000


In [6]:
IMAGE_DIM = 128
CHANNEL_DIM = 3

In [7]:
# current goal is to work with 10k images
from torchvision import transforms
required_cat_count = 12000
required_dog_count = 12000
total_size = required_cat_count + required_dog_count 
index = 0
data = torch.zeros([required_cat_count + required_dog_count, 
                    CHANNEL_DIM,IMAGE_DIM, IMAGE_DIM])
targets = []
for i,filename in enumerate(os.listdir(DIR)):
    animal = 'cat'
    if filename.startswith('cat'):
        target = 0
        if required_cat_count <= 0:
            continue
        required_cat_count -= 1
    else:
        target = 1
        animal = 'dog'
        if required_dog_count <= 0:
            continue
        required_dog_count -= 1
    file_path = DIR + '/' + filename
    image = Image.open(file_path).resize((IMAGE_DIM, IMAGE_DIM))
    image_tensor = transforms.ToTensor()(image)
    '''if index % 100 == 0:
        plt.imshow(image_tensor.permute(1,2,0))
        plt.title(animal)
        plt.show()'''
    data[index] = image_tensor.to(device)
    index = index + 1
    targets.append(target)

In [8]:
data = data.to(device)
targets = torch.tensor(targets).type(torch.float32).to(device)

In [9]:
print(data.shape, targets.shape)
print(required_cat_count, required_dog_count)

torch.Size([24000, 3, 128, 128]) torch.Size([24000])
0 0


In [10]:
def train_test_split_data(data, target, test_size=0.1, random_state=None):
    """
    Splits a dataset into training and testing sets, and optionally splits the training set into mini-batches.

    Args:
        data (torch.Tensor): The input data tensor, with shape (num_samples, height, width, channels).
        target (torch.Tensor): The target tensor, with shape (num_samples, 1).
        test_size (float): The proportion of the data to include in the test set.
        random_state (int): The random seed to use for shuffling the data.
        batch_size (int): The size of the mini-batches to create from the training set.

    Returns:
        A tuple (X_train_batches, X_test_batches, y_train_batches, y_test_batches) if batch_size is not None,
        otherwise a tuple (X_train, X_test, y_train, y_test).
    """
    # Shuffle the data
    if random_state is not None:
        torch.manual_seed(random_state)
    shuffled_indices = torch.randperm(data.shape[0])
    data_shuffled = data[shuffled_indices]
    target_shuffled = target[shuffled_indices]

    # Split the shuffled dataset into training and testing sets
    test_size = int(test_size * data.shape[0])
    X_test = data_shuffled[:test_size]
    y_test = target_shuffled[:test_size]
    X_train = data_shuffled[test_size:]
    y_train = target_shuffled[test_size:]
    return X_train, X_test, y_train, y_test

In [11]:
SPLIT = 0.2
BATCH = 256
SEED = 42

In [12]:
X_train, X_test, y_train, y_test = train_test_split_data(data, targets, SPLIT, SEED)

In [13]:
# flush or remove unwanted tensors from GPU
del data
del targets
torch.cuda.empty_cache()

In [14]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([19200, 3, 128, 128]),
 torch.Size([4800, 3, 128, 128]),
 torch.Size([19200]),
 torch.Size([4800]))

In [15]:
import random

@torch.no_grad()
def get_batch(split):
    dataset = {'train': (X_train, y_train),
              'test': (X_test, y_test)}
    X, y = dataset[split]
    idx = random.randint(0, X.shape[0]-BATCH)
    Xb,yb = X[idx:idx+BATCH], y[idx:idx+BATCH]
    return Xb, yb

In [16]:
class CNN(nn.Module):
    def __init__(self, output_dim):
        super(CNN, self).__init__()
        self.feature_extraction_layers = nn.Sequential(nn.Conv2d(CHANNEL_DIM, 16, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(16), # Use batch normalization to speed up training and reduce overfitting 
                                                       nn.ReLU(), # Use ReLU activation function for faster convergence and sparsity 
                                                       nn.MaxPool2d((2,2) ,stride=2),
                                                       
                                                       nn.Conv2d(16, 32, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(32), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2), 
                                                       nn.Conv2d(32, 64, kernel_size= (3,3), padding=1),
                                                       nn.BatchNorm2d(64), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2), 
                                                       nn.Conv2d(64, 128, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(128), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2),
                                                       nn.Conv2d(128, 256, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(256), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2)) 
        self.flatten = nn.Flatten() 
        self.dropout = nn.Dropout(0.8) # Increase the dropout rate to prevent overfitting 
        self.connected_layer = nn.Linear(4096, 512) # Adjust the input dimension of the linear layer according to the output of the convolutional layers 
        self.output_layer = nn.Linear(512, output_dim)
        
    def forward(self, x):
        feature_extraction_layer_output = self.feature_extraction_layers(x)
        flatten_output = self.flatten(feature_extraction_layer_output)
        connected_layer_output = self.connected_layer(self.dropout(flatten_output))
        output = self.output_layer(self.dropout(connected_layer_output))
        return output

In [17]:
@torch.no_grad()
def evaluate(model, X, y):
    model.eval()
    logits = model(X)
    probs = torch.sigmoid(logits)
    loss = F.binary_cross_entropy(probs, y.view(-1,1))
    return loss

In [18]:
cnn = CNN(1).to(device)
for p in cnn.parameters():
    p.requires_grad = True

In [19]:
print(sum(p.nelement() for p in cnn.parameters()) / 1e6, "M")
print(sum(p.nelement() for p in cnn.parameters()))

2.491777 M
2491777


In [20]:
logging_interval = 1000
interval_average_for_plot = 1500

In [None]:
#cnn = torch.load('/kaggle/working/cnn-base-2.hdf5')

In [None]:
from torch.nn import functional as F
ITERATIONS = 15000
LR = 1e-4
WEIGHT_DECAY = 3

losses_train = []
losses_test = []
optim = torch.optim.AdamW(cnn.parameters(),
                          lr = LR,
                         weight_decay=WEIGHT_DECAY)

for i in range(ITERATIONS):
    Xb_train, yb_train = get_batch('train')
    Xb_test, yb_test = get_batch('test')
    cnn.train()
    logits = cnn(Xb_train)
    probs = torch.sigmoid(logits)
    
    '''Reset gradients to zero (old school)
    for p in slp.parameters():
        p.grad = None'''
    optim.zero_grad(set_to_none=True)
    
    #Compute loss and back propagate
    lossb_train = F.binary_cross_entropy(probs, yb_train.view(-1,1))
    lossb_train.backward()
    
    #Test loss
    lossb_test = evaluate(cnn, Xb_test, yb_test)
    
    cnn.train()
    optim.step()
    ''' Update gradients (old school)
    for p in slp.parameters():
        p.data -= LR * p.grad'''
        
    # Verbose logging at every iteration
    if i%logging_interval == 0:
        print(f"Loss at iteration {i} - train loss: {lossb_train.item()}, test loss: {lossb_test.item()}")
    losses_train.append(lossb_train.item())
    losses_test.append(lossb_test.item())

In [None]:
plt.plot(torch.tensor(losses_train).view(-1, interval_average_for_plot).mean(1))
plt.plot(torch.tensor(losses_test).view(-1, interval_average_for_plot).mean(1))
plt.show()

#### 1. CNN Base model - v1

##### Saved file
/kaggle/working/cnn-base.hdf5

##### Dataset
```
IMAGE_DIM = 128
CHANNEL_DIM = 3
required_cat_count = 2000
required_dog_count = 2000
```
##### Hyperparameters
```
ITERATIONS = 300
LR = 1e-3
WEIGHT_DECAY = 0.8
```

##### Architecture
```torch.manual_seed(1234)
class CNN(nn.Module):
    def __init__(self, output_dim):
        super(CNN, self).__init__()
        self.feature_extraction_layers = nn.Sequential(nn.Conv2d(CHANNEL_DIM, 16, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(16), # Use batch normalization to speed up training and reduce overfitting 
                                                       nn.ReLU(),
                                                       nn.MaxPool2d((2,2) ,stride=2),
                                                       nn.Conv2d(16, 32, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(32), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2), 
                                                       nn.Conv2d(32, 64, kernel_size= (3,3), padding=1),
                                                       nn.BatchNorm2d(64), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2), 
                                                       nn.Conv2d(64, 128, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(128), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2),
                                                       nn.Conv2d(128, 256, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(256), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2)) 
        self.flatten = nn.Flatten() 
        self.dropout = nn.Dropout(0.8) # Increase the dropout rate to prevent overfitting 
        self.connected_layer = nn.Linear(4096, 512) # Adjust the input dimension of the linear layer according to the output of the convolutional layers 
        self.output_layer = nn.Linear(512, output_dim)
        
    def forward(self, x):
        feature_extraction_layer_output = self.feature_extraction_layers(x)
        flatten_output = self.flatten(feature_extraction_layer_output)
        connected_layer_output = self.connected_layer(self.dropout(flatten_output))
        output = self.output_layer(self.dropout(connected_layer_output))
        return output
```
#### Model Learning history
* Loss at iteration 0 - train loss: 1.111029028892517, test loss: 0.6931840181350708
* Loss at iteration 50 - train loss: 1.0523931980133057, test loss: 0.7053629159927368
* Loss at iteration 100 - train loss: 0.6232627630233765, test loss: 0.6956414580345154
* Loss at iteration 150 - train loss: 0.6970182061195374, test loss: 0.5816259384155273
* Loss at iteration 200 - train loss: 0.5333189964294434, test loss: 0.49696922302246094
* Loss at iteration 250 - train loss: 0.47810685634613037, test loss: 0.4219547510147095

#### Model Performance 
##### Classification Report
```
precision    recall  f1-score   support

         cat       0.78      0.90      0.84       408
         dog       0.88      0.74      0.80       392

    accuracy                           0.82       800
   macro avg       0.83      0.82      0.82       800
weighted avg       0.83      0.82      0.82       800
```



#### 2. CNN Base model - v2

##### Saved file
/kaggle/working/cnn-base-2.hdf5

##### Dataset
```
IMAGE_DIM = 128
CHANNEL_DIM = 3
required_cat_count = 5000
required_dog_count = 5000
```
##### Hyperparameters
```
ITERATIONS = 10000
LR = 1e-3
WEIGHT_DECAY = 0.8
```

##### Architecture
```
torch.manual_seed(1234)
class CNN(nn.Module):
    def __init__(self, output_dim):
        super(CNN, self).__init__()
        self.feature_extraction_layers = nn.Sequential(nn.Conv2d(CHANNEL_DIM, 16, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(16), # Use batch normalization to speed up training and reduce overfitting 
                                                       nn.ReLU(), # Use ReLU activation function for faster convergence and sparsity 
                                                       nn.MaxPool2d((2,2) ,stride=2),
                                                       
                                                       nn.Conv2d(16, 32, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(32), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2), 
                                                       nn.Conv2d(32, 64, kernel_size= (3,3), padding=1),
                                                       nn.BatchNorm2d(64), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2), 
                                                       nn.Conv2d(64, 128, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(128), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2),
                                                       nn.Conv2d(128, 256, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(256), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2)) 
        self.flatten = nn.Flatten() 
        self.dropout = nn.Dropout(0.8) # Increase the dropout rate to prevent overfitting 
        self.connected_layer = nn.Linear(4096, 512) # Adjust the input dimension of the linear layer according to the output of the convolutional layers 
        self.output_layer = nn.Linear(512, output_dim)
        
    def forward(self, x):
        feature_extraction_layer_output = self.feature_extraction_layers(x)
        flatten_output = self.flatten(feature_extraction_layer_output)
        connected_layer_output = self.connected_layer(self.dropout(flatten_output))
        output = self.output_layer(self.dropout(connected_layer_output))
        return output
```
#### Model Performance 
```Loss at iteration 0 - train loss: 1.236282467842102, test loss: 0.6929770708084106
Loss at iteration 1000 - train loss: 0.5520414710044861, test loss: 0.773626446723938
Loss at iteration 2000 - train loss: 0.18896639347076416, test loss: 0.44813066720962524
Loss at iteration 3000 - train loss: 0.034101538360118866, test loss: 0.4923945367336273
Loss at iteration 4000 - train loss: 0.016995254904031754, test loss: 0.3038991689682007
Loss at iteration 5000 - train loss: 0.004320989828556776, test loss: 0.44383835792541504
Loss at iteration 6000 - train loss: 0.021008003503084183, test loss: 1.594154715538025
Loss at iteration 7000 - train loss: 0.001111862133257091, test loss: 0.45177677273750305
Loss at iteration 8000 - train loss: 0.002191168488934636, test loss: 0.47085660696029663
Loss at iteration 9000 - train loss: 0.006088821683079004, test loss: 0.34020543098449707
```

##### Classification Report

```
     precision    recall  f1-score   support

         cat       0.89      0.96      0.93       988
         dog       0.96      0.89      0.92      1012

    accuracy                           0.93      2000
    macro avg       0.93      0.93      0.93      2000
    weighted avg       0.93      0.93      0.93      2000
```



#### 3. CNN Base model - v3

##### Saved file
/kaggle/working/cnn-base-3.hdf5

##### Dataset
```
IMAGE_DIM = 128
CHANNEL_DIM = 3
required_cat_count = 12000
required_dog_count = 12000
```
##### Hyperparameters
```
ITERATIONS = 15000
LR = 1e-4
WEIGHT_DECAY = 3
BATCH = 256
```

##### Architecture
```
torch.manual_seed(1234)
class CNN(nn.Module):
    def __init__(self, output_dim):
        super(CNN, self).__init__()
        self.feature_extraction_layers = nn.Sequential(nn.Conv2d(CHANNEL_DIM, 16, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(16), # Use batch normalization to speed up training and reduce overfitting 
                                                       nn.ReLU(), # Use ReLU activation function for faster convergence and sparsity 
                                                       nn.MaxPool2d((2,2) ,stride=2),
                                                       
                                                       nn.Conv2d(16, 32, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(32), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2), 
                                                       nn.Conv2d(32, 64, kernel_size= (3,3), padding=1),
                                                       nn.BatchNorm2d(64), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2), 
                                                       nn.Conv2d(64, 128, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(128), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2),
                                                       nn.Conv2d(128, 256, kernel_size= (3,3), padding=1), 
                                                       nn.BatchNorm2d(256), nn.ReLU(), 
                                                       nn.MaxPool2d((2,2), stride=2)) 
        self.flatten = nn.Flatten() 
        self.dropout = nn.Dropout(0.8) # Increase the dropout rate to prevent overfitting 
        self.connected_layer = nn.Linear(4096, 512) # Adjust the input dimension of the linear layer according to the output of the convolutional layers 
        self.output_layer = nn.Linear(512, output_dim)
        
    def forward(self, x):
        feature_extraction_layer_output = self.feature_extraction_layers(x)
        flatten_output = self.flatten(feature_extraction_layer_output)
        connected_layer_output = self.connected_layer(self.dropout(flatten_output))
        output = self.output_layer(self.dropout(connected_layer_output))
        return output
```
#### Model Performance 
```
Loss at iteration 0 - train loss: 1.080051302909851, test loss: 0.6915969848632812
Loss at iteration 1000 - train loss: 0.33023184537887573, test loss: 0.5084758400917053
Loss at iteration 2000 - train loss: 0.13062278926372528, test loss: 0.3834241032600403
Loss at iteration 3000 - train loss: 0.12334740161895752, test loss: 0.6894320249557495
Loss at iteration 4000 - train loss: 0.02116161584854126, test loss: 0.6645081043243408
Loss at iteration 5000 - train loss: 0.016679178923368454, test loss: 0.3315171003341675
Loss at iteration 6000 - train loss: 0.00930651556700468, test loss: 0.490634024143219
Loss at iteration 7000 - train loss: 0.03066173940896988, test loss: 0.5103933811187744
Loss at iteration 8000 - train loss: 0.007853752002120018, test loss: 0.7809205055236816
Loss at iteration 9000 - train loss: 0.04008251801133156, test loss: 4.434248924255371
Loss at iteration 10000 - train loss: 0.0021463362500071526, test loss: 0.23868006467819214
Loss at iteration 11000 - train loss: 0.016685249283909798, test loss: 0.8665417432785034
Loss at iteration 12000 - train loss: 0.010997233912348747, test loss: 0.31216633319854736
Loss at iteration 13000 - train loss: 0.0020246654748916626, test loss: 0.33097776770591736
Loss at iteration 14000 - train loss: 0.002831029240041971, test loss: 0.2651127278804779
```

##### Classification Report

```
precision    recall  f1-score   support

         cat       0.98      0.99      0.98      2317
         dog       0.99      0.98      0.98      2483

    accuracy                           0.98      4800
   macro avg       0.98      0.98      0.98      4800
weighted avg       0.98      0.98      0.98      4800
```



cnn.eval()
torch.save(cnn, '/kaggle/working/cnn-base-3.hdf5')

In [17]:
del X_train
del y_train
torch.cuda.empty_cache()

In [18]:
saved_model = torch.load('/kaggle/working/cnn-base-3.hdf5')

In [32]:
from sklearn.metrics import classification_report
saved_model.eval()
y_prednumpy = np.array([])
y_testnumpy = np.array([])
for i in range(0, X_test.shape[0], BATCH):
    X_testb, y_testb = X_test[i: i + BATCH], y_test[i: i + BATCH]
    logits_testb = saved_model(X_testb)
    probs_testb = torch.sigmoid(logits_testb)
    probs_test_numpy = probs_testb.cpu().detach().numpy().reshape(-1,)
    probs_test_numpy[probs_test_numpy >= 0.5] = 1
    probs_test_numpy[probs_test_numpy < 0.5] = 0
    y_prednumpy = np.concatenate([y_prednumpy, probs_test_numpy], 0)
    y_testnumpy = np.concatenate([y_testnumpy, y_testb.cpu().detach().numpy()], 0)

In [33]:
y_prednumpy.shape, y_testnumpy.shape

((4800,), (4800,))

In [34]:
print(classification_report(y_prednumpy, y_testnumpy, target_names = ['cat', 'dog']))

              precision    recall  f1-score   support

         cat       0.98      0.99      0.98      2317
         dog       0.99      0.98      0.98      2483

    accuracy                           0.98      4800
   macro avg       0.98      0.98      0.98      4800
weighted avg       0.98      0.98      0.98      4800



cnn.eval()
logits_test = cnn(X_test)
probs_test = torch.sigmoid(logits_test)
from torchmetrics import Accuracy
accuracy = Accuracy(task = 'binary').to(device)