In [1]:
# simple implementation of CAM in PyTorch for the networks such as ResNet, DenseNet, SqueezeNet, Inception
# last update by BZ, June 30, 2021

import io
from PIL import Image
from torchvision import models, transforms
from torch.autograd import Variable
from torch.nn import functional as F
import torch
import numpy as np
import cv2
import json
import torch
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# input image


# networks such as googlenet, resnet, densenet already use global average pooling at the end, so CAM could be used directly.
# Each entry maps a model_id to (torchvision constructor, name of the sub-module
# whose output is hooked later to obtain the feature maps for CAM).
model_id = 2
model_choices = {
    1: (models.squeezenet1_1, 'features'),
    2: (models.resnet18, 'layer4'),
    3: (models.densenet161, 'features'),
}
if model_id not in model_choices:
    # Fail here with a clear message instead of a NameError on `net` further down.
    raise ValueError(
        f'Unsupported model_id {model_id!r}; expected one of {sorted(model_choices)}'
    )
build_model, finalconv_name = model_choices[model_id]
net = build_model(pretrained=True)

# net.eval() is not called here: the next cell trains the network and switches
# it to eval mode afterwards.

# Define transformations for data preprocessing




In [2]:
# Preprocessing applied to every image of both the train and the test set.
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to a uniform size
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize the images
])

# Load train and test datasets (one sub-folder per class, as ImageFolder expects)
train_dataset = ImageFolder('Dataset/train', transform=transform)
test_dataset = ImageFolder('Dataset/test', transform=transform)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Replace the classification head with a 2-output linear layer.
# NOTE(review): this assumes the selected model exposes an `fc` linear head;
# verify before switching model_id away from the ResNet.
num_features = net.fc.in_features
net.fc = torch.nn.Linear(num_features, 2)

# Prefer Apple's Metal backend (the original target), but fall back to CUDA or
# the CPU so the notebook also runs on machines without MPS support.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
net.to(device)

# All parameters are optimised (the backbone is not frozen).
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

num_epochs = 5
for epoch in range(num_epochs):
    net.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = net(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure below is a per-sample average even if the last batch is smaller.
        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_dataset)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')

# Switch to inference mode for the evaluation / CAM cells that follow.
net.eval()




Epoch [1/5], Loss: 0.1357
Epoch [2/5], Loss: 0.0156
Epoch [3/5], Loss: 0.0345
Epoch [4/5], Loss: 0.0360
Epoch [5/5], Loss: 0.0497


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [3]:
# Compute a class activation map (CAM) for a single image and save the overlay.

# Load and preprocess the image.
# Other sample images used while experimenting: 'sat.png', 'river_hand.jpeg',
# 'image_2.jpg', 'tejas.jpg', 'shahan.jpg', 'osama.jpg', 'Human1250 copy.png'.
image_file = '2 faces.png'
img_bgr = cv2.imread(image_file)
if img_bgr is None:
    # cv2.imread reports failure by returning None; stop here with a clear
    # error instead of crashing later inside cv2.cvtColor.
    raise FileNotFoundError(
        f"Unable to load the image '{image_file}'. Please check the file path."
    )
print("Image loaded successfully!")

features_blobs = []
def hook_feature(module, input, output):
    """Forward hook: store the hooked layer's output as a CPU NumPy array."""
    features_blobs.append(output.data.cpu().numpy())

print(net.eval())

img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_img = preprocess(img).unsqueeze(0).to(device)

# Forward pass. The hook is attached only for this pass and removed afterwards;
# otherwise it keeps firing on every later forward pass (e.g. the test-set
# evaluation) and keeps appending activations to features_blobs.
hook_handle = net._modules.get(finalconv_name).register_forward_hook(hook_feature)
try:
    with torch.no_grad():
        # Despite its name this holds the network output (logits); the feature
        # maps themselves are captured by the hook in features_blobs.
        feature_maps = net(input_img)
finally:
    hook_handle.remove()

# Second-to-last parameter tensor of the network.
# NOTE(review): expected to be the weight matrix of the final linear layer
# (with its bias last) — confirm if the architecture changes.
params = list(net.parameters())
weight = np.squeeze(params[-2].data.cpu().numpy())
print("weight shape", weight.shape)

# Take the spatial size from the captured activations instead of assuming 7x7.
batch_size, num_channels, fmap_h, fmap_w = features_blobs[0].shape
# Row 0 of the weight matrix -> the map is always computed for class index 0,
# independent of what the network predicted.
cam = weight[0].dot(features_blobs[0].reshape(num_channels, fmap_h * fmap_w))

print("cam", cam)
cam = cam.reshape(fmap_h, fmap_w)
# Min-max normalise to [0, 1]; a flat map stays all zeros instead of dividing by zero.
cam = cam - np.min(cam)
cam_peak = np.max(cam)
if cam_peak > 0:
    cam = cam / cam_peak
cam = np.uint8(255 * cam)
cam = cv2.resize(cam, (256, 256))
print("final", cam)
cam = cv2.resize(cam, (img.shape[1], img.shape[0]))
print("shape", cam.shape)

# Apply heatmap on the original image
heatmap = cv2.applyColorMap(cam, cv2.COLORMAP_JET)
# Blend with the BGR image as loaded by OpenCV (reused rather than re-read from disk).
img = img_bgr
result = heatmap * 0.3 + img * 0.5
cv2.imwrite('CAM3.jpg', result)

Image loaded successfully!
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (r

True

In [4]:
# Top-1 accuracy of the network over every batch of the test loader.
correct, total = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = net(images)
        # Index of the largest logit along the class dimension is the prediction.
        predicted = outputs.argmax(dim=1)
        total += labels.shape[0]
        correct += int((predicted == labels).sum())

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')



LABELS_file = 'imagenet-simple-labels.json'
# image_file = 'sat.png'
image_file = '2 faces.png'


# Storage filled by the forward hook: one array is appended per forward pass
# through the hooked layer.
features_blobs = []
def hook_feature(module, input, output):
    """Forward hook: keep a CPU NumPy copy of the hooked layer's output."""
    blob = output.data.cpu().numpy()
    features_blobs.append(blob)

# Put the network in eval mode and show its structure.
net.eval()
print(net)
target_layer = net._modules.get(finalconv_name)
target_layer.register_forward_hook(hook_feature)

# get the softmax weight: the second-to-last parameter tensor of the network,
# squeezed and converted to a NumPy array
params = list(net.parameters())
softmax_param = params[-2]
weight_softmax = np.squeeze(softmax_param.data.cpu().numpy())

def returnCAM(feature_conv, weight_softmax, class_idx):
    """Build one class activation map (CAM) per requested class.

    Args:
        feature_conv: 4-D array unpacked as (batch, channels, height, width).
            It is reshaped to (channels, height * width), which only succeeds
            when the batch dimension is 1.
        weight_softmax: array indexed by class id; the selected row is dotted
            with the flattened feature maps, so it needs one entry per channel.
        class_idx: iterable of class indices to generate maps for.

    Returns:
        A list with one uint8 map per entry of ``class_idx``, each min-max
        scaled to 0..255 and resized with ``cv2.resize`` to 256x256. A map
        whose activations are all equal is returned as all zeros.
    """
    print("conv feature", feature_conv.shape)
    print(class_idx)
    # generate the class activation maps upsample to 256x256
    size_upsample = (256, 256)
    bz, nc, h, w = feature_conv.shape
    output_cam = []
    print(weight_softmax[0].shape)
    print(nc, h*w)
    # The flattened features do not depend on the class, so reshape once.
    flat_features = feature_conv.reshape((nc, h*w))
    for idx in class_idx:
        cam = weight_softmax[idx].dot(flat_features)
        print("cam", cam)
        cam = cam.reshape(h, w)
        cam = cam - np.min(cam)
        peak = np.max(cam)
        if peak > 0:
            cam_img = cam / peak
        else:
            # Flat map: dividing by zero would produce NaNs and an undefined
            # uint8 cast, so emit an all-zero map instead.
            cam_img = np.zeros_like(cam)
        cam_img = np.uint8(255 * cam_img)
        print(cam_img.shape)
        resized = cv2.resize(cam_img, size_upsample)
        print("final", resized)
        output_cam.append(resized)
    return output_cam


# Same resize and normalisation values as the transform used for training.
normalize = transforms.Normalize(
   mean=[0.485, 0.456, 0.406],
   std=[0.229, 0.224, 0.225]
)
preprocess = transforms.Compose([
   transforms.Resize((224,224)),
   transforms.ToTensor(),
   normalize
])

# load test image
# Always convert to RGB: besides RGBA, grayscale or palette images would
# otherwise not yield the 3 channels that `normalize` expects.
img_pil = Image.open(image_file).convert("RGB")
img_tensor = preprocess(img_pil)
img_variable = img_tensor.unsqueeze(0).to(device)
# Inference only: no autograd graph is needed for the prediction or the CAM.
with torch.no_grad():
    logit = net(img_variable)

# Names for the two outputs of the replaced classification head.
# NOTE(review): the order is assumed to match the class indices assigned by the
# training ImageFolder — confirm against train_dataset.classes.
classes = ['face', 'no_face']

# Class probabilities sorted from most to least likely.
h_x = F.softmax(logit, dim=1).data.squeeze()
probs, idx = h_x.sort(0, True)
probs = probs.cpu().numpy()
idx = idx.cpu().numpy()

# output the prediction
for i in range(len(classes)):
    print('{:.3f} -> {}'.format(probs[i], classes[idx[i]]))

print("feature blobs", len(features_blobs))
# generate class activation mapping for the top1 prediction
CAMs = returnCAM(features_blobs[0], weight_softmax, [idx[0]])

# render the CAM and output
print('output CAM.jpg for the top1 prediction: %s'%classes[idx[0]])
# Overlay on the same file that was classified (read as BGR by OpenCV).
img = cv2.imread(image_file)
if img is None:
    # cv2.imread reports failure by returning None rather than raising.
    raise FileNotFoundError(f"Unable to load the image '{image_file}' with OpenCV.")
height, width, _ = img.shape
heatmap = cv2.applyColorMap(cv2.resize(CAMs[0],(width, height)), cv2.COLORMAP_JET)
result = heatmap * 0.3 + img * 0.5
cv2.imwrite('CAM.jpg', result)

import matplotlib.pyplot as plt
# plt.imshow(result)

# print(heatmap.shape)


Test Accuracy: 96.60%
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu):