# EP #4 - Object Recognition: BoF vs ConvNets

Renato Sérgio Lopes Júnior \
2020667570

In [None]:
import os
import cv2
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

import sklearn
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('tableau-colorblind10')

## Load CIFAR-10 dataset

In [None]:
# Preprocessing applied to every CIFAR-10 sample: convert the image to a
# tensor, then normalise each of the three channels as (x - 0.5) / 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Inverse of the normalisation above. Normalize computes (x - mean) / std, so
# mean = -1 and std = 2 yield (x + 1) / 2, which undoes (x - 0.5) / 0.5.
inv_normalize = transforms.Normalize(mean=[-1.0] * 3, std=[2.0] * 3)

def get_cv2_image(img):
    """Convert a normalised CHW image tensor into an 8-bit BGR OpenCV image.

    Args:
        img: Image tensor produced by ``transform`` (channels first, RGB
            channel order, normalised with mean 0.5 / std 0.5).

    Returns:
        ``np.uint8`` array in height x width x channel layout with the
        channels in BGR order, as expected by OpenCV routines.
    """
    # Undo the normalisation and move channels last: CHW -> HWC, scaled to 0..255.
    rgb = inv_normalize(img).numpy().transpose(1, 2, 0) * 255
    # Round to the nearest integer before the cast: a plain astype(uint8)
    # truncates, so float32 round-off (e.g. 0.99999 instead of 1.0) would
    # darken a pixel by one level. Clipping guards against wrap-around.
    rgb = np.clip(np.rint(rgb), 0, 255).astype(np.uint8)
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

# CIFAR-10 training and test splits, stored under ./data (downloaded when
# requested via download=True). Every sample goes through `transform`.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)

# Human-readable class names, indexed by the integer label of a sample.
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
NUM_CLASSES = len(classes)

In [None]:
# Preview five randomly chosen training images, titled with their class name.
fig = plt.figure(figsize=(16, 2))
for i in range(1, 6):
    sample_index = np.random.randint(len(trainset))
    X, y = trainset[sample_index]
    img = get_cv2_image(X)
    ax = fig.add_subplot(1, 5, i)
    ax.set_title(classes[y])
    # get_cv2_image returns BGR; convert back to RGB for matplotlib.
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.show()

## A classifier based on bag of features

### Extracting SIFT features

In [None]:
# SIFT extractor shared by get_sift_features (created with nfeatures=100).
sift = cv2.SIFT_create(nfeatures=100)


def get_sift_features(dataset):
    """Compute SIFT descriptors for every sample, grouped by class label.

    Args:
        dataset: Indexable collection of ``(image_tensor, label)`` pairs that
            also supports ``len()``.

    Returns:
        Dict mapping each label in ``range(NUM_CLASSES)`` to a list holding
        one descriptor array per image. Images for which SIFT returns no
        descriptors are skipped, so a list may be shorter than the number of
        images of that class.
    """
    total = len(dataset)
    features_by_class = {label: [] for label in range(NUM_CLASSES)}

    for index in range(total):
        # Print progress each time an exact multiple of 10% has been processed.
        percent_done = (index + 1) / total * 100
        if percent_done % 10 == 0:
            print(f"{percent_done}%")

        tensor, label = dataset[index]
        _, descriptors = sift.detectAndCompute(get_cv2_image(tensor), None)
        # detectAndCompute yields None when no keypoint was found.
        if descriptors is not None:
            features_by_class[label].append(descriptors)

    return features_by_class

def get_descriptions_matrix(sift_features):
    """Stack all SIFT descriptors into a single matrix for clustering.

    Args:
        sift_features: Dict mapping a class label to a list of per-image
            descriptor arrays (one row per descriptor), as returned by
            ``get_sift_features``.

    Returns:
        2-D array with one descriptor per row, ordered by ascending class
        label and then by image order within the class. When there are no
        descriptors at all, an empty 1-D array is returned.
    """
    # Collect the per-image arrays and concatenate once; extending a Python
    # list row by row and converting it afterwards is far slower and more
    # memory hungry for millions of descriptors.
    per_image = [
        np.asarray(descriptors)
        for label in sorted(sift_features)
        for descriptors in sift_features[label]
    ]
    if not per_image:
        return np.array([])
    return np.concatenate(per_image, axis=0)

In [None]:
# Extract SIFT descriptors for every training image (grouped by class label)
# and stack them into one matrix, which is later used to fit the clustering
# that defines the visual dictionary.
train_sift_features = get_sift_features(trainset)
train_descriptions = get_descriptions_matrix(train_sift_features)

# Extract SIFT descriptors for the test images. Only the per-class grouping is
# kept: the clustering is fitted on the training descriptors alone.
test_sift_features = get_sift_features(testset)

### Create visual dictionary

In [None]:
def create_clustering(descriptions, dict_size, random_state=None):
    """Fit the k-means model whose centroids act as the visual words.

    Args:
        descriptions: 2-D array with one SIFT descriptor per row.
        dict_size: Number of clusters, i.e. the size of the visual dictionary.
        random_state: Optional seed forwarded to ``MiniBatchKMeans`` so the
            dictionary can be reproduced between runs. The default ``None``
            keeps the previous, unseeded behaviour.

    Returns:
        The fitted ``MiniBatchKMeans`` instance.
    """
    clustering = MiniBatchKMeans(n_clusters=dict_size, random_state=random_state)
    clustering.fit(descriptions)
    return clustering

def get_visual_dict(clustering, descriptions, dict_size):
    """Build the L2-normalised bag-of-features histogram of one image.

    Args:
        clustering: Fitted clustering model exposing ``predict``; it maps
            each descriptor row to the index of its visual word.
        descriptions: 2-D array with one descriptor per row (all descriptors
            of a single image).
        dict_size: Number of visual words, i.e. the histogram length.

    Returns:
        1-D float array of length ``dict_size`` containing the visual-word
        counts scaled to unit L2 norm. An image without descriptors yields an
        all-zero histogram.

    Raises:
        IndexError: If the model predicts a word index outside
            ``range(dict_size)``.
    """
    hist = np.zeros(dict_size)
    if len(descriptions) == 0:
        return hist

    # Assign all descriptors in one call; predicting row by row pays the full
    # per-call overhead of the model for every single descriptor.
    words = clustering.predict(np.asarray(descriptions))
    # Unbuffered add so repeated word indices are all counted.
    np.add.at(hist, words, 1)

    # Normalise with NumPy rather than relying on cv2.normalize updating a
    # 1-D array in place.
    norm = np.linalg.norm(hist)
    if norm > 0:
        hist /= norm
    return hist

def get_bof_data(clustering, sift_features, dict_size):
    """Turn per-image SIFT descriptors into bag-of-features samples.

    Args:
        clustering: Fitted clustering model passed on to ``get_visual_dict``.
        sift_features: Dict mapping each label in ``range(NUM_CLASSES)`` to a
            list of per-image descriptor arrays.
        dict_size: Length of each histogram (number of visual words).

    Returns:
        Tuple ``(X, y)`` of parallel lists: ``X`` holds one histogram per
        image and ``y`` the matching class label, ordered by class.
    """
    histograms, labels = [], []
    for label in range(NUM_CLASSES):
        # Progress report, emitted when an exact multiple of 10% is reached.
        percent_done = (label + 1) / NUM_CLASSES * 100
        if percent_done % 10 == 0:
            print(f"{percent_done}%")

        for image_descriptors in sift_features[label]:
            histograms.append(
                get_visual_dict(clustering, image_descriptors, dict_size)
            )
            labels.append(label)
    return histograms, labels

### Train Random Forest Classifier with Dictionary Size 8

In [None]:
# Fit the clustering on all training descriptors; its 8 clusters are the
# visual words of the dictionary.
print("Creating cluster...")
clustering = create_clustering(train_descriptions, dict_size=8)
print("Created cluster.")

# Build one 8-bin bag-of-features histogram (plus label) per training image
print("Getting visual dictionaries...")
bof_X, bof_y = get_bof_data(clustering, train_sift_features, dict_size=8)
print("Got visual dictionaries.")

# Build the histograms for the test images with the same clustering, so both
# splits share the dictionary fitted on the training data
print("Getting visual dictionaries...")
bof_X_test, bof_y_test = get_bof_data(clustering, test_sift_features, dict_size=8)
print("Got visual dictionaries.")

In [None]:
# Create a random forest with scikit-learn's default hyper-parameters.
# No random_state is passed, so the fitted model may differ between runs.
forest_classifier = RandomForestClassifier()
# Fit on the training bag-of-features histograms and their class labels
forest_classifier.fit(bof_X, bof_y)

In [None]:
# Compute metrics for the bag-of-features classifier on the test histograms.
# sklearn.metrics.plot_confusion_matrix was removed in scikit-learn 1.2, so
# the confusion matrix is built explicitly and drawn with
# ConfusionMatrixDisplay. The explicit import also avoids relying on
# `import sklearn` having loaded the metrics submodule as a side effect.
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix

# Predict once and reuse the result for both the plot and the accuracy.
predictions = forest_classifier.predict(bof_X_test)
cm = confusion_matrix(bof_y_test, predictions, labels=forest_classifier.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=forest_classifier.classes_).plot()
print("Accuracy", accuracy_score(bof_y_test, predictions))

## A classifier using ConvNet

### Create Dataloaders

In [None]:
BATCHSIZE = 128

# Settings shared by both loaders: mini-batch size and two worker processes.
loader_options = dict(batch_size=BATCHSIZE, num_workers=2)

# Training batches are reshuffled; test batches keep the dataset order.
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, **loader_options)
testloader = torch.utils.data.DataLoader(testset, shuffle=False, **loader_options)

### Define Network Architecture

In [None]:
class ConvNet(nn.Module):
    """Small LeNet-style convolutional classifier for 3 x 32 x 32 images.

    The feature extractor applies two 5x5 convolutions, each followed by 2x2
    max pooling and ReLU. For a 32 x 32 input this produces 16 maps of 5 x 5,
    i.e. the 400 values expected by the first linear layer, so other input
    sizes are not supported. The classifier ends in ``LogSoftmax``, hence the
    network outputs log-probabilities (to be paired with ``nn.NLLLoss``).

    Args:
        num_classes: Number of output classes. Defaults to the module-level
            ``NUM_CLASSES`` when omitted, which matches the previous
            behaviour.
    """

    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            # Fall back to the notebook-wide constant for backward compatibility.
            num_classes = NUM_CLASSES
        self.feature = nn.Sequential(
            nn.Conv2d(3, 6, kernel_size=5, stride=1),
            nn.MaxPool2d(2),
            nn.ReLU(True),
            nn.Conv2d(6, 16, kernel_size=5, stride=1),
            nn.MaxPool2d(2),
            nn.ReLU(True),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(400, 120),  # 400 = 16 channels * 5 * 5
            nn.ReLU(True),
            nn.Linear(120, 84),
            nn.ReLU(True),
            nn.Linear(84, num_classes),
            nn.LogSoftmax(dim=1)
        )

    def forward(self, data):
        """Return per-class log-probabilities for a batch of images.

        Args:
            data: Tensor of shape ``(batch, 3, 32, 32)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with log-probabilities.
        """
        feature = self.feature(data)
        return self.classifier(feature)

# Run on the first CUDA GPU when one is available, otherwise on the CPU
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
# Instantiate the network and move its parameters to the selected device
net = ConvNet().to(device)

### Train Network

In [None]:
NUM_EPOCHS = 50

# ConvNet ends in LogSoftmax, so the negative log-likelihood loss is applied
# directly to its log-probability output.
loss_fn = nn.NLLLoss()
# SGD with momentum and L2 weight decay over all network parameters
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.005)

# Each epoch runs one full pass over the training set followed by an
# evaluation pass over the test set, then prints the averaged metrics.
for epoch in range(NUM_EPOCHS):
    # Training
    net.train()
    loss_epoch_train = 0.0  # sum of per-batch mean losses
    corrects_epoch_train = 0  # number of correctly classified training samples
    for X, y in trainloader:
        X, y = X.to(device), y.to(device)
        # Clear gradients accumulated by the previous step
        optimizer.zero_grad()
        
        out = net(X)
        
        loss = loss_fn(out, y)
        loss.backward()
        optimizer.step()
        
        # Predicted class = index of the largest log-probability per sample
        prob, pred = torch.max(out.data, 1)
        corrects_epoch_train += (pred == y.long()).sum().item()
        loss_epoch_train += loss.item()
    
    # Evaluation
    net.eval()
    loss_epoch_eval = 0.0  # sum of per-batch mean losses
    corrects_epoch_eval = 0  # number of correctly classified test samples
    for X, y in testloader:
        X, y = X.to(device), y.to(device)
        
        # No gradients are needed while evaluating
        with torch.no_grad():
            out = net(X)
            loss = loss_fn(out, y)
        
        prob, pred = torch.max(out.data, 1)
        corrects_epoch_eval += (pred == y.long()).sum().item()
        loss_epoch_eval += loss.item()
    
    # Losses are averaged over the number of batches, accuracies over the
    # number of samples in the respective dataset.
    loss_epoch_train /= len(trainloader)
    loss_epoch_eval /= len(testloader)
    acc_epoch_train = corrects_epoch_train/len(trainloader.dataset)
    acc_epoch_eval = corrects_epoch_eval/len(testloader.dataset)
    
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}: train_loss={loss_epoch_train:.4f}, train_acc={acc_epoch_train:.4f}, test_loss={loss_epoch_eval:.4f}, test_acc={acc_epoch_eval:.4f}")
        
        