# **2301865741 - Edgard Jonathan Putra Pranoto**

# Code for No. 3

**Notes: Explanation is given in the PDF**

In [None]:
# Pinned OpenCV (contrib) build that provides the cv2 module; later cells call cv2.SIFT_create().
!pip install opencv-contrib-python==4.4.0.44

# Import Library

In [None]:
import os
from time import time
import errno
import shutil
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm
from skimage.feature import hog
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.cluster.vq import vq
from scipy.spatial.distance import cosine

# Preparing dataset

**Extracting Data**

In [None]:
import zipfile

# Unpack the competition's training archive into the current working directory.
archive_path = "../input/dogs-vs-cats/train.zip"
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=".")

**Dataset directory**

Since the dataset is large, I only used 5,000 images.

In [None]:
path = "./train"
# os.listdir returns entries in arbitrary order, so sort the listing first;
# otherwise the seeded split below can select a different subset on another
# machine or filesystem even though random_state is fixed.
file = sorted(os.listdir(path))
# file = file[:5000]

# If the listing is sorted, keep the slice above commented out and use the
# random split below to draw the 5000-image subset instead.
_, file = train_test_split(file, test_size=5000, random_state=42)

**Splitting train and test with 80% for training and 20% for test**

In [None]:
# Hold out 20% of the sampled filenames for testing; the seed keeps the split stable.
train, test = train_test_split(file, random_state=42, test_size=0.2)
for subset in (train, test):
    print(len(subset))

# **Read Training Data and Extract the Feature using SIFT**

In [None]:
# Per-image results for the training split:
#   x_train           - RGB images that produced at least one SIFT keypoint
#   y_train           - label ('cat' / 'dog') for each kept image
#   train_descriptor  - (label, descriptor matrix) pair for each kept image
#   train_descriptors - every descriptor row from all kept images, pooled together
x_train = []
y_train = []
train_descriptor = []
train_descriptors = []
IMG_SIZE = 128  # not referenced by the cells shown here; kept in case other cells rely on it

SIFT = cv2.SIFT_create()
for i, filename in enumerate(tqdm(train)):
    image = cv2.imread(os.path.join(path, filename), cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imread signals an unreadable/corrupt file by returning None
        # rather than raising; skip it instead of crashing in cvtColor.
        continue
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Preview the first five images of the split.
    if i < 5:
        plt.subplot(1, 5, i + 1)
        plt.imshow(image)  # RGB image: a colormap would be ignored
        plt.axis('off')

    kp, ds = SIFT.detectAndCompute(cv2.cvtColor(image, cv2.COLOR_RGB2GRAY), None)
    if ds is None or len(kp) == 0:
        # No local features were found, so the image cannot be represented.
        continue

    # The label is the filename prefix; anything that is not 'cat' is treated as 'dog'.
    label = 'cat' if filename.split('.')[0] == 'cat' else 'dog'
    x_train.append(image)
    y_train.append(label)
    train_descriptor.append((label, ds))
    train_descriptors.extend(ds)  # adds one entry per descriptor row
plt.show()

**Label frequency for both classes in the training data**

Based on the label frequencies, the training set is balanced.

In [None]:
from collections import Counter

# Tally how many kept training images carry each label.
train_label_counts = Counter(y_train)
print(train_label_counts)

# **Read Testing Data and Extract the Feature using SIFT**

In [None]:
# Per-image results for the test split (same layout as the training lists):
#   x_test           - RGB images that produced at least one SIFT keypoint
#   y_test           - label ('cat' / 'dog') for each kept image
#   test_descriptor  - (label, descriptor matrix) pair for each kept image
#   test_descriptors - every descriptor row from all kept images, pooled together
x_test = []
y_test = []
test_descriptor = []
test_descriptors = []

# Reuses the SIFT extractor created in the training-extraction cell.
for i, filename in enumerate(tqdm(test)):
    image = cv2.imread(os.path.join(path, filename), cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imread signals an unreadable/corrupt file by returning None
        # rather than raising; skip it instead of crashing in cvtColor.
        continue
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Preview the first five images of the split.
    if i < 5:
        plt.subplot(1, 5, i + 1)
        plt.imshow(image)  # RGB image: a colormap would be ignored
        plt.axis('off')

    kp, ds = SIFT.detectAndCompute(cv2.cvtColor(image, cv2.COLOR_RGB2GRAY), None)
    if ds is None or len(kp) == 0:
        # No local features were found, so the image cannot be represented.
        continue

    # The label is the filename prefix; anything that is not 'cat' is treated as 'dog'.
    label = 'cat' if filename.split('.')[0] == 'cat' else 'dog'
    x_test.append(image)
    y_test.append(label)
    test_descriptor.append((label, ds))
    test_descriptors.extend(ds)  # adds one entry per descriptor row
plt.show()


**Label frequency for both classes in the test data**

Based on the label frequencies, the test set is balanced.

In [None]:
# Tally how many kept test images carry each label.
test_label_counts = Counter(y_test)
print(test_label_counts)

**More than 2 million local features were extracted from the dataset**

Because working with this many features directly would make the computation slow, the extracted features are clustered with k-means to reduce the dimensionality.

In [None]:
# Convert the pooled descriptor lists to NumPy arrays and report their shapes.
train_descriptors, test_descriptors = (
    np.array(pool) for pool in (train_descriptors, test_descriptors)
)
for pooled in (train_descriptors, test_descriptors):
    print(pooled.shape)

In [None]:
# Class names and the two lookup tables derived from their position in the list.
LABELS = ["cat", "dog"]
LABEL2INDEX = {name: position for position, name in enumerate(LABELS)}
INDEX2LABEL = dict(enumerate(LABELS))

# Load K Means Model From Question 3

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load model files that come from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open("../input/model-comvis-no-3/kmeans (100).sav", 'rb') as model_file:
    model = pickle.load(model_file)

# Histogram width used by the following cells.
# Presumably equal to the number of clusters in the loaded model — TODO confirm.
k = 100

# Creating Frequency diagram for each Visual Word

**Creating Histogram for train local feature**

In [None]:
# One k-bin histogram per training image: bin w counts how many of the image's
# descriptors the model assigned to cluster w.
train_features = np.zeros((len(x_train), k), "float32")
y_train = []
for i in tqdm(range(len(x_train))):
    label, descriptors = train_descriptor[i]
    prediction = model.predict(descriptors)
    y_train.append(label)
    # np.bincount tallies every cluster index in one call, replacing the
    # per-element Python increment loop; minlength pads the row to k bins.
    train_features[i] = np.bincount(prediction, minlength=k)

**Creating Histogram for test local feature**

In [None]:
# One k-bin histogram per test image: bin w counts how many of the image's
# descriptors the model assigned to cluster w.
test_features = np.zeros((len(x_test), k), "float32")
y_test = []
for i in tqdm(range(len(x_test))):
    label, descriptors = test_descriptor[i]
    prediction = model.predict(descriptors)
    y_test.append(label)
    # np.bincount tallies every cluster index in one call, replacing the
    # per-element Python increment loop; minlength pads the row to k bins.
    test_features[i] = np.bincount(prediction, minlength=k)

# Standardize the Histogram (zero mean, unit variance per visual word)

In [None]:
# Fit the scaler on the training histograms only, then apply that same
# transform to both splits. Fitting a second scaler on the test histograms
# would put queries and database vectors in differently shifted/scaled spaces,
# which distorts the train-vs-test similarity scores computed later.
scaler = StandardScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

# Image Retrieval using Cosine Similarity

**The documents will be ranked and the top-N documents will be shown**

In [None]:
import random

# Dedicated, seeded generator: the same three queries are drawn on every run
# and the global random state is left untouched.
rng = random.Random(42)

for i in range(0, 3):
    # Valid query indices are 0 .. len(x_test) - 1. The previous
    # random.randint(1, 1000) is inclusive on both ends, so it could index past
    # the end of the test set and could never select the first image.
    query = rng.randrange(len(x_test))
    result_size = 10

    # Cosine similarity between the query histogram and every training histogram.
    score = [
        [target, 1.0 - cosine(test_features[query], train_features[target])]
        for target in range(len(x_train))
    ]
    score.sort(key=lambda x: x[1], reverse=True)  # most similar first

    plt.title("Query Image")
    plt.imshow(x_test[query])  # RGB image: a colormap would be ignored
    plt.xticks([])
    plt.yticks([])
    plt.show()

    # squeeze=False keeps axs two-dimensional even when the grid has one row.
    fig, axs = plt.subplots((result_size+4)//5, 5, figsize=(9,9), squeeze=False)
    for ax in axs.flat:
        ax.axis('off')  # also hides grid cells that receive no image
    # Never index past the number of available training images.
    for j in range(0, min(result_size, len(score))):
        idx = score[j][0]
        sim = score[j][1]
        axs[j//5][j%5].imshow(x_train[idx])
        axs[j//5][j%5].set_title('Score= %.5lf'%(sim))
    plt.subplots_adjust(hspace=-0.5)
    plt.show()

# Evaluation using Precision and Recall

**The number of query documents that will be tested is 4**

In [None]:
q = 4
expected_retrievals = []
query_retrievals = []

for i in range(q):
    print("QUERY %d" % (i+1))
    # Queries are taken from the tail of the test set: last image, second to last, ...
    query = len(x_test) - 1 - i
    query_label = LABEL2INDEX[y_test[query]]
    result_size = 25

    # Cosine similarity of the query against every training histogram, best match first.
    score = sorted(
        ([target, 1.0 - cosine(test_features[query], train_features[target])]
         for target in range(len(x_train))),
        key=lambda pair: pair[1],
        reverse=True,
    )

    plt.title("Query Image")
    plt.imshow(x_test[query], cmap='gray')
    plt.xticks([])
    plt.yticks([])
    plt.show()

    # For each of the top results: the label we hoped for (the query's own)
    # next to the label of the training image that was actually retrieved.
    expected_retrieval = [query_label] * result_size
    query_retrieval = [
        LABEL2INDEX[y_train[score[rank][0]]] for rank in range(result_size)
    ]

    expected_retrievals.append(expected_retrieval)
    query_retrievals.append(query_retrieval)
    print(expected_retrieval)
    print(query_retrieval)
    print("===============================================")

# Calculating Precision and Recall + Plotting 

In [None]:
plt.figure(figsize=(16, 4))
for i in range(q):
    expected_result = expected_retrievals[i]
    query_result = query_retrievals[i]
    expected_name = INDEX2LABEL[expected_result[0]]

    # Recall denominator: number of training images that carry the query's label.
    relevant_document = Counter(y_train)[expected_name]

    # Walk down the ranking, updating precision and recall after every result.
    recall, precision = [], []
    true_positive = 0
    for rank, (retrieved, wanted) in enumerate(zip(query_result, expected_result), start=1):
        true_positive += retrieved == wanted
        recall.append(true_positive / relevant_document)
        precision.append(true_positive / rank)

    # Interpolated precision at rank r is the best precision at any rank >= r:
    # a running maximum taken from the end of the list backwards.
    interpolated_precision = np.maximum.accumulate(precision[::-1])[::-1]

    plt.subplot(1, q, i+1)
    plt.title("Query %d (Expected result: %s)"%(i+1, expected_name))
    plt.step(recall, interpolated_precision, '-r', label="interpolated")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.xlim(0.0)
    plt.ylim((0.0, 1.1))
plt.show()