# **2301865741 - Edgard Jonathan Putra Pranoto**

# Code for No. 3

**Notes: Explanation is given in the PDF**

In [None]:
# Pin the OpenCV contrib build; the feature-extraction cells call cv2.SIFT_create().
!pip install opencv-contrib-python==4.4.0.44

# Import Library

In [None]:
import os
from time import time
import errno
import shutil
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm
from skimage.feature import hog
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.cluster.vq import vq
from scipy.spatial.distance import cosine

# Preparing dataset

**Extracting Data**

In [None]:
import zipfile

# Unpack the training archive into the current working directory.
with zipfile.ZipFile("../input/dogs-vs-cats/train.zip", mode='r') as archive:
    archive.extractall(path=".")

**Dataset directory**

Since the dataset is large, I only use 5000 images.

In [None]:
import random

path = "./train"
SAMPLE_SIZE = 5000  # the full dataset is large, so only a subset is used
SAMPLE_SEED = 42    # fixed seed so every run selects the same subset

# os.listdir() returns names in arbitrary order, so slicing its raw result
# can yield a different subset on every machine or run. Sorting alone is not
# enough either: a sorted listing groups files by name prefix, so a plain
# slice could contain a single class. Sort for a stable baseline, then draw a
# seeded random sample so the subset is reproducible and mixed.
all_files = sorted(os.listdir(path))
file = random.Random(SAMPLE_SEED).sample(all_files, min(SAMPLE_SIZE, len(all_files)))

**Splitting train and test with 80% for training and 20% for test**

In [None]:
# Hold out 20% of the selected files for testing; the fixed seed keeps the
# split repeatable.
train, test = train_test_split(file, random_state=42, test_size=0.2)
n_train, n_test = len(train), len(test)
print("Train Data Count: %d" % n_train)
print("Test Data Count: %d" % n_test)

**Read Training Data and Extract the Feature using SIFT**

In [None]:
x_train = []            # grayscale images that produced at least one keypoint
y_train = []            # 'cat' / 'dog' label for each kept image
train_descriptor = []   # (label, descriptor array) for each kept image
train_descriptors = []  # every descriptor of every kept image, flattened
IMG_SIZE = 128

SIFT = cv2.SIFT_create()
for i, filename in enumerate(tqdm(train)):
    image = cv2.imread(os.path.join(path, filename), 0)  # flag 0: read as grayscale
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # read or decoded; skip it rather than crash in imshow/detectAndCompute.
        tqdm.write("Skipping unreadable image: %s" % filename)
        continue

    # Preview the first five images of the split.
    if i < 5:
        plt.subplot(1, 5, i + 1)
        plt.imshow(image, cmap='gray')
        plt.axis('off')

    kp, ds = SIFT.detectAndCompute(image, None)
    # Images without keypoints have no descriptors and are dropped.
    if len(kp) > 0:
        # The label is the part of the file name before the first '.';
        # anything that is not 'cat' is treated as 'dog'.
        label = 'cat' if filename.split('.')[0] == 'cat' else 'dog'
        x_train.append(image)
        y_train.append(label)
        train_descriptor.append((label, ds))
        train_descriptors.extend(ds)
plt.show()

**Label frequency in the training data**

The label counts show that the training data is balanced.

In [None]:
from collections import Counter

# Count how many kept training images carry each label.
train_label_counts = Counter(y_train)
print(train_label_counts)

**Read Testing Data and Extract the Feature using SIFT**

In [None]:
x_test = []            # grayscale images that produced at least one keypoint
y_test = []            # 'cat' / 'dog' label for each kept image
test_descriptor = []   # (label, descriptor array) for each kept image
test_descriptors = []  # every descriptor of every kept image, flattened

for i, filename in enumerate(tqdm(test)):
    image = cv2.imread(os.path.join(path, filename), 0)  # flag 0: read as grayscale
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # read or decoded; skip it rather than crash in imshow/detectAndCompute.
        tqdm.write("Skipping unreadable image: %s" % filename)
        continue

    # Preview the first five images of the split.
    if i < 5:
        plt.subplot(1, 5, i + 1)
        plt.imshow(image, cmap='gray')
        plt.axis('off')

    kp, ds = SIFT.detectAndCompute(image, None)
    # Images without keypoints have no descriptors and are dropped.
    if len(kp) > 0:
        # The label is the part of the file name before the first '.';
        # anything that is not 'cat' is treated as 'dog'.
        label = 'cat' if filename.split('.')[0] == 'cat' else 'dog'
        x_test.append(image)
        y_test.append(label)
        test_descriptor.append((label, ds))
        test_descriptors.extend(ds)
# Render the preview explicitly, as the training cell does.
plt.show()


**Label frequency in the test data**

The label counts show that the test data is balanced.

In [None]:
# Count how many kept test images carry each label.
test_label_counts = Counter(y_test)
print(test_label_counts)

**More than 2 million local features are extracted from the dataset**

Since working with this many features directly would make the computation slow, the extracted features are clustered using k-means to reduce the dimensionality.

In [None]:
# Convert the accumulated descriptor lists to arrays; shape[0] is then the
# number of local features collected for each split.
train_descriptors = np.array(train_descriptors)
test_descriptors = np.array(test_descriptors)
# Name the split in each message so the two counts can be told apart.
print("Count of Train Local Feature: %d" % train_descriptors.shape[0])
print("Count of Test Local Feature: %d" % test_descriptors.shape[0])

# Elbow method for finding the best number of cluster

Since clustering takes a long time, I skipped most values and only checked 5 candidate numbers of clusters.

In [None]:
# Elbow-method sweep over five candidate cluster counts. It is kept commented
# out because each KMeans fit on the full descriptor set takes a long time.
# Uncomment to regenerate the inertia-vs-K plot.
# all_k = [1,25,50, 75, 100]
# error = []
# for k in all_k:
#     model = KMeans(k)
#     model.fit(train_descriptors)
#     error.append(model.inertia_)

# plt.plot(all_k, error)
# plt.xlabel('K')
# plt.ylabel('Inertia')
# plt.title('The Elbow Method')
# plt.show()

# Clustering using Kmeans (Visual Words)

The number of clusters (50) is taken from the elbow method above.

In [None]:
import math

k = 50  # number of clusters, i.e. the size of the visual-word vocabulary

# A single initialisation (n_init=1) capped at 20 iterations; presumably
# chosen to keep the fit affordable on the full descriptor set.
# random_state pins the centroid initialisation so the vocabulary, the
# histograms built from it, and the saved model are the same on every run.
model = KMeans(n_clusters = k, n_init = 1, max_iter = 20, verbose = 1, random_state = 42)
model.fit(train_descriptors)

# Creating Frequency diagram for each Visual Word

**Creating Histogram for train local feature**

In [None]:
# One k-bin histogram per training image: bin w holds the number of that
# image's descriptors assigned to cluster (visual word) w.
train_features = np.zeros((len(train_descriptor), k), "float32")
y_train = []
for i, (label, descriptors) in enumerate(tqdm(train_descriptor)):
    words = model.predict(descriptors)
    y_train.append(label)
    # np.bincount tallies all word indices in one vectorised call;
    # minlength=k keeps the row length fixed even when the highest-numbered
    # words do not occur in this image.
    train_features[i] = np.bincount(words, minlength=k)

**Creating Histogram for test local feature**

In [None]:
# One k-bin histogram per test image: bin w holds the number of that
# image's descriptors assigned to cluster (visual word) w.
test_features = np.zeros((len(test_descriptor), k), "float32")
y_test = []
for i, (label, descriptors) in enumerate(tqdm(test_descriptor)):
    words = model.predict(descriptors)
    y_test.append(label)
    # np.bincount tallies all word indices in one vectorised call;
    # minlength=k keeps the row length fixed even when the highest-numbered
    # words do not occur in this image.
    test_features[i] = np.bincount(words, minlength=k)

In [None]:
# Class names, plus lookup tables in both directions derived from their
# position in LABELS.
LABELS = ["cat", "dog"]
LABEL2INDEX = {label: index for index, label in enumerate(LABELS)}
INDEX2LABEL = {index: label for label, index in LABEL2INDEX.items()}

In [None]:
# Optional conversions between string labels and integer indices, left disabled.
# NOTE(review): the classifier cells appear to be fit directly on the
# 'cat'/'dog' strings — confirm before enabling either direction.
# y_train = [LABEL2INDEX[label] for label in y_train]
# y_test = [LABEL2INDEX[label] for label in y_test]

# y_train = [INDEX2LABEL[label] for label in y_train]
# y_test = [INDEX2LABEL[label] for label in y_test]

# Finding the best number of neighbors for KNN

In [None]:
import math

from sklearn.model_selection import cross_val_score

# Candidate neighbor counts: nine values starting at 3 in steps of 14
# (3, 17, 31, ..., 115).
candidate_neighbors = range(3, 3 + 14 * 9, 14)

best_acc = 0
best_neighbors = 0
neighbors = []   # candidates tried, in order (x-axis of the accuracy plot)
acc = []         # mean cross-validated accuracy for each candidate

for n_neighbors in candidate_neighbors:
    KNN = KNeighborsClassifier(n_neighbors = n_neighbors)

    # Score each candidate with 5-fold cross-validation on the training
    # histograms only. Choosing the neighbor count by test-set accuracy would
    # leak the test labels into model selection and make the final test
    # accuracy optimistically biased.
    accuracy = cross_val_score(KNN, train_features, y_train, cv = 5, scoring = "accuracy").mean()
    acc.append(accuracy)
    neighbors.append(n_neighbors)

    # Strict '<' keeps the smallest candidate when accuracies tie.
    if best_acc < accuracy:
        best_acc = accuracy
        best_neighbors = n_neighbors

**Plot for the classifier**

In [None]:
# Line chart of the accuracy recorded for each candidate neighbor count.
ax = plt.gca()
ax.plot(neighbors, acc)
ax.set_xlabel('n_neighbors')
ax.set_ylabel('accuracy')
ax.set_title('accuracy based on neighbors')
plt.show()

# Creating the Classifier based on the best number of neighbors

In [None]:
# Refit on all training histograms using the best neighbor count found above.
final_k = best_neighbors
KNN = KNeighborsClassifier(n_neighbors=final_k)
KNN.fit(X=train_features, y=y_train)

# Evaluating the Classifier using Classification Report and Confusion Matrix

In [None]:
# Predict the held-out histograms and report accuracy, per-class metrics and
# the confusion matrix.
y_pred = KNN.predict(test_features)

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy = %.2lf\n" % (accuracy*100))

report = classification_report(y_test, y_pred)
print(report)

# Pass the label order explicitly so the matrix rows/columns always match the
# DataFrame index/columns, even if one class is missing from y_test or y_pred.
class_names = ["cat", "dog"]
cf_matrix = confusion_matrix(y_test, y_pred, labels = class_names)
# Rows are true labels, columns are predicted labels (scikit-learn convention).
cf_matrix = pd.DataFrame(cf_matrix, index = class_names, columns = class_names)
plt.figure(figsize = (10,7))
# fmt='d' prints the integer counts as-is; the default '.2g' format would show
# counts of 100 or more in scientific notation (e.g. 3.2e+02).
sns.heatmap(cf_matrix, annot=True, fmt='d')

# Saving the Model

In [None]:
import pickle

# Persist the fitted KMeans vocabulary. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way.
filename = 'kmeans.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)