In [None]:
import os
import shutil
from tqdm import tqdm
import random
import numpy as np
from tensorflow import keras
import tensorflow as tf
import pathlib
import matplotlib.pyplot as plt
import pandas as pd

### a) Download the dataset from Kaggle and arrange the images as required
If you are executing on Colab:

Let's download the dataset from Kaggle to Drive. This is the procedure you will need to follow:  
>1) Download kaggle.json from kaggle/username/account. In that JSON file you will find your username and key.  
>2) Replace the username and key in the code below and execute it 
```
>import os  
>os.environ['KAGGLE_USERNAME'] = "username" # username from the json file  
>os.environ['KAGGLE_KEY'] = "key" # key from the json file  
```
>3) Now copy the API command from Kaggle. You can find it in the Data tab of the competition; it will look like this.   
*(Attention: please don't put double quotes around the path)*  
```
>*!kaggle competitions download -c "competition name separated with (-)" -p "folder path where you want to download dataset"*  
```

In [None]:
# Uncomment this if you are executing this code on colab

# os.environ['KAGGLE_USERNAME'] = "username" # username from the json file
# os.environ['KAGGLE_KEY'] = "key" # key from the json file

# !kaggle competitions download -c dogs-vs-cats-redux-kernels-edition -p /content/drive/MyDrive/Projects/Image_search_engine/train_zip/

#### Unzip file

You will need to unzip the file you downloaded from Kaggle.  
There are multiple ways to unzip the file:  
- 1) Using the zipfile library, which gives us flexibility in how we deal with the zip file
- 2) Using the command line; sometimes I prefer to use the command line as shown below    
```
!unzip zip_file_path -d destination_folder

```

Grab a cup of coffee for a minute while it unzips

In [None]:
# Kaggle input archives holding the competition's training and test images
train_zip_path = r'/kaggle/input/dogs-vs-cats-redux-kernels-edition/train.zip'
test_zip_path = r'/kaggle/input/dogs-vs-cats-redux-kernels-edition/test.zip'
# Root of the arranged train/val class folders (note: differs from unzip_dir only by the capital "D")
base_dir = '/kaggle/working/Data/'
# Scratch directory the archives are extracted into
unzip_dir = '/kaggle/working/data/'
no_of_images = 6000 # Number of images sampled per class; the first 80% of each sample is copied to train/ and the remaining 20% to val/

In [None]:
# Make sure the extraction directory is present; report only when it had to be created
unzip_dir_missing = not os.path.exists(unzip_dir)
if unzip_dir_missing:
    os.makedirs(unzip_dir)
    print(f'Directory ceated at {unzip_dir}')

In [None]:
# Delete the extraction directory together with everything inside it, so the
# archives are unpacked into a clean location. Raises if the directory is missing.
# NOTE(review): this also removes the directory the preceding cell just created;
# the extraction step is presumably relied on to recreate it — confirm.
shutil.rmtree(unzip_dir)

In [None]:
import zipfile

# Unpack the training archive into the extraction directory.
# The `with` block closes the archive on exit, so no explicit close() is needed.
with zipfile.ZipFile(train_zip_path) as train_archive:
    train_archive.extractall(unzip_dir)

In [None]:
# Unpack the test archive into the same extraction directory.
# The `with` block closes the archive on exit, so no explicit close() is needed.
with zipfile.ZipFile(test_zip_path) as test_archive:
    test_archive.extractall(unzip_dir)

#### Arrange the unzipped images in the folder structure below

In [None]:
# Count the entries that were extracted into the train/ folder
train_entries = os.listdir(unzip_dir + 'train/')
len(train_entries)

In [None]:
# Start from an empty base directory: drop any arrangement left by a previous run
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)

# Build the usual image-classification layout <split>/<class>/, one folder per class
for split_name in ('train', 'val'):
    for class_dir in ('dog', 'cat'):
        os.makedirs(base_dir + split_name + '/' + class_dir)

In [None]:
# Collect the dog and cat image names so they can be copied into the class folders.
# Files are matched on their explicit 'dog' / 'cat' name prefix; any other entry
# (hidden files, stray folders) is ignored instead of being counted as a cat.
cat_filename = []
dog_filename = []

for entry in tqdm(os.listdir(unzip_dir + 'train/')):
    if entry.startswith('dog'):
        dog_filename.append(entry)
    elif entry.startswith('cat'):
        cat_filename.append(entry)

In [None]:
# Randomly pick `no_of_images` files per class and copy the first 80% of each
# sample into train/<class>/ and the remaining 20% into val/<class>/.
# random.sample raises ValueError if a class has fewer than `no_of_images` files.
train_cutoff = int(0.8 * no_of_images)

for class_dir, filenames in (('dog', dog_filename), ('cat', cat_filename)):
    chosen = random.sample(filenames, no_of_images)
    # tqdm wraps the list itself (not the enumerate object) so it knows the
    # total and can render a real progress bar.
    for id_, image in enumerate(tqdm(chosen, desc=class_dir)):
        split_dir = 'train/' if id_ < train_cutoff else 'val/'
        shutil.copy2(unzip_dir + 'train/' + image, base_dir + split_dir + class_dir + '/')

### b) Data Preparation 
To speed up data loading during training, we will prefetch images using TensorFlow's `tf.data` API. This procedure can be followed for every dataset stored in this folder format, and it can drastically decrease the training time. 

In [None]:
# Batch size of the input pipeline and the locations of the arranged
# training / validation images
batch_size = 32
TRAIN_DATA_DIR = f'{base_dir}train/'
VALIDATION_DATA_DIR = f'{base_dir}val/'

In [None]:
# Wrap the training folder in a pathlib.Path so it can be globbed
train_data_dir = pathlib.Path(TRAIN_DATA_DIR)

# Count every .jpg file that sits exactly one level down, i.e. inside a class sub-folder
train_jpg_paths = list(train_data_dir.glob('*/*.jpg'))
image_count_train = len(train_jpg_paths)
print(image_count_train)

In [None]:
# Same count for the validation folder: .jpg files inside its class sub-folders
valid_data_dir = pathlib.Path(VALIDATION_DATA_DIR)
valid_jpg_paths = list(valid_data_dir.glob('*/*.jpg'))
image_count_valid = len(valid_jpg_paths)
print(image_count_valid)

In [None]:
# Build datasets whose elements are the paths of all files matching "<dir>/*/*".
# The listing itself is not shuffled; the explicit shuffle that follows uses a
# buffer as large as the image count (a full shuffle) and, because
# reshuffle_each_iteration=False, yields the same order on every pass.
# A smaller buffer can be used when the full list does not fit in memory.
train_pattern = str(train_data_dir/'*/*')
train_list_ds = tf.data.Dataset.list_files(train_pattern, shuffle=False).shuffle(
    image_count_train, reshuffle_each_iteration=False)

# Same treatment for the validation files
valid_pattern = str(valid_data_dir/'*/*')
valid_list_ds = tf.data.Dataset.list_files(valid_pattern, shuffle=False).shuffle(
    image_count_valid, reshuffle_each_iteration=False)

In [None]:
# Peek at the first ten elements of "train_list_ds": each one is the path of an image
for path_tensor in train_list_ds.take(10):
    print(path_tensor)

In [None]:
# Class names are the names of the class sub-folders of the training directory.
# Directory listing order depends on the filesystem, so the names are sorted to
# make the class-index mapping deterministic. Reverse alphabetical order gives
# ['dog', 'cat'], i.e. index 0 = dog, which is what the evaluation cells of this
# notebook assume (pred_prob_0 is reported as "dog", TP = cm[0][0]).
# Plain files and hidden folders (e.g. notebook checkpoints) are skipped.
class_names = sorted(
    (entry.name for entry in train_data_dir.iterdir()
     if entry.is_dir() and not entry.name.startswith('.')),
    reverse=True,
)
class_names

In [None]:
def get_label(file_path):
    """Return the integer class index for the image stored at `file_path`.

    The class is read from the name of the folder that directly contains the
    file; the index is that name's position in the module-level `class_names`.
    """
    # The second-to-last path component is the class folder name
    class_dir = tf.strings.split(file_path, os.path.sep)[-2]

    # Boolean mask over class_names, e.g. [True, False] when the folder name
    # equals the first entry of class_names
    matches = class_dir == class_names

    # Index at which the mask is set
    return tf.argmax(matches)

In [None]:
def decode_image(file_path):
    """Read the JPEG at `file_path` and return it resized to 224x224.

    Args:
        file_path: path of a JPEG file (string or scalar string tensor).

    Returns:
        The decoded image resized to height 224 and width 224, with 3 channels.
    """
    # Raw, still-encoded file contents
    raw_bytes = tf.io.read_file(file_path)

    # channels=3 forces an RGB result even for grayscale JPEGs, so every image
    # has the three channels the model input (224, 224, 3) requires and images
    # can be batched together.
    image = tf.io.decode_jpeg(raw_bytes, channels=3)

    return tf.image.resize(image, size=[224, 224])

In [None]:
def process_img(file_path):
    """Turn an image path into an `(image, label)` pair.

    The label comes from `get_label` and the image array from `decode_image`.
    """
    label = get_label(file_path)
    return decode_image(file_path), label

In [None]:
# Let tf.data pick the level of parallelism / prefetching at runtime
AUTOTUNE = tf.data.AUTOTUNE

# Map every file path to an (image, label) pair with "process_img",
# for the training and the validation paths alike
train_ds = train_list_ds.map(process_img, num_parallel_calls=AUTOTUNE)
valid_ds = valid_list_ds.map(process_img, num_parallel_calls=AUTOTUNE)

In [None]:
def configure_for_performance(ds):
    """Return `ds` cached, shuffled, batched and prefetched, in that order.

    Uses a shuffle buffer of 1000 elements, the module-level `batch_size`
    for batching and `AUTOTUNE` as the prefetch buffer size.
    """
    return (
        ds.cache()
        .shuffle(buffer_size=1000)
        .batch(batch_size)
        .prefetch(buffer_size=AUTOTUNE)
    )

In [None]:
# Apply the cache / shuffle / batch / prefetch pipeline to both datasets
train_ds, valid_ds = [configure_for_performance(split_ds) for split_ds in (train_ds, valid_ds)]

In [None]:
# Show the first nine images of one training batch, titled with their class names
image_batch, label_batch = next(iter(train_ds))

plt.figure(figsize=(10, 10))
for cell in range(1, 10):
    ax = plt.subplot(3, 3, cell)
    plt.imshow(image_batch[cell - 1].numpy().astype("uint8"))
    plt.title(class_names[label_batch[cell - 1]])
    plt.axis("off")

### c) Build model without finetuning

We will use Mobilenet pretrained model

In [None]:
# MobileNet convolutional base for 224x224 RGB inputs, initialised with ImageNet
# weights; include_top=False leaves out the original classification head.
base_model = keras.applications.MobileNet(input_shape=(224,224,3),weights = 'imagenet', include_top=False)
# base_model.summary()

In [None]:
# Mark the base model as trainable, then freeze every layer except the last 25,
# so only those final layers are updated during training
base_model.trainable = True
frozen_layers = base_model.layers[:-25]
for frozen in frozen_layers:
    frozen.trainable = False

In [None]:
# Keras input with image shape (224, 224, 3); pixel values arrive in [0, 255]
inputs = keras.Input(shape = (224,224,3))

# Preprocessing of the image.
# MobileNet's ImageNet weights expect pixels scaled to [-1, 1] (this is what
# keras.applications.mobilenet.preprocess_input does), so map [0, 255] to
# [-1, 1] rather than to [0, 1].
x = keras.layers.experimental.preprocessing.Rescaling(1./127.5, offset=-1)(inputs)

# Random rotation of up to +/-20 degrees. The layer's factor is a fraction of
# a full turn (2*pi), so 20 degrees is 20/360; a factor of 20 would mean
# rotations of up to 20 full turns, i.e. an arbitrary angle.
x = keras.layers.experimental.preprocessing.RandomRotation(20/360)(x)
# Random height change of up to 20%
x = keras.layers.experimental.preprocessing.RandomHeight(0.2)(x)
# Random width change of up to 20%
x = keras.layers.experimental.preprocessing.RandomWidth(0.2)(x)

# Pass the augmented image through the pretrained base model
x = base_model(x)

# Global average pooling turns the feature maps into one vector per image
x = keras.layers.GlobalAveragePooling2D()(x)

# Dense layer with 64 neurons
x = keras.layers.Dense(64, activation ='relu')(x)

# Drop 20% of the activations during training
x = keras.layers.Dropout(0.2)(x)

# Two-way softmax: one probability per class
outputs= keras.layers.Dense(2, activation='softmax')(x)

# Assemble the final model
model = keras.Model(inputs, outputs)

In [None]:
# Compile the model.
# The labels are integer class indices, hence the sparse categorical cross-entropy loss.
adam = keras.optimizers.Adam(learning_rate=0.001)
sparse_ce = keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=adam, loss=sparse_ce, metrics=['accuracy'])

In [None]:
# Early-stopping callback that monitors the validation loss ('val_loss') with a
# patience of 3, i.e. training stops once val_loss has gone 3 epochs without improving
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience = 3)

In [None]:
# Fit the model for at most 20 epochs, with early stopping on the validation loss.
# No batch_size is passed: train_ds / valid_ds are tf.data datasets that are
# already batched, and Keras documents that batch_size must not be specified
# when the input is a dataset.
history = model.fit(
    train_ds,
    epochs=20,
    validation_data=valid_ds,
    callbacks=[early_stopping],
)

In [None]:
# Plot the per-epoch metrics recorded by fit() (pandas is already imported as
# `pd` in the notebook's import cell, so it is not imported again here)
history_df = pd.DataFrame(history.history)
ax = history_df.plot(title='Training history')
ax.set_xlabel('Epoch');

In [None]:
# Directory in which the trained model file is stored
Model_path= '/kaggle/working/Model/'

In [None]:
# Create the model directory on first use and report where it was created
model_dir_missing = not os.path.exists(Model_path)
if model_dir_missing:
    os.makedirs(Model_path)
    print(f'New dir created at {Model_path}')

In [None]:
# Save the trained model as an HDF5 file inside Model_path.
# Model_path already ends with '/', so the resulting path contains '//'.
model.save(Model_path + '/cat_dog.h5')

In [None]:
# Reload the model from the saved HDF5 file; `model` now refers to the loaded copy
model = keras.models.load_model(Model_path + '/cat_dog.h5')

In [None]:
def get_image(img_path = None):
    """Load an image and return it as a batch of one, resized to 224x224.

    Args:
        img_path: path of the image file. When None, a random image is picked
            from a randomly chosen class folder of the validation directory
            (`base_dir` + 'val/').

    Returns:
        Tensor of shape (1, 224, 224, 3) holding the resized image.
    """
    if img_path is None:
        # Pick a random class folder, then a random file inside it
        random_class = random.choice(class_names)
        class_dir = os.path.join(base_dir, 'val', random_class)
        img_path = os.path.join(class_dir, random.choice(os.listdir(class_dir)))

    # Loading happens for both cases. Previously it lived in the `else` branch,
    # so a call without a path reached `return img` with `img` never assigned.
    img = tf.io.read_file(img_path)
    img = tf.image.decode_image(img, channels=3)
    img = tf.image.resize(img, size = [224,224])
    # Add the leading batch dimension expected by model.predict
    img = tf.expand_dims(img, axis = 0)
    return img

### d) Evaluate model by visualizing

 Let's evaluate our model on the validation images 
> Some questions we need to answer
>> 1) Which images is the model most confident about?    
>> 2) Which images is the model least confident about?    
>> 3) Which images got a high-confidence prediction even though the prediction was wrong?  

In [None]:
# Run the model on every validation image and record, per image:
pred_prob_0= [] # predicted probability of class index 0
pred_prob_1= [] # predicted probability of class index 1
pred_class = [] # class index predicted by the model
actual_class = [] # true class index (position of the folder name in class_names)
path_of_image = [] # path of the predicted image

for cat_name in class_names:
    class_dir = base_dir + 'val/' + cat_name + '/'
    class_idx = class_names.index(cat_name)
    for im_g in tqdm(os.listdir(class_dir)):
        # Build the path once and reuse it for prediction and bookkeeping
        image_path = class_dir + im_g
        # verbose=0 keeps predict() from printing its own progress line for
        # every single image underneath the tqdm bar
        pred = model.predict(get_image(img_path=image_path), verbose=0)
        pred_prob_0.append(pred[0][0])
        pred_prob_1.append(pred[0][1])
        pred_class.append(np.argmax(pred))
        actual_class.append(class_idx)
        path_of_image.append(image_path)
  

In [None]:
# Gather the collected per-image results into one dataframe (one row per image)
df = pd.DataFrame({
    'pred_prob_0': pred_prob_0,
    'pred_prob_1': pred_prob_1,
    'pred_class': pred_class,
    'actual_class': actual_class,
    'path_of_image': path_of_image,
})
df.sample(10)

#### Analysis of dog class

I have intentionally repeated this code in each cell so that every cell shows the full picture of what we are doing

In [None]:
# Images predicted as dog, highest predicted probability first.
# The dog class index is looked up in class_names instead of being assumed to
# be 0, and the loop runs over the rows actually available, so fewer than nine
# matching images no longer raises an IndexError.
dog_idx = class_names.index('dog')
dog_prob_col = f'pred_prob_{dog_idx}'
top_dog = df[df['pred_class'] == dog_idx].sort_values(by=dog_prob_col, ascending=False).head(9)

plt.figure(figsize=(12,12))
for idx in range(len(top_dog)):
    row = top_dog.iloc[idx]
    plt.subplot(3,3, idx+1)
    img = get_image(row['path_of_image'])
    # get_image returns a batch of one with values in [0, 255]
    plt.imshow(tf.squeeze(img, axis=0)/255.0)
    plt.title(label = f'Pred_prob dog {round(row[dog_prob_col], 2)} \n Actual image {class_names[row["actual_class"]]}')
    plt.axis('off')

In [None]:
# Images predicted as dog, lowest predicted probability first.
# The dog class index is looked up in class_names instead of being assumed to
# be 0, and the loop runs over the rows actually available, so fewer than nine
# matching images no longer raises an IndexError.
dog_idx = class_names.index('dog')
dog_prob_col = f'pred_prob_{dog_idx}'
least_sure_dog = df[df['pred_class'] == dog_idx].sort_values(by=dog_prob_col, ascending=True).head(9)

plt.figure(figsize=(10,10))
for idx in range(len(least_sure_dog)):
    row = least_sure_dog.iloc[idx]
    plt.subplot(3,3, idx+1)
    img = get_image(row['path_of_image'])
    # get_image returns a batch of one with values in [0, 255]
    plt.imshow(tf.squeeze(img, axis=0)/255.0)
    plt.title(label = f'Pred_prob dog {round(row[dog_prob_col], 2)} \n Actual image {class_names[row["actual_class"]]}')
    plt.axis('off')

In [None]:
# Cat images the model confidently (and wrongly) predicted as dog, most confident first.
# Class indices are looked up in class_names instead of being assumed, and at
# most four rows are shown: if fewer than four such mistakes exist, only those
# are plotted instead of raising an IndexError.
dog_idx = class_names.index('dog')
cat_idx = class_names.index('cat')
dog_prob_col = f'pred_prob_{dog_idx}'
cats_as_dog = (
    df[(df['pred_class'] == dog_idx) & (df['actual_class'] == cat_idx)]
    .sort_values(by=dog_prob_col, ascending=False)
    .head(4)
)

plt.figure(figsize=(10,10))
for idx in range(len(cats_as_dog)):
    row = cats_as_dog.iloc[idx]
    plt.subplot(2,2, idx+1)
    img = get_image(row['path_of_image'])
    # get_image returns a batch of one with values in [0, 255]
    plt.imshow(tf.squeeze(img, axis=0)/255.0)
    plt.title(label = f'Pred_prob as dog {round(row[dog_prob_col], 2)} \n Actual image {class_names[row["actual_class"]]}')
    plt.axis('off')

#### Analysis of cat class

In [None]:
# Images predicted as cat, highest predicted probability first.
# The cat class index is looked up in class_names instead of being assumed to
# be 1, and the loop runs over the rows actually available, so fewer than nine
# matching images no longer raises an IndexError.
cat_idx = class_names.index('cat')
cat_prob_col = f'pred_prob_{cat_idx}'
top_cat = df[df['pred_class'] == cat_idx].sort_values(by=cat_prob_col, ascending=False).head(9)

plt.figure(figsize=(12,12))
for idx in range(len(top_cat)):
    row = top_cat.iloc[idx]
    plt.subplot(3,3, idx+1)
    img = get_image(row['path_of_image'])
    # get_image returns a batch of one with values in [0, 255]
    plt.imshow(tf.squeeze(img, axis=0)/255.0)
    plt.title(label = f'Pred_prob cat {round(row[cat_prob_col], 2)} \n Actual image {class_names[row["actual_class"]]}')
    plt.axis('off')

In [None]:
# Images predicted as cat, lowest predicted probability first.
# The cat class index is looked up in class_names instead of being assumed to
# be 1, and the loop runs over the rows actually available, so fewer than nine
# matching images no longer raises an IndexError.
cat_idx = class_names.index('cat')
cat_prob_col = f'pred_prob_{cat_idx}'
least_sure_cat = df[df['pred_class'] == cat_idx].sort_values(by=cat_prob_col, ascending=True).head(9)

plt.figure(figsize=(10,10))
for idx in range(len(least_sure_cat)):
    row = least_sure_cat.iloc[idx]
    plt.subplot(3,3, idx+1)
    img = get_image(row['path_of_image'])
    # get_image returns a batch of one with values in [0, 255]
    plt.imshow(tf.squeeze(img, axis=0)/255.0)
    plt.title(label = f'Pred_prob cat {round(row[cat_prob_col], 2)} \n Actual image {class_names[row["actual_class"]]}')
    plt.axis('off')

In [None]:
# Dog images the model confidently (and wrongly) predicted as cat, most confident first.
# Class indices are looked up in class_names instead of being assumed, and at
# most four rows are shown: if fewer than four such mistakes exist, only those
# are plotted instead of raising an IndexError.
dog_idx = class_names.index('dog')
cat_idx = class_names.index('cat')
cat_prob_col = f'pred_prob_{cat_idx}'
dogs_as_cat = (
    df[(df['pred_class'] == cat_idx) & (df['actual_class'] == dog_idx)]
    .sort_values(by=cat_prob_col, ascending=False)
    .head(4)
)

plt.figure(figsize=(10,10))
for idx in range(len(dogs_as_cat)):
    row = dogs_as_cat.iloc[idx]
    plt.subplot(2,2, idx+1)
    img = get_image(row['path_of_image'])
    # get_image returns a batch of one with values in [0, 255]
    plt.imshow(tf.squeeze(img, axis=0)/255.0)
    plt.title(label = f'Pred_prob as cat {round(row[cat_prob_col], 2)} \n Actual image {class_names[row["actual_class"]]}')
    plt.axis('off')

### e) Evaluation metrics 

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

#### 1) Classification accuracy




$Accuracy = \frac{\text{Number of correct prediction }}{\text{ No of all prediction }}$

In [None]:
# Fraction of validation images whose predicted class equals the actual class
model_accuracy = accuracy_score(df['actual_class'], df['pred_class'])
print(f"Accuracy of our model is {model_accuracy}")

In [None]:
# Pie chart of the share of correct vs. incorrect predictions
correct_share = accuracy_score(df['actual_class'], df['pred_class'])

labels = ['Correctly predicted', 'Incorrectly predicted']
sizes = [correct_share, 1-correct_share]
explode = (0, 0.1)  # pull only the second ("incorrect") slice out of the pie

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, explode=explode, autopct='%1.1f%%',
        startangle=90, shadow=True)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

plt.show()

# it's look really cool

#### 2) Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
# Confusion matrix of actual vs. predicted class indices:
# rows are the actual classes, columns the predicted classes
cm = confusion_matrix(df['actual_class'], df['pred_class'])
cm

This doesn't look good as a raw array, so we will plot it

In [None]:
# Draw the confusion matrix as an annotated heatmap
ax = sns.heatmap(cm, annot=True, fmt='g');

## Axis captions and title: rows are the actual classes, columns the predictions
ax.set_ylabel('Actual Class');
ax.set_xlabel('Predicted Class')
ax.set_title('Seaborn Confusion Matrix');
## Tick labels follow the class index order, i.e. the order of class_names
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

#### 3) Precision

This tells us how many of the predicted positives are actually positive (i.e. correctly classified).  
$ Precision = \frac{\text{TP}}{\text{TP + FP}} $



In [None]:
# Unpack the 2x2 confusion matrix, treating class index 0 as the positive class:
# first row  = actual class 0 -> (predicted 0, predicted 1) = (TP, FN)
# second row = actual class 1 -> (predicted 0, predicted 1) = (FP, TN)
(TP, FN), (FP, TN) = cm

In [None]:
# Precision = TP / (TP + FP): among all images predicted as the positive class,
# the fraction that really belong to it. The denominator must use the false
# positives; dividing by TP + FN computes recall instead.
# (The author's run reported a value of about 97%.)
precision = TP/(TP+FP)
precision

#### 4) Recall
This tells us how many of the actual positives in our dataset were predicted as positive.  

$ Recall = \frac{\text{TP}}{\text{TP + FN}} $

eg. as we can see from the confusion matrix, 1184 of dog class are predicted correctly and 16 predicted as cat though the image is of dog.

In [None]:
# Recall = TP / (TP + FN): the fraction of actual positive images that were
# classified as positive (about 97% in the author's run)
actual_positives = TP + FN
Recall = TP/actual_positives
Recall

#### 5) F1 Score

$\text{F1 score} = 2 * \frac{\text{precision} *  \text{recall}}{\text{precision} +  \text{recall}}$

In [None]:
# F1 combines precision and recall into a single number:
# 1 is the best possible value and 0 the worst
pr_product = precision * Recall
pr_sum = precision + Recall
F1_score = 2* (pr_product/pr_sum)
F1_score

#### 6) Specificity

The proportion of the negative class that is correctly predicted as negative. We can think of this as the recall for the negative class.

$ Specificity = \frac{\text{TN}}{\text{TN + FP}} $

In [None]:
# Specificity = TN / (TN + FP): share of actual negatives predicted as negative
actual_negatives = TN+FP
specificity = TN/actual_negatives
specificity

In [None]:
# If you come along to the end of this notebook, Please do comment for any improvement, Thank you :) 