In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Shell escape: print the notebook's current working directory.
!pwd

#### Extract the zip files

In [None]:
import zipfile

# Unpack both competition archives into the current working directory,
# test images first and training images second.
for archive_path in (
    '../input/dogs-vs-cats-redux-kernels-edition/test.zip',
    '../input/dogs-vs-cats-redux-kernels-edition/train.zip',
):
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall('.')

In [None]:
# Shell escape: list the working directory to check what the extraction produced.
!ls

#### Extract labels, Downsize images and prepare the data

In [None]:
import cv2
from random import shuffle

IMG_SIZE = 50        # images are resized to IMG_SIZE x IMG_SIZE pixels
NUM_TRAIN = 25000    # maximum number of training files to load
TRAIN_DIR = '/kaggle/working/train/'
LABEL_IDS = {'cat': 0, 'dog': 1}   # filename prefix -> integer class id

# Each entry of X_Train_orig is a [scaled_image, label] pair.
X_Train_orig = []
Y_Train_orig = []   # kept for compatibility; nothing in this cell fills it

files = list(os.listdir(TRAIN_DIR))

# Random subset of at most NUM_TRAIN files.
shuffle(files)
files = files[:NUM_TRAIN]

for file_name in files:
    # The class name is the third component from the end of the dotted file
    # name (presumably names look like "cat.0.jpg" -- verify against the data).
    name_parts = file_name.split('.')
    label = LABEL_IDS.get(name_parts[-3]) if len(name_parts) >= 3 else None
    if label is None:
        # Previously an unknown prefix left a string label in the data set.
        print('Skipping file with unrecognised label:', file_name)
        continue

    img = cv2.imread(TRAIN_DIR + file_name, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals an unreadable file by returning None, which
        # would otherwise crash inside cv2.resize.
        print('Skipping unreadable image:', file_name)
        continue

    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_CUBIC)
    # float32 halves the memory of the default float64 result of `/ 255`.
    X_Train_orig.append([img.astype(np.float32) / 255, np.array(label)])

# np.save on the raw list fails on recent NumPy because the [image, label]
# pairs are ragged; store them in an explicit (n, 2) object array instead.
train_records = np.empty((len(X_Train_orig), 2), dtype=object)
for row, (pixels, target) in enumerate(X_Train_orig):
    train_records[row, 0] = pixels
    train_records[row, 1] = target
np.save('Training_Data.npy', train_records)
del train_records

In [None]:
# Shell escape: list the working directory again (Training_Data.npy was just written).
!ls

In [None]:
# Split the [image, label] pairs into a 4-D image array and a label vector.
X = np.array([pair[0] for pair in X_Train_orig]).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
Y = np.array([pair[1] for pair in X_Train_orig])

# Drop the intermediate list and actually run the collector
# (a bare `gc.collect` only references the function without calling it).
del X_Train_orig
gc.collect()

In [None]:
import seaborn as sns
# Pass the labels explicitly as `x`: the first positional parameter of
# countplot is `data` in recent seaborn releases, not the plotted variable.
sns.countplot(x=Y).set_title("Data Distribution")

#### Select a subset

In [None]:
# Shape of the image array; the reshape that built X fixes the last three
# dimensions to (IMG_SIZE, IMG_SIZE, 3).
X.shape

In [None]:
# Shape of the label array (one entry per loaded image).
Y.shape

#### Train test split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the data for validation, keeping the cat/dog ratio equal
# in both parts via stratification.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2, stratify=Y)

# The full arrays are no longer needed; free them and actually invoke the
# collector (a bare `gc.collect` is a no-op expression).
del X
del Y
gc.collect()

In [None]:
# Report the shape of every array produced by the train/validation split.
for split_name, split_array in (
    ('X_train', X_train),
    ('Y_train', Y_train),
    ('X_val', X_val),
    ('Y_val', Y_val),
):
    print('Shape of ' + split_name + ' is :', split_array.shape)

In [None]:
import matplotlib.pyplot as plt 
# Preview the first validation images (at most 50) in a 5 x 10 grid,
# each titled with its true label.
n_preview = min(50, len(X_val))   # avoid indexing past a small validation set
plt.figure(figsize=(20,20))
for i in range(n_preview):
    plt.subplot(5, 10, i+1)
    # cv2.imread loads channels in BGR order while matplotlib expects RGB,
    # so reverse the channel axis for display only.
    plt.imshow(X_val[i, :, :, ::-1])
    plt.title('DOG' if Y_val[i] == 1 else 'CAT')
    plt.axis('off')

#### Model Evaluation

#### Model 1

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import models

# Per-image input shape: everything after the batch axis of X_train.
input_shape = X_train.shape[1:4]

# Two Conv -> ReLU -> BatchNorm -> MaxPool stages, then a 1024-unit dense
# layer with dropout and a single sigmoid output unit.
model = models.Sequential([
    # Convolutional stage 1
    layers.Conv2D(32, (3, 3), input_shape=input_shape),
    layers.Activation('relu'),
    layers.BatchNormalization(axis=3),
    layers.MaxPooling2D(pool_size=(2, 2)),

    # Convolutional stage 2
    layers.Conv2D(64, (3, 3)),
    layers.Activation('relu'),
    layers.BatchNormalization(axis=3),
    layers.MaxPooling2D(pool_size=(2, 2)),

    # Fully connected head
    layers.Flatten(),
    layers.Dense(1024),
    layers.Activation('relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),

    # Binary output
    layers.Dense(1),
    layers.Activation('sigmoid'),
])

In [None]:
from tensorflow.keras.optimizers import Adam

# Training hyper-parameters: epoch budget, mini-batch size, initial learning rate.
epochs, batch_size, lrate = 200, 16, 0.0001

# Decay coefficient handed to the optimizer, derived from the epoch budget.
# NOTE(review): `decay` is a legacy optimizer argument -- confirm that the
# installed TensorFlow/Keras version still accepts it.
decay = lrate / epochs

optimizer = Adam(
    learning_rate=lrate,
    epsilon=1e-08,
    decay=decay,
)

In [None]:
from tensorflow.keras.optimizers import RMSprop
# Binary classification setup: cross-entropy loss on the sigmoid output,
# optimised with the Adam instance configured earlier, tracking accuracy.
compile_options = dict(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)
model.compile(**compile_options)

# Print the layer-by-layer architecture and parameter counts.
model.summary()

#### Model Training

#### Callbacks

In [None]:
# Import from tensorflow.keras so the callback comes from the same Keras
# implementation as the model built earlier in this notebook.
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Halve the learning rate (down to min_lr) after 2 epochs without improvement
# in validation accuracy. The model is compiled with metrics=['accuracy'], so
# the logged key is 'val_accuracy'; monitoring 'val_acc' never matches a
# logged metric and the callback would silently do nothing.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy', patience=2,
                                            verbose=1, factor=0.5, min_lr=0.0000001)

In [None]:
# Import from tensorflow.keras for consistency with the model, layers and
# optimizer used elsewhere in this notebook.
from tensorflow.keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

# Stop training once validation loss has not improved for 10 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='auto')

In [None]:
# Train on the in-memory arrays. steps_per_epoch / validation_steps are left
# at their defaults: for array inputs Keras derives them from batch_size, and
# the previous `len(...) // batch_size` values dropped the trailing partial
# batch (so part of the validation set was never scored) and became 0 when a
# split was smaller than one batch.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_val, Y_val),
    shuffle=True,
    callbacks=[learning_rate_reduction, early_stopping],
)

In [None]:
# Actually run the garbage collector before evaluating
# (a bare `gc.collect` only references the function).
gc.collect()

# model.evaluate returns [loss, accuracy] for the compiled loss and metric.
preds = model.evaluate(X_train, Y_train)
print ("Loss = " + str(preds[0]))
print ("Training set Accuracy = " + str(preds[1]))

In [None]:
# Loss and accuracy of the trained model on the held-out validation split.
preds_val = model.evaluate(X_val, Y_val)
val_loss, val_accuracy = preds_val[0], preds_val[1]
print(f"Loss = {val_loss}")
print(f"Validation Set Accuracy = {val_accuracy}")

In [None]:
# Model outputs for the validation images. The network ends in a single
# sigmoid unit and dogs were encoded as 1, so each value is the score for "dog".
y_prob = model.predict(X_val)

#### Confusion Matrix

In [None]:
# Goodness of fit helpers
from sklearn import metrics

# Threshold the predicted scores at 0.5 to obtain hard 0/1 class labels.
y_pred = (y_prob.ravel() > 0.5).astype(int).tolist()

# Confusion matrix of true validation labels against thresholded predictions.
metrics.confusion_matrix(Y_val, y_pred)

#### ROC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC operating points (false/true positive rate per score threshold)
# computed from the validation labels and predicted scores.
fpr, tpr, thresholds = roc_curve(Y_val, y_prob)

# Area under the ROC curve for the same labels and scores.
auc_value = roc_auc_score(Y_val, y_prob)
print('AUC is {}'.format(auc_value))

In [None]:
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve on the current axes and show the figure.

    fpr -- false positive rates (x values)
    tpr -- true positive rates (y values)

    Both axes are limited to the [0, 1] range.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr)
    axes.axis([0, 1, 0, 1])
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    plt.show()


plot_roc_curve(fpr, tpr)

#### Precision-Recall Curves

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision/recall pairs per score threshold on the validation split.
precision, recall, thresholds = precision_recall_curve(Y_val, y_prob)

import matplotlib.pyplot as plt


def plot_pr_curve(recall, precision):
    """Draw a precision-recall curve on the current axes and show the figure.

    recall    -- recall values (x values)
    precision -- precision values (y values)

    Both axes are limited to the [0, 1] range.
    """
    axes = plt.gca()
    axes.plot(recall, precision)
    axes.axis([0, 1, 0, 1])
    axes.set_xlabel('Recall')
    axes.set_ylabel('Precision')
    plt.show()


plot_pr_curve(recall, precision)

In [None]:
# Training and validation arrays are not used past this point; release them
# and actually call the collector (a bare `gc.collect` does nothing).
del X_train
del Y_train
del X_val
del Y_val
gc.collect()

#### Prepare for Submission

In [None]:
TEST_DIR = '/kaggle/working/test/'

# Each entry of X_Test_orig is a [scaled_image, id] pair, where the id is the
# second-to-last dot-separated component of the file name.
X_Test_orig = []
for file_name in os.listdir(TEST_DIR):
    label = file_name.split('.')[-2]
    img = cv2.imread(TEST_DIR + file_name, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread returns None for unreadable files. Fail with a clear
        # message instead of a cryptic cv2.resize error; skipping the file
        # would silently drop a row from the submission.
        raise IOError('Could not read test image: ' + TEST_DIR + file_name)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_CUBIC)
    # float32 halves the memory of the default float64 result of `/ 255`.
    X_Test_orig.append([img.astype(np.float32) / 255, np.array(label)])

# np.save on the raw list fails on recent NumPy because the [image, id]
# pairs are ragged; store them in an explicit (n, 2) object array instead.
test_records = np.empty((len(X_Test_orig), 2), dtype=object)
for row, (pixels, image_id) in enumerate(X_Test_orig):
    test_records[row, 0] = pixels
    test_records[row, 1] = image_id
np.save('Test_Data.npy', test_records)
del test_records

# Separate the pairs into the model input array and the matching id vector.
X_test = np.array([pair[0] for pair in X_Test_orig]).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
Label = np.array([pair[1] for pair in X_Test_orig])

In [None]:
# Model outputs for the test images, predicted in batches of `batch_size`.
# The network ends in a single sigmoid unit and dogs were encoded as 1 during
# training, so each value is the score for "dog".
probs = model.predict(X_test, batch_size = batch_size)

In [None]:
# Build the submission table: one row per test image with its id and the
# predicted score. model.predict returns one column per output unit, so the
# scores are flattened to 1-D explicitly rather than relying on pandas to
# accept a 2-D single-column array as a column.
prediction = pd.DataFrame({
    'id': Label,
    'label': np.asarray(probs).ravel(),
})

prediction.to_csv('submission-1.csv', index = False)

#### Results

In [None]:
import matplotlib.pyplot as plt 
# Preview the first test images (at most 50) in a 5 x 10 grid, each titled
# with the class predicted by thresholding the model score at 0.5.
n_preview = min(50, len(X_test))   # avoid indexing past a small test set
plt.figure(figsize=(20,20))
for i in range(n_preview):
    plt.subplot(5, 10, i+1)
    # cv2.imread loads channels in BGR order while matplotlib expects RGB,
    # so reverse the channel axis for display only.
    plt.imshow(X_test[i, :, :, ::-1])
    plt.title('DOG' if probs[i] > 0.5 else 'CAT')
    plt.axis('off')

In [None]:
#del X_test
#del prediction
#gc.collect