In [None]:
# Importing important libraries and modules
import gc
import glob
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from keras.applications.xception import Xception
from keras.layers import Dropout, Conv2D, Dense, BatchNormalization, AveragePooling2D, MaxPooling2D, Flatten, GlobalAveragePooling2D
from keras.models import Sequential, load_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [None]:
# The data is a 'train' folder of dog images plus labels.csv, which gives the breed of each image.
# Collect, for every training image, its folder, its file name and its breed label in one dataframe.
detail = sorted(glob.glob("../input/dog-breed-identification/train/*"))
# os.path handles the splitting, so nothing depends on the directory depth or on the folder name
foldername = [os.path.dirname(path) for path in detail]
imagename = [os.path.basename(path) for path in detail]

# Look each label up by image id (file name without extension) rather than relying on
# labels.csv being in the same order as the sorted directory listing.
# NOTE(review): assumes labels.csv has an "id" column holding the file stem — confirm.
labels_table = pd.read_csv('../input/dog-breed-identification/labels.csv')
breed_by_id = dict(zip(labels_table["id"], labels_table["breed"]))
# A KeyError here means an image in the folder has no row in labels.csv
label = np.array([breed_by_id[os.path.splitext(name)[0]] for name in imagename], dtype=object)

# Defining dataframe and saving all the extracted information in that dataframe
data_detail = pd.DataFrame({"foldername": foldername, "imagename": imagename, "label": label})


# Analysing the train data detail
print("\nNumber of images in training set = "+str(len(detail)))
print(data_detail.columns)
data_detail.head()

In [None]:
# Bar chart of the number of images available for every breed in the full data set
fig = plt.figure(figsize=(40, 10))
ax = fig.add_axes([0, 0, 1, 1])
ax.set_title("labels in Data set", fontsize=50)
sns.countplot(x="label", data=data_detail)
# Annotate every bar with its count, slightly above the top of the bar
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(
        x=bar.get_x() + 0.2,
        y=bar_height + 1.5,
        s=str(bar_height),
        fontsize=10,
        color="black",
    )
plt.xlabel("")
plt.ylabel("Count", fontsize=35)
plt.tick_params(labelsize=5)
plt.xticks(rotation=90)
plt.show()               # Clearly it shows that there is a case of class imbalance here

In [None]:
# Hold out part of the labelled images as a test set and another part as a cross-validation set.
# Both splits are stratified on the breed label so every split keeps the same class proportions.
SEED = 42               # fixed seed so that Restart & Run All reproduces the same splits
TEST_FRACTION = 0.08    # share of ALL labelled images used as the held-out test set
CV_FRACTION = 0.08      # share of ALL labelled images used as the cross-validation set

# Splitting training set into initial training set and test set
train_data_detail, test_data_detail = train_test_split(
    data_detail,
    stratify=data_detail["label"],
    test_size=TEST_FRACTION,
    random_state=SEED,
)

# Splitting training data into final training set and cross validation set.
# This split only sees the remaining (1 - TEST_FRACTION) of the data, so the fraction is
# rescaled to keep the CV set at CV_FRACTION of the original data (0.08 / 0.92 ~= 0.086956).
train_data_detail, cv_data_detail = train_test_split(
    train_data_detail,
    stratify=train_data_detail["label"],
    test_size=CV_FRACTION / (1 - TEST_FRACTION),
    random_state=SEED,
)
train_data_detail.shape, test_data_detail.shape, cv_data_detail.shape

In [None]:
# Give each split a fresh 0..n-1 index; the index inherited from data_detail is dropped
train_data_detail = train_data_detail.reset_index(drop=True)
cv_data_detail = cv_data_detail.reset_index(drop=True)
test_data_detail = test_data_detail.reset_index(drop=True)

In [None]:
# Plotting the distribution of each class in the train, cross validation and test sets

def plot_label_counts(frame, title):
    """Draw a bar chart of the number of rows per breed in `frame`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot; its "label" column is counted.
    title : str
        Title shown above the chart.

    Each bar is annotated with its count. The figure is shown immediately via
    plt.show(); nothing is returned.
    """
    fig = plt.figure(figsize=(40, 10))
    ax = fig.add_axes([0, 0, 1, 1])
    ax.set_title(title, fontsize=50)
    sns.countplot(x="label", data=frame, ax=ax)
    # Write each bar's count at the top of the bar
    for bar in ax.patches:
        ax.text(x=bar.get_x() + 0.2, y=bar.get_height(), s=str(bar.get_height()), fontsize=10, color="black")
    plt.xlabel("")
    plt.ylabel("Count", fontsize=35)
    plt.tick_params(labelsize=5)
    plt.xticks(rotation=90)
    plt.show()


# One chart per split, in the order: training, cross validation, test
for split_frame, split_title in (
    (train_data_detail, "labels in Training Data set"),
    (cv_data_detail, "labels in cv Data set"),
    (test_data_detail, "labels in test Data set"),
):
    plot_label_counts(split_frame, split_title)


In [None]:
# For transfer learning, create the base model from Xception pre-trained on the ImageNet dataset.
# include_top=False requests the network without its top (classification) layers.
base_model = Xception(
    include_top=False,
    weights='imagenet',
)

In [None]:
# Converting the data into arrays of features and labels so that it can be fed into the model, except the test set, which is used for prediction only
# Initially it was in the form of a DataFrame
# Also creating bottleneck features from the base model and storing them

In [None]:
# Bottleneck features (base_model outputs) and one-hot breed labels for the training split
# NOTE(review): the features are cast to uint8 below, which drops the fractional part of
# base_model's output. The cast is kept because the cross-validation, test and prediction
# cells prepare their features the same way; if it is changed, change all four cells together.

# Fixed, sorted list of all breeds so the one-hot columns always appear in the same order,
# even if some breed happened to be absent from this split
class_names = sorted(data_detail["label"].unique())

train_x = []
train_y = []
# Iterate positionally over the rows, so this cell does not depend on the index being 0..n-1
train_rows = zip(train_data_detail["foldername"], train_data_detail["imagename"], train_data_detail["label"])
for i, (path1, path2, breed) in enumerate(train_rows):
    image_path = os.path.join(path1, path2)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when a file is missing or unreadable
        raise FileNotFoundError("could not read image: " + image_path)
    image = cv2.resize(image, (224,224))
    # here, we are normalizing the images
    image = image/255.0
    image = image.reshape(1,224,224,3)
    # verbose=0 silences Keras' per-call progress output (one predict call per image)
    features = base_model.predict(image, verbose=0)
    # drop the leading batch axis of size 1 and store the feature map of this image
    train_x.append(features[0])
    # appending corresponding labels
    train_y.append(breed)
    if i%500 == 0:
        print("no of images processed =",i)
train_x = np.array(train_x,dtype=np.uint8)
train_y = np.array(pd.get_dummies(pd.Categorical(train_y, categories=class_names)),dtype=np.uint8)
print(" for training data ", train_x.shape, train_y.shape)

In [None]:
# Bottleneck features (base_model outputs) and one-hot breed labels for the cross-validation split
# NOTE(review): the features are cast to uint8 below, which drops the fractional part of
# base_model's output. The cast is kept because the training, test and prediction cells
# prepare their features the same way; if it is changed, change all four cells together.

# Fixed, sorted list of all breeds so the one-hot columns always appear in the same order,
# even if some breed happened to be absent from this split
class_names = sorted(data_detail["label"].unique())

cv_x = []
cv_y = []
# Iterate positionally over the rows, so this cell does not depend on the index being 0..n-1
cv_rows = zip(cv_data_detail["foldername"], cv_data_detail["imagename"], cv_data_detail["label"])
for i, (path1, path2, breed) in enumerate(cv_rows):
    image_path = os.path.join(path1, path2)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when a file is missing or unreadable
        raise FileNotFoundError("could not read image: " + image_path)
    image = cv2.resize(image, (224,224))
    # here, we are normalizing the images
    image = image/255.0
    image = image.reshape(1,224,224,3)
    # verbose=0 silences Keras' per-call progress output (one predict call per image)
    features = base_model.predict(image, verbose=0)
    # drop the leading batch axis of size 1 and store the feature map of this image
    cv_x.append(features[0])
    # appending corresponding labels
    cv_y.append(breed)
    if i%500 == 0:
        print("no of images processed =",i)
cv_x = np.array(cv_x,dtype=np.uint8)
cv_y = np.array(pd.get_dummies(pd.Categorical(cv_y, categories=class_names)),dtype=np.uint8)
print(" for cv data ",cv_x.shape, cv_y.shape)

In [None]:
# Defining a model
def build_model(input_shape, n_classes):
    """Build the classifier that is trained on top of the bottleneck features.

    The factory is deliberately not called `model`: the instance below is bound to
    that name, and sharing it would overwrite the factory and make this cell fail
    when run a second time.

    Parameters
    ----------
    input_shape : tuple
        Shape of one feature map, without the batch axis.
    n_classes : int
        Number of output classes (units of the final softmax layer).

    Returns
    -------
    keras Sequential model: global average pooling -> dropout(0.3) -> softmax dense layer.
    """
    classifier = Sequential()
    classifier.add(GlobalAveragePooling2D(input_shape=input_shape))
    classifier.add(Dropout(0.3))
    classifier.add(Dense(n_classes, activation='softmax'))

    return classifier

# The number of classes is read from the one-hot label matrix instead of being hard-coded
model = build_model(input_shape=train_x.shape[1:], n_classes=train_y.shape[1])
model.summary()

In [None]:
# Configure the model (Adam optimizer, categorical cross-entropy loss, accuracy metric)
# and train it for 25 epochs, using the cross-validation arrays as validation data
model.compile(
    optimizer="adam",
    loss='categorical_crossentropy',
    metrics=["accuracy"],
)
hist = model.fit(
    train_x,
    train_y,
    epochs=25,
    validation_data=(cv_x, cv_y),
)

In [None]:
# Drop the training and cross-validation arrays and run the garbage collector to free up RAM
del train_x, train_y, cv_x, cv_y
gc.collect()

In [None]:
# Collecting the per-epoch losses and accuracies recorded by model.fit
train_loss = hist.history['loss']
val_loss   = hist.history['val_loss']
train_acc  = hist.history['accuracy']
val_acc    = hist.history['val_accuracy']
# Epoch numbers 1..N are derived from the recorded history instead of a hard-coded 25,
# so they stay in step with the number of epochs that were actually run
epoch_number = list(range(1, len(train_loss) + 1))

In [None]:
# Table with one row per epoch: training and cross-validation loss and accuracy
log_frame = pd.DataFrame(
    {
        "Epoch": epoch_number,
        "Train_Loss": train_loss,
        "Train_Accuracy": train_acc,
        "CV_Loss": val_loss,
        "CV_Accuracy": val_acc,
    }
)
log_frame

In [None]:
# plotting epoch vs loss
def plotting(epoch, train_loss, CV_loss, title):
    """Plot training loss (red) and cross-validation loss (blue) against the epoch number.

    epoch, train_loss and CV_loss are equally long sequences; title is the figure title.
    """
    fig, axes = plt.subplots(figsize=(12, 8))
    curves = (
        (train_loss, 'red', "Train"),
        (CV_loss, 'blue', "CV"),
    )
    for values, colour, name in curves:
        axes.plot(epoch, values, color=colour, label=name)
    axes.set_title(title, fontsize=25)
    axes.set_xlabel("Epochs", fontsize=20)
    axes.set_ylabel("Loss", fontsize=20)
    axes.grid()
    axes.legend(fontsize=20)

plotting(
    epoch=list(log_frame["Epoch"]),
    train_loss=list(log_frame["Train_Loss"]),
    CV_loss=list(log_frame["CV_Loss"]),
    title="EPOCH VS LOSS",
)

In [None]:
# plotting epoch vs accuracy
def plotting(epoch, train_acc, CV_acc, title, ylabel="Accuracy",
             train_label="Train_Accuracy", cv_label="CV_Accuracy"):
    """Plot a training curve (red) and a cross-validation curve (blue) against the epoch number.

    Parameters
    ----------
    epoch : sequence
        Epoch numbers for the x axis.
    train_acc, CV_acc : sequence
        Per-epoch values for the training and cross-validation curves.
    title : str
        Figure title.
    ylabel, train_label, cv_label : str, optional
        Y-axis label and legend entries. The defaults reproduce the accuracy plot;
        passing other labels lets this one function draw other metrics (e.g. loss)
        without being redefined.
    """
    fig, axes = plt.subplots(1,1, figsize = (12, 8))
    axes.plot(epoch, train_acc, color = 'red', label = train_label)
    axes.plot(epoch, CV_acc, color = 'blue', label = cv_label)
    axes.set_title(title, fontsize = 25)
    axes.set_xlabel("Epochs", fontsize = 20)
    axes.set_ylabel(ylabel, fontsize = 20)
    axes.grid()
    axes.legend(fontsize = 20)

plotting(list(log_frame["Epoch"]), list(log_frame["Train_Accuracy"]), list(log_frame["CV_Accuracy"]), "EPOCH VS ACCURACY")

In [None]:
# Bottleneck features (base_model outputs) and one-hot breed labels for the held-out test split
# NOTE(review): the features are cast to uint8 below, which drops the fractional part of
# base_model's output. The cast is kept because the training, cross-validation and prediction
# cells prepare their features the same way; if it is changed, change all four cells together.

# Fixed, sorted list of all breeds so the one-hot columns always appear in the same order,
# even if some breed happened to be absent from this split
class_names = sorted(data_detail["label"].unique())

test_x = []
test_y = []
# Iterate positionally over the rows, so this cell does not depend on the index being 0..n-1
test_rows = zip(test_data_detail["foldername"], test_data_detail["imagename"], test_data_detail["label"])
for path1, path2, breed in test_rows:
    image_path = os.path.join(path1, path2)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when a file is missing or unreadable
        raise FileNotFoundError("could not read image: " + image_path)
    image = cv2.resize(image, (224,224))
    # here, we are normalizing the images
    image = image/255.0
    image = image.reshape(1,224,224,3)
    # verbose=0 silences Keras' per-call progress output (one predict call per image)
    features = base_model.predict(image, verbose=0)
    # drop the leading batch axis of size 1 and store the feature map of this image
    test_x.append(features[0])
    # appending corresponding labels
    test_y.append(breed)
test_x = np.array(test_x,dtype=np.uint8)
test_y = np.array(pd.get_dummies(pd.Categorical(test_y, categories=class_names)),dtype=np.uint8)
print(" for test data ",test_x.shape, test_y.shape)

In [None]:
# Run the trained classifier on the held-out test features
test_predict = model.predict(x=test_x)

In [None]:
# Log loss of the predicted class probabilities on the held-out test split
# (log_loss is imported with the other dependencies in the notebook's import cell)
loss = log_loss(test_y, test_predict)
loss

In [None]:
# Drop the test arrays and their predictions, then run the garbage collector to free up RAM
del test_x, test_y, test_predict
gc.collect()

In [None]:
# The images to predict on live in the 'test' folder; they come without labels.
# Collect the folder and the file name of every such image in a dataframe.
detail = sorted(glob.glob("../input/dog-breed-identification/test/*"))
# os.path handles the splitting, so nothing depends on the directory depth or on the folder name
foldername = [os.path.dirname(path) for path in detail]
imagename = [os.path.basename(path) for path in detail]

# Defining dataframe and saving all the extracted information in that dataframe
test_data_for_prediction_detail = pd.DataFrame({"foldername": foldername, "imagename": imagename})

# Analysing the test data set for prediction detail
print("\nNumber of images in test data set for prediction  = "+str(len(detail)))
print(test_data_for_prediction_detail.columns)
test_data_for_prediction_detail.head()

In [None]:
# Bottleneck features (base_model outputs) for the unlabelled images to predict on.
# The file locations come from the test_data_for_prediction_detail dataframe; there are no labels here.
# NOTE(review): the features are cast to uint8 below, which drops the fractional part of
# base_model's output. The cast is kept because the training, cross-validation and test
# cells prepare their features the same way; if it is changed, change all four cells together.

prediction = []
# Iterate positionally over the rows, so this cell does not depend on the index being 0..n-1
prediction_rows = zip(test_data_for_prediction_detail["foldername"], test_data_for_prediction_detail["imagename"])
for i, (path1, path2) in enumerate(prediction_rows):
    image_path = os.path.join(path1, path2)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when a file is missing or unreadable
        raise FileNotFoundError("could not read image: " + image_path)
    image = cv2.resize(image, (224,224))
    # here, we are normalizing the images
    image = image/255.0
    image = image.reshape(1,224,224,3)
    # verbose=0 silences Keras' per-call progress output (one predict call per image)
    features = base_model.predict(image, verbose=0)
    # drop the leading batch axis of size 1 and store the feature map of this image
    prediction.append(features[0])
    if i%500 == 0:
        print("no of images processed =",i)
prediction = np.array(prediction,dtype=np.uint8)
print(" for test data for prediction ", prediction.shape)

In [None]:
# Run the trained classifier on the features of the images to be predicted
prediction_predict = model.predict(x=prediction)

In [None]:
# The feature array is no longer needed once the predictions exist:
# drop it and run the garbage collector to free up RAM
del prediction
gc.collect()

In [None]:
# Display the model's raw output for the images to be predicted
prediction_predict

In [None]:
# Building the submission table: an "id" column followed by one column per breed
# holding the model's output for that breed.
labels = pd.read_csv('../input/dog-breed-identification/labels.csv')
# Sorted breed names: the same order as the one-hot label columns the model was trained on
classes = sorted(set(labels['breed']))
# Ids (file names without extension) are taken from the very dataframe the predictions were
# computed from, so every row of prediction_predict is paired with its own image.
image_ids = [os.path.splitext(name)[0] for name in test_data_for_prediction_detail["imagename"]]
submission = pd.DataFrame(prediction_predict, columns=classes)
# insert() raises if the number of ids and the number of prediction rows differ
submission.insert(0, "id", image_ids)

In [None]:
# Final submission file: display the submission dataframe built in the previous cell
submission

In [None]:
# Write the submission dataframe to submission.csv, leaving out the dataframe index
submission.to_csv('submission.csv', index=False)