In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from PIL import Image
from keras.preprocessing import image

# **1**

## **1.a**

In [None]:
# Read the label table for the training images and the sample submission,
# whose row count is used below as the size of the test set.
labels = pd.read_csv('../input/dog-breed-identification/labels.csv')
sample_submission = pd.read_csv('../input/dog-breed-identification/sample_submission.csv')

n_train = len(labels)
n_test = len(sample_submission)
print('training set has ' + str(n_train) + ' entries')
print('test set has ' + str(n_test) + ' entries')
print('total number of entries: ' + str(n_test + n_train))

In [None]:
# Display the labels table as this cell's output.
labels

## **1.b**

In [None]:
# Directories holding the competition's train and test JPEG files.
train_dir = '../input/dog-breed-identification/train/'
test_dir = '../input/dog-breed-identification/test/'

# Map every image id in the labels table to its opened PIL image.
# NOTE(review): despite the name, the values are PIL Image objects, not paths.
image_paths_train = {}
for name in labels['id']:
    image_file = train_dir + name + '.jpg'
    image_paths_train[name] = Image.open(image_file)

In [None]:
# Summarise the image dimensions instead of printing one line per training
# image, which floods the cell output without making the spread visible.
# PIL's Image.size is a (width, height) tuple.
image_sizes = pd.Series([img.size for img in image_paths_train.values()])
print('number of distinct image dimensions: ' + str(image_sizes.nunique()))
print('most common image dimensions (width, height) and their counts:')
print(image_sizes.value_counts().head(10))

In [None]:
# Inspect one sample image and count the breed classes.
img = image_paths_train['000bec180eb18c7604dcecc8fe0dba07']
# Derive the channel count from the image's bands rather than hard-coding it,
# so the printed text stays correct for non-RGB images.
print('image mode: ' + str(img.mode) + ' (' + str(len(img.getbands())) + ' channels)')
# Number of label rows per breed; reused by the distribution plot.
classes = labels.groupby('breed')['breed'].count()
print('number of classes: ' + str(classes.shape[0]))
print('image dimensions is not constant between all images, so we need to change the dimensions to be the same')
print('we can use augmentation, like flipping the image or shifting it (which rearanges the pixels but the breed features are preserved)')

## **1.c**

In [None]:
# Bar chart of the number of label rows per breed.
ax = classes.plot.bar(figsize=(22, 7))
ax.set_ylabel('count per breed')
ax.set_title("entries per class distribution")
plt.show()
print('the data is somewhat balanced, but there are breeds with more examples than others')

## **1.d**

We found an existing result obtained with a feed-forward neural network (FNN) with 4 layers and the Adam optimizer.
* results: 
* loss: 1.03
* acc: 0.917

We also found a result that uses Xception + Inception (validation scores).
* results: 
* logloss: 0.07
* acc: 0.975

## **1.e**

In [None]:
def plot_multiple_imgs(X,y=None,nrow=2,ncol=2,figsize=(13,7),preds=None,skip=0):
    """Show a grid of images, optionally annotated with labels and predictions.

    Parameters
    ----------
    X : sequence of images accepted by ``Axes.imshow``.
    y : optional sequence of true labels, indexed like ``X``; drawn in blue at
        the bottom-left of each image.
    nrow, ncol : grid shape; at most ``nrow * ncol`` images are shown.
    figsize : figure size passed to ``plt.subplots``.
    preds : optional sequence of predicted labels, indexed like ``X``; drawn at
        the bottom-right, green when equal to the true label, red when not,
        and black when ``y`` is not given.
    skip : index in ``X`` of the first image to show.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so the [row, col] indexing below works for every grid shape.
    fig, axes = plt.subplots(nrows=nrow, ncols=ncol, figsize=figsize, squeeze=False)
    fig.subplots_adjust(hspace=0.1, wspace=0.1)
    for i in range(nrow * ncol):
        cell = axes[i // ncol, i % ncol]
        idx = skip + i
        if idx >= len(X):
            # Fewer images than grid cells: leave the remaining cells blank
            # instead of raising IndexError.
            cell.axis('off')
            continue
        cell.imshow(X[idx], cmap='binary')
        cell.set_xticks([])
        cell.set_yticks([])
        if preds is not None:
            if y is None:
                # No ground truth to compare against
                pred_color = 'black'
            else:
                pred_color = 'green' if y[idx] == preds[idx] else 'red'
            cell.text(0.85, 0.1, str(preds[idx]), transform=cell.transAxes,
                      color=pred_color, weight='bold')
        if y is not None:
            cell.text(0.05, 0.1, str(y[idx]), color='blue',
                      transform=cell.transAxes, weight='bold')
    plt.show()

In [None]:
def plot_breed_imgs(breed, df, dic):
    """Plot images belonging to a single breed.

    Selects the rows of ``df`` whose 'breed' column equals ``breed``, looks up
    each row's 'id' in ``dic`` and hands the resulting images to
    ``plot_multiple_imgs`` with its default grid settings.
    """
    breed_ids = df.loc[df['breed'] == breed, 'id']
    # dict.fromkeys keeps one entry per id, in first-seen order
    breed_images = [dic[img_id] for img_id in dict.fromkeys(breed_ids)]
    plot_multiple_imgs(breed_images)

### **Breeds that are easily separable**

In [None]:
# Print a caption and plot sample images for each of the two breeds.
for caption, breed_name in (('golden retriever', 'golden_retriever'),
                            ('pekinese', 'pekinese')):
    print(caption)
    plot_breed_imgs(breed_name, labels, image_paths_train)

### **Breeds that are harder to distinguish**

In [None]:
# Print a caption and plot sample images for each of the two breeds.
for caption, breed_name in (('norfolk terrier', 'norfolk_terrier'),
                            ('norwich terrier', 'norwich_terrier')):
    print(caption)
    plot_breed_imgs(breed_name, labels, image_paths_train)

# **2**

In [None]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Flatten, GlobalAveragePooling2D, Activation, Dropout, BatchNormalization, Conv2D, MaxPool2D
from keras.utils import to_categorical
import cv2

In [None]:
# Load every training image listed in `labels` as a 224x224, 3-channel array.
imgs = []
for img_id in labels['id']:
    img_path = train_dir + img_id + '.jpg'
    # IMREAD_COLOR always yields 3 channels. IMREAD_UNCHANGED can return
    # single- or 4-channel arrays, which would not match the model's
    # (224, 224, 3) input shape.
    # NOTE(review): OpenCV returns channels in BGR order — confirm whether the
    # model should be fed RGB instead.
    img_bgr = cv2.imread(img_path, cv2.IMREAD_COLOR)
    if img_bgr is None:
        # cv2.imread signals an unreadable file by returning None, not by raising
        raise FileNotFoundError('could not read image: ' + img_path)
    imgs.append(cv2.resize(img_bgr, (224, 224)))

## **2.a**
We will use a train-test split validation strategy, holding out 20% of the data as the test set.

In [None]:
from sklearn.model_selection import train_test_split


## **2.b**

In [None]:
# Baseline CNN: two (conv, conv, dropout, max-pool) stages followed by a
# flattened 120-unit softmax output layer.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    Conv2D(32, (3, 3), activation='relu'),
    Dropout(0.1),
    MaxPool2D(),
    Conv2D(16, (3, 3), activation='relu'),
    Conv2D(16, (3, 3), activation='relu'),
    Dropout(0.2),
    MaxPool2D(),
    Flatten(),
    Dense(120, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
from sklearn.preprocessing import LabelBinarizer
# IPython magic: sets the TF_KERAS environment variable for this kernel.
%env TF_KERAS = 1

# Encode the breed names as indicator vectors with LabelBinarizer.
encoder = LabelBinarizer()
transfomed_label = encoder.fit_transform(labels.breed)
# Stack the resized images and the encoded labels into arrays.
X = np.asarray(imgs)
Y = np.asarray(transfomed_label)
# Hold out 20% of the samples as a test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.2, random_state=42)

In [None]:
# Train for 10 epochs, holding out 20% of the training arrays for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    shuffle=True,
    epochs=10,
)

In [None]:
def show_results(history):
    """Plot training and validation accuracy and loss per epoch, side by side.

    Parameters
    ----------
    history : object exposing a ``history`` dict of per-epoch metric lists
        (the value returned by ``model.fit``). It must contain 'loss' and
        'val_loss', plus either 'accuracy'/'val_accuracy' or 'acc'/'val_acc'.
    """
    metrics = history.history
    # Some Keras versions record accuracy under 'acc' instead of 'accuracy'
    acc_key = 'accuracy' if 'accuracy' in metrics else 'acc'
    panels = [
        (acc_key, 'Model accuracy', 'Accuracy'),
        ('loss', 'Model loss', 'Loss'),
    ]

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    for axis, (key, title, ylabel) in zip(ax, panels):
        axis.plot(metrics[key])
        axis.plot(metrics['val_' + key])
        axis.set_title(title)
        axis.set_ylabel(ylabel)
        axis.set_xlabel('Epoch')
        # The second curve is the val_* (validation) metric, not a test-set metric
        axis.legend(['Train', 'Validation'], loc='upper left')
    plt.show()
# Plot the curves recorded by the most recent model.fit call.
show_results(history)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,log_loss
import seaborn as sns

In [None]:
# Display the encoded test labels as this cell's output.
y_test

In [None]:
# `pred_cat` is first assigned in the prediction cell further down, so
# displaying it here raised NameError on a fresh kernel (Restart & Run All).
# It is displayed in the cell that follows the prediction cell instead.

In [None]:
# Predict class probabilities for the held-out test set and compare the most
# probable class with the true class.
preds = model.predict(X_test)
pred_cat = np.argmax(preds,axis=1)
y_cat = np.argmax(y_test,axis=1)
print('model accuracy on test set is: {0:.2f}%'.format(accuracy_score(y_cat,pred_cat)*100))

# The confusion matrix has one row and column per class (up to 120 here).
# Per-cell annotations are unreadable at that size, so they are turned off
# and the figure is enlarged.
plt.figure(figsize=(14, 12))
sns.heatmap(confusion_matrix(y_cat,pred_cat),cmap='Greens',annot=False)
plt.xlabel('Prediction')
plt.ylabel('True label')
# The title used to say 'mnist', which is not the dataset used in this notebook.
plt.title('dog breed Convolutional model \n classification results on test set')
plt.show()

In [None]:
# Display the predicted class index of each test sample.
pred_cat

In [None]:
# Highest predicted probability and its class index for every test sample.
preds_max = np.max(preds, axis=1)
pred_idx = np.argmax(preds, axis=1)
# NOTE(review): this cell used to end at `if(pred_max == 1):` with no body,
# which is a SyntaxError. Presumably the goal was to find predictions made
# with probability exactly 1 — confirm the intended follow-up.
confident_idx = np.flatnonzero(preds_max == 1)
print('number of predictions with probability 1: ' + str(len(confident_idx)))


## **2.c**
Possible reasons why the model accuracy isn't good:
* the model overfits because it is too complex
* resizing images of different dimensions may distort the features in the image
* the test loss grows while the training loss goes down, so we could use early stopping to stop training at a better point

## **2.d**
Prioritizing the reasons:
1. the test loss grows while the training loss goes down, so we could use early stopping to stop training at a better point
2. the model overfits because it is too complex
3. resizing images of different dimensions may distort the features in the image

We will try to get better results by using a k-split validation strategy and simplifying the model.

In [None]:
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler

1. implementing early stopping

In [None]:
# Add a 'file_name' column (image id + '.jpg') to `labels` in place; it is the
# x_col that flow_from_dataframe reads in the generator cell.
labels['file_name'] = labels['id'] + '.jpg'
# Display the updated table.
labels

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Reserve 20% of the labelled images for validation. No augmentation,
# normalization or rescaling is configured on this generator.
datagen = ImageDataGenerator(validation_split=0.2)

# Settings shared by the training and validation iterators.
flow_kwargs = dict(
    dataframe=labels,
    directory=train_dir,
    x_col="file_name",
    y_col="breed",
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical')

dg_train = datagen.flow_from_dataframe(subset='training', **flow_kwargs)
dg_test = datagen.flow_from_dataframe(subset='validation', **flow_kwargs)

# Model.fit accepts generators directly; fit_generator is deprecated in
# TensorFlow 2. The step counts are passed explicitly so an epoch is exactly
# one pass over each iterator, however the iterator type is detected.
# NOTE(review): this continues training the `model` fitted in an earlier cell
# rather than a fresh one — confirm that is intended.
history2 = model.fit(
    dg_train,
    steps_per_epoch=len(dg_train),
    validation_data=dg_test,
    validation_steps=len(dg_test),
    epochs=10)

In [None]:
# Plot the curves recorded during the generator-based training run.
show_results(history2)

In [None]:
# Evaluate on the held-out arrays produced by train_test_split. The names
# `imgs_test` / `labels_test` used here before are not defined anywhere in
# this notebook and raised NameError on a fresh kernel.
# NOTE(review): the generator-based training above draws from all of `labels`,
# so it may have seen some of these test images — confirm this is acceptable.
model.evaluate(x=X_test, y=y_test)

In [None]:
def set_callbacks(description = 'run1', es_patience = 10, rlop_patience = 7, tb_base_logdir = './logs/'):
    """Build the list of callbacks passed to ``model.fit``.

    Parameters
    ----------
    description : text inserted into the checkpoint file name.
    es_patience : ``patience`` of the EarlyStopping callback, which monitors
        'val_accuracy'.
    rlop_patience : ``patience`` of the ReduceLROnPlateau callback.
    tb_base_logdir : accepted but not used by this function.

    Returns
    -------
    list containing, in order, the ModelCheckpoint, EarlyStopping and
    ReduceLROnPlateau callbacks.
    """
    checkpoint_path = 'best_modelweights{}.hp'.format(description)
    return [
        ModelCheckpoint(checkpoint_path, save_best_only=True),
        EarlyStopping(patience=es_patience, monitor = 'val_accuracy'),
        ReduceLROnPlateau(patience=rlop_patience),
    ]

In [None]:
# Rebuild the two-stage CNN (dropout 0.1 in both stages) and train it with the
# checkpoint / early-stopping / LR-reduction callbacks.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    Conv2D(32, (3, 3), activation='relu'),
    Dropout(0.1),
    MaxPool2D(),
    Conv2D(16, (3, 3), activation='relu'),
    Conv2D(16, (3, 3), activation='relu'),
    Dropout(0.1),
    MaxPool2D(),
    Flatten(),
    Dense(120, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Early stopping uses a patience of 2 epochs.
training_callbacks = set_callbacks('initial_CNN_model_dog_breed1', es_patience=2)
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    shuffle=True,
    epochs=10,
    callbacks=training_callbacks,
)
show_results(history)

2. simplifying the model to avoid overfitting

In [None]:
# Smaller CNN: one convolution per stage (16 then 32 filters), each followed
# by dropout and max-pooling, then a 120-unit softmax layer.
model = Sequential([
    Conv2D(16, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    Dropout(0.1),
    MaxPool2D(),
    Conv2D(32, (3, 3), activation='relu'),
    Dropout(0.1),
    MaxPool2D(),
    Flatten(),
    Dense(120, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs with a 20% validation split, then plot the curves.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    shuffle=True,
    epochs=10,
)
show_results(history)