In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn import model_selection

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score,auc, accuracy_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn import datasets

from keras.datasets import mnist

from keras import losses, regularizers

from keras.models import Sequential, load_model, Model

from keras.layers import Activation,Dense, Dropout, Flatten, BatchNormalization

from keras.layers.convolutional import Conv2D, MaxPooling2D

from keras.constraints import maxnorm

from keras.utils.np_utils import to_categorical
import cv2
import glob

Ce dataset comporte des images de 200 espèces d'oiseaux. Il a 3 répertoires : train (qui comprend ~150 images par espèce), et test et valid (qui comprennent 5 images par espèce). Les images ne se répètent pas entre les répertoires : On est donc sûr de la validité des données (on valide bien sur des images jamais vues).

In [None]:
# Root of the bird-species dataset and its three split folders.
# Each path keeps its trailing slash because the loading cell builds class
# folders by plain string concatenation (split_dir + class_name).
bird_dir = "../input/100-bird-species/"
train_dir, valid_dir, test_dir = (
    f"{bird_dir}{split}/" for split in ("train", "valid", "test")
)

On stocke les différents noms de répertoire (qui correspondent aux noms des espèces).

In [None]:
# Each immediate sub-folder of the training directory is one class.
# Only direct children are listed, so the result does not depend on
# train_dir ending with a slash, nested folders are never mistaken for
# classes, and a wrong path fails immediately instead of silently
# producing an empty class list.
# Sorting fixes the name -> index mapping that the labels rely on.
with os.scandir(train_dir) as entries:
    classes = sorted(entry.name for entry in entries if entry.is_dir())

Puis on lit les images de chaque répertoire, qu'on enregistre dans 3 listes distinctes (chacune accompagnée d'une liste parallèle contenant les étiquettes).

In [None]:
# Empty containers for each split: X_* will collect the images and
# y_* the matching integer class labels.
X_train, y_train = [], []
X_test, y_test = [], []
X_valid, y_valid = [], []

In [None]:
def _load_split(split_dir, size=(100, 100)):
    """Read and resize every image of every class folder under `split_dir`.

    Args:
        split_dir: Split directory path ending with a separator; the class
            folder name is appended to it directly.
        size: Target size handed to cv2.resize for every image.

    Returns:
        (images, labels): two parallel lists. `labels[k]` is the index in
        `classes` of the folder `images[k]` was read from.

    Raises:
        OSError: if a matched file cannot be decoded as an image.
    """
    images, labels = [], []
    for label, class_name in enumerate(classes):
        # '*g' matches any file whose name ends in "g".
        pattern = os.path.join(split_dir + class_name, '*g')
        # glob returns files in arbitrary order; sort so runs are reproducible.
        for path in sorted(glob.glob(pattern)):
            # cv2.imread returns None (no exception) for unreadable files,
            # and decodes colour images in BGR channel order.
            image = cv2.imread(path)
            if image is None:
                raise OSError("Unable to read image file: " + path)
            images.append(cv2.resize(image, size))
            labels.append(label)
    return images, labels


# Rebinding fresh lists (instead of appending to existing ones) keeps this
# cell idempotent: re-running it never duplicates samples.
X_train, y_train = _load_split(train_dir)
X_test, y_test = _load_split(test_dir)
X_valid, y_valid = _load_split(valid_dir)

On transforme ces listes en arrays NumPy pour pouvoir les utiliser.

In [None]:
# Turn the three image collections into uint8 NumPy arrays.
# The right-hand tuple is built from the current objects before any name
# is rebound, so each split is converted exactly once.
X_test, X_train, X_valid = (
    np.array(images, dtype=np.uint8) for images in (X_test, X_train, X_valid)
)

On vérifie la forme des array

In [None]:
# Display the dimensions of the training image array.
X_train.shape

On a bien les dimensions attendues (images en 100x100 pixels, x3 pour les canaux de couleur — OpenCV les charge dans l'ordre BGR et non RGB). Essayons d'appliquer un modèle simple.

In [None]:
# One-hot encode the integer labels of each split.
# The width is pinned to len(classes): without num_classes, to_categorical
# infers it from the largest label present, so a split lacking the last
# class would yield a matrix narrower than the model's softmax layer.
y_train_cat = to_categorical(y_train, num_classes=len(classes))
y_test_cat = to_categorical(y_test, num_classes=len(classes))
y_valid_cat = to_categorical(y_valid, num_classes=len(classes))

In [None]:
def plot_scores(train):
    """Plot training and validation accuracy per epoch.

    Args:
        train: Object exposing a `history` dict (as returned by
            `model.fit`) with accuracy curves under 'accuracy' /
            'val_accuracy', or under the legacy 'acc' / 'val_acc' keys.

    Raises:
        KeyError: if neither spelling of a required curve is present.
    """
    history = train.history

    def _curve(*candidates):
        # Return the first available series among the candidate key names.
        for key in candidates:
            if key in history:
                return history[key]
        raise KeyError("None of %r found in training history" % (candidates,))

    accuracy = _curve('accuracy', 'acc')
    val_accuracy = _curve('val_accuracy', 'val_acc')
    epochs = range(len(accuracy))
    plt.plot(epochs, accuracy, 'b', label='Score apprentissage')
    plt.plot(epochs, val_accuracy, 'r', label='Score validation')
    plt.title('Scores')
    # Axis labels so the figure can be read on its own.
    plt.xlabel('Époque')
    plt.ylabel('Score')
    plt.legend()
    plt.show()

In [None]:
# Baseline CNN: a single convolution + pooling stage feeding a dense
# classifier with one softmax output per class.
# NOTE(review): the image arrays built earlier in this notebook are raw
# uint8 pixels and no layer here rescales them — consider normalising
# the inputs before training.
model = Sequential([
    Conv2D(32, (3, 3), input_shape=(100, 100, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Flatten(),
    Dense(512, activation='relu'),
    Dense(len(classes), activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs with the validation split passed as validation_data;
# the returned object is kept in `train` for plotting.
train = model.fit(
    x=X_train,
    y=y_train_cat,
    batch_size=200,
    epochs=20,
    verbose=1,
    validation_data=(X_valid, y_valid_cat),
)

In [None]:
# Plot training vs. validation accuracy for the run stored in `train`.
plot_scores(train)

Ce n'est pas bon du tout. Vu les résultats, le modèle répond sans doute toujours la même chose. Essayons avec un modèle plus profond.

In [None]:
# Deeper CNN: two double-convolution stages (32 then 64 filters) and a
# final 20-filter stage, each followed by pooling and light dropout,
# then the same dense classifier as the baseline.
model = Sequential([
    Conv2D(32, (3, 3), input_shape=(100, 100, 3), activation='relu'),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Conv2D(64, (3, 3), activation='relu'),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Conv2D(20, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Flatten(),
    Dense(512, activation='relu'),
    Dense(len(classes), activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the deeper model for 20 epochs with the validation split passed
# as validation_data.
train = model.fit(
    x=X_train,
    y=y_train_cat,
    batch_size=200,
    epochs=20,
    verbose=1,
    validation_data=(X_valid, y_valid_cat),
)

In [None]:
# Score the current model on the held-out test split.
model.evaluate(x=X_test, y=y_test_cat)

In [None]:
# Plot training vs. validation accuracy for the run stored in `train`.
plot_scores(train)

C'est loin d'être mauvais en sachant qu'on a 200 espèces d'oiseaux, donc 200 classes, mais il semble qu'on ait un problème d'overfitting. Essayons de poursuivre un peu l'entraînement.

In [None]:
# Ten additional epochs on the same model object (it is not rebuilt in
# this cell); `train` now holds only this run's history.
train = model.fit(
    x=X_train,
    y=y_train_cat,
    batch_size=200,
    epochs=10,
    verbose=1,
    validation_data=(X_valid, y_valid_cat),
)

In [None]:
# Score the current model on the held-out test split.
model.evaluate(x=X_test, y=y_test_cat)

In [None]:
# Plot training vs. validation accuracy for the run stored in `train`.
plot_scores(train)

On voit que le score de validation continue de stagner autour de 55%. Essayons de régulariser le modèle.

In [None]:
# Regularised CNN: three convolution stages (32+32, 64, 20 filters) with
# pooling and light dropout, followed by a wider dense head
# (4096 -> 1024) where each dense layer is followed by 50% dropout.
model = Sequential([
    Conv2D(32, (3, 3), input_shape=(100, 100, 3), activation='relu'),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Conv2D(20, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Flatten(),
    Dense(4096, activation='relu'),
    Dropout(0.5),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(len(classes), activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the regularised model for 20 epochs with the validation split
# passed as validation_data.
train = model.fit(
    x=X_train,
    y=y_train_cat,
    batch_size=200,
    epochs=20,
    verbose=1,
    validation_data=(X_valid, y_valid_cat),
)

In [None]:
# Plot training vs. validation accuracy for the run stored in `train`.
plot_scores(train)

On continue l'entraînement pour voir si le problème d'overfitting réapparaît.

In [None]:
# Twenty additional epochs on the same model object (it is not rebuilt
# in this cell); `train` now holds only this run's history.
train = model.fit(
    x=X_train,
    y=y_train_cat,
    batch_size=200,
    epochs=20,
    verbose=1,
    validation_data=(X_valid, y_valid_cat),
)

In [None]:
# Score the current model on the held-out test split.
model.evaluate(x=X_test, y=y_test_cat)

In [None]:
# Plot training vs. validation accuracy for the run stored in `train`.
plot_scores(train)

On a toujours un problème d'overfitting, mais un meilleur score de validation : Approximativement 66%, soit 2/3.

In [None]:
# Per-class report on the test split.
# Y_pred holds one score per class for each test image; argmax turns each
# row into the index of the highest-scoring class.
Y_pred = model.predict(X_test)
y_pred = np.argmax(Y_pred, axis=1)
print('Classification Report')
# `labels` is passed explicitly so the rows always line up with
# `target_names`, even if some class never appears in y_test or y_pred.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(classes))),
    target_names=classes,
))