In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns

import os
import os, warnings
import PIL
import PIL.Image

import tensorflow as tf
from tensorflow.keras.preprocessing import image_dataset_from_directory
import tensorflow_datasets as tfds
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from sklearn import preprocessing

import sys

# Silence all Python warnings for the session, unless the interpreter was
# started with explicit -W options (in which case sys.warnoptions is non-empty
# and the user's choice is respected).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

## HappyWhale - Whale and Dolphin Classification

In [None]:
# Load the training metadata subset; later cells read its `image` and
# `species` columns. The trailing comment keeps an alternative CSV path.
df_train = pd.read_csv('../input/ml7641-train/train_subset.csv') #('../input/happy-whale-and-dolphin/train.csv')
df_train.head()

In [None]:
# List the contents of the input dataset directory (shell command).
!ls ../input/ml7641-train/

## Class Distribution

In [None]:
# Shuffle every row of the metadata. The fixed random_state (same seed the
# data generators use) makes the shuffle, and therefore the train/test split
# derived from it, reproducible across kernel restarts.
# Note: sample() keeps the original index labels on the shuffled rows.
df_train_random = df_train.sample(frac=1, random_state=7641)
df_train_random

In [None]:
# Persist the shuffled metadata (including its index column) to the working directory.
df_train_random.to_csv('df_train_random.csv')

In [None]:
# Show the metadata row(s) belonging to one specific image file.
image_mask = df_train["image"] == "000be9acf46619.jpg"
df_train.loc[image_mask]

In [None]:
# Per-species image counts as a table (kept for inspection in later cells).
class_view = pd.DataFrame(df_train['species'].value_counts())

# Apply the theme before creating the figure so both figure and axes use it.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(14,7))
sns.histplot(df_train, x="species", color='#0B606F', kde=False, ax=ax)

# Write each bar's count just above the bar.
for rect in ax.patches:
    height = rect.get_height()
    ax.annotate(f'{int(height)}', xy=(rect.get_x()+rect.get_width()/2, height),
                xytext=(0, 5), textcoords='offset points', ha='center', va='bottom')

# Species names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
ax.set_title('Class count', x=0.54, y=1.1, fontsize=30)
plt.show()

In [None]:
# Distinct species labels present in the training metadata.
df_train["species"].unique()

In [None]:
# Class list handed to the data generators and used to size the output layer.
classe_names=df_train["species"].unique() #=['whale', 'dolphin']

## VGG16 preprocessing

In [None]:
!rm -r testing_data
!rm -r training_data

!mkdir testing_data
!mkdir training_data

In [None]:
# Print the image file name of the first shuffled row (nothing if the frame is empty).
for row_label, first_row in df_train_random.head(1).iterrows():
    print(first_row["image"])

In [None]:
# Number of rows corresponding to 80% of the data (the training share of the split).
len(df_train_random) * 0.8

In [None]:
import os
import shutil
import random
from tqdm import tqdm

directory = "../input/ml7641-train/train_images/train_images"

# Copy the first 80% of the *shuffled* rows into ./training_data and the rest
# into ./testing_data. The split is decided by position in df_train_random,
# not by the index label: sample() preserves the original labels, so comparing
# labels would split by original CSV order and ignore the shuffle.
train_count = int(len(df_train_random) * 0.8)

for position, filename in enumerate(tqdm(df_train_random["image"])):
    split_dir = "training_data" if position < train_count else "testing_data"
    source_path = os.path.join(directory, filename)
    target_path = os.path.join(".", split_dir, filename)
    shutil.copy(source_path, target_path)

In [None]:
# Sanity check: number of files copied into each split folder (train, test).
len(os.listdir('./training_data')), len(os.listdir('./testing_data'))

In [None]:
# Generator config shared by the training and validation iterators:
# the preprocessing function is the identity (pixels pass through unchanged)
# and 10% of the rows are reserved for the "validation" subset.
datagen = ImageDataGenerator(preprocessing_function=lambda x: x,validation_split=0.10)
# Alternative preprocessing_function, currently disabled:
#tf.keras.applications.vgg16.preprocess_input

In [None]:
# Convert the class labels to a plain list for the generators' `classes` argument.
# list() works on both an ndarray and a list, so re-running this cell is safe
# (calling .tolist() a second time would fail on the already-converted list).
classe_names = list(classe_names)

In [None]:
# Training batches: rows of df_train whose image file exists in ./training_data,
# restricted to the "training" share of datagen's validation split.
# No `rescale` argument here: flow_from_dataframe does not have one (it belongs
# to the ImageDataGenerator constructor) and silently ignores it, so batches
# carry raw 0-255 pixel values.
train_generator=datagen.flow_from_dataframe(
    df_train,
    directory='./training_data/',
    x_col='image',
    y_col='species',
    subset="training",
    target_size=(224,224),
    batch_size=32,
    seed=7641,
    shuffle=True,
    classes=classe_names,
    class_mode="categorical",)

In [None]:
# Validation batches: the "validation" share of datagen's split over the
# images in ./training_data.
# No `rescale` argument here: flow_from_dataframe does not have one (it belongs
# to the ImageDataGenerator constructor) and silently ignores it, so batches
# carry raw 0-255 pixel values.
valid_generator=datagen.flow_from_dataframe(
    df_train,
    directory='./training_data/',
    x_col='image',
    y_col='species',
    subset="validation",
    target_size=(224,224),
    batch_size=32,
    seed=7641,
    shuffle=True,
    classes=classe_names,
    class_mode="categorical",)

In [None]:
# Peek at the competition's sample submission file.
df_submission=pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')
df_submission.head()

In [None]:
# Dedicated generator config for the held-out images: identity preprocessing
# and no validation split, so no test image is set aside.
test_datagen = ImageDataGenerator(preprocessing_function=lambda x: x,validation_split=0.0)

# Built from test_datagen (not the 10%-split `datagen`) and without `subset`,
# so every df_train row whose image exists in ./testing_data is evaluated.
# shuffle=False keeps batch order aligned with test_generator.labels, which
# later cells compare against model.predict() output.
# No `rescale`: flow_from_dataframe silently ignores it; pixels stay 0-255.
test_generator=test_datagen.flow_from_dataframe(
    df_train,
    directory='./testing_data/',
    x_col='image',
    y_col='species',
    target_size=(224,224),
    batch_size=1,
    shuffle=False,
    classes=classe_names,
    class_mode="categorical",)

In [None]:
# Preview: first image of four consecutive test batches, side by side.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(15,15))

for panel in ax:
    batch_pixels = next(test_generator)[0]
    # Cast to uint8 for display and take the first image of the batch.
    preview = np.squeeze(batch_pixels.astype('uint8')[0])
    panel.imshow(preview)
    panel.axis('off')

In [None]:
# Preview: first image of four consecutive training batches, side by side.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(15,15))

for panel in ax:
    batch_pixels = next(train_generator)[0]
    # Cast to uint8 for display and take the first image of the batch.
    preview = np.squeeze(batch_pixels.astype('uint8')[0])
    panel.imshow(preview)
    panel.axis('off')

In [None]:
# Preview: first image of four consecutive validation batches, side by side.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(15,15))

for panel in ax:
    batch_pixels = next(valid_generator)[0]
    # Cast to uint8 for display and take the first image of the batch.
    preview = np.squeeze(batch_pixels.astype('uint8')[0])
    panel.imshow(preview)
    panel.axis('off')

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers
from tensorflow.keras import optimizers
from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout
from tensorflow.keras.optimizers import Adam


In [None]:
# ImageNet-pretrained VGG16 without its classifier head, used as a feature
# extractor for 224x224 RGB inputs. Arguments left at their defaults
# (input_tensor, pooling, classes, classifier_activation) are omitted.
vgg_model = tf.keras.applications.vgg16.VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)

# Exclude the backbone's weights from training.
vgg_model.trainable = False

vgg_model.summary()

In [None]:
# VGG16 backbone followed by a dense classification head.
# Layer order is kept exactly as before because a later cell selects layers
# of this model by index.
model = Sequential([
    vgg_model,
    Flatten(),
    Dropout(0.5),
    Dense(512, activation='relu', name='dense_1'),
    Dense(256, activation='relu', name='dense_2'),
    Dense(128, activation='relu', name='dense_3'),
    # softmax (not sigmoid): each image has exactly one species, and
    # categorical_crossentropy expects a probability distribution over classes.
    Dense(len(classe_names), activation='softmax', name='output'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Early stopping on validation accuracy, restoring the best weights seen.
# patience=10 matches the ResNet and Xception training cells so the three
# backbones are trained under the same stopping rule (patience=0 would stop
# at the first epoch that fails to improve).
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    min_delta=0,
    patience=10,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
)

history = model.fit(train_generator, epochs=24, validation_data=valid_generator, verbose=1, callbacks=[callback])

In [None]:
# Re-compile to attach precision and recall alongside accuracy, then score
# the model on the held-out set. Model.evaluate replaces the deprecated
# evaluate_generator and, with no step count, consumes every batch of
# test_generator exactly once instead of a hard-coded number of steps.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', 'Precision', 'Recall'])
model.evaluate(test_generator)

In [None]:
def extract_layers(main_model, starting_layer_ix, ending_layer_ix):
    """Assemble a Sequential model from a contiguous slice of another model.

    Args:
        main_model: model whose layers are looked up via ``get_layer(index=...)``.
        starting_layer_ix: index of the first layer to include.
        ending_layer_ix: index of the last layer to include (inclusive).

    Returns:
        A new ``Sequential`` holding the selected layers in order. The layer
        objects themselves are added, not copies of them.
    """
    new_model = Sequential()
    selected_layers = (
        main_model.get_layer(index=layer_ix)
        for layer_ix in range(starting_layer_ix, ending_layer_ix + 1)
    )
    for layer in selected_layers:
        new_model.add(layer)
    return new_model

In [None]:
# Build a sub-model from layers 0..6 of `model`.
# NOTE(review): the VGG classifier above has seven layers (indices 0-6), so this
# range includes the final `output` layer and model_part returns class scores,
# not an intermediate embedding — confirm that is intended.
model_part = extract_layers(model, 0, 6) #.summary()
model_part.summary()

In [None]:
from keras.preprocessing import image

folder_path = "./testing_data"
img_width, img_height = 224, 224

# Run model_part on every file in folder_path, one image at a time.
# The file list is sorted because os.listdir returns names in arbitrary order;
# sorting makes the row order of `predictions` reproducible, so it can be
# matched back to file names with sorted(os.listdir(folder_path)).
predictions=[]
for file_name in tqdm(sorted(os.listdir(folder_path))):
    file_path = os.path.join(folder_path, file_name)
    pil_image = image.load_img(file_path, target_size=(img_width, img_height))
    # Add a leading batch axis: the model expects a batch of images.
    pixel_batch = np.expand_dims(image.img_to_array(pil_image), axis=0)

    # verbose=0: tqdm already reports progress; avoid one log line per image.
    prediction = model_part.predict(pixel_batch, batch_size=10, verbose=0)
    predictions.append(prediction)

In [None]:
# Stack the per-image prediction arrays into a single ndarray.
a = np.array(predictions)
# pd.DataFrame(a).to_csv('testing_data.csv')

In [None]:
import pickle

In [None]:
# Persist the stacked predictions for the ./testing_data images.
with open('testing_emb_vgg.pickle', 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Spot-check model_part on the image array of the first test batch.
# (`images` is not defined anywhere in this notebook, so the generator's
# first batch is used instead; element 0 of a batch is the image array.)
model_part.predict(test_generator[0][0])

## ResNet50 (no model-specific preprocessing)

In [None]:
# Same generator config as the VGG section: identity preprocessing, 10% validation split.
datagen = ImageDataGenerator(preprocessing_function=lambda x: x,validation_split=0.10)


In [None]:
# Display the class labels passed to the generators below.
classe_names

In [None]:
# Training batches: rows of df_train whose image file exists in ./training_data,
# restricted to the "training" share of datagen's validation split.
# No `rescale` argument here: flow_from_dataframe does not have one (it belongs
# to the ImageDataGenerator constructor) and silently ignores it, so batches
# carry raw 0-255 pixel values.
train_generator=datagen.flow_from_dataframe(
    df_train,
    directory='./training_data/',
    x_col='image',
    y_col='species',
    subset="training",
    target_size=(224,224),
    batch_size=32,
    seed=7641,
    shuffle=True,
    classes=classe_names,
    class_mode="categorical",)

In [None]:
# Validation batches: the "validation" share of datagen's split over the
# images in ./training_data.
# No `rescale` argument here: flow_from_dataframe does not have one (it belongs
# to the ImageDataGenerator constructor) and silently ignores it, so batches
# carry raw 0-255 pixel values.
valid_generator=datagen.flow_from_dataframe(
    df_train,
    directory='./training_data/',
    x_col='image',
    y_col='species',
    subset="validation",
    target_size=(224,224),
    batch_size=32,
    seed=7641,
    shuffle=True,
    classes=classe_names,
    class_mode="categorical",)

In [None]:
# Peek at the competition's sample submission file.
df_submission=pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')
df_submission.head()

In [None]:
# Dedicated generator config for the held-out images: identity preprocessing
# and no validation split, so no test image is set aside.
test_datagen = ImageDataGenerator(preprocessing_function=lambda x: x,validation_split=0.0)

# Built from test_datagen (not the 10%-split `datagen`) and without `subset`,
# so every df_train row whose image exists in ./testing_data is evaluated.
# shuffle=False keeps batch order aligned with test_generator.labels, which
# the confusion-matrix cells compare against model.predict() output.
# No `rescale`: flow_from_dataframe silently ignores it; pixels stay 0-255.
test_generator=test_datagen.flow_from_dataframe(
    df_train,
    directory='./testing_data/',
    x_col='image',
    y_col='species',
    target_size=(224,224),
    batch_size=1,
    shuffle=False,
    classes=classe_names,
    class_mode="categorical",)

In [None]:
# Preview: first image of four consecutive test batches, side by side.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(15,15))

for panel in ax:
    batch_pixels = next(test_generator)[0]
    # Cast to uint8 for display and take the first image of the batch.
    preview = np.squeeze(batch_pixels.astype('uint8')[0])
    panel.imshow(preview)
    panel.axis('off')

In [None]:
# Preview: first image of four consecutive training batches, side by side.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(15,15))

for panel in ax:
    batch_pixels = next(train_generator)[0]
    # Cast to uint8 for display and take the first image of the batch.
    preview = np.squeeze(batch_pixels.astype('uint8')[0])
    panel.imshow(preview)
    panel.axis('off')

In [None]:
# Preview: first image of four consecutive validation batches, side by side.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(15,15))

for panel in ax:
    batch_pixels = next(valid_generator)[0]
    # Cast to uint8 for display and take the first image of the batch.
    preview = np.squeeze(batch_pixels.astype('uint8')[0])
    panel.imshow(preview)
    panel.axis('off')

In [None]:
# ImageNet-pretrained ResNet50 without its classifier head, used as a feature
# extractor for 224x224 RGB inputs. Arguments left at their defaults
# (input_tensor, pooling, classes) are omitted.
resnet_model = tf.keras.applications.resnet50.ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)

# Exclude the backbone's weights from training.
resnet_model.trainable = False

In [None]:
# ResNet50 backbone followed by a dense classification head.
model = Sequential([
    resnet_model,
    Flatten(),
    Dropout(0.5),
    Dense(512, activation='relu', name='dense_1'),
    Dense(256, activation='relu', name='dense_2'),
    # softmax (not sigmoid): each image has exactly one species, and
    # categorical_crossentropy expects a probability distribution over classes.
    Dense(len(classe_names), activation='softmax', name='output'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Training run without VGG preprocessing.
# Stop once val_accuracy has failed to improve for 10 epochs and roll back to
# the best weights; the remaining EarlyStopping options stay at their defaults.
early_stopping_options = dict(
    monitor="val_accuracy",
    patience=10,
    restore_best_weights=True,
)
callback = tf.keras.callbacks.EarlyStopping(**early_stopping_options)
history = model.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=24,
    verbose=1,
    callbacks=[callback],
)

In [None]:
# Re-compile to attach precision and recall alongside accuracy, then score
# the model on the held-out set. Model.evaluate replaces the deprecated
# evaluate_generator and, with no step count, consumes every batch of
# test_generator exactly once instead of a hard-coded number of steps.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', 'Precision', 'Recall'])
model.evaluate(test_generator)

In [None]:
# Confusion matrix of true vs. predicted class index on the test generator.
# NOTE(review): rows of model.predict() line up with test_generator.labels only
# if the generator yields samples in unshuffled order — verify its shuffle setting.
modelPredictions = model.predict(test_generator)
# Predicted class = position of the largest score in each prediction row.
modelPredictions1 = list(np.argmax(modelPredictions, axis=1))
testLabels = test_generator.labels
confusion = tf.math.confusion_matrix(testLabels, modelPredictions1)
plt.imshow(confusion)

In [None]:
from keras.preprocessing import image

folder_path = "./training_data"
img_width, img_height = 224, 224

# Rebuild model_part from the current `model`; otherwise the sub-model built
# earlier from the VGG classifier would be reused here and its outputs saved
# under the ResNet file name. The range covers every layer, mirroring the
# earlier whole-model extraction.
# NOTE(review): lower the end index if an intermediate embedding is wanted.
model_part = extract_layers(model, 0, len(model.layers) - 1)

# Run model_part on every file in folder_path, one image at a time.
# The file list is sorted because os.listdir returns names in arbitrary order;
# sorting makes the row order of `predictions` reproducible, so it can be
# matched back to file names with sorted(os.listdir(folder_path)).
predictions=[]
for file_name in tqdm(sorted(os.listdir(folder_path))):
    file_path = os.path.join(folder_path, file_name)
    pil_image = image.load_img(file_path, target_size=(img_width, img_height))
    # Add a leading batch axis: the model expects a batch of images.
    pixel_batch = np.expand_dims(image.img_to_array(pil_image), axis=0)

    # verbose=0: tqdm already reports progress; avoid one log line per image.
    prediction = model_part.predict(pixel_batch, batch_size=10, verbose=0)
    predictions.append(prediction)

In [None]:
# Stack the per-image prediction arrays into a single ndarray.
a = np.array(predictions)
# pd.DataFrame(a).to_csv('testing_data.csv')

In [None]:
import pickle

In [None]:
# Persist the stacked predictions for the ./training_data images.
with open('training_emb_resnet.pickle', 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Spot-check model_part on the image array of the first test batch.
# (`images` is not defined anywhere in this notebook, so the generator's
# first batch is used instead; element 0 of a batch is the image array.)
model_part.predict(test_generator[0][0])

## Xception

In [None]:
# ImageNet-pretrained Xception without its classifier head, used as a feature
# extractor for 224x224 RGB inputs. Arguments left at their defaults
# (input_tensor, pooling, classes, classifier_activation) are omitted.
xception_model = tf.keras.applications.xception.Xception(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)

# Exclude the backbone's weights from training.
xception_model.trainable = False

In [None]:
# Xception backbone followed by a dense classification head.
model = Sequential([
    xception_model,
    Flatten(),
    Dropout(0.5),
    Dense(512, activation='relu', name='dense_1'),
    Dense(256, activation='relu', name='dense_2'),
    Dense(128, activation='relu', name='dense_3'),
    # softmax (not sigmoid): each image has exactly one species, and
    # categorical_crossentropy expects a probability distribution over classes.
    Dense(len(classe_names), activation='softmax', name='output'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Stop once val_accuracy has failed to improve for 10 epochs and roll back to
# the best weights; the remaining EarlyStopping options stay at their defaults.
early_stopping_options = dict(
    monitor="val_accuracy",
    patience=10,
    restore_best_weights=True,
)
callback = tf.keras.callbacks.EarlyStopping(**early_stopping_options)
history = model.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=24,
    verbose=1,
    callbacks=[callback],
)

In [None]:
# Re-compile to attach precision and recall alongside accuracy, then score
# the model on the held-out set. Model.evaluate replaces the deprecated
# evaluate_generator and, with no step count, consumes every batch of
# test_generator exactly once instead of a hard-coded number of steps.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', 'Precision', 'Recall'])
model.evaluate(test_generator)

In [None]:
# Confusion matrix of true vs. predicted class index on the test generator.
# NOTE(review): rows of model.predict() line up with test_generator.labels only
# if the generator yields samples in unshuffled order — verify its shuffle setting.
modelPredictions = model.predict(test_generator)
# Predicted class = position of the largest score in each prediction row.
modelPredictions1 = list(np.argmax(modelPredictions, axis=1))
testLabels = test_generator.labels
confusion = tf.math.confusion_matrix(testLabels, modelPredictions1)
plt.imshow(confusion)

## Save model

In [None]:
from tensorflow.keras.models import load_model
# Save the current `model` to an HDF5 file and load it back as `new_model`.
model.save('model.h5')
new_model=load_model('model.h5')

In [None]:
# Chain the reloaded model with a Softmax layer.
# NOTE(review): the classifier's `output` layer already applies an activation,
# so this Softmax receives activations rather than raw logits — confirm the
# extra normalisation is intended.
probability_model = tf.keras.Sequential([new_model, 
                                         tf.keras.layers.Softmax()])

In [None]:
# Predict on the first test batch. A generator batch is an (images, labels)
# pair, so only the image array (element 0) is passed to predict().
predictions = probability_model.predict(test_generator[0][0])

In [None]:
# Score vector predicted for the first sample of the batch.
predictions[0]

In [None]:
# Index of the highest-scoring class for the first sample.
np.argmax(predictions[0])

In [None]:
# Mapping from class label to the integer index used by the generator.
train_generator.class_indices

In [None]:
import matplotlib.image as mpimg
# Read one image from the competition's test_images folder and display it.
img = mpimg.imread('../input/happy-whale-and-dolphin/test_images/000110707af0ba.jpg')
imgplot = plt.imshow(img)