In [None]:
import numpy as np 
import pandas as pd
import os
from PIL import Image
import gc

import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import dask
import dask.dataframe as dd

from subprocess import check_output

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.metrics import categorical_accuracy, top_k_categorical_accuracy, categorical_crossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50,MobileNetV2
from tensorflow.keras.applications.mobilenet import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image

# Sanity check: show what the input directory contains.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
def read_and_resize(filepath, size=None):
    """Load an image as grayscale, resize it to a square and preprocess it.

    Parameters
    ----------
    filepath : str
        Path of the image file to load.
    size : int, optional
        Edge length in pixels of the square output. When omitted, the
        notebook-level ``resize`` setting is used, which is what the
        function always did before this parameter existed.

    Returns
    -------
    The array produced by ``image.img_to_array`` after it has been passed
    through ``preprocess_input`` (imported from the MobileNet module).
    """
    if size is None:
        # Resolved at call time, so the configuration cell only has to run
        # before the first call, not before this definition.
        size = resize
    im = image.load_img(filepath,
                        color_mode="grayscale",
                        target_size=(size, size))
    x = image.img_to_array(im)
    return preprocess_input(x)

In [None]:
# Locations of the training and test image folders.
train_dir, test_dir = "../input/train", "../input/test"

# Edge length (pixels) that every image is resized to.
resize = 124
# Each whale Id is oversampled until it has at least this many training rows.
sample_to = 5
# Batch size handed to the data iterators.
batch_size = 32

In [None]:
# Load the training table (the later cells use its 'Image' and 'Id' columns).
train_csv_path = "../input/train.csv"
train = pd.read_csv(train_csv_path)

In [None]:
# Share of rows labelled 'new_whale'.
new_whale_share = train.Id.eq('new_whale').mean()
print(new_whale_share)

# Share of distinct Ids that occur exactly once.
singleton_share = train.Id.value_counts().eq(1).mean()
print(singleton_share)

In [None]:
# Per-Id image counts, leaving out rows labelled 'new_whale'.
known_mask = train.Id.ne('new_whale')
im_count = train.loc[known_mask, 'Id'].value_counts()

In [None]:
# Attach the per-Id image count to every row as 'sighting_count'.
# 'new_whale' was excluded when im_count was built, so those rows get NaN.
im_count = im_count.rename('sighting_count')
train = train.join(im_count, on='Id')

In [None]:
# (rows, columns) of the training table after 'sighting_count' was joined on.
train.shape

In [None]:
# Hold out one randomly chosen image for every known whale that has more than
# one sighting.
# The rows are filtered BEFORE shuffling so the boolean mask is aligned with the
# frame it indexes (masking the shuffled frame made pandas emit its
# "Boolean Series key will be reindexed" warning), and the shuffle is seeded so
# the train/validation split is the same on every run.
val_candidates = train[(train.Id != 'new_whale') & (train.sighting_count > 1)]
val_fns = set(
    val_candidates
    .sample(frac=1, random_state=42)
    .groupby('Id')
    .first()  # first row per Id in shuffled order == one random image per Id
    .Image
)

In [None]:
# Split the table on the 'new_whale' label.
is_new_whale = train['Id'].eq('new_whale')
train_xnw = train.loc[~is_new_whale].reset_index(drop=True)  # xnw = excluding new_whale
train_nw = train.loc[is_new_whale].reset_index(drop=True)    # nw = new_whale rows only
# Number of distinct Ids left once 'new_whale' is removed.
num_classes = train_xnw['Id'].nunique(dropna=False)

In [None]:
# The full table is no longer needed once it has been split into train_xnw / train_nw;
# drop the reference and run a garbage collection pass.
del train
gc.collect()

In [None]:
# Route each known-whale row to validation or training depending on whether its
# image was picked as a hold-out file.
in_val = train_xnw.Image.isin(val_fns)
train_xnw_val = train_xnw.loc[in_val].reset_index(drop=True)
train_xnw_train = train_xnw.loc[~in_val]

In [None]:
# Smallest and largest per-Id sighting count among the training rows.
train_counts = train_xnw_train.sighting_count
print(train_counts.min())
print(train_counts.max())

In [None]:
# Oversample so that every whale Id has at least `sample_to` training rows.
#
# The per-Id pieces are collected in a list and concatenated once at the end:
# concatenating onto a growing frame inside the loop re-copies every earlier row
# on each iteration. Groups that are already large enough contribute no empty
# frame to the concat, and sampling uses a seeded generator so the oversampled
# table is reproducible.
oversample_rng = np.random.RandomState(42)
oversampled_groups = []

for _, group_rows in tqdm(train_xnw_train.groupby('Id')):
    n_missing = max(sample_to - len(group_rows), 0)
    oversampled_groups.append(group_rows)
    if n_missing:
        # Top the group up by drawing, with replacement, from its own rows.
        oversampled_groups.append(
            group_rows.sample(n_missing, replace=True, random_state=oversample_rng)
        )

# 'sighting_count' is dropped because it no longer matches the oversampled rows.
res = (
    pd.concat(oversampled_groups)
    .reset_index(drop=True)
    .drop(columns=['sighting_count'])
)

In [None]:
# Number of rows in the oversampled training table.
len(res)

In [None]:
# The un-oversampled training rows have been replaced by `res`;
# drop the reference and run a garbage collection pass.
del train_xnw_train
gc.collect()

In [None]:
# (rows, columns) of the oversampled training table.
res.shape

In [None]:
# Recount rows per Id on the oversampled table and attach the result as a fresh
# 'sighting_count' column.
im_count_new = res['Id'].value_counts().rename('sighting_count')
res = res.join(im_count_new, on='Id')

In [None]:
# One image per multi-sighting Id went to the validation set, so the largest
# count is 72 rather than 73.
resampled_counts = res.sighting_count
print(resampled_counts.max())
print(resampled_counts.min())

#### Classifier to identify whales that have been seen before

In [None]:
# Table of known whale Ids and their image counts, rarest Id first.
#
# The index and value column are named explicitly instead of renaming whatever
# `reset_index()` happens to produce: pandas < 2.0 yields columns 'index'/'Id',
# while pandas >= 2.0 yields 'Id'/'count'. With the newer naming the old rename
# mapping turned the Id column into 'Count', so the later `df['Id']` lookup
# raised KeyError.
df = (
    train_xnw['Id']
    .value_counts()
    .sort_values(ascending=True)
    .rename_axis('Id')
    .reset_index(name='Count')
    .rename(index=str)  # keep the string row labels the original code produced
)
df.tail()

In [None]:
# Map each whale Id to an integer class index that follows the row order of df.
d = dict(zip(df['Id'], range(len(df))))

In [None]:
# Pixel tensor for the training images, one (resize, resize, 1) slice per row of res.
# float32 halves the footprint of NumPy's default float64 buffer and is Keras'
# default float type.
x_train = np.zeros((len(res), resize, resize, 1), dtype=np.float32)

# Fill by position (enumerate) so the loop does not depend on res carrying a clean
# 0..n-1 index. Iterating the 'Image' column alone avoids building a Series per row,
# and because the column has a length tqdm can show real progress.
for pos, image_name in enumerate(tqdm(res['Image'])):
    x_train[pos] = read_and_resize(os.path.join(train_dir, image_name))

In [None]:
# Integer class index for every training row. Only the 'Id' column is needed, so
# it is iterated directly instead of materialising every row with iterrows();
# an Id missing from the mapping still raises KeyError.
train_labels = np.array([d[whale_id] for whale_id in tqdm(res['Id'])])

# Pin the one-hot width to the size of the Id -> index mapping rather than letting
# to_categorical infer it from the largest label that happens to be present, so
# the label matrix always has one column per mapped class.
y_train = keras.utils.to_categorical(train_labels, num_classes=len(d))

In [None]:
# The training arrays have been built, so the source tables can go;
# drop the references and run a garbage collection pass.
del res, train_xnw
gc.collect()

In [None]:
# Pixel tensor for the validation images, one (resize, resize, 1) slice per row.
# float32 halves the footprint of NumPy's default float64 buffer and is Keras'
# default float type.
x_val = np.zeros((len(train_xnw_val), resize, resize, 1), dtype=np.float32)

# Fill by position (enumerate) so the loop does not depend on the frame's index
# labels. Iterating the 'Image' column alone avoids building a Series per row,
# and because the column has a length tqdm can show real progress.
for pos, image_name in enumerate(tqdm(train_xnw_val['Image'])):
    x_val[pos] = read_and_resize(os.path.join(train_dir, image_name))

In [None]:
# Integer class index for every validation row. Only the 'Id' column is needed,
# so it is iterated directly instead of materialising every row with iterrows();
# an Id missing from the mapping still raises KeyError.
val_labels = np.array([d[whale_id] for whale_id in tqdm(train_xnw_val['Id'])])

# The validation set only holds Ids with more than one sighting, so the largest
# label present is not guaranteed to be the last class. Pin the one-hot width to
# the size of the Id -> index mapping so the matrix always has one column per
# mapped class instead of a width inferred from the data.
y_val = keras.utils.to_categorical(val_labels, num_classes=len(d))

In [None]:
# The validation arrays have been built; drop the table and the integer labels,
# then run a garbage collection pass.
del train_xnw_val, val_labels
gc.collect()

In [None]:
# Report the shapes of the feature and label arrays (train/val inputs, then labels).
for arr in (x_train, x_val, y_train, y_val):
    print(arr.shape)

In [None]:
# Data generator created with its default settings: no augmentation options are passed.
gen =ImageDataGenerator( )

In [None]:
# ResNet50 built without pretrained weights, taking single-channel
# (resize x resize) inputs and producing `num_classes` outputs.
model = ResNet50(
    weights=None,
    input_shape=(resize, resize, 1),
    classes=num_classes,
)

In [None]:
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and is
# rejected by newer Keras releases.
model.compile(optimizer=Adam(learning_rate=0.0005),
              loss='categorical_crossentropy',
              metrics=['accuracy', 'top_k_categorical_accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

In [None]:
# Wrap the in-memory arrays in batch iterators for training and validation.
flow_kwargs = dict(batch_size=batch_size)
batches = gen.flow(x_train, y_train, **flow_kwargs)
val_batches = gen.flow(x_val, y_val, **flow_kwargs)

In [None]:
epochs = 10
# Model.fit accepts the batch iterators directly; fit_generator is deprecated and
# missing from newer Keras releases.
# len(iterator) is the number of batches including the final partial one, so every
# sample is visited once per epoch and validation cannot end up with zero steps
# when the validation set is smaller than a single batch (n // batch_size == 0).
history = model.fit(batches,
                    steps_per_epoch=len(batches),
                    validation_data=val_batches,
                    validation_steps=len(val_batches),
                    epochs=epochs)

In [None]:
# Plot the loss and top-5 accuracy curves for training and validation.
fig, ax = plt.subplots(2, 1)

# Top panel: loss per epoch. The redundant `axes=` keyword was dropped: a line
# drawn through ax[0].plot already belongs to ax[0].
ax[0].plot(history.history['loss'], color='b', label="Training loss")
ax[0].plot(history.history['val_loss'], color='r', label="Validation loss")
ax[0].set_ylabel("Loss")
legend = ax[0].legend(loc='best', shadow=True)

# Bottom panel: top-k categorical accuracy per epoch (Keras' default k is 5).
ax[1].plot(history.history['top_k_categorical_accuracy'], color='b', label="Training Top 5 Accuracy")
ax[1].plot(history.history['val_top_k_categorical_accuracy'], color='r',label="Validation Top 5 accuracy")
ax[1].set_xlabel("Epoch")
ax[1].set_ylabel("Top 5 accuracy")
legend = ax[1].legend(loc='best', shadow=True)