# Happywhale - Whale and Dolphin Identification

![](https://storage.googleapis.com/kaggle-media/competitions/Happywhale/AU%20Kaggle%20Competition%20Description%20Image-03.jpg)

Importing Libraries

In [None]:
# These libraries are for data manipulation
import numpy as np
import pandas as pd

# These libraries are for working with directories and showing progress bars
import os
from glob import glob
from tqdm import tqdm

# These libraries are for visualization
import matplotlib.pyplot as plt
import plotly.express as px

# These libraries are for label encoding and one-hot encoding
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# These libraries are for building the model
from tensorflow.keras import layers
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.imagenet_utils import preprocess_input
from tensorflow.keras.layers import AveragePooling2D, MaxPooling2D, Dropout
from tensorflow.keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Conv2D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Getting the Files present in the main Directories

path = '/kaggle/input/happy-whale-and-dolphin/'
os.listdir(path)

In [None]:
# Loading the Train csv file and Sample Submission File using main dir

train_data = pd.read_csv(path+'train.csv')
samp_subm = pd.read_csv(path+'sample_submission.csv')

In [None]:
# Printing the dimension of the train.csv file

print('Number train samples:', len(train_data))

In [None]:
# Displaying the column name present in the train.csv file

train_data.columns

In [None]:
# Displaying the first five rows of the train.csv file

train_data.head()

In [None]:
# Printing the Number of training images available in the train image directory

print('Number train images:', len(os.listdir(path+'train_images/')))

In [None]:
# Printing the Number of testing images available in the test image directory

print('Number test images:', len(os.listdir(path+'test_images/')))

In [None]:
# Displaying the different species available in the dataset

train_data['species'].value_counts()

In [None]:
# Pie chart (Plotly) of how the training rows are distributed across species

# Count once and reuse the result for both the slice sizes and the slice names.
species_counts = train_data['species'].value_counts()
fig = px.pie(train_data, values=species_counts.values, names=species_counts.index)
fig.show()

In [None]:
# Horizontal bar chart of the number of rows per species, sorted ascending by count

# Sort once; the same Series supplies the bar positions, lengths and tick labels.
species_by_count = train_data["species"].value_counts().sort_values(ascending=True)

plt.figure(figsize=(15, 12))
plt.rcParams["font.size"] = 18
plt.barh(
    species_by_count.index,
    species_by_count,
    tick_label=species_by_count.index,
)
plt.show()

In [None]:
# Displaying the individual id present in the dataset

train_data['individual_id'].value_counts()

In [None]:
# Now we will prepare our data for training and also plot a few images

BASE_PATH = "../input/happy-whale-and-dolphin/train_images/"
# The JPEG files sit directly inside train_images/ (images are read elsewhere as
# BASE_PATH + <image name>), so the pattern must not include a "train/" sub-folder.
TRAIN_IMAGES = glob(BASE_PATH + "*.jpg")

In [None]:
# Display a randomly chosen training image together with its row in train.csv

# Dedicated names are used so that `path` (the dataset root defined near the top
# of the notebook) is not overwritten and the earlier cells stay re-runnable.
image_name = np.random.choice(train_data['image'])
image_path = BASE_PATH + image_name
im = plt.imread(image_path)
plt.figure(figsize=(15, 6))
plt.imshow(im)
plt.title(image_name)
# Hide the axis ticks; they carry no information for a photo.
plt.xticks([])
plt.yticks([])
# Last expression of the cell: shows the metadata row of the displayed image.
train_data[train_data['image'] == image_name]


In [None]:
# Point BASE_PATH at the test image folder and collect every JPEG path in it for later use

BASE_PATH = "../input/happy-whale-and-dolphin/test_images/"
TEST_IMAGES = glob(f"{BASE_PATH}*.jpg")

In [None]:
# Display a randomly chosen test image

# Kept under its own name so that the dataset-root variable `path` is left untouched by this cell.
test_image_path = np.random.choice(TEST_IMAGES)
im = plt.imread(test_image_path)
plt.figure(figsize=(15, 6))
plt.imshow(im)
# Title the plot with the file name only (last component of the path).
plt.title(test_image_path.split("/")[-1])


In [None]:
# Add a coarse 'label' column: 'whale' when the species name contains "whale", otherwise 'dolphin'
# NOTE(review): any whale species whose name lacks the substring "whale" ends up as 'dolphin' — confirm this is intended.

train_data['label'] = train_data['species'].apply(
    lambda species: 'whale' if 'whale' in species else 'dolphin'
)

In [None]:
# Bar chart of whale vs dolphin counts

# Build the two-column frame explicitly instead of value_counts().reset_index():
# the column names that call produces depend on the pandas version
# ('index'/'label' before 2.0, 'label'/'count' from 2.0 on), which breaks x='index'.
label_counts = train_data['label'].value_counts()
data = pd.DataFrame({
    'index': label_counts.index,    # category name: 'whale' / 'dolphin'
    'label': label_counts.values,   # number of rows in that category
})
fig = px.bar(data, x='index', y='label', color='label', title='Whale Vs Dolphin', text_auto=True)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

In [None]:
# Donut chart showing the proportion of whales vs dolphins

# Count once; the index gives the wedge labels and the values give the wedge sizes.
label_counts = train_data.label.value_counts()
labels = list(label_counts.index)
sizes = label_counts.values
explode = (0.05, 0.05)

fig, ax  = plt.subplots(figsize=(16, 8))
fig.suptitle('Whales and Dolphins ', size = 20, font="Serif")
ax.pie(
    sizes,
    explode=explode,
    startangle=60,
    labels=labels,
    autopct='%1.0f%%',
    pctdistance=0.7,
    colors=["#0077b6", "#90e0ef"],
)
# A white circle drawn over the centre turns the pie into a donut.
centre_hole = plt.Circle((0, 0), 0.4, fc='white')
ax.add_artist(centre_hole)
plt.show()

# Model building

In [None]:
#loading train.csv as train_df

train_df = pd.read_csv("../input/happy-whale-and-dolphin/train.csv")

In [None]:
# Printing the dimension of the train_df 

train_df.shape

In [None]:
# Displaying the first five rows of train_df

train_df.head()

In [None]:
# checking for null values

train_df.isnull().sum()

In [None]:
# Keep a single row per individual_id (the last occurrence in train_df);
# every other row for that individual is dropped from the training frame.

train_df=train_df.drop_duplicates(subset=['individual_id'],keep='last')

In [None]:
# This function loads the listed images resized to a fixed dimension

def Loading_Images(data, m, dataset, image_size=(32, 32)):
    """Load, resize and preprocess the images listed in ``data['image']``.

    Args:
        data: DataFrame (or mapping) whose ``'image'`` entry yields file names.
        m: Number of rows to allocate in the output array. Rows beyond the
            number of file names are left as zeros.
        dataset: Sub-directory of the competition input folder to read from,
            e.g. ``"train_images"`` or ``"test_images"``.
        image_size: ``(height, width)`` every image is resized to.

    Returns:
        NumPy array of shape ``(m, height, width, 3)`` holding the result of
        ``preprocess_input`` for each image, in iteration order.
    """
    print("Loading images")
    image_dir = os.path.join("../input/happy-whale-and-dolphin", dataset)
    height, width = image_size
    X_train = np.zeros((m, height, width, 3))
    for row, file_name in enumerate(tqdm(data['image'])):
        # target_size is (height, width) only; the channel count is not part of it.
        img = image.load_img(os.path.join(image_dir, file_name), target_size=(height, width))
        X_train[row] = preprocess_input(image.img_to_array(img))
    return X_train

In [None]:
# This function converts the text categories to one-hot encoded vectors

def prepare_labels(y):
    """One-hot encode the labels in ``y``.

    Args:
        y: Sequence of class labels (for example a pandas Series of ids).

    Returns:
        Tuple ``(onehot, label_encoder)``: ``onehot`` is a dense float array of
        shape ``(len(y), n_classes)`` and ``label_encoder`` is the fitted
        ``LabelEncoder`` whose ``inverse_transform`` maps column indices back
        to the original labels.
    """
    values = np.array(y)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)

    # Build the one-hot matrix straight from the integer codes. This yields the
    # same dense float matrix as OneHotEncoder fitted on the codes, without
    # depending on its `sparse` / `sparse_output` keyword, whose name differs
    # between scikit-learn releases.
    n_samples = len(integer_encoded)
    n_classes = len(label_encoder.classes_)
    onehot_encoded = np.zeros((n_samples, n_classes))
    onehot_encoded[np.arange(n_samples), integer_encoded] = 1.0
    return onehot_encoded, label_encoder

In [None]:
# Load every image listed in train_df from train_images/ and divide the resulting array by 255.
# NOTE(review): Loading_Images already applies preprocess_input before this division — confirm the combined scaling is intended.
X = Loading_Images(train_df, train_df.shape[0], "train_images")
X /= 255

In [None]:
y, label_encoder = prepare_labels(train_df['individual_id'])

In [None]:
y.shape

- Now that we have our data preprocessed we will build our model using Keras CNN

In [None]:
# Creating Model: two convolutional stages followed by a dense softmax classifier

model = Sequential([
    # Stage 1: 6x6 convolution on the 32x32x3 input, batch norm over axis 3, ReLU
    Conv2D(32, (6, 6), strides=(1, 1), input_shape=(32, 32, 3)),
    BatchNormalization(axis=3),
    Activation('relu'),

    # Stage 2: max-pool, 3x3 convolution with ReLU, then average-pool
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), strides=(1, 1)),
    Activation('relu'),
    AveragePooling2D((3, 3)),

    # Classifier head
    Flatten(),
    Dense(512, activation="relu"),
    Dropout(0.85),

    # One softmax output per column of the one-hot label matrix y
    Dense(y.shape[1], activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer="adam",
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Model fitting: train on X and y for 200 epochs with a batch size of 128.
# NOTE(review): no validation data and no callbacks are passed here, although
# ModelCheckpoint / EarlyStopping / ReduceLROnPlateau are imported — confirm this is intended.

history = model.fit(X, y, epochs=200, batch_size=128, verbose=1)

In [None]:
# saving our model for later use

model.save('model.h5')

# Evaluation of the model 

In [None]:
# Plot the accuracy recorded in `history` for every epoch

accuracy_per_epoch = history.history['accuracy']

plt.figure(figsize=(15,5))
plt.plot(accuracy_per_epoch)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Model accuracy')
plt.show()

In [None]:
# Plot the loss recorded in `history` for every epoch

loss_per_epoch = history.history['loss']

plt.figure(figsize=(15,5))
plt.plot(loss_per_epoch)
plt.xlabel('Epoch')
plt.ylabel('loss')
plt.title('Model loss')
plt.show()

# Inference

In [None]:
test = os.listdir("../input/happy-whale-and-dolphin/test_images")
print(len(test))

In [None]:
col = ['image']
test_df = pd.DataFrame(test, columns=col)
test_df['predictions'] = ''
#test_df=test_df.head(n=250)

In [None]:
# Predict the test set batch by batch so that only one batch of images is loaded at a time,
# and write a space-separated list of predicted ids into test_df['predictions'].

batch_size = 5000
top_k = 5                    # number of highest-scoring ids considered per image
confidence_threshold = 0.6   # ids scoring above this are ranked ahead of 'new_individual'
max_predictions = 5          # labels kept per image in the submission string
num_test_images = len(test_df)
prediction_col = test_df.columns.get_loc('predictions')

for batch_start in range(0, num_test_images, batch_size):
    batch_end = min(batch_start + batch_size, num_test_images)
    test_df_batch = test_df.iloc[batch_start:batch_end]

    X_batch = Loading_Images(test_df_batch, len(test_df_batch), "test_images")
    X_batch /= 255  # same scaling as applied to the training array
    predictions = model.predict(X_batch, verbose=1)

    batch_predictions = []
    for pred in predictions:
        # Indices of the top_k classes, highest probability first.
        top_idx = pred.argsort()[-top_k:][::-1]
        # Decode all candidates once instead of once per candidate.
        top_ids = label_encoder.inverse_transform(top_idx)

        confident = [
            name for name, class_idx in zip(top_ids, top_idx)
            if pred[class_idx] > confidence_threshold
        ]
        uncertain = [
            name for name, class_idx in zip(top_ids, top_idx)
            if not pred[class_idx] > confidence_threshold
        ]

        # 'new_individual' goes after the confident ids and before the rest;
        # the list is then cut to max_predictions labels.
        ranked = confident + ['new_individual'] + uncertain
        batch_predictions.append(' '.join(ranked[:max_predictions]))

    # Positional write, so it does not rely on test_df having a 0..n-1 index.
    test_df.iloc[batch_start:batch_end, prediction_col] = batch_predictions
    

- For Submission we will create submission.csv

In [None]:
# Creating submission.csv and printing first five rows

test_df.to_csv('submission.csv',index=False)
test_df.head()

> That's it,<br>
> I will continue to update the notebook,<br>
> I know there are many things we can improve to raise the overall accuracy and submission score,<br>
> Let me know your suggestions :)

![](https://thumbs.dreamstime.com/b/dental-smile-whale-icon-cartoon-style-dental-smile-whale-icon-cartoon-dental-smile-whale-vector-icon-web-design-117675439.jpg)