**The objective of this competition is to identify individual dolphins and whales from a dataset of their images.
In this EDA, I go through the photos and the provided information about each individual to get an overall picture of the data.**

In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from PIL import Image
from tqdm import tqdm
import random 
import cv2

In [None]:
# Load the training metadata into `train`.
train = pd.read_csv("../input/happy-whale-and-dolphin/train.csv")
# The sample submission file is loaded as `test`; its `image` column is used
# later in the notebook to build the test image paths.
test = pd.read_csv("../input/happy-whale-and-dolphin/sample_submission.csv")
# Summary statistics of the training frame, shown as the cell output.
train.describe()

In [None]:
# Preview the first rows of the training metadata.
train.head()

**First, check whether there are any missing values.**

In [None]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# Count the missing values in each column of the test (sample submission) frame.
test.isnull().sum()

**Adding the file path to our image dataframe below.**

In [None]:
# Directories that hold the image files for each split.
train_img_dir = "../input/happy-whale-and-dolphin/train_images/"
test_img_dir = "../input/happy-whale-and-dolphin/test_images/"

# Prefix every image file name with its split's directory to get a usable path.
for frame, img_dir in ((train, train_img_dir), (test, test_img_dir)):
    frame["path"] = img_dir + frame["image"]

train.head()

**Check the species name**

In [None]:
# Inspect the species column: how many distinct names, which ones, and how
# many images each of them has.
species_col = train["species"]
print("number of unique species: ", species_col.nunique())
print("names of species: ", species_col.unique())

# Keep the distinct names in `species`, sorted in place.
species = species_col.unique()
species.sort()

print("------------------------------------------")
print(species_col.value_counts())

In [None]:
### Clean misspelled species names.
### Also rename the abbreviated names so that every species ends with "dolphin" or "whale".
# NOTE(review): "globis" may refer to pilot whales (genus Globicephala); if so it
# could be merged with the existing "pilot_whale" class instead - confirm.
species_name_fixes = {
    "bottlenose_dolpin" : "bottlenose_dolphin",
    "kiler_whale" : "killer_whale",
    "beluga" : "beluga_whale",
    "globis" : "globis_whale"
}
# Assign the result back to the column. Calling replace(..., inplace=True) on the
# selected column is a chained in-place update: pandas warns about it and, with
# Copy-on-Write enabled, it leaves `train` unchanged.
train["species"] = train["species"].replace(species_name_fixes)

species = train["species"].unique()
print("number of unique species: ", train["species"].nunique())
print("names of species: ", train["species"].unique())

In [None]:
# Derive a coarse class per image: species whose name contains "dolphin" are
# labelled "dolphin", every other species is labelled "whale".
train['labels'] = [
    "dolphin" if "dolphin" in species_name else "whale"
    for species_name in train["species"]
]
train.head()

In [None]:
def plot_train_image(dataset, title, ncols=7):
    """Show one randomly chosen image for every species in ``dataset``.

    Parameters
    ----------
    dataset : DataFrame with a ``species`` column and a ``path`` column that
        holds the image file paths.
    title : text used as the figure's super-title.
    ncols : number of images per row of the grid (default 7).

    The grid has at least 4 rows, which keeps the 4x7 layout for up to 28
    species, and grows when there are more species so that ``plt.subplot``
    never receives an out-of-range position.
    """
    species_names = dataset["species"].dropna().unique()
    # Ceiling division without importing math.
    rows_needed = -(-len(species_names) // ncols)
    nrows = max(4, rows_needed)

    plt.figure(figsize = (15,10))
    plt.suptitle(title, fontsize=30)
    for i, sp in enumerate(species_names):
        plt.subplot(nrows, ncols, i+1)
        candidate_paths = dataset.loc[dataset["species"] == sp, "path"].tolist()
        # The context manager closes the file once imshow has copied the pixels.
        with Image.open(random.choice(candidate_paths)) as img:
            plt.imshow(img)
        plt.axis("off")
        plt.title(sp)
    plt.tight_layout()
    plt.show()

In [None]:
### Show one randomly picked training photo for every species in the dataset.
plot_train_image(train,"Train Images")

In [None]:
# Function for retrieving the width and height of every photo.
def get_image_size(train):
    """Add image-size columns to ``train`` in place.

    Opens every file listed in ``train["path"]`` and records its size, then
    adds three columns to the frame:

    * ``widths``     - image width as reported by PIL
    * ``heights``    - image height as reported by PIL
    * ``dimensions`` - ``widths * heights``

    The frame is modified in place and nothing is returned.
    """
    widths, heights = [], []
    for path in tqdm(train["path"]):
        # Close each file as soon as its size is read; otherwise one handle
        # stays open per image until garbage collection.
        with Image.open(path) as img:
            width, height = img.size
        widths.append(width)
        heights.append(height)

    train["widths"] = widths
    train["heights"] = heights
    train["dimensions"] = train['widths'] * train['heights']


In [None]:
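# One-off export of the frame after the image-size columns were added. It is left
# disabled; presumably this is the file that the next cell loads.
# train.to_csv("train_df_with_imgs.csv", index=False)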

In [None]:
# Computing the image sizes takes around ten minutes, so a previously saved copy
# of the result is loaded when it is available.
try:
    train = pd.read_csv("../input/happy-whale-and-dolphin-eda/train_df_with_imgs.csv")
except FileNotFoundError:
    # No saved copy attached: compute the size columns now (this modifies
    # `train` in place) and save them so later runs can skip the slow pass.
    get_image_size(train)
    train.to_csv("train_df_with_imgs.csv", index=False)
train.head()

In [None]:
# One point per training image: width on the x axis, height on the y axis.
size_fig, size_ax = plt.subplots(figsize=(20, 8))
sns.scatterplot(data=train, x="widths", y="heights", ax=size_ax)
plt.show()

As we can see, we actually have multiple sizes of photos in our dataset.

In [None]:
# Report the largest and smallest image height and width in the training frame.
print("Image Max Height: ", train["heights"].max())
print("Image Min Height: ", train["heights"].min())
print("Image Max Width: ", train["widths"].max())
print("Image Min Width: ", train["widths"].min())

In [None]:
#Image size distribution
# Three stacked violin panels (width, height, width*height), one violin per
# species, coloured by the whale/dolphin label.
fig, (ax1, ax2, ax3) = plt.subplots(3,1, figsize= (30,20))

fig.suptitle("Image Size distribution on Species", size = 30, weight = "bold")

# (axis, column to plot, text used for both the panel title and the y label)
size_panels = [
    (ax1, "widths", "Width"),
    (ax2, "heights", "Height"),
    (ax3, "dimensions", "Dimensions"),
]
for panel_ax, column, panel_name in size_panels:
    sns.violinplot(data = train, x = "species", y = column, ax = panel_ax, hue = "labels")
    panel_ax.set_title(panel_name, y = 0.97, size = 15, weight = "bold")
    panel_ax.set_xlabel("")
    panel_ax.set_ylabel(panel_name, size=13, weight = "bold")
    # Species names are long, so tilt them and anchor them at their right end.
    panel_ax.set_xticklabels(panel_ax.get_xticklabels(), rotation = 45, ha = 'right')

sns.despine(left=True, bottom=True)
# Leave room for the super-title and for the rotated tick labels between panels.
plt.subplots_adjust(top=0.90, hspace=0.5);

I was a bit curious after seeing that the photos of gray whales all have the same size. So I print out some random pictures of this species below to take a look.

In [None]:
# Show up to ten randomly chosen gray whale photos in a 2x5 grid.
gray_whales = train[train["species"] == "gray_whale"].reset_index(drop=True)

# range(len(...)) makes every row eligible, including the last one, and the
# min() keeps random.sample from raising when fewer than ten photos exist.
sample_size = min(10, len(gray_whales))
picked_rows = random.sample(range(len(gray_whales)), sample_size)

plt.figure(figsize=(20,10))
for i, n in enumerate(picked_rows):
    plt.subplot(2,5,i+1)
    plt.axis("off")
    # Close the file once imshow has copied the pixel data.
    with Image.open(gray_whales.loc[n,"path"]) as im:
        plt.imshow(im)
plt.tight_layout()
plt.show()

In [None]:
# Side-by-side horizontal bar charts of images per species, whales on the left
# and dolphins on the right, each ordered from most to fewest images.
fig, ax = plt.subplots(1,2, figsize=(16,8))
whales = train[train["labels"] == "whale"]
dolphins = train[train["labels"] == "dolphin"]
# Every row must belong to exactly one of the two groups.
assert (len(whales)+len(dolphins))==len(train)

# (axis, rows to count, panel title, extra countplot arguments)
count_panels = [
    (ax[0], whales, 'Whales', {}),
    (ax[1], dolphins, 'Dolphins', {"palette": "RdYlGn"}),
]
for panel_ax, group, panel_title, extra_kwargs in count_panels:
    sns.countplot(
        y = "species",
        data = group,
        order = group["species"].value_counts().index,
        ax = panel_ax,
        **extra_kwargs
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(None)

plt.tight_layout()
plt.show()


In [None]:
#pie chart
# Share of training images per class (whale / dolphin).
label_cnt = train.groupby("labels").size().reset_index(name="counts")

pie_fig, pie_ax = plt.subplots(figsize=(8, 8))
pie_ax.pie(
    label_cnt["counts"],
    labels=label_cnt["labels"],
    autopct='%1.1f%%',
    colors=sns.color_palette('Paired')[0:9],
    shadow=True,
    startangle=90,
)
pie_ax.legend(loc="upper left")
plt.show()

In our dataset, the number of photos varies by individual and by class (whale/dolphin).

Let's take a look at the individuals now.


In [None]:
# The ten individuals with the most photos, drawn as a bar chart.
individuals = train["individual_id"].value_counts().iloc[:10]
top_ten = pd.DataFrame(
    {
        'individual_id': individuals.index,
        'frequency': individuals.values,
    }
)

plt.figure(figsize=(12, 4))
plt.bar(
    top_ten["individual_id"],
    top_ten["frequency"],
    width=0.8,
    color=(0.2, 0.4, 0.6),
    zorder=4,  # draw the bars above the grid lines
)
plt.title("Top 10 Individual Ids used by frequency")
plt.xlabel("Individual Ids")
plt.ylabel("frequency")
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.grid(visible=True, color='grey', linestyle='-', linewidth=0.9, alpha=0.2, zorder=0)
plt.show()

In [None]:
# Density of the log of photos-per-individual, drawn separately for each class.
plt.figure(figsize = (20, 10))
for class_name in ("whale", "dolphin"):
    photos_per_individual = train.loc[train["labels"] == class_name, "individual_id"].value_counts()
    # Label each curve where it is drawn so the legend cannot drift out of
    # step with the plotting order.
    sns.kdeplot(np.log(photos_per_individual), label = class_name)
plt.xlabel("Scaled Counts for Individual Occurrences", fontsize=15)
plt.ylabel("Density", fontsize=15)
plt.legend(prop = {'size': 20})
plt.show()

In [None]:
# Final look at the first rows of the training frame with all columns used above.
train.head()

References:

https://www.kaggle.com/code/sahamed/eda-visualization-augmentation

https://www.kaggle.com/code/kayvanshah/eda-whale-dolphin-identification

Really grateful for these shared notebooks. I have learnt a lot and hope my version can be helpful to others as well.