# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">IMPORT</p></div>
****


In [None]:
!pip install -U -q kaleido

In [None]:
print("\n... IMPORTS STARTING ...\n")

# print("\n... PIP/APT INSTALLS AND DOWNLOADS/ZIP STARTING ...")
# print("... PIP/APT INSTALLS COMPLETE ...\n")

print("\n\tVERSION INFORMATION")
# Machine Learning and Data Science Imports
import tensorflow as tf; print(f"\t\t– TENSORFLOW VERSION: {tf.__version__}");
import tensorflow_addons as tfa; print(f"\t\t– TENSORFLOW ADDONS VERSION: {tfa.__version__}");
import pandas as pd; pd.options.mode.chained_assignment = None;
import numpy as np; print(f"\t\t– NUMPY VERSION: {np.__version__}");
import sklearn; print(f"\t\t– SKLEARN VERSION: {sklearn.__version__}");
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from pandarallel import pandarallel; pandarallel.initialize();
from sklearn.model_selection import GroupKFold;

# RAPIDS
# import cudf, cupy, cuml

# Built In Imports
from kaggle_datasets import KaggleDatasets
from collections import Counter
from datetime import datetime
from glob import glob
import warnings
import requests
import hashlib
import imageio
import IPython
import sklearn
import urllib
import zipfile
import pickle
import random
import shutil
import string
import json
import math
import time
import gzip
import ast
import sys
import io
import os
import gc
import re

# Visualization Imports
from matplotlib.colors import ListedColormap
import matplotlib.patches as patches
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm; tqdm.pandas();
import plotly.express as px
import seaborn as sns
from PIL import Image, ImageEnhance
import matplotlib; print(f"\t\t– MATPLOTLIB VERSION: {matplotlib.__version__}");
from matplotlib import animation, rc; rc('animation', html='jshtml')
import plotly
import PIL
import cv2

def seed_it_all(seed=42):
    """Best-effort reproducibility: push one seed into every RNG in use.

    Args:
        seed: Value exported as ``PYTHONHASHSEED`` and handed to the
            ``random``, NumPy and TensorFlow seeding functions.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seed, same order as before: stdlib -> NumPy -> TensorFlow.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

    
print("\n\n... IMPORTS COMPLETE ...\n")

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">SETUP</p></div>
****


In [None]:
print("\n... ACCELERATOR SETUP STARTING ...\n")

# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # The resolver could not find a TPU -> use the CPU/GPU branch below.
    TPU = None

if TPU is not None:
    print(f"\n... RUNNING ON TPU - {TPU.master()}...")
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    # `tf.distribute.experimental.TPUStrategy` is deprecated; the stable
    # endpoint below is its documented replacement.
    strategy = tf.distribute.TPUStrategy(TPU)
else:
    print("\n... RUNNING ON CPU/GPU ...")
    # Yield the default distribution strategy in Tensorflow
    #   --> Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# What Is a Replica?
#    --> A single Cloud TPU device consists of FOUR chips, each of which has TWO TPU cores.
#    --> Therefore, for efficient utilization of Cloud TPU, a program should make use of each of the EIGHT (4x2) cores.
#    --> Each replica is essentially a copy of the training graph that is run on each core and
#        trains a mini-batch containing 1/8th of the overall batch size
N_REPLICAS = strategy.num_replicas_in_sync

print(f"... # OF REPLICAS: {N_REPLICAS} ...\n")

print("\n... ACCELERATOR SETUP COMPLTED ...\n")

In [None]:
print("\n... DATA ACCESS SETUP STARTED ...\n")

# Pick where the competition data is read from, plus the SavedModel I/O
# options that go with that location.
if not TPU:
    # Local path to training and validation images
    DATA_DIR = "/kaggle/input/happy-whale-and-dolphin"
    save_locally = load_locally = None
else:
    # Google Cloud Dataset path to training and validation images
    DATA_DIR = KaggleDatasets().get_gcs_path('happy-whale-and-dolphin')
    save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
    load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')

EXTRA_DATA_DIR = "/kaggle/input/extra-happywhale-metadata"
EMBED_DATA_DIR = "/kaggle/input/baseline-solution-train-embed"

print(f"\n... DATA DIRECTORY PATH IS:\n\t--> {DATA_DIR}")
print(f"\n... EXTRA METADATA DIRECTORY PATH IS:\n\t--> {EXTRA_DATA_DIR}")

# List the top-level entries of each directory, one header per directory.
for _header, _directory in (
    ("\n... IMMEDIATE CONTENTS OF DATA DIRECTORY IS:", DATA_DIR),
    ("\n... IMMEDIATE CONTENTS OF EXTRA METADATA DIRECTORY IS:", EXTRA_DATA_DIR),
    ("\n... IMMEDIATE CONTENTS OF EMBED DATA DIRECTORY IS:", EMBED_DATA_DIR),
):
    print(_header)
    for file in tf.io.gfile.glob(os.path.join(_directory, "*")):
        print(f"\t--> {file}")



In [None]:
print("\n... XLA OPTIMIZATIONS STARTING ...\n")

print("\n... CONFIGURE JIT (JUST IN TIME) COMPILATION ...\n")
# Switch on XLA optimizations through the graph optimizer's JIT flag
# (the notebook author reports ~10% speedup when using @tf.function calls).
tf.config.optimizer.set_jit(True)

print("\n... XLA OPTIMIZATIONS COMPLETED ...\n")

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">HELPER FUNCTIONS</p></div>
****

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">LOADING DATA</p></div>
****


In [None]:
# Load the training metadata table (train.csv inside DATA_DIR) into train_df.
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
train_df = pd.read_csv(TRAIN_CSV)

In [None]:
# Check the number of train_images and length of csv file.
# `train_len` is always defined from the CSV so that a mismatch only produces
# the warning below instead of leaving the name undefined for code that uses it.
train_len = train_df.shape[0]
n_train_images = len(os.listdir('../input/happy-whale-and-dolphin/train_images'))
if n_train_images != train_len:
    print("Check again your training folder dataset!")

In [None]:
# Preview the first rows of the training metadata.
train_df.head()

In [None]:
print("\n... BASIC DATA SETUP STARTING ...\n\n")


def _read_image_shapes(img_paths):
    """Decode every image in *img_paths* and report its dimensions.

    Args:
        img_paths: Iterable of image file paths readable by ``plt.imread``.

    Returns:
        Three parallel lists ``(shapes, widths, heights)``: the array shape as
        a string, ``shape[1]`` and ``shape[0]`` of each decoded image.
        The shape is stringified because the exploration code parses
        ``img_shape`` with ``ast.literal_eval`` -- presumably the same textual
        form the extra-metadata CSV uses; verify if both sources are mixed.
    """
    shapes = [plt.imread(path).shape for path in img_paths]
    return [str(s) for s in shapes], [s[1] for s in shapes], [s[0] for s in shapes]


# Thanks @karthickp6 for noticing this
# Misspelled / alternative species labels -> the label they are merged into.
FIX_NAME_MAPPING = {"bottlenose_dolpin":"bottlenose_dolphin",
                    "kiler_whale":"killer_whale",
                    "pilot_whale":"short_finned_pilot_whale",
                    "globis":"short_finned_pilot_whale"}

# Precomputed image shapes are only used when the extra-metadata dataset is attached.
_has_extra_metadata = os.path.isdir(EXTRA_DATA_DIR)

print("\n... TRAIN DATAFRAME ...\n")
# Image Path
train_df["img_path"] = os.path.join(DATA_DIR, "train_images")+"/"+train_df.image
# Image Shape
if _has_extra_metadata:
    # NOTE(review): columns are copied by row index, which assumes the extra
    # metadata CSV lists the images in the same order as train.csv -- confirm.
    EX_META_TRAIN_CSV = os.path.join(EXTRA_DATA_DIR, "train.csv")
    ex_train_df = pd.read_csv(EX_META_TRAIN_CSV)
    train_df["img_shape"] = ex_train_df["img_shape"]
    train_df["img_width"] = ex_train_df["img_width"]
    train_df["img_height"] = ex_train_df["img_height"]
else:
    # Fallback: decode every image once and derive per-row width/height
    # (previously a single row's shape tuple was assigned to the whole column).
    (train_df["img_shape"],
     train_df["img_width"],
     train_df["img_height"]) = _read_image_shapes(train_df["img_path"].values)
# Normalise the species labels; anything not in the mapping is kept as is.
train_df["species"] = train_df["species"].map(lambda name: FIX_NAME_MAPPING.get(name, name))
all_species = sorted(train_df.species.unique().tolist())
# The number of each image of Individual ID
train_df["n_img_of_ind"] = train_df.individual_id.map(train_df.individual_id.value_counts().to_dict())
# Integer <-> label lookup tables, numbered in sorted label order.
species_int2str_map = dict(enumerate(all_species))
species_str2int_map = {v:k for k,v in species_int2str_map.items()}

all_individuals = sorted(train_df.individual_id.unique().tolist())
ind_int2str_map = dict(enumerate(all_individuals))
ind_str2int_map = {v:k for k,v in ind_int2str_map.items()}

train_df["ind_sparse"] = train_df["individual_id"].map(ind_str2int_map)
train_df["species_sparse"] = train_df["species"].map(species_str2int_map)

display(train_df)

print("\n... TEST DATAFRAME ...\n")
# The test image list is taken from the sample submission file.
TEST_CSV = os.path.join(DATA_DIR, "sample_submission.csv")
EX_META_TEST_CSV = os.path.join(EXTRA_DATA_DIR, "test.csv")
test_df = pd.read_csv(TEST_CSV)
test_df["img_path"] = os.path.join(DATA_DIR, "test_images")+"/"+test_df.image
if _has_extra_metadata:
    ex_test_df = pd.read_csv(EX_META_TEST_CSV)
    test_df["img_shape"] = ex_test_df["img_shape"]
    test_df["img_width"] = ex_test_df["img_width"]
    test_df["img_height"] = ex_test_df["img_height"]
else:
    # Same fallback as for the training images, instead of failing on a
    # missing test.csv.
    (test_df["img_shape"],
     test_df["img_width"],
     test_df["img_height"]) = _read_image_shapes(test_df["img_path"].values)
test_df = test_df.drop(columns=["predictions"])
display(test_df)

print("\n... Sample Submission DATAFRAME ..\n")
SS_CSV = os.path.join(DATA_DIR, "sample_submission.csv")
ss_df = pd.read_csv(SS_CSV)
display(ss_df)

print("\n\n... BASIC DATA SETUP FINISHING ...\n")

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">DATASET EXPLORATION</p></div>
****


# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:70%;text-align:left">TRAIN METADATA</p></div>


In [None]:
# Headline counts for the train / test tables.
N_TRAIN = train_df.shape[0]
N_TEST = ss_df.shape[0]
N_SPECIES = train_df.species.nunique()
N_INDIV = train_df.individual_id.nunique()

for _summary_line in (
    f"\n... NUMBER OF UNIQUE TRAINING IMAGES: {N_TRAIN} ...",
    f"... NUMBER OF UNIQUE SPECIES IN TRAINING DATASET: {N_SPECIES} ...",
    f"... NUMBER OF UNIQUE INDIVIDUALS IN TRAINING DATASET: {N_INDIV} ...",
    "\n... TRAIN DATAFRAME PANDAS DESCRIPTION ...\n\n",
):
    print(_summary_line)

# One row per described column.
display(train_df.describe().transpose())

In [None]:
# Shared plotly styling: colour palette and a fixed ordering of the species axis.
color_discrete_sequence = px.colors.qualitative.Light24
category_orders = dict(species=all_species)

# Histogram of images per species, one colour per species.
fig = px.histogram(
    data_frame=train_df,
    x="species",
    color="species",
    color_discrete_sequence=color_discrete_sequence,
    category_orders=category_orders,
    title="<b>UNIQUE SPECIES DISTRIBUTION</b>",
)
fig.show(renderer="png")

In [None]:
# Individual ID Plot
# One row per individual: its species plus how many training images it has.
tmp_df = train_df.groupby("individual_id")[["species", "individual_id"]].first()
tmp_df["n_count"] = train_df.groupby("individual_id").size()

# Individuals with more images than this are shown separately and left out of
# the histogram. A single mask is used for both steps so that a count exactly
# equal to the cutoff is not dropped from both views.
n_count_cutoff = 300
is_outlier = tmp_df["n_count"] > n_count_cutoff

print("\n... OUTLIER ...\n")
display(tmp_df[is_outlier])
tmp_df = tmp_df[~is_outlier] # There is one outlier
fig = px.histogram(tmp_df, "n_count", log_y=True,
                   color="species", color_discrete_sequence=color_discrete_sequence,
                   category_orders=category_orders,
                   title="<b># OF EXAMPLES PER UNIQUE INDIVIDUAL DISTRIBUTION</b>")
fig.show(renderer="png")


In [None]:
tmp_df_raw = pd.DataFrame()
tmp_df_round = pd.DataFrame()
round_to=100


def _round_img_shape(shape_str):
    """Round every dimension of a stringified shape to the nearest ``round_to``.

    A dimension equal to 3 is passed through unchanged (presumably the channel
    axis). The result is returned as a string again so it can be counted with
    ``value_counts``.
    """
    dims = ast.literal_eval(shape_str)
    return str(tuple([d if d == 3 else int(round_to*round(d/round_to)) for d in dims]))


# --- Exact shapes: one row per distinct shape string ---
raw_shape_counts = train_df.groupby("img_shape")["img_path"].count()
tmp_df_raw["raw_img_shape"] = raw_shape_counts.keys()
tmp_df_raw["raw_img_width"] = tmp_df_raw["raw_img_shape"].apply(lambda s: ast.literal_eval(s)[1])
tmp_df_raw["raw_img_height"] = tmp_df_raw["raw_img_shape"].apply(lambda s: ast.literal_eval(s)[0])
tmp_df_raw["area"] = tmp_df_raw["raw_img_width"]*tmp_df_raw["raw_img_height"]
# Number of training images per shape. This must come from train_df: grouping
# tmp_df_raw itself (one row per shape) would yield 1 for every shape.
tmp_df_raw["raw_n_count"] = raw_shape_counts.values
tmp_df_raw["raw_n_count__2"] = tmp_df_raw["raw_n_count"]**2

# --- Rounded shapes: computed once, reused for both the labels and the counts ---
rounded_shape_counts = train_df.img_shape.apply(_round_img_shape).value_counts()
tmp_df_round["round_img_shape"] = rounded_shape_counts.keys()
tmp_df_round["round_img_width"] = tmp_df_round["round_img_shape"].apply(lambda s: int(round_to*round(ast.literal_eval(s)[1]/round_to)))
tmp_df_round["round_img_height"] = tmp_df_round["round_img_shape"].apply(lambda s: int(round_to*round(ast.literal_eval(s)[0]/round_to)))
tmp_df_round["area"] = tmp_df_round["round_img_width"]*tmp_df_round["round_img_height"]
tmp_df_round["round_n_count"] = rounded_shape_counts.values
tmp_df_round["round_n_count__2"] = tmp_df_round["round_n_count"]**2

# Image Shape Plot
fig = px.scatter(tmp_df_raw, x="raw_img_width", y="raw_img_height",
                 color="area", size="raw_n_count", size_max=2,
                 title=f"<b>Image Shapes Within The Dataset (No Rounding - <i>Many Individual Sizes</i>)</b>")
# Axis limits are taken from the rounded table so this plot and the rounded
# plot share the same ranges.
fig.update_layout(yaxis_range=[0, tmp_df_round.round_img_height.max()+100],
                  xaxis_range=[0, tmp_df_round.round_img_width.max()+100])
fig.show(renderer="png")

In [None]:
# Scatter of the rounded image shapes; marker size = number of images per shape.
fig = px.scatter(
    data_frame=tmp_df_round,
    x="round_img_width",
    y="round_img_height",
    color="area",
    size="round_n_count",
    size_max=100,
    title=f"<b>Image Shapes Within The Dataset (Round To Nearest {round_to})</b>",
)
# Both axes start at 0 and extend 100 past the largest rounded dimension.
fig.update_layout(
    xaxis_range=[0, tmp_df_round["round_img_width"].max() + 100],
    yaxis_range=[0, tmp_df_round["round_img_height"].max() + 100],
)
fig.show(renderer="png")

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:70%;text-align:left">VISUALIZE THE UNIQUE SPECIES PRESENT IN THE DATASET</p></div>


In [None]:
# Show up to two randomly sampled example images for every species.
for _s in all_species:
    species_rows = train_df[train_df.species==_s]
    # Cap the sample size at the rows available: sampling 2 without
    # replacement from a single-image species would raise a ValueError.
    ex_img_paths = species_rows.sample(min(2, len(species_rows))).img_path.values
    plt.figure(figsize=(20,10))
    for i, ex_img_path in enumerate(ex_img_paths):
        plt.subplot(1,2,i+1)
        # OpenCV loads BGR; reverse the last axis to get RGB for matplotlib.
        # NOTE(review): cv2.imread returns None for a path it cannot read,
        # which would fail on the indexing below -- confirm paths are local.
        ex_img = cv2.imread(ex_img_path)[..., ::-1]
        plt.title(f"Example #{i+1} of species={_s}\n(shape={ex_img.shape})", fontweight="bold")
        plt.axis(False)
        plt.imshow(ex_img)
    plt.tight_layout()
    plt.show()
    


# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:70%;text-align:left">VISUALIZE EXAMPLES OF UNIQUE INDIVIDUALS IN THE DATASET</p></div>


In [None]:
# Plot the three images of each of the first three individuals that have
# exactly three training images.
three_image_groups = train_df[train_df.n_img_of_ind==3].groupby("individual_id")
for _, (img_id, img_id_df) in zip(range(3), three_image_groups):
    ex_img_paths = img_id_df.img_path.values

    plt.figure(figsize=(20,4))
    for panel, ex_img_path in enumerate(ex_img_paths, start=1):
        plt.subplot(1,3,panel)
        # Reverse the channel axis of the OpenCV image before showing it.
        ex_img = cv2.imread(ex_img_path)[..., ::-1]
        plt.title(f"Example #{panel} of image_id={img_id}\n(shape={ex_img.shape})", fontweight="bold")
        plt.axis(False)
        plt.imshow(ex_img)
    plt.tight_layout()
    plt.show()
    print("\n\n\n")