In [1]:
import os
from zipfile import ZipFile
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py as h5


import tensorflow as tf
import tensorflow_hub as hub
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold

# Custom library in dev
import happy as hp

In [2]:
P = {}

P["TRAIN_CSV"] = "input/happy-whale-and-dolphin/train.csv"

P["TRAIN_FOLDER"] = "input/happy-whale-and-dolphin/train_images"

P["BATCH_SIZE"] = 32

P["EPOCHS"] = 3

P["LEARNING_RATE"] = 1e-3

P["LEARNING_RATE_FINETUNING"] = 5e-5

In [3]:
data_df = pd.read_csv(P["TRAIN_CSV"])

In [4]:
hp.print_class_statistics(data_df, "species")

CLASS NAME                    COUNTS    PERCENTAGE
--------------------------------------------------
bottlenose_dolphin            9664      18.94     
beluga                        7443      14.58     
humpback_whale                7392      14.48     
blue_whale                    4830      9.46      
false_killer_whale            3326      6.52      
dusky_dolphin                 3139      6.15      
spinner_dolphin               1700      3.33      
melon_headed_whale            1689      3.31      
minke_whale                   1608      3.15      
killer_whale                  1493      2.93      
fin_whale                     1324      2.59      
gray_whale                    1123      2.20      
bottlenose_dolpin             1117      2.19      
kiler_whale                   962       1.89      
southern_right_whale          866       1.70      
spotted_dolphin               490       0.96      
sei_whale                     428       0.84      
short_finned_pilot_whale      3

The classes in this dataset, the column "species", are too imbalanced. Let's group some less represented classes to get something significant.

In [5]:
# Classes are too much imbalanced, let's group the one with few example

cutoff = 800 # Minimum number of sample for a species to be considered


unique_species, count_species = np.unique(data_df["species"], return_counts=True)

unique_species = unique_species[np.argsort(count_species)]
count_species = count_species[np.argsort(count_species)]

map_species = {}
idx = 0 
acc = 0
name = []
for i, j in zip(count_species, unique_species):
    acc += i
    name.append(j)    
    if acc >= cutoff:
        for n in name:
            map_species[n] = idx
        idx += 1
        acc = 0
        name = []
            
        
    
data_df["class"] = data_df.apply(lambda x: map_species[x["species"]], axis=1)
        
data_df.head()

Unnamed: 0,image,species,individual_id,class
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9,10
1,000562241d384d.jpg,humpback_whale,1a71fbb72250,15
2,0007c33415ce37.jpg,false_killer_whale,60008f293a2b,13
3,0007d9bca26a99.jpg,bottlenose_dolphin,4b00fe572063,17
4,00087baf5cef7a.jpg,humpback_whale,8e5253662392,15


In [6]:
hp.print_class_statistics(data_df, "class")

CLASS NAME                    COUNTS    PERCENTAGE
--------------------------------------------------
17                            9664      18.94     
16                            7443      14.58     
15                            7392      14.48     
14                            4830      9.46      
13                            3326      6.52      
12                            3139      6.15      
11                            1700      3.33      
10                            1689      3.31      
9                             1608      3.15      
8                             1493      2.93      
3                             1356      2.66      
7                             1324      2.59      
2                             1142      2.24      
6                             1123      2.20      
5                             1117      2.19      
4                             962       1.89      
0                             884       1.73      
1                             8

In [7]:
# 80 / 20 split
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

train_index, val_index = next(skf.split(np.zeros(len(data_df)), data_df["class"].values))

train_df = data_df.loc[train_index].copy()
val_df = data_df.loc[val_index].copy()




print("Stats for train set:")
hp.print_class_statistics(train_df, "class")

print("\nStats for val set:")
hp.print_class_statistics(val_df, "class")

Stats for train set:
CLASS NAME                    COUNTS    PERCENTAGE
--------------------------------------------------
17                            7731      18.94     
16                            5955      14.59     
15                            5913      14.48     
14                            3864      9.46      
13                            2661      6.52      
12                            2511      6.15      
11                            1360      3.33      
10                            1351      3.31      
9                             1286      3.15      
8                             1194      2.92      
3                             1085      2.66      
7                             1059      2.59      
2                             913       2.24      
6                             899       2.20      
5                             894       2.19      
4                             770       1.89      
0                             707       1.73      
1         

The splits looks fairly good with a conserved class prevalence

In [8]:
class ShardedGenerator:
    def __init__(self, df: pd.DataFrame, n_shards: int = None):
        
        self._df = df.copy()
                
        self._n_shards = n_shards
        if self._n_shards is None:
            self._n_shards = cpu_count()
            
        self._df["filepath"] = self._df.apply(lambda x: os.path.join(P["TRAIN_FOLDER"], x["image"]), axis=1)
        
        
    def __call__(self, n):
        with h5.File("preprocessed_224-224/data.h5", "r") as f:
            for count, (i, row) in enumerate(self._df.iterrows()):
                if count % self._n_shards != n:
                    continue
                    
                img = tf.convert_to_tensor(f["img"][i])
                
                label = tf.convert_to_tensor(row["class"], dtype=tf.int64)

                yield img, label
                
                
gen = ShardedGenerator(train_df, 1)

for i, j in gen(0):
    print(i, j)
    break

tf.Tensor(
[[[ 76 122 171]
  [ 76 124 170]
  [ 84 127 174]
  ...
  [ 62 100 147]
  [ 63  99 151]
  [ 60 101 150]]

 [[ 89 136 178]
  [ 86 134 174]
  [ 94 139 179]
  ...
  [ 49  86 131]
  [ 53  87 136]
  [ 56  90 138]]

 [[ 87 132 174]
  [ 92 137 178]
  [ 90 136 177]
  ...
  [ 58  92 137]
  [ 59  93 138]
  [ 60  92 139]]

 ...

 [[ 47  44  53]
  [ 47  46  54]
  [ 44  44  52]
  ...
  [ 48  47  55]
  [ 47  47  55]
  [ 49  46  55]]

 [[ 43  43  51]
  [ 42  42  50]
  [ 47  46  54]
  ...
  [ 49  48  56]
  [ 48  48  56]
  [ 48  47  55]]

 [[ 40  43  52]
  [ 43  43  53]
  [ 43  41  52]
  ...
  [ 51  50  59]
  [ 52  51  59]
  [ 49  49  57]]], shape=(224, 224, 3), dtype=uint8) tf.Tensor(10, shape=(), dtype=int64)


In [9]:
def get_dataset(data:pd.DataFrame, n_shards=4):
    gen = ShardedGenerator(data, n_shards)

    out_sign = (tf.TensorSpec(shape=(224, 224, 3), dtype=tf.uint8), tf.TensorSpec(shape=(), dtype=tf.int64))

    ds = tf.data.Dataset.from_tensor_slices(np.arange(n_shards))

    ds = ds.interleave(lambda x: tf.data.Dataset.from_generator(gen, output_signature=out_sign, args=(x,)),
                       cycle_length=n_shards,
                       block_length=1,
                       num_parallel_calls=n_shards,
                       deterministic=True)
    
    return ds


STEPS_PER_EPOCH = np.ceil(len(train_df) / P["BATCH_SIZE"])


ds_train = get_dataset(train_df, n_shards=16)
ds_train = ds_train.batch(P["BATCH_SIZE"]).cache()
ds_train = ds_train.map(lambda x, y: (tf.image.convert_image_dtype(x, tf.float32), y), num_parallel_calls=tf.data.AUTOTUNE)
ds_train = ds_train.prefetch(tf.data.AUTOTUNE).repeat(P["EPOCHS"])


ds_val = get_dataset(val_df, n_shards=16)
ds_val = ds_val.batch(P["BATCH_SIZE"]).cache()
ds_val = ds_val.map(lambda x, y: (tf.image.convert_image_dtype(x, tf.float32), y), num_parallel_calls=tf.data.AUTOTUNE)
ds_val = ds_val.prefetch(tf.data.AUTOTUNE)

In [10]:
num_classes = len(train_df["class"].unique())
print(num_classes)

class_weight = compute_class_weight("balanced", classes=np.arange(num_classes), y=train_df["class"])
class_weight = dict({i:class_weight[i] for i in range(len(class_weight))})
print(class_weight)

18
{0: 3.208077950652208, 1: 3.3701502393924385, 2: 2.484239990264087, 3: 2.0904249871991807, 4: 2.9455988455988455, 5: 2.537037037037037, 6: 2.5229267086886664, 7: 2.1417479802748924, 8: 1.8995905453191886, 9: 1.763694487644721, 10: 1.6788387202894974, 11: 1.6677287581699347, 12: 0.9032700561971768, 13: 0.8523529166144724, 14: 0.5869852772026685, 15: 0.38358043482345866, 16: 0.3808750816307491, 17: 0.29337874933528796}


In [11]:
model = tf.keras.Sequential([
    tf.keras.Input(shape=(224, 224, 3)),
    hub.KerasLayer("https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/5",trainable=False),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])


In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 2048)              23564800  
                                                                 
 dense (Dense)               (None, 18)                36882     
                                                                 
Total params: 23,601,682
Trainable params: 36,882
Non-trainable params: 23,564,800
_________________________________________________________________


In [13]:
history = model.fit(ds_train, validation_data=ds_val, epochs=P["EPOCHS"], 
                    steps_per_epoch=STEPS_PER_EPOCH, class_weight=class_weight)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [14]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=P["LEARNING_RATE_FINETUNING"]),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])


In [15]:
for l in model.layers:
    l.trainable = True

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 2048)              23564800  
                                                                 
 dense (Dense)               (None, 18)                36882     
                                                                 
Total params: 23,601,682
Trainable params: 23,556,242
Non-trainable params: 45,440
_________________________________________________________________


In [17]:
history = model.fit(ds_train, validation_data=ds_val, epochs=P["EPOCHS"],
                    steps_per_epoch=STEPS_PER_EPOCH, class_weight=class_weight)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [19]:
history.history

{'loss': [0.3970639705657959, 0.38937488198280334, 0.3863593637943268],
 'accuracy': [0.8851222395896912, 0.8892862200737, 0.8899720907211304],
 'val_loss': [0.46997886896133423, 0.46740734577178955, 0.46557819843292236],
 'val_accuracy': [0.8561771512031555, 0.8566669821739197, 0.8568629622459412]}

In [25]:
for i, j in ds_val.unbatch().take(10).batch(1):
    y = model.predict(i)
    print(np.argmax(y), j)
    


15 tf.Tensor([15], shape=(1,), dtype=int64)
4 tf.Tensor([7], shape=(1,), dtype=int64)
15 tf.Tensor([15], shape=(1,), dtype=int64)
9 tf.Tensor([9], shape=(1,), dtype=int64)
16 tf.Tensor([16], shape=(1,), dtype=int64)
8 tf.Tensor([8], shape=(1,), dtype=int64)
9 tf.Tensor([9], shape=(1,), dtype=int64)
13 tf.Tensor([13], shape=(1,), dtype=int64)
8 tf.Tensor([2], shape=(1,), dtype=int64)
2 tf.Tensor([2], shape=(1,), dtype=int64)
