In [1]:
import os
from zipfile import ZipFile
from multiprocessing import Pool, cpu_count
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py as h5
import mlflow


import tensorflow as tf
import tensorflow_hub as hub
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold

# Custom library in dev
import happy as hp

/home/salemi/Documents/Kaggle/happywhale


In [2]:
# Parameter dictionnary
P = {}

P["TEST_RUN"] = True

P["TRAIN_CSV"] = "input/happy-whale-and-dolphin/train.csv"

P["TRAIN_FOLDER"] = "input/happy-whale-and-dolphin/train_images"

P["BATCH_SIZE"] = 32

P["EPOCHS"] = 10

P["LEARNING_RATE"] = 1e-3

P["LEARNING_RATE_FINETUNING"] = 5e-5

In [3]:
data_df = pd.read_csv(P["TRAIN_CSV"])

if P["TEST_RUN"]:
    data_df = data_df.iloc[:1500]

species, counts = np.unique(data_df["species"], return_counts=True)

P["CUTOFF"] = int(np.floor(np.max(counts) * 0.07))

In [4]:
hp.print_class_statistics(data_df, "species")

CLASS NAME                    COUNTS    PERCENTAGE
--------------------------------------------------
bottlenose_dolphin            284       18.93     
beluga                        227       15.13     
humpback_whale                207       13.80     
blue_whale                    135       9.00      
false_killer_whale            103       6.87      
dusky_dolphin                 86        5.73      
minke_whale                   51        3.40      
melon_headed_whale            50        3.33      
spinner_dolphin               47        3.13      
killer_whale                  44        2.93      
gray_whale                    34        2.27      
fin_whale                     33        2.20      
southern_right_whale          30        2.00      
bottlenose_dolpin             29        1.93      
kiler_whale                   28        1.87      
spotted_dolphin               21        1.40      
short_finned_pilot_whale      16        1.07      
sei_whale                     1

The classes in this dataset, the column "species", are too imbalanced. Let's group some less represented classes to get something significant.

In [5]:
# Classes are too much imbalanced, let's group the one with few example
unique_species, count_species = np.unique(data_df["species"], return_counts=True)

unique_species = unique_species[np.argsort(count_species)]
count_species = count_species[np.argsort(count_species)]

map_species = {}
idx = 0 
acc = 0
name = []
for i, j in zip(count_species, unique_species):
    acc += i
    name.append(j)    
    if acc >= P["CUTOFF"]:
        for n in name:
            map_species[n] = idx
        idx += 1
        acc = 0
        name = []
            
        
    
data_df["class"] = data_df.apply(lambda x: map_species[x["species"]], axis=1)
        
data_df.head()

Unnamed: 0,image,species,individual_id,class
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9,12
1,000562241d384d.jpg,humpback_whale,1a71fbb72250,17
2,0007c33415ce37.jpg,false_killer_whale,60008f293a2b,15
3,0007d9bca26a99.jpg,bottlenose_dolphin,4b00fe572063,19
4,00087baf5cef7a.jpg,humpback_whale,8e5253662392,17


In [6]:
hp.print_class_statistics(data_df, "class")

CLASS NAME                    COUNTS    PERCENTAGE
--------------------------------------------------
19                            284       18.93     
18                            227       15.13     
17                            207       13.80     
16                            135       9.00      
15                            103       6.87      
14                            86        5.73      
13                            51        3.40      
12                            50        3.33      
11                            47        3.13      
10                            44        2.93      
9                             34        2.27      
8                             33        2.20      
7                             30        2.00      
6                             29        1.93      
3                             29        1.93      
5                             28        1.87      
2                             22        1.47      
4                             2

In [7]:
# 80 / 20 split
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

train_index, val_index = next(skf.split(np.zeros(len(data_df)), data_df["class"].values))

train_df = data_df.loc[train_index].copy()
val_df = data_df.loc[val_index].copy()




print("Stats for train set:")
hp.print_class_statistics(train_df, "class")

print("\nStats for val set:")
hp.print_class_statistics(val_df, "class")

Stats for train set:
CLASS NAME                    COUNTS    PERCENTAGE
--------------------------------------------------
19                            227       18.92     
18                            181       15.08     
17                            165       13.75     
16                            108       9.00      
15                            83        6.92      
14                            68        5.67      
13                            41        3.42      
12                            40        3.33      
11                            37        3.08      
10                            36        3.00      
8                             27        2.25      
9                             27        2.25      
7                             24        2.00      
6                             24        2.00      
5                             23        1.92      
3                             23        1.92      
2                             17        1.42      
1         

The splits looks fairly good with a conserved class prevalence

In [8]:
class ShardedGenerator:
    def __init__(self, df: pd.DataFrame, n_shards: int = None):
        
        self._df = df.copy()
                
        self._n_shards = n_shards
        if self._n_shards is None:
            self._n_shards = cpu_count()
            
        self._df["filepath"] = self._df.apply(lambda x: os.path.join(P["TRAIN_FOLDER"], x["image"]), axis=1)
        
        
    def __call__(self, n):
        with h5.File("preprocessed_224-224/data.h5", "r") as f:
            for count, (i, row) in enumerate(self._df.iterrows()):
                if count % self._n_shards != n:
                    continue
                    
                img = tf.convert_to_tensor(f["img"][i])
                
                label = tf.convert_to_tensor(row["class"], dtype=tf.int64)

                yield img, label
                
                
gen = ShardedGenerator(train_df, 1)

for i, j in gen(0):
    print(i, j)
    break

tf.Tensor(
[[[ 76 122 171]
  [ 76 124 170]
  [ 84 127 174]
  ...
  [ 62 100 147]
  [ 63  99 151]
  [ 60 101 150]]

 [[ 89 136 178]
  [ 86 134 174]
  [ 94 139 179]
  ...
  [ 49  86 131]
  [ 53  87 136]
  [ 56  90 138]]

 [[ 87 132 174]
  [ 92 137 178]
  [ 90 136 177]
  ...
  [ 58  92 137]
  [ 59  93 138]
  [ 60  92 139]]

 ...

 [[ 47  44  53]
  [ 47  46  54]
  [ 44  44  52]
  ...
  [ 48  47  55]
  [ 47  47  55]
  [ 49  46  55]]

 [[ 43  43  51]
  [ 42  42  50]
  [ 47  46  54]
  ...
  [ 49  48  56]
  [ 48  48  56]
  [ 48  47  55]]

 [[ 40  43  52]
  [ 43  43  53]
  [ 43  41  52]
  ...
  [ 51  50  59]
  [ 52  51  59]
  [ 49  49  57]]], shape=(224, 224, 3), dtype=uint8) tf.Tensor(12, shape=(), dtype=int64)


In [9]:
def get_dataset(data:pd.DataFrame, n_shards=4):
    gen = ShardedGenerator(data, n_shards)

    out_sign = (tf.TensorSpec(shape=(224, 224, 3), dtype=tf.uint8), tf.TensorSpec(shape=(), dtype=tf.int64))

    ds = tf.data.Dataset.from_tensor_slices(np.arange(n_shards))

    ds = ds.interleave(lambda x: tf.data.Dataset.from_generator(gen, output_signature=out_sign, args=(x,)),
                       cycle_length=n_shards,
                       block_length=1,
                       num_parallel_calls=n_shards,
                       deterministic=True)
    
    return ds


P["STEPS_PER_EPOCH"] = int(np.ceil(len(train_df) / P["BATCH_SIZE"]))


ds_train = get_dataset(train_df, n_shards=16)
ds_train = ds_train.batch(P["BATCH_SIZE"]).cache()
ds_train = ds_train.map(lambda x, y: (tf.image.convert_image_dtype(x, tf.float32), y), num_parallel_calls=tf.data.AUTOTUNE)
ds_train = ds_train.prefetch(tf.data.AUTOTUNE).repeat(P["EPOCHS"])


ds_val = get_dataset(val_df, n_shards=16)
ds_val = ds_val.batch(P["BATCH_SIZE"]).cache()
ds_val = ds_val.map(lambda x, y: (tf.image.convert_image_dtype(x, tf.float32), y), num_parallel_calls=tf.data.AUTOTUNE)
ds_val = ds_val.prefetch(tf.data.AUTOTUNE)

In [10]:
num_classes = len(train_df["class"].unique())
print(num_classes)

class_weight = compute_class_weight("balanced", classes=np.arange(num_classes), y=train_df["class"])
class_weight = dict({i:class_weight[i] for i in range(len(class_weight))})
print(class_weight)

20
{0: 3.75, 1: 3.5294117647058822, 2: 3.5294117647058822, 3: 2.608695652173913, 4: 3.75, 5: 2.608695652173913, 6: 2.5, 7: 2.5, 8: 2.2222222222222223, 9: 2.2222222222222223, 10: 1.6666666666666667, 11: 1.6216216216216217, 12: 1.5, 13: 1.4634146341463414, 14: 0.8823529411764706, 15: 0.7228915662650602, 16: 0.5555555555555556, 17: 0.36363636363636365, 18: 0.3314917127071823, 19: 0.2643171806167401}


In [11]:
model = tf.keras.Sequential([
    tf.keras.Input(shape=(224, 224, 3)),
    hub.KerasLayer("https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/5",trainable=False),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 2048)              23564800  
                                                                 
 dense (Dense)               (None, 20)                40980     
                                                                 
Total params: 23,605,780
Trainable params: 40,980
Non-trainable params: 23,564,800
_________________________________________________________________


In [13]:
exp_name = "classification"
experiment = mlflow.get_experiment_by_name(exp_name)
if experiment is not None:
    mlflow.delete_experiment(experiment.experiment_id)
    hp.config.clean_mlflow_trash()

exp_id = mlflow.create_experiment(name=exp_name)    
        

with mlflow.start_run(experiment_id=exp_id) as run:
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=P["LEARNING_RATE"]),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['accuracy'])
    
    history = model.fit(ds_train,
                        validation_data=ds_val,
                        epochs=P["EPOCHS"], 
                        steps_per_epoch=P["STEPS_PER_EPOCH"],
                        class_weight=class_weight)
    
    mlflow.log_dict(history.history, "history.json")
    
    # Allow finetuning
    for l in model.layers:
        l.trainable = True
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=P["LEARNING_RATE_FINETUNING"]),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['accuracy'])
    
    history = model.fit(ds_train,
                        validation_data=ds_val,
                        epochs=P["EPOCHS"], 
                        steps_per_epoch=P["STEPS_PER_EPOCH"],
                        class_weight=class_weight)
                                                                     
    mlflow.log_dict(history.history, "history-finetune.json")
    
    mlflow.log_dict(P, "params.yaml")
                                                                     

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [15]:
runs = mlflow.search_runs(exp_id)

runs.head()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.source.name
0,18c0acfb9ab944dbaa1adf9e17e9cc51,1,FINISHED,file:///home/salemi/Documents/Kaggle/happywhal...,2022-02-06 00:03:52.611000+00:00,2022-02-06 00:07:56.358000+00:00,LOCAL,salemi,/home/salemi/miniconda3/envs/happywhale/lib/py...


In [17]:
runs.at[0, "artifact_uri"]

'file:///home/salemi/Documents/Kaggle/happywhale/mlflow/1/18c0acfb9ab944dbaa1adf9e17e9cc51/artifacts'