In [1]:
import os
from zipfile import ZipFile
from multiprocessing import Pool, cpu_count
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py as h5
import mlflow


import tensorflow as tf
import tensorflow_hub as hub
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold

# Custom library in dev
import happy as hp

/home/salemi/Documents/Kaggle/happywhale


In [2]:
# List the logical devices TensorFlow exposes, to confirm which devices (CPU/GPU) are visible.
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [3]:
# Sanity check: `hp.utils.Config`, `hp.config.Config` and `hp.Config` should
# expose the same MLFLOW_URI, both before and after writing through one of
# them.
original_mlflow_uri = hp.Config.MLFLOW_URI

print(hp.utils.Config.MLFLOW_URI)
print(hp.config.Config.MLFLOW_URI)
print(hp.Config.MLFLOW_URI)

try:
    # Placeholder value, only used to observe whether the write propagates.
    hp.Config.MLFLOW_URI = "TROLL"

    print(hp.utils.Config.MLFLOW_URI)
    print(hp.config.Config.MLFLOW_URI)
    print(hp.Config.MLFLOW_URI)
finally:
    # Restore the real setting so the placeholder does not leak into the rest
    # of the kernel session.
    # NOTE(review): presumably other `hp` helpers read this value - confirm.
    hp.Config.MLFLOW_URI = original_mlflow_uri


/home/salemi/Documents/Kaggle/happywhale/mlruns
/home/salemi/Documents/Kaggle/happywhale/mlruns
/home/salemi/Documents/Kaggle/happywhale/mlruns
TROLL
TROLL
TROLL


In [None]:
# Parameter dictionary: every tunable setting of the notebook lives here.
P = {
    # When True, only the first rows of the training CSV are used.
    "TEST_RUN": True,
    # Input locations.
    "TRAIN_CSV": "input/happy-whale-and-dolphin/train.csv",
    "TRAIN_FOLDER": "input/happy-whale-and-dolphin/train_images",
    # Optimisation settings.
    "BATCH_SIZE": 32,
    "EPOCHS": 10,
    "LEARNING_RATE": 1e-3,
    "LEARNING_RATE_FINETUNING": 5e-5,
}

In [None]:
# Load the training metadata.
data_df = pd.read_csv(P["TRAIN_CSV"])

# A test run only keeps the first 1500 rows to make iteration fast.
if P["TEST_RUN"]:
    data_df = data_df.iloc[:1500]

# Number of samples per species.
species, counts = np.unique(data_df["species"], return_counts=True)

# Minimum class size used when grouping rare species: 7% of the largest species.
largest_species_count = np.max(counts)
P["CUTOFF"] = int(np.floor(largest_species_count * 0.07))

In [None]:
# Print per-class statistics for the raw "species" column (project helper from `happy`).
hp.print_class_statistics(data_df, "species")

The classes in this dataset (the "species" column) are too imbalanced. Let's group the least represented classes together so that every resulting class is large enough to be meaningful.

In [None]:
# Classes are too imbalanced: merge the rarest species into shared classes so
# that every class holds at least P["CUTOFF"] samples.
unique_species, count_species = np.unique(data_df["species"], return_counts=True)

# Sort once, rarest first, and apply the same permutation to both arrays.
order = np.argsort(count_species)
unique_species = unique_species[order]
count_species = count_species[order]

map_species = {}   # species name -> grouped class id
idx = 0            # id of the class currently being filled
acc = 0            # samples accumulated in the current class
name = []          # species waiting to be assigned to the current class
for n_samples, species_name in zip(count_species, unique_species):
    acc += n_samples
    name.append(species_name)
    if acc >= P["CUTOFF"]:
        # The pending species reach the cutoff together: close the class.
        for pending in name:
            map_species[pending] = idx
        idx += 1
        acc = 0
        name = []

# Species still pending here never reached the cutoff. Fold them into the last
# closed class (or class 0 if none was closed) so that every species is mapped
# and the lookup below cannot fail.
if name:
    fallback_idx = max(idx - 1, 0)
    for pending in name:
        map_species[pending] = fallback_idx

# Vectorised dictionary lookup instead of a row-wise apply.
data_df["class"] = data_df["species"].map(map_species)

data_df.head()

In [None]:
# Read the whole "img" dataset into memory. The file is opened explicitly
# read-only, like the generator further down, so it can never be modified or
# created by accident regardless of the h5py default mode.
with h5.File("preprocessed_224-224/data.h5", "r") as f:
    np_data = f["img"][:]

# Build the tensor on the CPU device so the full array is not placed on the GPU.
with tf.device("/device:CPU:0"):
    np_data = tf.convert_to_tensor(np_data, dtype=tf.uint8)

In [None]:
# Wrap the in-memory images in a dataset and print the first two elements.
ds = tf.data.Dataset.from_tensor_slices(np_data)

preview = ds.take(2)
for i in preview:
    print(i)

In [None]:
# Print per-class statistics for the grouped "class" column (project helper from `happy`).
hp.print_class_statistics(data_df, "class")

In [None]:
# 80 / 20 split
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

train_index, val_index = next(skf.split(np.zeros(len(data_df)), data_df["class"].values))

train_df = data_df.loc[train_index].copy()
val_df = data_df.loc[val_index].copy()




print("Stats for train set:")
hp.print_class_statistics(train_df, "class")

print("\nStats for val set:")
hp.print_class_statistics(val_df, "class")

The split looks fairly good: the class prevalence is preserved in both the training and the validation set.

In [None]:
class ShardedGenerator:
    """Callable that yields the samples of one shard of a DataFrame.

    The rows of the frame are dealt round-robin to ``n_shards`` shards: the row
    at position ``p`` belongs to shard ``p % n_shards``. Calling the instance
    with a shard number returns a generator of ``(image, label)`` tensor pairs
    for that shard. Images are read from the ``"img"`` dataset of an HDF5 file,
    addressed by the row's index label; labels come from the ``"class"`` column.

    Args:
        df: Frame with at least the ``"image"`` and ``"class"`` columns. It is
            copied, so the caller's frame is never modified.
        n_shards: Number of shards. Defaults to the number of CPUs.
        h5_path: HDF5 file holding the ``"img"`` dataset.

    Raises:
        ValueError: If ``n_shards`` is smaller than 1.
    """

    def __init__(self, df: pd.DataFrame, n_shards: int = None,
                 h5_path: str = "preprocessed_224-224/data.h5"):

        self._df = df.copy()

        self._n_shards = n_shards
        if self._n_shards is None:
            self._n_shards = cpu_count()
        if self._n_shards < 1:
            raise ValueError(f"n_shards must be >= 1, got {self._n_shards}")

        self._h5_path = h5_path

        # Path of every source image. Not read by __call__, kept on the frame
        # for inspection.
        self._df["filepath"] = [os.path.join(P["TRAIN_FOLDER"], image)
                                for image in self._df["image"]]

    def __call__(self, n):
        """Yield ``(image, label)`` tensors for the rows belonging to shard ``n``.

        A shard number outside ``[0, n_shards)`` yields nothing.
        """
        # NOTE(review): `n` presumably arrives as a NumPy integer when called
        # through tf.data's from_generator - normalise it to a plain int.
        shard = int(n)
        if not 0 <= shard < self._n_shards:
            return

        # Every n_shards-th row starting at position `shard`; avoids scanning
        # the whole frame once per shard.
        rows = self._df.iloc[shard::self._n_shards]

        with h5.File(self._h5_path, "r") as f:
            images = f["img"]
            for i, class_id in zip(rows.index, rows["class"]):
                img = tf.convert_to_tensor(images[i])

                label = tf.convert_to_tensor(class_id, dtype=tf.int64)

                yield img, label
                
                
# Smoke test: with a single shard, pull the first (image, label) pair and print it.
gen = ShardedGenerator(train_df, 1)

first_sample = next(gen(0), None)
if first_sample is not None:
    i, j = first_sample
    print(i, j)

In [None]:
def get_dataset(data:pd.DataFrame, n_shards=4):
    """Build a tf.data pipeline that reads `data` through `n_shards` generators.

    One `ShardedGenerator` stream is created per shard number and the streams
    are interleaved one element at a time, in deterministic order.

    Args:
        data: Frame describing the samples, passed to `ShardedGenerator`.
        n_shards: Number of generator streams read in parallel.

    Returns:
        A dataset of `(image, label)` pairs with a (224, 224, 3) uint8 image
        and a scalar int64 label.
    """
    shard_reader = ShardedGenerator(data, n_shards)

    sample_spec = (
        tf.TensorSpec(shape=(224, 224, 3), dtype=tf.uint8),
        tf.TensorSpec(shape=(), dtype=tf.int64),
    )

    def read_shard(shard_id):
        # One generator-backed dataset per shard number.
        return tf.data.Dataset.from_generator(shard_reader,
                                              output_signature=sample_spec,
                                              args=(shard_id,))

    shard_ids = tf.data.Dataset.from_tensor_slices(np.arange(n_shards))

    return shard_ids.interleave(read_shard,
                                cycle_length=n_shards,
                                block_length=1,
                                num_parallel_calls=n_shards,
                                deterministic=True)


def _batched_float_pipeline(frame):
    """Shared train/val chain: shard -> batch -> cache -> float32 -> prefetch."""
    batches = get_dataset(frame, n_shards=16).batch(P["BATCH_SIZE"]).cache()
    batches = batches.map(lambda x, y: (tf.image.convert_image_dtype(x, tf.float32), y),
                          num_parallel_calls=tf.data.AUTOTUNE)
    return batches.prefetch(tf.data.AUTOTUNE)


# Number of batches needed to go through the training set once.
P["STEPS_PER_EPOCH"] = int(np.ceil(len(train_df) / P["BATCH_SIZE"]))

# The training stream is repeated once per epoch; validation is a single pass.
ds_train = _batched_float_pipeline(train_df).repeat(P["EPOCHS"])

ds_val = _batched_float_pipeline(val_df)

In [None]:
# Number of distinct grouped classes present in the training set.
num_classes = train_df["class"].unique().size
print(num_classes)

# "balanced" weights for class ids 0..num_classes-1, then re-keyed by class id
# as a plain dict.
class_weight = compute_class_weight("balanced", classes=np.arange(num_classes), y=train_df["class"])
class_weight = dict(enumerate(class_weight))
print(class_weight)

In [None]:
# Transfer-learning classifier: frozen TF-Hub ResNet-v2-50 feature extractor
# followed by a softmax layer with one unit per grouped class.
backbone = hub.KerasLayer("https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/5",
                          trainable=False)
classifier_head = tf.keras.layers.Dense(num_classes, activation='softmax')

model = tf.keras.Sequential([
    tf.keras.Input(shape=(224, 224, 3)),
    backbone,
    classifier_head,
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Save the model to the "saved_model" path.
# NOTE(review): this runs before any training cell, so the saved head is untrained - confirm this is intended.
model.save("saved_model")

In [None]:
# Run the model once on a seeded random image to check the output.
np.random.seed(0)

dummy_shape = (1, 224, 224, 3)
x = np.random.rand(*dummy_shape).astype(np.float32)

y = model.predict(x)
y

In [None]:
# Scratch check of the "value or default" idiom: a truthy value is kept,
# a falsy one is replaced by the default.
x = "prepend"
y = x or "DEFAULT"

print(y)

In [None]:
class MLflowCallback(tf.keras.callbacks.Callback):
    """Keras callback that logs every end-of-epoch metric to MLflow.

    Args:
        prepend: Optional prefix added to each metric name, so that several
            training stages can be logged without sharing metric names.
            ``None`` or an empty string means no prefix.
    """

    def __init__(self, prepend: str = None):
        # Initialise the Keras base class so the attributes it sets up exist
        # on this instance.
        super().__init__()
        self._prepend = prepend if prepend else ""

    def on_epoch_end(self, epoch, logs=None):
        """Log each entry of `logs` as an MLflow metric, using `epoch` as step."""
        if not logs:
            # Nothing to report for this epoch.
            return
        for k, v in logs.items():
            mlflow.log_metric(key=self._prepend + k, value=v, step=epoch)

In [None]:
# Start from a fresh "classification" experiment: an existing one is deleted
# first, then the project helper is called to clean MLflow's trash.
exp_name = "classification"
experiment = mlflow.get_experiment_by_name(exp_name)
if experiment is not None:
    mlflow.delete_experiment(experiment.experiment_id)
    hp.config.clean_mlflow_trash()

exp_id = mlflow.create_experiment(name=exp_name)


def _fit_stage(learning_rate, metric_prefix=None):
    """Compile the model with Adam at `learning_rate` and fit it for one stage."""
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['accuracy'])

    return model.fit(ds_train,
                     validation_data=ds_val,
                     epochs=P["EPOCHS"],
                     steps_per_epoch=P["STEPS_PER_EPOCH"],
                     class_weight=class_weight,
                     callbacks=[MLflowCallback(prepend=metric_prefix)])


with mlflow.start_run(experiment_id=exp_id) as run:

    # Stage 1: train with the layers' current trainable flags.
    history = _fit_stage(P["LEARNING_RATE"])
    mlflow.log_dict(history.history, "history.json")

    # Stage 2: allow finetuning of every layer, with the finetuning learning rate.
    for layer in model.layers:
        layer.trainable = True

    history = _fit_stage(P["LEARNING_RATE_FINETUNING"], metric_prefix="finetune-")
    mlflow.log_dict(history.history, "history-finetune.json")

    # Keep the parameters next to the results.
    mlflow.log_dict(P, "params.yaml")
                                                                     

In [None]:
# Fetch the runs of the experiment as a list of run objects.
runs = mlflow.search_runs(exp_id, output_format="list")




In [None]:
client = mlflow.tracking.MlflowClient()

# Every logged value of "accuracy" for the first run returned by the search.
metrics = client.get_metric_history(runs[0].info.run_id, "accuracy")

# Build the frame in one go from the list of records: growing a DataFrame with
# `DataFrame.append` in a loop is quadratic, and the method no longer exists
# in pandas 2.x.
df = pd.DataFrame([dict(metric) for metric in metrics])

plt.plot(df.step, df.value, "o-")
plt.xlabel("step")
plt.ylabel("accuracy");

In [None]:
# Inspect the two weight arrays returned by the last layer and print their shapes.
last_layer = model.layers[-1]
i, j = last_layer.get_weights()

print(i.shape, j.shape)