In [None]:
import configparser
import gc
import math
import sys
from typing import Dict, List, Tuple, Union

import numpy as np
import optuna
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tqdm import tqdm

In [None]:
# Look for a TPU first. The resolver needs no arguments when the TPU_NAME
# environment variable is set, which is always the case on Kaggle.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver signals "no TPU available" with a ValueError.
    tpu = None

if not tpu:
    # TensorFlow's default distribution strategy; works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# pandas options: treat +/-inf as missing and lift the display limits.
# set_option accepts several (pattern, value) pairs in a single call.
pd.set_option(
    "use_inf_as_na", True,
    "display.max_columns", 9999,
    "display.max_rows", 9999,
    'max_colwidth', 9999,
)

# Kaggle directory layout used throughout the notebook.
INPUT = '/kaggle/input'
OUTPUT = '/kaggle/working'
DATA = f'{INPUT}/shopee-product-matching'
RESOURCE_DIR = f'{INPUT}/shopee-product-matching-lib/kaggle-shopee-product-matching-1.0'

# The project libraries ship as Kaggle datasets, so they must be put on the
# import path before they can be imported (hence the imports in this cell).
sys.path.extend([
    f'{INPUT}/sgcharts-ml/src',
    f"{INPUT}/sentence-transformers/sentence-transformers-1.0.4",
    f'{RESOURCE_DIR}/src',
])
import mylib
import scml
from scml.nlp import strip_punctuation, to_ascii_str

scml.seed_everything()

In [None]:
# Backbone variant; it is also the name of the app.ini section read below.
MODEL = 'efficientnetb0'
CONF = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail with a clear message here instead of an
# opaque KeyError on the section lookup below.
if not CONF.read(f"{RESOURCE_DIR}/app.ini"):
    raise FileNotFoundError(f"Cannot read config file: {RESOURCE_DIR}/app.ini")
# Square input images with 3 channels, sized per the backbone's configuration.
resolution = int(CONF[MODEL]["resolution"])
INPUT_SHAPE = (resolution, resolution, 3)
print(f"INPUT_SHAPE={INPUT_SHAPE}")

In [None]:
# Load the training metadata and derive the columns used for training.
train = pd.read_csv(f"{DATA}/train.csv", engine="c", low_memory=False)
train["target"] = mylib.target_label(train)
# Re-encode label_group as consecutive integers 0..n_classes-1 so it can be
# used directly as a sparse class index. `le` keeps the original values in
# `le.classes_`. LabelEncoder is imported explicitly at the top of the
# notebook: a bare `import sklearn` does not guarantee that the
# `sklearn.preprocessing` submodule is loaded.
le = LabelEncoder()
train["label_group"] = le.fit_transform(train["label_group"])
n_classes = len(le.classes_)
print(f"n_classes={n_classes}")
train.info()

In [None]:
# Preview the first rows, including the derived "target" column and the
# re-encoded "label_group" column.
train.head()

In [None]:
def _data_gen(
    dataframe,
    directory,
    target_size,
    batch_size,
    color_mode="rgb",
    class_mode="raw",
    x_col="image",
    y_col="label_group"
):
    """Endlessly yield ``([images, labels], labels)`` batches for training.

    The labels appear twice on purpose: once as the second model input and
    once as the training target.

    Args:
        dataframe: Frame holding the image file names and labels.
        directory: Directory that contains the image files.
        target_size: ``(height, width)`` the images are resized to.
        batch_size: Number of samples per batch.
        color_mode: Forwarded to ``flow_from_dataframe``.
        class_mode: Forwarded to ``flow_from_dataframe``.
        x_col: Column with the image file names.
        y_col: Column with the labels.

    Yields:
        Tuple ``([x, y], y)`` where ``x`` is the image batch and ``y`` the
        label batch produced by the Keras dataframe iterator.
    """
    # Only rescaling by 1/255 is applied; no augmentation (shear, zoom, flip)
    # is configured.
    idg = keras.preprocessing.image.ImageDataGenerator(
        rescale=1. / 255,
        data_format="channels_last",
        dtype=np.float32,
    )
    g = idg.flow_from_dataframe(
        dataframe=dataframe,
        x_col=x_col,
        y_col=y_col,
        directory=directory,
        target_size=target_size,
        color_mode=color_mode,
        batch_size=batch_size,
        shuffle=True,
        class_mode=class_mode,
        interpolation="nearest",
    )
    while True:
        # Use the iterator protocol instead of the legacy `.next()` method,
        # which is not available on the iterators of every Keras release.
        x, y = next(g)
        yield [x, y], y

In [None]:
def _model(
    pretrained,
    n_classes: int,
    lr: float,
    input_shape: Tuple[int, int, int],
    dtype=np.float32
):
    """Assemble and compile the two-input classifier on a frozen backbone.

    Args:
        pretrained: Backbone model applied to the image input. It is frozen
            in place (``trainable`` is set to ``False``).
        n_classes: Number of classes handed to the arc-margin head.
        lr: Learning rate for the Adam optimizer.
        input_shape: Shape of one input image.
        dtype: dtype passed to the arc-margin head and the softmax layer.

    Returns:
        A compiled Keras model taking ``[image_input, label_input]`` and
        returning the softmax output of the arc-margin head.
    """
    # The backbone is used as a fixed feature extractor.
    pretrained.trainable = False

    image_input = keras.layers.Input(shape=input_shape, name="image_input")
    label_input = keras.layers.Input(shape=(), name="label_input")

    # Backbone features -> norm -> dense (same width as the backbone output)
    # -> norm. The last normalisation layer is named "embedding_output".
    features = pretrained(image_input)
    features = keras.layers.LayerNormalization()(features)
    width = pretrained.output_shape[1]
    features = keras.layers.Dense(width, activation="relu")(features)
    embedding = keras.layers.LayerNormalization(name="embedding_output")(features)

    # The margin head receives the labels alongside the embedding.
    head = mylib.ArcMarginProduct(
        n_classes=n_classes,
        s=30,
        m=0.5,
        name='head/arc_margin',
        dtype=dtype
    )
    head_out = head([embedding, label_input])
    output = keras.layers.Softmax(dtype=dtype)(head_out)

    model = keras.models.Model(
        inputs=[image_input, label_input],
        outputs=[output],
    )
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )
    return model

In [None]:
# Build the network once, outside the tuning loop, to inspect its layout.
pretrained = mylib.efficient_net(
    directory=f"{RESOURCE_DIR}/pretrained/efficientnet",
    pooling="avg",
    variant=MODEL,
)
model = _model(
    pretrained=pretrained,
    n_classes=n_classes,
    lr=1e-3,
    input_shape=INPUT_SHAPE,
)
model.summary(line_length=150)

In [None]:
def _callbacks(patience: int, directory: str):
    """Return the training callbacks: early stopping plus best-model checkpoint.

    Both callbacks watch the training ``loss``.

    Args:
        patience: Patience passed to the early-stopping callback.
        directory: Directory in which ``model.h5`` is written.

    Returns:
        List ``[EarlyStopping, ModelCheckpoint]``.
    """
    watched = "loss"
    early_stop = keras.callbacks.EarlyStopping(
        monitor=watched,
        patience=patience,
        verbose=1,
    )
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath=f"{directory}/model.h5",
        monitor=watched,
        save_best_only=True,
        verbose=1,
    )
    return [early_stop, checkpoint]

In [None]:
class MyObjective:
    """Optuna objective: train one model per trial and report its accuracy.

    Each call samples a learning rate, builds a fresh backbone and model,
    trains on the whole dataframe (no validation split is used) and returns
    the training accuracy of the final epoch. One record per trial is
    appended to ``self.history``.
    """

    def __init__(
        self,
        df,
        epochs: int,
        batch_size: int,
        patience: int,
        job_dir: str,
        lr: Tuple[float, float],
        n_classes: int,
        input_shape: Tuple[int, int, int],
    ):
        """Store the training configuration.

        Args:
            df: Training dataframe handed to the data generator.
            epochs: Maximum number of epochs per trial.
            batch_size: Number of samples per batch.
            patience: Early-stopping patience.
            job_dir: Root directory; each trial writes to ``trial_<id>`` below it.
            lr: ``(low, high)`` bounds of the log-uniform learning-rate search.
            n_classes: Number of target classes.
            input_shape: Shape of one input image.
        """
        self.df = df
        self.epochs = epochs
        self.batch_size = batch_size
        self.patience = patience
        self.job_dir = job_dir
        self.lr = lr
        self.n_classes = n_classes
        self.input_shape = input_shape
        # One dict per finished trial: trial id, sampled parameters, score.
        self.history: List[Dict[str, Union[str, int, float]]] = []

    def _steps_per_epoch(self) -> int:
        """Return the number of batches that covers every row of ``df`` once.

        Always an integer and at least 1. The previous expression
        ``len(df) / batch_size + 1`` was a float and ran past one full pass
        over the data in every epoch.
        """
        return max(1, math.ceil(len(self.df) / self.batch_size))

    def __call__(self, trial):
        """Run one trial and return its score (higher is better)."""
        hist = {
            "trial_id": trial.number,
            "learning_rate": trial.suggest_loguniform(
                "learning_rate", self.lr[0], self.lr[1]
            ),
        }
        train_gen = _data_gen(
            dataframe=self.df,
            directory=f"{DATA}/train_images",
            target_size=self.input_shape[:2],
            batch_size=self.batch_size,
        )
        # A fresh backbone per trial so trials do not share weights.
        pretrained = mylib.efficient_net(
            variant=MODEL,
            pooling="avg",
            directory=f"{RESOURCE_DIR}/pretrained/efficientnet",
        )
        model = _model(
            pretrained=pretrained,
            input_shape=self.input_shape,
            n_classes=self.n_classes,
            lr=hist["learning_rate"],
        )
        directory = f"{self.job_dir}/trial_{hist['trial_id']}"
        history = model.fit(
            train_gen,
            epochs=self.epochs,
            steps_per_epoch=self._steps_per_epoch(),
            callbacks=_callbacks(self.patience, directory=directory),
            verbose=1
        )
        # Training accuracy of the last epoch that ran.
        score = history.history["sparse_categorical_accuracy"][-1]
        print(f"score={score:.4f}, trial={hist['trial_id']}")
        # Release the model and Keras' global state before the next trial.
        del model, pretrained
        keras.backend.clear_session()
        gc.collect()
        hist["score_worst"] = score
        self.history.append(hist)
        return hist["score_worst"]

In [None]:
# Single-trial study: both learning-rate bounds are 1e-3, so the one trial
# runs with exactly that value.
obj = MyObjective(
    df=train,
    input_shape=INPUT_SHAPE,
    n_classes=n_classes,
    lr=(1e-3, 1e-3),
    epochs=40,
    patience=2,
    batch_size=32,  # B3: OOM if batch size > 128
    job_dir=OUTPUT,
)
study = optuna.create_study(direction="maximize")
study.optimize(obj, n_trials=1)

In [None]:
# Collect the per-trial records, best score first, and persist them.
history = pd.DataFrame.from_records(obj.history).sort_values(
    "score_worst", ascending=False, ignore_index=True
)
history.to_csv(f"{OUTPUT}/cv.csv", index=False)
history.head()