In [None]:
!pip install -q efficientnet
import pandas as pd
import numpy as np
from scipy import stats
import random
import os
import tensorflow as tf
import math
import cv2
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
import re
import tensorflow.keras.applications.efficientnet as eff
import tensorflow.keras.applications as tfka
import efficientnet.tfkeras as efn
from sklearn import metrics
import gc
from tensorflow.keras import backend as K
import tensorflow_addons as tfa
from tqdm.notebook import tqdm
from kaggle_datasets import KaggleDatasets
import tensorflow_addons as tfa

In [None]:
# Look for a TPU; TPUClusterResolver raises ValueError when none can be resolved.
try:
    # No arguments are needed when the TPU_NAME environment variable is set
    # (always the case on Kaggle).
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default TensorFlow distribution strategy: works on CPU and a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Configuration
# Let tf.data pick parallelism / prefetch sizes
AUTO = tf.data.experimental.AUTOTUNE

# Training length, global batch size (32 examples per replica) and input resolution
EPOCHS = 10
BATCH_SIZE = 32 * strategy.num_replicas_in_sync
IMAGE_SIZE = [256, 256]

# Seed (both spellings are read later in the notebook, so both are kept in sync)
SEED = 9527
seed = SEED

# Learning rate handed to the Adam optimizer
LR = 0.0005

# Verbosity used for Keras fit / checkpoint logging
VERBOSE = 2

# Number of output units of the classification head
label_dim = 1

# Dataset locations
path = "../input/asthma-trigger-dataset/food_ingredients_dataset.csv"
img_path = '../input/food-ingredients-and-recipe-dataset-with-images/Food Images/Food Images/'
GCS_PATH = KaggleDatasets().get_gcs_path('asthamtest')

# TFRecord shards for the training and validation splits
train_set = tf.io.gfile.glob(GCS_PATH + '/train' + '*.tfrec')
val_set = tf.io.gfile.glob(GCS_PATH + '/val' + '*.tfrec')


# Common food allergens: allergen name -> ingredient keywords that indicate it
common_allergens = {
    'cows milk': {
        'Cheese', 'Butter', 'Margarine', 'Yogurt', 'Cream', 'Ice cream',
    },
    'eggs': {'egg'},
    'tree nuts': {
        'Brazil nut', 'Almond', 'Cashew', 'Macadamia nut',
        'Pistachio', 'Pine nut', 'Walnut',
    },
    'peanuts': {'peanut'},
    'shellfish': {
        'Shrimp', 'Prawn', 'Crayfish', 'Lobster', 'Squid', 'Scallops',
    },
    'wheat': {'flour', 'wheat', 'pasta', 'noodle', 'bread', 'crust'},
    'soy': {'soy', 'tofu', 'soya'},
    'fish': {'fish', 'seafood'},
}

In [None]:
def seed_everything(seed):
    """Seed Python's `random`, NumPy and TensorFlow, and set PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)
    
# Random training-time augmentation
def data_augment(image, label):
    """Apply random flips and colour jitter to `image`; `label` passes through unchanged.

    The transforms run in this order: horizontal flip, vertical flip, hue,
    saturation, contrast, brightness.

    Returns:
        Tuple `(augmented_image, label)`.
    """
    flipped = tf.image.random_flip_up_down(
        tf.image.random_flip_left_right(image)
    )
    hue_shifted = tf.image.random_hue(flipped, 0.01)
    saturated = tf.image.random_saturation(hue_shifted, 0.70, 1.30)
    contrasted = tf.image.random_contrast(saturated, 0.80, 1.20)
    augmented = tf.image.random_brightness(contrasted, 0.10)
    return augmented, label

# Turn encoded JPEG bytes into a model-ready tensor
def decode_image(image_data):
    """Decode a JPEG string, resize it to IMAGE_SIZE and divide pixel values by 255.

    Args:
        image_data: Scalar string tensor holding the encoded JPEG.

    Returns:
        float32 image tensor with 3 channels and spatial size IMAGE_SIZE.
    """
    decoded = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(decoded, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0


# Parse one serialized example into (image, label) for a single allergen
def read_labeled_tfrecord(example, label_key = None):
    """Parse a serialized tf.train.Example and return its image and one label.

    Args:
        example: Scalar string tensor containing the serialized example.
        label_key: Name of the int64 feature to use as the label (one of the
            label features listed in the schema below). When omitted, the
            module-level variable `target` is used, which is what earlier
            single-argument callers relied on.

    Returns:
        Tuple `(image, label)`: the decoded image from `decode_image` and the
        selected label cast to float32.
    """
    if label_key is None:
        # Backward-compatible fallback: older call sites passed only `example`
        # and expected the label name to come from the global `target`.
        label_key = target

    LABELED_TFREC_FORMAT = {
        "image_name": tf.io.FixedLenFeature([], tf.string),
        "image": tf.io.FixedLenFeature([], tf.string),
        "cows_milk": tf.io.FixedLenFeature([], tf.int64),
        "eggs": tf.io.FixedLenFeature([], tf.int64),
        "tree nuts": tf.io.FixedLenFeature([], tf.int64),
        "peanuts": tf.io.FixedLenFeature([], tf.int64),
        "shellfish": tf.io.FixedLenFeature([], tf.int64),
        "wheat": tf.io.FixedLenFeature([], tf.int64),
        "soy": tf.io.FixedLenFeature([], tf.int64),
        "fish": tf.io.FixedLenFeature([], tf.int64),
    }

    example = tf.io.parse_single_example(example, LABELED_TFREC_FORMAT)

    image = decode_image(example['image'])
    label = tf.cast(example[label_key], tf.float32)
    return image, label

def load_dataset(filenames, target, ordered = False):
    """Build an unbatched dataset of `(image, label)` pairs from TFRecord files.

    Args:
        filenames: TFRecord file paths to read.
        target: Name of the label feature to extract from each example.
        ordered: When False, deterministic element order is switched off so
            records can be consumed as soon as they are available.

    Returns:
        A `tf.data.Dataset` yielding `(image, label)` tuples.
    """
    ignore_order = tf.data.Options()
    if not ordered:
        ignore_order.experimental_deterministic = False

    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads = AUTO)
    dataset = dataset.with_options(ignore_order)
    # Bind the requested label explicitly instead of relying on a global name.
    dataset = dataset.map(
        lambda example: read_labeled_tfrecord(example, target),
        num_parallel_calls = AUTO,
    )
    return dataset

# Build the (infinite) training input pipeline
def get_training_dataset(filenames, target, ordered = False):
    """Return the augmented, shuffled, repeated and batched training dataset.

    Args:
        filenames: TFRecord file paths of the training split.
        target: Name of the label feature to train on.
        ordered: Forwarded to `load_dataset`; False allows non-deterministic
            read order.

    Returns:
        A `tf.data.Dataset` of `(image_batch, label_batch)` that repeats
        indefinitely, so `fit` must be given `steps_per_epoch`.
    """
    dataset = load_dataset(filenames, target, ordered = ordered)
    dataset = dataset.map(data_augment, num_parallel_calls = AUTO)
    # Shuffle before repeating so each pass over the data is shuffled on its
    # own; repeating first lets the shuffle buffer mix examples from adjacent
    # epochs, so an example can be seen twice before another is seen once.
    dataset = dataset.shuffle(2048)
    dataset = dataset.repeat()
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset

# Build the validation input pipeline (single pass, no augmentation or shuffling)
def get_validation_dataset(filenames, target, ordered = True):
    """Return the batched validation dataset.

    Args:
        filenames: TFRecord file paths of the validation split.
        target: Name of the label feature to evaluate on.
        ordered: Forwarded to `load_dataset`; True keeps deterministic order.

    Returns:
        A `tf.data.Dataset` of `(image_batch, label_batch)`.
    """
    return (
        load_dataset(filenames, target, ordered = ordered)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

# Function to count how many examples a list of TFRecord shards contains
def count_data_items(filenames):
    """Sum the per-file item counts encoded in TFRecord file names.

    The number of items is written in the name of each .tfrec file as
    `-<count>.`, e.g. `flowers00-230.tfrec` holds 230 items.

    Args:
        filenames: Iterable of TFRecord paths (local or `gs://`).

    Returns:
        Integer total of all counts (0 for an empty list).

    Raises:
        ValueError: If a file name carries no `-<count>.` marker.
    """
    count_pattern = re.compile(r"-([0-9]+)\.")
    counts = []
    for filename in filenames:
        # Search the file name only, so a "-<digits>." sequence in a bucket or
        # directory name is never mistaken for the item count.
        match = count_pattern.search(os.path.basename(filename))
        if match is None:
            raise ValueError(
                f"Cannot read the item count from TFRecord file name: {filename!r}"
            )
        counts.append(int(match.group(1)))
    # An explicit integer dtype keeps the result integral for an empty list
    # (a plain np.sum([]) returns the float 0.0).
    return np.sum(counts, dtype = np.int64)

# Total number of training examples, taken from the shard file names
NUM_TRAINING_IMAGES = count_data_items(filenames = train_set)
print(f'Dataset: {NUM_TRAINING_IMAGES} training images')

In [None]:
# Custom learning-rate schedule: linear step-down, then a constant floor
def get_lr_callback():
    """Build the Keras LearningRateScheduler callback used during training.

    Schedule, as a function of the zero-based epoch index:
      * epoch <  EPOCHS / 2: `0.000001 * (BATCH_SIZE - epoch)`, i.e. the rate
        starts at 1e-6 per example in the global batch and drops by 1e-6
        every epoch;
      * epoch >= EPOCHS / 2: the constant floor `lr_min` (1e-7).

    The scheduler supplies the learning rate for every epoch, so the value
    the optimizer was constructed with is not the one used while this
    callback is active.

    Returns:
        A `tf.keras.callbacks.LearningRateScheduler` that logs each update.
    """
    lr_min = 0.0000001
    lr_per_example = 0.000001

    def lrfn(epoch):
        if epoch < EPOCHS/2:
            return lr_per_example * (BATCH_SIZE - epoch)
        return lr_min

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = True)



def get_model(mode):
    """Build and compile a binary image classifier on a pretrained backbone.

    Args:
        mode: Short backbone name: 'eff0'..'eff7' (EfficientNet B0-B7),
            'ICPV2' (InceptionResNetV2), 'ICPV3' (InceptionV3), 'XCP'
            (Xception), 'VGG16', 'VGG19', 'RN50' (ResNet50) or 'RN101'
            (ResNet101). Any other value falls back to ResNet101.

    Returns:
        A compiled `tf.keras.Model` created inside `strategy.scope()`, with
        ImageNet-initialised backbone, global average pooling and a
        `label_dim`-unit sigmoid head.
    """
    # Backbone constructors keyed by the short names accepted in `mode`.
    backbones = {
        'eff0': efn.EfficientNetB0,
        'eff1': efn.EfficientNetB1,
        'eff2': efn.EfficientNetB2,
        'eff3': efn.EfficientNetB3,
        'eff4': efn.EfficientNetB4,
        'eff5': efn.EfficientNetB5,
        'eff6': efn.EfficientNetB6,
        'eff7': efn.EfficientNetB7,
        'ICPV2': tfka.InceptionResNetV2,
        'ICPV3': tfka.InceptionV3,
        'XCP': tfka.Xception,
        'VGG16': tfka.VGG16,
        'VGG19': tfka.VGG19,
        'RN50': tfka.ResNet50,
        'RN101': tfka.ResNet101,
    }

    with strategy.scope():

        inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp')
        print(f'load Model_{mode}')

        # Unrecognised names keep the historical behaviour of using ResNet101.
        build_backbone = backbones.get(mode, tfka.ResNet101)
        x = build_backbone(weights = 'imagenet', include_top = False)(inp)
        x = tf.keras.layers.GlobalAveragePooling2D()(x)

        output = tf.keras.layers.Dense(label_dim, activation='sigmoid')(x)

        model = tf.keras.models.Model(inputs = [inp], outputs = [output])
        opt = tf.keras.optimizers.Adam(learning_rate = LR)

        model.compile(
            optimizer = opt,
            # The head already applies a sigmoid, so the loss must treat the
            # outputs as probabilities; from_logits=True would apply the
            # sigmoid a second time and distort the loss and its gradients.
            loss = tf.keras.losses.BinaryCrossentropy(from_logits = False),
            metrics = [tf.keras.metrics.Precision()]
            )

        return model

In [None]:
def image_mapping_check(dataset):
    """Drop rows whose image file cannot be loaded from `img_path`.

    Each row's `Image_Name` is resolved to `img_path + name + '.jpg'` and read
    with OpenCV. Rows for which the read fails are removed and the index is
    reset.

    Args:
        dataset: DataFrame with an `Image_Name` column.

    Returns:
        A new DataFrame containing only rows with a readable image, indexed
        0..n-1. Every row is checked, including the last one.
    """
    def _has_readable_image(img_name):
        # A non-string name (e.g. NaN from an empty CSV cell) cannot be turned
        # into a file path, so the row is treated as having no image.
        if not isinstance(img_name, str):
            return False
        # cv2.imread signals a missing or undecodable file by returning None
        # rather than raising, so test the result directly.
        return cv2.imread(img_path + img_name + '.jpg') is not None

    readable = dataset['Image_Name'].map(_has_readable_image).astype(bool)
    return dataset[readable].reset_index(drop = True)

def allergens_mapping(row, types):
    """Flag whether an ingredient text mentions any keyword of one allergen.

    Args:
        row: Ingredient text to inspect.
        types: Key into `common_allergens` selecting the keyword set.

    Returns:
        1 if any keyword occurs in `row` as a case-insensitive substring,
        otherwise 0.
    """
    return int(any(
        keyword.lower() in row.lower()
        for keyword in common_allergens[types]
    ))

# Label column name -> key in `common_allergens` used to compute that column.
_ALLERGEN_LABEL_COLUMNS = {
    'cows_milk': 'cows milk',
    'eggs': 'eggs',
    'tree nuts': 'tree nuts',
    'peanuts': 'peanuts',
    'shellfish': 'shellfish',
    'wheat': 'wheat',
    'soy': 'soy',
    'fish': 'fish',
}

# Training-split label tables keyed by split seed, so the CSV parse and the
# per-image readability scan run once rather than once per training run.
_TRAIN_LABELS_CACHE = {}


def _load_train_labels():
    """Return the allergen labels of the training split (cached per seed)."""
    if seed not in _TRAIN_LABELS_CACHE:
        csv_path = "../input/food-ingredients-and-recipe-dataset-with-images/Food Ingredients and Recipe Dataset with Image Name Mapping.csv"
        df = pd.read_csv(csv_path)
        df = image_mapping_check(df)
        for column, allergen in _ALLERGEN_LABEL_COLUMNS.items():
            # Bind `allergen` as a default so each lambda keeps its own value.
            df[column] = df['Cleaned_Ingredients'].apply(
                lambda text, allergen = allergen: allergens_mapping(text, types = allergen)
            )
        # Select the label columns by name rather than by position so the
        # split does not depend on how many columns the CSV happens to have.
        _, _, y_train, _ = train_test_split(
            df[['Image_Name']],
            df[list(_ALLERGEN_LABEL_COLUMNS)],
            shuffle = True,
            random_state = seed,
            test_size = 0.25,
        )
        _TRAIN_LABELS_CACHE[seed] = y_train
    return _TRAIN_LABELS_CACHE[seed]


def get_weights(target):
    """Compute class weights for one allergen label from the training split.

    The negative class gets weight 1. The positive class gets the
    negative-to-positive count ratio multiplied by 1.5.

    Args:
        target: Label column name, one of the keys of
            `_ALLERGEN_LABEL_COLUMNS` (e.g. 'cows_milk', 'tree nuts').

    Returns:
        Dict `{0: weight_for_negatives, 1: weight_for_positives}`.

    Raises:
        KeyError: If `target` is not a known label column.
        ValueError: If the training split has no positive example for
            `target`, which would otherwise produce an infinite weight.
    """
    y_train = _load_train_labels()
    n_positive = y_train[target].sum()
    if n_positive == 0:
        raise ValueError(
            f"No positive training examples for {target!r}; cannot derive a class weight"
        )

    positive_boost = 1.5  # extra emphasis on positives beyond pure rebalancing
    weight_0 = 1
    weight_1 = float((len(y_train) - n_positive) / n_positive * positive_boost)

    return {
        0: weight_0,
        1: weight_1,
    }

In [None]:
def train_and_evaluate(target, model_name = None):
    """Train one backbone on one allergen label and checkpoint the best weights.

    Args:
        target: Name of the label feature to train on (e.g. 'eggs').
        model_name: Backbone short name understood by `get_model`. When
            omitted, the module-level variable `mode` is used, which is what
            earlier single-argument callers relied on.

    Returns:
        The Keras `History` object returned by `model.fit`.

    Side effects:
        Writes `Model_<model_name>_<target>_<SEED>.h5` (weights only) whenever
        the validation loss improves, clears the Keras session before the
        model is built and runs a garbage collection after training.
    """
    if model_name is None:
        # Backward-compatible fallback to the global set by the calling loop.
        model_name = mode

    # Seed everything
    seed_everything(SEED)

    print('\n')
    print('-'*50)

    train_dataset = get_training_dataset(train_set, target, ordered = False)
    val_dataset = get_validation_dataset(val_set, target, ordered = True)
    # The training dataset repeats forever, so an epoch is defined by step count.
    steps_per_epoch = count_data_items(train_set) // BATCH_SIZE
    K.clear_session()
    model = get_model(model_name)
    # Model checkpoint: keep only the weights with the lowest validation loss
    checkpoint = tf.keras.callbacks.ModelCheckpoint(f'Model_{model_name}_{target}_{SEED}.h5',
                                                    monitor = 'val_loss',
                                                    verbose = VERBOSE,
                                                    save_best_only = True,
                                                    save_weights_only = True,
                                                    mode = 'min')

    class_weight = get_weights(target)

    history = model.fit(train_dataset,
                        steps_per_epoch = steps_per_epoch,
                        class_weight = class_weight,
                        epochs = EPOCHS,
                        callbacks = [checkpoint, get_lr_callback()],
                        validation_data = val_dataset,
                        verbose = VERBOSE)
    gc.collect()
    return history

In [None]:
# Backbones to train (short names understood by get_model) and the allergen
# label features to train them on; one model is trained per (label, backbone) pair.
models = ['eff1','eff2','eff3','eff4','eff5','eff6','eff7','XCP', 'RN50']
targets = ["cows_milk", "eggs", "tree nuts", "peanuts", "shellfish", "wheat", "soy", "fish"]
# NOTE: the loop variables are module-level names. train_and_evaluate is called
# without a backbone argument and picks up the current `mode` from module scope,
# so the names `target` and `mode` must not be changed here.
for target in targets:
    for mode in models:
        print('prediction target is', target)
        train_and_evaluate(target)