In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dask.dataframe as dd
import os
# import sys11
import subprocess

from six import string_types

# Make sure you have all of these packages installed, e.g. via pip
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import scipy
from skimage import io
from scipy import ndimage
from time import time
from IPython.display import display
import random
import tqdm

##model libraries
import logging
import warnings

import matplotlib.style as style
import tensorflow as tf
import tensorflow_hub as hub

from datetime import datetime
from keras.preprocessing import image
from PIL import Image
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve
from tensorflow.keras import layers
from keras.callbacks import ModelCheckpoint
import glob
# from utils import *

import cv2
warnings.filterwarnings('ignore')
logging.getLogger("tensorflow").setLevel(logging.ERROR)

%matplotlib inline

In [None]:
print("TF version:", tf.__version__)
tf.test.gpu_device_name()

## Reading Input Data

In [None]:
# Locations of the Planet training data inside the Kaggle input mount.
PLANET_KAGGLE_ROOT = os.path.abspath("/kaggle/input/planets-dataset/planet/planet/")
PLANET_KAGGLE_JPEG_DIR = os.path.join(PLANET_KAGGLE_ROOT, 'train-jpg')
PLANET_KAGGLE_LABEL_CSV = os.path.join(PLANET_KAGGLE_ROOT, 'train_classes.csv')

# Fail fast when the dataset is not attached to the notebook.
assert all(
    os.path.exists(required_path)
    for required_path in (PLANET_KAGGLE_ROOT, PLANET_KAGGLE_JPEG_DIR, PLANET_KAGGLE_LABEL_CSV)
)

In [None]:
labels_df = pd.read_csv(PLANET_KAGGLE_LABEL_CSV)
print(labels_df.shape)
labels_df.sample(5)

In [None]:
# Turn the space-separated tag string of every image into a list of tags.
# The isinstance guard keeps this cell idempotent: re-running it on rows that
# were already split leaves them untouched instead of raising AttributeError.
labels_df['tags'] = labels_df['tags'].apply(
    lambda tag_str: tag_str.split(' ') if isinstance(tag_str, str) else tag_str
)
# Display head
labels_df.head()

In [None]:
val = labels_df.memory_usage(index=True).sum()
print(val)

In [None]:
# translating image name to image path
X = labels_df['image_name'].apply(lambda x : PLANET_KAGGLE_JPEG_DIR+'/'+x+'.jpg')
y = labels_df['tags']
X.head(), y.head()

In [None]:
nobs = 8 # Maximum number of images to display
ncols = 4 # Number of columns in display
nrows = nobs//ncols # Number of rows in display

style.use("default")
# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, axes = plt.subplots(nrows, ncols, figsize=(16, 4*nrows), squeeze=False)
for i, ax in enumerate(axes.ravel()):
    # Positional access stays correct even if X / y carry a non-default index.
    # The context manager releases the file handle once the image is drawn.
    with Image.open(X.iloc[i]) as img:
        ax.imshow(img)
    ax.set_title(y.iloc[i], size=10)
    ax.axis('off')

In [None]:
# Hold out 10% of the images for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)

print("Labels:")
# Learn the label vocabulary from the training tags only.
mlb = MultiLabelBinarizer().fit(y_train)
N_LABELS = len(mlb.classes_)

# List every label next to its position in mlb.classes_.
for i, label in enumerate(mlb.classes_):
    print(f"{i}. {label}")

# y_train_bin = mlb.transform(y_train)
# y_val_bin = mlb.transform(y_val)

In [None]:
# for i in range(3):
#     print(X_train.iloc[i], y_train_bin[i])

## Input Pipeline

In [None]:
## ref:https://github.com/ashrefm/multi-label-soft-f1/blob/master/Multi-Label%20Image%20Classification%20in%20TensorFlow%202.0.ipynb

IMG_SIZE = 300 # Specify height and width of image to match the input format of the model
CHANNELS = 3 # Keep RGB color channels to match the input format of the model

def parse_function(filename, label):
    """Load one JPEG from disk and turn it into a model-ready image tensor.

    Args:
        filename: string path of the image file
        label: label vector of the image; passed through untouched

    Returns:
        (image, label) where image was decoded with CHANNELS channels, resized
        to IMG_SIZE x IMG_SIZE and divided by 255.0
    """
    raw_bytes = tf.io.read_file(filename)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=CHANNELS)
    # Resize to the fixed model input size, then rescale [0, 255] -> [0.0, 1.0].
    scaled = tf.image.resize(pixels, [IMG_SIZE, IMG_SIZE]) / 255.0
    return scaled, label

In [None]:
BATCH_SIZE = 256 # Big enough to measure an F1-score
AUTOTUNE = tf.data.experimental.AUTOTUNE # Adapt preprocessing and prefetching dynamically
SHUFFLE_BUFFER_SIZE = 1024 # Shuffle the training data through a buffer of 1024 observations

def create_dataset(filenames, labels, is_training=True):
    """Build a batched, prefetched tf.data pipeline of (image, label) pairs.

    Args:
        filenames: sequence of image paths, one per observation
        labels: array of label vectors aligned with `filenames`
            (one row per observation)
        is_training: when truthy, the parsed observations are cached and
            shuffled before batching; otherwise they keep their input order

    Returns:
        tf.data.Dataset yielding batches of up to BATCH_SIZE parsed observations
    """
    # Pair every file path with its label vector
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
    # Parse and preprocess observations in parallel
    dataset = dataset.map(parse_function, num_parallel_calls=AUTOTUNE)

    if is_training:
        # Cache the parsed observations so later epochs skip decoding.
        # NOTE(review): the cache holds decoded float images - confirm it fits in memory.
        dataset = dataset.cache()
        # Shuffle through a buffer of SHUFFLE_BUFFER_SIZE observations
        dataset = dataset.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)

    # Batch the data for multiple steps
    dataset = dataset.batch(BATCH_SIZE)
    # Fetch batches in the background while the model is training.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)

    return dataset

In [None]:
# train_ds = create_dataset(X_train, y_train_bin)
# val_ds = create_dataset(X_val, y_val_bin)

## Model building (Transfer learning)

In [None]:
# for batch in train_ds:
#     print(model.predict(batch)[:1])
#     break

In [None]:
## LOSS FUNCTION

def macro_soft_f1(y, y_hat):
    """Differentiable macro F1 cost: the mean over labels of (1 - soft-F1).

    Probabilities are used in place of thresholded predictions.

    Args:
        y (int32 Tensor): targets array of shape (BATCH_SIZE, N_LABELS)
        y_hat (float32 Tensor): probability matrix from forward propagation of shape (BATCH_SIZE, N_LABELS)

    Returns:
        cost (scalar Tensor): value of the cost function for the batch
    """
    targets = tf.cast(y, tf.float32)
    probs = tf.cast(y_hat, tf.float32)
    # Soft confusion-matrix entries, summed over the batch axis for each label.
    soft_tp = tf.reduce_sum(probs * targets, axis=0)
    soft_fp = tf.reduce_sum(probs * (1 - targets), axis=0)
    soft_fn = tf.reduce_sum((1 - probs) * targets, axis=0)
    per_label_f1 = 2*soft_tp / (2*soft_tp + soft_fn + soft_fp + 1e-16)
    # Minimising (1 - soft-F1) maximises soft-F1; average over all labels.
    return tf.reduce_mean(1 - per_label_f1)

In [None]:
##metric function

def macro_f1(y, y_hat, thresh=0.5):
    """Compute the macro F1-score on a batch of observations (average F1 across labels)

    Args:
        y (int32 Tensor): labels array of shape (BATCH_SIZE, N_LABELS)
        y_hat (float32 Tensor): probability matrix from forward propagation of shape (BATCH_SIZE, N_LABELS)
        thresh: probability value above which we predict positive

    Returns:
        macro_f1 (scalar Tensor): value of macro F1 for the batch
    """
    # Cast the targets like macro_soft_f1 does, so integer label tensors can be
    # multiplied with the float32 predictions when this is called directly.
    y = tf.cast(y, tf.float32)
    y_pred = tf.cast(tf.greater(y_hat, thresh), tf.float32)
    # Per-label confusion counts over the batch axis.
    tp = tf.cast(tf.math.count_nonzero(y_pred * y, axis=0), tf.float32)
    fp = tf.cast(tf.math.count_nonzero(y_pred * (1 - y), axis=0), tf.float32)
    fn = tf.cast(tf.math.count_nonzero((1 - y_pred) * y, axis=0), tf.float32)
    # The small epsilon avoids 0/0 for labels absent from the batch.
    f1 = 2*tp / (2*tp + fn + fp + 1e-16)
    # Returned directly so no local shadows the function's own name.
    return tf.reduce_mean(f1)

In [None]:
# Load the extension and start TensorBoard

%load_ext tensorboard
%tensorboard --logdir '/kaggle/working/logs'

In [None]:
LR = 1e-5 # Keep it small when transfer learning
EPOCHS = 25

# class KerasWrapper:
#     def __init__(self, model, feat_mean, feat_std):
#         self.model = model
#         self.feat_mean = feat_mean
#         self.feat_std = feat_std
        
#     def predict_proba(self, X):
        
#         preds = self.model.predict((X - self.feat_mean)/self.feat_std)
#         return np.c_[preds, preds]
        
def dask_read_and_incrementally_fit_keras(blocksize):
    """Fit the transfer-learning classifier incrementally, one CSV partition at a time.

    The label CSV is read lazily with dask. Every partition is split 80/20
    into train/validation image paths, binarized with the module-level `mlb`
    and used for one `model.fit` call, so the same model keeps training as
    the partitions go by.

    Args:
        blocksize: approximate partition size in bytes (a number, or a byte
            string such as "64MB"). Numeric values are truncated to int.

    Returns:
        (model, history): the fitted tf.keras model and the History object of
        the last partition's fit (None when the CSV produced no partition).
    """
    # Callers pass a float (memory usage / 4); hand dask a whole number of bytes.
    if blocksize is not None and not isinstance(blocksize, str):
        blocksize = int(blocksize)

    # reading df with dask
    df_train = dd.read_csv(PLANET_KAGGLE_LABEL_CSV, blocksize=blocksize)

    # EfficientNet-B3 feature extractor from TF Hub. The input size is tied to
    # IMG_SIZE so it always matches the images produced by parse_function.
    feature_extractor_url = "https://tfhub.dev/google/efficientnet/b3/feature-vector/1"
    feature_extractor_layer = hub.KerasLayer(feature_extractor_url,
                                             input_shape=(IMG_SIZE, IMG_SIZE, CHANNELS))

    # Freeze the feature extractor so that training only modifies the new
    # classification layers stacked on top of it.
    feature_extractor_layer.trainable = False

    # creating keras model
    model = tf.keras.Sequential([
        feature_extractor_layer,
        layers.Dense(1024, activation='relu', name='hidden_layer'),
        layers.Dense(N_LABELS, activation='sigmoid', name='output'),
    ])

    model.summary()

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
        loss=macro_soft_f1,
        metrics=[macro_f1])

    filepath = "/kaggle/working/model/planet_mobilenet_pretrained-{epoch:02d}-{loss:.4f}.hdf5"
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Use the tf.keras callback (the model is a tf.keras model) and keep the
    # checkpoint that scores best on the validation metric, not the training one.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath, monitor='val_macro_f1', verbose=1, save_best_only=True, mode='max')
    my_callbacks = [
        tf.keras.callbacks.EarlyStopping(patience=8),
        checkpoint,
        tf.keras.callbacks.TensorBoard(log_dir='/kaggle/working/logs'),
    ]

    # Stays None if the CSV yields no partition at all.
    history = None

    # loop for number of partitions
    for i in range(df_train.npartitions):

        # getting one partition as a regular pandas DataFrame
        part = df_train.get_partition(i).compute(scheduler='synchronous')

        part['tags'] = part['tags'].apply(lambda x: x.split(' '))
        X = part['image_name'].apply(lambda x : PLANET_KAGGLE_JPEG_DIR+'/'+x+'.jpg')
        y = part['tags']
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        y_train_bin = mlb.transform(y_train)
        y_val_bin = mlb.transform(y_val)

        train_ds = create_dataset(X_train, y_train_bin)
        # Validation data needs neither caching nor shuffling.
        val_ds = create_dataset(X_val, y_val_bin, is_training=False)

        # running partial fit: the same model continues training on this partition
        history = model.fit(train_ds,
                            epochs=EPOCHS,
                            validation_data=val_ds,
                            callbacks=my_callbacks)

    return (model, history)
# In-memory size of the labels frame, used as a rough yardstick for the dask
# partition size (a quarter of it per block).
val = labels_df.memory_usage(index=True).sum()
print(val)
start = time()
# Integer division + int(): the block size is always a whole number of bytes,
# whatever the frame size happens to be.
model, mem_history_1 = dask_read_and_incrementally_fit_keras(blocksize=int(val // 4))
print('\nTraining took {}'.format(time()-start))

In [None]:
# Collect every test image: the main test set plus the additional test set.
imgs = glob.glob("/kaggle/input/planets-dataset/planet/planet/test-jpg/*.jpg")
imgs2 = glob.glob('../input/planets-dataset/test-jpg-additional/test-jpg-additional/*')
imgs = imgs + imgs2

# Submission key = file name without directory and extension.
image_name = [os.path.splitext(os.path.basename(img))[0] for img in imgs]

tags = []
if imgs:
    # Reuse the training preprocessing (parse_function) so test images are
    # decoded, resized and scaled exactly like the training images, and predict
    # in batches instead of calling model.predict once per image.
    test_ds = (
        tf.data.Dataset.from_tensor_slices(imgs)
        .map(lambda path: parse_function(path, 0)[0], num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=AUTOTUNE)
    )
    predictions = model.predict(test_ds, verbose=1)
    # Keep one (1, N_LABELS) array per image, the shape the following cells index with x[0].
    tags = [row[np.newaxis, :] for row in predictions]

In [None]:
df_out = pd.DataFrame({'image_name':image_name,'tags':tags})
df_out.head()

In [None]:
df_out['tags_v2'] = df_out['tags'].apply(lambda x : (x[0] >= 0.7)*1)

In [None]:
((df_out['tags_v2'].values)[0]).shape

In [None]:
def remapping(x, classes=None):
    """Turn a 0/1 indicator vector into a space-separated tag string.

    Args:
        x: sequence of 0/1 (or boolean) flags, one per label
        classes: label names aligned with `x`; defaults to `mlb.classes_`
            (the label binarizer fitted earlier in the notebook)

    Returns:
        The names of the flagged labels joined by single spaces, in the order
        of `classes`; an empty string when nothing is flagged.
    """
    if classes is None:
        classes = mlb.classes_
    # Boolean mask selection replaces the int * str multiplication trick.
    selected = np.asarray(x).astype(bool)
    chosen = np.asarray(classes)[selected]
    # Empty label names are skipped, as before.
    return ' '.join(str(name) for name in chosen if name != '')
        
df_out['tags_v3'] = df_out['tags_v2'].apply(lambda x : remapping(x))


In [None]:
df_out.head()

In [None]:
df_out['tags'] = df_out['tags_v3']
df_out.head()

In [None]:
df2 = df_out[['image_name','tags']]
df2.head()

In [None]:
df2.to_csv('submission.csv', index=False)

In [None]:
len(df2)