In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf, tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense,BatchNormalization,Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import time
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

In [None]:
#TPU preparation

# Shared tf.data tuning constant used by every input pipeline in this notebook.
AUTO = tf.data.experimental.AUTOTUNE
try:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    # Connect to and initialise the TPU system before building the strategy on it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # No TPU resolved: fall back to whatever strategy is currently active.
    strategy = tf.distribute.get_strategy()

# BATCH_SIZE below is derived from this replica count.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
#paths:
# Kaggle dataset names; the loaders pass one of these to KaggleDatasets().get_gcs_path().
ADDITIONAL_DATA_NO_BG_IMAGES_PATH = 'plant-pathology-more-data-no-background' 
ADDITIONAL_DATA_IMAGES_PATH = 'plant-pathology-2020-preprocessed-images'
ORIGINAL_DATA_IMAGES_PATH = 'plant-pathology-2020-fgvc7'

# CSV files read by get_train_data, get_test_data and save_results.
CSV_TRAIN_PATH = '../input/plant-pathology-more-data-no-background/train.csv'
CSV_TEST_PATH =  '../input/plant-pathology-more-data-no-background/test.csv'

# When False, get_train_data keeps only the rows whose image_id starts with 'T'.
USE_ADDITIONAL_DATA = False

In [None]:
#important constants:
IMG_SIZE = 784  # images are resized to IMG_SIZE x IMG_SIZE by the read_*_image helpers
BATCH_SIZE = 8*strategy.num_replicas_in_sync  # 8 samples per replica
nb_classes = 4  # number of classes (size of the confusion matrix)
# Class names used for plot labels; assumed to follow the label-column order of train.csv — TODO confirm.
CATEGORY_NAMES = ['healthy','multiple_diseases','rust','scab']

In [None]:
def get_train_data(path = ORIGINAL_DATA_IMAGES_PATH):
    """Return the training image paths and their label matrix.

    Reads CSV_TRAIN_PATH, uses the 'image_id' column to build one JPEG path per
    row and converts the remaining columns to a float32 array.

    Args:
        path: Kaggle dataset name to resolve with KaggleDatasets().get_gcs_path().
            It also selects the image sub-directory inside that dataset.

    Returns:
        (images_paths, y_train): list of path strings and a float32 array with
        one row per returned path.
    """
    train = pd.read_csv(CSV_TRAIN_PATH)
    train_id = train.pop('image_id')

    y_train = train.to_numpy().astype('float32')

    if not USE_ADDITIONAL_DATA:
        # Filter ids and labels with the SAME mask. Slicing the labels by the
        # number of kept ids is only correct when every 'T...' row happens to
        # sit at the top of the CSV; the mask keeps rows aligned regardless.
        keep = np.array([idee[0]=='T' for idee in train_id], dtype=bool)
        train_id = [idee for idee, kept in zip(train_id, keep) if kept]
        y_train = y_train[keep]

    GCS_DS_PATH = KaggleDatasets().get_gcs_path(path)

    # Each dataset stores its images under a different sub-directory.
    root_img = 'images'
    if path == ADDITIONAL_DATA_NO_BG_IMAGES_PATH:
        root_img = 'images/images'
    if path == ADDITIONAL_DATA_IMAGES_PATH:
        root_img = 'plant_processed/images'

    images_paths = [os.path.join(GCS_DS_PATH,root_img,idee+'.jpg') for idee in train_id]

    return images_paths,y_train



def train_val_split(x_train,y_train, test_size = 0.2):
    """Split samples and labels into a training part and a held-out part.

    Uses a single StratifiedShuffleSplit draw with random_state=0.

    Args:
        x_train: sequence of samples (converted to a numpy array).
        y_train: labels aligned with x_train (converted to a numpy array).
        test_size: size of the held-out part, passed to the splitter.

    Returns:
        (x_fit, y_fit, x_holdout, y_holdout) as numpy arrays.
    """
    samples = np.array(x_train)
    targets = np.array(y_train)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size = test_size, random_state=0)
    # n_splits=1, so the split generator yields exactly one index pair.
    fit_idx, holdout_idx = next(splitter.split(samples, targets))
    return samples[fit_idx], targets[fit_idx], samples[holdout_idx], targets[holdout_idx]


def read_train_image(path, label = None):
    """Load a JPEG, scale it to [0, 1] floats and resize it to IMG_SIZE x IMG_SIZE.

    Args:
        path: file path of the JPEG image.
        label: optional label passed through unchanged.

    Returns:
        The image tensor, or (image, label) when a label is supplied.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    scaled = tf.cast(decoded, tf.float32) / 255.0
    resized = tf.image.resize(scaled, (IMG_SIZE,IMG_SIZE))

    if label is not None:
        return resized, label
    return resized
    
    
    
def augment_train_image(image, label = None, seed=1024):
    """Apply a random horizontal flip followed by a random vertical flip.

    Args:
        image: image tensor to augment.
        label: optional label passed through unchanged.
        seed: seed handed to both random flip ops.

    Returns:
        The augmented image, or (image, label) when a label is supplied.
    """
    flipped = tf.image.random_flip_left_right(image, seed=seed)
    flipped = tf.image.random_flip_up_down(flipped, seed=seed)

    if label is not None:
        return flipped, label
    return flipped



def get_train_val_datasets(x_train,y_train,x_val = None ,y_val = None):
    """Build the tf.data pipelines used for fitting.

    The training pipeline decodes, augments, repeats indefinitely, shuffles with
    a 512-element buffer, batches by BATCH_SIZE and prefetches. The validation
    pipeline decodes, batches, caches and prefetches (no augmentation).

    Args:
        x_train, y_train: training image paths and labels.
        x_val, y_val: optional validation image paths and labels.

    Returns:
        The training dataset alone when x_val is None, otherwise
        (train_dataset, val_dataset).
    """
    train_pairs = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_images = train_pairs.map(read_train_image, num_parallel_calls=AUTO)
    train_augmented = train_images.map(augment_train_image, num_parallel_calls=AUTO)
    train_dataset = train_augmented.repeat().shuffle(512).batch(BATCH_SIZE).prefetch(AUTO)

    if x_val is None:
        return train_dataset

    val_pairs = tf.data.Dataset.from_tensor_slices((x_val,y_val))
    val_images = val_pairs.map(read_train_image,num_parallel_calls=AUTO)
    val_dataset = val_images.batch(BATCH_SIZE).cache().prefetch(AUTO)

    return train_dataset, val_dataset

In [None]:
def get_class_weights(y):
    """Compute 'balanced' class weights from a one-hot label matrix.

    Args:
        y: 2-D array whose row-wise argmax is the class index.

    Returns:
        Array of weights, one per class index that actually occurs in y
        (ordered by class index). A class missing from y gets no entry.
    """
    class_indices = y.argmax(axis=1)
    # `classes` and `y` are passed by keyword: recent scikit-learn releases
    # accept them only as keyword arguments, older ones accept both forms.
    class_weights = compute_class_weight('balanced',
                                         classes=np.unique(class_indices),
                                         y=class_indices)
    print('class weights: ',class_weights)
    return class_weights

def plot_class_weights(class_weights,title = 'Categories distribution'):
    """Plot the class distribution implied by a vector of class weights.

    The inverse of each weight is proportional to the class frequency; the
    inverses are normalised to sum to 1 so the y axis really is a probability.

    Args:
        class_weights: one positive weight per class, in CATEGORY_NAMES order.
        title: title of the bar chart.
    """
    inverse_weights = 1 / np.asarray(class_weights, dtype=float)
    # Without this normalisation the bars sum to the number of classes,
    # which contradicts the 'Probability' axis label.
    probabilities = inverse_weights / inverse_weights.sum()

    positions = range(len(probabilities))
    bar_colors = ['springgreen', 'lightcoral', 'mediumpurple', 'gold'][:len(probabilities)]
    plt.bar(positions,probabilities,color=bar_colors,width=0.9)
    plt.xticks(positions, CATEGORY_NAMES[:len(probabilities)])
    plt.title(title)
    plt.ylabel('Probability')
    plt.xlabel('Data')
    plt.show()

In [None]:
#helper functions to show image or images

def show_image(filename, image_height = 10, image_width = 10):
    """Display a single image loaded through read_train_image.

    Args:
        filename: path of the image to show.
        image_height, image_width: accepted but not used; the figure is
            always created with an 8x8 size.
    """
    figure = plt.figure(figsize=(8, 8))
    picture = read_train_image(filename)
    figure.add_subplot(1, 1, 1)
    plt.imshow(picture)
    plt.show()


def show_images(images, height = 20, width = 20, images_in_one_row = 5):
    """Display several images in a grid, images_in_one_row per row.

    Args:
        images: sequence of image paths.
        height: first component of the figure size.
        width: overwritten inside the function; the second figure-size
            component is rows * images_in_one_row instead.
        images_in_one_row: number of grid columns.
    """
    paths = np.array(images)
    count = paths.size

    # Ceiling division: number of grid rows needed for `count` images.
    rows = (count + images_in_one_row - 1) // images_in_one_row
    width = rows * images_in_one_row
    figure = plt.figure(figsize=(height, width))

    for slot in range(1, count + 1):
        picture = read_train_image(paths[slot - 1])
        figure.add_subplot(rows, images_in_one_row, slot)
        plt.imshow(picture)
    plt.show()

In [None]:
#installing effnet

# Installs the third-party `efficientnet` package via the shell; the import
# below only works after this line has run in the current environment.
!pip install efficientnet
# `efn` provides the EfficientNetB7 backbone used by get_model.
import efficientnet.tfkeras as efn

In [None]:
#custom metrics

def f1(y_true, y_pred):
    """Batch-wise macro F1 score usable as a Keras metric.

    Predictions are rounded to 0/1, per-class precision and recall are computed
    over the batch axis, and the per-class F1 values are averaged.

    Args:
        y_true: ground-truth indicator tensor.
        y_pred: predicted scores with the same shape as y_true.

    Returns:
        Scalar tensor holding the mean per-class F1.
    """
    hard_pred = K.round(y_pred)

    def _per_class_count(indicator):
        # Sum the indicator over the batch axis, one count per class.
        return K.sum(K.cast(indicator, 'float32'), axis=0)

    true_pos = _per_class_count(y_true*hard_pred)
    false_pos = _per_class_count((1-y_true)*hard_pred)
    false_neg = _per_class_count(y_true*(1-hard_pred))

    precision = true_pos / (true_pos + false_pos + K.epsilon())
    recall = true_pos / (true_pos + false_neg + K.epsilon())

    score = 2*precision*recall / (precision+recall+K.epsilon())
    # Replace any NaN entries with zero before averaging.
    score = tf.where(tf.math.is_nan(score), tf.zeros_like(score), score)
    return K.mean(score)

In [None]:
# The optimizer and the stateful AUC metric own variables, so they are created
# inside the distribution-strategy scope that get_model() compiles under.
# Creating them outside the scope makes compile() reject them under TPUStrategy.
with strategy.scope():
    MODEL_CFG = {
        # `learning_rate` is the supported name; `lr` is a deprecated alias.
        'optimizer': Adam(learning_rate=0.0001),
        'loss': 'categorical_crossentropy',
        'metrics': ['accuracy',tf.keras.metrics.AUC(),f1],
        'name':'effNetB7.h5'
    }

# Save the weights whenever val_loss improves.
CL_BEST_MODEL= ModelCheckpoint(MODEL_CFG['name'], 
                                 monitor='val_loss', 
                                 verbose=1, 
                                 save_best_only=True,
                                 save_weights_only=True,
                                 mode='min')

# Halve the learning rate after 10 epochs without a val_loss improvement of at
# least 1e-4. `min_delta` replaces the deprecated `epsilon` keyword.
CL_REDUCE_LR = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.5,
                                  verbose=0,
                                  min_delta = 1e-4,
                                  min_lr = 1e-5,
                                  patience=10)

# Stop training after 50 epochs without val_loss improvement.
CL_EARLY_STOPPING = EarlyStopping(monitor = "val_loss" , verbose = 1 , mode = 'min' , patience = 50)
CALLBACKS = [CL_BEST_MODEL,CL_REDUCE_LR,CL_EARLY_STOPPING]

In [None]:
def get_model(): 
    """Build and compile the EfficientNetB7 classifier inside the strategy scope.

    The ImageNet-pretrained backbone is used without its top, with global
    average pooling, followed by a softmax layer with nb_classes outputs.
    Optimizer, loss and metrics come from MODEL_CFG.

    Returns:
        The compiled Keras Model.
    """
    with strategy.scope():
        base_model = efn.EfficientNetB7(weights='imagenet',
                              include_top=False,
                              input_shape=(IMG_SIZE,IMG_SIZE, 3),
                              pooling='avg')

        x = base_model.output
        # Optional deeper head, currently disabled:
        # x = Dropout(0.2)(x)
        # x = BatchNormalization()(x)
        # x = Dense(512,activation = 'relu')(x)
        # x = Dense(256,activation='relu')(x)

        # Output size follows nb_classes instead of a second hard-coded 4.
        predictions = Dense(nb_classes,activation='softmax')(x)

        model = Model(inputs=base_model.input, outputs=predictions)

        model.compile(
            optimizer = MODEL_CFG['optimizer'],
            loss = MODEL_CFG['loss'],
            metrics = MODEL_CFG['metrics'],
        )

        return model
    

In [None]:
def vizualize_history(history):
    """Plot accuracy and loss curves from a Keras History object.

    Validation curves are drawn only when the history contains them, so the
    function also works for runs fitted without validation data.

    Args:
        history: object whose `.history` dict holds 'accuracy' and 'loss'
            (and optionally 'val_accuracy' / 'val_loss') per epoch.
    """
    plt.rcParams["figure.figsize"] = (8,8)
    logs = history.history

    # (metric key, title, y label, legend entries for train / validation)
    panels = (
        ("accuracy", "model accuracy", "Accuracy", ["Accuracy","Val. Accuracy"]),
        ("loss", "model loss", "Loss", ["Loss","Validation Loss"]),
    )

    for key, title, y_label, legend in panels:
        plt.plot(logs[key])
        val_key = 'val_' + key
        has_validation = val_key in logs
        if has_validation:
            plt.plot(logs[val_key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel("Epoch")
        # Keep the legend in sync with the curves actually drawn.
        plt.legend(legend if has_validation else legend[:1])
        plt.show()
    

def train_model(model,
                train_dataset,
                val_dataset = None,
                epochs = 10,
                steps = 22,
                class_weights = None
               ):
    """Fit `model` on `train_dataset` with the notebook-wide CALLBACKS.

    Args:
        model: compiled Keras model.
        train_dataset: training dataset passed to model.fit.
        val_dataset: optional validation dataset.
        epochs: number of epochs.
        steps: steps per epoch.
        class_weights: optional class-weight mapping forwarded as `class_weight`.

    Returns:
        The History object returned by model.fit.
    """
    fit_options = dict(
        steps_per_epoch=steps,
        epochs=epochs,
        verbose=1,
        validation_data=val_dataset,
        callbacks=CALLBACKS,
        class_weight=class_weights,
    )
    return model.fit(train_dataset, **fit_options)

In [None]:
def read_test_image(path):
    """Load a test JPEG, scale it to [0, 1] floats and resize it to IMG_SIZE x IMG_SIZE.

    Args:
        path: file path of the JPEG image.

    Returns:
        The resized float32 image tensor.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    scaled = tf.cast(decoded, tf.float32) / 255.0
    return tf.image.resize(scaled, (IMG_SIZE,IMG_SIZE))

def augment_test_image(image):
    """Randomly flip a test image horizontally, then vertically.

    Args:
        image: image tensor to augment.

    Returns:
        The (possibly) flipped image tensor.
    """
    flipped_lr = tf.image.random_flip_left_right(image)
    return tf.image.random_flip_up_down(flipped_lr)

In [None]:
def get_test_data(path):
    """Return the list of test image paths for the given Kaggle dataset.

    Args:
        path: Kaggle dataset name resolved with KaggleDatasets().get_gcs_path();
            it also selects the image sub-directory inside that dataset.

    Returns:
        List of JPEG paths, one per 'image_id' in CSV_TEST_PATH.
    """
    test_ids = pd.read_csv(CSV_TEST_PATH)['image_id']

    bucket_root = KaggleDatasets().get_gcs_path(path)

    # Dataset-specific image folder; anything else falls back to 'images'.
    image_dirs = {
        ADDITIONAL_DATA_NO_BG_IMAGES_PATH: 'images/images',
        ADDITIONAL_DATA_IMAGES_PATH: 'plant_processed/images',
    }
    image_dir = image_dirs.get(path, 'images')

    x_test = []
    for image_id in test_ids:
        x_test.append(os.path.join(bucket_root,image_dir,image_id+'.jpg'))

    return x_test

def get_test_dataset(x_test):
    """Build the batched tf.data pipeline for the test images.

    Args:
        x_test: sequence of test image paths.

    Returns:
        Dataset of decoded images batched by BATCH_SIZE.
    """
    path_dataset = tf.data.Dataset.from_tensor_slices(x_test)
    image_dataset = path_dataset.map(read_test_image, num_parallel_calls=AUTO)
    return image_dataset.batch(BATCH_SIZE)

In [None]:
def get_augment_predict(model,test_dataset,iterations=5):
    """Predict once on a randomly flipped copy of `test_dataset`.

    Args:
        model: model exposing `predict`.
        test_dataset: dataset of batched images.
        iterations: accepted but not used by this function.

    Returns:
        The predictions returned by model.predict.
    """
    augmented_dataset = test_dataset.map(augment_test_image,num_parallel_calls=AUTO)
    return model.predict(augmented_dataset,verbose=1)

def get_predict(model,test_dataset,iterations = 5,USE_TTA = True):
    """Predict on `test_dataset`, optionally averaging test-time-augmented runs.

    Args:
        model: model exposing `predict`.
        test_dataset: dataset of batched images.
        iterations: number of augmented prediction runs when USE_TTA is True.
        USE_TTA: when False a single plain prediction is made.

    Returns:
        Element-wise mean of the collected prediction runs.
    """
    runs = []

    if not USE_TTA:
        runs.append(model.predict(test_dataset,verbose=1))
        print(runs)
    else:
        for run_index in range(iterations):
            runs.append(get_augment_predict(model,test_dataset,iterations))
            print(runs[run_index])

    return np.mean(runs,axis=0)

In [None]:
#function to save results to submission.csv
def save_results(y_pred):
    """Write predictions to submission.csv.

    The output has an 'image_id' column taken from CSV_TEST_PATH followed by
    one column per predicted class, named after the columns that follow the
    first column in CSV_TRAIN_PATH.

    Args:
        y_pred: 2-D array of predictions, one row per test image.

    Raises:
        ValueError: if y_pred is not 2-D or its row count differs from the
            number of test ids.
    """
    test_id = pd.read_csv(CSV_TEST_PATH)['image_id']

    y_pred = np.asarray(y_pred)
    if y_pred.ndim != 2 or y_pred.shape[0] != len(test_id):
        raise ValueError(
            'y_pred must have one row per test image: expected %d rows, got shape %s'
            % (len(test_id), y_pred.shape,)
        )

    # Only the header of the train CSV is needed for the column names; the
    # result frame is built from scratch instead of slicing the train rows.
    labels = pd.read_csv(CSV_TRAIN_PATH, nrows=0).columns
    print(labels)

    result = pd.DataFrame({'image_id': test_id})
    for column, name in enumerate(labels[1:1 + y_pred.shape[1]]):
        result[name] = y_pred[:,column]

    result.to_csv('submission.csv',index=False)
    print(result.head())

In [None]:
 def compute_confusion_matrix(true, pred):
    """Count (true class, predicted class) pairs in an nb_classes x nb_classes matrix.

    Args:
        true: 2-D array whose row-wise argmax is the true class.
        pred: 2-D array whose row-wise argmax is the predicted class.

    Returns:
        uint16 matrix with true classes on rows and predictions on columns.
    """
    true_classes = true.argmax(axis=1)
    pred_classes = pred.argmax(axis=1)

    counts = np.zeros((nb_classes,nb_classes))
    for position, true_class in enumerate(true_classes):
        counts[true_class, pred_classes[position]] += 1

    return counts.astype('uint16')


def plot_confusion_matrix(matrix, labels_x=CATEGORY_NAMES, labels_y=CATEGORY_NAMES):
    """Draw an annotated heatmap of a confusion matrix.

    Rows are the true labels (vertical axis), columns the predicted labels
    (horizontal axis).

    Args:
        matrix: integer confusion matrix.
        labels_x: tick labels for the predicted-label axis.
        labels_y: tick labels for the true-label axis.
    """
    plt.title("Confusion Matrix")
    heatmap_options = dict(annot=True, fmt='d', xticklabels = labels_x, yticklabels = labels_y)
    axes = sns.heatmap(matrix, **heatmap_options)
    axes.set(ylabel="True Label", xlabel="Predicted Label")
    

In [None]:
def get_train_emb(x_train,y_train):
    """Build a finite, unshuffled pipeline over the training images.

    Images are decoded, passed through augment_train_image, batched by
    BATCH_SIZE and prefetched. The pipeline is not repeated or shuffled.

    Args:
        x_train: image paths.
        y_train: labels aligned with x_train.

    Returns:
        The tf.data dataset.
    """
    pairs = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    images = pairs.map(read_train_image, num_parallel_calls=AUTO)
    augmented = images.map(augment_train_image, num_parallel_calls=AUTO)
    return augmented.batch(BATCH_SIZE).prefetch(AUTO)

def get_val_emb(x_val,y_val):
    """Build a finite pipeline over the validation images.

    Images are decoded, batched by BATCH_SIZE, cached and prefetched; no
    augmentation is applied.

    Args:
        x_val: image paths.
        y_val: labels aligned with x_val.

    Returns:
        The tf.data dataset.
    """
    pairs = tf.data.Dataset.from_tensor_slices((x_val,y_val))
    images = pairs.map(read_train_image,num_parallel_calls=AUTO)
    return images.batch(BATCH_SIZE).cache().prefetch(AUTO)

#def get_intermediate_layer_output(model,dataset,layer_name= -2):
    #intermediate_layer_model = Model(inputs=model.input,
                                     #outputs=model.layers[layer_name].output)
    
    #intermediate_output = intermediate_layer_model.predict(dataset,verbose=1)
    
    #return intermediate_output



def get_intermediate_layer_output(model,dataset,layer_name= 'stem_bn'):
    """Run `dataset` through `model` up to one layer and return that layer's output.

    Args:
        model: Keras model to truncate.
        dataset: input accepted by Model.predict.
        layer_name: layer name (str), or an integer position in `model.layers`
            (negative values count from the end, e.g. -2 for the layer before
            the output layer).

    Returns:
        The predictions of the truncated model.
    """
    # An integer handed to get_layer() is looked up as a *name* and is never
    # found, so integer selectors are resolved by position instead.
    if isinstance(layer_name, (int, np.integer)):
        target_layer = model.layers[layer_name]
    else:
        target_layer = model.get_layer(layer_name)

    intermediate_layer_model = Model(inputs=model.input,
                                     outputs=target_layer.output)

    intermediate_output = intermediate_layer_model.predict(dataset,verbose=1)

    return intermediate_output


In [None]:
def fit_PCA(x):
    """Fit a 50-component PCA on `x` and return the fitted estimator.

    Args:
        x: 2-D feature matrix.

    Returns:
        The fitted PCA object.
    """
    reducer = PCA(n_components=50)
    # fit() returns the estimator itself.
    return reducer.fit(x)

def transform_PCA(pca,x):
    """Project `x` with an already fitted PCA-like object.

    Args:
        pca: object exposing `transform`.
        x: data to project.

    Returns:
        Whatever `pca.transform(x)` returns.
    """
    projected = pca.transform(x)
    return projected

def tsne_fit_transform(x):
    """Embed `x` into two dimensions with t-SNE.

    Args:
        x: 2-D feature matrix.

    Returns:
        The 2-D embedding returned by TSNE.fit_transform.
    """
    embedder = TSNE(n_components=2, verbose=1,n_iter=100000)
    return embedder.fit_transform(x)
    
def vizualize_latent_space(x,y):
    """Scatter-plot a 2-D embedding coloured by class.

    Args:
        x: array whose first two columns are the plot coordinates.
        y: 2-D label array; the row-wise argmax selects the class name from
            CATEGORY_NAMES.
    """
    sns.set(rc={'figure.figsize':(11.7,8.27)})
    palette = sns.color_palette("bright", 4)
    plt.title('latent space vizualization')
    y_hue = [CATEGORY_NAMES[i] for i in y.argmax(axis=1)]
    # x/y are passed by keyword (recent seaborn releases no longer accept them
    # positionally). hue_order pins each class to the same colour in every plot.
    sns.scatterplot(x=x[:,0], y=x[:,1], hue=y_hue, hue_order=CATEGORY_NAMES,
                    legend='full', palette=palette)

In [None]:
def dbscan(X,eps=10):
    """Fit DBSCAN on `X` and return the fitted estimator.

    Args:
        X: points to cluster.
        eps: neighbourhood radius passed to DBSCAN.

    Returns:
        The fitted DBSCAN object.
    """
    clusterer = DBSCAN(eps = eps)
    # fit() returns the estimator itself.
    return clusterer.fit(X)

def vizualize_clusters(db,X):
    """Print cluster / noise counts and plot the DBSCAN clusters of 2-D points.

    Args:
        db: fitted DBSCAN-like object with `labels_` and `core_sample_indices_`.
        X: array of points; the first two columns are plotted.
    """
    labels = db.labels_
    is_core = np.zeros_like(labels, dtype=bool)
    is_core[db.core_sample_indices_] = True

    unique_labels = set(labels)
    # Label -1 marks noise and is not counted as a cluster.
    n_clusters_ = len(unique_labels) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)

    colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        # Black used for noise.
        face_color = (0, 0, 0, 1) if k == -1 else tuple(col)
        in_cluster = (labels == k)

        # Core samples are drawn first, then the remaining members.
        for selection in (in_cluster & is_core, in_cluster & ~is_core):
            points = X[selection]
            plt.plot(points[:, 0], points[:, 1], 'o', markerfacecolor=face_color,
                     markeredgecolor='k', markersize=5)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
    
    
def get_clusters(db,y):
    """Group sample indices by DBSCAN cluster and find each cluster's majority class.

    Args:
        db: object with a `labels_` sequence (cluster id per sample, -1 = noise).
        y: 2-D label array; the argmax of row i is the class of sample i.

    Returns:
        (clusters, cluster_labels, noise):
        clusters[c] is the list of sample indices in cluster c,
        cluster_labels[c] is the most frequent class in cluster c,
        noise is the list of sample indices labelled -1.
    """
    labels = np.asarray(db.labels_)
    y = np.asarray(y)

    # No samples at all means no clusters (max() of an empty array would raise).
    num_clusters = int(labels.max()) + 1 if labels.size else 0
    num_classes = y.shape[1]

    clusters = [[] for _ in range(num_clusters)]
    noise = []
    for i, label in enumerate(labels):
        if label == -1:
            noise.append(i)
        else:
            clusters[label].append(i)

    cluster_labels = []
    for members in clusters:
        # One vote slot per CLASS. Sizing this by the number of clusters
        # overflows whenever there are fewer clusters than classes.
        votes = np.zeros(num_classes, dtype=int)
        for idx in members:
            votes[y[idx].argmax()] += 1
        cluster_labels.append(int(votes.argmax()))

    return clusters,cluster_labels,noise


def get_incorrectly_predicted_data(clusters,cluster_labels,y,noise = []):
    """Collect indices of noise samples and of samples that disagree with their cluster.

    Args:
        clusters: list of index lists, one per cluster.
        cluster_labels: majority class of each cluster, aligned with `clusters`.
        y: 2-D label array; the argmax of row i is the class of sample i.
        noise: indices of noise samples; they are always included (never mutated).

    Returns:
        Integer numpy array: the noise indices first, then the mismatching
        indices in cluster order.
    """
    y = np.asarray(y)
    incorrect_data = list(noise)

    for members, majority_class in zip(clusters, cluster_labels):
        incorrect_data.extend(idx for idx in members
                              if y[idx].argmax() != majority_class)

    # Force an integer dtype: an empty list would otherwise become a float64
    # array, which cannot be used to index other arrays.
    return np.array(incorrect_data, dtype=int)

# Your code starts HERE!!!

In [None]:
# Load paths + labels, hold out a stratified validation split (default test_size),
# and build the training / validation tf.data pipelines.
x_train,y_train = get_train_data()
x_train,y_train,x_val,y_val = train_val_split(x_train,y_train)
train_dataset,val_dataset = get_train_val_datasets(x_train,y_train,x_val,y_val)

### Class weights

In [None]:
# Balanced class weights of the training split, plotted and kept as a
# {class_index: weight} mapping.
c_w = get_class_weights(y_train)
plot_class_weights(c_w,title = 'Train distribution')
# NOTE(review): c_w_train is not passed to train_model in the cells visible here — confirm whether that is intended.
c_w_train = dict(zip(range(4),c_w))

In [None]:
# Same distribution plot for the validation split, to compare it with the training split.
plot_class_weights(get_class_weights(y_val),title = 'Validation distribution')

### Preparing model

In [None]:
# Build and compile the classifier inside the distribution strategy scope.
model = get_model()

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

### Train model

In [None]:
# Fit for 10 epochs with validation; steps per epoch uses train_model's default (22).
hist = train_model(model,train_dataset,val_dataset,epochs=10)

### Visualize history

In [None]:
# Plot the accuracy and loss curves of the training run above.
vizualize_history(hist)

### Prepare test dataset

In [None]:
# Resolve the test image paths in the original competition dataset and wrap them in a batched pipeline.
x_test = get_test_data(ORIGINAL_DATA_IMAGES_PATH)
test_dataset = get_test_dataset(x_test)

### Get predictions

In [None]:
#model.load_weights(MODEL_CFG['name'])
#model.load_weights('../input/model-effnet/effNetPlants_val_loss_0.099.h5')

In [None]:
# Test-time-augmented prediction: the mean over get_predict's default 5 augmented runs.
y_pred = get_predict(model,test_dataset)

### Saving results

In [None]:
# Write the predictions to submission.csv.
save_results(y_pred)

### Confusion matrix for validation dataset

In [None]:
# Plain (non-augmented) predictions on the validation pipeline, compared against y_val.
# val_dataset is not shuffled, so prediction rows line up with y_val rows.
val_pred = get_predict(model,val_dataset,USE_TTA = False)
val_conf_matrix = compute_confusion_matrix(y_val,val_pred)
plot_confusion_matrix(val_conf_matrix)

### Latent space visualization for full trainset

In [None]:
# Embed the full training set with the layer just before the classifier head.
# The PCA fitted on these features is reused below on features from the same
# layer, so both must come from one feature space. 'stem_bn' yields a 4-D
# feature map that PCA cannot consume.
x_emb,y_emb = get_train_data()
train_emb = get_train_emb(x_emb,y_emb)
x = get_intermediate_layer_output(model,train_emb,layer_name = model.layers[-2].name)

In [None]:
# Fit PCA on the extracted features (`pca` is reused by later cells), then
# reduce the PCA output to two dimensions with t-SNE.
pca = fit_PCA(x)
pca_result = transform_PCA(pca,x)
pca_tsne_result = tsne_fit_transform(pca_result)

# Colour the 2-D points by the labels of the full training set.
vizualize_latent_space(pca_tsne_result,y_emb)

### Latent space visualization for valset

In [None]:
val_emb = get_val_emb(x_val,y_val)
# NOTE(review): the features are extracted from train_emb, not val_emb. The
# clustering cells below index the result with x_emb / y_emb, so it has to
# cover the full training set — confirm that this is the intent.
# The layer is selected by name: get_layer() treats an integer as a name.
x = get_intermediate_layer_output(model,train_emb,layer_name = model.layers[-2].name)

In [None]:
# Project with the PCA fitted earlier, then reduce to two dimensions with t-SNE.
pca_result = transform_PCA(pca,x)
pca_tsne_result = tsne_fit_transform(pca_result)

# `x` was extracted from train_emb, which was built from (x_emb, y_emb), so the
# points must be coloured with y_emb. y_train is the post-split subset and has
# a different length and order.
vizualize_latent_space(pca_tsne_result,y_emb)

### Clustering

In [None]:
# Cluster the 2-D t-SNE points with DBSCAN (neighbourhood radius eps = 10).
db = dbscan(pca_tsne_result,eps = 10)

In [None]:
# Print the cluster / noise counts and plot the clusters.
vizualize_clusters(db,pca_tsne_result)

In [None]:
# Majority class per cluster, then the indices of noise points and of samples
# whose label differs from their cluster's majority class.
# Assumes the rows of pca_tsne_result are in the same order as y_emb — TODO confirm.
clusters,cluster_labels,noise = get_clusters(db,y_emb)
incorrect_predicts = get_incorrectly_predicted_data(clusters,cluster_labels,y_emb,noise)

### Getting incorrectly predicted data for learning

In [None]:
# Convert the path list to an array so it can be indexed with the index array,
# then build a training-only pipeline from the selected samples.
x_emb = np.array(x_emb)
x_incorrect,y_incorrect = x_emb[incorrect_predicts],y_emb[incorrect_predicts]
incorrect_train_dataset = get_train_val_datasets(x_incorrect,y_incorrect)

### Train model using incorrectly predicted data

In [None]:
# Continue training on the selected samples for 7 epochs (default 22 steps per epoch).
# NOTE(review): no validation data is passed here, while every callback in
# CALLBACKS monitors val_loss — confirm the callbacks are meant to be inactive for this run.
hist = train_model(model,incorrect_train_dataset,epochs=7)

In [None]:
# Re-embed the full training set after the extra training round.
# The layer is selected by name: get_layer() treats an integer as a name.
x_emb,y_emb = get_train_data()
train_emb = get_train_emb(x_emb,y_emb)
x = get_intermediate_layer_output(model,train_emb,layer_name = model.layers[-2].name)

In [None]:
# Project the new features with the PCA fitted earlier, then run t-SNE again.
pca_result = transform_PCA(pca,x)
pca_tsne_result = tsne_fit_transform(pca_result)

# Colour the 2-D points by the labels of the full training set.
vizualize_latent_space(pca_tsne_result,y_emb)

In [None]:
# Rebuild the test pipeline for the second round of predictions.
x_test = get_test_data(ORIGINAL_DATA_IMAGES_PATH)
test_dataset = get_test_dataset(x_test)

In [None]:
# Test-time-augmented prediction with the further-trained model.
y_pred = get_predict(model,test_dataset)

In [None]:
# Overwrite submission.csv with the new predictions.
save_results(y_pred)