In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#####################################
# Libraries
#####################################
# Common libs
import pandas as pd
import numpy as np
import sys
import os
import random
from pathlib import Path

# Image processing
import imageio
import cv2
import skimage.transform
#from skimage.transform import rescale, resize, downscale_local_mean

# Charts
import matplotlib.pyplot as plt
import seaborn as sns

# ML, statistics
import scipy
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score

# Tensorflow
#from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam, RMSprop

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#INPUT_DIR = "/kaggle/input/plant-pathology-2021-fgvc8/"
#PRETRAINED_DIR = "/kaggle/input/pretrained-model/"
#NUM_CLASSES = 6 #12

INPUT_DIR = "../input/plant-pathology-2021-fgvc8/"
PRETRAINED_DIR = "../input/pretrained-model/"
NUM_CLASSES = 6

IMAGE_SIZE= (224,224)  #(256,256)

BATCH_SIZE = 32





In [2]:
class ModelTrainer:
    """
    Create and fit the model
    """
    
    def __init__(self):
        #(18632, 2) (17700, 2) (932, 2)
        #15930 , 1770, 932
        
        self.nb_train_samples = 15930 #4275 #32706
        self.nb_validation_samples = 1770 #475 #10902
        self.batch_size=512 #512 #32 #1024   #32
               
        
        self.img_width = IMAGE_SIZE[0]
        self.img_height = IMAGE_SIZE[1]
        print(self.img_width,self.img_height)
        
    
    def create_model(self, model_name):
        if model_name == "Xception" :
                model = self.create_model_1()
        elif model_name == "XceptionDenseNet121": 
                model = self.create_model_3()
        else:
                print("Model not found")
        self.model_name = model_name
        return model
    

    #Total params: 20,886,068
    def create_model_1(self):
        pretrained_model = tf.keras.applications.xception.Xception(include_top=False, weights='imagenet', input_shape=(self.img_width, self.img_height, 3))
        model = tf.keras.models.Sequential([
            pretrained_model,
            tf.keras.layers.GlobalAveragePooling2D(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(NUM_CLASSES,activation='softmax')
        ])
    
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        #tf.keras.utils.plot_model(densenet_model, to_file='densenet_model.png')
        return model

    def create_model_3(self):
        path1 = PRETRAINED_DIR + 'xception_weights_tf_dim_ordering_tf_kernels_notop.h5'
        xception_model = tf.keras.models.Sequential([
           tf.keras.applications.xception.Xception(include_top=False, weights=None, input_shape=(self.img_width, self.img_height, 3)),#(512, 512, 3)),
           tf.keras.layers.GlobalAveragePooling2D(),
           #tf.keras.layers.Dense(NUM_CLASSES,activation='softmax')
           tf.keras.layers.Dense(NUM_CLASSES,activation='sigmoid')
        ])
        # Freezing the weights
        #for layer in xception_model.layers[:-1]:
        #    layer.trainable=False
            
        #xception_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        #xception_model.summary()
        path2 = PRETRAINED_DIR + 'densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5'
        densenet_model = tf.keras.models.Sequential([
            tf.keras.applications.densenet.DenseNet121(include_top=False, weights=None,input_shape=(self.img_width, self.img_height, 3)),#(512, 512, 3)),
            tf.keras.layers.GlobalAveragePooling2D(),
            #tf.keras.layers.Dense(NUM_CLASSES,activation='softmax')
            tf.keras.layers.Dense(NUM_CLASSES,activation='sigmoid')
        ])
        #for layer in densenet_model.layers[:-1]:
        #    layer.trainable=False
        #densenet_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        #densenet_model.summary()
        
        inputs = tf.keras.Input(shape=(self.img_width, self.img_height, 3)) #(512, 512, 3))

        xception_output = xception_model(inputs)
        densenet_output = densenet_model(inputs)

        outputs = tf.keras.layers.average([densenet_output, xception_output])


        model = tf.keras.Model(inputs=inputs, outputs=outputs)
        return model

In [3]:
trainer = ModelTrainer()
model = trainer.create_model("XceptionDenseNet121") # 0.9062 Total params: 27,935,872
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.load_weights(PRETRAINED_DIR+'best_XceptionDenseNet121_v2.h5')


224 224


In [12]:
#classes = ['complex', 'frog_eye_leaf_spot', 'frog_eye_leaf_spot complex', 'healthy', 'powdery_mildew', 'powdery_mildew complex', 'rust', 'rust complex', 'rust frog_eye_leaf_spot', 'scab', 'scab frog_eye_leaf_spot', 'scab frog_eye_leaf_spot complex']
classes = ['complex', 'frog_eye_leaf_spot', 'healthy', 'powdery_mildew', 'rust', 'scab' ]# 'frog_eye_leaf_spot complex','powdery_mildew complex', 'rust complex', 'rust frog_eye_leaf_spot', 'scab frog_eye_leaf_spot', 'scab frog_eye_leaf_spot complex']
class_names = ['complex', 'frog_eye_leaf_spot', 'healthy', 'powdery_mildew', 'rust', 'scab']

class Submitter:
    """
    Predict and submit
    """
    def __init__(self, model, img_size):
        self.model = model
        batch_size=BATCH_SIZE
        print("Initializing submitter")
        #Submission generator
        # flow_from_directory for input/test didn't work for me, so quick fix is to use flow_from_dataframe with list of files
        # Load list of files from test folder into dataframe
        self.test_files_df=pd.DataFrame()
        TEST_DIR = INPUT_DIR + 'test_images1/'
        self.test_files_df['image']=os.listdir(TEST_DIR)
        print("Loaded test files list")
        
        
        # Create generator in it
        #self.generator=ImageDataGenerator(rescale=1./255.).flow_from_dataframe(
        _test_datagen=ImageDataGenerator(rescale=1./255.)
        self.generator = _test_datagen.flow_from_dataframe(
                    dataframe=self.test_files_df,
                    directory=TEST_DIR,
                    x_col="image",
                    y_col=None,
                    #has_ext=True,
                    class_mode=None,
                    batch_size=batch_size,
                    seed=42,
                    shuffle=False,
                    target_size=img_size)    
        
        print('Submission generator created')    


    def predict_for_submit(self):
        """
        Predict submission test data and form dataframe to submit
        """
        print("Forming submission dataframe...")
        # Predict
        aa= self.model.predict_proba(self.generator)
        print(aa)
        print(aa[:,1])
            
        y_pred = self.model.predict(self.generator)
        y_pred = y_pred.tolist()
        print(y_pred)
        #y_pred = np.argmax(y_pred, axis=1)
        #print(y_pred)
        
        indices = []
        for pred in y_pred:
            temp = []
            for category in pred:
                #print(category)
                if category>=0.3:
                    temp.append(pred.index(category))
            print(temp)
            if temp!=[]:
                indices.append(temp)
            else:
                temp.append(np.argmax(pred))
                indices.append(temp)
        
        print(indices)
        
        y_pred_str = []        
        for image in indices:
            temp = []
            for i in image:
                #if i > 5:
                #    temp.append(classes[0])
                #else:
                temp.append(classes[i])
                    
            y_pred_str.append(' '.join(temp))

        print(y_pred_str)
        
        self.test_files_df['labels'] = y_pred_str
        # Write to csv
        self.test_files_df.to_csv('./submission.csv', index=False)
        print("Submission completed: written submission.csv")
        return self.test_files_df

In [13]:
# Get dataframe for submission
submitter = Submitter(model, IMAGE_SIZE)
submission_df = submitter.predict_for_submit()     
submission_df.head()

Initializing submitter
Loaded test files list
Found 10 validated image filenames.
Submission generator created
Forming submission dataframe...


AttributeError: 'Functional' object has no attribute 'predict_proba'

In [6]:
submission_df.head(30)

Unnamed: 0,image,labels
0,800113bb65efe69e.jpg,healthy powdery_mildew
1,8002cb321f8bfcdf.jpg,complex frog_eye_leaf_spot
2,80070f7fb5e2ccaa.jpg,scab
3,80077517781fb94f.jpg,scab
4,800cbf0ff87721f8.jpg,complex frog_eye_leaf_spot
5,800edef467d27c15.jpg,healthy
6,800f85dc5f407aef.jpg,rust
7,85f8cb619c66b863.jpg,scab
8,ad8770db05586b59.jpg,complex frog_eye_leaf_spot scab
9,c7b03e718489f3ca.jpg,frog_eye_leaf_spot


In [7]:
all_df = pd.read_csv(INPUT_DIR + "train.csv")
all_df.head(10)

Unnamed: 0,image,labels
0,800113bb65efe69e.jpg,healthy
1,8002cb321f8bfcdf.jpg,scab frog_eye_leaf_spot complex
2,80070f7fb5e2ccaa.jpg,scab
3,80077517781fb94f.jpg,scab
4,800cbf0ff87721f8.jpg,complex
5,800edef467d27c15.jpg,healthy
6,800f85dc5f407aef.jpg,cider_apple_rust
7,801d6dcd96e48ebc.jpg,healthy
8,801f78399a44e7af.jpg,complex
9,8021b94d437eb7d3.jpg,healthy


In [8]:
#!cp /kaggle/input/notebook-sam/best_XceptionDenseNet121.h5 .

#import os
#print(os.listdir("/kaggle/input/"))
#os.chdir(r'/kaggle/working')
#from IPython.display import FileLink
#FileLink(r'./best_XceptionDenseNet121.h5')

In [9]:
def get_labels(prediction,threshold):
  pred = []
  idx = np.where(prediction>threshold)[0]
  for i in idx:
    pred.append(class_names[i])
  pred = ' '. join(pred)
  if len(pred) == 0:
    pred = []
    idx = np.argmax(prediction)
    pred.append(class_names[idx])
    pred = ' '. join(pred)
    return pred
  else :
    return pred

In [10]:
train = pd.DataFrame()
train['image'] = all_df['image']
train['labels'] = all_df['labels'].apply(lambda string: string.split(' '))

data = train #.sample(5000)
print(data.shape)
data.head()

(18632, 2)


Unnamed: 0,image,labels
0,800113bb65efe69e.jpg,[healthy]
1,8002cb321f8bfcdf.jpg,"[scab, frog_eye_leaf_spot, complex]"
2,80070f7fb5e2ccaa.jpg,[scab]
3,80077517781fb94f.jpg,[scab]
4,800cbf0ff87721f8.jpg,[complex]


In [11]:
TRAIN_DATA_DIR = INPUT_DIR + 'train_images'

full_datagen = ImageDataGenerator(rescale = 1./255)

full_set = full_datagen.flow_from_dataframe(dataframe=data,
                                    directory=TRAIN_DATA_DIR,
                                    x_col='image',
                                    y_col= None,                                    
                                    target_size = IMAGE_SIZE,
                                    batch_size = BATCH_SIZE,
                                    shuffle=False,
                                    validate_filenames=False,
                                    class_mode=None)



predicts = model.predict(full_set) 

Found 18632 non-validated image filenames.


KeyboardInterrupt: 

In [None]:
print(predicts.shape)

In [None]:
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
num_classes = 6
roc_auc_macro = []

for t in thresholds:
  labels = []
  for i in range(len(predicts)):
    pred = predicts[i]

    labels.append(get_labels(prediction = pred, threshold = t))
  data["pred"]= labels

  from sklearn.preprocessing import MultiLabelBinarizer
  mlb = MultiLabelBinarizer()
  # binarize truth 
  mlb.fit(data["labels"])

  y_test = mlb.transform(data["labels"])
  # binarize pred
  data["pred"] = data["pred"].str.split(" ")
  y_score = mlb.transform(data["pred"]) 

  from sklearn.metrics import roc_curve, auc
  from scipy import interp
  from itertools import cycle
  # Compute ROC curve and ROC area for each class
  fpr = dict()
  tpr = dict()
  thresholds = dict()
  roc_auc = dict()
  for i in range(num_classes):
      fpr[i], tpr[i], thresholds[i] = roc_curve(y_test[:, i], y_score[:, i])
      roc_auc[i] = auc(fpr[i], tpr[i])

  # Compute micro-average ROC curve and ROC area
  fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
  roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
  all_fpr = np.unique(np.concatenate([fpr[i] for i in range(num_classes)]))

  # Then interpolate all ROC curves at this points
  mean_tpr = np.zeros_like(all_fpr)
  for i in range(num_classes):
      mean_tpr += interp(all_fpr, fpr[i], tpr[i])

  # Finally average it and compute AUC
  mean_tpr /= num_classes

  fpr["macro"] = all_fpr
  tpr["macro"] = mean_tpr
  roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

  roc_auc_macro.append(round(roc_auc["macro"],2))

  print("Threshold:",t,"ROC: ",round(roc_auc["macro"],2))

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
# binarize truth 
mlb.fit(data["labels"])

y_true = mlb.transform(data["labels"])#.tolist()
y_pred = predicts#.tolist()
print(y_true[0])
print(y_pred[0])
print(y_true.shape, y_pred.shape)

In [None]:



# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(num_classes):
    fpr[i], tpr[i], thresholds[i] = roc_curve(y_true[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot of a ROC curve for a specific class
for i in range(num_classes):
    plt.figure()
    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Class ' + str(i) + " "+ class_names[i])
    plt.legend(loc="lower right")
    plt.show()

In [None]:
print(y_pred[0], y_true[0])

#labels.append(get_labels(prediction = pred, threshold = t))
#y_pred = y_pred.tolist()
indices = []
for pred in y_pred:
    temp = []
    for category in pred:
        #print(category)
        if category>=0.2:
            temp.append(pred.index(category))
    #print(temp)
    if temp!=[]:
        indices.append(temp)
    else:
        temp.append(np.argmax(pred))
        indices.append(temp)

#print(indices)
y_pred1 = []
for pred in indices:
    item = []
    for i in range(6):
        if i in pred :
            item.append(1)
        else:
            item.append(0)
    #print(item)
    y_pred1.append(item)

print(y_pred1[0])
y_true1 = y_true[:,:-1].tolist()
print(y_true1[0])

In [None]:
print(metrics.classification_report( y_pred1, y_true1))

In [None]:
#fpr, tpr, thresholds = roc_curve(y_true, y_pred)
res = pd.DataFrame({'FPR': fpr[0], 'TPR': tpr[0], 'Threshold': thresholds[0]})  #0 0.25  #1 0.7
aa = res[['TPR', 'FPR', 'Threshold']]

In [None]:
pd.set_option("display.max_rows", 999)
aa.loc[(aa['TPR'] >= 0.5)]

In [None]:
df = pd.DataFrame()
df["thresholds"]= thresholds
df["roc_auc_macro"]= roc_auc_macro
df.sort_values("roc_auc_macro", inplace=True, ascending=False)
df.reset_index(inplace=True, drop = True)
df.head(10)