In [None]:
import numpy as np, seaborn as sns, pandas as pd, matplotlib.pyplot as plt, os, cv2, tensorflow as tf, keras, math, plotly.express as px
from keras.applications.inception_v3 import InceptionV3
# Record the Keras and TensorFlow versions this run used, so results can be tied to a build.
print(keras.__version__, tf.__version__)

In [None]:
# Image geometry used throughout the notebook: square side length in pixels, colour channels.
size = 227
n_channels = 3

# Ratings table; the image-loading loop unpacks each row as (image path, rating).
df = pd.read_csv('/kaggle/input/tomato-class/rating.csv')

# Bar chart of how many rows carry each rating value.
rating_counts = df.groupby('rating').count()
axes = rating_counts.plot(kind="bar")
axes.set_title("Rating Distribution in the data set", fontsize=14, pad=20)
axes.set_xlabel("Rating", fontsize=14)
axes.set_ylabel("Count", fontsize=14)
# NOTE(review): which='minor' only styles minor ticks; 'major' or 'both' may have been intended.
axes.tick_params(axis='both', which='minor', labelsize=14)
axes.legend().remove()

# Write each bar's height a little above the bar.
for bar in axes.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    axes.annotate(np.round(bar_top, decimals=2), (bar_centre, bar_top),
                  ha='center', va='center', rotation=0,
                  xytext=(2, 20), textcoords='offset points')

# Fixed y-range that leaves head-room for the annotations; counts above 220 would be clipped.
plt.ylim([0, 220])
plt.show()
# Rows rated -1 are excluded from both classes (presumably "not rated" -- confirm with the data owner).
df = df[df['rating'] != -1]

images = {'sick': [], 'healthy': []}
# Image paths in the table are opened relative to this directory.
os.chdir('/kaggle/input/tomato-class/labeled_data')
for row_label, (image, rating) in df.iterrows():
    # Ratings in [4, 6) count as healthy; every other remaining rating counts as sick.
    label = 'healthy' if 4 <= rating < 6 else 'sick'
    pixels = cv2.resize(plt.imread(image), (size, size))
    images[label].append(pixels.astype(np.int16))

In [None]:
# One-hot targets: column 0 marks "healthy", column 1 marks "sick".
target_to_one_hot_vec = {'healthy':np.array([1, 0]), 'sick':np.array([0, 1])}

class_order = ['healthy', 'sick']
total_images = 0
for key in class_order:
    class_count = len(images[key])
    total_images += class_count
    print(f'Total Number of {key}: {class_count}')
print('Total Number of images is :', total_images)

# Preallocate the image tensor and label matrix, then fill them class by class.
# Resulting row order is every healthy image first, then every sick image
# (i.e. NOT the row order of df).
data_x = np.zeros((total_images, size, size, n_channels))
data_y = np.zeros((total_images, 2))
labelled_images = ((key, img) for key in class_order for img in images[key])
for row, (key, img) in enumerate(labelled_images):
    data_x[row] = img
    data_y[row] = target_to_one_hot_vec[key]

In [None]:
# Show one randomly chosen image per class, titled with the class name.
# (Original author's note: these images are from the external dataset.)
fig, ax = plt.subplots(ncols=2, figsize=(15, 13))
for axis, label in zip(ax.flatten(), images):
    class_images = images[label]
    axis.title.set_text(label)
    if not class_images:
        # Nothing to draw for an empty class; leave the panel blank instead of crashing.
        continue
    # Sample over the actual class size rather than a hard-coded upper bound, so the
    # index is always valid and every image of the class can be picked.
    pick = np.random.randint(len(class_images))
    axis.imshow(class_images[pick])

In [None]:
# Preview of the cosine-shaped learning-rate curve for epochs 1..100:
# lr = 5e-4/2 * (cos(pi * epoch / 101) + 1), i.e. it decays from about 5e-4 towards 0.
# NOTE(review): Keras' LearningRateScheduler passes 0-based epoch numbers, so during
# training the same formula is evaluated from epoch 0 -- confirm the 1-based axis is intended.
schedule_epochs = list(range(1, 101))
lrs = [5e-4/2 * (math.cos((math.pi * (epoch % 101)) / (101)) + 1)
       for epoch in schedule_epochs]

plt.subplots(figsize=(12, 7))
plt.title('Learning Schedule')
plt.xlabel('Epoch')
plt.ylabel('Learning Rate')
sns.lineplot(x=schedule_epochs, y=lrs)

In [None]:
# Random augmentation for training batches: horizontal/vertical flips, rotations of up to
# 180 degrees, zoom range 0.1 and width/height shift range 0.2.
augmentation = dict(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=180,
    zoom_range=.1,
    width_shift_range=.2,
    height_shift_range=.2,
)
gen = keras.preprocessing.image.ImageDataGenerator(**augmentation)

# One checkpoint file per model. With save_best_only=True a file is only rewritten when the
# monitored quantity improves (the original comments describe this as "val_loss goes down").
mc, mc2 = (
    keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, verbose=True, save_best_only=True)
    for checkpoint_path in ('/kaggle/working/model.h5', '/kaggle/working/model2.h5')
)

def aggressive_lrs(epoch, _):
    """Return the cosine-decayed learning rate for ``epoch``.

    The rate is ``5e-4`` at epoch 0 and follows half a cosine wave downwards;
    the phase is taken modulo 101, so the curve restarts every 101 epochs.

    Args:
        epoch: Epoch number supplied by the scheduler callback.
        _: Current learning rate supplied by the callback; ignored.

    Returns:
        The learning rate to use, as a float in ``(0, 5e-4]``.
    """
    phase = (math.pi * (epoch % 101)) / 101
    half_peak = 5e-4 / 2  # half of the initial learning rate (5e-4)
    return half_peak * (math.cos(phase) + 1)
 
# Callback that queries aggressive_lrs for the learning rate to use in each epoch.
lr = keras.callbacks.LearningRateScheduler(aggressive_lrs)

In [None]:
from sklearn.model_selection import train_test_split
# Split the row positions of data_x / data_y into train (56%), validation (14%) and test (30%).
# Positions are generated as exact integers (no float linspace + round) and kept in a
# 64-bit dtype: int16 would silently wrap around above 32767 samples.
SPLIT_SEED = 42  # fixed seed so the split, and every metric computed from it, is reproducible
indices = pd.Series(np.arange(data_x.shape[0]))
X_train, X_test = train_test_split(indices, test_size=0.3, random_state=SPLIT_SEED)
X_train, X_valid = train_test_split(X_train, test_size=0.2, random_state=SPLIT_SEED)
train_indices = X_train.values.astype(np.int64)
valid_indices = X_valid.values.astype(np.int64)
test_indices = X_test.values.astype(np.int64)
print(len(train_indices), len(valid_indices), len(test_indices))

In [None]:
# Work from the writable directory so the repository clone below lands there.
os.chdir('/kaggle/working/')
# Obtain the EfficientNet Keras implementation: clone the repository and also
# install/upgrade the published package (needs network access; no version is pinned).
!git clone https://github.com/qubvel/efficientnet.git
!pip install -U efficientnet
# NOTE(review): the doubled package path looks like it targets the cloned checkout
# (<cwd>/efficientnet/efficientnet/keras.py) rather than the pip-installed package -- confirm.
import efficientnet.efficientnet.keras as efn
# EfficientNetB6 with ImageNet weights and no classification head; pooling='avg' requests
# average-pooled features as the backbone output.
md  = efn.EfficientNetB6(weights='imagenet', include_top=False, input_shape=(size, size, 3), pooling='avg')

In [None]:
# Classifier 1: the EfficientNetB6 backbone followed by two ReLU layers and a 2-way softmax.
head_layers = [
    keras.layers.Dense(900, activation='relu'),
    keras.layers.Dense(800, activation='relu'),
    keras.layers.Dense(2, activation='softmax'),
]
model = keras.models.Sequential([md, *head_layers])
# Further metrics that were considered but are disabled:
# tf.keras.metrics.AUC(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()
model.compile(optimizer=keras.optimizers.Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Class balance of every split; column 0 of data_y is 1 for healthy and 0 for sick.
for split_name, split_rows in (('train', train_indices),
                               ('validation', valid_indices),
                               ('test', test_indices)):
    healthy_flags = data_y[split_rows][:, 0]
    print(f'Total healthy in {split_name} is: {int((healthy_flags == 1).sum())}')
    print(f'Total sick in {split_name} is: {int((healthy_flags == 0).sum())}')

# Train on augmented batches of 8; the validation data is passed without augmentation.
train_batches = gen.flow(data_x[train_indices], data_y[train_indices], batch_size=8)
validation_set = (data_x[valid_indices], data_y[valid_indices])
model.fit_generator(train_batches, epochs=10,
                    validation_data=validation_set,
                    callbacks=[mc, lr])

In [None]:
# Classifier 2: ImageNet-initialised InceptionV3 (average-pooled, no top) with a 2-way softmax head.
base_model = InceptionV3(include_top=False, weights='imagenet', pooling='avg',
                         input_shape=(size, size, 3))
model2 = keras.models.Sequential()
model2.add(base_model)
model2.add(keras.layers.Dense(2, activation='softmax'))
# Further metrics that were considered but are disabled:
# tf.keras.metrics.AUC(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()
model2.compile(optimizer=keras.optimizers.Adam(),
               loss='categorical_crossentropy',
               metrics=['accuracy'])

# NOTE(review): unlike the EfficientNet run, no learning-rate scheduler is attached here,
# only the checkpoint callback -- confirm that this asymmetry is intentional.
train_batches2 = gen.flow(data_x[train_indices], data_y[train_indices], batch_size=8)
validation_set2 = (data_x[valid_indices], data_y[valid_indices])
model2.fit_generator(train_batches2, epochs=10,
                     validation_data=validation_set2,
                     callbacks=[mc2])

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import auc
# Reload the checkpointed EfficientnetB6 model, then return to the image directory.
mdl = keras.models.load_model('/kaggle/working/model.h5')
os.chdir('/kaggle/input/tomato-class/labeled_data')

test_x = data_x[test_indices]
test_y = data_y[test_indices]

print("Evaluate on test data")
results = mdl.evaluate(test_x, test_y, batch_size=8)
print(results)

# Class scores for the test rows; argmax turns one-hot targets / scores into 0 = healthy, 1 = sick.
y_pred_keras = mdl.predict(test_x)
y_true = test_y.argmax(axis=1)
y_pred = y_pred_keras.argmax(axis=1)

print('Confusion Matrix for EfficientnetB6')
print(confusion_matrix(y_true,y_pred))
print('Classification Report for EfficientnetB6')
target_names = ['Healthy', 'Sick']
print(classification_report(y_true, y_pred, target_names=target_names))

# ROC over the flattened one-hot targets and scores, i.e. both class columns pooled together.
fpr_keras, tpr_keras, thresholds_keras = roc_curve(test_y.ravel(), y_pred_keras.ravel())
auc_keras = auc(fpr_keras, tpr_keras)

In [None]:
# Reload the checkpointed InceptionV3 model, then return to the image directory.
os.chdir('/kaggle/working/')
mdl2 = keras.models.load_model('model2.h5')
os.chdir('/kaggle/input/tomato-class/labeled_data')

print("Evaluate on test data")
results2 = mdl2.evaluate(data_x[test_indices], data_y[test_indices], batch_size=8)
print(results2)

# Class scores for the test rows; argmax maps them to 0 = healthy, 1 = sick.
y_pred_keras2 = mdl2.predict(data_x[test_indices])
y_pred2 = np.argmax(y_pred_keras2, axis=1)
y_true2 = np.argmax(data_y[test_indices], axis=1)

print('Confusion Matrix for InceptionV3')
print(confusion_matrix(y_true2,y_pred2))
print('Classification Report For InceptionV3')
target_names = ['Healthy', 'Sick']
print(classification_report(y_true2, y_pred2, target_names=target_names))

# ROC/AUC for InceptionV3, computed the same way as for EfficientnetB6 (flattened one-hot
# targets vs. flattened scores). The ROC plotting cell reads fpr_keras2 / tpr_keras2 /
# auc_keras2, so they have to exist before that cell runs in a top-to-bottom execution.
fpr_keras2, tpr_keras2, thresholds_keras2 = roc_curve(data_y[test_indices].ravel(), y_pred_keras2.ravel())
auc_keras2 = auc(fpr_keras2, tpr_keras2)

In [None]:
# ROC curves of both models, drawn twice: the full unit square and a zoomed view.
# NOTE(review): fpr_keras2, tpr_keras2 and auc_keras2 must already be defined when this runs.
roc_curves = [
    (fpr_keras, tpr_keras, 'EfficientnetB6 (area = {:.3f})'.format(auc_keras)),
    (fpr_keras2, tpr_keras2, 'InceptionV3 (area = {:.3f})'.format(auc_keras2)),
]
# (figure number, title, optional (xlim, ylim)); the second view zooms into the top-left corner.
roc_views = [
    (1, 'ROC curve', None),
    (2, 'ROC curve (zoomed in at top left)', ((0, 0.6), (0.4, 1))),
]
for figure_number, figure_title, limits in roc_views:
    plt.figure(figure_number)
    if limits is not None:
        x_limits, y_limits = limits
        plt.xlim(*x_limits)
        plt.ylim(*y_limits)
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    for fpr, tpr, curve_label in roc_curves:
        plt.plot(fpr, tpr, label=curve_label)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(figure_title)
    plt.legend(loc='best')
plt.show()

In [None]:
# Compare both models on every labelled image (all rows of data_x, not only the test split).
# TODO: a 5x2cv paired t-test between the two models (paired_ttest_5x2cv) was sketched
# here but left disabled; add a significance test if the comparison needs one.


def _prediction_frame(scores, row_labels, suffix):
    """Wrap an (n, 2) score array as a frame with healthy/sick/class columns.

    `suffix` is appended to every column name so both models can live side by side
    in df; the class column is True when the healthy score exceeds the sick score.
    """
    healthy_col, sick_col, class_col = f'healthy{suffix}', f'sick{suffix}', f'class{suffix}'
    frame = pd.DataFrame(scores, index=row_labels, columns=[healthy_col, sick_col])
    frame[class_col] = frame[healthy_col] > frame[sick_col]
    return frame


def _recall(predicted_positive, actual_positive):
    """Fraction of the actual positives that were also predicted positive (NaN if none)."""
    n_actual = int(actual_positive.sum())
    if n_actual == 0:
        return float('nan')
    return int((predicted_positive & actual_positive).sum()) / n_actual


# Same healthy rule as the image-loading loop: ratings in [4, 6) are healthy, the rest sick.
is_healthy = (df['rating'] >= 4) & (df['rating'] < 6)

# data_x / data_y hold every healthy image first and then every sick one, each in df order.
# Predictions therefore have to be labelled with df's index in THAT order before they are
# joined back to df; labelling them with df.index directly pairs scores with the wrong rows.
rows_in_data_order = df.index[is_healthy.values].append(df.index[(~is_healthy).values])
assert len(rows_in_data_order) == data_x.shape[0], 'df and data_x are out of sync'

# Model 1 (EfficientnetB6). Stored under new names so the test-split predictions from the
# evaluation cells are not overwritten and this cell can be re-run safely.
y_pred_all = mdl.predict(data_x)
y_preds = _prediction_frame(y_pred_all, rows_in_data_order, '')
y_preds_bool = y_preds['class'].tolist()

# Ground truth in data_x order: True where the one-hot target says healthy.
y_bool = (data_y[:, 0] > data_y[:, 1]).tolist()

# Model 2 (InceptionV3).
y_pred_all2 = mdl2.predict(data_x)
y_preds2 = _prediction_frame(y_pred_all2, rows_in_data_order, '2')
y_preds_bool2 = y_preds2['class2'].tolist()

# ROC/AUC of model 2 on the test rows (flattened one-hot targets vs. flattened scores).
fpr_keras2, tpr_keras2, thresholds_keras2 = roc_curve(data_y[test_indices].ravel(),
                                                      y_pred_all2[test_indices].ravel())
auc_keras2 = auc(fpr_keras2, tpr_keras2)

# Attach both models' scores to df, aligned on the index. Columns left over from an earlier
# run of this cell are dropped first so that re-running it does not fail or duplicate columns.
prediction_columns = list(y_preds.columns) + list(y_preds2.columns)
df = df.drop(columns=prediction_columns, errors='ignore').join(y_preds).join(y_preds2)

# Per-row correctness of each model: predicted healthy on a healthy rating, or predicted
# sick on a sick rating.
md1_mask = ((df['healthy'] >= 0.5) & is_healthy) | ((df['healthy'] < 0.5) & ~is_healthy)
md2_mask = ((df['healthy2'] >= 0.5) & is_healthy) | ((df['healthy2'] < 0.5) & ~is_healthy)

# Agreement between the two models: model 1's class is passed as "true" and model 2's as
# "predicted". The classes are False (= sick) and True (= healthy), in that sorted order.
print('Confusion Matrix for All')
print(confusion_matrix(y_preds_bool, y_preds_bool2))
print('Classification Report For All')
print(classification_report(y_preds_bool, y_preds_bool2, target_names=['Sick', 'Healthy']))

# Share of each true class that a model recognises. This is recall (TP / actual positives),
# not precision, so it is reported under that name.
print(f"Recall for healthy is: {_recall(df['healthy'] >= 0.5, is_healthy)}")
print(f"Recall for sick is: {_recall(df['sick'] >= 0.5, ~is_healthy)}")
print(f"Recall for healthy2 is: {_recall(df['healthy2'] >= 0.5, is_healthy)}")
print(f"Recall for sick2 is: {_recall(df['sick2'] >= 0.5, ~is_healthy)}")

# Head-to-head tally; "we" is model 1 (EfficientnetB6).
both_right = md1_mask & md2_mask
print(f'both correct {both_right.sum()}')
both_wrong = (~md1_mask) & (~md2_mask)  # both models wrong, hence AND rather than OR
print(f'both wrong {both_wrong.sum()}')
we_win = md1_mask & (~md2_mask)
print(f'we Won in {we_win.sum()}')
we_lose = md2_mask & (~md1_mask)
print(f'we Lose in {we_lose.sum()}')

In [None]:
# Classify the user-supplied images 0.jpg .. 5.jpg with the EfficientnetB6 checkpoint.
os.chdir('/kaggle/working/')
mdl = keras.models.load_model('model.h5')
os.chdir('/kaggle/input/user-test')

n_user_images = 6  # the files are named 0.jpg .. (n_user_images - 1).jpg
user_test = np.zeros((n_user_images, size, size, n_channels))
for i in range(n_user_images):
    user_test[i] = cv2.resize(plt.imread(f'{i}.jpg'), (size, size)).astype(np.int16)

results = mdl.predict(user_test)
# Index of the larger score per image: 0 = healthy, 1 = sick (column order of the one-hot
# targets). The earlier boolean `healthy > sick` list was overwritten before use, so it is gone.
pred = np.argmax(results, axis=1)
print(pred)


# One bar chart per df column, showing how often each distinct value occurs (sorted by value).
column_names = list(df.columns)
fig, axes = plt.subplots(ncols=len(column_names), figsize=(10,5))
for ax, col in zip(axes, column_names):
    value_frequencies = df[col].value_counts().sort_index()
    value_frequencies.plot.bar(ax=ax, title=col)

plt.tight_layout()
plt.show()