## Imports

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as image
import os
import shutil
import glob
import random
import re
import tensorflow as tf
import numpy as np
import json
import time
import sklearn.model_selection
import sklearn.metrics
import datetime

In [None]:
# run os.path.join over all elements of folderList
def join_all(*folderList):
  """Join any number of path components into a single path.

  ``os.path.join`` is itself variadic, so the components are forwarded in a
  single call instead of being folded pairwise with a manual index loop.

  Args:
    *folderList: One or more path components, in order.

  Returns:
    The joined path.

  Raises:
    TypeError: If called without any component.
  """
  return os.path.join(*folderList)

## Colab

In [None]:
# Detect Google Colab by looking for 'google.colab' in the string form of the
# active IPython shell. get_ipython() is not imported in this file, so it has
# to be provided by the runtime (presumably the IPython/Jupyter kernel).
RunningInCOLAB = 'google.colab' in str(get_ipython())

# On Colab, mount Google Drive unless the mount point directory already exists.
if RunningInCOLAB and not os.path.isdir('/content/gdrive'):
    print("Running in colab")
    from google.colab import drive
    drive.mount('/content/gdrive', force_remount=True)
    # NOTE(review): this path differs from the '/content/gdrive' mount point
    # used above, and colab_root is not read anywhere in this file - confirm
    # whether it is still needed.
    colab_root = '/content/drive'
      
# Choose the project directory: a 'visual-proh' folder inside the mounted
# Drive on Colab (created on first use), otherwise the current directory.
if RunningInCOLAB:
    root_dir = "/content/gdrive/My Drive/"
    base_dir = root_dir + 'visual-proh/'
    if not os.path.isdir(base_dir):
        os.mkdir(base_dir)
else:
    root_dir= os.getcwd()
    base_dir = root_dir

# Every relative path used later in the notebook is resolved against base_dir.
os.chdir(base_dir)

# Displayed as the cell output to confirm the working directory.
os.getcwd()

## Preprocess

In [None]:
# Directory with the training images, relative to the working directory.
im_path = join_all('data', 'train')
# Displayed as the cell output.
im_path

In [None]:
# Create the image directory together with any missing parent directory.
# os.mkdir only creates the last path component and raises FileNotFoundError
# when the parent ('data') does not exist yet; exist_ok=True keeps the call
# a no-op when the directory is already there.
os.makedirs(im_path, exist_ok=True)

In [None]:
# Peek at the first 20 entries of the image directory.
os.listdir(im_path)[0:20]

In [None]:
# Class names are the names of the sub-directories of im_path.
# NOTE(review): os.listdir returns entries in arbitrary order, so the order of
# `classes` (used later to select label columns and to name the prediction
# columns) may differ between machines or runs - confirm whether a stable
# order is required, e.g. when reloading a saved model.
classes = [f for f in os.listdir(im_path) if os.path.isdir(os.path.join(im_path, f))]
classes

In [None]:
def getDataFromScratch():
  """Build the image index by scanning the class sub-directories of ``im_path``.

  Reads the module-level ``classes`` and ``im_path``. For every class, all
  ``*.jpg`` files directly inside ``<im_path>/<class>`` are collected.

  Returns:
    A DataFrame with one row per image and the columns ``class`` (class name),
    ``path`` (path returned by glob) and one int32 one-hot column per entry
    of ``classes``.
  """
  # Collect (class, path) pairs in a single pass; each directory is globbed
  # only once.
  rows = []
  for c in classes:
    img_files = glob.glob(join_all(im_path, c, '*.jpg'))
    rows.extend((c, img_file) for img_file in img_files)

  print("Files number #", len(rows))

  filtered_df = pd.DataFrame(rows, columns=['class', 'path'])

  # one hot encoding classes
  for current_class in classes:
    filtered_df[current_class] = (filtered_df["class"] == current_class).astype('int32')

  # The caller assigns the result and writes it to CSV, so the frame must be
  # returned explicitly (the function used to fall off the end and yield None).
  return filtered_df

## Load preprocessed csv

In [None]:
# CSV file used to cache the image index between sessions.
df_csv_name = 'filtered_df_v2.csv'
# Do nothing when filtered_df already exists in this session (e.g. the cell
# has been run before).
if 'filtered_df' not in globals():
  if os.path.isfile(df_csv_name):
    print("Loading from file...")
    # index_col=0: the first CSV column is the index written by to_csv below.
    filtered_df = pd.read_csv(df_csv_name, index_col=0)
  else:
    print("Cannot find file to load.\nGetting data from scratch...")
    filtered_df = getDataFromScratch()

    # Cache the freshly built index so later sessions can load it directly.
    print("Saving to file...")
    filtered_df.to_csv(df_csv_name)

filtered_df.head()

## Train test split

In [None]:
# Split fractions: test_size of all rows is held out for testing, then
# validation_size of the remaining rows is held out for validation.
# random_state is passed to both splits.
test_size = 0.2
validation_size = 0.15
random_state = 1000

In [None]:
# First split: separate the test set from the rest.
train_val_df, test_df = sklearn.model_selection.train_test_split(
    filtered_df, test_size=test_size, random_state=random_state)

In [None]:
# Second split: divide the remaining rows into training and validation sets.
train_df, validation_df = sklearn.model_selection.train_test_split(
    train_val_df, test_size=validation_size, random_state=random_state)

In [None]:
# Report the (rows, columns) size of each subset.
print("Train shape", train_df.shape)
print("Validation shape", validation_df.shape)
print("Test shape", test_df.shape)

## Create training batches

In [None]:
# number of batches
# maybe create it so that every batch contains up to N images
# divide_into = 4
# index of batch to choose
#batch_idx = 0

In [None]:
# def get_batch_idxs(df, divide_into=10):
#   start_idxs = list(range(0, df.shape[0], int(df.shape[0] / divide_into)))[0:divide_into]
#   start_idxs.append(df.shape[0])

#   batches = [(start_idxs[i], start_idxs[i+1]) for i in range(0,len(start_idxs)-1)]

#   return [(i,b) for i,b in enumerate(batches)]

In [None]:
# train_batch_idxs = get_batch_idxs(train_val_df, divide_into=divide_into)

In [None]:
#chosen_batch = train_batch_idxs[batch_idx][1]
#chosen_batch

In [None]:
#print("Total shape", train_val_df.shape)
#batched_train_val_df = train_val_df[chosen_batch[0]:chosen_batch[1]]
#print("Batch shape", batched_train_val_df.shape)

In [None]:
#train_df, validation_df = sklearn.model_selection.train_test_split(batched_train_val_df, test_size=0.15, random_state=1000)
#del batched_train_val_df

## Load train images

In [None]:
def loadImages(pathlist, basedir='.', target_size=(300, 300), dtype=np.float64):
    """Load images from disk into a single preprocessed array.

    Each image is read with Keras, resized to ``target_size``, converted to an
    array and passed through ``efficientnet.preprocess_input``. Progress is
    printed on one continuously overwritten line (no trailing newline).

    Args:
        pathlist: Sequence of image paths (must support ``len`` and iteration).
        basedir: Directory the paths are joined onto.
        target_size: ``(height, width)`` every image is resized to. Extra
            trailing entries (e.g. a channel count) are ignored.
        dtype: dtype of the returned buffer. The default keeps the previous
            float64 behaviour.
            NOTE(review): Keras image arrays are presumably float32 by
            default, in which case float32 would halve memory use with no
            precision loss - confirm before changing the default.

    Returns:
        Array of shape ``(len(pathlist), height, width, 3)``.
    """
    size = len(pathlist)
    height, width = target_size[:2]

    batcharr = np.zeros(shape=(size, height, width, 3), dtype=dtype)

    for i, rel_path in enumerate(pathlist):
        img_path = os.path.join(basedir, rel_path)

        # load_img expects target_size as (height, width); the channel count
        # is not part of it.
        im = tf.keras.preprocessing.image.load_img(
            img_path,
            target_size=(height, width)
        )
        imarr = tf.keras.preprocessing.image.img_to_array(im)
        imarr = tf.keras.applications.efficientnet.preprocess_input(imarr)

        batcharr[i] = imarr

        # '\r' rewinds to the start of the line so the counter updates in place.
        print(f'\r{i+1}/{size}', end='')

    return batcharr

In [None]:
#imgs_loaded = loadImages(train_df["path"].values)

In [None]:
#imgs_loaded_validation = loadImages(validation_df["path"].values)

In [None]:
#validation_y = validation_df[classes].values

In [None]:
#train_y = train_df[classes].values

In [None]:
#train_y.shape

## Model

In [None]:
def getModel(num_classes, dropout_lvl = 0.2, input_shape=(300, 300, 3)):
    """Build a classifier on top of a frozen, ImageNet-initialised EfficientNetB3.

    The backbone output is average-pooled (layer ``features``), passed through
    dropout and fed to a softmax layer (layer ``results``).

    Args:
        num_classes: Number of units of the softmax output layer.
        dropout_lvl: Rate of the dropout layer between pooling and softmax.
        input_shape: Shape of one input image.

    Returns:
        A ``tf.keras.Model`` with two outputs, in this order: the softmax
        class scores (``results``) and the pooled feature vector (``features``).
    """
    # Backbone without its classification head; its weights are frozen.
    backbone = tf.keras.applications.EfficientNetB3(
        weights="imagenet",
        include_top=False
    )
    backbone.trainable = False

    image_input = tf.keras.layers.Input(shape=input_shape)
    feature_maps = backbone(image_input)

    pooled = tf.keras.layers.GlobalAveragePooling2D(name='features')(feature_maps)
    dropped = tf.keras.layers.Dropout(dropout_lvl)(pooled)
    class_scores = tf.keras.layers.Dense(
        num_classes, activation='softmax', name='results')(dropped)

    return tf.keras.Model(inputs=image_input, outputs=[class_scores, pooled])

In [None]:
# Build the classifier with one softmax unit per class and print its layers.
mymodel = getModel(len(classes))
mymodel.summary()

In [None]:
# The model has two named outputs. Only 'results' (the softmax scores) gets a
# loss and a metric; 'features' is given None for both, so it is exposed as
# an output without being trained against a target.
mymodel.compile(optimizer='adam',
                loss = {
                    'results': 'categorical_crossentropy',
                    'features': None
                    },
                metrics = {
                    'results': 'accuracy',
                    'features': None
                    }
                )

## Train and Validation

In [None]:
# Training hyper-parameters.
# Maximum number of epochs of each fit call (one fit call per data batch).
epochs_per_batch = 25
# Mini-batch size passed to fit.
batch_size = 32
# Self-assignment: has no effect, `classes` keeps its current value.
classes = classes
# Patience of the EarlyStopping callback, in epochs.
patience = 5

# number of batches
n_batches = 4

In [None]:
def get_batch_idxs(df, divide_into):
  """Split the rows of ``df`` into consecutive ``(start, stop)`` ranges.

  The first ``divide_into - 1`` ranges hold ``rows // divide_into`` rows each;
  the last range absorbs the remainder. When ``df`` has fewer rows than
  ``divide_into``, one single-row range per row is returned instead.

  Args:
    df: Any object with a ``shape`` whose first entry is the row count.
    divide_into: Requested number of batches; must be at least 1.

  Returns:
    A list of ``(batch_number, (start, stop))`` tuples with half-open ranges
    that together cover every row exactly once. Empty for an empty ``df``.

  Raises:
    ValueError: If ``divide_into`` is smaller than 1.
  """
  if divide_into < 1:
    raise ValueError(f"divide_into must be at least 1, got {divide_into!r}")

  n_rows = df.shape[0]
  # Clamp the step to 1: with fewer rows than batches the step would be 0,
  # which range() rejects.
  step = max(1, n_rows // divide_into)

  start_idxs = list(range(0, n_rows, step))[:divide_into]
  # Closing boundary; this is what lets the last range take the remainder.
  start_idxs.append(n_rows)

  batches = zip(start_idxs[:-1], start_idxs[1:])
  return list(enumerate(batches))

In [None]:
# The ranges are positions into train_df, the frame the training loop slices,
# so they are computed from train_df. The batch count comes from n_batches
# (no variable named divide_into is defined in this notebook).
train_batch_idxs = get_batch_idxs(train_df, divide_into=n_batches)

In [None]:
# Stop a fit call once val_loss has not improved for `patience` epochs and
# roll the model back to the weights of its best epoch.
callbacks = [
             tf.keras.callbacks.EarlyStopping(
                  monitor='val_loss',
                  patience=patience,
                  restore_best_weights=True
                  )
             ]

In [None]:
# The validation images are loaded once and reused by every fit call.
print("Loading validation images...")
validation_imgs = loadImages(validation_df["path"].values)
# loadImages prints its progress counter without a newline; finish the line.
print()

# One-hot validation labels, columns ordered like `classes`.
validation_y = validation_df[classes].values

In [None]:
# Train on one slice of train_df at a time so that only a single slice of
# images has to be held in memory, validating every fit call against the
# same pre-loaded validation set. The per-epoch history of all fit calls is
# accumulated in hist_df (kept incremental so a partial history survives an
# interrupted run).
hist_df = pd.DataFrame()
# Use the real number of ranges in the progress messages.
total_batches = len(train_batch_idxs)

for i, (batch_start, batch_stop) in train_batch_idxs:
  print(f"Training on batch {i+1}/{total_batches}...")

  current_train_df = train_df[batch_start:batch_stop]
  print("Current train shape", current_train_df.shape)

  # A range that lies beyond the end of train_df yields no rows; there is
  # nothing to fit on, so move on instead of failing inside fit.
  if current_train_df.empty:
    print("Empty batch, skipping.")
    print()
    continue

  print("Loading train images...")
  current_train_imgs = loadImages(current_train_df["path"].values)
  print()

  current_train_y = current_train_df[classes].values

  temp_hist = mymodel.fit(current_train_imgs,
                          {'results': current_train_y},
                          epochs=epochs_per_batch,
                          verbose=True,
                          batch_size=batch_size,
                          validation_data=(
                              validation_imgs,
                              {'results': validation_y}
                          ),
                          callbacks=callbacks)

  # Drop the reference so the image array can be reclaimed before the next
  # slice is loaded; otherwise the old and the new slice are alive at the
  # same time while loadImages runs.
  del current_train_imgs

  temp_df = pd.DataFrame(temp_hist.history)
  # One history row per completed epoch; fewer than epochs_per_batch rows
  # means early stopping ended the fit call.
  last_epoch = temp_df.shape[0]
  print(f"Training for batch {i+1}/{total_batches} finished at epoch {last_epoch}/{epochs_per_batch}.")

  hist_df = pd.concat([hist_df, temp_df])

  print()
  print()

In [None]:
# Renumber the concatenated history rows 0..N-1 in place, discarding the
# index values carried over from the individual per-fit frames.
hist_df.reset_index(drop=True, inplace=True)

In [None]:
# hist = mymodel.fit(imgs_loaded,
#                    {'results':train_y},
#                    epochs=25,
#                    verbose=True,
#                    batch_size=32,
#                    validation_data = (
#                        imgs_loaded_validation,
#                        {'results':validation_y}
#                        ),
#                    callbacks=callbacks)

In [None]:
# Training vs. validation loss over all recorded epochs.
hist_df.plot(y=["loss", "val_loss"])

In [None]:
# Training vs. validation accuracy of the 'results' output.
# NOTE(review): these column names assume Keras prefixes the metric with the
# output name - confirm against hist_df.columns.
hist_df.plot(y=["results_accuracy", "val_results_accuracy"])

## Save model

In [None]:
# The file name carries a year/month/day/hour tag, so saving twice within the
# same hour targets the same file.
nowtag = datetime.datetime.now().strftime("%y%m%d%H")
model_path = f'./mymodel{nowtag}.h5'

# Saving is the default; only an already existing file needs confirmation.
should_save = True
if os.path.isfile(model_path):
  print(f"Model file {model_path} already exists.\nDo you want to overwrite it? [y/n]")
  ans = input()
  # The prompt advertises a lowercase "y", so the answer is compared
  # case-insensitively and ignoring surrounding whitespace (previously only
  # an uppercase "Y" was accepted and "y" silently skipped the save).
  should_save = ans.strip().lower() in ("y", "yes")
  if should_save:
    print("overwriting...")
  else:
    print("leaving current file untouched.")
else:
  print(f"Saving model to file {model_path}...")

if should_save:
  mymodel.save(model_path)

## Test evaluation

In [None]:
# Load and preprocess every test image into one array.
imgs_loaded_test = loadImages(test_df["path"].values)

In [None]:
# One-hot ground truth of the test set, columns ordered like `classes`.
test_y = test_df[classes].values

In [None]:
# The model has two outputs, so predict returns two arrays: the softmax
# scores ('results') and the pooled feature vectors ('features'), in the
# order the outputs were declared in getModel.
test_y_pred, test_features = mymodel.predict(imgs_loaded_test)

In [None]:
# One column of scores per class; the row index is the default 0..N-1.
test_y_pred_df = pd.DataFrame(test_y_pred, columns=classes)

In [None]:
# Label given to predictions that are not confident enough.
unknown = 'unknown'


def softmax2class(softmax, classes, threshold=0.5, unknown='unknown'):
  """Turn a vector of class scores into a class label.

  Args:
    softmax: Score vector offering ``max()`` and ``argmax()`` (e.g. a NumPy
      array or a pandas Series), with one score per entry of ``classes``.
    classes: Sequence of class labels, indexed by position.
    threshold: Minimum top score required to accept the prediction.
    unknown: Label returned when the top score is below ``threshold``.

  Returns:
    The label at the position of the highest score when that score is at
    least ``threshold``; otherwise ``unknown``.
  """
  top_score = softmax.max()
  # argmax is only evaluated when the top score clears the threshold.
  return classes[softmax.argmax()] if top_score >= threshold else unknown

In [None]:
# Minimum top score for a prediction to be accepted; rows below it are
# labelled `unknown`.
threshold = 0.4
# axis=1: softmax2class receives one row (the scores of one image) at a time.
test_y_pred_df["class"] = test_y_pred_df[classes].apply(
    lambda x: softmax2class(x, classes, threshold=threshold, unknown=unknown), axis=1)
test_y_pred_df.head()

In [None]:
# Confusion matrix over all classes plus the extra `unknown` label:
# rows are the true classes, columns the predicted ones.
test_cm = pd.DataFrame(
            sklearn.metrics.confusion_matrix(
                test_df["class"],
                test_y_pred_df["class"],
                labels=classes+[unknown]
                ),
            columns=classes+[unknown],
            index=classes+[unknown]
            )
test_cm

In [None]:
# Show the confusion matrix as an image.
plt.imshow(test_cm)

In [None]:
# Fraction of test images whose predicted label equals the true class.
# Predictions labelled `unknown` count as errors, assuming no true class is
# itself named 'unknown'.
test_acc = sklearn.metrics.accuracy_score(test_df["class"], test_y_pred_df["class"])
test_acc

In [None]:
# Renumber test_df 0..N-1 so its index matches the default index of
# test_y_pred_df; the column assignment below aligns rows by index label.
test_df.reset_index(drop=True, inplace=True)

In [None]:
test_df["pred_class"] = test_y_pred_df["class"]

In [None]:
# How many test images were rejected as `unknown`, absolute and relative.
unknown_number = (test_df["pred_class"] == unknown).sum()
unknown_ratio = unknown_number / test_df.shape[0]

print(f"Unknowns number: {unknown_number}")
print(f"Unknowns ratio: {unknown_ratio}")

In [None]:
# Misclassified rows (including `unknown` ones), without the one-hot columns.
errors = test_df[test_df["class"] != test_df["pred_class"]]
errors = errors.drop(columns=classes)
errors.head()

In [None]:
# Scores of one specific test row.
# NOTE(review): the label 24 is hard-coded and raises KeyError when the test
# set has fewer than 25 rows - presumably picked by hand from `errors`.
test_y_pred_df.loc[24]

In [None]:
# Display the first misclassified image (raises IndexError when `errors` is
# empty).
im = image.imread(errors.iloc[0]["path"])
plt.imshow(im)