In [1]:
import csv
import shutil
import time

import joblib
import pandas as pd
import numpy as np
import os
import re
import pathlib

import yaml
from IPython.core.display import SVG
from sklearn.model_selection import train_test_split
import tensorflow as tf

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.callbacks import ModelCheckpoint, CSVLogger, TensorBoard
from tensorflow.python.keras.utils.vis_utils import plot_model, model_to_dot

import sakthi_helper

In [2]:
# Report how many GPU devices TensorFlow can see in this environment.
len(tf.config.list_physical_devices('GPU'))

1

In [3]:
# Input locations used by the notebook.
# NOTE(review): training_dataset_path is not referenced again in the visible cells;
# the train/validation generators read images from `dataset_path` instead.
training_dataset_path = "data/all_data"
test_dataset_path = "machine-learning-in-science-ii-2023/test_data/test_data"  # listed to build test_df; also the test generator's directory

# CSV with the training labels; read into `df` below.
training_labels_path = "data/cleaned_training_norm.csv"

In [4]:
# Read the training-label CSV into a DataFrame and display its (rows, columns) shape.
df = pd.read_csv(training_labels_path)
df.shape

(13792, 3)

In [5]:
# Preview the first rows of the label table (image_id, angle, speed).
df.head()

Unnamed: 0,image_id,angle,speed
0,1,0.4375,0.0
1,2,0.8125,1.0
2,3,0.4375,1.0
3,4,0.625,1.0
4,5,0.5,0.0


In [7]:
# The generators locate images by file name, so derive "<image_id>.png" for every row.
png_names = df["image_id"].astype(str) + ".png"
df["file_name"] = png_names
df.head()

Unnamed: 0,image_id,angle,speed,file_name
0,1,0.4375,0.0,1.png
1,2,0.8125,1.0,2.png
2,3,0.4375,1.0,3.png
3,4,0.625,1.0,4.png
4,5,0.5,0.0,5.png


## Train Test Split

In [9]:
# 80/20 split, stratified on the (speed, angle) pair so both partitions share the label mix.
train_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=39,
    stratify=df[["speed", "angle"]],
)

# Sanity check: row count and number of distinct image ids should agree for each partition.
for split_df in (train_df, validation_df):
    print(split_df.shape, len(split_df["image_id"].unique()))

(11033, 4) 11033
(2759, 4) 2759


In [11]:
# Build the test-set frame: one row per file found in the test directory,
# ordered with the helper's natural sort key.
files = os.listdir(test_dataset_path)
test_image_id_list = [file.split(".")[0] for file in files]

test_df = pd.DataFrame()
test_df["image_id"] = sorted(test_image_id_list, key=sakthi_helper.natural_keys)
# Derive the file names from the test ids themselves. The previous version used the
# *training* frame `df` here, which was only correct because both id sequences
# happened to line up by row index.
test_df["file_name"] = test_df["image_id"].astype(str) + ".png"


Unnamed: 0,image_id,file_name
0,1,1.png
1,2,2.png
2,3,3.png
3,4,4.png
4,5,5.png


In [12]:
# Persist the (image_id, file_name) template that predictions are later merged into.
submission_template_path = "submissions/test_submission_template.csv"
# Create the output directory first so a fresh checkout does not fail in to_csv,
# matching how the other output directories in this notebook are created.
pathlib.Path(submission_template_path).parent.mkdir(parents=True, exist_ok=True)
test_df.to_csv(submission_template_path, index=False)
test_df.head()

Unnamed: 0,image_id,file_name
0,1,1.png
1,2,2.png
2,3,3.png
3,4,4.png
4,5,5.png


## Image parameters

In [14]:
N_CLASSES = [2, 17]  # presumably classes per output (speed, angle) — TODO confirm; not referenced in the visible cells
IMG_HEIGHT = 240  # image height; first element of target_size passed to the generators
IMG_WIDTH = 320  # image width; second element of target_size passed to the generators
CHANNELS = 3  # The 3 color channels, change to 1 if grayscale (keep in sync with COLOR_MODE)
COLOR_MODE = "rgb"  # color_mode passed to every flow_from_dataframe call
BATCH_SIZE = 16  # batch size of all generators; also used to derive steps per epoch
SEED = 39  # seed passed to the generators and recorded in model_config
AUGMENTATION = "b-0.5-1.5"  # tag embedded in the augmented-image directory name and in model_name

## Image Generators

In [15]:
# Directory holding the training images; passed as `directory` to the train and validation generators.
dataset_path = "machine-learning-in-science-ii-2023/training_data/training_data"

In [16]:
# Training-time preprocessing: scale pixels by 1/255 and apply a random brightness
# factor in [0.5, 1.5]. Horizontal flip and ZCA whitening are explicitly off.
# Options not used here: contrast_stretching (range 2-98), histogram_equalization,
# adaptive_equalization (clip limit 0.03).
train_data_gen = ImageDataGenerator(
    rescale=1 / 255,
    horizontal_flip=False,
    brightness_range=(0.5, 1.5),
    zca_whitening=False,
)

In [17]:
# Directory that receives a copy of every augmented training image; the suffix is the
# AUGMENTATION tag (naming legend used so far: hf_b_cs_he_ahe_zca).
train_generator_images_path = f"data/train_generator_images_{AUGMENTATION}"
pathlib.Path(train_generator_images_path).mkdir(parents=True, exist_ok=True)

# Shuffled training batches yielding images plus the two targets (speed, angle).
train_generator = train_data_gen.flow_from_dataframe(
    train_df,
    directory=dataset_path,
    x_col='file_name',
    y_col=["speed", "angle"],
    weight_col=None,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    color_mode=COLOR_MODE,
    classes=None,
    class_mode='multi_output',
    batch_size=BATCH_SIZE,
    shuffle=True,
    seed=SEED,
    save_to_dir=train_generator_images_path,
    save_prefix='Aug',
    save_format='png',
    subset=None,
    interpolation='nearest',
    validate_filenames=True,
)


Found 11033 validated image filenames.


In [18]:
# Validation and test images are only rescaled, never augmented.
test_datagen = ImageDataGenerator(rescale=1./255)

# Validation batches with the same two targets as training; nothing is written to disk.
validation_generator = test_datagen.flow_from_dataframe(
    validation_df,
    directory=dataset_path,
    x_col='file_name',
    y_col=["speed", "angle"],
    weight_col=None,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    color_mode=COLOR_MODE,
    classes=None,
    class_mode='multi_output',
    batch_size=BATCH_SIZE,
    shuffle=True,
    seed=SEED,
    save_to_dir=None,
    save_prefix='',
    save_format='png',
    subset=None,
    interpolation='nearest',
    validate_filenames=True,
)


Found 2759 validated image filenames.


In [19]:
# Unlabelled test batches in fixed (unshuffled) order so predictions line up with test_df rows.
test_generator = test_datagen.flow_from_dataframe(
    test_df,
    directory=test_dataset_path,
    x_col='file_name',
    y_col=None,
    weight_col=None,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    color_mode=COLOR_MODE,
    classes=None,
    class_mode=None,
    batch_size=BATCH_SIZE,
    shuffle=False,
    seed=SEED,
    save_to_dir=None,
    save_prefix='',
    save_format='png',
    subset=None,
    interpolation='nearest',
    validate_filenames=True,
)

Found 1020 validated image filenames.


## Model Parameters

In [None]:
# Experiment configuration. Every value is stored twice: in `model_config` (dumped to
# YAML later) and in a module-level variable of the same name (used by later cells).
model_config = {}

model_ID = model_config['model_id'] = "1"
model_type = model_config['model_type'] = 'CNN_BC_MC'
# Fixed: class_names used to be stored under 'class_names_short', and class_names_short
# under 'model_id' (wiping the model id), so model_config['class_names'] below raised KeyError.
class_names = model_config['class_names'] = ''  # e.g. [[0, 1], []]
class_names_short = model_config['class_names_short'] = ''  # e.g. ['OA', 'G', 'AG', 'HA', 'PG', 'CBG', 'SEG', 'MBG', 'MBAG', 'WOOS']
class_names_dict = model_config['class_names_dict'] = ''  # e.g. {0: 'OA', 1: 'G', 2: 'AG', 3: 'HA', 4: 'PG', 5: 'CBG', 6: 'SEG', 7: 'MBG', 8: 'MBAG', 9: 'WOOS'}
dataset_ID = model_config['dataset_ID'] = "INITIAL_1"
seed = model_config['seed'] = SEED
# NOTE(review): evaluates to 0 while class_names is the empty placeholder above.
n_classes = model_config['n_classes'] = len(model_config['class_names'])
class_weights = model_config['class_weights'] = ''  # e.g. {0:1, 1:1 , 2:3, 3:5, 4:5, 5:1}
batch_size = model_config['batch_size'] = BATCH_SIZE
n_epochs = model_config['n_epochs'] = 20
hidden_layers = model_config['hidden_layers'] = ['C32_K5', 'C32_K3', 'BN', 'MP_2', 'C64_K5', 'C64_K3', 'BN', 'MP_2', 'C128_K3', 'F', '1024', '128']
activation_fns = model_config['activation_fns'] = ['relu', 'softmax']
n_layers = model_config['n_layers'] = 0  # placeholder; the real count is known only after the model is built

# The entries below are descriptive labels recorded with the experiment.
optimizer = model_config['optimizer'] = "adam"
initializer = model_config['initializer'] = "glorot_uniform"
loss = model_config['loss'] = "categorical_crossentropy"
loss_short = model_config['loss_short'] = "cce"
metrics = model_config['metrics'] = "accuracy"
preprocess = model_config['preprocess'] = "normalize_rescale-[0,1]"
augmentation = model_config['augmentation'] = AUGMENTATION  # e.g. "hf, br_0.4_1.3, zca_w" or "-"
regularisation = model_config['regularisation'] = "-"  # e.g. "1_BN, 1_Dropout_0.3 D", "4_Dropout_0.3 in MP,D", "6 Batch Norm layers"
additional_params = model_config['additional_params'] = "-"

# ---------- CNN --------------------
input_shape = model_config['input_shape'] = (IMG_HEIGHT, IMG_WIDTH, CHANNELS)
total_train_samples = model_config['total_train_samples'] = train_df.shape[0]
total_validation_samples = model_config['total_validation_samples'] = validation_df.shape[0]
train_samples = model_config['train_samples'] = train_df.shape[0]
validation_samples = model_config['validation_samples'] = validation_df.shape[0]
train_dataset = model_config['train_dataset'] = ""
validation_dataset = model_config['validation_dataset'] = ""

## Model save path

In [None]:
# Output locations for this model: one directory per (model_ID, model_type), with a
# checkpoints sub-directory inside it. Both are created if missing.
model_save_path = f"models/CNN_models/model_{model_ID}_{model_type}"
checkpoints_save_path = f"models/CNN_models/model_{model_ID}_{model_type}/model_{model_ID}_checkpoints"
for output_dir in (model_save_path, checkpoints_save_path):
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

## Creating the CNN model

In [None]:
input_shape = (IMG_HEIGHT, IMG_WIDTH, CHANNELS)

# Build the two-headed CNN: 'BC' is a single sigmoid unit trained with binary
# cross-entropy, 'MC' a 17-way softmax trained with categorical cross-entropy.
# NOTE(review): init='normal' differs from the "glorot_uniform" label stored in
# model_config['initializer'] — confirm which one is intended.
model = sakthi_helper.create_CNN_model(input_shape, hidden_layers, pretrained_model=None, num_non_trainable_layers=1,
                     output_layer={'BC': [1, 'sigmoid', 'binary_crossentropy'], 'MC': [17, 'softmax', 'categorical_crossentropy']},
                     init='normal', optimize='adam', metrics=['accuracy', 'mse'])

# summary() prints the table itself and returns None, so wrapping it in print()
# used to emit a stray "None" after the table.
print("Model Summary : ")
model.summary()

model_parameters = model.count_params()
print("Model Parameters : ", model_parameters)

for i, layer in enumerate(model.layers):  # base_model.layers
    print(i, layer.name)
# Record the real layer count in the config too; it was left at its placeholder 0,
# which is the value that ended up in the saved build details.
n_layers = model_config['n_layers'] = len(model.layers)
print("num layers: ", n_layers)

# Human-readable run identifier used in the names of the plots saved later.
model_name =  "{}_{}_D-{}_A-{}_E-{}_B-{}_PP-{}_R-{}_EXTRA-{}".format(model_ID, model_type, dataset_ID, augmentation, n_epochs, batch_size, preprocess,
                                                                      regularisation, additional_params)

print("Model_name   : ", model_name, end="\n\n")

In [None]:
# Save the architecture diagram next to the model and render it inline as SVG.
# os.path.join replaces the hard-coded "\\" separator, which only works on Windows.
plot_model(model, to_file=os.path.join(model_save_path, "{}_plot.png".format(model_name)))
print("model_plotted")
SVG(model_to_dot(model).create(prog='dot', format='svg'))

## Save model config

In [None]:
#--------------------------   SAVE MODEL CONFIGURATION  -------------------------------------------

# Serialize the model architecture to YAML.
# NOTE(review): Model.to_yaml() was removed in newer TensorFlow releases; it is kept here
# because sakthi_helper.get_ML_model later reads this file back — confirm before switching format.
model_yaml = model.to_yaml()

model_config_path = model_save_path
pathlib.Path(model_config_path).mkdir(parents=True, exist_ok=True)

print("Model_config_path : ", model_config_path)

# os.path.join replaces the hard-coded "\\" separator, which only works on Windows.
model_config_file = os.path.join(model_config_path, "model_{}_config.yaml".format(model_ID))

with open(model_config_file, "w") as yaml_file:
    yaml_file.write(model_yaml)
    print("Model_config_saved in \n {}".format(model_config_file))

In [None]:
#---------------------------  SAVE MODEL DETAILS  -------------------------------------------------

# Dump the experiment configuration dictionary as block-style YAML next to the model.
# os.path.join replaces the hard-coded "\\" separator, which only works on Windows.
model_info_file = os.path.join(model_save_path, "model_{}_build_details.yaml".format(model_ID))

with open(model_info_file, 'w') as outfile:
    yaml.dump(model_config, outfile, default_flow_style=False, indent=4)
    print("model_info_file saved in \n {}".format(model_info_file))

In [None]:
#--------------------------  CSV DATA  ----------------------------------------------------

# Column names of the experiment-results CSV. The first 24 describe the build and are
# filled in below; the remaining 9 are result columns appended after evaluation.
header_list = [
    'Model ID',
    'Model Type',
    'Dataset ID',
    'Total train Samples',
    'Total Validation samples',
    'Train Samples',
    'Validation Samples',
    'Seed',
    'Input shape',
    'N classes',
    'Class names',
    'Class weights',
    'Preprocessing',
    'Augmentation',
    'N layers',
    'Architecture',
    'Model Parameters',
    'Initializer',
    'Optimizer',
    'Loss function',
    'Regularisation',
    'Metrics',
    'Batch size',
    'N epochs',
    # result columns
    'Train loss',
    'Train accuracy',
    'Test loss',
    'Test accuracy',
    'Sub Model ID',
    'Validation loss',
    'Validation accuracy',
    'Additional params',
    'Remarks',
]

# Build-time values, in the same order as the first 24 headers.
model_details = [
    model_ID,
    model_type,
    dataset_ID,
    total_train_samples,
    total_validation_samples,
    train_samples,
    validation_samples,
    seed,
    input_shape,
    n_classes,
    class_names,
    class_weights,
    preprocess,
    augmentation,
    n_layers,
    hidden_layers,
    model_parameters,
    initializer,
    optimizer,
    loss_short,
    regularisation,
    metrics,
    batch_size,
    n_epochs,
]

print("Length of header list : ", len(header_list))
print("Length of model details_list : ", len(model_details))

In [None]:
#---------------------------------  CALLBACKS  --------------------------------------------------------

#------------  CHECKPOINTS  -----------------------

# Checkpoint file-name suffix, filled by Keras from the epoch logs.
# NOTE(review): the keys 'acc' / 'val_acc' must exist in the training logs; with
# metrics=['accuracy', 'mse'] and two named outputs the logged keys may instead be
# 'accuracy' or per-output names — confirm against history.history.
model_metrics = "_{epoch:02d}-{loss:.4f}-{acc:.4f}-{val_loss:.4f}-{val_acc:.4f}"

# os.path.join replaces the hard-coded "\\" separator, which only works on Windows.
filepath = os.path.join(checkpoints_save_path, "{}_weights".format(model_ID) + model_metrics + ".hdf5")
# Save the full model after every epoch; save_freq='epoch' replaces the deprecated period=1.
check_point = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=False,
                              save_weights_only=False, mode='auto', save_freq='epoch')

#-------------  CSV LOGGER  ----------------------

# One log file per run; the timestamp keeps earlier runs from being overwritten.
csv_log_path = os.path.join(model_save_path, 'model_{}_training_{}.log'.format(model_ID, int(time.time())))
csv_logger = CSVLogger(csv_log_path, append=False)

#-------------  EARLY STOPPING  --------------------

# NOTE(review): patience=20 equals n_epochs=20 set in the config cell, so this cannot
# trigger within a single run as configured.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=20)

# reduce_lr_loss = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=patience_lr, verbose=1, epsilon=1e-4,
                                   # mode='min')

#-------------  TENSORBOARD  ------------------------

NAME = "model_{}_logs_{}".format(model_ID, int(time.time()))
tensorboard_path = os.path.join(model_save_path, "logs", NAME)
pathlib.Path(tensorboard_path).mkdir(parents=True, exist_ok=True)

tensorboard = TensorBoard(log_dir=tensorboard_path)

## Model training

In [None]:
# Steps must be whole numbers; round up so the final, partially filled batch is included.
# The previous plain division produced a float (e.g. 11033 / 16).
steps_per_epoch = int(np.ceil(total_train_samples / BATCH_SIZE))
validation_steps = int(np.ceil(total_validation_samples / BATCH_SIZE))

# Model.fit accepts generators directly; fit_generator is deprecated.
history = model.fit(train_generator,
                    epochs=n_epochs,
                    steps_per_epoch=steps_per_epoch,
                    verbose=1,  # class_weight=class_weights,
                    validation_data=validation_generator,
                    validation_steps=validation_steps,
                    callbacks=[csv_logger, check_point, early_stopping, tensorboard])

In [None]:
#----------------------------------  MODEL TRAINING HISTORY  ----------------------------------------------------

# Keep the per-epoch logs in a one-element list (the shape the history plot helper is
# called with later) and pull out the last epoch's figures.
training_log = history.history
hist = [training_log]
print(hist, end="\n\n")

train_loss = training_log['loss'][-1]
train_accuracy = training_log['acc'][-1]
val_loss = training_log['val_loss'][-1]
val_accuracy = training_log['val_acc'][-1]

In [None]:
#----------------------------------  SAVING MODEL  ------------------------------------------------------

# Number of epochs actually run (can be fewer than n_epochs if early stopping fired).
end_epoch = len(hist[0]['loss'])

# Final-epoch figures embedded in the file names: train loss, train acc, val loss, val acc.
model_stats = "{:.4f}_{:.4f}_{:.4f}-{:.4f}".format(train_loss ,train_accuracy ,val_loss ,val_accuracy)

# os.path.join replaces the hard-coded "\\" separator, which only works on Windows.
model.save(os.path.join(model_save_path, "{}_end_epoch_{}_{}.h5".format(model_ID, end_epoch, model_stats)))
print("model saved in \n {}".format(model_save_path))

# serialize weights to HDF5
model.save_weights(os.path.join(model_save_path, "model_{}_end_epoch_{}_weights_{}.h5".format(model_ID, end_epoch, model_stats)))
print("model weights saved in \n {}".format(model_save_path))

In [None]:
# ---------------------------  SAVE PICKLE FILE  -----------------------------------------------

# NOTE(review): pickling a Keras model through joblib is not guaranteed to work across
# TensorFlow versions; the .h5 files written in the previous cell are the portable artefacts.
# os.path.join replaces the hard-coded "\\" separator, which only works on Windows.
joblib.dump(model, os.path.join(model_save_path, "model_{}_end_epoch_{}_weights_{}.pickle".format(model_ID, end_epoch, model_stats)))

#-----------------------------------  PLOT TRAINING HISTORY  --------------------------------------------

history_plot_save_path = os.path.join(model_save_path, "{}_Training_Stats.png".format(model_name))

sakthi_helper.plot_histories(hist, history_plot_save_path)

## Loading trained model

In [None]:
# Select a previously trained model and the epoch checkpoint to load.
model_ID = "51F47"

model_type = "CNN_BC_MC"  # CNN_C_R   # CNN_R

epoch_ID = 14  # e.g. selects 51F47_weights_14-0.0417-0.9856-0.1764-0.9602

dataset_ID = "3CDR_3" #""CGA_18"

model_save_path = "models/CNN_models/model_{}_{}".format(model_ID, model_type)
# NOTE(review): the training section creates "model_{}_checkpoints" (lower case); this
# capitalised name only resolves to the same directory on case-insensitive file systems.
checkpoints_save_path = "models/CNN_models/model_{}_{}/Model_{}_Checkpoints".format(model_ID, model_type, model_ID)

# os.path.join replaces the hard-coded "\\" separators used throughout this cell.
model_config_file = os.path.join(model_save_path, "model_{}_config.yaml".format(model_ID))

weight_files = os.listdir(checkpoints_save_path)
print(weight_files)

# Checkpoint names look like "<id>_weights_<epoch>-<loss>-<acc>-<val_loss>-<val_acc>.hdf5";
# pick the one whose epoch field equals epoch_ID (the last match wins, as before).
weight_file_name = None
for file in sorted(weight_files):
    epoch_num = file.split("_")[-1].split("-")[0]
    if not epoch_num.isdigit():
        # Stray files that do not follow the checkpoint pattern used to crash int().
        continue
    if int(epoch_num) == epoch_ID:
        print(file)
        weight_file_name = file

# Fail with a clear message instead of a NameError further down.
if weight_file_name is None:
    raise FileNotFoundError(
        "No checkpoint for epoch {} found in {}".format(epoch_ID, checkpoints_save_path))

## weight_file_name = "{}_weights_16-0.0379-0.9864-0.1782-0.9571.hdf5".format(model_ID)

# Trailing "<epoch>-<loss>-<acc>-<val_loss>-<val_acc>.hdf5" part; parsed again by the
# experiment-documentation cell.
weight_stats = weight_file_name.split("_")[-1]
epoch_name = weight_stats.split("-")[0]
print("Model loaded from Epoch : {}".format(epoch_name))
sub_model_ID = int(epoch_name)

#--------  LOADING MODEL WEIGHTS  ----------------
selected_model_weights_file = os.path.join(checkpoints_save_path, weight_file_name)

src_file = selected_model_weights_file

# Each (model, epoch, dataset) combination gets its own results directory, with a copy
# of the selected checkpoint inside it.
sub_model_save_path = os.path.join(model_save_path,
                                   "{}_SubModel_{}_Val_Dataset_{}".format(model_ID, sub_model_ID, dataset_ID))
pathlib.Path(sub_model_save_path).mkdir(parents=True, exist_ok=True)

dest_file = os.path.join(sub_model_save_path, weight_file_name)

if not os.path.isfile(dest_file):
    shutil.copy2(src_file, dest_file)
else:
    print("Weight file already exists in Model directory")

sub_model_weights_file = dest_file

loaded_model = sakthi_helper.get_ML_model(model_config_file, sub_model_weights_file)

In [None]:
#-----------------------------------  MODEL EVALUATION  ---------------------------------------------------

# Model.evaluate accepts generators directly (evaluate_generator is deprecated); steps
# must be an integer, rounded up so the final partial batch is evaluated too.
evaluation_steps = int(np.ceil(total_validation_samples / BATCH_SIZE))
loss_and_metrics = loaded_model.evaluate(validation_generator,
                                         steps=evaluation_steps, verbose=1)

print("Loss and Metrics : ", loss_and_metrics)

# NOTE(review): index 1 is treated as accuracy here; for a multi-output model the second
# entry may be a per-output loss — check loaded_model.metrics_names (printed below).
eval_loss = round(loss_and_metrics[0], ndigits=4)
eval_accuracy = round(loss_and_metrics[1] * 100 , ndigits=2)

print("{}: {:.4f}".format(loaded_model.metrics_names[0], eval_loss))
print("{}: {:.2f}".format(loaded_model.metrics_names[1], eval_accuracy))

In [None]:
#-----------------------------------  ML EXPERIMENT DOCUMENTATION  -----------------------------------------------------

# Training figures are recovered from the checkpoint file name
# ("<epoch>-<loss>-<acc>-<val_loss>-<val_acc>.hdf5"); validation figures come from the
# evaluation cell. No test-set figures are computed, hence '-'.
train_loss = float(weight_stats.split("-")[1])
train_accuracy = float(weight_stats.split("-")[2]) * 100
validation_loss = eval_loss
validation_accuracy = eval_accuracy
test_loss = '-'
test_accuracy = '-'

remarks = "-"

# Build the CSV row as a new list instead of appending to model_details in place, so
# re-running this cell no longer grows the row by another nine values each time.
# NOTE(review): model_details was filled in the training section; if model_ID was changed
# by the loading cell, the 'Model ID' column still holds the training-section value.
results_row = list(model_details) + [
    train_loss,
    train_accuracy,
    test_loss,
    test_accuracy,
    sub_model_ID,
    validation_loss,
    validation_accuracy,
    model_config['additional_params'],
    remarks,
]
assert len(results_row) == len(header_list), \
    "results row has {} values but the header has {} columns".format(len(results_row), len(header_list))

print(train_loss)
print(train_accuracy)
print(test_loss)
print(test_accuracy)
print(validation_loss)
print(validation_accuracy)

print("-----------")
print(len(results_row))

print(results_row)

ML_models_path = "models/CNN_models/"
CNN_model_results_file = "CNN_model_results_New.csv"

# os.path.join replaces the "dir/" + "\\file" concatenation, which only resolved on Windows.
CNN_model_results_csv_path = os.path.join(ML_models_path, CNN_model_results_file)

# Write the header only when the results file is new or empty.
file_empty = (not os.path.isfile(CNN_model_results_csv_path)
              or os.path.getsize(CNN_model_results_csv_path) == 0)

with open(CNN_model_results_csv_path, 'a', newline='') as file:
    csv_writer = csv.writer(file, lineterminator="\n")

    if file_empty:
        csv_writer.writerow(header_list)
    csv_writer.writerow(results_row)
    print("Written Experiment stats to csv")

In [None]:
##----------------------- INFERENCE --------------------------------






In [None]:
#-------------------------------------  PLOTTING CONFUSION MATRIX  ----------------------------------------
# NOTE(review): Y_test_list and Y_pred_list are not defined in any visible cell (the
# INFERENCE cell is empty), so this cell fails on a fresh kernel until they are produced.
# print(Y_pred_list)
# print(Y_test_list)
#
# print("\n", pred_val_list)
# print(y_val_list)

cnf_matrix = sakthi_helper.confusion_matrix(Y_test_list, Y_pred_list)

print(cnf_matrix)

# os.path.join replaces the hard-coded "\\" separator, which only works on Windows.
cnf_plot_save_path = os.path.join(sub_model_save_path,
                                  "Model_{}-{}_Val_Dataset_{}_cnf_matrix.png".format(model_ID, sub_model_ID, dataset_ID))

sakthi_helper.plot_confusion_matrix(cnf_matrix, class_names, cnf_plot_save_path)

In [None]:
#-------------------------------------  PLOTTING ROC CURVES  ---------------------------------------------

# Output paths for the full and zoomed ROC plots; the plotting call itself is disabled below.
# os.path.join replaces the hard-coded "\\" separator, which only works on Windows.
roc_plot_save_path_full = os.path.join(sub_model_save_path,
                                       "Model_{}-{}_Val_Dataset_{}_full_ROC_plot.png".format(model_ID, sub_model_ID, dataset_ID))
roc_plot_save_path_zoom = os.path.join(sub_model_save_path,
                                       "Model_{}-{}_Val_Dataset_{}_zoomed_ROC_plot.png".format(model_ID, sub_model_ID, dataset_ID))

# print(Y_one_hot_array.shape)
# print(Y_pred_one_hot_array.shape)


# sakthi_helper.plot_ROC_curves(Y_one_hot_array, Y_pred_one_hot_array, n_classes, roc_plot_save_path_full, roc_plot_save_path_zoom)

In [None]:
#-------------------------------------  PLOT CLASSIFICATION REPORT  ----------------------------------
# NOTE(review): Y_test_array and Y_pred_array are not defined in any visible cell, so this
# cell fails on a fresh kernel until the inference step produces them.

report = sakthi_helper.classification_report(Y_test_array, Y_pred_array, target_names=class_names)
print("Classification Report : ", report)

# os.path.join replaces the hard-coded "\\" separator, which only works on Windows.
report_plot_save_path = os.path.join(sub_model_save_path,
                                     "Model_{}-{}_Val_Dataset_{}_Classification_Report.png".format(model_ID, sub_model_ID, dataset_ID))

sakthi_helper.plot_classification_report(report, report_plot_save_path)

In [None]:
#-------------------------------------  SAVE CLASSIFICATION REPORT AS CSV  ------------------------------------
# NOTE(review): Y_test_array and Y_pred_array are not defined in any visible cell, so this
# cell fails on a fresh kernel until the inference step produces them.

# os.path.join replaces the hard-coded "\\" separator, which only works on Windows.
report_save_path = os.path.join(sub_model_save_path,
                                "Model_{}-{}_Val_Dataset_{}_Classification_Report.csv".format(model_ID,
                                                                                              sub_model_ID, dataset_ID))

report_df = sakthi_helper.pandas_classification_report(Y_test_array, Y_pred_array, class_names)

print(report_df)

report_df.to_csv(report_save_path)