# Final Project

#### Team: &emsp;&emsp; Sasi Kanduri &emsp;&emsp; Vikas Mishra &emsp;&emsp; Ashish Kotian

### Converting .mat files to jpg and image arrays
##### The dataset we chose is from the follwoing link which has images in .mat files. We have to take the image arrays out of the .mat constructs and pre-process them.

In [141]:
import os
import h5py
import numpy as np
from PIL import Image
import pandas as pd
import cv2
import SimpleITK as sitk
# List of directories to process
directories = [
    'brainTumorDataPublic_1-766',
    'brainTumorDataPublic_767-1532',
    'brainTumorDataPublic_1533-2298',
    'brainTumorDataPublic_2299-3064'
]

y_labels = []
filename_column = []

image_arrays = []
image_classes = []


# Set the paths for the Mat and Jpg folders
mat_folder = f'./all_files'
jpg_folder = f'./Jpg_images'

# Create the Jpg folder if it doesn't exist
if not os.path.exists(jpg_folder):
    os.makedirs(jpg_folder)

# Iterate through files in the Mat folder
for filename in os.listdir(mat_folder):
    # Construct the full file paths
    mat_filepath = os.path.join(mat_folder, filename)
    jpg_filepath = os.path.join(jpg_folder, filename.split(".")[0] + '.jpg')

    if filename.endswith('.mat'):
        # Check if the Mat file path is valid
        if os.path.exists(mat_filepath) and os.path.isfile(mat_filepath):
            try:
                # Open the mat file
                with h5py.File(mat_filepath, 'r+') as f:

                    cjdata = f['cjdata']
                    image = np.array(cjdata.get('image')).astype(np.float64)
                    label = cjdata.get('label')[0, 0]

                    if image.shape[0] == 512:

                        y_labels.append(label - 1)
                        filename_column.append(filename.split(".")[0])

                        # Perform image processing
                        hi = np.max(image)
                        lo = np.min(image)
                        image = (((image - lo) / (hi - lo))
                                 * 255).astype(np.uint8)

                        image = cv2.resize(
                            image, (128, 128), interpolation=cv2.INTER_AREA)
                        image_arrays.append(image)
                        image_classes.append(label - 1)

                        # cv2.imwrite(jpg_filepath, image)
                        # t1_image = sitk.GetImageFromArray(image)
                        # sitk.WriteImage(t1_image, jpg_filepath)
            except Exception as e:
                print(f"Error processing file {filename}: {e}")
        else:
            print(f"Invalid Mat file path: {mat_filepath}")

    else:
        print(f"Skipping non-HDF5 file: {filename}")

df = pd.DataFrame({'filename': filename_column, 'label': y_labels})
df.to_csv('final_df.csv', index=False)


### Data pre-processing

In [142]:
import numpy as np
from numpy import array
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Activation, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.utils import plot_model, image_dataset_from_directory
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D, Rescaling
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


import tensorflow as tf

In [143]:
df = pd.read_csv("final_df.csv", index_col=False)

In [144]:
df.sort_values(by='filename', inplace=True)

In [145]:
df.dtypes

filename      int64
label       float64
dtype: object

In [146]:
df['label'] = df['label'].astype('int32')
df['label'].unique()

array([0, 2, 1])

In [147]:
from keras.utils import to_categorical

y_labels = df['label'].to_numpy().tolist()
y_labels


[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [148]:
image_arrays = np.array(image_arrays)

In [149]:
image_arrays = image_arrays.reshape(image_arrays.shape[0], 128, 128, 1)

In [150]:
image_arrays[0]

array([[[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       ...,

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]], dtype=uint8)

In [151]:
image_classes_encoded = tf.keras.utils.to_categorical(image_classes, 3, "float32")
image_classes_encoded

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [152]:
print(image_arrays.shape)
print(image_classes_encoded.shape)

(3049, 128, 128, 1)
(3049, 3)


In [153]:
x_train, x_test, y_train, y_test = train_test_split(image_arrays, image_classes_encoded, test_size=0.20, random_state=42)

In [154]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(2439, 128, 128, 1)
(610, 128, 128, 1)
(2439, 3)
(610, 3)


### Baseline model from the paper

In [155]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Rescaling, Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import keras_tuner as kt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [156]:
def build_model(hp):
    model = Sequential()

    model.add(Input(shape=(128, 128, 1)))
    model.add(Rescaling(1./255))  # scale pixels

    # Define the number of CNN layers as a hyperparameter
    num_cnn_layers = hp.Int('num_cnn_layers', min_value=1, max_value=10, step=1)

    for i in range(num_cnn_layers):
        model.add(Conv2D(64, kernel_size=(2, 2), padding='same', strides=1, activation=hp.Choice('activation', values=['relu', 'tanh'])))
        model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
        model.add(Dropout(0.2))

    model.add(Flatten())

    model.add(Dense(hp.Choice('units', [8, 16, 32]), activation='relu'))
    model.add(Dense(3, activation='softmax'))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [157]:
tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=5,
    directory='my_dir',
    project_name='my_project')


In [158]:
checkpointer = ModelCheckpoint(filepath="array_model.hdf5", verbose=2, save_best_only=True, monitor='val_loss')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')

In [159]:
tuner.search(x_train, y_train, epochs=20, validation_data=(x_test, y_test), callbacks=[monitor, checkpointer], batch_size=32)
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)

# tuner.search(x_train, y_train, epochs=5, validation_data=(x_val, y_val))


# model.fit(x_train, y_train, epochs=20, batch_size=32, validation_data=(x_test, y_test), callbacks=[monitor, checkpointer])

Trial 5 Complete [00h 08m 44s]
val_loss: 0.23218651115894318

Best val_loss So Far: 0.23218651115894318
Total elapsed time: 00h 45m 15s
{'num_cnn_layers': 3, 'activation': 'relu', 'units': 8, 'learning_rate': 0.0007539027884617362}


In [160]:
model = tuner.hypermodel.build(best_hps)
model.fit(x_train, y_train, epochs=20, validation_data=(x_test, y_test), callbacks=[monitor, checkpointer], batch_size=32)

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.88772, saving model to array_model.hdf5
Epoch 2/20
Epoch 2: val_loss improved from 0.88772 to 0.82882, saving model to array_model.hdf5
Epoch 3/20
Epoch 3: val_loss improved from 0.82882 to 0.79916, saving model to array_model.hdf5
Epoch 4/20
Epoch 4: val_loss improved from 0.79916 to 0.75883, saving model to array_model.hdf5
Epoch 5/20
Epoch 5: val_loss improved from 0.75883 to 0.75729, saving model to array_model.hdf5
Epoch 6/20
Epoch 6: val_loss improved from 0.75729 to 0.69621, saving model to array_model.hdf5
Epoch 7/20
Epoch 7: val_loss improved from 0.69621 to 0.66271, saving model to array_model.hdf5
Epoch 8/20
Epoch 8: val_loss improved from 0.66271 to 0.61923, saving model to array_model.hdf5
Epoch 9/20
Epoch 9: val_loss improved from 0.61923 to 0.58396, saving model to array_model.hdf5
Epoch 10/20
Epoch 10: val_loss improved from 0.58396 to 0.47066, saving model to array_model.hdf5
Epoch 11/20
Epoch 11: val_loss did not imp

<keras.src.callbacks.History at 0x17743cd5790>

In [161]:
pred = model.predict(x_test)
pred



array([[1.47039347e-04, 9.99391437e-01, 4.61595017e-04],
       [8.68321717e-01, 9.95130837e-02, 3.21651697e-02],
       [1.42810680e-03, 9.57365751e-01, 4.12061922e-02],
       ...,
       [1.17672855e-04, 9.99578059e-01, 3.04185669e-04],
       [6.33158535e-03, 9.85928953e-01, 7.73942005e-03],
       [4.55464376e-03, 3.31174291e-04, 9.95114207e-01]], dtype=float32)

In [162]:
y_true = np.argmax(y_test, axis=1)
pred = np.argmax(pred, axis=1)

print(classification_report(y_true, pred))

              precision    recall  f1-score   support

           0       0.85      0.74      0.79       137
           1       0.87      0.94      0.91       278
           2       0.98      0.96      0.97       195

    accuracy                           0.90       610
   macro avg       0.90      0.88      0.89       610
weighted avg       0.90      0.90      0.90       610

