In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import cv2
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from skimage.feature import hog
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Initializing/Import Data + Augmentation

In [2]:
# Label table: its 'ImageName' and 'isCancerous' columns are read by load_data below.
csv_file = "dataset/data_labels_mainData.csv"
mainData_df = pd.read_csv(csv_file)

def load_data(dataframe, img_folder):
    """Load every image listed in ``dataframe`` together with its label.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide an ``'ImageName'`` column (file name relative to
        ``img_folder``) and an ``'isCancerous'`` column (the label).
    img_folder : str
        Directory that contains the image files.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, labels)``: the images, converted from OpenCV's BGR channel
        order to RGB and stacked in row order, and the matching labels.

    Raises
    ------
    FileNotFoundError
        If an image cannot be read. ``cv2.imread`` signals that by returning
        ``None`` instead of raising, which would otherwise surface later as
        an opaque OpenCV assertion inside ``cvtColor``.
    """
    data = []
    labels = []

    # Iterate the two needed columns directly; iterrows() builds a full
    # Series per row and the row index was never used.
    for img_name, label in zip(dataframe['ImageName'], dataframe['isCancerous']):
        img_path = os.path.join(img_folder, img_name)
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")

        data.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        labels.append(label)

    return np.array(data), np.array(labels)

# Directory holding the image files named in the CSV's 'ImageName' column.
img_folder = "dataset/patch_images/"
# data: stacked RGB images; labels: the matching 'isCancerous' values.
data, labels = load_data(mainData_df, img_folder)




In [3]:
# Cast the isCancerous column to bool. `labels` was already extracted from the
# frame by load_data, so this cast does not change the `labels` array.
mainData_df['isCancerous'] = mainData_df['isCancerous'].astype('bool')

# Normalize the image data by dividing by 255 (assumes 8-bit images as read by
# cv2.imread). The dtype guard keeps the cell idempotent: once `data` has been
# converted to float32, re-running the cell no longer divides a second time.
if np.issubdtype(data.dtype, np.integer):
    data = data.astype('float32') / 255.0

In [4]:
# Random geometric augmentation: small rotations/shifts plus flips.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
)

# Number of augmented variants generated for each source image.
AUGMENTATIONS_PER_IMAGE = 3

# Apply data augmentation.
# random_transform() applies one random draw of the configured transforms to a
# single (H, W, C) image. The previous version built a fresh flow() iterator for
# every sample just to pull one batch of size 1, which made this cell very slow.
# NOTE(review): augmentation runs before train_test_split, so variants of the
# same source image can land in both the train and test sets -- consider
# splitting first and augmenting only the training portion.
augmented_data = []
augmented_labels = []

for img, label in zip(data, labels):
    for _ in range(AUGMENTATIONS_PER_IMAGE):
        augmented_data.append(datagen.random_transform(img))
        augmented_labels.append(label)

augmented_data = np.array(augmented_data)
augmented_labels = np.array(augmented_labels)



KeyboardInterrupt: 

## Model 1.1: Convolutional Neural Network

In [None]:
# Building Neural Network Layers
# Three Conv2D(ReLU) + MaxPooling stages followed by a dense classifier head.
cancerModel = Sequential()

cancerModel.add(Conv2D(32, (3, 3), activation='relu', input_shape=(27, 27, 3)))
cancerModel.add(MaxPooling2D((2, 2)))

cancerModel.add(Conv2D(64, (3, 3), activation='relu'))
cancerModel.add(MaxPooling2D((2, 2)))

cancerModel.add(Conv2D(128, (3, 3), activation='relu'))
cancerModel.add(MaxPooling2D((2, 2)))

cancerModel.add(Flatten())
cancerModel.add(Dense(128, activation='relu'))
cancerModel.add(Dropout(0.5))
# Single-unit binary output must use sigmoid: softmax normalises across the
# units of the layer, so with one unit it would output 1.0 for every sample.
cancerModel.add(Dense(1, activation='sigmoid'))

In [None]:
# #Train test val split
# X_cnn_train, X_cnn_test, y_cnn_train, y_cnn_test = train_test_split(augmented_data, augmented_labels, test_size=0.3, random_state=2)

# # Compile the model
# cancerModel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# history = cancerModel.fit(X_cnn_train, y_cnn_train,
#                         batch_size=32,
#                         epochs=50,
#                         validation_split=0.2,
#                         callbacks=[early_stopping])

# # Make predictions on the test set (threshold the single probability column;
# # argmax over an (n, 1) output is always 0)
# y_pred_cnn = (cancerModel.predict(X_cnn_test) > 0.5).astype(int).ravel()
# test_loss, test_accuracy = cancerModel.evaluate(X_cnn_test, y_cnn_test)

# # Calculate the accuracy, confusion matrix, and classification report
# test_loss, test_accuracy = cancerModel.evaluate(X_cnn_test, y_cnn_test)
# cnn_conf_mat = confusion_matrix(y_cnn_test, y_pred_cnn)
# cnn_class_report = classification_report(y_cnn_test, y_pred_cnn)

# print(f"Accuracy: {test_accuracy:.4f}")
# print(f"Loss: {test_loss:.4f}")
# print("\nConfusion Matrix:\n", cnn_conf_mat)
# print("\nClassification Report:\n", cnn_class_report)

## Model 1.2: Parameter Change: Activation Function + Learning Rate

In [None]:

# Model 1.2 network: same layer layout as Model 1.1, declared in one list,
# with sigmoid activations on every convolutional and dense layer.
cancersigModel = Sequential([
    Conv2D(32, (3, 3), activation='sigmoid', input_shape=(27, 27, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='sigmoid'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='sigmoid'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='sigmoid'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Train/test split: 30% of the augmented set is held out (fixed seed).
X_sig_train, X_sig_test, y_sig_train, y_sig_test = train_test_split(augmented_data, augmented_labels, test_size=0.3, random_state=2)

# Compile the model
cancersigModel.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 10 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# 20% of the training portion is carved off by Keras for validation.
history = cancersigModel.fit(X_sig_train, y_sig_train,
                        batch_size=32,
                        epochs=50,
                        validation_split=0.2,
                        callbacks=[early_stopping])

# Make predictions on the test set.
# The network ends in one sigmoid unit, so predict() yields one probability per
# sample with shape (n, 1). argmax over that last axis is always 0, which made
# every prediction class 0; threshold at 0.5 instead (the same cut-off the
# 'accuracy' metric uses for binary targets).
y_prob_sig = cancersigModel.predict(X_sig_test)
y_pred_sig = (y_prob_sig > 0.5).astype(int).ravel()

# Calculate the accuracy, confusion matrix, and classification report
test_loss, test_accuracy = cancersigModel.evaluate(X_sig_test, y_sig_test)
sig_conf_mat = confusion_matrix(y_sig_test, y_pred_sig)
sig_class_report = classification_report(y_sig_test, y_pred_sig)
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Loss: {test_loss:.4f}")
print("\nConfusion Matrix:\n", sig_conf_mat)
print("\nClassification Report:\n", sig_class_report)

## Model 2: Random Forest

## HOG

In [None]:
# Train/test split for the random-forest pipeline: 30% of the augmented set is
# held out, using the same seed (random_state=2) as the Model 1.2 split.
X_rfc_train, X_rfc_test, y_rfc_train, y_rfc_test = train_test_split(augmented_data, augmented_labels, test_size=0.3, random_state=2)



In [None]:
def extract_hog_features(images, orientations=8, pixels_per_cell=(16, 16),
                         cells_per_block=(1, 1)):
    """Compute a HOG descriptor for every image.

    Parameters
    ----------
    images : iterable of numpy.ndarray
        Images shaped ``(H, W)`` or ``(H, W, C)``.
    orientations : int, optional
        Number of orientation bins passed to ``skimage.feature.hog``.
    pixels_per_cell : tuple of int, optional
        Cell size in pixels. NOTE(review): with the default ``(16, 16)`` a
        27x27 patch holds only one complete cell per axis, so the descriptor
        is very short -- consider a smaller cell size.
    cells_per_block : tuple of int, optional
        Block size in cells.

    Returns
    -------
    numpy.ndarray
        One row of HOG features per input image (empty array for no images).
    """
    features = []
    for img in images:
        # Colour images must name their channel axis explicitly; without it
        # current scikit-image rejects 3-D input. The HOG visualisation image
        # is no longer requested because it was computed and then discarded.
        channel_axis = -1 if np.ndim(img) == 3 else None
        features.append(
            hog(img, orientations=orientations, pixels_per_cell=pixels_per_cell,
                cells_per_block=cells_per_block, channel_axis=channel_axis)
        )
    return np.array(features)

# Turn both image splits into HOG feature matrices (train first, then test).
X_train_hog, X_test_hog = (
    extract_hog_features(image_split)
    for image_split in (X_rfc_train, X_rfc_test)
)

## Random Forest Classifier

In [None]:
# Baseline forest on the HOG features: 100 trees, fixed seed.
rfc_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rfc_classifier.fit(X=X_train_hog, y=y_rfc_train)

## Determining the best hyperparameters

In [None]:
# param_grid = {
#     'n_estimators': [100, 200, 300],  
#     'max_depth': [None, 5, 10],      
#     'max_features': ['sqrt', 'log2'] 
# }

# gridsearch = GridSearchCV(estimator=rfc_classifier, param_grid=param_grid, cv=5)
# gridsearch.fit(X_train_hog, y_rfc_train)
# params = gridsearch.best_params_
# print("Hyperparameters:", params)

### Training Random Forest Classifier

In [None]:
# # recreating model with new parameters
# reparam_rfc_classifier = RandomForestClassifier(**params)
# reparam_rfc_classifier.fit(X_train_hog, y_rfc_train)

# # Make predictions on the test data
# rfc_predictions = reparam_rfc_classifier.predict(X_test_hog)


### Evaluation

In [None]:

# # Evaluate the accuracy of the model
# rfc_accuracy = accuracy_score(y_rfc_test, rfc_predictions)
# rfc_conf_mat = confusion_matrix(y_rfc_test, rfc_predictions)
# rfc_class_report = classification_report(y_rfc_test, rfc_predictions)
# print(f"Accuracy: {rfc_accuracy:.4f}")
# print("\nConfusion Matrix:\n", rfc_conf_mat)
# print("\nClassification Report:\n", rfc_class_report)

## Model 3: CNN + Random Forest

### From CNN sig in Model 1.2

**NOTE: Model 1.2 code MUST be run before using Model 3**

In [None]:
# Feature extractor: the Model 1.2 network truncated at its second-to-last
# layer, so its output is the activation fed into the final Dense(1) unit.
target_layer = tf.keras.Model(
    inputs=cancersigModel.input,
    outputs=cancersigModel.layers[-2].output,
)


def extract_sig_features(images):
    """Return the ``target_layer`` activations for a batch of images."""
    return target_layer.predict(images)


# One flat feature row per training sample.
train_sig_features = extract_sig_features(X_sig_train)
train_sig_features = np.reshape(train_sig_features, (len(train_sig_features), -1))


### Training Random Forest Classifier

In [None]:
# Random forest trained on the CNN-derived features.
# random_state is fixed (same seed as the Model 2 forest) so the reported
# metrics are reproducible across runs; bootstrap sampling and feature
# selection are otherwise re-randomised on every fit.
# NOTE: this rebinds `rfc_classifier`, replacing the HOG-based forest from Model 2.
rfc_classifier = RandomForestClassifier(random_state=42)

rfc_classifier.fit(train_sig_features, y_sig_train)

### Evaluation

In [None]:
# Push the held-out images through the CNN feature extractor, flatten to one
# row per sample, then classify those rows with the random forest.
test_sig_features = extract_sig_features(X_sig_test)
test_sig_features = np.reshape(test_sig_features, (len(test_sig_features), -1))
rfsig_predictions = rfc_classifier.predict(X=test_sig_features)

In [None]:
# Score the CNN + random-forest predictions against the held-out labels.
rfsig_accuracy = accuracy_score(y_true=y_sig_test, y_pred=rfsig_predictions)
rfsig_conf_mat = confusion_matrix(y_true=y_sig_test, y_pred=rfsig_predictions)
rfsig_class_report = classification_report(y_true=y_sig_test, y_pred=rfsig_predictions)

# Report: accuracy first, then the confusion matrix and per-class report.
print(f"Accuracy: {rfsig_accuracy:.4f}")
print("\nConfusion Matrix:\n", rfsig_conf_mat)
print("\nClassification Report:\n", rfsig_class_report)