# Image classification of Breast Ultra Sound Images

Here we experiment with classical algorithms such as Logistic Regression, SVM and Bag of Visual words


### Import packages

In [12]:
import os
import re
import random
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

from sklearn.decomposition import PCA

from utils import init_img_dict, get_file_dicts, filter_files, find_mask, print_ndarray_info
from utils import img_read, img_write, img_resize, img_flip, comp_fft, histogram_equalization
from utils import display_img, display_img_list_3, display_3_imgs, display_3_hist, resize_imgs, flip_imgs, append_img_data

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, classification_report, confusion_matrix 
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier, KerasRegressor
import talos
from talos.utils import lr_normalizer

In [4]:
import os
import re
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf

import sys
import keras
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Dense
from keras.layers import Flatten
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import Adagrad

### Functions to assess model quality and to plot confusion matrix, ROC etc.

In [5]:
# Plot confusion matrix (binary)
def plot_cm(cm):
    
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    ax.set_ylim(1.5, -0.5)
    for i in range(2):
        for j in range(2):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
    plt.title("Confusion matrix")
    plt.show()
    

# Plot ROC
def plot_ROC(fpr, tpr, auc):

    plt.figure(figsize = (6.4, 6.4))
    plt.plot(fpr, tpr, label= f"ROC, auc= {auc:.2f}")
    plt.title("ROC curve")
    plt.legend(loc=4)
    plt.show()
    

# To assess model quality analysis
def model_quality(model, X_train, X_test, y_train, y_test):

    # Accuracy
    print(f"Model accuracy on training data: {model.score(X_train, y_train):.2f} ")
    print(f"Model accuracy on test data: {model.score(X_test, y_test):.2f} ")

    # Classification report (precision, recall, f1 score)
    print(f"\nClassification report:\n {classification_report(y_test, model.predict(X_test))} ")

    # Confusion matrix
    cm = confusion_matrix(y_test, model.predict(X_test))
    print(f"Confusion matrix:\n {cm}\n")
    plot_cm(cm)

    # ROC curve
    y_pred_proba = model.predict_proba(X_test)[::,1]
    fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
    auc = roc_auc_score(y_test, y_pred_proba)
    plot_ROC(fpr, tpr, auc)

## Read image list (with dict object for each image) for the 2 classes (Benign and Malignant)
 - Read benign and malignant dataset into a list

In [6]:
img_res = 256

benign_img_dir = './Dataset_BUSI_with_GT/benign_256'
malignant_img_dir = './Dataset_BUSI_with_GT/malignant_256'

# Get a list of images in the images directory
benign_img_list_all = get_file_dicts(benign_img_dir)
num_benign_img = len(benign_img_list_all)
print(f"Number of images in benign dataset: {num_benign_img}")

malignant_img_list_all = get_file_dicts(malignant_img_dir)
num_malignant_img = len(malignant_img_list_all)
print(f"Number of images in malignant dataset: {num_malignant_img}")

Number of images in benign dataset: 437
Number of images in malignant dataset: 420


## Prepare data for classification
 - Assemble feature data (pixels) and target data (0 for benign and 1 for malignant)
 - Shuffle the data
 - Split data in to training and test (70-30) set
 - Scale the data (use Standard scaler)

In [18]:
def import_images(img_list, label):
    images = []
    masks = []
    labels = []
    for item in img_list:
        img = cv2.imread(item['file_name_fullpath'], cv2.IMREAD_GRAYSCALE)
        mask = cv2.imread(item['file_masks'][0], cv2.IMREAD_GRAYSCALE)
        images.append(img)
        masks.append(mask)
        labels.append(label)
    return images, masks, labels

# normal_im, normal_mk, n_l = import_images(normal_img_list, 0)
benign_im, benign_mk, b_l = import_images(benign_img_list_all, 1)
malignant_im, malignant_mk, m_l = import_images(malignant_img_list_all, 2)

X = benign_im + malignant_im
L = b_l+m_l

x_resize=[]
img_size=256

for x in X:
    new_array = cv2.resize(x,(img_size,img_size))
    x_resize.append(new_array)
x_resize = np.array(x_resize)
x_resize = x_resize.reshape(x_resize.shape[0], x_resize.shape[1], x_resize.shape[2], 1)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x_resize,L, test_size=0.1, random_state= 7, stratify= L)
# X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=7,  stratify= y_test) 

### CNN

In [19]:
def create_model(opt = 'adam', learning_rate = .001):
# Create a model and trin
    model = tf.keras.Sequential()
    
    model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same', input_shape=(256, 256,1)))
    model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(2, activation='softmax'))
    if opt == 'adagrad':
        opt = Adagrad(lr=learning_rate)
    elif opt == 'adam':
        opt = Adam(lr=learning_rate)
    elif opt == 'SGD':
        opt = SGD(lr=learning_rate) 
    model.compile(optimizer = opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    # history = model.fit(x=x_train, y=y_train, validation_data=(x_val, y_val), batch_size=params['batch_size'], epochs=params['epochs'], verbose=1)
    
    return model 

In [16]:
batch_size = [16, 32]
opt = ['adam', 'adagrad']
# 'dropout_rate': [0.0, 0.10, 0.20], 0.30],
learning_rate = [.001, .01]
epochs = [10, 15, 20]
param_grid = dict(opt= opt, learning_rate = learning_rate, batch_size = batch_size,epochs = epochs)

In [20]:
model = KerasClassifier(build_fn=create_model, learning_rate=0.0001, opt='adam', verbose=1)
grid = GridSearchCV(estimator = model,cv = 3, param_grid = param_grid, verbose = 1)
grid_result = grid.fit(X_train,y_train)

In [None]:
def display_cv_results(search_results):
    print('Best score = {:.4f} using {}'.format(search_results.best_score_, search_results.best_params_))
    means = search_results.cv_results_['mean_test_score']
    stds = search_results.cv_results_['std_test_score']
    params = search_results.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print('mean test accuracy +/- std = {:.4f} +/- {:.4f} with: {}'.format(mean, stdev, param))  

In [None]:
display_cv_results(grid_result)

In [None]:
mlp = grid_result.best_estimator_ 
history = mlp.fit(
    X_train,
    y_train)
# history = model.fit(np.array(X_train),np.array(y_train), epochs=10,validation_data=(X_test, np.array(y_test)))

In [None]:
# get prediction on validation dataset 
y_pred = mlp.predict(X_test)
print('Accuracy on validation data = {:.4f}'.format(accuracy_score(y_testl, y_pred)))

# plot accuracy on training and validation data
df_history = pd.DataFrame(history.history)
sns.lineplot(data=df_history[['accuracy','val_accuracy']], palette="tab10", linewidth=2.5)

In [21]:
model = create_model(opt = 'adam', learning_rate = .001)
# y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
# y_test = np.asarray(y_test).astype('float32').reshape((-1,1))
history = model.fit(X_train,y_train,epochs=12,validation_data = (X_test, y_test))

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'int'>"})

In [None]:
# Model quality analysis
# accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
model.evaluate(np.array(X_test),np.array(y_test))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

predicted = [np.argmax(item) for item in model.predict(X_test)]

conf = confusion_matrix(y_test,predicted)
conf

info = [
    'benign'   ,  # 0
    'malignant',  # 1
]
plt.figure(figsize = (6,6))
ax = sns.heatmap(conf, cmap=plt.cm.Blues, annot=True, square=True, xticklabels = info, yticklabels = info)
ax.set_ylabel('Actual', fontsize=40)
ax.set_xlabel('Predicted', fontsize=40)