In [1]:
import os
import random
import warnings
import numpy as np
import pandas as pd
import seaborn as sn
import csv

from tqdm import tqdm
from matplotlib import pyplot as plt


from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import auc, classification_report, confusion_matrix, roc_curve

warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import img_to_array, load_img

2022-11-23 09:51:02.804564: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-23 09:51:03.273609: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-23 09:51:03.273653: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-23 09:51:05.066054: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"
%cd "/content/gdrive/MyDrive/Kaggle"

In [2]:
# base_path =  '/content/gdrive/MyDrive/Kaggle/'
base_path =  '.'
# data_path =  'BreaKHis_v1/BreaKHis_v1/histology_slides/breast'
data_path =  'BreaKHis_v1/histology_slides/breast'
magnifications = ['40X', '100X', '200X', '400X']
classes = ['benign', 'malignant']
sub_classes = {
    'benign': ['adenosis', 'fibroadenoma', 'phyllodes_tumor', 'tubular_adenoma'],
    'malignant': ['ductal_carcinoma', 'lobular_carcinoma', 'mucinous_carcinoma', 'papillary_carcinoma']
}

In [3]:
encoder = LabelEncoder()
encoder.fit(classes)

In [4]:
dataset = pd.DataFrame()

for clazz in classes:
    for sub_clazz in sub_classes[clazz]:
        path = os.path.join(base_path, data_path, clazz, "SOB", sub_clazz)
        for id in os.listdir(path):
            for magnification in magnifications:
                path_to_files = os.path.join(path, id, magnification)
                for file_name in os.listdir(path_to_files):
                    dataset = dataset.append({
                        'id': id,
                        'file_name': file_name,
                        'path': os.path.join(path_to_files, file_name),
                        'magnification': magnification,
                        'type': sub_clazz,
                        'lesion': clazz
                }, ignore_index=True)
                    
dataset.head()

Unnamed: 0,id,file_name,path,magnification,type,lesion
0,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-005.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign
1,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-022.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign
2,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-027.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign
3,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-012.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign
4,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-017.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign


In [5]:
# 5 fold K
n_folds = (1, 2, 3, 4, 5)

folds_df = pd.DataFrame()
for nfold in n_folds:
    fold_file = f"dsfold{nfold}.txt"
    
    fd = pd.read_csv(fold_file, delimiter="|", names=["file_name", "magnification", "fold", "grp"])
    fd = fd[["file_name", "grp"]]
    fd.rename(columns = {"grp":f"fold_{nfold}"}, inplace=True)
    if folds_df.empty:
        folds_df = folds_df.append(fd)
    else:
        folds_df = folds_df.merge(fd, how="inner", on="file_name")
        
folds_df.head()

Unnamed: 0,file_name,fold_1,fold_2,fold_3,fold_4,fold_5
0,SOB_B_A-14-22549AB-100-001.png,train,train,train,test,test
1,SOB_B_A-14-22549AB-100-002.png,train,train,train,test,test
2,SOB_B_A-14-22549AB-100-003.png,train,train,train,test,test
3,SOB_B_A-14-22549AB-100-004.png,train,train,train,test,test
4,SOB_B_A-14-22549AB-100-005.png,train,train,train,test,test


In [6]:
dataset = dataset.merge(folds_df, how="inner", on="file_name")
dataset.head()

Unnamed: 0,id,file_name,path,magnification,type,lesion,fold_1,fold_2,fold_3,fold_4,fold_5
0,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-005.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign,train,train,train,test,test
1,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-022.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign,train,train,train,test,test
2,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-027.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign,train,train,train,test,test
3,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-012.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign,train,train,train,test,test
4,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-017.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign,train,train,train,test,test


### ScratchPad
No need to run this, head over to next section

In [None]:
image_size = (46, 70, 3)

x_train, x_test, y_train, y_test = [], [], [], []

n_fold = 1
for i, row in tqdm(dataset.iterrows()):
    image = load_img(row["path"], target_size=image_size)
    if row[f"fold_{n_fold}"] == "train":
        x_train.append(img_to_array(image) / 255.0)
        y_train.append(row["lesion"])
    else:
        x_test.append(img_to_array(image) / 255.0)
        y_test.append(row["lesion"])

In [None]:
x_train_ = np.asarray(x_train)
x_test_ = np.asarray(x_test)
print(x_train_.shape)
print(x_test_.shape)

In [None]:
x_train_ = np.reshape(x_train_, (len(x_train_), np.prod(image_size)))
x_test_ = np.reshape(x_test_, (len(x_test_), np.prod(image_size)))
print(x_train_.shape)
print(x_test_.shape)

In [None]:
y_train_ = encoder.transform(y_train)
y_test_ = encoder.transform(y_test)

### Classifier Functions

In [7]:
def knn_clf(x_train, y_train, x_test, y_test):
    k = 1
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(x_train_, y_train_)
    
    return classification_report(y_test_, knn_clf.predict(x_test_), output_dict=True)

def svm_clf(x_train, y_train, x_test, y_test):
    svm_clf = SVC()
    svm_clf = svm_clf.fit(x_train_, y_train_)

    return classification_report(y_test_, svm_clf.predict(x_test_), output_dict=True)

def dt_clf(x_train, y_train, x_test, y_test):
    dt_clf = DecisionTreeClassifier()
    dt_clf = dt_clf.fit(x_train_, y_train_)

    return classification_report(y_test_, dt_clf.predict(x_test_), output_dict=True)

In [13]:
image_size = (46, 70, 3)
n_folds = (4, 5)

In [37]:
results = {
    "knn": {
        "40X": {},
        "100X": {},
        "200X": {},
        "400X": {}
    },
    "svm": {
        "40X": {},
        "100X": {},
        "200X": {},
        "400X": {}
    },
    "dt": {
        "40X": {},
        "100X": {},
        "200X": {},
        "400X": {}
    },
}

In [None]:
for n_fold in n_folds:
    train_df = dataset.copy()[dataset[f"fold_{n_fold}"] == "train"]
    test_df = dataset.copy().drop(train_df.index).reset_index(drop=True)

    x_train, x_test, y_train, y_test = [], [], [], []

    for i, row in tqdm(train_df.iterrows()):
        image = load_img(row["path"], target_size=image_size)
        x_train.append(img_to_array(image) / 255.0)
        y_train.append(row["lesion"])

    for i, row in tqdm(test_df.iterrows()):
        image = load_img(row["path"], target_size=image_size)
        x_test.append(img_to_array(image) / 255.0)
        y_test.append(row["lesion"])    

    x_train_ = np.asarray(x_train)
    x_test_ = np.asarray(x_test)

    x_train_ = np.reshape(x_train_, (len(x_train_), np.prod(image_size)))
    x_test_ = np.reshape(x_test_, (len(x_test_), np.prod(image_size)))

    y_train_ = encoder.transform(y_train)
    y_test_ = encoder.transform(y_test)

    print(f"Processing fold {n_fold}")
    # 1NN
    results["knn"][f"fold_{n_fold}"] = knn_clf(x_train_, y_train_, x_test_, y_test_)
    # SVM
    results["svm"][f"fold_{n_fold}"] = svm_clf(x_train_, y_train_, x_test_, y_test_)

    # Decision Tree
    results["dt"][f"fold_{n_fold}"] = dt_clf(x_train_, y_train_, x_test_, y_test_)
    
    print(f"Fold {n_fold} processed")

print(results)


In [88]:
def run_clfs(mag):
    print("Magnification:", mag)
    for n_fold in n_folds:
        df = dataset.copy()[dataset["magnification"] == mag]
        train_df = df.copy()[dataset[f"fold_{n_fold}"] == "train"]
        test_df = df.copy().drop(train_df.index).reset_index(drop=True)

        x_train, x_test, y_train, y_test = [], [], [], []

        for i, row in tqdm(train_df.iterrows()):
            image = load_img(row["path"], target_size=image_size)
            x_train.append(img_to_array(image) / 255.0)
            y_train.append(row["lesion"])

        for i, row in tqdm(test_df.iterrows()):
            image = load_img(row["path"], target_size=image_size)
            x_test.append(img_to_array(image) / 255.0)
            y_test.append(row["lesion"])    

        x_train_ = np.asarray(x_train)
        x_test_ = np.asarray(x_test)

        x_train_ = np.reshape(x_train_, (len(x_train_), np.prod(image_size)))
        x_test_ = np.reshape(x_test_, (len(x_test_), np.prod(image_size)))

        y_train_ = encoder.transform(y_train)
        y_test_ = encoder.transform(y_test)

        print(f"Processing fold {n_fold}")
        # 1NN
        results["knn"][mag][f"fold_{n_fold}"] = knn_clf(x_train_, y_train_, x_test_, y_test_)
        # SVM
        results["svm"][mag][f"fold_{n_fold}"] = svm_clf(x_train_, y_train_, x_test_, y_test_)
        # Decision Tree
        results["dt"][mag][f"fold_{n_fold}"] = dt_clf(x_train_, y_train_, x_test_, y_test_)
        
        print(f"Fold {n_fold} processed")

In [89]:
run_clfs("40X")

Magnification: 40X


1295it [00:21, 59.46it/s]
700it [00:10, 64.92it/s]


Processing fold 4
Fold 4 processed


1220it [00:18, 66.78it/s]
775it [00:12, 63.43it/s]


Processing fold 5
Fold 5 processed


In [90]:
# results = {'knn': {'fold_1': {'0': {'precision': 0.6133971291866028, 'recall': 0.6359126984126984, 'f1-score': 0.6244520214320506, 'support': 1008}, '1': {'precision': 0.8025820333512641, 'recall': 0.7869198312236287, 'f1-score': 0.7946737683089214, 'support': 1896}, 'accuracy': 0.734504132231405, 'macro avg': {'precision': 0.7079895812689334, 'recall': 0.7114162648181636, 'f1-score': 0.709562894870486, 'support': 2904}, 'weighted avg': {'precision': 0.736914545955266, 'recall': 0.734504132231405, 'f1-score': 0.7355885338557927, 'support': 2904}}, 'fold_2': {'0': {'precision': 0.44786729857819907, 'recall': 0.5220994475138122, 'f1-score': 0.48214285714285715, 'support': 724}, '1': {'precision': 0.7780628608082104, 'recall': 0.7224538415723645, 'f1-score': 0.7492279184681904, 'support': 1679}, 'accuracy': 0.6620890553474823, 'macro avg': {'precision': 0.6129650796932047, 'recall': 0.6222766445430883, 'f1-score': 0.6156853878055237, 'support': 2403}, 'weighted avg': {'precision': 0.6785782220006665, 'recall': 0.6620890553474823, 'f1-score': 0.6687578458924346, 'support': 2403}}, 'fold_3': {'0': {'precision': 0.7592592592592593, 'recall': 0.7377892030848329, 'f1-score': 0.7483702737940024, 'support': 778}, '1': {'precision': 0.8879736408566722, 'recall': 0.8988326848249028, 'f1-score': 0.8933701657458565, 'support': 1799}, 'accuracy': 0.8502134264648816, 'macro avg': {'precision': 0.8236164500579657, 'recall': 0.8183109439548678, 'f1-score': 0.8208702197699295, 'support': 2577}, 'weighted avg': {'precision': 0.849114584247131, 'recall': 0.8502134264648816, 'f1-score': 0.8495944901779315, 'support': 2577}}, 'fold_4': {'0': {'precision': 0.7819420783645656, 'recall': 0.4909090909090909, 'f1-score': 0.6031537450722734, 'support': 935}, '1': {'precision': 0.7745144481288488, 'recall': 0.9273964832671583, 'f1-score': 0.8440887971089313, 'support': 1763}, 'accuracy': 0.7761304670126019, 'macro avg': {'precision': 0.7782282632467072, 'recall': 0.7091527870881246, 'f1-score': 0.7236212710906023, 'support': 2698}, 'weighted avg': {'precision': 0.7770885156864453, 'recall': 0.7761304670126019, 'f1-score': 0.7605920314846633, 'support': 2698}}, 'fold_5': {'0': {'precision': 0.6645702306079665, 'recall': 0.29626168224299065, 'f1-score': 0.40982546864899805, 'support': 1070}, '1': {'precision': 0.7110514198004605, 'recall': 0.9205166418281172, 'f1-score': 0.802338168434726, 'support': 2013}, 'accuracy': 0.7038598767434318, 'macro avg': {'precision': 0.6878108252042134, 'recall': 0.6083891620355539, 'f1-score': 0.606081818541862, 'support': 3083}, 'weighted avg': {'precision': 0.6949194469052388, 'recall': 0.7038598767434318, 'f1-score': 0.6661109258882685, 'support': 3083}}}, 'svm': {'fold_1': {'0': {'precision': 0.8078947368421052, 'recall': 0.6091269841269841, 'f1-score': 0.6945701357466063, 'support': 1008}, '1': {'precision': 0.816231343283582, 'recall': 0.9229957805907173, 'f1-score': 0.8663366336633663, 'support': 1896}, 'accuracy': 0.8140495867768595, 'macro avg': {'precision': 0.8120630400628437, 'recall': 0.7660613823588507, 'f1-score': 0.7804533847049864, 'support': 2904}, 'weighted avg': {'precision': 0.8133376451799289, 'recall': 0.8140495867768595, 'f1-score': 0.8067152046344084, 'support': 2904}}, 'fold_2': {'0': {'precision': 0.6768488745980707, 'recall': 0.5814917127071824, 'f1-score': 0.62555720653789, 'support': 724}, '1': {'precision': 0.8298708590679393, 'recall': 0.8802858844550328, 'f1-score': 0.854335260115607, 'support': 1679}, 'accuracy': 0.7902621722846442, 'macro avg': {'precision': 0.753359866833005, 'recall': 0.7308887985811076, 'f1-score': 0.7399462333267486, 'support': 2403}, 'weighted avg': {'precision': 0.7837668570886698, 'recall': 0.7902621722846442, 'f1-score': 0.7854067079765029, 'support': 2403}}, 'fold_3': {'0': {'precision': 0.875, 'recall': 0.6658097686375322, 'f1-score': 0.7562043795620439, 'support': 778}, '1': {'precision': 0.8690176322418136, 'recall': 0.9588660366870484, 'f1-score': 0.9117336152219874, 'support': 1799}, 'accuracy': 0.8703919285991463, 'macro avg': {'precision': 0.8720088161209067, 'recall': 0.8123379026622903, 'f1-score': 0.8339689973920157, 'support': 2577}, 'weighted avg': {'precision': 0.8708237176573623, 'recall': 0.8703919285991463, 'f1-score': 0.8647791156707898, 'support': 2577}}, 'fold_4': {'0': {'precision': 0.7866004962779156, 'recall': 0.6780748663101605, 'f1-score': 0.7283170591614014, 'support': 935}, '1': {'precision': 0.8409090909090909, 'recall': 0.9024390243902439, 'f1-score': 0.8705882352941177, 'support': 1763}, 'accuracy': 0.8246849518161601, 'macro avg': {'precision': 0.8137547935935032, 'recall': 0.7902569453502022, 'f1-score': 0.7994526472277595, 'support': 2698}, 'weighted avg': {'precision': 0.822088284393098, 'recall': 0.8246849518161601, 'f1-score': 0.821283732075404, 'support': 2698}}, 'fold_5': {'0': {'precision': 0.8821192052980132, 'recall': 0.6224299065420561, 'f1-score': 0.7298630136986302, 'support': 1070}, '1': {'precision': 0.8264604810996563, 'recall': 0.9557873820168902, 'f1-score': 0.8864316977654917, 'support': 2013}, 'accuracy': 0.8400908206292572, 'macro avg': {'precision': 0.8542898431988348, 'recall': 0.7891086442794731, 'f1-score': 0.8081473557320609, 'support': 3083}, 'weighted avg': {'precision': 0.8457776510290244, 'recall': 0.8400908206292572, 'f1-score': 0.8320922582742357, 'support': 3083}}}, 'dt': {'fold_1': {'0': {'precision': 0.587527352297593, 'recall': 0.5327380952380952, 'f1-score': 0.558792924037461, 'support': 1008}, '1': {'precision': 0.7633165829145728, 'recall': 0.8011603375527426, 'f1-score': 0.7817807514153372, 'support': 1896}, 'accuracy': 0.7079889807162535, 'macro avg': {'precision': 0.6754219676060829, 'recall': 0.6669492163954189, 'f1-score': 0.6702868377263991, 'support': 2904}, 'weighted avg': {'precision': 0.7022988334442163, 'recall': 0.7079889807162535, 'f1-score': 0.7043800179453306, 'support': 2904}}, 'fold_2': {'0': {'precision': 0.4780821917808219, 'recall': 0.48204419889502764, 'f1-score': 0.4800550206327373, 'support': 724}, '1': {'precision': 0.775851763299462, 'recall': 0.7730792138177487, 'f1-score': 0.7744630071599046, 'support': 1679}, 'accuracy': 0.6853932584269663, 'macro avg': {'precision': 0.626966977540142, 'recall': 0.6275617063563882, 'f1-score': 0.627259013896321, 'support': 2403}, 'weighted avg': {'precision': 0.686136752987562, 'recall': 0.6853932584269663, 'f1-score': 0.685760808971944, 'support': 2403}}, 'fold_3': {'0': {'precision': 0.5406360424028268, 'recall': 0.589974293059126, 'f1-score': 0.5642286416717885, 'support': 778}, '1': {'precision': 0.8153935185185185, 'recall': 0.783212896053363, 'f1-score': 0.7989793025233909, 'support': 1799}, 'accuracy': 0.7248738843616609, 'macro avg': {'precision': 0.6780147804606727, 'recall': 0.6865935945562445, 'f1-score': 0.6816039720975897, 'support': 2577}, 'weighted avg': {'precision': 0.7324438419884416, 'recall': 0.7248738843616609, 'f1-score': 0.7281077409624493, 'support': 2577}}, 'fold_4': {'0': {'precision': 0.6010471204188481, 'recall': 0.613903743315508, 'f1-score': 0.6074074074074074, 'support': 935}, '1': {'precision': 0.7928858290304074, 'recall': 0.7838910947249007, 'f1-score': 0.7883628066172277, 'support': 1763}, 'accuracy': 0.7249814677538917, 'macro avg': {'precision': 0.6969664747246278, 'recall': 0.6988974190202044, 'f1-score': 0.6978851070123175, 'support': 2698}, 'weighted avg': {'precision': 0.7264035486183215, 'recall': 0.7249814677538917, 'f1-score': 0.7256521697524456, 'support': 2698}}, 'fold_5': {'0': {'precision': 0.5905349794238683, 'recall': 0.5364485981308411, 'f1-score': 0.5621939275220372, 'support': 1070}, '1': {'precision': 0.7650402652771199, 'recall': 0.8022851465474417, 'f1-score': 0.7832201745877789, 'support': 2013}, 'accuracy': 0.7100227051573142, 'macro avg': {'precision': 0.6777876223504942, 'recall': 0.6693668723391414, 'f1-score': 0.672707051054908, 'support': 3083}, 'weighted avg': {'precision': 0.7044756672028485, 'recall': 0.7100227051573142, 'f1-score': 0.7065098001601618, 'support': 3083}}}}
for clf, r in results.items():
    for mag, d in r.items():
        avg_accuracy = 0
        for fold, data in d.items():
            avg_accuracy += data["accuracy"]
    
        if (len(d) > 0):
            avg_accuracy = avg_accuracy / len(d)
            print(clf, mag, "avg accuracy:", avg_accuracy)

knn 40X avg accuracy: 0.7038598767434318
svm 40X avg accuracy: 0.8400908206292572
dt 40X avg accuracy: 0.7169964320467077
