In [1]:
import os
import random
import warnings
import numpy as np
import pandas as pd
import seaborn as sn
import csv
import pickle

from tqdm import tqdm
from matplotlib import pyplot as plt


from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import auc, classification_report, confusion_matrix, roc_curve

warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import img_to_array, load_img

2022-11-26 10:40:25.663475: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-26 10:40:25.918259: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-26 10:40:25.918322: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-26 10:40:27.933263: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

In [2]:
# from google.colab import drive
# drive.mount('/content/gdrive')
# os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"
# %cd "/content/gdrive/MyDrive/Kaggle"

In [3]:
# Dataset location, relative to the working directory.
# (Colab alternative: base_path '/content/gdrive/MyDrive/Kaggle/' with
#  data_path 'BreaKHis_v1/BreaKHis_v1/histology_slides/breast'.)
base_path = '.'
data_path = 'BreaKHis_v1/histology_slides/breast'

# Sub-class directory names keyed by class name.
sub_classes = {
    'benign': [
        'adenosis',
        'fibroadenoma',
        'phyllodes_tumor',
        'tubular_adenoma',
    ],
    'malignant': [
        'ductal_carcinoma',
        'lobular_carcinoma',
        'mucinous_carcinoma',
        'papillary_carcinoma',
    ],
}
# The class names are exactly the keys above, in insertion order.
classes = list(sub_classes)

# Magnification directory names found under every sample directory.
magnifications = ['40X', '100X', '200X', '400X']

# Fold numbers 1..5.
n_folds = tuple(range(1, 6))


In [4]:
# Fit the label encoder on the class names so string labels can be turned
# into integer codes with encoder.transform(...).
encoder = LabelEncoder().fit(classes)
encoder

In [5]:
# When True, the dataset index is read back from the cached 'dataset.pkl'.
# When False, it is rebuilt from the image directories and the dsfold*.txt
# files, and the cache file is rewritten.
load_ds_from_pickle = True
# load_ds_from_pickle = False

In [6]:
# Build (or reload) the image index: one row per image file holding its sample
# directory name ('id'), file name, full path, magnification, sub-class
# ('type'), class ('lesion') and one train/test column per fold.
if load_ds_from_pickle:
    # NOTE: unpickling can execute arbitrary code. Only load a 'dataset.pkl'
    # that was produced by this notebook.
    dataset = pd.read_pickle('dataset.pkl')
else:
    # Collect rows in a plain list and build the frame once.
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call, which made this loop quadratic.
    records = []
    for clazz in classes:
        for sub_clazz in sub_classes[clazz]:
            path = os.path.join(base_path, data_path, clazz, "SOB", sub_clazz)
            # 'sample_dir' instead of 'id' so the builtin is not shadowed.
            for sample_dir in os.listdir(path):
                for magnification in magnifications:
                    path_to_files = os.path.join(path, sample_dir, magnification)
                    for file_name in os.listdir(path_to_files):
                        records.append({
                            'id': sample_dir,
                            'file_name': file_name,
                            'path': os.path.join(path_to_files, file_name),
                            'magnification': magnification,
                            'type': sub_clazz,
                            'lesion': clazz
                        })

    # Explicit columns keep the schema stable even if no file was found.
    dataset = pd.DataFrame(
        records,
        columns=['id', 'file_name', 'path', 'magnification', 'type', 'lesion'],
    )

    # 5-fold split: each dsfold<N>.txt assigns every file to a group ("grp").
    # The groups are joined side by side as columns fold_1 .. fold_5.
    # n_folds comes from the configuration cell.
    folds_df = None
    for nfold in n_folds:
        fold_file = f"dsfold{nfold}.txt"

        fd = pd.read_csv(fold_file, delimiter="|", names=[
                         "file_name", "magnification", "fold", "grp"])
        fd = fd[["file_name", "grp"]].rename(columns={"grp": f"fold_{nfold}"})
        if folds_df is None:
            folds_df = fd
        else:
            folds_df = folds_df.merge(fd, how="inner", on="file_name")

    # Inner join: only files that appear in every fold listing are kept.
    dataset = dataset.merge(folds_df, how="inner", on="file_name")

    dataset.to_pickle('dataset.pkl')

dataset.head()


Unnamed: 0,id,file_name,path,magnification,type,lesion,fold_1,fold_2,fold_3,fold_4,fold_5
0,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-005.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign,train,train,train,test,test
1,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-022.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign,train,train,train,test,test
2,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-027.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign,train,train,train,test,test
3,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-012.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign,train,train,train,test,test
4,SOB_B_A_14-22549AB,SOB_B_A-14-22549AB-40-017.png,./BreaKHis_v1/histology_slides/breast/benign/S...,40X,adenosis,benign,train,train,train,test,test


### Classifier Functions

In [7]:
def knn_clf(x_train, y_train, x_test, y_test, k=1):
    """Fit a k-nearest-neighbours classifier and score it on the test split.

    Args:
        x_train: Training feature matrix passed to ``fit``.
        y_train: Training labels.
        x_test: Feature matrix to predict on.
        y_test: True labels for ``x_test``.
        k: Number of neighbours. Defaults to 1, the value that used to be
            hard-coded, so existing four-argument calls behave as before.

    Returns:
        dict: The ``classification_report`` for the test predictions, in
        ``output_dict=True`` form.
    """
    # A distinct local name avoids shadowing this function inside its own body.
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    return classification_report(y_test, predictions, output_dict=True)

def svm_clf(x_train, y_train, x_test, y_test):
    """Train an ``SVC`` with default parameters and report test-set metrics.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels.
        x_test: Feature matrix to predict on.
        y_test: True labels for ``x_test``.

    Returns:
        dict: ``classification_report(..., output_dict=True)`` computed from
        the predictions on ``x_test``.
    """
    classifier = SVC().fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    return classification_report(y_test, y_pred, output_dict=True)

def dt_clf(x_train, y_train, x_test, y_test, random_state=None):
    """Fit a decision tree and score it on the test split.

    Args:
        x_train: Training feature matrix passed to ``fit``.
        y_train: Training labels.
        x_test: Feature matrix to predict on.
        y_test: True labels for ``x_test``.
        random_state: Forwarded to ``DecisionTreeClassifier``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to
            make the fitted tree reproducible between runs.

    Returns:
        dict: The ``classification_report`` for the test predictions, in
        ``output_dict=True`` form.
    """
    # A distinct local name avoids shadowing this function inside its own body.
    tree = DecisionTreeClassifier(random_state=random_state)
    tree.fit(x_train, y_train)
    predictions = tree.predict(x_test)

    return classification_report(y_test, predictions, output_dict=True)

In [8]:
# Size every image is resized to when it is loaded; the product of the entries
# is the length of the flattened feature vector fed to the classifiers.
# Presumably (height, width, channels) - TODO confirm against the image loader.
image_size = (46, 70, 3)

In [9]:
# One (initially empty) result dict per classifier key.
results = {clf_name: {} for clf_name in ("knn", "svm", "dt")}

In [11]:
def _load_split(frame):
    """Load the images listed in ``frame`` as flattened, scaled feature rows.

    Returns a ``(features, labels)`` pair: ``features`` has one row per image
    with pixel values divided by 255, ``labels`` are the encoded "lesion"
    values.
    """
    pixels, labels = [], []
    for _, row in tqdm(frame.iterrows(), total=len(frame)):
        # load_img takes (height, width); the third entry of image_size is
        # only needed for the flattened length below.
        image = load_img(row["path"], target_size=image_size[:2])
        pixels.append(img_to_array(image) / 255.0)
        labels.append(row["lesion"])

    features = np.reshape(np.asarray(pixels), (len(pixels), np.prod(image_size)))
    return features, encoder.transform(labels)


def run_clfs(clf, mag, n_fold):
    """Train and evaluate one classifier on one magnification and one fold.

    The report is stored in the module-level ``results`` dict under
    ``results[clf][mag][n_fold]``, and the whole ``results`` dict is then
    pickled to ``results/{clf}_{mag}_{n_fold}.pkl``.

    Args:
        clf: Classifier key: "knn", "svm" or "dt".
        mag: Value of the "magnification" column to select, e.g. "40X".
        n_fold: Name of the fold column in ``dataset`` (e.g. "fold_1"). Rows
            whose value is "train" form the training split; all other rows of
            that magnification form the test split.

    Raises:
        KeyError: If ``clf`` is not a known classifier key. This is raised
            before any image is loaded.
    """
    print("Classifier:", clf)
    print("Magnification:", mag)
    print("Fold:", n_fold)

    # Resolve the classifier first so a bad key fails fast.
    classifiers = {"knn": knn_clf, "svm": svm_clf, "dt": dt_clf}
    fit_and_report = classifiers[clf]

    # Build the train mask from the filtered frame itself, so the mask and the
    # frame share one index instead of relying on pandas to reindex a mask
    # taken from the full dataset.
    subset = dataset[dataset["magnification"] == mag]
    is_train = subset[n_fold] == "train"
    train_df = subset[is_train]
    test_df = subset[~is_train].reset_index(drop=True)

    x_train_, y_train_ = _load_split(train_df)
    x_test_, y_test_ = _load_split(test_df)

    report = fit_and_report(x_train_, y_train_, x_test_, y_test_)

    # setdefault keeps reports already stored for other folds of the same
    # classifier/magnification; assigning a fresh {} here would discard them.
    results.setdefault(clf, {}).setdefault(mag, {})[n_fold] = report

    print("Processed:", clf, mag, n_fold)

    # Create the output directory so open() cannot fail on a missing folder.
    os.makedirs("results", exist_ok=True)
    with open(f"results/{clf}_{mag}_{n_fold}.pkl", 'wb') as f:
        pickle.dump(results, f)