# imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from PIL import Image
import os
import shutil
import matplotlib.pyplot as plt

# dataloader

In [2]:
im = Image.open('cnn_data/benign/1.png')
# im.show()
im = np.array(im)
im = np.reshape(im, im.shape[0]*im.shape[1]*im.shape[2])
image_size = im.size
image_size
im.shape

(794106,)

In [3]:
data_path_img = 'cnn_data/'
folders = os.listdir(data_path_img)

shapes = set()
for folder in folders:
    for file in os.listdir(data_path_img+folder):
        path = f"{data_path_img}{folder}/{file}"
        # 
        im = Image.open(path)
        im = np.array(im)

        if (len(im.shape)==3 and im.shape[2]!=3) or (len(im.shape)==2):
            print('delete:', path)
            os.remove(path)


# for file in os.listdir(data_path_img):
#     im = Image.open(data_path_img+file)
#     im = np.array(im)
#     # print(f"file: {len(im.shape)}")
#     if len(im.shape) == 3 and im.shape[2]==2:
#         print(f"delete: {file}")
#         # os.remove(data_path_img+file)
#         # os.remove(data_path_mask+file)

In [4]:
class_mapping = {
    'benign': 0,
    'malignant': 1,
    'normal': 2,
}
data_path_img = 'cnn_data/'
folders = os.listdir(data_path_img)

paths, classes = [], []
for folder in folders:
    for file in os.listdir(data_path_img+folder):
        path = f"{data_path_img}{folder}/{file}"
        _class = class_mapping[folder]
        paths.append(path)
        classes.append(_class)
# paths, classes

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    paths, classes,
    test_size=.33,
    random_state=42
)

In [6]:
train_df = pd.DataFrame({'path': X_train, 'class': y_train})
val_df = pd.DataFrame({'path': X_test, 'class': y_test})
# train_df, val_df

In [7]:
train_df.sample(frac=1).reset_index(drop=True)

Unnamed: 0,path,class
0,cnn_data/normal/57.png,2
1,cnn_data/benign/295.png,0
2,cnn_data/malignant/57.png,1
3,cnn_data/malignant/90.png,1
4,cnn_data/malignant/77.png,1
...,...,...
511,cnn_data/benign/368.png,0
512,cnn_data/malignant/175.png,1
513,cnn_data/benign/379.png,0
514,cnn_data/normal/56.png,2


# DataGenerator

In [8]:
class CustomDataGenerator(tf.keras.utils.Sequence):
    
    def __init__(self, df, X_col, y_col, batch_size=16, shuffle=True):
        self.df = df.copy()
        self.df = self.df.sample(frac=1).reset_index(drop=True)

        self.X_col = X_col
        self.y_col = y_col
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.n = len(self.df)


    def on_epoch_end(self):
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)
    

    def __get_input(self, paths):
        out = []

        for path in paths:
            im = Image.open(path)
            im = im.crop((57,0,505,448))
            im = np.array(im)/255.
            im = np.reshape(im, im.shape[0]*im.shape[1]*im.shape[2])

            out.append(im)

        return out


    def __get_output(self, labels):
        out = []

        for label in labels:
            out.append(label)

        return out


    def __get_data(self, batches):
        paths = batches[self.X_col]
        X = self.__get_input(paths)
        labels = batches[self.y_col]
        y = self.__get_output(labels)

        X = np.array(X)
        y = np.array(y)
        X = tf.convert_to_tensor(X, dtype=tf.float32)
        y = tf.convert_to_tensor(y, dtype=tf.int32)
        return X,y


    def __getitem__(self, index):
        batches = self.df.iloc[index*self.batch_size:(index+1)*self.batch_size]
        X,y = self.__get_data(batches)
        return X,y


    def get_all_data(self):
        paths = self.df[self.X_col]
        labels = self.df[self.y_col]

        outX = []

        for path in paths:
            im = Image.open(path)
            im = im.crop((57,0,505,448))
            im = np.array(im)/255.
            im = np.reshape(im, im.shape[0]*im.shape[1]*im.shape[2])

            outX.append(im)

        outy = []

        for label in labels:
            outy.append(label)

        return outX, outy


    def __len__(self):
        return self.n // self.batch_size


In [9]:
data_dir = 'cnn_data/'
BATCH_SIZE=16

traingen = CustomDataGenerator(train_df, 'path', 'class', BATCH_SIZE)
valgen = CustomDataGenerator(val_df, 'path', 'class', BATCH_SIZE)
traingen, valgen

(<__main__.CustomDataGenerator at 0x16c39eaf0>,
 <__main__.CustomDataGenerator at 0x16c39e100>)

In [10]:
img = np.array(traingen.__getitem__(0)[0][0]*255, dtype=int)
label = np.array(traingen.__getitem__(0)[1][0])

In [11]:
img.shape

(602112,)

In [12]:
label

array(0, dtype=int32)

# SVM

In [13]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# param_grid={
#     'kernel':['rbf','poly']
# }

svm = svm.SVC(probability=True, kernel='poly')
# model = GridSearchCV(svc, param_grid)
model

NameError: name 'model' is not defined

In [None]:
train_X, train_y = traingen.get_all_data()

In [None]:
svm.fit(train_X, train_y)

GridSearchCV(estimator=SVC(probability=True),
             param_grid={'kernel': ['rbf', 'poly']})

In [None]:
# model.best_params_
# {'kernel': 'poly'}

{'kernel': 'poly'}

In [None]:
del train_X
del train_y

In [None]:
val_X, val_y = valgen.get_all_data()

In [None]:
from sklearn.metrics import f1_score

pred = svm.predict(val_X)
acc = val_y

f1_score(acc, pred, average='micro')

0.6588235294117647

In [None]:
del val_X
del val_y

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid={
    'n_estimators':[50, 100, 150, 200],
}

rf = RandomForestClassifier(n_estimators=200)
# model = GridSearchCV(rf, param_grid)
rf

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [50, 100, 150, 200]})

In [None]:
train_X, train_y = traingen.get_all_data()

In [None]:
rf.fit(train_X, train_y)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [50, 100, 150, 200]})

In [None]:
# model.best_params_
# {'n_estimators': 200}

{'n_estimators': 200}

In [None]:
del train_X
del train_y

In [None]:
val_X, val_y = valgen.get_all_data()

In [None]:
from sklearn.metrics import f1_score

pred = rf.predict(val_X)
acc = val_y

f1_score(acc, pred, average='micro')

0.6470588235294118

In [None]:
del val_X
del val_y

# another model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

param_grid={
    'n_neighbors':[5, 10, 15],
}

knn = KNeighborsClassifier(n_neighbors=5)
# model = GridSearchCV(knn, param_grid)
knn

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [5, 10, 15]})

In [None]:
train_X, train_y = traingen.get_all_data()

In [None]:
knn.fit(train_X, train_y)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [5, 10, 15]})

In [None]:
# model.best_params_
# {'n_neighbors': 5}

{'n_neighbors': 5}

In [None]:
del train_X
del train_y

In [None]:
val_X, val_y = valgen.get_all_data()

In [None]:
from sklearn.metrics import f1_score

pred = knn.predict(val_X)
acc = val_y

f1_score(acc, pred, average='micro')

0.5686274509803921

In [None]:
del val_X
del val_y

# another