In [None]:
# import required packages

# source: https://www.kaggle.com/allunia/patterns-of-colorectal-cancer-wally

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib
import matplotlib.gridspec as gridspec

import seaborn as sns
sns.set(style='darkgrid')

# Import classifiers used:
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier

# Data Transformation
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# Import ovo and ovr related stuff
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import VotingClassifier

# Pre-processing Data:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_score, cross_val_predict, GridSearchCV

# Import Metrics
from sklearn.metrics import f1_score, accuracy_score, precision_recall_curve, classification_report, multilabel_confusion_matrix, confusion_matrix, plot_confusion_matrix, plot_roc_curve

# Tensorflow
import tensorflow as tf

In [36]:
# Ensure that a GPU is detected.
# list_physical_devices() returns a (possibly empty) list and never None, so the
# original `is not None` comparison could not fail; check for a non-empty list instead.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
assert gpu_devices, 'GPU not detected'

In [None]:
# Read the 64x64 gray scale images, stored as one CSV row per image.
df = pd.read_csv('../01_Data_Files/hmnist_64_64_L.csv', engine='c')

# Shift the labels down by one so they start at 0 instead of 1.
df['label'] = df['label'].sub(1)

# Class names ordered by (zero-based) label; the lookup table is derived from
# this list so the two cannot drift apart.
class_names = ['Tumor', 'Stroma', 'Complex', 'Lymphoma', 'Debris', 'Mucosa', 'Adipose', 'Empty']
dict_class_names = dict(enumerate(class_names))
df['label_name'] = df['label'].map(dict_class_names)

In [None]:
# Features: every column except the numeric label and its human-readable name.
X = df.drop(columns=['label', 'label_name']).values
# Target: kept as a single-column 2-D array of shape (n_samples, 1).
y = df[['label']].values

In [None]:
# Restore the 64x64 image layout. -1 lets NumPy infer the number of samples
# instead of hard-coding 5000, so the cell also works on a subset of the data.
X = np.reshape(X, (-1, 64, 64))

In [None]:
# Sanity check: every image must have exactly one label. Fail loudly instead of
# only printing a boolean, so a mismatch cannot go unnoticed on "Run All".
assert len(X) == len(y), f'X and y differ in length: {len(X)} != {len(y)}'
print(f'Is the length of X & y equal? It should be: {len(X) == len(y)}')

In [None]:
# Split data into train & test.
# A fixed random_state makes the split (and every score below) reproducible;
# stratifying on the label keeps the class proportions equal in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=np.ravel(y)
)

In [None]:
# Build pipeline for SGD: standardise the features, then fit a linear
# classifier trained with stochastic gradient descent (default settings).
pipe_SGD = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('sgd', SGDClassifier()),
    ]
)

In [None]:
# scikit-learn estimators require 2-D (n_samples, n_features) input, but X_train
# holds 64x64 images; flatten each image to a feature vector before fitting.
# (The reshape is a no-op if X_train is already 2-D.)
pipe_SGD.fit(X_train.reshape(len(X_train), -1), np.ravel(y_train))

In [None]:
# Row-normalised confusion matrix of the SGD pipeline on the held-out set.
# The images are flattened because the pipeline expects 2-D input.
plot_confusion_matrix(
    estimator=pipe_SGD,
    X=X_test.reshape(len(X_test), -1),
    y_true=y_test,
    cmap=plt.cm.Blues,
    normalize='true',
    display_labels=class_names,
)
plt.grid(False)

In [None]:
clf_svm = svm.SVC(kernel='rbf', gamma='auto', C=0.9) # default params to begin with
# clf_svm_ex = svm.LinearSVC()
# SVC needs 2-D features and a 1-D target: flatten each 64x64 image and ravel
# the (n_samples, 1) label column, matching how pipe_SGD is fitted.
clf_svm.fit(X_train.reshape(len(X_train), -1), np.ravel(y_train))
# clf_svm_ex.fit(X_train_ex, y_train_ex)

In [None]:
# Row-normalised confusion matrix of the RBF SVM on the held-out set (left panel).
# The images are flattened because the classifier was fitted on 2-D input.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (20,10))
plot_confusion_matrix(
    estimator=clf_svm,
    X=X_test.reshape(len(X_test), -1),
    y_true=y_test,
    cmap=plt.cm.Blues,
    normalize='true',
    display_labels=class_names,
    ax=axes[0],
)
# plot_confusion_matrix(estimator=clf_svm_ex, X=X_test_ex, y_true=y_test_ex, cmap=plt.cm.Blues, normalize='true', display_labels=class_names, ax=axes[1])

In [None]:
# Two further baselines, both with default hyper-parameters.
clf_knn = KNeighborsClassifier(n_jobs=-1)  # n_jobs=-1: run the neighbour search in parallel
clf_gnb = GaussianNB()

In [None]:
# Both estimators need 2-D features and a 1-D target: flatten each 64x64 image
# and ravel the (n_samples, 1) label column before fitting.
X_train_flat = X_train.reshape(len(X_train), -1)
y_train_flat = np.ravel(y_train)
clf_gnb.fit(X_train_flat, y_train_flat)
clf_knn.fit(X_train_flat, y_train_flat)

In [None]:
# pred_gnb = confusion_matrix(y_test, clf_gnb.predict(X_test))
# pred_knn = confusion_matrix(y_test, clf_knn.predict(X_test))

In [None]:
from tensorflow.keras import layers, models

In [None]:
# Convolutional feature extractor for single-channel 64x64 inputs:
# three 3x3 convolutions (64 filters, ReLU) with 2x2 max-pooling after the
# first two. The dense classification layers are appended in a later cell.
model = models.Sequential([
    layers.Conv2D(64, (3, 3), activation='relu', input_shape=(64, 64, 1)),
    layers.MaxPool2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPool2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
])


In [None]:
# Print the layer table of the model as built so far (convolutional part only).
model.summary()

In [None]:
# Classification head: flatten the feature maps, one hidden dense layer, then
# 8 output units with no activation (raw scores, one per entry in class_names).
# NOTE: re-running this cell appends the head again; rebuild `model` first.
for head_layer in (
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(8),
):
    model.add(head_layer)

In [None]:
# Print the layer table again, now including the Flatten/Dense layers added above.
model.summary()

In [None]:
# Display the training array's shape before it is reshaped for the CNN.
X_train.shape

In [None]:
# Add the trailing channel axis expected by Conv2D and scale the pixel
# intensities to [0, 1]; unscaled inputs make gradient-based training unstable.
# NOTE(review): assumes 8-bit gray values (0-255) in the CSV; confirm.
x_train = X_train.reshape(-1, 64, 64, 1).astype('float32') / 255.0
x_test = X_test.reshape(-1, 64, 64, 1).astype('float32') / 255.0

In [None]:
# The last Dense layer has no activation, so the loss is told to expect logits;
# the sparse variant takes integer class labels directly (no one-hot encoding).
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Train for 20 epochs; the held-out set is passed as validation data and is
# scored after every epoch.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=20,
)

In [None]:
# Training vs. validation accuracy per epoch.
acc_fig, acc_ax = plt.subplots()
for metric_key in ('accuracy', 'val_accuracy'):
    acc_ax.plot(history.history[metric_key], label=metric_key)
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_ylim([0.5, 1])
acc_ax.legend(loc='lower right')

# Final loss and accuracy on the held-out set.
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)

In [None]:
# Test Multi-class classifiers to pick the best ones to further work on: 
# Starts as an empty list; nothing is appended to it in the visible part of the notebook.
multi_class_classifiers = []

# Default-parameter k-NN; a separate object from `clf_knn` (n_jobs=-1) created earlier.
knn_clf = KNeighborsClassifier()
