In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing import image
import os
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Activation, Conv2DTranspose, Conv2D, MaxPooling2D, Input, UpSampling2D, Dense, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras import regularizers
from sklearn.model_selection import train_test_split
import glob
import cv2
from tqdm import tqdm
from PIL import Image
from matplotlib.pyplot import imshow
import matplotlib.image as img

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix
from imblearn.metrics import geometric_mean_score,specificity_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [None]:
train_yes ='../input/brain-mri-images-for-brain-tumor-detection/yes/'
train_no ='../input/brain-mri-images-for-brain-tumor-detection/no/'
#train_pred ='../input/brain-tumor-detection/pred/'

In [None]:
print('Total Tumor images:', len(os.listdir(train_yes)))
print('Total Non-Tumor images:', len(os.listdir(train_no)))

Label encoding: Non-Tumor = 0, Tumor = 1.

In [None]:
# Build the label vectors (Tumor = 1, Non-Tumor = 0) from the number of files
# actually present in each class folder instead of hard-coding the dataset
# sizes, so the labels stay aligned with the images if the dataset changes.
y_train_yes = np.ones(len(os.listdir(train_yes)))
y_train_no = np.zeros(len(os.listdir(train_no)))

In [None]:
def _load_grayscale_images(directory, target_size=(128, 128)):
    """Load every file in `directory` as a grayscale image array.

    Files are read in sorted filename order so the resulting array is the
    same on every run; `os.listdir` alone returns entries in arbitrary order.

    Parameters
    ----------
    directory : path of the folder holding the image files.
    target_size : (height, width) each image is resized to while loading.

    Returns
    -------
    numpy array stacking one `image.img_to_array` result per file.
    """
    arrays = []
    for filename in sorted(os.listdir(directory)):
        # A dedicated local name avoids rebinding `img`, which the import cell
        # uses as the alias for `matplotlib.image`.
        picture = image.load_img(os.path.join(directory, filename),
                                 target_size=target_size, color_mode='grayscale')
        arrays.append(image.img_to_array(picture))
    return np.array(arrays)


yes_train = _load_grayscale_images(train_yes)
no_train = _load_grayscale_images(train_no)

In [None]:
X_train = np.concatenate((yes_train, no_train),axis=0)

In [None]:
def show_data(X, n=5, title=""):
    """Display the first `n` images of `X` side by side in grayscale.

    Parameters
    ----------
    X : sequence of image arrays accepted by `image.array_to_img`.
    n : maximum number of images to draw; clamped to `len(X)` so a short
        input no longer raises IndexError.
    title : optional figure title; no title is drawn when it is empty.
    """
    n = min(n, len(X))
    plt.figure(figsize=(20, 20))
    for i in range(n):
        plt.subplot(2, n, i + 1)
        plt.imshow(image.array_to_img(X[i]), cmap='gray')
    if title:
        plt.suptitle(title, fontsize=20)

show_data(X_train)

In [None]:
y_train = np.concatenate((y_train_yes, y_train_no),axis=0)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
tf.__version__

In [None]:
class Sampling(layers.Layer):
    """Reparameterisation layer: draws z from (z_mean, z_log_var)."""

    def call(self, inputs):
        """Return z_mean + exp(0.5 * z_log_var) * epsilon, epsilon ~ N(0, 1)."""
        mean, log_var = inputs
        mean_shape = tf.shape(mean)
        # One noise value per (sample, latent dimension) pair.
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std = tf.exp(0.5 * log_var)
        return mean + std * noise

In [None]:
X_train.shape[1:]

In [None]:
# Size of the latent code. 432 = 18 * 24, the grid used later in the notebook
# when the codes are displayed as images.
latent_dim = 432

# Encoder: two conv + 2x2 max-pool stages on a 128x128 single-channel input,
# flattened and projected to the mean and log-variance of the latent Gaussian.
input_layer = Input(shape=(128, 128, 1), name="INPUT")
x = Conv2D(12, 3, padding="same", activation='relu')(input_layer) #activation="relu"
x = MaxPooling2D((2, 2))(x)
#x = LeakyReLU()(x)
x = Conv2D(6, 3, padding="same", activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
#x = LeakyReLU()(x)
# NOTE(review): the triple-quoted string below is a disabled third conv stage
# kept as a bare string expression; it has no effect on the model.
'''x = Conv2D(3, 3, padding="same")(x)
x = MaxPooling2D((2, 2))(x)
x = LeakyReLU()(x)'''
x = layers.Flatten()(x)
# Two parallel heads parameterise the approximate posterior.
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
# Sampling draws z from (z_mean, z_log_var).
z = Sampling()([z_mean, z_log_var])
# The encoder exposes all three tensors; index 2 is the sampled code.
encoder = keras.Model(input_layer, [z_mean, z_log_var, z], name="encoder")
encoder.summary()

In [None]:
# Import plot_model from tf.keras so the utility comes from the same Keras
# implementation that built the models; the standalone
# `keras.utils.vis_utils` module is a separate package and is not available
# in recent Keras releases.
from tensorflow.keras.utils import plot_model
plot_model(encoder, show_shapes=True, to_file='encoder_network.png', dpi=600)

In [None]:
# Decoder: expands a latent vector to 6144 = 32 * 32 * 6 units, reshapes to a
# 32x32x6 feature map, then applies two transposed-conv + 2x upsampling stages.
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(6144, activation="relu")(latent_inputs)
x = layers.Reshape((32, 32, 6))(x)

# NOTE(review): the triple-quoted string below is a disabled extra stage kept
# as a bare string expression; it has no effect on the model.
'''x = layers.Conv2DTranspose(3, 3, padding="same",activation="relu")(x) #activation="relu"
x = UpSampling2D((2, 2))(x)
x = LeakyReLU()(x)'''
x = layers.Conv2DTranspose(6, 3, padding="same",activation="relu")(x)
x = UpSampling2D((2, 2))(x)
#x = LeakyReLU()(x)
x = layers.Conv2DTranspose(12, 3,  padding="same",activation="relu")(x)
x = UpSampling2D((2, 2))(x)
#x = LeakyReLU()(x)

# Sigmoid keeps the single output channel in [0, 1], which is what the
# binary cross-entropy reconstruction loss in the VAE class is applied to.
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same")(x)
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()

In [None]:
plot_model(decoder, show_shapes=True, to_file='decoder_network.png', dpi=600)

In [None]:
class VAE(keras.Model):
    """Variational autoencoder built from an encoder and a decoder model.

    The encoder must return (z_mean, z_log_var, z); the decoder maps z back
    to an image. Training minimises reconstruction loss + KL divergence.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Running means of each loss term, returned from train_step.
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """The three loss trackers: total, reconstruction, KL."""
        trackers = (
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        )
        return list(trackers)

    def train_step(self, data):
        """Run one optimisation step on a batch and return the running losses."""
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            decoded = self.decoder(z)
            # Per-pixel cross-entropy, summed over the two spatial axes and
            # averaged over the batch.
            bce_map = keras.losses.binary_crossentropy(data, decoded)  # mean_squared_error binary_crossentropy
            reconstruction_loss = tf.reduce_mean(tf.reduce_sum(bce_map, axis=(1, 2)))
            # KL divergence between N(z_mean, exp(z_log_var)) and N(0, 1),
            # summed over latent dimensions and averaged over the batch.
            kl_terms = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_terms, axis=1))
            total_loss = reconstruction_loss + kl_loss

        weights = self.trainable_weights
        gradients = tape.gradient(total_loss, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))

        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)

        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

In [None]:
# Scale pixel values to [0, 1]. The array is rebuilt from the raw image stacks
# so that re-running this cell does not divide by 255 a second time.
X_train = np.concatenate((yes_train, no_train), axis=0).astype('float32') / 255.0

When n = 64, i.e. n << d (n = latent_dim, d = original_dim), the gradients start exploding. Hence, a latent dimension of 432 is used.

In [None]:
# Stop once the reconstruction loss has not improved for 6 consecutive epochs
# and roll the model back to the best weights seen.
early_stop = EarlyStopping(
    monitor='reconstruction_loss',
    mode='min',
    min_delta=0,
    patience=6,
    restore_best_weights=True,
)

vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam(learning_rate = 5 * 0.0001))
vae.fit(
    X_train,
    batch_size=256,
    epochs=500,
    shuffle=True,
    callbacks=[early_stop],
)

In [None]:
# Reconstruction-loss curve recorded during VAE training (one point per epoch).
plt.plot(vae.history.history['reconstruction_loss'])
plt.gca().set(title='reconstruction loss', xlabel='epoch', ylabel='loss')
plt.show()

In [None]:
# KL-divergence curve recorded during VAE training (one point per epoch).
plt.plot(vae.history.history['kl_loss'])
plt.gca().set(title='KLD loss', xlabel='epoch', ylabel='loss')
plt.show()

In [None]:
# Total-loss curve (reconstruction + KL) recorded during VAE training.
plt.plot(vae.history.history['loss'])
plt.gca().set(title='Loss', xlabel='epoch', ylabel='loss')
plt.show()

In [None]:
_,_,z = vae.encoder.predict(X_train)

In [None]:
z.shape

In [None]:
def show_encoded_data(X, n=5, height=18, width=24, title=""):
    """Display the first `n` latent codes of `X` as grayscale images.

    Parameters
    ----------
    X : array of latent codes; each row must hold height * width values.
    n : maximum number of codes to draw; clamped to `len(X)` so a short
        input no longer raises IndexError.
    height, width : grid each code is reshaped to before plotting.
    title : figure title.
    """
    n = min(n, len(X))
    plt.figure(figsize=(20, 20))
    for i in range(n):
        ax = plt.subplot(2, n, i + 1)
        plt.imshow(X[i].reshape((height, width)), cmap='gray')
        # The axes carry no meaning for a latent code, so hide them.
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
    plt.suptitle(title, fontsize = 20)

show_encoded_data(z, height= 18, width= 24) #height= 18, width= 24)

In [None]:
def show_image(x):
    """Draw array `x` as a grayscale image on the current matplotlib axes."""
    picture = image.array_to_img(x)
    plt.imshow(picture, cmap='gray')

In [None]:
def visualize(img,encoder,decoder):
    """Draw the original image, its latent code and its reconstruction.

    Parameters
    ----------
    img : a single image array without the batch axis.
    encoder : model whose predict output holds the sampled code at index 2.
    decoder : model mapping a batch of codes back to images.
    """
    # img[None] adds the leading batch axis the encoder expects.
    # The models passed in are used, not the global `vae`, so the function
    # works with whichever encoder/decoder pair the caller supplies.
    code = encoder.predict(img[None])[2]
    reco = decoder.predict(code)[0]

    plt.subplot(1,3,1)
    plt.title("Original")
    show_image(img)

    plt.subplot(1,3,2)
    plt.title("Code")
    # Lay the flat code out as a grid with code_length // 24 rows.
    plt.imshow(code.reshape([code.shape[-1]//24,-1]),cmap='gray')

    plt.subplot(1,3,3)
    plt.title("Reconstructed")
    show_image(reco)
    plt.show()

# `sample` instead of `img` keeps the `matplotlib.image as img` alias from the
# import cell intact.
for sample in X_train[:5]:
    visualize(sample,encoder,decoder)

In [None]:
pd.DataFrame(z).to_csv('Train_Encoded.csv', index = False)

In [None]:
X_train_df = pd.read_csv("Train_Encoded.csv")
y_df = pd.DataFrame(y_train, columns = ['Class'])
Y = y_df.Class
X_train_df['Class'] = Y.values
X_train_df.to_csv('Labeled_Train_Encoded.csv', index = False)

In [None]:
train_X = X_train_df.iloc[:, :-1].to_numpy()

In [None]:
X_train_df

In [None]:
data_df = pd.read_csv('../input/200-epochs-432sized/Labeled_Train_Encoded.csv')
data_df.head()

In [None]:
data_df['Class'] = data_df['Class'].astype('int')

In [None]:
initial_X_train = data_df.iloc[:, :-1].to_numpy()
initial_y_train = data_df.Class

In [None]:
# Stratified 85/15 split. A fixed random_state makes the split, and every
# score computed on xx_test, reproducible across kernel restarts. The label
# Series is passed to `stratify` directly (Series.ravel() is deprecated).
xx_train, xx_test, yy_train, yy_test = train_test_split(
    initial_X_train, initial_y_train,
    test_size=0.15, shuffle=True, stratify=initial_y_train, random_state=42)
print(f"X_train Shape: {xx_train.shape}\nX_test Shape: {xx_test.shape}\ny_train Shape: {yy_train.shape}\ny_test Shape:{yy_test.shape}")

In [None]:
yy_train.isnull().values.any()

Training set: 2550 samples Testing set: 450 samples

In [None]:
training_set_df = pd.DataFrame(xx_train)
y_df = pd.DataFrame(yy_train, columns = ['Class'])
Y = y_df.Class
training_set_df['Class'] = Y.values
training_set_df

Out of 2550 Training samples: 10% Labeled samples (i.e. 255) and 90% Unlabeled samples (i.e. 2295)

In [None]:
# Keep 10% of the training rows as labeled data; the remaining 90% form the
# unlabeled pool for self-training.
xx_labeled, xx_unlabeled, yy_labeled, yy_unlabeled = train_test_split(xx_train, yy_train,test_size=0.9 ) #,shuffle=True, stratify= yy_train.ravel())
# The last field now reports yy_unlabeled (it previously printed xx_unlabeled's shape).
print(f"X_labeled Shape: {xx_labeled.shape}\nX_unlabeled Shape: {xx_unlabeled.shape}\ny_labeled Shape: {yy_labeled.shape}\ny_unlabeled Shape:{yy_unlabeled.shape}")

In [None]:
labeled_df = pd.DataFrame(xx_labeled)
y0_df = pd.DataFrame(yy_labeled, columns = ['Class'])
Y0 = y0_df.Class
labeled_df['Class'] = Y0.values
labeled_df.head()

In [None]:
unlabeled_df = pd.DataFrame(xx_unlabeled)
y1_df = pd.DataFrame(yy_unlabeled, columns = ['Class'])
Y1 = y1_df.Class
unlabeled_df['Class'] = Y1.values
unlabeled_df.head()

In [None]:
X_unlabeled = unlabeled_df.drop(['Class'], axis=1)

In [None]:
X_train = labeled_df.iloc[:, :-1]
y_train = labeled_df.Class

In [None]:
print("Labeled X",len(X_train))
print("Labeled y",len(y_train))
print("Unlabeled X",len(X_unlabeled))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Small fully connected binary classifier on the 432-dimensional encoded
# features, trained on the full (labeled) training split.
model = Sequential([
    Dense(12, input_shape=(432,), activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(xx_train, yy_train, batch_size=10, epochs=10)

In [None]:
from xgboost import XGBClassifier

# Keep the XGBoost classifier under its own name. Rebinding `model` here
# replaced the Keras network, so the later `model.predict(xx_test, verbose=0)`
# call was sent to XGBClassifier, whose predict() has no `verbose` argument.
xgb_model = XGBClassifier()
xgb_model.fit(xx_train, yy_train)

In [None]:
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(xx_train, yy_train)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(random_state=0)
clf.fit(xx_train, yy_train)

In [None]:
y_pred = clf.predict(xx_test)

In [None]:
# Predict tumor probabilities for the test set once ...
yhat_probs = model.predict(xx_test, verbose=0)
# ... and threshold them at 0.5 to get crisp class labels. Reusing yhat_probs
# avoids a second forward pass; ravel() gives the 1-D label vector the
# sklearn metric functions expect.
y_pred = (yhat_probs > 0.5).astype("int32").ravel()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# accuracy:  (tp + tn) / (p + n)
# precision: tp / (tp + fp)
# recall:    tp / (tp + fn)
# f1:        2 tp / (2 tp + fp + fn)
accuracy = accuracy_score(yy_test, y_pred)
precision = precision_score(yy_test, y_pred)
recall = recall_score(yy_test, y_pred)
f1 = f1_score(yy_test, y_pred)

print('Accuracy: %f' % accuracy)
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('F1 score: %f' % f1)

In [None]:
#Baseline
# Logistic regression fitted on the labeled subset only; the reference point
# for the self-training experiments that follow.
clf0 = LogisticRegression(max_iter=1000)

clf0.fit(X_train, y_train)
y_hat_test = clf0.predict(xx_test)
# Probability of the positive class (Tumor = 1), used for ROC AUC.
y_score_test = clf0.predict_proba(xx_test)[:, 1]

acc_test = accuracy_score(yy_test, y_hat_test)
f1_test = f1_score(yy_test, y_hat_test)
PrecisionScore_test = precision_score(yy_test , y_hat_test)
RecallScore_test = recall_score(yy_test , y_hat_test)
g_mean_test = geometric_mean_score(yy_test, y_hat_test)

# AUC is computed from the continuous scores: feeding hard 0/1 predictions
# collapses the ROC curve to a single operating point.
auc = roc_auc_score(yy_test, y_score_test)
rounded_auc = round(auc,4)

print(f"Test Accuracy Score: {round(acc_test,4)}")
print(f"Test f1 Score: {round(f1_test,4)}")
print(f"Test Precision Score: {round(PrecisionScore_test,4)}")
print(f"Test Recall Score: {round(RecallScore_test,4)}")
print(f"Test GM Score: {round(g_mean_test,4)}")
print(f"Test AUC Score: {rounded_auc}")

plot_confusion_matrix(clf0, xx_test, yy_test, cmap='Blues', normalize='true',
                     display_labels=['No Tumor.', 'Tumor']);

Base classifiers: 1. Logistic Regression, 2. Naive Bayes, 3. MLP, 4. SVC, 5. Ensemble of classifiers.

1. Logistic Regression

In [None]:
# Grid search over key hyperparameters for logistic regression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# The search is fitted on the labeled training data (X_train, y_train); the
# synthetic make_blobs dataset left over from the tutorial this cell was
# adapted from was never used and has been removed.
# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
#---------------------------------------------------Top k-most confident predictions------------------------
# Initiate iteration counter
iterations = 0

# Containers to hold f1_scores and # of pseudo-labels
test_f1s = []
test_accs = []
test_precs = []
test_recs = []
#test_gmeans = []
#test_aucs = []
test_spss = []

pseudo_labels = []

# Assign value to initiate while loop
high_prob = [1]

# Loop will run until there are no more high-probability pseudo-labels
while len(X_unlabeled) > 100:
    # Fit classifier and make train/test predictions
    clf1 = LogisticRegression(max_iter=1000)
    clf1.fit(X_train, y_train)
    y_hat_train = clf1.predict(X_train)
    y_hat_test = clf1.predict(xx_test)
    # Calculate and print iteration and scores

    acc_test = accuracy_score(yy_test, y_hat_test)
    f1_test = f1_score(yy_test, y_hat_test)
    PrecisionScore_test = precision_score(yy_test , y_hat_test)
    RecallScore_test = recall_score(yy_test , y_hat_test)
    sps_test = specificity_score(yy_test, y_hat_test)

    #fpr, tpr, thresholds = roc_curve(yy_test, y_hat_test)
    #auc = roc_auc_score(yy_test, y_hat_test)
    #rounded_auc = round(auc,4)

    print(f"Iteration {iterations}")
    print(f"Test Accuracy Score: {round(acc_test,4)}")
    print(f"Test f1 Score: {round(f1_test,4)}")
    print(f"Test Precision Score: {round(PrecisionScore_test,4)}")
    print(f"Test Recall Score: {round(RecallScore_test,4)}")
    print(f"Test Specificity Score: {round(sps_test,4)}")
    #print(f"Test AUC Score: {rounded_auc}")

    test_f1s.append(round(f1_test,4))
    test_accs.append(round(acc_test,4))
    test_precs.append(round(PrecisionScore_test,4))
    test_recs.append(round(RecallScore_test,4))
    test_spss.append(round(sps_test,4))

    print(confusion_matrix(yy_test, y_hat_test))
    print(classification_report(yy_test, y_hat_test))

    # Generate predictions and probabilities for unlabeled data
    print(f"Now predicting labels for unlabeled data...")

    pred_probs = clf1.predict_proba(X_unlabeled)
    preds = clf1.predict(X_unlabeled)
    prob_0 = pred_probs[:,0]
    prob_1 = pred_probs[:,1]
    # Store predictions and probabilities in dataframe
    df_pred_prob = pd.DataFrame([])
    df_pred_prob['preds'] = preds
    df_pred_prob['prob_0'] = prob_0
    df_pred_prob['prob_1'] = prob_1
    df_pred_prob.index = X_unlabeled.index

    prob_mapped_df = pd.concat([X_unlabeled, df_pred_prob],axis=1)
    c0_df = prob_mapped_df[(prob_mapped_df['preds'] == 0 )]
    c1_df = prob_mapped_df[(prob_mapped_df['preds'] == 1 )]

    c0_df = c0_df.sort_values(by ='prob_0' , ascending=False)
    top_k_df0 = c0_df.iloc[:50]

    c1_df = c1_df.sort_values(by ='prob_1' , ascending=False)
    top_k_df1 = c1_df.iloc[:50]

    high_prob = pd.concat([top_k_df0,top_k_df1], axis=0)
    high_prob = high_prob.loc[:, high_prob.columns.intersection(['preds','prob_0','prob_1'])]
    #print(high_prob.head())


    print(f"{len(high_prob)} high-probability predictions added to training data.")

    pseudo_labels.append(len(high_prob))

    # Add pseudo-labeled data to training data
    X_train = pd.concat([X_train, X_unlabeled.loc[high_prob.index]], axis=0)
    y_train = pd.concat([y_train, high_prob.preds])
    # Drop pseudo-labeled instances from unlabeled data
    X_unlabeled = X_unlabeled.drop(index=high_prob.index)
    print(f"{len(X_unlabeled)} unlabeled instances remaining.\n")

    # Update iteration counter
    iterations += 1

In [None]:
# Top panel: test F1 per self-training round.
# Bottom panel: number of pseudo-labels added in each round.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6,8))
ax1.plot(range(iterations), test_f1s)
ax1.set(ylabel='f1 Score')
ax2.bar(x=range(iterations), height=pseudo_labels)
ax2.set(ylabel='Pseudo-Labels Created', xlabel='# Iterations')

# Row-normalised confusion matrix of the last fitted classifier on the test set.
plot_confusion_matrix(clf1, xx_test, yy_test,
                      display_labels=['No Tumor.', 'Tumor'],
                      normalize='true', cmap='Blues');

2. Gaussian Naive Bayes

In [None]:
nb_classifier = GaussianNB()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
gs_NB = GridSearchCV(estimator=nb_classifier,
                 param_grid=params_NB,
                 cv=cv,   # use any cross validation technique
                 verbose=1,
                 scoring='accuracy')
# Tune on the originally labeled rows only. X_train/y_train may already hold
# pseudo-labels appended by an earlier self-training loop, which would bias
# the search towards another classifier's guesses.
gs_NB.fit(labeled_df.iloc[:, :-1], labeled_df.Class)

gs_NB.best_params_

In [None]:
#---------------------------------------------------Top k-most confident predictions------------------------
# Work on this loop's own copies of the original labeled/unlabeled split.
# An earlier self-training loop grows X_train/y_train and shrinks X_unlabeled
# to 100 rows or fewer; reusing those globals means this loop never executes.
nb_X_train = labeled_df.iloc[:, :-1]
nb_y_train = labeled_df.Class
nb_X_unlabeled = unlabeled_df.drop(['Class'], axis=1)

# Initiate iteration counter
iterations = 0

# Containers to hold the per-iteration test scores and # of pseudo-labels
test_f1s = []
test_accs = []
test_precs = []
test_recs = []
test_spss = []

pseudo_labels = []

# Loop runs until 100 or fewer unlabeled instances remain
while len(nb_X_unlabeled) > 100:
    # Fit classifier and make test predictions
    clf1 = GaussianNB(var_smoothing=0.8111308307896871)
    clf1.fit(nb_X_train, nb_y_train)

    y_hat_test = clf1.predict(xx_test)

    # Calculate, print and store the test scores of this iteration
    acc_test = accuracy_score(yy_test, y_hat_test)
    f1_test = f1_score(yy_test, y_hat_test)
    PrecisionScore_test = precision_score(yy_test , y_hat_test)
    RecallScore_test = recall_score(yy_test , y_hat_test)
    sps_test = specificity_score(yy_test, y_hat_test)

    print(f"Iteration {iterations}")
    print(f"Test Accuracy Score: {round(acc_test,4)}")
    print(f"Test Precision Score: {round(PrecisionScore_test,4)}")
    print(f"Test Recall Score: {round(RecallScore_test,4)}")
    print(f"Test f1 Score: {round(f1_test,4)}")
    print(f"Test Specificity Score: {round(sps_test,4)}")

    test_f1s.append(round(f1_test,4))
    test_accs.append(round(acc_test,4))
    test_precs.append(round(PrecisionScore_test,4))
    test_recs.append(round(RecallScore_test,4))
    test_spss.append(round(sps_test,4))

    print(confusion_matrix(yy_test, y_hat_test))
    print(classification_report(yy_test, y_hat_test))

    # Generate predictions and probabilities for unlabeled data
    print(f"Now predicting labels for unlabeled data...")

    pred_probs = clf1.predict_proba(nb_X_unlabeled)
    df_pred_prob = pd.DataFrame({
        'preds': clf1.predict(nb_X_unlabeled),
        'prob_0': pred_probs[:, 0],
        'prob_1': pred_probs[:, 1],
    }, index=nb_X_unlabeled.index)

    # Up to 50 most confident predictions of each class
    top_k_df0 = (df_pred_prob[df_pred_prob['preds'] == 0]
                 .sort_values(by='prob_0', ascending=False).iloc[:50])
    top_k_df1 = (df_pred_prob[df_pred_prob['preds'] == 1]
                 .sort_values(by='prob_1', ascending=False).iloc[:50])
    high_prob = pd.concat([top_k_df0, top_k_df1], axis=0)

    print(f"{len(high_prob)} high-probability predictions added to training data.")

    pseudo_labels.append(len(high_prob))

    # Add pseudo-labeled data to training data
    nb_X_train = pd.concat([nb_X_train, nb_X_unlabeled.loc[high_prob.index]], axis=0)
    nb_y_train = pd.concat([nb_y_train, high_prob.preds])
    # Drop pseudo-labeled instances from unlabeled data
    nb_X_unlabeled = nb_X_unlabeled.drop(index=high_prob.index)
    print(f"{len(nb_X_unlabeled)} unlabeled instances remaining.\n")

    # Update iteration counter
    iterations += 1

In [None]:
# Top panel: test F1 per self-training round.
# Bottom panel: number of pseudo-labels added in each round.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6,8))
ax1.plot(range(iterations), test_f1s)
ax1.set(ylabel='f1 Score')
ax2.bar(x=range(iterations), height=pseudo_labels)
ax2.set(ylabel='Pseudo-Labels Created', xlabel='# Iterations')

# Row-normalised confusion matrix of the last fitted classifier on the test set.
plot_confusion_matrix(clf1, xx_test, yy_test,
                      display_labels=['No Tumor.', 'Tumor'],
                      normalize='true', cmap='Blues');

3. MLP

In [None]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=1000)

parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
# Tune on the originally labeled rows only. X_train/y_train may already hold
# pseudo-labels appended by an earlier self-training loop.
clf.fit(labeled_df.iloc[:, :-1], labeled_df.Class)

# Best parameter set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
#---------------------------------------------------Top k-most confident predictions------------------------
# Work on this loop's own copies of the original labeled/unlabeled split.
# An earlier self-training loop grows X_train/y_train and shrinks X_unlabeled
# to 100 rows or fewer; reusing those globals means this loop never executes.
mlp_X_train = labeled_df.iloc[:, :-1]
mlp_y_train = labeled_df.Class
mlp_X_unlabeled = unlabeled_df.drop(['Class'], axis=1)

# Initiate iteration counter
iterations = 0

# Containers to hold the per-iteration test scores and # of pseudo-labels
test_f1s = []
test_accs = []
test_precs = []
test_recs = []
test_spss = []

pseudo_labels = []

# Loop runs until 100 or fewer unlabeled instances remain
while len(mlp_X_unlabeled) > 100:
    # Fit classifier and make test predictions
    clf1 = MLPClassifier(activation= 'tanh', alpha= 0.05, hidden_layer_sizes= (100,), learning_rate= 'adaptive', solver= 'sgd', warm_start=True)
    clf1.fit(mlp_X_train, mlp_y_train)

    y_hat_test = clf1.predict(xx_test)

    # Calculate, print and store the test scores of this iteration
    acc_test = accuracy_score(yy_test, y_hat_test)
    f1_test = f1_score(yy_test, y_hat_test)
    PrecisionScore_test = precision_score(yy_test , y_hat_test)
    RecallScore_test = recall_score(yy_test , y_hat_test)
    sps_test = specificity_score(yy_test, y_hat_test)

    print(f"Iteration {iterations}")
    print(f"Test Accuracy Score: {round(acc_test,4)}")
    print(f"Test Precision Score: {round(PrecisionScore_test,4)}")
    print(f"Test Recall Score: {round(RecallScore_test,4)}")
    print(f"Test f1 Score: {round(f1_test,4)}")
    print(f"Test Specificity Score: {round(sps_test,4)}")

    test_f1s.append(round(f1_test,4))
    test_accs.append(round(acc_test,4))
    test_precs.append(round(PrecisionScore_test,4))
    test_recs.append(round(RecallScore_test,4))
    test_spss.append(round(sps_test,4))

    print(confusion_matrix(yy_test, y_hat_test))
    print(classification_report(yy_test, y_hat_test))

    # Generate predictions and probabilities for unlabeled data
    print(f"Now predicting labels for unlabeled data...")

    pred_probs = clf1.predict_proba(mlp_X_unlabeled)
    df_pred_prob = pd.DataFrame({
        'preds': clf1.predict(mlp_X_unlabeled),
        'prob_0': pred_probs[:, 0],
        'prob_1': pred_probs[:, 1],
    }, index=mlp_X_unlabeled.index)

    # Up to 50 most confident predictions of each class
    top_k_df0 = (df_pred_prob[df_pred_prob['preds'] == 0]
                 .sort_values(by='prob_0', ascending=False).iloc[:50])
    top_k_df1 = (df_pred_prob[df_pred_prob['preds'] == 1]
                 .sort_values(by='prob_1', ascending=False).iloc[:50])
    high_prob = pd.concat([top_k_df0, top_k_df1], axis=0)

    print(f"{len(high_prob)} high-probability predictions added to training data.")

    pseudo_labels.append(len(high_prob))

    # Add pseudo-labeled data to training data
    mlp_X_train = pd.concat([mlp_X_train, mlp_X_unlabeled.loc[high_prob.index]], axis=0)
    mlp_y_train = pd.concat([mlp_y_train, high_prob.preds])
    # Drop pseudo-labeled instances from unlabeled data
    mlp_X_unlabeled = mlp_X_unlabeled.drop(index=high_prob.index)
    print(f"{len(mlp_X_unlabeled)} unlabeled instances remaining.\n")

    # Update iteration counter
    iterations += 1

In [None]:
# Top panel: test F1 per self-training round.
# Bottom panel: number of pseudo-labels added in each round.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6,8))
ax1.plot(range(iterations), test_f1s)
ax1.set(ylabel='f1 Score')
ax2.bar(x=range(iterations), height=pseudo_labels)
ax2.set(ylabel='Pseudo-Labels Created', xlabel='# Iterations')

# Row-normalised confusion matrix of the last fitted classifier on the test set.
plot_confusion_matrix(clf1, xx_test, yy_test,
                      display_labels=['No Tumor.', 'Tumor'],
                      normalize='true', cmap='Blues');

In [None]:
# One row per self-training iteration, one column per test metric.
df = pd.DataFrame({
    'Accuracy': test_accs,
    'Precision': test_precs,
    'Recall': test_recs,
    'F1': test_f1s,
})

In [None]:
df.to_csv("20%performance.csv")

4. SVC

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for an RBF-kernel SVC: 5 values of C x 5 values of gamma.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# refit=True refits the best parameter combination once the search finishes;
# verbose=3 logs progress while the search runs.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)

# Run the search on the current training data.
grid.fit(X_train, y_train)

In [None]:
# ------------------------------------------------------------------
# Self-training with an SVC: top-k most confident predictions
# ------------------------------------------------------------------
# Every pass of the loop below:
#   1. refits an SVC on the current training set,
#   2. scores it on the held-out test split (xx_test / yy_test),
#   3. predicts the unlabeled pool and keeps, for each predicted class,
#      the top_k rows with the highest probability for that class,
#   4. moves those rows (with their predicted labels) into the training set.
#
# NOTE(review): X_train, y_train and X_unlabeled are reassigned by this cell,
# so it continues from whatever state earlier cells left behind and is not
# idempotent -- confirm the data split is re-created before this section runs.

top_k = 50            # pseudo-labels taken per predicted class on each pass
min_pool_size = 100   # stop once the unlabeled pool has this many rows or fewer

# Iteration counter
iterations = 0

# Per-iteration test metrics and number of pseudo-labels added
test_f1s = []
test_accs = []
test_precs = []
test_recs = []
test_spss = []

pseudo_labels = []

# Reuse the hyper-parameters selected by the grid search (`grid`) fitted in the
# preceding cell instead of silently falling back to SVC's defaults.
svc_params = grid.best_params_

# Keep going while the unlabeled pool is larger than min_pool_size
while len(X_unlabeled) > min_pool_size:
    # A fresh classifier is fitted on every pass; probability=True is needed
    # for the predict_proba call further down.
    clf1 = SVC(probability=True, **svc_params)
    clf1.fit(X_train, y_train)

    y_hat_test = clf1.predict(xx_test)

    # Test-set metrics for this iteration
    acc_test = accuracy_score(yy_test, y_hat_test)
    f1_test = f1_score(yy_test, y_hat_test)
    PrecisionScore_test = precision_score(yy_test, y_hat_test)
    RecallScore_test = recall_score(yy_test, y_hat_test)
    sps_test = specificity_score(yy_test, y_hat_test)

    print(f"Iteration {iterations}")
    print(f"Test Accuracy Score: {round(acc_test,4)}")
    print(f"Test Precision Score: {round(PrecisionScore_test,4)}")
    print(f"Test Recall Score: {round(RecallScore_test,4)}")
    print(f"Test f1 Score: {round(f1_test,4)}")
    print(f"Test Specificity Score: {round(sps_test,4)}")

    test_f1s.append(round(f1_test, 4))
    test_accs.append(round(acc_test, 4))
    test_precs.append(round(PrecisionScore_test, 4))
    test_recs.append(round(RecallScore_test, 4))
    test_spss.append(round(sps_test, 4))

    print(confusion_matrix(yy_test, y_hat_test))
    print(classification_report(yy_test, y_hat_test))

    # Predicted label and class probabilities for every unlabeled row
    print("Now predicting labels for unlabeled data...")

    pred_probs = clf1.predict_proba(X_unlabeled)
    preds = clf1.predict(X_unlabeled)

    # Only predictions and probabilities are needed for the selection, so the
    # feature columns are not copied alongside them.
    # assumes the labels are 0/1, so column k of predict_proba is class k -- TODO confirm
    df_pred_prob = pd.DataFrame(
        {'preds': preds, 'prob_0': pred_probs[:, 0], 'prob_1': pred_probs[:, 1]},
        index=X_unlabeled.index,
    )

    # Most confident rows among those predicted as class 0 ...
    c0_df = df_pred_prob[df_pred_prob['preds'] == 0]
    top_k_df0 = c0_df.sort_values(by='prob_0', ascending=False).iloc[:top_k]

    # ... and among those predicted as class 1
    c1_df = df_pred_prob[df_pred_prob['preds'] == 1]
    top_k_df1 = c1_df.sort_values(by='prob_1', ascending=False).iloc[:top_k]

    high_prob = pd.concat([top_k_df0, top_k_df1], axis=0)

    print(f"{len(high_prob)} high-probability predictions added to training data.")

    pseudo_labels.append(len(high_prob))

    # Add pseudo-labeled data to training data
    X_train = pd.concat([X_train, X_unlabeled.loc[high_prob.index]], axis=0)
    y_train = pd.concat([y_train, high_prob.preds])
    # Drop pseudo-labeled instances from unlabeled data
    X_unlabeled = X_unlabeled.drop(index=high_prob.index)
    print(f"{len(X_unlabeled)} unlabeled instances remaining.\n")

    # Update iteration counter
    iterations += 1

In [None]:
# Two stacked panels sharing the iteration index as x-axis:
#   top    - test F1 recorded at each self-training iteration
#   bottom - number of pseudo-labels added at each iteration
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6, 8))
iteration_axis = range(iterations)

ax1.plot(iteration_axis, test_f1s)
ax1.set_ylabel('f1 Score')

ax2.bar(iteration_axis, pseudo_labels)
ax2.set(ylabel='Pseudo-Labels Created', xlabel='# Iterations')

# Confusion matrix of clf1 (as currently bound) on the test split,
# normalised over the true labels.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the pinned version, or migrate to
# ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(
    clf1, xx_test, yy_test,
    display_labels=['No Tumor.', 'Tumor'],
    normalize='true',
    cmap='Blues',
);

5. Soft-voting ensemble of Logistic Regression (LR), Gaussian Naive Bayes (GNB), and MLP

In [None]:
# ------------------------------------------------------------------
# Self-training with a soft-voting ensemble (LR + GNB + MLP):
# top-k most confident predictions
# ------------------------------------------------------------------
# Every pass of the loop below:
#   1. refits the voting ensemble on the current training set,
#   2. records F1 on the training set and on the test split (xx_test / yy_test),
#   3. predicts the unlabeled pool and keeps, for each predicted class,
#      the top_k rows with the highest probability for that class,
#   4. moves those rows (with their predicted labels) into the training set.
#
# NOTE(review): X_train, y_train and X_unlabeled are reassigned by this cell,
# so it continues from whatever state earlier cells left behind and is not
# idempotent -- confirm the data split is re-created before this section runs.

top_k = 50           # pseudo-labels taken per predicted class on each pass
min_pool_size = 50   # stop once the unlabeled pool has this many rows or fewer

# Iteration counter
iterations = 0

# Per-iteration F1 scores and number of pseudo-labels added
train_f1s = []
test_f1s = []
pseudo_labels = []

# Keep going while the unlabeled pool is larger than min_pool_size
while len(X_unlabeled) > min_pool_size:
    # New estimators are built on every pass, so each fit starts from scratch.
    clf1 = LogisticRegression()
    clf2 = GaussianNB()
    clf3 = MLPClassifier()
    eclf1 = VotingClassifier(
        estimators=[('lr', clf1), ('gnb', clf2), ('mlp', clf3)],
        voting='soft',
    )
    eclf1 = eclf1.fit(X_train, y_train)

    y_hat_train_1 = eclf1.predict(X_train)
    y_hat_test_1 = eclf1.predict(xx_test)

    # F1 on the (pseudo-label-augmented) training set and on the test split
    train_f1_1 = f1_score(y_train, y_hat_train_1)
    test_f1_1 = f1_score(yy_test, y_hat_test_1)

    print(f"Iteration {iterations}")
    print(f"Test f1: {test_f1_1}")
    train_f1s.append(train_f1_1)
    test_f1s.append(test_f1_1)
    print(confusion_matrix(yy_test, y_hat_test_1))
    print(classification_report(yy_test, y_hat_test_1))

    # Predicted label and class probabilities for every unlabeled row
    print("Now predicting labels for unlabeled data...")

    pred_probs = eclf1.predict_proba(X_unlabeled)
    preds = eclf1.predict(X_unlabeled)

    # Only predictions and probabilities are needed for the selection, so the
    # feature columns are not copied alongside them.
    # assumes the labels are 0/1, so column k of predict_proba is class k -- TODO confirm
    df_pred_prob = pd.DataFrame(
        {'preds': preds, 'prob_0': pred_probs[:, 0], 'prob_1': pred_probs[:, 1]},
        index=X_unlabeled.index,
    )

    # Most confident rows among those predicted as class 0 ...
    c0_df = df_pred_prob[df_pred_prob['preds'] == 0]
    top_k_df0 = c0_df.sort_values(by='prob_0', ascending=False).iloc[:top_k]

    # ... and among those predicted as class 1
    c1_df = df_pred_prob[df_pred_prob['preds'] == 1]
    top_k_df1 = c1_df.sort_values(by='prob_1', ascending=False).iloc[:top_k]

    high_prob = pd.concat([top_k_df0, top_k_df1], axis=0)

    print(f"{len(high_prob)} high-probability predictions added to training data.")

    pseudo_labels.append(len(high_prob))

    # Add pseudo-labeled data to training data
    X_train = pd.concat([X_train, X_unlabeled.loc[high_prob.index]], axis=0)
    y_train = pd.concat([y_train, high_prob.preds])
    # Drop pseudo-labeled instances from unlabeled data
    X_unlabeled = X_unlabeled.drop(index=high_prob.index)
    print(f"{len(X_unlabeled)} unlabeled instances remaining.\n")

    # Update iteration counter
    iterations += 1

In [None]:
# Number of rows currently in the training set (displayed as the cell output).
X_train.shape[0]

In [None]:
# Two stacked panels sharing the iteration index as x-axis:
#   top    - test F1 recorded at each self-training iteration
#   bottom - number of pseudo-labels added at each iteration
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6, 8))
iteration_axis = range(iterations)

ax1.plot(iteration_axis, test_f1s)
ax1.set_ylabel('f1 Score')

ax2.bar(iteration_axis, pseudo_labels)
ax2.set(ylabel='Pseudo-Labels Created', xlabel='# Iterations')

# Confusion matrix of the voting ensemble eclf1 (as currently bound) on the
# test split, normalised over the true labels.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the pinned version, or migrate to
# ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(
    eclf1, xx_test, yy_test,
    display_labels=['No Tumor.', 'Tumor'],
    normalize='true',
    cmap='Blues',
);