In [None]:
import os
import shutil

import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
import tensorflow_datasets as tfds
import tabnet

In [None]:
# Reject-aware scoring: reject rate over all samples, accuracy over the accepted ones
def eval_wisdom_accuracy(actual, predicted, reject_label=2):
    """Score predictions from a classifier that may answer 'Reject'.

    Parameters
    ----------
    actual : array-like
        True class labels.
    predicted : array-like
        Predicted class labels, same length as ``actual``. Entries equal to
        ``reject_label`` are treated as rejections.
    reject_label : int, optional
        Label value that marks a rejected sample (default 2).

    Returns
    -------
    (reject_rate, accuracy) : tuple of float
        ``reject_rate`` is rejected / total. ``accuracy`` is
        correct / (total - rejected), i.e. it is measured only on samples the
        model did not reject. Either value is ``nan`` when its denominator is
        zero (empty input, or every sample rejected).
    """
    # asarray lets plain lists work; with a list, `predicted == label` would be a scalar False
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    all_cnt = len(actual)
    rejected = predicted == reject_label
    reject_cnt = int(np.count_nonzero(rejected))
    accepted_cnt = all_cnt - reject_cnt
    # A rejected sample is never counted as correct, even if the true label
    # happens to equal reject_label; otherwise it would be counted twice.
    correct_cnt = int(np.count_nonzero(np.equal(actual, predicted) & ~rejected))

    nan = float('nan')
    reject_rate = reject_cnt / all_cnt if all_cnt else nan
    accuracy = correct_cnt / accepted_cnt if accepted_cnt else nan
    return reject_rate, accuracy

In [None]:
# Read the expression table and drop its SAMPLE_BARCODE column in one step
df1 = read_csv('rna_seq.csv').drop(columns=['SAMPLE_BARCODE'])
# Separate table; its DISEASE column is used later to stratify the train/test split
strat = read_csv('strat.csv')

In [None]:
# Keep only the features that have a non-zero weight in the coefficient table
selected = pd.read_table('ras_coef.tsv')
nonzero_features = set(selected.loc[selected['weight'] != 0, 'feature'])
# Walk df1's columns (not the set) so the feature order is identical on every run:
# iteration order of a set of strings changes between Python processes, which
# would silently reorder the model inputs after a kernel restart.
# 'total_status' is skipped here so that it is always appended as the LAST column.
feature_cols = [col for col in df1.columns
                if col in nonzero_features and col != 'total_status']
# .copy() makes df an independent frame, so adding a column is not an assignment on a slice of df1
df = df1[feature_cols].copy()
df['total_status'] = df1['total_status']

In [None]:
# Every column except the last is an input feature; the last column is the label
table_values = df.values
X = table_values[:, :-1].astype('float32')
# Map the raw labels to consecutive integer codes
y = LabelEncoder().fit_transform(table_values[:, -1])
# Hold out 10% for testing; the split is seeded and stratified on strat['DISEASE']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
    stratify=strat['DISEASE'],
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Number of input features, used as the network's input width
n_features = X_train.shape[1]

In [None]:
# ---- Base classifier ----
# Fix the random seed before building the network so the run is repeatable
tf.keras.utils.set_random_seed(1)
model = Sequential()
model.add(Dense(20, activation='relu', kernel_initializer='he_normal', input_shape=(n_features,)))
model.add(Dense(7, activation='relu', kernel_initializer='he_normal'))
# Three output nodes: one extra node is added up front so that class 2 can
# later serve as the 'Reject' class during WisdomNet training.
model.add(Dense(3, activation='softmax'))
# compile the model
model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# fit the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Check how the base model performs on training set
loss, accu = model.evaluate(X_train, y_train, verbose=0)
pred1 = np.argmax(model.predict(X_train),axis=1)
# confusion_matrix expects (y_true, y_pred): rows are true labels, columns are predictions
cf_matrix1 = confusion_matrix(y_train, pred1)
print(cf_matrix1)
print("train accuracy",accu)

# Check how the base model performs on test set
loss1, accu1 = model.evaluate(X_test, y_test, verbose=0)
pred = np.argmax(model.predict(X_test),axis=1)
cf_matrix = confusion_matrix(y_test, pred)
print(cf_matrix)
print("test accuracy",accu1)

########
# Find out misclassified samples; WisdomNet is trained based on misclassified samples
# A single forward pass over the training set is reused for everything in this section.
pred = np.argmax(model.predict(X_train),axis=1)
train_hit = np.equal(pred,y_train)  # True where the base model is right
X_train_miss=X_train[~train_hit]
# Find out correctly classified samples
X_train_correct=X_train[train_hit]
y_train_correct=y_train[train_hit]
# Relabel misclassified samples as 'Reject', which is class label 2 in this case
y_train_miss=2*np.ones(len(X_train_miss))
print(y_train_miss.shape)
# Check how the base model performs on training set
predict_train_w = pred
reject_train, correct_train_acc = eval_wisdom_accuracy(y_train,predict_train_w)
print('--- Base Training Reject Rate: %.3f Correct Accuracy: %.3f' % (reject_train, correct_train_acc))

# Check how the base model performs on test set
predict_test_w = np.argmax(model.predict(X_test),axis=1)
reject_test, correct_test_acc = eval_wisdom_accuracy(y_test,predict_test_w)
print('--- Base Test Reject Rate: %.3f Correct Accuracy: %.3f' % (reject_test, correct_test_acc))
print('Train accuracy rate: %.3f' % (accu))
########

# WisdomNet: only trains the model on misclassified samples until reaches 100% accuracy
tf.keras.utils.set_random_seed(1)
acc=0.0
test_acc=0.1
old_test_acc=0.0
i=0
# Hard upper bound on iterations: the stopping rule below never ends the loop
# while the training accuracy over accepted samples stays below 1.0, so without
# a cap a Run-All could hang forever.
MAX_WISDOM_ITERS=1000
# It is undefined when to stop training of WisdomNet. The rule used here reduces to:
#   - keep training while the training accuracy over accepted samples is not exactly 1.0;
#   - once it is 1.0, keep training only while the test accuracy is not 1.0 AND
#     (the test accuracy improved in the last iteration OR fewer than 50 iterations have run).
# Using the test accuracy as a stopping signal could be controversial.
# A further condition, no error on the misclassified samples (acc!=1.0), is temporarily removed.
if len(X_train_miss)==0:
    # Nothing to relabel as 'Reject'; fitting on an empty array would fail
    print('No misclassified training samples; WisdomNet training skipped')
while (len(X_train_miss)>0 and i<MAX_WISDOM_ITERS
       and (correct_train_acc!=1.0 or test_acc>old_test_acc or i<50)
       and (correct_train_acc!=1.0 or test_acc!=1.0)):# Ideally the model is trained further, otherwise it may misclassify unseen test data
    # fit the model; I used 5 epochs but this could change
    #model.train_on_batch( X_train_miss,y_train_miss)
    model.fit(X_train_miss, y_train_miss, epochs=5, batch_size=16,verbose=0)
    # evaluate the model on misclassified samples
    loss, acc = model.evaluate(X_train_miss, y_train_miss, verbose=0)
    print('Train (initally missed obs.) accuracy rate: %.3f' % (acc))

    # The following part is not part of original WisdomNet, but it checks whether correctly classified data is misclassified or not
    #predict_train_w = np.argmax(model.predict(X_train_correct), axis=1)
    #reject_correct, train_correct_acc = eval_wisdom_accuracy(y_train_correct, predict_train_w)
    #while (train_correct_acc!=1.0):
    #   # Find out new misclassified samples out of previously correct samples
    #   pred_inner = np.argmax(model.predict(X_train_correct), axis=1)
    #   mask=np.logical_and(~np.equal(pred_inner, y_train_correct), pred_inner!=2)
    #   #mask=(~np.equal(pred_inner, y_train_correct) and (pred_inner!=2))
    #   X_train_miss_inner = X_train_correct[mask]
    #   y_train_miss_inner = y_train_correct[mask]#

    #   model.fit(X_train_miss_inner, y_train_miss_inner, epochs=5, batch_size=32, verbose=0)
    #   predict_train_w = np.argmax(model.predict(X_train_miss_inner), axis=1)
    #   reject_inner, train_correct_acc = eval_wisdom_accuracy(y_train_miss_inner, predict_train_w)
    #   if (train_correct_acc == 1.0):
    #      loss, acc = model.evaluate(X_train_miss, y_train_miss, verbose=0)

    # Re-score the full training and test splits; these values drive the loop condition
    predict_train_w = np.argmax(model.predict(X_train), axis=1)
    reject_train, correct_train_acc = eval_wisdom_accuracy(y_train, predict_train_w)

    predict_test_w = np.argmax(model.predict(X_test), axis=1)
    # Remember the previous test accuracy so the condition can tell whether it improved
    old_test_acc = test_acc
    reject_test, test_acc = eval_wisdom_accuracy(y_test, predict_test_w)
    print('[%d] WisdomNet Training: (Train Reject Rate: %.3f; Train Correct Accuracy: %.3f) (Test Reject Rate: %.3f; Test Correct Accuracy: %.3f)' % (i, reject_train, correct_train_acc, reject_test, test_acc))
    i=i+1

if i>=MAX_WISDOM_ITERS:
    print('WisdomNet training stopped at the iteration cap (%d)' % MAX_WISDOM_ITERS)


# Final check of the trained model on the training split
predict_train_w = np.argmax(model.predict(X_train),axis=1)
reject_train, correct_train_acc = eval_wisdom_accuracy(y_train,predict_train_w)
print('Train Reject Rate: %.3f Correct Accuracy: %.3f' % (reject_train, correct_train_acc))

# Final check on the test split; if the accuracy is not 1.0, the while loop should be executed more.
# The metrics are recomputed here from the current model: correct_test_acc is otherwise
# only assigned before WisdomNet training, so printing it unchanged would report the
# base model's accuracy instead of the final one.
predict_test_w = np.argmax(model.predict(X_test),axis=1)
reject_test, correct_test_acc = eval_wisdom_accuracy(y_test,predict_test_w)
print('Test Reject Rate: %.3f Correct Accuracy: %.3f' % (reject_test, correct_test_acc))


### IMPORTANT NOTE: TEST AND TRAIN ACCURACY SHOULD BE CLOSE TO EACH OTHER


In [None]:
# Score the trained model on a fresh random split of the same table.
# NOTE(review): `model` was fitted on rows taken from this same `df`, so neither
# partition below is truly unseen data - confirm this is the intended check.
X1, y1 = df.values[:, :-1], df.values[:, -1]
X1 = X1.astype('float32')
# Encode the labels extracted in this cell, so the cell does not depend on `y` from an earlier cell
y1 = LabelEncoder().fit_transform(y1)
#### THE TEST SIZE PARAMETER CAN BE CHANGED TO SEE HOW MODEL PERFORMS ON AN UNSEEN DATASET
# random_state pins the shuffle so the printed numbers repeat on every run
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.01, random_state=0)
predict_train_w = np.argmax(model.predict(X1_train),axis=1)
reject_train, correct_train_acc = eval_wisdom_accuracy(y1_train,predict_train_w)
print('Train Reject Rate: %.3f Correct Accuracy: %.3f' % (reject_train, correct_train_acc))

# Check how the model behaves on test data; if correct_test_accuracy is not 1.0, the while loop should be executed more.
predict_test_w = np.argmax(model.predict(X1_test),axis=1)
reject_test, correct_test_acc = eval_wisdom_accuracy(y1_test,predict_test_w)
print('Test Reject Rate: %.3f Correct Accuracy: %.3f' % (reject_test, correct_test_acc))
# confusion_matrix expects (y_true, y_pred): rows are true labels, columns are predictions
cf_matrix1 = confusion_matrix(y1_train, predict_train_w)
print(cf_matrix1)
cf_matrix = confusion_matrix(y1_test, predict_test_w)
print(cf_matrix)

In [None]:
## FINAL LOOK ON THE FULL DATASET
# Scores the trained model on every row of `df` (training and held-out rows together)
X2, y2 = df.values[:, :-1], df.values[:, -1]
X2 = X2.astype('float32')
# Encode the labels extracted in this cell, so the cell does not depend on `y` from an earlier cell
y2 = LabelEncoder().fit_transform(y2)
predict_train_w = np.argmax(model.predict(X2),axis=1)
reject_train, correct_train_acc = eval_wisdom_accuracy(y2,predict_train_w)
# The label says 'Train', but the numbers cover the whole dataset
print('Train Reject Rate: %.3f Correct Accuracy: %.3f' % (reject_train, correct_train_acc))
# confusion_matrix expects (y_true, y_pred): rows are true labels, columns are predictions
cf_matrix = confusion_matrix(y2, predict_train_w)
print(cf_matrix)