In [1]:
% pylab inline
from numpy import linalg as LA
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import glob
from tqdm import tqdm_notebook
import os
import sklearn.preprocessing as prep
import pickle
import joblib
import tensorflow as tf

def min_max_scale(X):
    """Return ``X`` transformed by a ``MinMaxScaler`` fitted on ``X`` itself.

    The scaler uses its default settings and is discarded afterwards; only
    the transformed data is returned.
    """
    scaler = prep.MinMaxScaler()
    scaler.fit(X)
    return scaler.transform(X)

Populating the interactive namespace from numpy and matplotlib


In [None]:
# Load the feature matrix and its labels from joblib pickle files.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files from a trusted source.
# NOTE(review): file names suggest standardized tumor/normal data; confirm the layout
# (rows vs. patches, label encoding) against the script that wrote these files.
master_matrix = joblib.load('../scripts/tumor_and_normal_200000_standardized_X.joblib.pickle')
y = joblib.load('../scripts/tumor_and_normal_200000_standardized_y.joblib.pickle')


In [None]:
# Display the dimensions of the loaded feature matrix.
master_matrix.shape

In [None]:
# Display the dimensions of the loaded labels.
y.shape

In [None]:
# Dimensions of one image patch.
WIDTH = 256
HEIGHT = 256
DEPTH = 3


def standardize_image(f):
    """Read an image file and return it as flattened rows scaled to [0, 1].

    Parameters
    ----------
    f : str or file-like
        Anything accepted by ``imread``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(-1, WIDTH * HEIGHT * DEPTH)``; a single
        ``HEIGHT x WIDTH x DEPTH`` image yields exactly one row.

    Raises
    ------
    ValueError
        If the number of pixels values is not a multiple of
        ``WIDTH * HEIGHT * DEPTH`` (raised by ``reshape``).
    """
    pixels = imread(f)
    # matplotlib's imread returns PNG data as floats that are already in
    # [0, 1]; only integer-typed pixel data (0..255) needs to be divided,
    # otherwise PNG patches would be scaled twice.
    if np.issubdtype(pixels.dtype, np.integer):
        pixels = pixels / 255.0
    return pixels.reshape(-1, WIDTH * HEIGHT * DEPTH)

In [None]:
# Session configuration; VAE.__init__ passes this module-level object to
# tf.InteractiveSession.  The device count for 'GPU' is set to 0.
config = tf.ConfigProto(
    device_count = {'GPU': 0}
)
config.gpu_options.allocator_type = 'BFC'
#config

# Dimensions of one image patch; their product is used below as the
# flattened input size of the VAE.
IMAGE_WIDTH = 256
IMAGE_HEIGHT = 256
IMAGE_CHANNELS = 3


class VAE(object):
    """Fully connected variational autoencoder built with the TF1 graph API.

    The encoder is three 512-unit ELU layers followed by a linear layer with
    ``2 * n_latent`` outputs, split into a mean and (after softplus) a
    standard deviation.  The decoder is three 512-unit ELU layers followed by
    a sigmoid layer with ``input_dim`` outputs.

    Constructing an instance adds the network to the current default graph,
    opens a ``tf.InteractiveSession`` using the module-level ``config`` and
    initializes all variables.

    Parameters
    ----------
    input_dim : int
        Length of one flattened input vector.
    learning_rate : float
        Learning rate handed to the Adam optimizer.
    n_latent : int
        Dimensionality of the latent code ``z``.
    batch_size : int
        Used to scale the loss terms; the input placeholder itself accepts
        any number of rows.
    """

    def __init__(self,
                 input_dim,
                 learning_rate=0.001,
                 n_latent=100,
                 batch_size=50):
        self.learning_rate = learning_rate
        self.n_latent = n_latent
        self.batch_size = batch_size
        self.input_dim = input_dim

        self._build_network()
        self._create_loss_optimizer()

        # Built after the optimizer so the initializer also covers the
        # variables the optimizer created.
        init = tf.global_variables_initializer()
        # Launch the session
        self.session = tf.InteractiveSession(config=config)
        self.session.run(init)
        # tf.global_variables() is the non-deprecated spelling of
        # tf.all_variables() and returns the same collection.
        self.saver = tf.train.Saver(tf.global_variables())

    def _build_network(self):
        """Create the input placeholder, encoder, sampling step and decoder."""
        self.x = tf.placeholder(tf.float32, [None, self.input_dim])

        # Encoder.  The order in which layers are created determines their
        # variable names, so it must not change if checkpoints are reused.
        dense1 = tf.layers.dense(
            activation=tf.nn.elu, inputs=self.x, units=512)
        dense2 = tf.layers.dense(
            activation=tf.nn.elu, inputs=dense1, units=512)
        dense3 = tf.layers.dense(
            activation=tf.nn.elu, inputs=dense2, units=512)
        dense4 = tf.layers.dense(
            activation=None, inputs=dense3, units=self.n_latent * 2)

        # First half of the linear output is the mean, second half becomes a
        # strictly positive standard deviation through softplus.
        self.mu = dense4[:, :self.n_latent]
        self.sigma = tf.nn.softplus(dense4[:, self.n_latent:])

        # Reparameterization: z = mu + sigma * eps with eps ~ N(0, 1).
        eps = tf.random_normal(
            shape=tf.shape(self.sigma), mean=0, stddev=1, dtype=tf.float32)
        self.z = self.mu + self.sigma * eps

        # Decoder.
        ddense1 = tf.layers.dense(
            activation=tf.nn.elu, inputs=self.z, units=512)
        ddense2 = tf.layers.dense(
            activation=tf.nn.elu, inputs=ddense1, units=512)
        ddense3 = tf.layers.dense(
            activation=tf.nn.elu, inputs=ddense2, units=512)

        self.reconstructed = tf.layers.dense(
            activation=tf.nn.sigmoid, inputs=ddense3, units=self.input_dim)

    def _create_loss_optimizer(self):
        """Define the reconstruction + KL objective and the Adam training op."""
        epsilon = 1e-10  # keeps the logarithms finite

        # Per-sample cross-entropy between input and sigmoid output, summed
        # over the input dimensions.
        reconstruction_loss = -tf.reduce_sum(
            self.x * tf.log(epsilon + self.reconstructed) +
            (1 - self.x) * tf.log(epsilon + 1 - self.reconstructed),
            axis=1)

        # NOTE(review): the batch mean is divided by batch_size once more.
        # Both loss terms are scaled the same way, so this only rescales the
        # objective; kept unchanged.
        self.reconstruction_loss = tf.reduce_mean(
            reconstruction_loss) / self.batch_size

        # KL(N(mu, sigma^2) || N(0, 1))
        #   = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2).
        # self.sigma is a standard deviation (z = mu + sigma * eps), so the
        # logarithm must be taken of sigma squared, not of sigma.
        latent_loss = -0.5 * tf.reduce_sum(
            1 + tf.log(epsilon + tf.square(self.sigma)) -
            tf.square(self.mu) - tf.square(self.sigma),
            axis=1)
        latent_loss = tf.reduce_mean(latent_loss) / self.batch_size
        self.latent_loss = latent_loss
        self.cost = tf.reduce_mean(self.reconstruction_loss + self.latent_loss)
        # ADAM optimizer
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate).minimize(self.cost)

    def fit_minibatch(self, batch):
        """Run one optimizer step on ``batch``.

        Returns
        -------
        tuple
            ``(cost, reconstruction_loss, latent_loss)`` evaluated in the
            same ``session.run`` call as the update.
        """
        _, cost, reconstruction_loss, latent_loss = self.session.run(
            [
                self.optimizer, self.cost, self.reconstruction_loss,
                self.latent_loss
            ],
            feed_dict={self.x: batch})
        return cost, reconstruction_loss, latent_loss

    def reconstruct(self, x):
        """Encode and decode ``x``; returns a one-element list with the output."""
        return self.session.run([self.reconstructed], feed_dict={self.x: x})

    def decoder(self, z):
        """Decode latent codes ``z`` by feeding them in place of the sampled
        ``self.z``; returns a one-element list with the output."""
        return self.session.run([self.reconstructed], feed_dict={self.z: z})

    def encoder(self, x):
        """Return latent codes for ``x`` as a one-element list.

        The codes are the sampled ``z`` (mean plus scaled random noise), so
        repeated calls on the same input give different values.
        """
        return self.session.run([self.z], feed_dict={self.x: x})

    def save_model(self, checkpoint_path, epoch):
        """Write a checkpoint to ``checkpoint_path`` with ``epoch`` as global step."""
        self.saver.save(self.session, checkpoint_path, global_step=epoch)

    def load_model(self, checkpoint_dir):
        """Restore variables from the latest checkpoint in ``checkpoint_dir``.

        Raises
        ------
        FileNotFoundError
            If ``checkpoint_dir`` holds no checkpoint state.
        """
        ckpt = tf.train.get_checkpoint_state(
            checkpoint_dir=checkpoint_dir, latest_filename='checkpoint')
        # get_checkpoint_state returns None when no checkpoint state exists;
        # fail with a clear message instead of an AttributeError.
        if ckpt is None or not ckpt.model_checkpoint_path:
            raise FileNotFoundError(
                'no checkpoint found in {}'.format(checkpoint_dir))
        print('loading model: {}'.format(ckpt.model_checkpoint_path))
        self.saver.restore(self.session, ckpt.model_checkpoint_path)
        
    

In [None]:
# Hyperparameters handed to the VAE constructor below.
# NOTE(review): presumably these must match the values used when the
# checkpoint was written -- confirm.  num_epoch is not used in the cells
# visible here.
learning_rate=1e-4
batch_size=32
num_epoch=1000
n_latent=100

# NOTE(review): absolute, user-specific path; consider making it configurable.
checkpoint_dir = '/Z/personal-folders/interns/saket/vae_checkpoint_histoapath_2000_nlatent100'
os.makedirs(checkpoint_dir, exist_ok=True)
# Flattened length of one image patch.
input_dim = IMAGE_CHANNELS*IMAGE_WIDTH*IMAGE_HEIGHT
# Start from an empty default graph before the VAE adds its ops to it.
tf.reset_default_graph()
#input_dims = input_dim[1]
model = VAE(input_dim=input_dim,
            learning_rate=learning_rate,
            n_latent=n_latent,
            batch_size=batch_size)
# Restore the weights from the latest checkpoint recorded in checkpoint_dir.
model.load_model(checkpoint_dir)

In [None]:
# Test the trained model: generation
# (pylab is already loaded by the first cell, so the magic is not repeated.)
# Sample noise vectors from N(0, 1)
z = np.random.normal(size=[model.batch_size, model.n_latent])
x_generated = model.decoder(z)[0]

# Tile the largest n x n square of generated patches that one batch can fill.
h, w = IMAGE_HEIGHT, IMAGE_WIDTH
n = int(np.sqrt(model.batch_size))
I_generated = np.empty((h * n, w * n, IMAGE_CHANNELS))

for i in range(n):
    for j in range(n):
        # Each target slice is (h, w, channels), so the flat vector is
        # reshaped in that same order.
        I_generated[i*h:(i+1)*h, j*w:(j+1)*w, :] = \
            x_generated[i*n+j, :].reshape(h, w, IMAGE_CHANNELS)

plt.figure(figsize=(8, 8))
plt.imshow(I_generated);


In [None]:
# reshape() without a target shape raises TypeError; view the first sample
# as one image patch and show the resulting dimensions.
master_matrix[0].reshape(IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS).shape

In [None]:

# Flatten every sample to one row, then run the rows through the encoder and
# through the full autoencoder (two separate forward passes).
x_sample = np.reshape(master_matrix, (-1, 256*256*3))
x_encoded = model.encoder(x_sample)
x_reconstruct = model.reconstruct(x_sample)

# First seven inputs (left column) next to their reconstructions (right column).
plt.figure(figsize=(8, 12))
for i in range(7):
    panels = (("Test input", x_sample[i]),
              ("Reconstruction", x_reconstruct[0][i]))
    for col, (caption, pixels) in enumerate(panels, start=1):
        plt.subplot(7, 2, 2*i + col)
        plt.imshow(pixels.reshape(256, 256, 3))
        plt.title(caption)
plt.tight_layout()

In [None]:
# model.reconstruct returns a one-element list; show the shape of its array.
x_reconstruct[0].shape

In [None]:
# model.encoder returns a one-element list; show the shape of the latent codes.
x_encoded[0].shape

# Train a TPOT classifier on these reduced dimensions!

In [None]:
# Directories holding the held-out tumor and normal PNG patches.
# NOTE(review): absolute, user-specific paths; consider making them configurable.
# glob returns files in arbitrary order, so the lists are sorted to make
# "the first N test patches" reproducible between runs.
test_tumor_patches_dir = '/Z/personal-folders/interns/saket/histopath_data/baidu_images/test_tumor_level0/level_0/'
list_of_tumor_files = sorted(glob.glob(os.path.join(test_tumor_patches_dir, '*.png')))

test_normal_patches_dir = '/Z/personal-folders/interns/saket/histopath_data/baidu_images/test_normal_level0/level_0/'
list_of_normal_files = sorted(glob.glob(os.path.join(test_normal_patches_dir, '*.png')))



In [None]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

# Split the latent codes 75/25.  The split is seeded with the same value as
# TPOT so that the whole cell is reproducible, not just the pipeline search.
X_train, X_valid, y_train, y_valid = train_test_split(x_encoded[0], y,
                                                    train_size=0.75, test_size=0.25,
                                                    random_state=42)

# Search for a classification pipeline on the training part, report its score
# on the validation part and export the best pipeline as a Python script.
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_valid, y_valid))
pipeline_optimizer.export('tpot_exported_pipeline_autoencoder_nlatent100.py')

In [None]:
def _read_flat_patches(paths):
    """Read every image in ``paths`` and reshape it to rows of length 256*256*3."""
    return [imread(path).reshape(-1, 256 * 256 * 3)
            for path in tqdm_notebook(paths)]


# Tumor patches come first (label 1), normal patches second (label 0).
tumor_rows = _read_flat_patches(list_of_tumor_files)
normal_rows = _read_flat_patches(list_of_normal_files)

X_test_matrix = tumor_rows + normal_rows
y_test = [1] * len(tumor_rows) + [0] * len(normal_rows)



In [None]:
# Show the first loaded test patch as an image.
plt.imshow(X_test_matrix[0].reshape( 256 , 256 , 3))

In [None]:
# Show one training sample as an image.
# NOTE(review): `i` is whatever value an earlier loop left behind -- consider an explicit index.
plt.imshow(x_sample[i].reshape(256, 256, 3))

In [None]:
# Display the raw values of one training sample.
# NOTE(review): relies on the leftover loop variable `i`.
x_sample[i]

In [None]:
# Display the raw values of the first test patch.
X_test_matrix[0]

In [None]:
# Turn the per-file rows and labels into arrays.
X_test_matrix = np.array(X_test_matrix)
y_test = np.array(y_test)

# One flattened row per test image.
x_test_input = X_test_matrix.reshape(-1, 256*256*3)

# model.encoder returns a one-element list; keep the latent codes.
test_codes = model.encoder(x_test_input)
x_test_encoded = test_codes[0]

# Score of the fitted TPOT pipeline on the held-out test patches.
print(pipeline_optimizer.score(x_test_encoded, y_test))

In [None]:
# Reconstruct the test patches; model.reconstruct returns a one-element list.
x_test_reconstructed = model.reconstruct(x_test_input)[0]


In [None]:
# First ten test patches (left column) next to their reconstructions (right column).
plt.figure(figsize=(10, 12))
for i in range(10):
    panels = (("Test input", x_test_input[i]),
              ("Reconstruction", x_test_reconstructed[i]))
    for col, (caption, pixels) in enumerate(panels, start=1):
        plt.subplot(10, 2, 2*i + col)
        plt.imshow(pixels.reshape(256, 256, 3))
        plt.title(caption)
plt.tight_layout()

# LightGBM

In [None]:
import json
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


In [None]:
# Display the dimensions of the training split.
X_train.shape

In [None]:
# Wrap the train/validation splits for LightGBM; the validation set is tied
# to the training set through `reference`.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

# specify your configurations as a dict
# NOTE(review): the objective is 'regression' although the predictions are
# later thresholded at 0.5 -- if `y` holds binary labels, 'binary' may be the
# intended objective; confirm.
# NOTE(review): 'metric' is a set literal, so the metric order is not fixed.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Start training...')
# train
# At most 20 boosting rounds, evaluated on the validation set with an
# early-stopping patience of 5 rounds.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

In [None]:
# Display the dimensions of the encoded test patches.
x_test_encoded.shape

In [None]:
# Display the dimensions of the flattened test patches.
x_test_input.shape

In [None]:
# The booster was trained on latent codes (X_train is a split of x_encoded[0]),
# so it must be given the encoded test patches, not the raw pixel rows in
# x_test_input, whose feature count differs.
y_pred_test_gbm = gbm.predict(x_test_encoded, num_iteration=gbm.best_iteration)


In [None]:
# Root of the mean squared error between the test labels and the raw booster predictions.
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred_test_gbm) ** 0.5)


In [None]:
# Binarize the raw predictions: strictly above 0.5 becomes 1, everything else 0.
y_pred_test_gbm_bin = [int(score > 0.5) for score in y_pred_test_gbm]

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# accuracy_score's signature is (y_true, y_pred); pass the ground truth first,
# matching the argument order used for mean_squared_error above.
accuracy_score(y_test, y_pred_test_gbm_bin)

In [None]:
# Display the feature importance values reported by the trained booster.
gbm.feature_importance()

In [None]:
features = pd.