# Combine features

At this stage we are going to combine the behavioural, eeg and mri data.

We are then gonna train a linear model with the purpose to both reconstruct the original data based on a latent space but also based on the predictions for the multi-label problem

In [0]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 5, 10
rcParams['font.size'] = 12

plt.style.use('ggplot')

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


## Combine features

### Load behavioural data and preprocess dataset

In [216]:
base_dir = '/gdrive/My Drive/Colab Notebooks/DSLab/data'

behaviour_data = pd.read_csv(os.path.join(base_dir, 'HBNFinalSummaries.csv'), low_memory=False)

initial_size = behaviour_data.shape[0]
behaviour_data = behaviour_data[behaviour_data['NoDX'].isin(['Yes', 'No'])]
new_size = behaviour_data.shape[0]
print('Removing', initial_size - new_size,
      'patients as their evaluation was incomplete.')

keep_most_common_diseases = 5
healthy_diagnosis = 'No Diagnosis Given'

# these disorders should also include the no diagnosis given option
keep_most_common_diseases += 1

category_columns = ['DX_' + str(i).zfill(2) + '_Cat' for i in range(1, 11)]

# count for each disorder number of occurences
disorder_counts = {}
for val in behaviour_data[category_columns].values.reshape(-1):
    if not pd.isnull(val):
        if val in disorder_counts:
            disorder_counts[val] += 1
        else:
            disorder_counts[val] = 1

# sort in descending order
disorder_counts = sorted(disorder_counts.items(), key=lambda kv: -kv[1])

most_common_disorders = [x[0]
                         for x in disorder_counts[:keep_most_common_diseases]]

# find users that have no diagnosis within these top diseases
# filtering should cahnge anything as this should also happen at a later stage
mask = None
for col in category_columns:
    mask_col = behaviour_data[col].isin(most_common_disorders)
    if mask is None:
        mask = mask_col
    else:
        mask = mask | mask_col

initial_size = behaviour_data.shape[0]
behaviour_data = behaviour_data[mask]
behaviour_data = behaviour_data.reset_index(drop=True)
new_size = behaviour_data.shape[0]
print('Removing', initial_size - new_size,
      'patients as their diagnoses were very uncommon.')

Removing 282 patients as their evaluation was incomplete.
Removing 37 patients as their diagnoses were very uncommon.


In [0]:
no_diagnosis_given = 'No Diagnosis Given'

if no_diagnosis_given in most_common_disorders:
    no_diag_index = most_common_disorders.index(no_diagnosis_given)
    most_common_disorders = most_common_disorders[:no_diag_index] + \
        most_common_disorders[no_diag_index + 1:]

In [0]:
classes = np.zeros((len(most_common_disorders),
                    behaviour_data.shape[0]), dtype=np.int32)


df_disorders = behaviour_data[category_columns]

for i, disorder in enumerate(most_common_disorders):
    mask = df_disorders.select_dtypes(include=[object]). \
        applymap(lambda x: disorder in x if pd.notnull(x) else False)

    disorder_df = df_disorders[mask.any(axis=1)]

    np.add.at(classes[i], disorder_df.index.values, 1)

In [0]:
behaviour_data_columns = behaviour_data.columns.values.astype(np.str)

columns_to_drop = behaviour_data_columns[
    np.flatnonzero(np.core.defchararray.find(behaviour_data_columns, 'DX') != -1)]

behaviour_data = behaviour_data.drop(columns=columns_to_drop)

In [0]:
for disorder, classification in zip(most_common_disorders, classes):
    behaviour_data[disorder] = classification

In [221]:
behaviour_data.shape

(1777, 311)

In [0]:
combined_df = behaviour_data.set_index('EID')

### Load mri and add to dataset

In [0]:
fa_per_tract = pd.read_csv('DataScience2019_MRI/MRI/DTI/FAPerTract.csv', low_memory=False)

# Remove "/" from the end some IDs 
fa_per_tract['ID'] = fa_per_tract['ID'].apply(lambda x: x[:-1] if "/" in x else x)

# join with behavioural data
combined_df = combined_df.join(fa_per_tract.set_index('ID'), how='inner')

In [0]:
# base_dir = 'DataScience2019_MRI/MRI/structuralMRI'

# column ScanSite already exists in the behavioural data
cort_thick_l = pd.read_csv(os.path.join(base_dir,
    'CorticalThicknessLHROI.csv'), low_memory=False).drop(columns=['ScanSite'])
cort_thick_r = pd.read_csv(os.path.join(base_dir,
    'CorticalThicknessRHROI.csv'), low_memory=False).drop(columns=['eTIV', 'ScanSite'])
cort_vol_l = pd.read_csv(os.path.join(base_dir,
    'CorticalVolumeLHROI.csv'), low_memory=False).drop(columns=['eTIV', 'ScanSite'])
cort_vol_r = pd.read_csv(os.path.join(base_dir,
    'CorticalVolumeRHROI.csv'), low_memory=False).drop(columns=['eTIV', 'ScanSite'])
sub_cort_vol_l = pd.read_csv(os.path.join(base_dir,
    'SubCorticalVolumeLHROI.csv'), low_memory=False).drop(columns=['eTIV', 'ScanSite'])
sub_cort_vol_r = pd.read_csv(os.path.join(base_dir,
    'SubCorticalVolumeRHROI.csv'), low_memory=False).drop(columns=['eTIV', 'ScanSite'])
glob_thick = pd.read_csv(os.path.join(base_dir,
    'GlobalCorticalThickness.csv'), low_memory=False).drop(columns=['ScanSite'])

# Join tables 
struct_mri = pd.merge(cort_thick_l, cort_thick_r, on='ID', how='inner')
struct_mri = pd.merge(struct_mri, cort_vol_l, on='ID', how='inner')
struct_mri = pd.merge(struct_mri, cort_vol_r, on='ID', how='inner')
struct_mri = pd.merge(struct_mri, sub_cort_vol_l, on='ID', how='inner')
struct_mri = pd.merge(struct_mri, sub_cort_vol_r, on='ID', how='inner')
struct_mri = pd.merge(struct_mri, glob_thick, on='ID', how='inner')

In [224]:
combined_df = combined_df.join(struct_mri.set_index('ID'), how='inner')
combined_df.shape

(1053, 684)

### Load EEG  and add to dataset

In [0]:
base_dir = 'DataScience2019_MRI/EEG'

eeg_mic = pd.read_csv(os.path.join(base_dir, "RestingEEG_Microstates.csv"))
eeg_psd = pd.read_csv(os.path.join(base_dir, "RestingEEG_PSD_Average.csv"))
eeg_spectro = pd.read_csv(os.path.join(base_dir, "RestingEEG_Spectro_Average.csv"))

In [0]:
combined_df = combined_df.join(eeg_mic.set_index('id'), how='inner')
combined_df = combined_df.join(eeg_psd.set_index('id'), how='inner')
combined_df = combined_df.join(eeg_spectro.set_index('id'), how='inner')

combined_df.shape

(950, 735)


### Some final preprocessing

In [0]:
# fdx and mdx may contain 'No Diagnosis'
# drop them for now but they may be important
# they correspond to father's and mother's primary diagnosis
columns_to_drop = ['Anonymized.ID', 'mdx', 'fdx', 'fcodxm_1', 'fcodxm_2', 'fcodxm_3', 'mcodxm_1',
                   'mcodxm_2', 'mcodxm_3', 'mcodxmdt', 'TOWRE_Total_Desc', 'Picture_Vocab_Raw',
                   'sib1dx', 'sib1codxm_1', 'sib1codxm_2', 'sib1codxm_3',
                   'sib2dx', 'sib2codxm_1', 'sib2codxm_2', 'sib2codxm_3',
                   'sib3dx', 'sib3codxm_1', 'sib3codxm_2', 'sib3codxm_3',
                   'sib4dx', 'sib4codxm_1', 'sib4codxm_2', 'sib4codxm_3',
                   'sib5dx', 'sib5codxm_1', 'sib5codxm_2', 'sib5codxm_3']

combined_df = combined_df.drop(columns=columns_to_drop)

In [226]:
combined_df.shape

(1053, 652)

## Train

For this time we have only considered the behavioural data and the structural mri as these datasets have the most ids in common.

In [0]:
assert combined_df.shape == (1053, 652)

In [247]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

def mean_imputer(x, y):
    return np.where(np.isnan(x), np.ma.array(x, mask=np.isnan(x)).mean(axis=0), x),\
            np.where(np.isnan(y), np.ma.array(x, mask=np.isnan(x)).mean(axis=0), y)  

# we are goint to remove some behavioral data based on their null values
drop_missing_threshold = 0.5


kf = KFold(n_splits=5, random_state=17, shuffle=True)
kf.get_n_splits(combined_df)

# preds = np.zeros(combined_df.shape[0], len(most_common_disorders))

for train_index, test_index in kf.split(combined_df):
    train, test = combined_df.iloc[train_index], combined_df.iloc[test_index]
    # for testing purposes
    break

columns_mask = pd.isnull(train).sum() / train.shape[0] >= drop_missing_threshold

print('Droping this many columns:', np.sum(columns_mask))

dropped_columns = train.columns[columns_mask]

train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

# classes
train_classes = train[most_common_disorders].values
test_classes = test[most_common_disorders].values

# keep only features
train = train.drop(columns=most_common_disorders)
test = test.drop(columns=most_common_disorders)

train_mask = pd.isnull(train).values.astype(np.float32)
test_mask = pd.isnull(test).values.astype(np.float32)

# deal with numpy because of some weird SettingWithCopyWarning I cannot figure out
# impute based on the mean values
train, test = mean_imputer(train.values, test.values)

scaler = MinMaxScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

Droping this many columns: 130


## Model

In [0]:
import tensorflow as tf
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
import numpy as np
import warnings
import os


def get_batches(iterable, batch_size=64, do_shuffle=True):
    if do_shuffle:
        iterable = shuffle(iterable)

    length = len(iterable)
    for ndx in range(0, length, batch_size):
        iterable_batch = iterable[ndx: min(ndx + batch_size, length)]
        yield iterable_batch


def get_reconstruction_loss(true, predictions, mask):
    loss = np.mean(((true - predictions) ** 2) * mask, axis=1)
    return np.mean(loss, axis=0)


def multi_label_accuracy(true, predictions):
    if not issubclass(predictions.dtype.type, np.integer):
        predictions = before_softmax_to_predictions(predictions)

    return 1 - np.sum((true - predictions) ** 2) / (true.shape[0] * true.shape[1])


def before_softmax_to_predictions(predictions):
    return (predictions >= 0).astype(np.int16)


def f1_per_class(true, predictions):
    if not issubclass(predictions.dtype.type, np.integer):
        predictions = before_softmax_to_predictions(predictions)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        f1_scores = list()
        for i in range(true.shape[1]):
            f1_scores.append(f1_score(true[:, i], predictions[:, i], average='macro'))

    return f1_scores


DEFAULT_LOG_PATH = './autoencoder_predict'


class AutoencodePredict:
    training = None
    input_ = None
    input_mask = None
    intermediate_representation = None
    input_reconstructed = None
    reconstruction_loss = None
    regularization_loss = None
    prediction_loss = None
    true_predictions = None
    predictions = None
    total_loss = None
    pos_weights = None
    class_weights = None

    def __init__(self,
                 number_of_features,
                 num_classes,
                 alpha=1,           # parameter showing the significance of the prediction loss
                 activation=tf.nn.relu,
                 layers=None,
                 prediction_layers=None,
                 dropout=None,
                 regularization=0,
                 masking=0.5):

        self.activation = activation
        self.num_classes = num_classes
        self.alpha = alpha

        if layers is None:
            self.layers = [50, 15]
        else:
            self.layers = layers

        if prediction_layers is None:
            self.prediction_layers = [25, 15]
        else:
            self.prediction_layers = prediction_layers

        self.number_of_features = number_of_features

        self.masking = masking
        self.dropout = dropout

        use_regularization = (regularization > 0)
        self.use_regularization = use_regularization

        if regularization == 0:
            # set to small value to avoid tensorflow error
            # use_regularization = False in this case and will not contribute towards the final loss
            self.regularization = 0.1
        else:
            self.regularization = regularization

    def build_graph(self):

        self.training = tf.placeholder(tf.bool, shape=[], name='training')

        self.input_ = tf.placeholder(tf.float32, shape=[None, self.number_of_features], name='input_data')
        self.input_mask = tf.placeholder(tf.float32, shape=[None, self.number_of_features], name='input_mask')
        self.true_predictions = tf.placeholder(tf.float32, shape=[None, self.num_classes], name='input_predictions')

        # placeholders used to balance the loss for individual classes and predictions
        self.pos_weights = tf.placeholder(tf.float32, shape=[5], name='pos_weights')
        self.class_weights = tf.placeholder(tf.float32, shape=[5], name='class_weights')

        self.intermediate_representation = self.encode(self.input_)

        self.input_reconstructed = self.decode(self.intermediate_representation)

        self.predictions = self.predict_classes(self.intermediate_representation)

        if self.input_mask is not None:
            self.reconstruction_loss = tf.reduce_mean(((self.input_ - self.input_reconstructed) ** 2) * self.input_mask)
        else:
            self.reconstruction_loss = tf.reduce_mean((self.input_ - self.input_reconstructed) ** 2)

        # self.prediction_loss = tf.reduce_mean(
        #     tf.nn.sigmoid_cross_entropy_with_logits(labels=self.true_predictions,
        #                                             logits=self.predictions)) * self.alpha
        def my_loss(labels, logits, pos_weight, class_weight):
            return tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(labels=labels,
                                                                           logits=logits,
                                                                           pos_weight=pos_weight)) * class_weight

        loss_per_class = tf.map_fn(
            lambda x: my_loss(x[0], x[1], x[2], x[3]),
            (tf.transpose(self.true_predictions), tf.transpose(self.predictions), self.pos_weights, self.class_weights),
            dtype=tf.float32)

        self.prediction_loss = tf.reduce_mean(loss_per_class)

        if self.use_regularization:
            self.regularization_loss = tf.losses.get_regularization_loss()

            # TODO CHANGE TO ORIGINAL
            self.total_loss = self.reconstruction_loss + self.regularization_loss + self.prediction_loss
            # self.total_loss = self.prediction_loss
        else:
            self.total_loss = self.reconstruction_loss + self.prediction_loss
            # self.total_loss = self.prediction_loss

    def predict_classes(self, intermediate):

        x = intermediate
        for i, layer in enumerate(self.prediction_layers):
            x = tf.layers.dense(x, layer, use_bias=True, name='predict_layer_' + str(i),
                                activation=self.activation,
                                kernel_regularizer=tf.contrib.layers.l2_regularizer(self.regularization))
            if self.dropout is not None:
                x = tf.layers.dropout(x, rate=self.dropout, training=self.training)

        x = tf.layers.dense(x, self.num_classes, use_bias=True, name='predict_layer_final', activation=None,
                            kernel_regularizer=tf.contrib.layers.l2_regularizer(self.regularization))
        return x

    def encode(self,
               input_):

        if self.masking > 0:
            # mask randomly some of the inputs
            input_ = tf.layers.dropout(input_, rate=self.masking, training=self.training)

        x = input_
        # important to use relu as a first layer to make all unobserved values set to 0?
        for i, layer in enumerate(self.layers):
            x = tf.layers.dense(x, layer, use_bias=True, name='input_layer_1_' + str(i),
                                activation=self.activation,
                                kernel_regularizer=tf.contrib.layers.l2_regularizer(self.regularization))
            if self.dropout is not None:
                x = tf.layers.dropout(x, rate=self.dropout, training=self.training)

        return x

    def decode(self,
               intermediate):

        x = intermediate
        for i, layer in enumerate(self.layers[::-1][1:]):
            x = tf.layers.dense(x, layer, use_bias=True, name='input_layer_2_' + str(i),
                                activation=self.activation,
                                kernel_regularizer=tf.contrib.layers.l2_regularizer(self.regularization))
            if self.dropout is not None:
                x = tf.layers.dropout(x, rate=self.dropout, training=self.training)

        x = tf.layers.dense(x, self.number_of_features, use_bias=True, name='input_layer_2_final',
                            activation=self.activation,
                            kernel_regularizer=tf.contrib.layers.l2_regularizer(self.regularization))

        if self.dropout is not None:
            x = tf.layers.dropout(x, rate=self.dropout, training=self.training)

        return x

    def reconstruct(self,
                    data,
                    log_path=None):

        if log_path is None:
            log_path = DEFAULT_LOG_PATH

        with tf.Graph().as_default():
            with tf.Session() as sess:
                self.build_graph()

                saver = tf.train.Saver()
                saver.restore(sess, tf.train.latest_checkpoint(log_path))

                data_reconstructed = np.zeros((data.shape[0], self.number_of_features))

                for rows in get_batches(list(range(data.shape[0])), batch_size=64, do_shuffle=False):
                    rows_features = [data[i, :] for i in rows]

                    rows_reconstructed = sess.run(self.input_reconstructed,
                                                  feed_dict={
                                                      self.input_: rows_features,
                                                      self.training: False
                                                  })

                    data_reconstructed[rows] = rows_reconstructed

                return data_reconstructed

    def get_latent_space(self,
                         data,
                         log_path=None):

        if log_path is None:
            log_path = DEFAULT_LOG_PATH

        with tf.Graph().as_default():
            with tf.Session() as sess:
                self.build_graph()

                saver = tf.train.Saver()
                saver.restore(sess, tf.train.latest_checkpoint(log_path))

                data_latent = np.zeros((data.shape[0], self.layers[-1]))

                for rows in get_batches(list(range(data.shape[0])), batch_size=64, do_shuffle=False):
                    rows_features = [data[i, :] for i in rows]

                    rows_latent = sess.run(self.input_reconstructed,
                                           feed_dict={
                                               self.input_: rows_features,
                                               self.training: False
                                           })

                    data_latent[rows] = rows_latent

                return data_latent

    def predict_with_sess(self, sess, data):
        predictions = np.zeros((data.shape[0], self.num_classes))

        for rows in get_batches(list(range(data.shape[0])), batch_size=64, do_shuffle=False):
            rows_features = [data[i, :] for i in rows]

            rows_predictions = sess.run(self.predictions,
                                        feed_dict={
                                            self.input_: rows_features,
                                            self.training: False
                                        })

            predictions[rows] = rows_predictions

        return predictions

    def predict(self,
                data,
                log_path=None,
                make_integer=True):

        if log_path is None:
            log_path = DEFAULT_LOG_PATH

        with tf.Graph().as_default():
            with tf.Session() as sess:
                self.build_graph()

                saver = tf.train.Saver()
                saver.restore(sess, tf.train.latest_checkpoint(log_path))

                if make_integer:
                    return before_softmax_to_predictions(self.predict_with_sess(sess, data))
                else:
                    return self.predict_with_sess(sess, data)

    def fit(self,
            data,
            data_mask,
            data_labels,
            test_data=None,
            test_data_mask=None,
            test_data_labels=None,
            pos_weights=None,
            class_weights=None,
            n_epochs=350,
            decay_steps=None,
            learning_rate=None,
            decay=None,
            log_path=None,
            verbose=True,
            print_every_epochs=10):

        if pos_weights is None:
            pos_weights = [1] * self.num_classes

        if class_weights is None:
            class_weights = [1] * self.num_classes

        if decay_steps is None:
            # empirical
            decay_steps = data.shape[0] // 64 * 5

        if learning_rate is None:
            learning_rate = 0.001

        if decay is None:
            decay = 0.96

        if log_path is None:
            log_path = DEFAULT_LOG_PATH

        validation = False
        if test_data is not None and test_data_mask is not None and test_data_labels is not None:
            validation = True

        with tf.Graph().as_default():
            with tf.Session() as sess:

                self.build_graph()

                global_step = tf.Variable(1, name='global_step', trainable=False)

                learning_rate = tf.Variable(learning_rate, trainable=False, dtype=tf.float32, name="learning_rate")
                learning_rate = tf.train.exponential_decay(learning_rate, global_step, decay_steps, decay)

                # Gradients and update operation for training the model.
                opt = tf.train.AdamOptimizer(learning_rate)
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

                with tf.control_dependencies(update_ops):
                    # Update all the trainable parameters
                    train_step = opt.minimize(self.total_loss, global_step=global_step)

                saver = tf.train.Saver(max_to_keep=3)

                sess.run(tf.global_variables_initializer())

                for epoch in range(n_epochs):
                    reconstruction_loss = 0

                    if self.use_regularization:
                        regularization_loss = 0

                    prediction_loss = 0
                    for rows in get_batches(list(range(data.shape[0])), batch_size=data.shape[0]):
                        rows_features = data[rows]
                        rows_masks = data_mask[rows]
                        rows_predictions = data_labels[rows]

                        if self.use_regularization:
                            _, rec_loss, reg_loss, pred_loss, step = sess.run(
                                [train_step, self.reconstruction_loss, self.regularization_loss, self.prediction_loss,
                                 global_step],
                                feed_dict={
                                    self.input_: rows_features,
                                    self.input_mask: rows_masks,
                                    self.true_predictions: rows_predictions,
                                    self.pos_weights: pos_weights,
                                    self.class_weights: class_weights,
                                    self.training: True
                                })
                        else:
                            _, rec_loss, pred_loss, step = sess.run(
                                [train_step, self.reconstruction_loss, self.prediction_loss,
                                 global_step],
                                feed_dict={
                                    self.input_: rows_features,
                                    self.input_mask: rows_masks,
                                    self.true_predictions: rows_predictions,
                                    self.pos_weights: pos_weights,
                                    self.class_weights: class_weights,
                                    self.training: True
                                })

                        reconstruction_loss += rec_loss

                        if self.use_regularization:
                            regularization_loss += reg_loss

                        prediction_loss += pred_loss

                    if epoch % print_every_epochs == 0:
                        if verbose and validation:
                            predictions_train = self.predict_with_sess(sess, data)
                            train_accuracy = multi_label_accuracy(data_labels, predictions_train)

                            predictions_test = self.predict_with_sess(sess, test_data)
                            test_accuracy = multi_label_accuracy(test_data_labels, predictions_test)

                            if self.use_regularization:
                                print('At epoch {:4d} rec_loss: {:8.4f} reg_loss: {:8.4f} pred_loss: {:8.4f} train_'
                                      'acc: {:.4f} test_acc {:.4f}'.format(epoch, reconstruction_loss,
                                                                           regularization_loss, prediction_loss,
                                                                           train_accuracy, test_accuracy))
                            else:
                                print('At epoch {:4d} rec_loss: {:8.4f} pred_loss: {:8.4f} train_'
                                      'acc: {:.4f} test_acc {:.4f}'.format(epoch, reconstruction_loss,
                                                                           prediction_loss,
                                                                           train_accuracy, test_accuracy))

                            f1_scores = f1_per_class(data_labels, predictions_train)
                            print('train f1_scores: ', end='')
                            for sc in f1_scores:
                                print('{:.3f} '.format(sc), end='')

                            f1_scores = f1_per_class(test_data_labels, predictions_test)
                            print('test f1_scores: ', end='')
                            for sc in f1_scores:
                                print('{:.3f} '.format(sc), end='')
                            print()

                        saver.save(sess, os.path.join(log_path, "model"), global_step=epoch)


In [286]:
print(train.shape)
print(test.shape)

(842, 517)
(211, 517)


In [0]:
alpha=2e-3          # parameter showing the significance of the prediction loss
activation=tf.nn.relu
layers=[50, 15]
prediction_layers=[100, 25]
dropout=0.1
regularization=5e-5
masking=0.15

model = AutoencodePredict(train.shape[1], len(most_common_disorders), alpha=alpha, activation=activation, layers=layers,
                          prediction_layers=prediction_layers, dropout=dropout, regularization=regularization, 
                          masking=masking)

In [0]:
learning_rate = 3e-3
# pos_weights = [1, 1, 1, 1, 1]
# set larger weight for instances of disorders that are very infrequent

def smoothing_fun(a, beta=1):
    return [min(i, 1) for i in a] + beta * np.log([max(i - 1, 1) for i in a])

pos_weights = smoothing_fun(1 / (np.sum(train_classes, axis=0) / train_classes.shape[0]))
class_weights = [1, 1, 1, 1, 1]

model.fit(train, train_mask, train_classes, test_data=test, test_data_mask=test_mask, test_data_labels=test_classes,
          n_epochs=15000, print_every_epochs=200, pos_weights=pos_weights, class_weights=class_weights    )

In [289]:
preds = model.predict(test)

INFO:tensorflow:Restoring parameters from ./autoencoder_predict/model-16000


In [290]:
preds

array([[0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0],
       ...,
       [1, 1, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 1, 0]], dtype=int16)

In [291]:
test_classes

array([[0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0]], dtype=int32)

In [292]:
multi_label_accuracy(test_classes, preds)

0.8454976303317535

In [297]:
# total recall

indices = np.where(test_classes == 1)
total = len(indices[0])
found = 0
for i, j in zip(indices[0], indices[1]):
    found += preds[i][j]

print('Total recall:', found / total)
print('Total precision:', found / np.sum(preds))

Total recall: 0.7437722419928826
Total precision: 0.6966666666666667


In [293]:
print(test.shape)
print(np.sum(test_classes, axis=0))
print(np.sum(preds, axis=0))

(211, 517)
[154  58  28  29  12]
[161  63  39  32   5]
