In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import time

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K
import tensorflow.keras.backend as B

import tensorflow_addons as tfa

from keras.models import Model

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import log_loss
from sklearn.utils import shuffle
from sklearn.mixture import BayesianGaussianMixture
from sklearn.cluster import KMeans

In [None]:
# Directory holding the competition CSV files.
data_folder = '../input/lish-moa/'

# Feature tables for the train and test splits.
train, test = (
    pd.read_csv(data_folder + csv_name)
    for csv_name in ('/train_features.csv', '/test_features.csv')
)

# Scored and non-scored target tables.
targets, targets_ns = (
    pd.read_csv(data_folder + csv_name)
    for csv_name in ('/train_targets_scored.csv', '/train_targets_nonscored.csv')
)

# Table read from train_drug.csv; its `drug_id` column is used further down.
drug_table = pd.read_csv(data_folder + '/train_drug.csv')

train.head()

In [None]:
# Row counts of each split before they are merged.
print('Train rows:', train.shape[0])
print('Test rows:', test.shape[0])

# Mark every row with the split it came from, then stack train on top of test.
tag = 'tag'
train[tag], test[tag] = 'train', 'test'

total = pd.concat([train, test])
print('Total rows:', total.shape[0])

In [None]:
columns = train.columns.to_list()

# Feature groups are identified purely by their column-name prefix.
c_columns = list(filter(lambda name: name.startswith('c-'), columns))
print('Number of c_cols:', len(c_columns))
g_columns = list(filter(lambda name: name.startswith('g-'), columns))
print('Number of g_cols:', len(g_columns))

In [None]:
# Names of the non-numeric / categorical columns used throughout the notebook.
id_col = 'sig_id'
type_col, time_col, dose_col = 'cp_type', 'cp_time', 'cp_dose'

# Show the distinct values each categorical column takes across train+test.
print('Type:', total[type_col].unique())
print('Time:', total[time_col].unique())
print('Dose', total[dose_col].unique())

In [None]:
# Rows whose cp_type is 'ctl_vehicle', across both splits.
vehicles = total[total[type_col].eq('ctl_vehicle')]
print('Total vehicles:', vehicles.shape[0])

In [None]:
# Column-wise sums of the scored targets, as a one-row frame indexed by 'sum'.
# The id column is dropped BEFORE aggregating (summing it would only concatenate
# every sig_id string), and the aggregation is named by string because passing
# the builtin `sum` to `agg` is deprecated in recent pandas.
targets_agg = targets.drop(columns=['sig_id']).agg(['sum'])
target_col = targets_agg.columns

# Targets whose column sum is below 10.
zero_targets = targets_agg.loc[:, (targets_agg < 10).any()]
zero_targets

In [None]:
drug_id = 'drug_id'
# Put each row's drug id next to its scored targets (column-wise concat, aligned on the row index).
# NOTE(review): presumably train_drug.csv and train_targets_scored.csv list rows in the same order — confirm.
drug_targets = pd.concat([drug_table[drug_id], targets], axis = 1)
grouped = drug_targets.groupby(drug_id)
# Number of rows per drug id (one entry per distinct drug).
drug_examples = grouped[drug_id].count()

train_ids = train[id_col]
# Same count broadcast back to row level: each row carries the size of its drug group.
train_nums = grouped[drug_id].transform('count')

In [None]:
# Work on copies so the `total` / `targets` frames stay untouched.
total2, targets2 = total.copy(), targets.copy()

# exclude zero targets (underrepresented columns)
#targets2.drop(zero_targets.columns, axis=1, inplace=True)

# Scored target names: every column of the scored table except the id.
scored_columns = targets2.columns.drop('sig_id')
num_scored = len(scored_columns)

# add non-scored targets (side by side with the scored ones)
targets2 = pd.concat([targets2, targets_ns], axis = 1)

# Change cp_dose: encode the two dose labels as 0 / 1
total2[dose_col] = total2[dose_col].map({'D1':0, 'D2':1})

# Change cp_time
def onehot_time(x):
    """Return three 0/1 indicators telling whether `x` equals 24, 48 and 72.

    `x` must support element-wise `==` and `.astype(int)` (e.g. a pandas
    Series). The result is a list `[is_24, is_48, is_72]`.
    """
    return [(x == hours).astype(int) for hours in (24, 48, 72)]
# Replace cp_time by three indicator columns (24 / 48 / 72).
total2['t1'], total2['t2'], total2['t3'] = onehot_time(total2[time_col])

# Drop cp_time
total2.drop([time_col], axis=1, inplace=True)

# Remove tags
total2.drop([tag], axis=1, inplace=True)

# Remove vehicles
vehicles_mask = total2[type_col] == 'ctl_vehicle'
n_train = len(targets2.index)

# Take explicit copies of the filtered frames: they are modified in place below
# (and by rm_index), which on a bare boolean-mask selection triggers pandas'
# SettingWithCopyWarning.
vehicles = total2[vehicles_mask].copy()
total2 = total2[~vehicles_mask].copy()
# The first n_train entries of the mask belong to the training rows.
targets2 = targets2[~vehicles_mask[:n_train]].copy()
train_nums2 = train_nums[~vehicles_mask[:n_train]].to_numpy()


# Drop cp_type
total2.drop([type_col], axis=1, inplace=True)
vehicles.drop([type_col], axis=1, inplace=True)

# Remove indexes
def rm_index(df):
    """Strip the id column from `df` and renumber its rows from 0, in place.

    Uses the notebook-level `id_col` for the column name. Returns nothing;
    the frame passed in is modified.
    """
    df.drop(columns=[id_col], inplace=True)
    df.reset_index(drop=True, inplace=True)

# Renumber the rows and drop the id column of each frame (in place).
rm_index(total2)
rm_index(targets2)
rm_index(vehicles)

# Only the last expression of a cell is displayed, so just targets2.head() is shown.
total2.head()
targets2.head()

In [None]:
def scale_total(total2, scaler = None):
    """Scale a feature table and return it together with the scaler used.

    When `scaler` is None a new MinMaxScaler is fitted on `total2`; otherwise
    the given (already fitted) scaler is applied as-is.

    Returns `(scaled, scaler)` where `scaled` is a new DataFrame built from the
    transformed values (so it has default integer column labels).
    """
    #TODO: find better way
    if scaler is None:
        scaler = MinMaxScaler().fit(total2)

    scaled = pd.DataFrame(scaler.transform(total2))
    return scaled, scaler

In [None]:
# Fit a new scaler on the combined (train + test) feature table and scale it.
total_scaled, scaler = scale_total(total2)
# Re-use that fitted scaler for the control-vehicle rows so both share one scale.
vehicles_scaled, _ = scale_total(vehicles, scaler)
total_scaled.head()

In [None]:
seed = None
def set_seed(s):
    """Seed the random number generators used by the notebook and return `s`.

    The original body passed the module-level `seed` (still None while this
    runs) to TensorFlow instead of the argument, so nothing was actually
    seeded. NumPy and `random` are seeded too because later cells draw from
    `np.random`.
    """
    random.seed(s)
    np.random.seed(s)
    tf.random.set_seed(s)
    return s
seed = set_seed(42)

In [None]:
# Scaled features as a plain array; training rows come first, test rows after.
total_proc = total_scaled.to_numpy()
train_targets = targets2[scored_columns]
trains_without_veh = len(train_targets)
# Training part of the feature matrix (one row per row of train_targets).
corr_train_set = total_proc[:trains_without_veh]
#test_set = total_proc[trains_without_veh:]

num_columns, num_targets = corr_train_set.shape[1], train_targets.shape[1]
print('Train data', corr_train_set.shape)
print('Train targets', train_targets.shape)

In [None]:
class CorrLoss(tf.keras.losses.Loss):
    """Loss that pushes a per-output correlation-like ratio towards 1.

    For every output column it averages (y_true - mu) * (y_pred - moving_mean)
    over the batch, divides by `sigma * moving_variance`, and returns the mean
    of |1 - ratio| over the columns.

    NOTE(review): the denominator is computed once in __init__ from the value
    of `moving_variance` at that moment — confirm this is intended.
    """

    def __init__(self, mu, sigma, moving_mean, moving_variance, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.mu = mu
        self.moving_mean = moving_mean
        # Denominator of the ratio.
        self.b = sigma * moving_variance

    def __call__(self, y_true, y_pred, *args, **kwargs):
        # __call__ (not call) is overridden, so the base class reduction and
        # any extra arguments such as sample weights are bypassed.
        centered_true = y_true - self.mu
        centered_pred = y_pred - self.moving_mean
        covariance = tf.reduce_mean(centered_true * centered_pred, axis = 0)
        ratio = tf.math.divide(covariance, self.b)
        return tf.reduce_mean(tf.math.abs(1 - ratio))

def zero_loss(y_true, y_pred):
    """Placeholder loss: ignores both arguments and always contributes 0."""
    return 0

def create_corr_model(num_inputs, num_outputs, mu, sigma):
    """Build and compile the two-branch model trained with CorrLoss.

    Two parallel Dense(1024) -> BatchNorm -> tanh branches read the same input;
    their bias-free linear projections to `num_outputs` are added and passed
    through a sigmoid (layer 'final'). A further BatchNormalization layer
    ('bn') on that output is exposed as a second model output; it carries
    `zero_loss`, and two of its variables are handed to CorrLoss.

    Args:
        num_inputs: passed to `L.Input` as the input shape.
        num_outputs: width of the two projection layers / of the output.
        mu, sigma: forwarded unchanged to CorrLoss.

    Returns:
        The compiled Keras model with outputs `[final, bn]`.
    """
    in_layer = L.Input(num_inputs)

    #model.add(tfa.layers.WeightNormalization(L.Dense(int(part*num_inputs), 
    #                                                 activation='sigmoid')))
    base_layer = in_layer#L.BatchNormalization()(in_layer)#in_layer
    
    # Branch 1
    hidden = L.Dense(1024, use_bias = False)(base_layer)
    hidden = L.BatchNormalization()(hidden)
    hidden = L.Activation('tanh')(hidden)
    
    out_layer = L.Dense(num_outputs, name='out', use_bias = False)(hidden)
    
    # Branch 2: same structure, separate weights, same input
    hidden = L.Dense(1024, use_bias = False)(base_layer)
    hidden = L.BatchNormalization()(hidden)
    hidden = L.Activation('tanh')(hidden)
    
    # Sum the two branch projections before the final sigmoid
    out_layer += L.Dense(num_outputs, name='out1', use_bias = False)(hidden)
    
    out_layer = L.Activation('sigmoid', name='final')(out_layer)
    bn = L.BatchNormalization(name = 'bn')
    out2 = bn(out_layer)
    # NOTE(review): indices 2 and 3 are presumably the layer's moving mean and
    # moving variance (after gamma and beta) — verify against the Keras version in use.
    moving_mean = bn.variables[2]
    moving_variance = bn.variables[3]

    model = Model(in_layer, [out_layer, out2])
    # One loss per output: CorrLoss on 'final', zero_loss on 'bn'.
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),#tfa.optimizers.AdamW(learning_rate = 0.001, weight_decay = 0.0005)
                 loss = [CorrLoss(mu, sigma, moving_mean, moving_variance), zero_loss])
    #model.compile(optimizer=tf.keras.optimizers.Adam(0.00035), loss=tf.keras.losses.MeanSquaredError())
    return model

In [None]:
def do_train(X, y):
    """Create the correlation model for `(X, y)` and a train/validation split.

    The actual `model.fit` call is currently commented out, so the model is
    returned compiled but untrained.

    Args:
        X: 2-D feature array (rows are samples).
        y: 2-D target array with the same number of rows.

    Returns:
        `(model, (train_ids, val_ids))` where the ids are row positions from a
        60/40 split seeded with the notebook-level `seed`.
    """
    num_inputs = X.shape[1]
    num_outputs = y.shape[1]

    # Per-target mean and standard deviation of the labels, consumed by CorrLoss.
    mu = np.mean(y, axis = 0)
    sigma = np.std(y, axis = 0)

    #kf=KFold(2, shuffle=True, random_state=seed)
    id_range = np.arange(X.shape[0])
    train_ids, val_ids= train_test_split(id_range, test_size=0.4, random_state=seed)

    checkpoint_filepath = 'corr_checkpoint.h5'
    # `monitor` is the ModelCheckpoint argument that selects the tracked
    # quantity; the previous `metric=` keyword is not a ModelCheckpoint
    # parameter, so the intended 'val_final_loss' was never the one tracked.
    corr_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor='val_final_loss',
        save_weights_only=True,
        save_best_only=True)

    model = create_corr_model(num_inputs, num_outputs, mu, sigma)
    # Training is disabled; re-enable by uncommenting.
    #model.fit(X[train_ids],
    #          y[train_ids],
    #          validation_data=(X[val_ids], y[val_ids]),
    #          callbacks = [corr_checkpoint_callback],
    #          epochs=200, batch_size=2048)

    return model, (train_ids, val_ids)

In [None]:
# `np.float` was only an alias of the builtin `float` and has been removed from
# NumPy; the builtin gives the same float64 cast.
corr_model, (corr_train_ids, corr_val_ids) = do_train(corr_train_set, train_targets.values.astype(float))

In [None]:
# Number of principal components appended as extra features.
n_pca = 64
# random_state pins the randomized SVD solver that PCA may select on its own,
# so the components are reproducible across runs.
pca = PCA(n_components=n_pca, random_state=seed)
pca_features = pca.fit_transform(total_scaled.values)
pca_columns = ['pca-{:03d}'.format(i) for i in range(n_pca)]

In [None]:
#mixture = BayesianGaussianMixture(n_components = 64, verbose = 2)
#mixture.fit(total_scaled)
#mix_features = mixture.predict_proba(total_scaled)
#mix_columns = ['mix-{:03d}'.format(i) for i in range(mix_features.shape[1])]

In [None]:
#kmeans = KMeans(128)
#kmeans.fit(total_scaled)
#kmeans_features = kmeans.transform(total_scaled)
#kmeans_columns = ['kmeans-{:03d}'.format(i) for i in range(kmeans_features.shape[1])]

In [None]:
#added_features = np.concatenate([pca_features, mix_features, kmeans_features], axis = 1)
#added_columns = pca_columns + mix_columns + kmeans_columns
added_features = pca_features
added_columns = pca_columns

# Scaled original features (with their column names restored) side by side
# with the engineered features.
new_total = pd.concat(
    [pd.DataFrame(total_proc, columns = total2.columns),
     pd.DataFrame(added_features, columns = added_columns)],
    axis = 1,
)
train_targets = targets2[scored_columns]

# The first `trains_without_veh` rows are the training rows, the rest is test.
train_set = new_total.iloc[:trains_without_veh]
test_set = new_total.iloc[trains_without_veh:]

num_columns, num_targets = train_set.shape[1], train_targets.shape[1]
print('Train data', train_set.shape)
print('Train targets', train_targets.shape)
print('Test set', test_set.shape)

In [None]:
# partially from https://github.com/google-research/google-research/blob/master/tabnet/tabnet_model.py
def glu(act, n_units):
    """Gated linear unit: first `n_units` columns gated by the remaining ones.

    The columns of `act` from `n_units` onward go through a sigmoid and
    multiply the first `n_units` columns element-wise.
    """
    values = act[:, :n_units]
    gates = tf.nn.sigmoid(act[:, n_units:])
    return values * gates

def dense_bn_glu(input_size, num_features, dropout = 0.5):
    """Build a small standalone model: Dropout -> Dense -> BatchNorm -> GLU.

    The Dense layer is `2 * num_features` wide so that the GLU can split it
    into `num_features` values and `num_features` gates.

    Returns a Keras Model mapping an `input_size` input to `num_features` outputs.
    """
    inputs = L.Input(input_size)
    dropped = L.Dropout(dropout)(inputs)
    projected = L.Dense(2 * num_features)(dropped)
    normalized = L.BatchNormalization()(projected)
    gated = glu(normalized, num_features)

    return Model(inputs, gated)

def combine_models(model1, model2):
    """Chain two models: feed `model1`'s output into `model2`.

    Returns a new Model that takes `model1`'s input and yields `model2`'s output.
    """
    return Model(model1.input, model2(model1.output))

In [None]:
# Logic handling TabNet
def create_TabNet2_model(num_columns, n_steps = 4, dropout = 0.5):
    """Build the (uncompiled) multi-step, TabNet-style classifier.

    At each of `n_steps` steps a sigmoid feature mask is computed from the
    previous step's features, applied to the batch-normalised input, and a
    feature transformer produces `2 * num_scored` features: the first
    `num_scored` become that step's sigmoid predictions, the rest drive the
    next masks. A second learned mask per step weights each step's
    predictions before they are summed over the steps.

    Relies on the notebook-level `num_scored`.

    Args:
        num_columns: number of input features.
        n_steps: number of decision steps.
        dropout: dropout rate passed to every dense_bn_glu block.

    Returns:
        A Keras Model with three outputs, in order:
        'output' (step-weighted predictions summed over the steps),
        'mask'   (the learned per-step output masks, before normalisation),
        'sparse' (mean of the entropy-like term computed from the feature masks).
    """
    num_shared = 1
    num_unique = 2
    num_features = 2*num_scored
    num_output_features = num_scored
    num_steps = n_steps
    relaxation = 1.5
    
    input_layer = L.Input(num_columns)
    input_base = L.BatchNormalization()(input_layer)
    masked_input = input_base
    # Per-step predictions, concatenated along a new last axis after the loop.
    output_agg = []#tf.ones([0, num_scored, 1])#np.empty([0, num_scored, 1], dtype=np.float32)#0
    # Running product of (relaxation - mask) over the steps done so far;
    # it scales the next feature mask before the sigmoid.
    agg_compl_mask = 1
    
    mask_agg = [] # for sparse loss
    epsilon = 1e-6
    
    # Static mask
    # NOTE: only referenced by the commented-out "static output mask" line below,
    # so it currently has no effect on the model (it does draw from np.random).
    output_mask = np.random.randint(2, size=[1, num_scored, num_steps])
    output_mask_compensation = num_steps / (np.sum(output_mask, axis=2, keepdims = True) + 1e-6)
    output_mask = output_mask * output_mask_compensation
    
    #Learned mask agg
    # Same bookkeeping as agg_compl_mask / mask_agg, for the output masks.
    out_agg_compl_mask = 1
    out_mask_agg = []
    
    
    # Block re-used by every feature transformer. With num_shared == 1 the loop
    # below runs zero times, so the shared part is a single dense_bn_glu block.
    shared_model = dense_bn_glu(num_columns, num_features, dropout)
    for i in range(num_shared-1):
        shared_model = combine_models(shared_model, 
                                      dense_bn_glu(num_features, num_features, dropout))

    def feature_transformer(num_layers, num_features, num_shared_features = None):
        """Return a model: the shared block followed by `num_layers` new dense_bn_glu blocks."""
        cur_model = shared_model
        if (num_shared_features is None):
            num_shared_features = num_features
        for i in range(num_layers):
            cur_model = combine_models(cur_model,
                                       dense_bn_glu(num_shared_features, num_features, dropout))
        return cur_model

    def attentive_transformer(num_features, input_data):
        """Dense + BatchNorm projection of `input_data` to `num_features` mask logits."""
        hidden = L.Dense(num_features)(input_data)
        hidden = L.BatchNormalization()(hidden)
        return hidden
    
    #control_features = feature_transformer(num_unique, num_features)(masked_input)
    # Initial pass on the unmasked input; only its second half is used,
    # to compute the first feature mask.
    cur = feature_transformer(num_unique, num_features)(masked_input)
    coef_features = cur[:,num_output_features:]
    for i in range(num_steps):
        # Feature mask for this step, scaled by the running product before the sigmoid.
        mask = attentive_transformer(num_columns, coef_features)
        mask *= agg_compl_mask
        mask = L.Activation('sigmoid')(mask)
        mask_agg.append(tf.expand_dims(mask, axis = 2))
        agg_compl_mask *= (relaxation - mask)
        masked_input = mask * input_base
        
        # New feature transformer per step (new blocks on top of the shared one).
        cur = feature_transformer(num_unique, num_features)(masked_input)
        # Second half of the features feeds the masks; first half becomes the predictions.
        coef_features = cur[:,num_output_features:]
        output_features = tf.expand_dims(L.Activation('sigmoid')(cur[:,:num_output_features]), axis = 2)
        #output_agg = tf.concat([output_agg, (1.0/num_steps)*output_features], axis = 2)
        
        # Learned out mask START
        out_mask = attentive_transformer(num_scored, coef_features)
        out_mask *= out_agg_compl_mask
        out_mask = L.Activation('sigmoid')(out_mask)
        out_agg_compl_mask *= (relaxation - out_mask)
        out_mask = tf.expand_dims(out_mask, axis = 2)
        out_mask_agg.append(out_mask)
        # Learned out mask END
        
        # Each step contributes 1/num_steps of its prediction.
        output_agg.append((1.0/num_steps)*output_features)
    
    # Stack the per-step predictions along the last (step) axis.
    output = tf.concat(output_agg, axis=2)
    
    #Use random output mask
    #output = L.Dropout(0.5, noise_shape=[None, 1, num_steps])(output)
    
    #Use static output mask
    #output = output_mask * output
    
    #Use learned output mask
    out_mask = tf.concat(out_mask_agg, axis = 2)
    # Identity layer whose only purpose is to expose the raw masks as the named output 'mask'.
    mask_loss = L.Lambda(lambda x: x, name='mask')(out_mask)
    # Rescale so that, per sample and target, the masks sum to (about) num_steps over the steps.
    out_mask_compensation = num_steps / (tf.reduce_sum(out_mask, axis=2, keepdims = True) + epsilon)
    out_mask = out_mask_compensation*out_mask#L.Lambda(lambda x:  * x, name='mask_norm')(out_mask)
    output = out_mask * output
    
    # Calc sparse loss
    # Entropy-like term -m*log(m + eps) of the feature masks, summed over the
    # feature axis and then averaged down to a single scalar.
    mask_total = tf.concat(mask_agg, axis = 2)
    sparse_val = -mask_total*tf.math.log(mask_total+epsilon)
    sparse_val = tf.reduce_sum(sparse_val, axis = 1) # sum on dimensions
    sparse_loss = L.Lambda(lambda x:tf.reduce_mean(x), name='sparse')(sparse_val)
    
    
    # Final prediction: sum the mask-weighted per-step predictions over the step axis.
    output = L.Lambda(lambda x:tf.reduce_sum(x, axis = 2), name='output')(output)
    
    return Model(input_layer, [output, mask_loss, sparse_loss])#, mask_loss, sparse_loss])

In [None]:
def dummy_loss(y_true, y_pred):
    """Pass-through loss: the model output itself is the loss value.

    `y_true` is ignored; used for outputs that already compute a penalty.
    """
    return y_pred

# Default learning rate for the classifier.
base_lr = 0.01

def compile_model(model, lr = base_lr):
    """Compile the three-output classifier in place with Adam at rate `lr`.

    Losses, in output order: binary cross-entropy, binary cross-entropy and
    `dummy_loss`, weighted 1.0 / 0.0003 / 0.0000001. Sigmoid focal
    cross-entropy is tracked as a metric on the output named 'output'.
    """
    focal_loss = tfa.losses.SigmoidFocalCrossEntropy()
    binary_loss = tf.keras.losses.BinaryCrossentropy()
    per_output_losses = [binary_loss, binary_loss, dummy_loss]
    per_output_weights = [1.0, 0.0003, 0.0000001]
    model.compile(
        optimizer=tf.keras.optimizers.Adam(lr),
        loss=per_output_losses,
        metrics={'output':focal_loss},
        loss_weights=per_output_weights,
    )

In [None]:
def smooth_labels(labels, factor=0.1):
    """Return a label-smoothed copy of `labels`.

    Every value v becomes v * (1 - factor) + factor / n_columns.

    Unlike the previous in-place version this never modifies the caller's
    array, and it also accepts integer label arrays (the in-place float
    multiply raised on those).

    Args:
        labels: 2-D array-like exposing `.shape[1]` and element-wise arithmetic.
        factor: smoothing strength in [0, 1].

    Returns:
        A new object of the smoothed labels; `labels` is left untouched.
    """
    smoothed = labels * (1.0 - factor)
    smoothed = smoothed + (factor / labels.shape[1])
    return smoothed

def get_mask_loss_data(train_shape, val_shape, f=np.zeros):
    """Create placeholder targets for an auxiliary output.

    Calls `f` once with `train_shape` and once with `val_shape` (zeros by
    default) and returns the pair `(train_part, val_part)`.
    """
    train_part = f(train_shape)
    val_part = f(val_shape)
    return train_part, val_part

def get_sparse_loss_data(train_shape, val_shape):
    """Zero-filled placeholder targets of the given train / validation shapes."""
    return get_mask_loss_data(train_shape, val_shape, f=np.zeros)

class Cycle_LR():
    """Learning-rate schedule: one fixed drop, then a repeating linear ramp.

    * At epoch 35 the rate is set to 0.33 * base_lr.
    * After `cycle_start`, within every `period` epochs the rate moves
      linearly from alpha1 = 0.3 * base_lr towards alpha2 = 0.03 * alpha1,
      then jumps back to alpha1.
    * Otherwise the current rate is returned unchanged.
    """

    def __init__(self, base_lr, cycle_start, period):
        self.base_lr, self.cycle_start, self.period = base_lr, cycle_start, period

        # Upper and lower ends of each cycle.
        self.alpha1 = 0.3 * base_lr
        self.alpha2 = 0.03 * self.alpha1

    def get_lr(self, epoch, cur):
        """Return the learning rate for `epoch`, given the current rate `cur`."""
        if epoch == 35:
            print('Cur lr', cur)
            return self.base_lr * 0.33
        if epoch <= self.cycle_start:
            return cur
        # Position inside the current cycle, in [0, 1).
        elapsed = (epoch - self.cycle_start) % self.period
        t = elapsed / self.period
        cur = self.alpha2*t + self.alpha1 * (1-t)
        print('New lr:', cur)
        return cur

In [None]:
# One "fold" only; 8 decision steps for the TabNet-style model.
num_folds, num_steps = 1, 8
# Row positions of the training set.
id_range = np.arange(len(train_targets))

# Scored targets as a float array.
target_vals = train_targets.to_numpy()[:, :num_scored].astype(float)

def train_classifier(train_set, model_filename, epochs = 150, lr = base_lr, dropout = 0.5):
    """Build, compile and fit the TabNet-style classifier; return the fitted model.

    Uses the notebook-level `id_range`, `target_vals`, `num_steps` and
    `train_nums2`. There is currently a single "fold" in which the same rows
    serve for training and validation (the KFold split is commented out), and
    the function returns at the end of that first fold.

    Args:
        train_set: 2-D feature array, indexable by row positions.
        model_filename: prefix of the weights file ('<prefix>_checkpointNN.h5').
        epochs: number of training epochs.
        lr: Adam learning rate, also the base rate of the cyclic schedule.
        dropout: dropout rate forwarded to the model builder.

    Returns:
        The fitted Keras model (weights as of the last epoch; the best weights
        are in the checkpoint file).
    """
    #kf=KFold(num_folds, shuffle=True, random_state=seed)
    for k, (train_ids, test_ids) in enumerate([(id_range,id_range)]):#kf.split(train_set,target_vals)):
        classifier = create_TabNet2_model(train_set.shape[1], num_steps, dropout) #create_TabNet_model()
        compile_model(classifier, lr)

        checkpoint_filepath = model_filename + '_checkpoint{:02d}.h5'.format(k)
        # `monitor` is the ModelCheckpoint argument that selects the tracked
        # quantity; the previous `metric=` keyword is not a ModelCheckpoint
        # parameter, so "best" was not judged on the main output's loss.
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            monitor='val_output_loss',
            save_weights_only=True,
            save_best_only=True)

        cycle_lr = Cycle_LR(lr, 70, 10)
        lr_schedule = tf.keras.callbacks.LearningRateScheduler(cycle_lr.get_lr, verbose=0)

        # Main targets: smoothed for training, raw for validation.
        train_targets_fold = smooth_labels(target_vals[train_ids])
        val_targets_fold = target_vals[test_ids]

        # Zero-filled placeholder targets for the two auxiliary outputs
        # ('mask' has one slice per step, 'sparse' gets one column per row).
        #test_without_veh = test_ids[~vehicles_mask.values[test_ids]]
        mask_train, mask_val = get_mask_loss_data([*train_targets_fold.shape, num_steps],
                                                   [*val_targets_fold.shape, num_steps], np.zeros)
        sparse_train, sparse_val = get_sparse_loss_data([train_targets_fold.shape[0], 1],
                                                        [val_targets_fold.shape[0], 1])

        train_true_vals = [train_targets_fold, mask_train, sparse_train]
        val_true_vals = [val_targets_fold, mask_val, sparse_val]

        # Down-weight rows of frequently occurring drugs: weight = 8 / rows-per-drug.
        sample_weights = train_nums2[train_ids]
        classifier.fit(train_set[train_ids],
              train_true_vals,
              sample_weight = 8.0 / sample_weights,
              validation_data=(train_set[test_ids], val_true_vals),
              callbacks=[model_checkpoint_callback, lr_schedule],
              epochs=epochs, batch_size=256)
        return classifier

In [None]:
# Prefix of the checkpoint files written during training and read back at prediction time.
c_model_filename = 'tabnet_classifier'
# NOTE(review): this overwrites the corr_train_ids / corr_val_ids returned by do_train;
# neither name is read again in the visible cells.
corr_train_ids, corr_val_ids = train_test_split(id_range, test_size=0.4, random_state=seed + 1)
# Train on the plain array form of the features, for 80 epochs.
c_model = train_classifier(train_set.values, c_model_filename, 80, lr = 0.03, dropout = 0.5)

In [None]:
# Predict by all models from folds
def make_prediction(test_set):
    """Average the main-output predictions of every fold's checkpoint.

    For each of the `num_folds` folds the fold's saved weights are loaded into
    the notebook-level `c_model` and the first model output is predicted for
    `test_set`; the element-wise mean over folds is returned.
    """
    fold_predictions = []
    for fold in range(num_folds):
        c_model.load_weights(c_model_filename + '_checkpoint{:02d}.h5'.format(fold))
        # predict() returns one array per model output; index 0 is the first output.
        primary_output = c_model.predict(test_set)[0]
        fold_predictions.append(primary_output)

    return np.mean(fold_predictions, axis = 0)

In [None]:
# Template for the submission file; its shape is displayed as a quick check.
sample_submission = pd.read_csv(data_folder + '/sample_submission.csv')
sample_submission.shape

In [None]:
# Mask over the test part of vehicles_mask (everything after the first n_train entries):
# True for rows that are NOT control vehicles.
submission_no_veh = ~vehicles_mask[n_train:]
# Start from all-zero predictions; control-vehicle rows keep these zeros.
sample_submission.loc[:, scored_columns]=0
results = make_prediction(test_set)
# Fill only the non-vehicle rows with the model predictions.
# NOTE(review): relies on sample_submission rows matching the test feature rows in order and index — confirm.
sample_submission.loc[submission_no_veh, scored_columns]=results
sample_submission.head()

In [None]:
# Write the final predictions without the DataFrame index column.
sample_submission.to_csv('submission.csv', index=False)