Inspired by \
https://www.kaggle.com/rahulsd91/moa-label-smoothing \
https://www.kaggle.com/c/lish-moa/discussion/192211 \
https://www.kaggle.com/kushal1506/moa-pytorch-0-01859-rankgauss-pca-nn#About-this-notebook \
https://www.kaggle.com/gogo827jz/hyperparameter-tuning-for-neural-network-on-tpu

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import os
import sys

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import layers,regularizers,Sequential,backend,callbacks,optimizers,metrics,losses
import tensorflow_addons as tfa

from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss


sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# The competition data lives under the read-only "/kaggle/input" directory.
# Print the full path of every file found there, one per line.
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the competition CSV files; every path below derives from it
# so the location only has to be changed in one place.
data_dir = '../input/lish-moa/'

train_set = pd.read_csv(os.path.join(data_dir, 'train_features.csv'))
train_target = pd.read_csv(os.path.join(data_dir, 'train_targets_scored.csv'))
test_set = pd.read_csv(os.path.join(data_dir, 'test_features.csv'))
#train_nonscored = pd.read_csv(os.path.join(data_dir, 'train_targets_nonscored.csv'))

# Sample submission with every target column (all columns of train_target
# except the first) reset to 0.
ss = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
ss.loc[:, train_target.columns[1:]] = 0

In [None]:
# Feature groups, identified by their column-name prefix.
feature_names = list(train_set.columns)
GENES = [name for name in feature_names if name.startswith('g-')]
CELLS = [name for name in feature_names if name.startswith('c-')]

# Row labels of the samples whose cp_type is 'ctl_vehicle':
# drop_index for the training set, add_index for the test set.
drop_index = train_set.loc[train_set['cp_type'] == 'ctl_vehicle'].index
add_index = test_set.loc[test_set['cp_type'] == 'ctl_vehicle'].index

In [None]:
#preprocessing

def transform_features(df):
    """Return a model-ready copy of a raw feature frame.

    Rows whose ``cp_type`` is ``'ctl_vehicle'`` are removed, the ``sig_id``
    and ``cp_type`` columns are dropped, and ``cp_time`` / ``cp_dose`` are
    replaced by small integer codes. The input frame is left unmodified.

    Args:
        df: DataFrame with at least the columns ``sig_id``, ``cp_type``,
            ``cp_time`` and ``cp_dose``.

    Returns:
        A new DataFrame keeping the original row labels of the retained rows.
    """
    time_codes = {24: 1, 48: 2, 72: 3}
    dose_codes = {"D1": 1, "D2": 2}

    vehicle_rows = df.loc[df['cp_type'] == 'ctl_vehicle'].index
    treated = df.drop(index=vehicle_rows, columns=['sig_id', 'cp_type'])

    # Values missing from a mapping become NaN (pandas ``Series.map``).
    return treated.assign(
        cp_time=treated['cp_time'].map(time_codes),
        cp_dose=treated['cp_dose'].map(dose_codes),
    )


# Feature matrices with the control-vehicle rows removed.
X_train, X_test = transform_features(train_set), transform_features(test_set)

# Remove the same control rows from the targets so they stay aligned with
# X_train, then keep only the label columns as a NumPy array.
train_target = train_target.drop(index=drop_index)
y_train = train_target.drop(columns=['sig_id']).to_numpy()

In [None]:
#RankGauss
def rankgauss(D):
    """Map every gene and cell feature of ``D`` to a normal distribution.

    Each column listed in ``GENES + CELLS`` is replaced, in place, by its
    quantile-transformed values (100 quantiles, normal output distribution).
    A separate ``QuantileTransformer`` is fitted per column on the values of
    ``D`` itself.

    Args:
        D: DataFrame containing all ``GENES`` and ``CELLS`` columns.

    Returns:
        The same DataFrame object ``D`` with the listed columns transformed.
    """
    for col in (GENES + CELLS):
        qt = preprocessing.QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
        vec = D[col].values.reshape(-1, 1)
        # fit_transform returns an (n, 1) array; flatten it back to a column.
        D[col] = qt.fit_transform(vec).ravel()
    # Return only once every column has been processed; returning from inside
    # the loop would leave all but the first column untransformed.
    return D
                     
# Apply RankGauss to each set; every call fits its own transformers on the
# frame it receives.
X_train, X_test = rankgauss(X_train), rankgauss(X_test)

In [None]:
# Renumber rows 0..n-1 (discarding the old labels) so that the frames can be
# concatenated column-wise with freshly built frames later on.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Hard-coded list of feature indices, labelled as coming from a
# feature-importance analysis.
# NOTE(review): within this file top_feats is only used for the length print
# below; confirm whether it was meant to subset the feature columns.
top_feats = [  0,   1,   2,   3,   5,   6,   8,   9,  10,  11,  12,  14,  15,
        16,  18,  19,  20,  21,  23,  24,  25,  27,  28,  29,  30,  31,
        32,  33,  34,  35,  36,  37,  39,  40,  41,  42,  44,  45,  46,
        48,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
        63,  64,  65,  66,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        78,  79,  80,  81,  82,  83,  84,  86,  87,  88,  89,  90,  92,
        93,  94,  95,  96,  97,  99, 100, 101, 103, 104, 105, 106, 107,
       108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
       121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 132, 133, 134,
       135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
       149, 150, 151, 152, 153, 154, 155, 157, 159, 160, 161, 163, 164,
       165, 166, 167, 168, 169, 170, 172, 173, 175, 176, 177, 178, 180,
       181, 182, 183, 184, 186, 187, 188, 189, 190, 191, 192, 193, 195,
       197, 198, 199, 202, 203, 205, 206, 208, 209, 210, 211, 212, 213,
       214, 215, 218, 219, 220, 221, 222, 224, 225, 227, 228, 229, 230,
       231, 232, 233, 234, 236, 238, 239, 240, 241, 242, 243, 244, 245,
       246, 248, 249, 250, 251, 253, 254, 255, 256, 257, 258, 259, 260,
       261, 263, 265, 266, 268, 270, 271, 272, 273, 275, 276, 277, 279,
       282, 283, 286, 287, 288, 289, 290, 294, 295, 296, 297, 299, 300,
       301, 302, 303, 304, 305, 306, 308, 309, 310, 311, 312, 313, 315,
       316, 317, 320, 321, 322, 324, 325, 326, 327, 328, 329, 330, 331,
       332, 333, 334, 335, 338, 339, 340, 341, 343, 344, 345, 346, 347,
       349, 350, 351, 352, 353, 355, 356, 357, 358, 359, 360, 361, 362,
       363, 364, 365, 366, 368, 369, 370, 371, 372, 374, 375, 376, 377,
       378, 379, 380, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391,
       392, 393, 394, 395, 397, 398, 399, 400, 401, 403, 405, 406, 407,
       408, 410, 411, 412, 413, 414, 415, 417, 418, 419, 420, 421, 422,
       423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435,
       436, 437, 438, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450,
       452, 453, 454, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465,
       466, 468, 469, 471, 472, 473, 474, 475, 476, 477, 478, 479, 482,
       483, 485, 486, 487, 488, 489, 491, 492, 494, 495, 496, 500, 501,
       502, 503, 505, 506, 507, 509, 510, 511, 512, 513, 514, 516, 517,
       518, 519, 521, 523, 525, 526, 527, 528, 529, 530, 531, 532, 533,
       534, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547,
       549, 550, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563,
       564, 565, 566, 567, 569, 570, 571, 572, 573, 574, 575, 577, 580,
       581, 582, 583, 586, 587, 590, 591, 592, 593, 595, 596, 597, 598,
       599, 600, 601, 602, 603, 605, 607, 608, 609, 611, 612, 613, 614,
       615, 616, 617, 619, 622, 623, 625, 627, 630, 631, 632, 633, 634,
       635, 637, 638, 639, 642, 643, 644, 645, 646, 647, 649, 650, 651,
       652, 654, 655, 658, 659, 660, 661, 662, 663, 664, 666, 667, 668,
       669, 670, 672, 674, 675, 676, 677, 678, 680, 681, 682, 684, 685,
       686, 687, 688, 689, 691, 692, 694, 695, 696, 697, 699, 700, 701,
       702, 703, 704, 705, 707, 708, 709, 711, 712, 713, 714, 715, 716,
       717, 723, 725, 727, 728, 729, 730, 731, 732, 734, 736, 737, 738,
       739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751,
       752, 753, 754, 755, 756, 758, 759, 760, 761, 762, 763, 764, 765,
       766, 767, 769, 770, 771, 772, 774, 775, 780, 781, 782, 783, 784,
       785, 787, 788, 790, 793, 795, 797, 799, 800, 801, 805, 808, 809,
       811, 812, 813, 816, 819, 820, 821, 822, 823, 825, 826, 827, 829,
       831, 832, 833, 834, 835, 837, 838, 839, 840, 841, 842, 844, 845,
       846, 847, 848, 850, 851, 852, 854, 855, 856, 858, 860, 861, 862,
       864, 867, 868, 870, 871, 873, 874]

# Report how many feature indices the list contains.
print(len(top_feats))

In [None]:
#PCA for genes
n_comp = 600

# Fit a single PCA on the stacked train+test gene features so both sets are
# projected onto the same components.
gene_pca = PCA(n_components=n_comp, random_state=42)
gene_scores = gene_pca.fit_transform(pd.concat([X_train[GENES], X_test[GENES]]))
gene_pca_cols = [f'pca_G-{i}' for i in range(n_comp)]

# The first rows of the projection belong to train, the trailing rows to test.
train2 = pd.DataFrame(gene_scores[:X_train.shape[0]], columns=gene_pca_cols)
test2 = pd.DataFrame(gene_scores[-X_test.shape[0]:], columns=gene_pca_cols)

# Append the components as extra columns; the original gene columns are kept.
X_train = pd.concat([X_train, train2], axis=1)
X_test = pd.concat([X_test, test2], axis=1)

In [None]:
#PCA for cells
n_comp = 50

# Fit a single PCA on the stacked train+test cell features so both sets are
# projected onto the same components.
n_train_rows = X_train.shape[0]
data = pd.concat([X_train[CELLS], X_test[CELLS]])
data2 = PCA(n_components=n_comp, random_state=42).fit_transform(data)

# Split at the train row count; this also yields an empty test block when
# X_test has no rows (a negative slice of length 0 would return everything).
train2 = data2[:n_train_rows]
test2 = data2[n_train_rows:]

# Name the cell components pca_C-*: the gene PCA step already created
# pca_G-0 .. pca_G-599, so reusing the pca_G- prefix here would produce
# duplicate column labels in X_train / X_test.
cell_pca_cols = [f'pca_C-{i}' for i in range(n_comp)]
train2 = pd.DataFrame(train2, columns=cell_pca_cols)
test2 = pd.DataFrame(test2, columns=cell_pca_cols)

# Append the components as extra columns; the original cell columns are kept.
X_train = pd.concat((X_train, train2), axis=1)
X_test = pd.concat((X_test, test2), axis=1)

In [None]:
# Drop every feature whose variance over the stacked train+test rows does not
# exceed 0.8.
# NOTE(review): cp_dose is coded as 1/2, so its variance is at most 0.25 and
# this threshold removes it — confirm that is intended.
var_thresh = VarianceThreshold(0.8)

# pd.concat replaces DataFrame.append, which is deprecated and was removed in
# pandas 2.0.
n_train_rows = X_train.shape[0]
data = pd.concat([X_train, X_test])
data_transformed = var_thresh.fit_transform(data)

# fit_transform returns a NumPy array; split it back at the train row count.
X_train = data_transformed[:n_train_rows]
X_test = data_transformed[n_train_rows:]

In [None]:
def logloss(y_true, y_pred):
    """Mean binary cross-entropy with predictions clipped to [p_min, p_max].

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted probabilities.

    Returns:
        Scalar tensor: the negative mean, over all elements, of
        ``y_true*log(p) + (1-y_true)*log(1-p)`` where ``p`` is the clipped
        prediction.
    """
    # Clipping keeps both logarithms finite. p_min / p_max are module-level
    # values looked up when the function is called.
    clipped = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)

In [None]:
# Dataset dimensions taken from the final feature and target arrays.
n_train, n_features = X_train.shape
n_test = X_test.shape[0]
n_labels = y_train.shape[1]

# Predicted probabilities are clipped to [p_min, p_max] before log loss is
# computed.
epsilon = 1e-3
p_min, p_max = epsilon, 1 - epsilon

In [None]:
#model

# Candidate optimizers.
# NOTE(review): the training loop in this file passes the string 'adam' to
# create_model, so these instances appear to be unused — confirm.
adam = optimizers.Adam()
sgd = optimizers.SGD()
adamw = tfa.optimizers.AdamW(weight_decay=0.0001)
radam = tfa.optimizers.RectifiedAdam()

# Lookahead wrappers around the two tensorflow_addons optimizers above.
lookahead_adamw = tfa.optimizers.Lookahead(adamw)
lookahead_radam = tfa.optimizers.Lookahead(radam)

def create_model(n_features, opt):
    """Build and compile the feed-forward multi-label classifier.

    The network is: input -> BatchNorm -> Dropout, followed by three hidden
    blocks (weight-normalised Dense -> LeakyReLU -> Dropout -> BatchNorm) of
    1024, 2048 and 2048 units, and a weight-normalised sigmoid output layer
    with ``n_labels`` units (``n_labels`` is read from module scope).

    Args:
        n_features: Number of input features.
        opt: Optimizer passed straight to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model, using binary cross-entropy with
        label smoothing 1E-6 as loss and ``logloss`` as metric.
    """
    drop_rate = 0.466
    hidden_units = (1024, 2048, 2048)

    stack = [
        layers.Input(n_features),
        layers.BatchNormalization(),
        layers.Dropout(drop_rate),
    ]
    for units in hidden_units:
        stack.extend([
            tfa.layers.WeightNormalization(layers.Dense(units)),
            layers.LeakyReLU(),
            layers.Dropout(drop_rate),
            layers.BatchNormalization(),
        ])
    stack.append(
        tfa.layers.WeightNormalization(layers.Dense(n_labels, activation="sigmoid"))
    )

    model = Sequential(stack)
    model.compile(
        optimizer=opt,
        loss=losses.BinaryCrossentropy(label_smoothing=1E-6),
        metrics=logloss,
    )
    return model

#layers.LeakyReLU()
#optimizer = tfa.optimizers.AdamW(lr = 1e-3, weight_decay = 1e-5, clipvalue = 700)
#loss=losses.BinaryCrossentropy(label_smoothing=0.001), metrics=logloss
#tfa.layers.WeightNormalization(tf.keras.layers.Dense(2048, activation = 'relu')),

In [None]:
n_folds = 5
seed = 42

# Accumulators: test predictions averaged over the folds, and out-of-fold
# predictions for every training row.
test_pred = np.zeros((n_test, n_labels))
oof_pred = np.zeros((n_train, n_labels))

mskf = MultilabelStratifiedKFold(n_splits=n_folds, random_state=seed, shuffle=True)

for n, (train_idx, valid_idx) in enumerate(mskf.split(X_train, y_train)):

    print(f'Starting fold: {n}')

    # A fresh model is built for every fold.
    model = create_model(n_features, 'adam')

    fold_callbacks = [
        # Lower the learning rate when the validation loss stops improving.
        callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.3, patience=5, verbose=1, mode='min', min_lr=1E-5
        ),
        # Stop on the validation value of the logloss metric and restore the
        # best weights seen so far.
        callbacks.EarlyStopping(
            monitor='val_logloss', min_delta=1E-4, patience=10, mode='min', restore_best_weights=True
        ),
    ]
    model.fit(
        X_train[train_idx],
        y_train[train_idx],
        validation_data=(X_train[valid_idx], y_train[valid_idx]),
        epochs=80,
        batch_size=128,
        callbacks=fold_callbacks,
    )

    # Each fold contributes 1/n_folds of the final test prediction.
    test_pred += model.predict(X_test) / n_folds
    oof_pred[valid_idx, :] += model.predict(X_train[valid_idx])

In [None]:
# Out-of-fold log loss with predictions clipped to [p_min, p_max]; targets and
# predictions are flattened so every (sample, label) pair counts as one item.
print(f'OOF log loss: {log_loss(np.ravel(y_train), np.ravel(np.clip(oof_pred, p_min, p_max)))}')

In [None]:
# Same out-of-fold log loss, computed on the raw (unclipped) predictions.
print(f'OOF log loss: {log_loss(np.ravel(y_train), np.ravel(oof_pred))}')

In [None]:
# Clip the fold-averaged test predictions to [p_min, p_max].
clipped_pred = np.clip(test_pred, p_min, p_max)

# Rows listed in add_index (cp_type == 'ctl_vehicle') were removed from X_test
# before prediction. Rebuild the full-length prediction matrix with all-zero
# rows at those positions and the clipped predictions, in order, elsewhere.
# A single masked assignment replaces one np.insert per row (each of which
# copies the whole array); the column count comes from the predictions
# instead of a hard-coded 206.
ctl_positions = np.asarray(add_index)
n_rows = clipped_pred.shape[0] + ctl_positions.size
has_prediction = np.ones(n_rows, dtype=bool)
has_prediction[ctl_positions] = False

predictions = np.zeros((n_rows, clipped_pred.shape[1]))
predictions[has_prediction] = clipped_pred

In [None]:
#submission
# One column per target (all train_target columns except the first),
# preceded by the sig_id column of the full test set.
label_columns = train_target.columns[1:]
ss = pd.DataFrame(predictions, columns=label_columns)
ss.insert(loc=0, column='sig_id', value=test_set['sig_id'].values)

ss.to_csv('submission.csv', index=False)

# Last expression of the cell: summary statistics of the submitted values.
ss.describe()

In [None]:
# Display the submission frame as the cell output.
ss