In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

# NOTE(review): the original author marks this as a patch for a bug. It is
# placed before the tensorflow_addons import, presumably so it takes effect
# when that package loads -- TODO confirm which bug and whether it is still
# needed for the TF/TFA versions in use.
tf.random.Generator = None  # Patch for a bug

import tensorflow_addons as tfa

# Seed both RNG sources used in this notebook: NumPy draws the denoising
# noise, TensorFlow drives Keras weight initialisation, shuffling and dropout.
# Seeding NumPy alone leaves model training non-reproducible.
SEED = 666
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [None]:
# Load the competition CSVs: train/test feature tables, the scored training
# targets, and the sample submission that serves as the output template.
train_features, test_features, train_target, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/test_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/sample_submission.csv',
    )
)

## Understanding the categorical features

In [None]:
# Frequency of each `cp_type` value in the training features.
train_features['cp_type'].value_counts()


In [None]:
# Frequency of each `cp_type` value in the test features.
test_features['cp_type'].value_counts()

In [None]:
# Frequency of each `cp_dose` value in the training features.
train_features['cp_dose'].value_counts()

In [None]:
# Frequency of each `cp_dose` value in the test features.
test_features['cp_dose'].value_counts()

In [None]:
# Frequency of each `cp_time` value in the training features.
train_features['cp_time'].value_counts()

## Performing feature engineering

In [None]:
# Express the training `cp_time` as a multiple of its smallest observed value.
train_cp_time = train_features.loc[:, 'cp_time']
train_features.loc[:, 'cp_time'] = train_cp_time / train_cp_time.min()
train_features.head()

In [None]:
# Replace the training `cp_type` labels with their integer category codes.
train_features['cp_type'] = train_features['cp_type'].astype('category').cat.codes
train_features.head()

In [None]:
# Replace the test `cp_type` labels with their integer category codes.
test_features['cp_type'] = test_features['cp_type'].astype('category').cat.codes
test_features.head()

In [None]:
# Express the test `cp_time` as a multiple of its smallest observed value.
test_cp_time = test_features.loc[:, 'cp_time']
test_features.loc[:, 'cp_time'] = test_cp_time / test_cp_time.min()
test_features.head()

In [None]:
# Replace the training `cp_dose` labels with their integer category codes.
train_features['cp_dose'] = train_features['cp_dose'].astype('category').cat.codes

In [None]:
# Replace the test `cp_dose` labels with their integer category codes.
test_features['cp_dose'] = test_features['cp_dose'].astype('category').cat.codes

In [None]:
# The row identifier is not a model input; remove it from the test features.
test_features = test_features.drop(columns=['sig_id'])
test_features.head()

In [None]:
# The row identifier is not a model input; remove it from the training features.
train_features = train_features.drop(columns=['sig_id'])
train_features.head()

### Removing features whose absolute pairwise correlation exceeds 0.90

In [None]:
# Flag every column whose absolute correlation with an earlier column exceeds
# 0.90. Only the strict upper triangle (k=1) is inspected, so each pair is
# considered once and the first column of a correlated pair is kept.
cor_matrix = train_features.corr().abs()
# `np.bool` was deprecated in NumPy 1.20 and removed in 1.24, where it raises
# AttributeError; the builtin `bool` produces the same boolean mask.
upper_tri_mask = np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1)
upper_tri = cor_matrix.where(upper_tri_mask)
to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.90).any()]
print(to_drop)

In [None]:
# Remove the flagged columns from both feature tables so they stay aligned.
train_features = train_features.drop(columns=to_drop)
test_features = test_features.drop(columns=to_drop)
    
  

In [None]:
# Remember the feature column labels; they are reused further down to rebuild
# labelled DataFrames from the scaled feature matrices.
cols = train_features.columns  

### Applying feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature to zero mean / unit variance, learning the
# statistics from the training set only.
sc = StandardScaler(with_mean = True, with_std = True)
train_features = sc.fit_transform(train_features)

# Apply the training-set mean and std to the test set. Calling fit_transform
# here would re-fit the scaler on the test data, so train and test would be
# scaled with different statistics and the model would see shifted inputs.
test_features = sc.transform(test_features)

In [None]:
# Wrap the scaled matrices back into DataFrames that carry the saved column
# labels (default integer index).
x = pd.DataFrame(train_features, columns=cols)
x_test = pd.DataFrame(test_features, columns=cols)

In [None]:
# Peek at the first scaled training row.
x.head(n=1)

In [None]:
# Peek at the first scaled test row.
x_test.head(n=1)

In [None]:
# Target matrix: the scored training targets without the row identifier.
y = train_target.drop(columns=['sig_id'])
y.head(n=2)

### Creating the denoising autoencoder model

In [None]:
def create_autoencoder(input_dim=847, hidden_dim=1500, latent_dim=400):
    """Build and compile a fully connected autoencoder.

    The network is input_dim -> hidden_dim -> latent_dim -> hidden_dim ->
    input_dim, with ELU activations on every Dense layer. It is compiled with
    the Adadelta optimizer and a mean-squared-error reconstruction loss.

    Args:
        input_dim: Number of input features (and of reconstructed outputs).
            Defaults to 847, the value that used to be hard-coded here.
        hidden_dim: Width of the outer encoder and decoder layers.
        latent_dim: Width of the bottleneck layer.

    Returns:
        A compiled ``tf.keras.Model`` mapping an input vector to its
        reconstruction.
    """
    input_vector = Input(shape=(input_dim,))
    encoded = Dense(hidden_dim, activation='elu')(input_vector)
    encoded = Dense(latent_dim, activation='elu')(encoded)
    decoded = Dense(hidden_dim, activation='elu')(encoded)
    decoded = Dense(input_dim, activation='elu')(decoded)

    autoencoder = tf.keras.Model(input_vector, decoded)
    autoencoder.compile(optimizer='adadelta', loss='mse')

    return autoencoder

# Size the network from the data rather than from a literal, so it keeps
# working if the correlation filter retains a different number of columns.
autoencoder = create_autoencoder(input_dim=x.shape[1])

In [None]:
# Print the autoencoder's layer-by-layer architecture summary.
autoencoder.summary()

In [None]:
# Corrupt the inputs with zero-mean Gaussian noise (std 0.1); the autoencoder
# is then fitted to reconstruct the clean features from the noisy ones.
mu, sigma = 0, 0.1

# Draw the noise with the shape of `x` instead of a hard-coded [23814, 847],
# which fails to broadcast as soon as the row or column count changes.
noise = np.random.normal(mu, sigma, x.shape)
noised_train = x + noise

In [None]:
# Fit the autoencoder to map the noisy features back to the clean ones,
# holding out 20% of the rows for validation.
autoencoder.fit(
    x=noised_train,
    y=x,
    batch_size=128,
    epochs=3000,
    shuffle=True,
    validation_split=0.2,
)

In [None]:
# Encoder half of the trained autoencoder: same input, output taken from
# layers[2]. With the architecture built in create_autoencoder, index 0 is the
# input layer, index 1 the first Dense layer and index 2 the 400-unit
# bottleneck Dense layer.
encoder = tf.keras.Model(autoencoder.input, autoencoder.layers[2].output)

In [None]:
# Replace the feature tables with their encoder (bottleneck) representation.
encoded_train = encoder.predict(x)
encoded_test = encoder.predict(x_test)
train_features = pd.DataFrame(encoded_train)
test_features = pd.DataFrame(encoded_test)

In [None]:
# Peek at the first five encoded training rows.
train_features.head(n=5)

In [None]:
def create_model(input_dim=400, n_targets=206, hidden_units=(512, 1024, 512),
                 dropout_rate=0.4, learning_rate=1e-3, weight_decay=1e-5):
    """Build and compile the multi-label classifier trained on encoded features.

    Architecture: input -> BatchNorm -> for each width in ``hidden_units``:
    [weight-normalised Dense -> LeakyReLU -> BatchNorm -> Dropout] ->
    weight-normalised sigmoid Dense with ``n_targets`` outputs. The model is
    compiled with AdamW and binary cross-entropy.

    All defaults reproduce the previously hard-coded configuration, so
    ``create_model()`` builds the same network as before.

    Args:
        input_dim: Number of input features (width of the encoder output).
        n_targets: Number of sigmoid output units.
        hidden_units: Widths of the hidden Dense layers, in order.
        dropout_rate: Dropout rate applied after every hidden block.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight-decay coefficient.

    Returns:
        A compiled ``tf.keras.Sequential`` model.
    """
    layers = [
        # Pass the shape as an explicit tuple; a bare int is not a valid
        # `shape` argument in every Keras version.
        tf.keras.layers.Input(shape=(input_dim,)),
        tf.keras.layers.BatchNormalization(),
    ]

    for units in hidden_units:
        layers.extend([
            tfa.layers.WeightNormalization(tf.keras.layers.Dense(units)),
            tf.keras.layers.LeakyReLU(),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(dropout_rate),
        ])

    layers.append(
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(n_targets, activation="sigmoid"))
    )

    model = tf.keras.Sequential(layers)

    # `learning_rate` is the canonical argument name; `lr` is only a deprecated
    # alias. NOTE(review): clipvalue=700 is kept from the original -- a clip
    # this large seems unlikely to ever trigger; confirm the intent.
    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=weight_decay, clipvalue=700
    )
    model.compile(optimizer=optimizer, loss='binary_crossentropy')
    return model

In [None]:
# 7-fold cross-validation: train a fresh model per fold, collect out-of-fold
# (OOF) predictions for scoring, and average the test predictions over folds.
n_splits = 7
target_cols = y.columns
train_matrix = train_features.values
test_matrix = test_features.values
target_matrix = y.values

# Accumulate predictions in float arrays. Writing float predictions into a
# copy of `y` (presumably integer 0/1 labels) relies on an implicit dtype
# upcast that recent pandas versions deprecate.
oof_pred = np.zeros(y.shape, dtype=np.float64)
test_pred_sum = np.zeros((len(test_matrix), len(target_cols)), dtype=np.float64)

kfold = KFold(n_splits=n_splits, random_state=666, shuffle=True)
for n, (tr, te) in enumerate(kfold.split(y)):
    print(f'Fold {n}')

    model = create_model()

    model.fit(
        train_matrix[tr],
        target_matrix[tr],
        epochs=100,
        batch_size=128
    )

    test_pred_sum += model.predict(test_matrix)
    # `te` holds positional row indices, so index the array positionally
    # instead of going through label-based `.loc`.
    oof_pred[te] = model.predict(train_matrix[te])

res = pd.DataFrame(oof_pred, index=y.index, columns=target_cols)

# Average over the known number of folds rather than the leaked loop variable.
submission.loc[:, target_cols] = test_pred_sum / n_splits

# Per-target log loss on the OOF predictions. `labels` is passed explicitly so
# a target with only one class present does not make log_loss raise
# (assumes the targets are binary 0/1, consistent with the sigmoid /
# binary cross-entropy model).
metrics = [
    log_loss(y.loc[:, _target], res.loc[:, _target], labels=[0, 1])
    for _target in target_cols
]

In [None]:
# Average the per-target log losses into a single OOF score.
oof_metric = np.mean(metrics)
print(f'OOF Metric: {oof_metric}')

In [None]:
# Force the predictions of 'ctl_vehicle' rows to zero.
#
# `x_test['cp_type']` cannot be used for this: by this point the column has
# been converted to category codes and standardised, so comparing it with the
# string 'ctl_vehicle' never matches and no row is zeroed. Re-read the raw
# labels instead and align them to the submission through `sig_id`.
raw_test_cp_type = pd.read_csv(
    '../input/lish-moa/test_features.csv', usecols=['sig_id', 'cp_type']
)
ctl_sig_ids = raw_test_cp_type.loc[raw_test_cp_type['cp_type'] == 'ctl_vehicle', 'sig_id']
is_control = submission['sig_id'].isin(ctl_sig_ids)

# Every submission column except identifiers/metadata is a prediction column.
prediction_cols = [
    col for col in submission.columns
    if col not in ['sig_id', 'cp_type', 'cp_dose', 'cp_time']
]
submission.loc[is_control, prediction_cols] = 0



In [None]:
# Write the final predictions without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)