In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf

from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
import tensorflow.keras as k
from keras.models import Sequential
import keras
import os
import warnings
from sklearn.metrics import log_loss
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA

from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, median_absolute_error, mean_squared_error,accuracy_score
from xgboost import XGBRegressor

import matplotlib.pyplot as plt
import os
# Print the full path of every file found under the Kaggle input directory so
# the dataset paths used further down can be confirmed by eye.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Switch on TensorFlow's JIT option in the graph optimizer.
tf.config.optimizer.set_jit(True)

/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/test_features.csv


In [2]:
# Folder holding the competition CSV files; every read below is relative to it.
data_path = "/kaggle/input/lish-moa/"

# Feature tables for the train and test splits.
train_X = pd.read_csv(f"{data_path}train_features.csv")
test_X = pd.read_csv(f"{data_path}test_features.csv")

# Scored training targets and the submission template.
train_Y = pd.read_csv(f"{data_path}train_targets_scored.csv")
sample_sub = pd.read_csv(f"{data_path}sample_submission.csv")

# Non-scored training targets.
train_targets_nonscored = pd.read_csv(f"{data_path}train_targets_nonscored.csv")

# Report the train and test shapes. These values are printed explicitly
# because a bare expression is only displayed when it is the last statement
# of a cell, and this cell continues below.
print("train_X shape:", train_X.shape)
print("test_X shape:", test_X.shape)

# Report the total number of missing values in each feature table.
print("train_X missing values:", train_X.isnull().sum().sum())
print("test_X missing values:", test_X.isnull().sum().sum())

# Encode the categorical columns as integers.
# The encoder is fitted on the training column only and then reused for the
# test column, so a given label always maps to the same integer in both
# frames. Re-fitting on the test column could silently assign different codes
# whenever the two columns do not contain exactly the same set of labels.
le = LabelEncoder()
for col_name in ['cp_type','cp_dose']:
    le.fit(train_X[col_name])
    train_X[col_name] = le.transform(train_X[col_name])
    # transform() raises ValueError for a label that was not seen during fit,
    # which surfaces a train/test mismatch instead of hiding it.
    test_X[col_name] = le.transform(test_X[col_name])
    
# Column names for genes ("g-" prefix) and cells ("c-" prefix), kept in the
# order in which they appear in train_X.
genes_names = [name for name in train_X.columns if name.startswith("g-")]
cells_names = [name for name in train_X.columns if name.startswith("c-")]

In [3]:
# Gene and cell columns are rescaled together by a single transformer.
genesCellsNames = genes_names + cells_names

# Quantile-map each column to a normal output distribution. The transformer
# is fitted on the training rows only and then applied to both splits.
qt = QuantileTransformer(
    n_quantiles=100,
    random_state=441,
    output_distribution='normal',
)
qt.fit(train_X[genesCellsNames])
train_X[genesCellsNames] = qt.transform(train_X[genesCellsNames])
test_X[genesCellsNames] = qt.transform(test_X[genesCellsNames])

In [4]:
# genes
# PCA configured to keep 500 components.
genes_pca = PCA(random_state=441, n_components=500)

# Stack the gene columns of both splits into one frame, train rows first.
genes_data = pd.concat(
    [train_X[genes_names], test_X[genes_names]],
)

In [6]:
#genes
# Fit the PCA on the stacked train+test gene columns.
genes_pca = genes_pca.fit(genes_data[genes_names])

# Project each split onto the fitted components.
train_pca = genes_pca.transform(train_X[genes_names])
test_pca = genes_pca.transform(test_X[genes_names])

# Name the new columns from the actual width of the projection instead of a
# hard-coded count, so this cell keeps working if n_components is changed
# where the PCA object is created.
gene_pca_cols = [f'pca_G-{i}' for i in range(train_pca.shape[1])]

# Give each new frame the index of the frame it is appended to; the axis=1
# concat aligns on index labels, so this keeps the rows matched explicitly
# rather than relying on both sides having the same default index.
train_gpca = pd.DataFrame(train_pca, columns=gene_pca_cols, index=train_X.index)
test_gpca = pd.DataFrame(test_pca, columns=gene_pca_cols, index=test_X.index)

train_X = pd.concat((train_X, train_gpca), axis=1)
test_X = pd.concat((test_X, test_gpca), axis=1)

In [7]:
# cells
# PCA configured to keep 55 components.
cells_pca = PCA(random_state=441, n_components=55)

# Stack the cell columns of both splits into one frame, train rows first.
cells_data = pd.concat(
    [train_X[cells_names], test_X[cells_names]],
)

In [8]:
#cells
# Fit the PCA on the stacked train+test cell columns.
cells_pca = cells_pca.fit(cells_data[cells_names])

# Project each split onto the fitted components.
train_pca = cells_pca.transform(train_X[cells_names])
test_pca = cells_pca.transform(test_X[cells_names])

# Name the new columns from the actual width of the projection instead of a
# hard-coded count, so this cell keeps working if n_components is changed
# where the PCA object is created.
cell_pca_cols = [f'pca_C-{i}' for i in range(train_pca.shape[1])]

# Give each new frame the index of the frame it is appended to; the axis=1
# concat aligns on index labels, so this keeps the rows matched explicitly
# rather than relying on both sides having the same default index.
train_cpca = pd.DataFrame(train_pca, columns=cell_pca_cols, index=train_X.index)
test_cpca = pd.DataFrame(test_pca, columns=cell_pca_cols, index=test_X.index)

train_X = pd.concat((train_X, train_cpca), axis=1)
test_X = pd.concat((test_X, test_cpca), axis=1)

In [9]:
# Use sig_id as the row index of the target frame and of both feature frames.
train_Y = train_Y.set_index('sig_id')
train_X = train_X.set_index('sig_id')
test_X = test_X.set_index('sig_id')

# Reduce train_X to the three cp_* columns followed by every column whose
# name contains 'pca'.
a = train_X.loc[:, ['cp_type','cp_time','cp_dose']]
b = train_X.filter(like='pca', axis='columns')

train_X = pd.concat([a, b], axis='columns')

In [10]:
# Reduce test_X to the three cp_* columns followed by every column whose
# name contains 'pca'.
a_test = test_X.loc[:, ['cp_type','cp_time','cp_dose']]
b_test = test_X.filter(like='pca', axis='columns')

test_X = pd.concat([a_test, b_test], axis='columns')

In [11]:
# Make the dtypes consistent: cast the first three columns of each frame to
# float64 in a single astype call per frame.
tmp = list(train_X.columns[:3])
tmp2 = list(test_X.columns[:3])

train_X = train_X.astype({name: np.float64 for name in tmp})
test_X = test_X.astype({name: np.float64 for name in tmp2})

In [12]:
# NumPy arrays of the training features/targets and of the test features.
X, Y = train_X.to_numpy(), train_Y.to_numpy()
X_test = test_X.to_numpy()

# Target column labels, and the number of feature columns in X.
col = train_Y.columns
n_columns = X.shape[1]

In [13]:
def buildModel(input_layer, x_train=None, y_train=None):
    """Build, compile and train one randomly configured dense network.

    The architecture is drawn with the ``random`` module: 1-5 hidden blocks,
    each made of BatchNormalization -> Dense -> Dropout, where the Dense
    width (1-1000), activation and dropout rate (0-0.3) are sampled per
    block. The number of training epochs (20-50) is sampled as well. Every
    sampled value is printed.

    Args:
        input_layer: Keras input tensor the network is attached to. Passing
            the same tensor to several calls lets the resulting models be
            combined afterwards into one model over that shared input.
        x_train: Training features. Defaults to the notebook-level ``X``.
        y_train: Training targets, 2-D with one column per output. Defaults
            to the notebook-level ``Y``.

    Returns:
        The trained ``tf.keras`` model.
    """
    # Fall back to the notebook globals so existing one-argument calls behave
    # as before.
    if x_train is None:
        x_train = X
    if y_train is None:
        y_train = Y

    hidden = input_layer

    activation_lst = ["elu","selu","relu","swish"]

    print("--------------------MODEL---START------------------------------")

    layer_random = random.randint(1, 5)
    print("number of layer: ", layer_random)

    for _ in range(layer_random):
        # randomly select dense, activation, dropout for random model
        dense_random = random.randint(1, 1000)
        activation_random = random.choice(activation_lst)
        dropout_random = round(random.uniform(0, 0.3), 2)

        hidden = tf.keras.layers.BatchNormalization()(hidden)
        hidden = tf.keras.layers.Dense(dense_random,activation=activation_random)(hidden)
        hidden = tf.keras.layers.Dropout(dropout_random)(hidden)
        print("-------------Layer --------------")
        print("dense random: ", dense_random)
        print("activation fn random: ", activation_random)
        print("dropout random: ", dropout_random)
        print("-------------Layer --------------")

    # One sigmoid unit per target column; the width comes from the targets
    # themselves rather than a hard-coded constant.
    n_outputs = y_train.shape[1]
    outputs = tf.keras.layers.Dense(n_outputs,activation='sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=input_layer, outputs=outputs)

    epoch_random = random.randint(20, 50)
    print("number of epoch: ", epoch_random)

    model.compile(optimizer=tf.keras.optimizers.Adam(), loss="binary_crossentropy", metrics=['acc'])

    # The training history is not used, so the return value is discarded.
    model.fit(x_train,y_train,epochs=epoch_random,batch_size=2000,verbose=0)
    print("---------------------MODEL------END ---------------------------")

    return model

def averageModels(models_lst, input_layer):
    """Combine several models that share one input into an averaging model.

    Args:
        models_lst: Non-empty list of Keras models, all built on
            ``input_layer`` and all producing outputs of the same shape
            (for example the models returned by ``buildModel``).
        input_layer: The input tensor shared by every model in the list.

    Returns:
        A ``tf.keras`` model mapping ``input_layer`` to the element-wise mean
        of the first output of each model. With a single model the result
        simply exposes that model's output.

    Raises:
        ValueError: If ``models_lst`` is empty.
    """
    if not models_lst:
        raise ValueError("averageModels needs at least one model to average")

    outputs = [model.outputs[0] for model in models_lst]

    if len(outputs) == 1:
        # Nothing to average; this also avoids calling a merge layer on a
        # single tensor, which some Keras releases reject.
        averaged = outputs[0]
    else:
        # Takes a list of same-shaped tensors and returns a single tensor.
        # tf.keras is used (not the standalone keras package) so the merge
        # layer comes from the same Keras implementation that built the
        # individual models.
        averaged = tf.keras.layers.Average()(outputs)

    return tf.keras.Model(input_layer, averaged)

In [14]:
# Seed every random number generator involved so that a rerun draws the same
# random architectures and epoch counts. (Weight initialisation and dropout
# are seeded through TensorFlow; some GPU kernels may still be
# non-deterministic.)
SEED = 441
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Number of feature columns fed to every model.
n_columns = X.shape[1]

# Input layer shared by all models so they can be averaged into one model.
inputs = tf.keras.layers.Input(shape = (n_columns, ))

# Initialize the model list. Will contain all models.
models_lst = []

# Number of models to generate randomly.
num_models = 150

for i in range(num_models):
    print("training model: ", i+1)
    models_lst.append(buildModel(inputs))

final_model = averageModels(models_lst, inputs)

pred = final_model.predict(X_test,verbose=0)

# Fail with a clear message if the predictions do not fit the submission
# template (every column after the first one is a prediction column).
expected_shape = (len(sample_sub), sample_sub.shape[1] - 1)
if pred.shape != expected_shape:
    raise ValueError(
        f"prediction shape {pred.shape} does not match submission shape {expected_shape}"
    )
sample_sub.iloc[:,1:] = pred

print('Complete, Now create Submisson')
sample_sub.to_csv("submission.csv",index=False)

training model:  1
--------------------MODEL---START------------------------------
number of layer:  2
-------------Layer --------------
dense random:  684
activation fn random:  elu
dropout random:  0.12
-------------Layer --------------
-------------Layer --------------
dense random:  442
activation fn random:  relu
dropout random:  0.23
-------------Layer --------------
number of epoch:  47
---------------------MODEL------END ---------------------------
training model:  2
--------------------MODEL---START------------------------------
number of layer:  4
-------------Layer --------------
dense random:  679
activation fn random:  swish
dropout random:  0.01
-------------Layer --------------
-------------Layer --------------
dense random:  180
activation fn random:  relu
dropout random:  0.2
-------------Layer --------------
-------------Layer --------------
dense random:  21
activation fn random:  elu
dropout random:  0.26
-------------Layer --------------
-------------Layer --------