In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    # Print the full path of each file found under the input directory.
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

In [None]:
# Read every competition table in one pass; the names on the left line up
# positionally with the paths listed below.
df, tts, ttns, td, te_df, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lish-moa/train_features.csv',            # -> df
        '/kaggle/input/lish-moa/train_targets_scored.csv',      # -> tts
        '/kaggle/input/lish-moa/train_targets_nonscored.csv',   # -> ttns
        '/kaggle/input/lish-moa/train_drug.csv',                # -> td
        '/kaggle/input/lish-moa/test_features.csv',             # -> te_df
        '/kaggle/input/lish-moa/sample_submission.csv',         # -> sub
    )
)

In [None]:
# Shape of the train features frame as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the train features frame.
df.head()

In [None]:
# Encode the categorical cp_type, cp_time and cp_dose columns of the train dataset as integers.
# The frame is modified in place, so a column is only mapped while it still holds raw
# values: mapping a second time (e.g. when this cell is re-run) would turn the
# already-encoded integers into NaN, because they are not keys of the mapping.
cp_encodings = {
    'cp_type': {'trt_cp': 0, 'ctl_vehicle': 1},
    'cp_time': {24: 0, 48: 1, 72: 2},
    'cp_dose': {'D1': 0, 'D2': 1},
}
for cp_col, cp_codes in cp_encodings.items():
    # True once every value in the column is one of the target codes.
    already_encoded = df[cp_col].isin(list(cp_codes.values())).all()
    if not already_encoded:
        df[cp_col] = df[cp_col].map(cp_codes)

In [None]:
# Encode the categorical cp_type, cp_time and cp_dose columns of the test dataset as integers.
# The frame is modified in place, so a column is only mapped while it still holds raw
# values: mapping a second time (e.g. when this cell is re-run) would turn the
# already-encoded integers into NaN, because they are not keys of the mapping.
cp_encodings = {
    'cp_type': {'trt_cp': 0, 'ctl_vehicle': 1},
    'cp_time': {24: 0, 48: 1, 72: 2},
    'cp_dose': {'D1': 0, 'D2': 1},
}
for cp_col, cp_codes in cp_encodings.items():
    # True once every value in the column is one of the target codes.
    already_encoded = te_df[cp_col].isin(list(cp_codes.values())).all()
    if not already_encoded:
        te_df[cp_col] = te_df[cp_col].map(cp_codes)

In [None]:
# Collect the gene ('g-' prefix) and cell ('c-' prefix) column names as plain lists.
gene_cols = df.columns[df.columns.str.startswith('g-')].tolist()
cell_cols = df.columns[df.columns.str.startswith('c-')].tolist()

In [None]:
# Take working copies of the train and test feature frames.
df_cp, te_df_cp = df.copy(), te_df.copy()

In [None]:
#Using QuantileTransformer to transform our gene and cell columns.
#QuantileTransformer maps each feature to follow a uniform or a normal distribution.
from sklearn.preprocessing import QuantileTransformer

In [None]:
# Fit a 100-quantile transformer on the gene and cell columns.
# The fit reads the source frame `df` rather than the working copy `df_cp`: the two hold
# the same gene/cell values until `df_cp` is transformed, and only `df_cp` is overwritten
# afterwards, so re-running this cell never fits on already-transformed data.
qt = QuantileTransformer(n_quantiles=100, random_state=0)
qt.fit(df[gene_cols + cell_cols])

In [None]:
# Write the quantile-transformed gene and cell columns into the working copies.
# Both transforms read from the source frames (`df` / `te_df`), whose gene and cell
# columns are not modified anywhere in the visible cells, so re-running this cell
# cannot transform already-transformed values a second time.
feature_cols = gene_cols + cell_cols
df_cp[feature_cols] = qt.transform(df[feature_cols])
te_df_cp[feature_cols] = qt.transform(te_df[feature_cols])

In [None]:
# Remove the sig_id identifier column from the train working copy.
# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError.
df_cp = df_cp.drop(columns='sig_id', errors='ignore')

In [None]:
# Remove the sig_id identifier column from the test working copy.
# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError.
te_df_cp = te_df_cp.drop(columns='sig_id', errors='ignore')

In [None]:
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import load_model

In [None]:
# Autoencoder layer stack (the model itself is assembled in a later cell).
n_inputs = df_cp.shape[1]
# Input tensor: one value per column of the prepared train frame.
input_data_shape = Input(shape=(n_inputs,))
# Encoder: ReLU dense layers of decreasing width, chained from the input.
encoder = input_data_shape
for units in (512, 128, 64, 32):
    encoder = Dense(units, activation='relu')(encoder)
# Bottleneck: dense layer created without an activation argument.
n_bottleneck = 50
bottleneck = Dense(n_bottleneck)(encoder)
# Decoder: the encoder widths in reverse order, chained from the bottleneck.
decoder = bottleneck
for units in (32, 64, 128, 512):
    decoder = Dense(units, activation='relu')(decoder)

In [None]:
# Reconstruction layer: one linear unit per input column.
output = Dense(units=n_inputs, activation='linear')(decoder)
# Full autoencoder, from the input tensor to the reconstruction.
model = Model(outputs=output, inputs=input_data_shape)
# Optimise mean squared error with Adam.
model.compile(loss='mse', optimizer='adam')

In [None]:
# Print the layer-by-layer summary of the autoencoder.
model.summary()

In [None]:
# Train the autoencoder to reproduce its own input (features are both x and y).
# The test features serve as the validation set, again as both input and target.
history = model.fit(
    x=df_cp,
    y=df_cp,
    validation_data=(te_df_cp, te_df_cp),
    epochs=50,
    batch_size=16,
    verbose=2,
)

In [None]:
# Encoder-only model: same input tensor, cut at the bottleneck (decoder left out).
encoder = Model(input_data_shape, bottleneck)
# Write the encoder to disk.
encoder.save('encoder.h5')

In [None]:
# Load the encoder back from 'encoder.h5'; this rebinds `encoder` to the loaded model.
encoder = load_model('encoder.h5')

In [None]:
# Run both prepared frames through the encoder: train first, then test.
X_train_encode, X_test_encode = (
    encoder.predict(features) for features in (df_cp, te_df_cp)
)

In [None]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
import warnings
# Ignore every warning from here on.
# NOTE(review): this blanket filter also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

**Step 1**

In [None]:
# Row-wise sum over every scored target column (sig_id excluded).
num_moa_each_sample = tts.drop(columns='sig_id').sum(axis=1)

In [None]:
# Step 1 inputs: the prepared train features side by side with their autoencoder encodings.
X_1 = pd.concat([df_cp, pd.DataFrame(X_train_encode)], axis=1)
# Step 1 target: 1 if the sample has at least one MoA, else 0.
# A comparison is used rather than an explicit count -> label dict so that every
# possible count is covered; a count missing from such a dict would be mapped to NaN.
y_1 = (num_moa_each_sample > 0).astype(int)

In [None]:
# Final model for Step 1: an SGD-trained linear classifier with logistic loss,
# fitted to predict whether a sample has any MoA at all.
# NOTE(review): recent scikit-learn releases spell this loss 'log_loss' — confirm against the installed version.
model_1 = SGDClassifier(loss='log')
model_1.fit(X_1, y_1)

**Step 2**

In [None]:
# Step 2 data: feature frames extended with the autoencoder encodings, and the
# scored targets with their first column (sig_id) left out.
train_encoded = pd.DataFrame(X_train_encode)
test_encoded = pd.DataFrame(X_test_encode)
X = pd.concat([df_cp, train_encoded], axis=1)
test = pd.concat([te_df_cp, test_encoded], axis=1)
y = tts.iloc[:, 1:]

In [None]:
# X = df_cp
# y = tts.iloc[:,1:]
# test = te_df_cp

In [None]:
# Sanity check: shapes of the train features, train targets and test features.
print(X.shape, y.shape, test.shape)

In [None]:
# Shuffled 2-fold splitter with a fixed seed; its only visible use is the disabled cross-validation loop.
kf = KFold(n_splits=2, shuffle=True, random_state=22)

In [None]:
# Disabled experiment: the cross-validation loop below is wrapped in a string literal, so none of it runs.
# If enabled it would fit a one-vs-rest model per fold and keep the one with the lowest log loss.
'''
best_model = None      #initializing best_model variable for storing best_model
best_loss = 99999999   #initializing best_loss variable to store least log-loss
cv = 1                 #initializing cv variable to store number of cross validation iterating

for train_idx, test_idx in tqdm(kf.split(X, y)):     #iterating for each cv
  X_train , X_val = X.iloc[train_idx], X.iloc[test_idx]
  y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

  #training the model
  print('FIT')
  #model = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l2', max_iter=10000,tol=0.00001, eta0=0.002), n_jobs=-1)
  #model = OneVsRestClassifier(GaussianNB(), n_jobs=-1)
  model = OneVsRestClassifier(RandomForestClassifier(max_depth=2, n_estimators = 100))
  model.fit(X_train, y_train)

  #predicting target values for validation set and computing log-loss for each target features
  print('PREDICT')
  pred = model.predict_proba(X_val)
  pred = np.array(pred)
  
  loss = log_loss(np.ravel(np.array(y_val)), np.ravel(pred))
  print('Log loss for ',cv,' cv = ',loss)
  
  #saving best model and least log-loss
  if loss < best_loss:
      best_model = model
      best_loss = loss
  
  cv += 1    #updating cv variable
''' 

In [None]:
# Step 2 model: one logistic-loss SGD classifier per target, trained in parallel.
# Alternative tried earlier: OneVsRestClassifier(RandomForestClassifier(max_depth=2, n_estimators = 100))
sgd_base = SGDClassifier(
    loss='log',
    alpha=0.001,
    penalty='l2',
    max_iter=10000,
    tol=0.00001,
    eta0=0.002,
)
best_model = OneVsRestClassifier(sgd_base, n_jobs=-1)
best_model.fit(X, y)

**Combining step 1 and step 2**

In [None]:
# Predict, with the Step 1 model, whether each test sample has any MoA.
# NOTE(review): `predict` is used here and `model_1_pred` is not read by any later visible cell,
# so Step 1 does not currently influence the submission — confirm whether that is intended.
model_1_pred = model_1.predict(test)#[:,1]

In [None]:
#np.sum(model_1_pred<0.4)

In [None]:
# Predict per-target probabilities for the test dataset with the Step 2 model.
test_pred = best_model.predict_proba(test)

In [None]:
# Shape of the prediction array.
test_pred.shape

In [None]:
# Largest value in test_pred[2] — presumably the highest target probability of the third test sample.
test_pred[2].max()

In [None]:
# Overwrite every column after the first in the sample submission with the predictions,
# then write the result to 'submission.csv' without the index.
sub.iloc[:,1:] = test_pred
sub.to_csv('submission.csv', index=False)

In [None]:
# Display the final submission frame.
sub