This notebook is dedicated to the "Mechanism of action prediction" competition. The main purpose of the competition is to improve algorithms that classify drugs based on their biological activity. The gene expression and cell viability data for the drugs are in train_features.csv, and the targets (the list of biological activities of each drug) are in train_targets_scored.csv.
The notebook contains the following chapters:
1. Data cleaning
2. Principal component analysis
3. Multi-label classification model
4. Cross-validation
5. Submission file compilation

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Collect the full path of every file under the Kaggle input directory,
# then print them one per line.
input_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# Read dataframes and clean the data

In [None]:
# Load the three competition tables in order: training features,
# scored training targets, and test features.
X_train, y_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lish-moa/train_features.csv',
        '/kaggle/input/lish-moa/train_targets_scored.csv',
        '/kaggle/input/lish-moa/test_features.csv',
    )
)

In [None]:
# The targets lose their id column; the remaining column labels are kept
# in `moa` so the prediction columns can be named later.
y_train = y_train.drop(columns=['sig_id'])
moa = y_train.columns

# Keep the test ids for the submission file before dropping the id column.
idlist = X_test['sig_id']
X_train = X_train.drop(columns=['sig_id'])
X_test = X_test.drop(columns=['sig_id'])

# Encode the two string-valued columns as digit strings, identically for
# train and test (they are cast to integers in the next cell).
for frame in (X_train, X_test):
    frame['cp_dose'] = frame['cp_dose'].replace({'D1': "1", 'D2': "2"})
    frame['cp_type'] = frame['cp_type'].replace({'trt_cp': "1", 'ctl_vehicle': "2"})

In [None]:
# Cast the digit-string encoded columns to 64-bit integers in both frames.
encoded_cols = ['cp_type', 'cp_dose']
for frame in (X_test, X_train):
    frame[encoded_cols] = frame[encoded_cols].astype('int64', copy=False)

# PCA. We keep just enough principal components to explain 70% of the variance in the training features, because this threshold was shown to work well in this competition.

In [None]:
from sklearn.decomposition import PCA
# Fit PCA on the training features only, keeping as many components as are
# needed to explain 70% of the variance, then project both feature sets.
pca = PCA(n_components=0.70).fit(X_train)
X_train_pca = pd.DataFrame(pca.transform(X_train))
X_test_pca = pd.DataFrame(pca.transform(X_test))

# Let us use a Keras neural network to build the multi-label classification model

In [None]:
from keras.models import Sequential 
from keras.models import Sequential
from keras import layers,models,optimizers,losses,metrics
import tensorflow as tf
def get_compiled_model():
    """Build and compile a fresh fully connected network for the MoA targets.

    The network stacks four ReLU hidden layers (1024, 1024, 512, 256 units)
    followed by a 206-unit output layer, and is compiled with the Adam
    optimizer, binary cross-entropy loss and the accuracy metric.

    Returns:
        A compiled, untrained Keras ``Sequential`` model. The input width is
        inferred on first use because no input shape is declared.
    """
    model = Sequential()
    model.add(layers.Dense(1024, activation='relu'))
    model.add(layers.Dense(1024, activation='relu'))
    model.add(layers.Dense(512, activation='relu'))
    model.add(layers.Dense(256, activation='relu'))
    # Sigmoid, not softmax: the targets are multi-label, so each of the 206
    # outputs must be an independent probability. Softmax would force the
    # outputs to sum to 1, which does not match binary cross-entropy over
    # independent labels.
    model.add(layers.Dense(206, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Train the final model on the whole PCA-reduced training set and predict
# the test set.
BATCH_SIZE = 512
model1 = get_compiled_model()
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_pca.values, y_train.values))
# Batching is done in the tf.data pipeline. Model.fit's `batch_size` must not
# be passed for Dataset inputs, so with `.batch(1)` the model would train on
# one sample per step regardless of the value given to fit().
train_dataset_shuffle = train_dataset.shuffle(len(X_train_pca)).batch(BATCH_SIZE)
model1.fit(train_dataset_shuffle, epochs=1)
# One row of predictions per test sample; columns are named in the submission cell.
y_test = pd.DataFrame(model1.predict(X_test_pca))

# Cross-validation. In each of five rounds, 20% of the training data is used to validate the fitted model

In [None]:
# 5-fold cross-validation: every fold is held out once for evaluation while a
# fresh model is trained on the remaining four folds. Training and evaluating
# on the same slice would only report training accuracy.
N_FOLDS = 5
BATCH_SIZE = 512
pca_features = X_train_pca.values
target_matrix = y_train.values
n_rows = len(pca_features)

cvscores = []
for fold in range(N_FOLDS):
    # Contiguous fold boundaries over the row order of the training set.
    val_start = int((n_rows / N_FOLDS) * fold)
    val_stop = int((n_rows / N_FOLDS) * (fold + 1))
    held_out = np.zeros(n_rows, dtype=bool)
    held_out[val_start:val_stop] = True

    model = get_compiled_model()
    # Batch inside the tf.data pipeline; `batch_size` must not be passed to
    # fit()/evaluate() when the input is a Dataset.
    fit_dataset = (
        tf.data.Dataset
        .from_tensor_slices((pca_features[~held_out], target_matrix[~held_out]))
        .shuffle(int((~held_out).sum()))
        .batch(BATCH_SIZE)
    )
    val_dataset = (
        tf.data.Dataset
        .from_tensor_slices((pca_features[held_out], target_matrix[held_out]))
        .batch(BATCH_SIZE)
    )
    model.fit(fit_dataset, epochs=1, verbose=0)
    scores = model.evaluate(val_dataset, verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

# Submission file compilation

In [None]:
# Name the positional prediction columns after the MoA targets, put the
# test ids in the first column, round to four decimals and write the file.
y_test = y_test.rename(columns=dict(enumerate(moa)))
y_test.insert(0, 'sig_id', idlist)
y_test = y_test.round(4)
y_test.to_csv('submission.csv', index=False)