# Method of Action Prediction Notebook
This notebook aims to predict the method of action of drugs based on their effect on cell expression and cell viability data.

First of all, this is a multi-class, multi-label classification problem, as each drug can have more than one effect on the human body.

In [None]:
# Import data Analysis & manipulation tools
import numpy as np 
import pandas as pd
# Import Machine learning models 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
# Import Evaluation and Improvement tools 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss, f1_score, precision_score, recall_score, f1_score, classification_report
import joblib as jb 
# Import data from files
# Three CSV files are read from the '../input/lish-moa/' directory into DataFrames:
#   train_features - feature rows used for fitting
#   train_targets  - the scored targets paired with train_features
#   test_features  - feature rows to generate predictions for
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')

In [None]:
# Start overview of the data files 
# make sure that there are no null values 
files = [train_features, test_features, train_targets]
def null_check(files):
    """Report, for each DataFrame in ``files``, whether it contains null values.

    Args:
        files: Iterable of pandas DataFrames to inspect.

    Returns:
        None. One message is printed per DataFrame, in the order given.
    """
    for frame in files:
        # Each frame gets its own verdict, so a frame without columns (or any
        # later frame) can never inherit the result of a previous one.
        has_nulls = frame.isnull().values.any()
        if has_nulls:
            print('There are null values, please check !!')
        else:
            print('There are no null values, You are good to go')
null_check(files)

In [None]:
# Now we need to get info about our files to check if data is clean to start applying prediction models
train_features.info()

In [None]:
test_features.info()

In [None]:
train_targets.info()

In [None]:
train_features.head()

In [None]:
train_targets.head()

In [None]:
test_features.head()

We see that each features dataset has 3 columns that are not numerical: they are strings (objects), which will not work with machine learning models such as RFC and KNN, as these models need purely numerical values.

One of these columns is the 'sig_id' column, so we will be mostly concerned with the 'cp_type' and 'cp_dose' columns.

So, our function here will transform these columns into numerical values and thereby prepare the data for the prediction models.

But before transforming the data to numerical values and removing the IDs for modeling, we need to check that the IDs in the training features and training targets files are in the same order and match.

In [None]:
# Verify that train_features and train_targets list the same 'sig_id' values in
# the same row order, since the frames are later paired row by row.
feature_ids = train_features['sig_id'].to_numpy()
target_ids = train_targets['sig_id'].to_numpy()
if len(feature_ids) != len(target_ids):
    # Frames of different length cannot be aligned row by row at all.
    id_check = False
    print (f'ID columns differ in length: {len(feature_ids)} vs {len(target_ids)}')
else:
    # Positions at which the two ID columns disagree.
    mismatches = np.flatnonzero(feature_ids != target_ids)
    id_check = mismatches.size == 0
    if not id_check:
        # Report the first offending row; previously this message sat after a
        # `break` and could never be printed.
        print (f'You have id diff in column ind {mismatches[0]}')
if id_check :
    print ('All IDs are matching')

In [None]:
# Remove the 'sig_id' identifier column from all three frames.
train_features = train_features.drop(columns='sig_id')
train_targets = train_targets.drop(columns='sig_id')
test_features = test_features.drop(columns='sig_id')

In [None]:
files = [train_features, test_features, train_targets]
def prepare_data(files):
    """Replace every string column in ``files`` with integer codes, in place.

    Columns sharing a label across several DataFrames are encoded against one
    common category list, so a given string maps to the same integer in every
    frame even when a frame does not contain all of the values.

    Args:
        files: Iterable of pandas DataFrames; they are modified in place.

    Returns:
        None.
    """
    # First pass: remember, per column label, which frames hold it as strings.
    frames_by_label = {}
    for frame in files:
        for label, content in frame.items():
            if pd.api.types.is_string_dtype(content):
                frames_by_label.setdefault(label, []).append(frame)

    # Second pass: derive the category list from the pooled values, then
    # encode each frame against that shared list.
    for label, frames in frames_by_label.items():
        pooled = pd.concat([frame[label] for frame in frames], ignore_index=True)
        categories = pd.Categorical(pooled).categories
        for frame in frames:
            # Categorical codes start at 0 and use -1 for missing values;
            # adding 1 makes real categories start at 1 and maps missing to 0.
            frame[label] = pd.Categorical(frame[label], categories=categories).codes + 1
prepare_data(files)

In [None]:
train_features.head()

In [None]:
train_targets.head()

In [None]:
test_features.head()

In [None]:
# using Random Forest Classifier as it showed speed & accuracy over other algorithms such as KNN, MLP Classifiers
# Quick check: fit one forest per target for the first 10 target columns and
# score each one on a 15% hold-out split.
import time
import warnings
warnings.filterwarnings('ignore')
np.random.seed(80)
x_train, x_val, y_train, y_val = train_test_split(train_features, train_targets, test_size = 0.15)
for j in range(10):
    start = time.time()
    rfc = RandomForestClassifier(n_estimators=100)
    # Fit on a 1-D target; a single-column DataFrame makes scikit-learn emit a
    # column-vector conversion warning.
    rfc.fit(x_train, y_train.iloc[:, j])
    # log_loss is defined on predicted probabilities, so score the probability
    # of the positive class rather than hard 0/1 labels.
    # NOTE(review): assumes every target column is a 0/1 indicator - confirm.
    fitted_classes = list(rfc.classes_)
    if 1 in fitted_classes:
        y_preds = rfc.predict_proba(x_val)[:, fitted_classes.index(1)]
    else:
        # The training split held no positive sample for this target.
        y_preds = np.zeros(len(x_val))
    # Explicit labels keep log_loss from failing when the hold-out split
    # contains only one class for a rare target.
    val_loss = log_loss(y_val.iloc[:, j], y_preds, labels=[0, 1])
    print (f'Model has finalized training on subset of data with log loss of {val_loss:.4f} in {(time.time()-start):.2f} seconds')

In [None]:
# Now that changing n_estimators has no effect, we will work on full dataset now to generate predictions
# One forest is fitted per target column on the complete training data; its
# test-set predictions become the column of the same name in the output file.
import time
import warnings
warnings.filterwarnings('ignore')
# 'sig_id' was dropped from test_features earlier, so read just that column back.
sig_ids = pd.read_csv('../input/lish-moa/test_features.csv', usecols=['sig_id'])['sig_id']
x_train, y_train = train_features, train_targets
x_test = test_features
models = {}
# Collect the columns in a dict and build the frame once at the end, instead
# of re-copying a growing DataFrame with pd.concat on every iteration.
predictions = {'sig_id': sig_ids}
for i, target_name in enumerate(train_targets.columns):
    start = time.time()
    model = RandomForestClassifier(n_estimators=100)
    model.fit(x_train, y_train.iloc[:, i])
    models['Model# ' + str(i)] = model
    print (f'Training for model# {i+1} completed in {(time.time()-start):.2f} seconds')
    # NOTE(review): these are hard class labels; if the output is scored with
    # log loss, predict_proba would be the appropriate call - confirm intent.
    # Keyed by target name so the CSV header carries the real column names
    # (previously every prediction column was written out as "0").
    predictions[target_name] = model.predict(x_test)
y_full_preds = pd.DataFrame(predictions)
y_full_preds.to_csv('Sample_Submission.csv', index=False)