In [None]:
# This notebook builds models based on neural networks, XGBoost and logistic regression,
# and ensembles their predictions for Kaggle's Mechanisms of Action (MoA) competition.

import pandas as pd
import numpy as np
import xgboost as xgb
import itertools
from tqdm import tqdm

In [None]:
# Load the competition CSV files into DataFrames.
# Flip `RunningOnKaggle` to False to read from the local checkout instead.
RunningOnKaggle = True
PATH = '../input/lish-moa/' if RunningOnKaggle else '../../../Kaggle challenges/MoA/'


def _load_csv(file_name):
    """Read one CSV file located under the data directory `PATH`."""
    return pd.read_csv(PATH + file_name)


train_features = _load_csv('train_features.csv')
test_features = _load_csv('test_features.csv')
train_targets_nonscored = _load_csv('train_targets_nonscored.csv')
train_targets_scored = _load_csv('train_targets_scored.csv')
train_drug = _load_csv('train_drug.csv')
sample_submission = _load_csv('sample_submission.csv')

# Join the drug table with the features on their shared column(s).
train_ds = train_drug.merge(train_features)

In [None]:
# Save the sample IDs

# Detach `sig_id` from the feature tables, keeping the IDs for later lookups.
full_train_features_ids = train_features.pop('sig_id')
full_test_features_ids = test_features.pop('sig_id')
# The target tables only need the ID column removed; its values are not kept.
del train_targets_scored['sig_id']
del train_targets_nonscored['sig_id']

# Re-order the drug table so that its rows follow the training-feature IDs.
train_drug = (
    train_drug
    .set_index('sig_id')
    .reindex(index=full_train_features_ids)
    .reset_index()
)
# Distinct drug identifiers present in the (re-ordered) drug table.
drug_ids = pd.Series(train_drug['drug_id'].unique())

In [None]:
# Deal with categorical and discrete variables

feature_names = list(train_features)
categorical_features = ['cp_dose', 'cp_type']
numerical_discrete_features = ['cp_time']
# Every feature that is neither categorical nor discrete is treated as continuous.
non_continuous_features = set(categorical_features) | set(numerical_discrete_features)
numerical_continuous_features = [name for name in feature_names if name not in non_continuous_features]

# One-hot encode the non-numeric columns of the training features.
train_features = pd.get_dummies(train_features)
# Encoding the test set on its own can yield a different set (or order) of
# dummy columns when a category is missing from, or only present in, one of
# the two tables. Align the test columns to the training layout so that models
# fitted on `train_features` always see the same columns at prediction time;
# dummy columns absent from the test set are filled with 0.
test_features = pd.get_dummies(test_features).reindex(columns=train_features.columns, fill_value=0)

In [None]:
# Split between training and validation datasets

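# NOTE: Unlike other notebooks I have seen, I leave the control samples in the analysis.
#     I also intend to remove the `cp_type` column so that the model does not know which samples are controls.
#     NOTE(review): no code in this section actually drops `cp_type` (`pd.get_dummies` keeps it as one-hot columns) -- confirm intent.
#     Finally, the cross-validation split is done on a per-drug basis, not on a per-signature (`sig_id`) basis.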

# Keep untouched copies of the complete training tables (features, scored
# targets, non-scored targets) before they are split into train/validation.
full_train_features_df, full_train_targets_df, full_train_targets_ns_df = (
    frame.copy()
    for frame in (train_features, train_targets_scored, train_targets_nonscored)
)

def train_val_split_drugs(percent_train=0.8, random_state=None):
    """Split the training signatures into train/validation sets, grouped by drug.

    Drugs (not individual signatures) are shuffled and divided, so all
    signatures of a given drug end up on the same side of the split.

    Reads the module-level ``drug_ids`` (Series of unique drug identifiers) and
    ``train_drug`` (DataFrame with ``sig_id`` and ``drug_id`` columns).

    Args:
        percent_train: Fraction of the drugs, in [0, 1], assigned to the
            training split. The resulting count is truncated with ``int``.
        random_state: Optional seed (or any value accepted by
            ``Series.sample``) making the shuffle reproducible. ``None``
            keeps the previous, unseeded behaviour.

    Returns:
        Tuple ``(sig_train, sig_val)`` of Series with the ``sig_id`` values
        belonging to the training and validation drugs respectively.

    Raises:
        ValueError: If ``percent_train`` is outside [0, 1].
    """
    if not 0.0 <= percent_train <= 1.0:
        raise ValueError('percent_train must be between 0 and 1, got {!r}'.format(percent_train))

    num_train_drugs = int(percent_train * len(drug_ids))
    # Sampling every element without replacement is a shuffle of the drug list.
    shuffled_drugs = drug_ids.sample(n=len(drug_ids), random_state=random_state)
    # `.iloc` makes the positional intent explicit: the shuffled Series still
    # carries its original integer labels.
    drugs_train = shuffled_drugs.iloc[:num_train_drugs]
    drugs_val = shuffled_drugs.iloc[num_train_drugs:]

    sig_train = train_drug.loc[train_drug['drug_id'].isin(drugs_train), 'sig_id']
    sig_val = train_drug.loc[train_drug['drug_id'].isin(drugs_val), 'sig_id']
    return sig_train, sig_val

# Seed NumPy's global RNG before splitting: pandas' `sample` draws from it when
# no explicit random state is supplied, so this makes the drug-level split
# (and every validation score derived from it) reproducible across runs.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

sig_train, sig_val = train_val_split_drugs()

# Row masks over the training tables, computed once and reused for the
# features, the scored targets and the non-scored targets (all three tables
# share the row order of `full_train_features_ids`).
is_train_row = full_train_features_ids.isin(sig_train)
is_val_row = full_train_features_ids.isin(sig_val)

# Sanity checks: no sample may leak into both splits, and none may be dropped.
assert not (is_train_row & is_val_row).any(), 'a sample is in both the training and validation splits'
assert (is_train_row | is_val_row).all(), 'a sample is in neither the training nor the validation split'

val_features_df = train_features[is_val_row]
train_features_df = train_features[is_train_row]
val_targets_df = train_targets_scored[is_val_row]
train_targets_df = train_targets_scored[is_train_row]
val_targets_ns_df = train_targets_nonscored[is_val_row]
train_targets_ns_df = train_targets_nonscored[is_train_row]

print('Total training samples:', len(full_train_features_df))
print('Training split samples:', len(train_features_df))
print('Validation split samples:', len(val_features_df))

In [None]:
# XGBoost

# Booster hyper-parameters shared by every per-target model.
xgb_param = {
    'max_depth': 12,
    'eta': 0.7,
    'objective': 'binary:logistic',
    # Pin the evaluation metric explicitly. Without it, early stopping follows
    # XGBoost's version-dependent default for this objective (classification
    # error in releases before 1.3), which barely moves on heavily imbalanced
    # binary targets. Log loss is a smooth signal for probability outputs.
    'eval_metric': 'logloss',
    'nthread': 16
}
# Upper bound on boosting rounds per model (early stopping may end sooner).
xgb_num_round = 50

# Train one binary XGBoost classifier per scored target column.
# `xgbModels[i]` is the booster for the i-th column of `train_targets_scored`.
xgbModels = []
for target_name in tqdm(list(train_targets_scored)):
    print('Training XGBoost model on target variable: ', target_name)
    xgb_train_data = xgb.DMatrix(data=train_features_df, label=train_targets_df[target_name])
    xgb_val_data = xgb.DMatrix(data=val_features_df, label=val_targets_df[target_name])
    # Order matters: when several evaluation sets are given, XGBoost uses the
    # LAST one for early stopping. The validation set must therefore come last;
    # with the training set last, the monitored loss is the training loss and
    # early stopping cannot detect over-fitting.
    xgb_eval = [(xgb_train_data, 'train'), (xgb_val_data, 'eval')]
    tmp_model = xgb.train(
        params=xgb_param,
        dtrain=xgb_train_data,
        num_boost_round=xgb_num_round,
        evals=xgb_eval,
        early_stopping_rounds=15,
    )
    xgbModels.append(tmp_model)

# Fill a copy of the sample submission with one prediction column per target,
# using the booster stored at the same position in `xgbModels`.
predict_xgb = sample_submission.copy()
xgb_test_data = xgb.DMatrix(data=test_features)
target_columns = list(train_targets_df)
for target_number, target_variable in enumerate(tqdm(target_columns)):
    print('Predicting based on XGBoost the MoAs for target variable: ', target_variable)
    booster = xgbModels[target_number]
    # NOTE(review): `ntree_limit` / `best_ntree_limit` are deprecated in newer
    # XGBoost releases in favour of `iteration_range` -- confirm against the
    # installed version before upgrading.
    predict_xgb[target_variable] = booster.predict(xgb_test_data, ntree_limit=booster.best_ntree_limit)

In [None]:
# Write the XGBoost predictions to 'submission.csv' without the DataFrame index.
predict_xgb.to_csv(path_or_buf='submission.csv', index=False)