In [1]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from category_encoders import CountEncoder
from sklearn.metrics import log_loss

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline

from sklearn.multioutput import MultiOutputClassifier
import time

In [3]:
SEED = 42
FOLDS = 5
DATA_DIR = '/kaggle/input/lish-moa/'
np.random.seed(SEED)

In [4]:
train = pd.read_csv(DATA_DIR+'train_features.csv')
test = pd.read_csv(DATA_DIR+'test_features.csv')
train_target = pd.read_csv(DATA_DIR+'train_targets_scored.csv')
sub = pd.read_csv(DATA_DIR+'sample_submission.csv')

In [5]:
def preprocess(df):
    df = df.copy()
    df.loc[:, 'cp_type'] = df.loc[:, 'cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df.loc[:, 'cp_dose'] = df.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})
    del df['sig_id']
    return df

In [6]:
#train = train.merge(train_target, on='sig_id')

In [7]:
#train.drop(train[train.cp_type=='ctl_vehicle'].index, axis=0, inplace=True)
#test.drop(test[test.cp_type==1].index, axis=0, inplace=True)

In [8]:
train = preprocess(train)
test = preprocess(test)

In [9]:
train

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,0,24,0,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,-1.0220,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,0,72,0,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,0.2341,...,-0.4265,0.7543,0.4708,0.0230,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,0,48,0,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,0.1715,...,-0.7250,-0.6297,0.6103,0.0223,-1.3240,-0.3174,-0.6417,-0.2187,-1.4080,0.6931
3,0,48,0,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,-1.9590,...,-2.0990,-0.6441,-5.6300,-1.3780,-0.8632,-1.2880,-1.6210,-0.8784,-0.3876,-0.8154
4,0,72,1,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,-0.2800,...,0.0042,0.0048,0.6670,1.0690,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,0,24,1,0.1394,-0.0636,-0.1112,-0.5080,-0.4713,0.7201,0.5773,...,0.1969,0.0262,-0.8121,0.3434,0.5372,-0.3246,0.0631,0.9171,0.5258,0.4680
23810,0,24,1,-1.3260,0.3478,-0.3743,0.9905,-0.7178,0.6621,-0.2252,...,0.4286,0.4426,0.0423,-0.3195,-0.8086,-0.9798,-0.2084,-0.1224,-0.2715,0.3689
23811,1,48,1,0.3942,0.3756,0.3109,-0.7389,0.5505,-0.0159,-0.2541,...,0.5409,0.3755,0.7343,0.2807,0.4116,0.6422,0.2256,0.7592,0.6656,0.3808
23812,0,24,0,0.6660,0.2324,0.4392,0.2044,0.8531,-0.0343,0.0323,...,-0.1105,0.4258,-0.2012,0.1506,1.5230,0.7101,0.1732,0.7015,-0.6290,0.0740


In [10]:
X = train.to_numpy()
X_test = test.to_numpy()
y = train_target.iloc[:,1:].to_numpy()

In [11]:
model = MultiOutputClassifier(estimator=XGBClassifier(tree_method='gpu_hist'))
params = {'estimator__colsample_bytree': 0.6522,
          'estimator__gamma': 3.6975,
          'estimator__learning_rate': 0.0503,
          'estimator__max_delta_step': 2.0706,
          'estimator__max_depth': 10,
          'estimator__min_child_weight': 31.5800,
          'estimator__n_estimators': 166,
          'estimator__subsample': 0.8639
         }
model.set_params(**params)

MultiOutputClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=0.6522,
                                              gamma=3.6975, gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraints=None,
                                              learning_rate=0.0503,
                                              max_delta_step=2.0706,
                                              max_depth=10,
                                              min_child_weight=31.58,
                                              missing=nan,
                                              monotone_constraints=None,
                                              n_estimators=166, n_jobs=None,
                  

In [12]:
cv_scores = []
final_preds = []

In [13]:
kf = MultilabelStratifiedKFold(n_splits=FOLDS)
for fold, (train_index, valid_index) in enumerate(kf.split(X, y)):
    
    print('Beginning fold',fold+1)
    print("TRAIN INDEX:", train_index, "VALID INDEX:", valid_index)
    
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    start = time.time()
    model.fit(X_train,y_train)
    print('Total time taken to fit model: ', time.time() - start, ' seconds')
    preds = np.array(model.predict_proba(X_valid))

    preds = preds[:,:,1].T
    score = log_loss(np.ravel(y_valid),np.ravel(preds))

    cv_scores.append(score)

    print('Validation log loss score: {}'.format(score))
    preds = np.array(model.predict_proba(X_test))
    preds = preds[:,:,1].T
    final_preds.append(preds)
    



Beginning fold 1
TRAIN INDEX: [    1     2     3 ... 23811 23812 23813] VALID INDEX: [    0    10    22 ... 23805 23808 23810]
Total time taken to fit model:  202.64636278152466  seconds
Validation log loss score: 0.01677902308697855
Beginning fold 2
TRAIN INDEX: [    0     1     3 ... 23809 23810 23811] VALID INDEX: [    2     5     7 ... 23801 23812 23813]
Total time taken to fit model:  203.05812621116638  seconds
Validation log loss score: 0.016878600725679153
Beginning fold 3
TRAIN INDEX: [    0     2     5 ... 23811 23812 23813] VALID INDEX: [    1     3     4 ... 23798 23802 23806]
Total time taken to fit model:  202.70810985565186  seconds
Validation log loss score: 0.016837053571453205
Beginning fold 4
TRAIN INDEX: [    0     1     2 ... 23810 23812 23813] VALID INDEX: [   23    34    47 ... 23793 23800 23811]
Total time taken to fit model:  203.22569012641907  seconds
Validation log loss score: 0.017030164165947055
Beginning fold 5
TRAIN INDEX: [    0     1     2 ... 23811 23

In [14]:
print('Cross Validation scores: ',cv_scores)

print('Ensembling final predictions')
final_predictions = np.mean(np.array(final_preds),axis=0)

print('Done')

sub.iloc[:,1:] = final_predictions
sub.to_csv('submission.csv',index=False)

Cross Validation scores:  [0.01677902308697855, 0.016878600725679153, 0.016837053571453205, 0.017030164165947055, 0.016853963255410068]
Ensembling final predictions
Done


In [15]:
np.mean(cv_scores)

0.016875760961093608