In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import time
import logging

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, QuantileTransformer
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm


from skmultilearn.model_selection import IterativeStratification
import category_encoders as ce

import matplotlib.pyplot as plt
%matplotlib inline

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def _read_by_sig_id(csv_path):
    """Read one CSV file into a DataFrame indexed by its `sig_id` column."""
    return pd.read_csv(csv_path, index_col='sig_id')


# All five tables are loaded the same way, with `sig_id` as the row index.
train_features = _read_by_sig_id('../input/lish-moa/train_features.csv')
test_features = _read_by_sig_id('../input/lish-moa/test_features.csv')
train_targets_scored = _read_by_sig_id('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = _read_by_sig_id('../input/lish-moa/train_targets_nonscored.csv')
sample_submission = _read_by_sig_id('../input/lish-moa/sample_submission.csv')

In [None]:
class TransformerLog:
    """Mixin providing a `log` label that names the concrete class of the instance."""

    @property
    def log(self):
        """Return the string ``"<ClassName> transform"`` for this instance's class."""
        class_name = type(self).__name__
        return f"{class_name} transform"
    
def timeit(method):
    """Decorator that measures how long each call to ``method`` takes.

    The wrapped callable forwards all positional and keyword arguments to
    ``method`` unchanged and returns its result.

    Reporting depends on the keyword arguments of the call:

    * if ``log_time`` is passed (a mutable mapping), the elapsed time in whole
      milliseconds is stored in it under ``log_name`` when that keyword is
      given, otherwise under the upper-cased name of ``method``;
    * otherwise the elapsed time in seconds is written with ``logging.info``.

    Note that ``log_time`` / ``log_name`` are also forwarded to ``method``, so
    a method called with them must accept them (e.g. through ``**kwargs``).
    Nothing is recorded when ``method`` raises.
    """
    # Local import: functools is not part of the file-level imports.
    from functools import wraps

    @wraps(method)  # keep __name__/__doc__ of the decorated callable
    def timed(*args, **kw):
        # perf_counter is monotonic, so the difference cannot go negative
        # if the system clock is adjusted during the call.
        ts = time.perf_counter()
        result = method(*args, **kw)
        elapsed = time.perf_counter() - ts
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed * 1000)
        else:
            logging.info("\t {} method took {:2.1f}s".format(method.__name__, elapsed))
        return result

    return timed
    
class TypeSelector(BaseEstimator, TransformerMixin, TransformerLog):
    """Stateless transformer that keeps only the DataFrame columns of one dtype."""

    def __init__(self, dtype):
        # Passed as the single entry of `include` to DataFrame.select_dtypes.
        self.dtype = dtype

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can be chained."""
        return self

    @timeit
    def transform(self, X):
        """Return the columns of ``X`` selected by ``self.dtype``.

        Raises ``AssertionError`` when ``X`` is not a pandas DataFrame.
        """
        assert isinstance(X, pd.DataFrame)
        wanted_dtypes = [self.dtype]
        return X.select_dtypes(include=wanted_dtypes)
    
class FeatureSelector(BaseEstimator, TransformerMixin, TransformerLog):
    """Stateless transformer that selects a fixed set of DataFrame columns.

    Parameters
    ----------
    feature_names : list of column labels (or a single label)
        Used as-is to index the input DataFrame in ``transform``.
    """

    def __init__(self, feature_names):
        # Stored under the constructor argument's own name: BaseEstimator.get_params()
        # (used by clone() and by the estimator repr) reads parameters via
        # getattr(self, "<argument name>").
        self.feature_names = feature_names

    @property
    def _feature_names(self):
        """Read-only alias kept for code that used the former private attribute."""
        return self.feature_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self``."""
        return self

    @timeit
    def transform(self, X, y=None):
        """Return ``X[feature_names]``.

        Raises
        ------
        AssertionError
            If ``X`` is not a pandas DataFrame.
        KeyError
            If the selection fails; the message lists the requested columns
            that are absent from ``X``.
        """
        logging.info("{} - {}".format(self.log, self.feature_names))
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.feature_names]
        except KeyError:
            # A bare string must be treated as one column name, not as characters.
            if isinstance(self.feature_names, str):
                requested = [self.feature_names]
            else:
                requested = list(self.feature_names)
            # dict.fromkeys drops duplicates while keeping the requested order,
            # so the error message is deterministic.
            cols_error = [name for name in dict.fromkeys(requested) if name not in X.columns]
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)

class OrdinalEncoder(BaseEstimator, TransformerMixin, TransformerLog):
    """Stateless transformer that maps one DataFrame column through a dictionary.

    NOTE(review): this definition re-binds the name ``OrdinalEncoder`` that the
    file also imports from ``sklearn.preprocessing``; after this class is
    defined the sklearn class is no longer reachable under that name.

    Parameters
    ----------
    feature_name : column label
        Column of the input DataFrame to encode.
    mapping_dict : dict
        Passed to ``Series.map`` on the selected column.
    """

    def __init__(self, feature_name, mapping_dict):
        # Both constructor arguments are stored under their own names so that
        # BaseEstimator.get_params()/clone() can find them.
        self.feature_name = feature_name
        self.mapping_dict = mapping_dict

    @property
    def _feature_name(self):
        """Read-only alias kept for code that used the former private attribute."""
        return self.feature_name

    def fit(self, X, y=None):
        """Learn nothing; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the column ``feature_name`` of ``X`` mapped through ``mapping_dict``.

        The result is the Series produced by ``Series.map``.

        Raises
        ------
        AssertionError
            If ``X`` is not a pandas DataFrame.
        KeyError
            If ``X`` has no column ``feature_name``.
        """
        logging.info("{} - {}".format(self.log, self.feature_name))
        # The column lookup below (and the error message) require a DataFrame.
        assert isinstance(X, pd.DataFrame)

        try:
            column = X[self.feature_name]
        except KeyError:
            raise KeyError(f"The DataFrame does not have the column: {self.feature_name}")
        return column.map(self.mapping_dict)
            
class ClassifierChainEnsemble(BaseEstimator, ClassifierMixin):
    """Average the outputs of several randomly ordered ``ClassifierChain`` models.

    Parameters
    ----------
    base_classifier : estimator
        Classifier handed to every ``ClassifierChain``. Required at fit time.
    n_chains : int, default=3
        Number of chains. Chain ``i`` is created with ``order='random'`` and
        ``random_state=i``.

    Attributes
    ----------
    chains_ : list of ClassifierChain
        Chains created and fitted by the last call to ``fit``.
    """

    def __init__(self, base_classifier=None, n_chains=3):
        # Only parameters are stored here. The chains are created in fit() so
        # that set_params(base_classifier=...) takes effect on the next fit.
        self.base_classifier = base_classifier
        self.n_chains = n_chains

    @property
    def chains(self):
        """Alias of ``chains_`` (available once ``fit`` has been called)."""
        return self.chains_

    def fit(self, X_train, y_train):
        """Create ``n_chains`` chains, fit each on the same data, return ``self``.

        Raises ``ValueError`` when no ``base_classifier`` was provided.
        """
        if self.base_classifier is None:
            raise ValueError("ClassifierChainEnsemble requires a base_classifier")

        self.chains_ = [ClassifierChain(self.base_classifier,
                                        order='random',
                                        random_state=i)
                        for i in range(self.n_chains)]
        for chain in tqdm(self.chains_):
            chain.fit(X_train, y_train)
        return self

    def _mean_over_chains(self, method_name, X_test):
        """Call ``method_name`` on every fitted chain and average the stacked results."""
        per_chain = np.array([getattr(chain, method_name)(X_test)
                              for chain in tqdm(self.chains_)])
        return per_chain.mean(axis=0)

    def predict(self, X_test):
        """Return the mean over chains of each chain's ``predict`` output.

        Because this is a mean of per-chain predictions, the values are
        fractions of chains rather than hard labels when the chains disagree.
        """
        return self._mean_over_chains('predict', X_test)

    def predict_proba(self, X_test):
        """Return the mean over chains of each chain's ``predict_proba`` output."""
        return self._mean_over_chains('predict_proba', X_test)

In [None]:
def _columns_starting_with(frame, prefix):
    """List the column names of `frame` that start with `prefix`, in column order."""
    return [name for name in frame.columns if name.startswith(prefix)]


# Columns handled by the categorical branch of the preprocessing.
categorical_features = ['cp_type', 'cp_dose']

cp_time_features = ['cp_time']

# Column groups identified purely by their name prefix.
g_features = _columns_starting_with(train_features, 'g-')
c_features = _columns_starting_with(train_features, 'c-')

# Order matters for the resulting feature matrix: cp_time, then g-*, then c-*.
numerical_features = [*cp_time_features, *g_features, *c_features]

In [None]:
# Categorical branch: select the categorical columns, then one-hot encode them.
categorical_pipeline = Pipeline(steps=[('cat_selector', FeatureSelector(categorical_features)),
                                       ('onehot_encoder', ce.OneHotEncoder())
                                      ])

# Numerical branch: select the numerical columns, then run category_encoders'
# OrdinalEncoder over them.
# NOTE(review): the selected columns are presumably numeric already — confirm
# whether this encoder step has any effect or can be dropped.
numerical_pipeline = Pipeline(steps=[('numeric_selector', FeatureSelector(numerical_features)),
                                     ('ordinal_encoder', ce.OrdinalEncoder())
                                    ])


# Both branches run on the same input and their outputs are concatenated column-wise.
preprocessing_pipeline = FeatureUnion(transformer_list=[('categorical_pipeline', categorical_pipeline),
                                                         ('numerical_pipeline', numerical_pipeline)],
                                       n_jobs=-1,
                                       verbose=True,
                                      )

# `subsample` / `subsample_freq` are the scikit-learn wrapper's names for
# LightGBM's `bagging_fraction` / `bagging_freq`. This cell previously passed
# both subsample=0.4578 and bagging_fraction=0.7697, two conflicting values for
# the same setting. LightGBM keeps the primary name (`bagging_fraction`) and
# ignores the alias, so 0.7697 is retained here as the value presumably in
# effect — TODO confirm which fraction was intended.
lgbm = LGBMClassifier(boosting_type='dart',
                      objective='binary',
                      learning_rate=0.2496,
                      num_leaves=511,
                      max_depth=9,
                      metric='binary_logloss',
                      verbosity=0,
                      reg_alpha=0.4,
                      reg_lambda=0.6,
                      random_state=123,
                      class_weight='balanced',
                      subsample=0.7697,
                      subsample_freq=3,
                     )

# Final estimator: preprocessing -> scaling -> ensemble of classifier chains.
# The last step keeps its historical name 'log_reg_model' although the
# estimator is the LightGBM-based chain ensemble, not a logistic regression.
full_pipeline = Pipeline(steps=[('features', preprocessing_pipeline),
                                ('scaler', StandardScaler()),
                                ('log_reg_model', ClassifierChainEnsemble(base_classifier=lgbm))])

In [None]:
N_FOLDS = 4
k_fold = IterativeStratification(n_splits=N_FOLDS, order=1)

# Running sum of the per-fold test predictions; turned into a mean after the loop.
test_preds = np.zeros((test_features.shape[0], train_targets_scored.shape[1]))


def _flat_log_loss(model, features, targets):
    """Log loss of ``model.predict_proba(features)`` against ``targets``.

    Both the targets and the predicted probabilities are flattened to 1-D
    before being passed to ``log_loss``.
    """
    return log_loss(np.ravel(targets), np.ravel(model.predict_proba(features)))


for fold, (train_indices, valid_indices) in enumerate(k_fold.split(train_features, train_targets_scored)):

    X_train = train_features.iloc[train_indices]
    y_train = train_targets_scored.iloc[train_indices]

    X_valid = train_features.iloc[valid_indices]
    y_valid = train_targets_scored.iloc[valid_indices]

    # The same pipeline object is refitted from scratch on every fold.
    full_pipeline.fit(X_train, y_train)

    train_loss = _flat_log_loss(full_pipeline, X_train, y_train)
    valid_loss = _flat_log_loss(full_pipeline, X_valid, y_valid)

    test_preds += full_pipeline.predict_proba(test_features)

    # The second figure is measured on the held-out fold, so it is labelled
    # as validation loss (it was previously printed as "Test loss").
    print(f'Fold {fold} - Train loss : {train_loss} , Valid loss : {valid_loss}')

# Average of the N_FOLDS fold models' predictions on the test set.
test_preds /= N_FOLDS

In [None]:
# Bound the averaged predictions to [0.0005, 0.999] before writing them out.
sample_submission.loc[:, :] = np.clip(test_preds, 0.0005, 0.999)

# Rows whose cp_type is 'ctl_vehicle' are overwritten with 0 in every target column.
is_control = test_features['cp_type'] == 'ctl_vehicle'
sample_submission.loc[is_control, sample_submission.columns] = 0

# `sig_id` is the index of sample_submission (it was read with index_col='sig_id'),
# so the index must be written; index=False would drop the id column from the file.
sample_submission.to_csv('submission.csv', index=True)
sample_submission.head()