In [None]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import numpy as np
import pandas as pd
import os
import random
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')

# read datasets

In [None]:
# Raw competition inputs, read from the paths below (relative to the kernel's
# working directory).
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
train_drug = pd.read_csv("../input/lish-moa/train_drug.csv")

# Train rows stacked on top of test rows. DataFrame.append was removed in
# pandas 2.0; pd.concat builds the same frame (original row labels are kept).
data = pd.concat([train_features, test_features])

# Submission template.
ss = pd.read_csv('../input/lish-moa/sample_submission.csv')

# main parameters 

Parameter values taken from https://www.kaggle.com/vbmokin/moa-pytorch-rankgauss-pca-nn-upgrade-3d-visual


In [None]:
# Number of PCA components kept for the 'g-' and the 'c-' feature groups.
n_comp_GENES, n_comp_CELLS = 463, 60

# Threshold passed to VarianceThreshold during feature elimination.
VarianceThreshold_for_FS = 0.9

# Number of cross-validation folds.
NFOLDS = 5

# set seeds

In [None]:
def seed_everything(seed=42):
    """Seed every random source this notebook relies on.

    Sets the PYTHONHASHSEED environment variable and seeds both the
    standard-library ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int, default 42
        Value used for all three seeds.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)

# transform columns to normal dist with rankgauss (QuantileTransformer)

In [None]:
# Partition the feature columns by name prefix: 'g-' columns and 'c-' columns.
feature_names = train_features.columns
GENES = feature_names[feature_names.str.startswith('g-')].tolist()
CELLS = feature_names[feature_names.str.startswith('c-')].tolist()

In [None]:
# Pick one feature column to illustrate the effect of the quantile transform.
col_names = GENES + CELLS
col_example_index = 300
# Derive the name from the index so the two can never get out of sync.
col_example_name = col_names[col_example_index]

In [None]:
# Histogram of the example feature before any transformation. The column is
# selected by name (not by a positional offset) so the plotted data always
# matches the x-axis label.
fig, ax = plt.subplots()
ax.hist(train_features[col_example_name].values, bins=100, density=True)
ax.set_ylabel('Probability')
ax.set_xlabel(col_example_name)
ax.set_title('Raw feature distribution');

In [None]:
# Preview of the quantile transform on the example feature. The settings
# (n_quantiles=100, normal output) match the per-column transform that is
# applied to the whole dataset further down, so the plot shows what the model
# will actually see.
transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
example_raw = train_features[col_example_name].values.reshape(-1, 1)
example_transformed = transformer.fit_transform(example_raw).ravel()

fig, ax = plt.subplots()
ax.hist(example_transformed, bins=100, density=True)
ax.set_ylabel('Probability')
ax.set_xlabel(col_example_name)
ax.set_title('Transformed feature distribution');

In [None]:
# RankGauss: map every 'g-' / 'c-' column onto a normal distribution through
# its empirical quantiles. Each column gets its own transformer, fitted on the
# training rows only and then applied to both the train and the test column.
for col in (GENES + CELLS):
    transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")

    # scikit-learn expects 2-D input, hence the single-column reshape.
    train_column = train_features[col].values.reshape(-1, 1)
    test_column = test_features[col].values.reshape(-1, 1)
    transformer.fit(train_column)

    train_features[col] = transformer.transform(train_column).ravel()
    test_features[col] = transformer.transform(test_column).ravel()

# dimensionality reduction with PCA

In [None]:
# Number of 'g-' feature columns (n_comp_GENES components are extracted from these).
len(GENES)

In [None]:
# GENES
# PCA is fitted on the stacked train + test 'g-' columns; the resulting
# components are appended to both frames as pca_G-<i>.
gene_pca_names = [f'pca_G-{i}' for i in range(n_comp_GENES)]
n_train_rows, n_test_rows = train_features.shape[0], test_features.shape[0]

data = pd.concat([train_features[GENES], test_features[GENES]])
data2 = PCA(n_components=n_comp_GENES, random_state=42).fit_transform(data[GENES])

# Train rows come first in the stacked matrix, test rows last.
train2 = pd.DataFrame(data2[:n_train_rows], columns=gene_pca_names)
test2 = pd.DataFrame(data2[-n_test_rows:], columns=gene_pca_names)

train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

In [None]:
# Number of 'c-' feature columns (n_comp_CELLS components are extracted from these).
len(CELLS)

In [None]:
# CELLS
# PCA is fitted on the stacked train + test 'c-' columns; the resulting
# components are appended to both frames as pca_C-<i>.
cell_pca_names = [f'pca_C-{i}' for i in range(n_comp_CELLS)]
n_train_rows, n_test_rows = train_features.shape[0], test_features.shape[0]

data = pd.concat([train_features[CELLS], test_features[CELLS]])
data2 = PCA(n_components=n_comp_CELLS, random_state=42).fit_transform(data[CELLS])

# Train rows come first in the stacked matrix, test rows last.
train2 = pd.DataFrame(data2[:n_train_rows], columns=cell_pca_names)
test2 = pd.DataFrame(data2[-n_test_rows:], columns=cell_pca_names)

train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

In [None]:
# Column count after the PCA components were appended.
len(train_features.columns)

In [None]:
# First five rows (head() defaults to n=5).
train_features.head()

# feature elimination with variance threshold

In [None]:
# Stack train on top of test so feature selection sees both splits.
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# frame (row labels of each input are kept as-is).
data = pd.concat([train_features, test_features])
data

In [None]:
# Drop every feature whose variance over the stacked train + test rows does
# not exceed VarianceThreshold_for_FS. The first four columns are metadata
# and are excluded from the selection.
meta_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']
n_train_rows, n_test_rows = train_features.shape[0], test_features.shape[0]

var_thresh = VarianceThreshold(VarianceThreshold_for_FS)
data_transformed = var_thresh.fit_transform(data.iloc[:, 4:])

# fit_transform returns a bare array; recover the names of the surviving
# columns so the features stay identifiable and every column label remains a
# string (instead of string metadata names mixed with integer positions).
kept_columns = data.columns[4:][var_thresh.get_support()]

# Train rows come first in the stacked matrix, test rows last.
train_features_transformed = data_transformed[:n_train_rows]
test_features_transformed = data_transformed[-n_test_rows:]

# Rebuild both frames as metadata + selected features. Slicing the metadata
# columns directly (rather than round-tripping through .values) keeps their
# original dtypes.
train_features = pd.concat(
    [train_features[meta_cols].reset_index(drop=True),
     pd.DataFrame(train_features_transformed, columns=kept_columns)],
    axis=1,
)

test_features = pd.concat(
    [test_features[meta_cols].reset_index(drop=True),
     pd.DataFrame(test_features_transformed, columns=kept_columns)],
    axis=1,
)

train_features.shape

In [None]:
# First five rows after feature elimination (head() defaults to n=5).
train_features.head()

# create datasets

In [None]:
# Join features with the scored targets on sig_id, then discard the
# ctl_vehicle rows and renumber the remaining rows from 0.
merged = (
    train_features
    .merge(train_targets_scored, on='sig_id')
    .loc[lambda frame: frame['cp_type'].ne('ctl_vehicle')]
    .reset_index(drop=True)
)

# Same ctl_vehicle filter for the test features.
X_test = test_features.loc[test_features['cp_type'].ne('ctl_vehicle')].reset_index(drop=True)

# Split the merged frame back into feature columns and target columns.
X_train = merged.loc[:, train_features.columns]
y_train = merged.loc[:, train_targets_scored.columns]

In [None]:
# cp_type is no longer needed once the ctl_vehicle rows have been filtered out
X_train = X_train.drop(columns=['cp_type'])
X_test = X_test.drop(columns=['cp_type'])

In [None]:
# sig_id is dropped from the model inputs and from the targets
X_train = X_train.drop(columns=['sig_id'])
X_test = X_test.drop(columns=['sig_id'])
y_train = y_train.drop(columns=['sig_id'])

In [None]:
# First five rows of the model input (head() defaults to n=5).
X_train.head()

# XGBoost with CountEncoder

In [None]:
# XGBoost wrapped in MultiOutputClassifier to handle the multi-column target.
# tree_method='gpu_hist' needs a GPU; use a plain XGBClassifier() to run on CPU.
base_model = XGBClassifier(tree_method='gpu_hist')
classifier = MultiOutputClassifier(base_model)

# cp_dose and cp_time are count-encoded before they reach the classifier.
pipeline_steps = [
    ('encode', CountEncoder(cols=["cp_dose", "cp_time"])),
    ('classify', classifier),
]
clf = Pipeline(pipeline_steps)

# CV predictions with MultilabelStratifiedKFold

In [None]:
# Out-of-fold predictions for every training row and fold-averaged
# predictions for every test row.
oof_preds = np.zeros(y_train.shape)
test_preds = np.zeros((X_test.shape[0], y_train.shape[1]))
oof_losses = []

# shuffle=True is required alongside random_state: recent scikit-learn
# versions raise a ValueError when a random_state is given while shuffle is
# False.
mskf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
for fn, (trn_idx, val_idx) in enumerate(mskf.split(X_train, y_train)):
    print('Starting fold: ', fn)
    X_trn, X_val = X_train.iloc[trn_idx,:], X_train.iloc[val_idx,:]
    y_trn, y_val = y_train.iloc[trn_idx,:], y_train.iloc[val_idx,:]

    clf.fit(X_trn, y_trn)
    # predict_proba returns one array per target; stacking and indexing
    # [:, :, 1] keeps the positive-class column, and .T puts samples on rows.
    val_preds = clf.predict_proba(X_val)
    val_preds = np.array(val_preds)[:,:,1].T
    oof_preds[val_idx] = val_preds

    # Log loss over all (row, target) cells of the validation fold.
    loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
    oof_losses.append(loss)

    # Each fold contributes an equal share to the final test prediction.
    preds = clf.predict_proba(X_test)
    preds = np.array(preds)[:,:,1].T
    test_preds += preds / NFOLDS

print(oof_losses)
print('Mean OOF loss across folds', np.mean(oof_losses))
print('STD OOF loss across folds', np.std(oof_losses))

In [None]:
# Log loss over all out-of-fold predictions at once (every row, every target).
overall_oof_loss = log_loss(np.ravel(y_train), np.ravel(oof_preds))
print('OOF log loss: ', overall_oof_loss)

# Analysis of OOF preds

In [None]:
# set control test preds to 0
# ctl_vehicle rows were removed from X_test, so test_preds only holds the
# remaining rows. Scatter those predictions back into a full-size, all-zero
# matrix; the ctl_vehicle rows keep their zeros.

# The positional scatter below is only valid if the submission template and
# the test features list the same sig_ids in the same order.
assert ss['sig_id'].tolist() == test_features['sig_id'].tolist(), \
    'sample_submission and test_features rows are not aligned'

# True for the rows that were actually scored by the model (non-control rows).
treated_mask = (test_features['cp_type'] != 'ctl_vehicle').values

dummy_preds = np.zeros((ss.shape[0], ss.shape[1]-1))
dummy_preds[treated_mask] = test_preds

In [None]:
# create the submission file: overwrite every column after the first one with
# the assembled predictions, then write the CSV without the index.
prediction_columns = ss.columns[1:]
ss.loc[:, prediction_columns] = dummy_preds
ss.to_csv('submission.csv', index=False)