In [None]:
!pip install ../input/pytorch-tabnet/pytorch_tabnet-2.0.1-py3-none-any.whl
!pip install ../input/iter-strart/iterative_stratification-0.1.6-py3-none-any.whl

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import torch
from pytorch_tabnet.tab_model import TabNetClassifier,TabNetRegressor
from sklearn.metrics import log_loss
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
import random
import matplotlib.pyplot as plt
import os
import copy
from sklearn.metrics import log_loss ,roc_auc_score
from pickle import load,dump
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import warnings
warnings.filterwarnings('ignore')

In [None]:
def seed_everything(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch's CPU generator. When CUDA is available it additionally seeds
    the CUDA generators and switches cuDNN to deterministic mode with
    benchmarking disabled.

    Args:
        seed_value: Integer seed handed unchanged to each generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    # Nothing GPU-related to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(33)

In [None]:
from sklearn.metrics import log_loss
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import roc_auc_score, log_loss

class LogitsLogLoss(Metric):
    """Mean binary log loss evaluated on raw (pre-sigmoid) model outputs.

    The metric is registered under the name ``"logits_ll"`` (the string the
    training functions in this notebook pass via ``eval_metric``) and is
    flagged as a quantity that should not be maximised.
    """

    def __init__(self):
        self._name = "logits_ll"
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """Return the mean log loss over every element of the inputs.

        Args:
            y_true: Array of ground-truth labels; used as 0/1 weights of the
                two log terms.
            y_pred: Array of raw model outputs; a sigmoid is applied here.

        Returns:
            Scalar mean of the element-wise negative log likelihood.
        """
        # Squash raw scores into probabilities.
        probs = 1 / (1 + np.exp(-y_pred))
        # The 1e-15 offset keeps log() finite when a probability hits 0 or 1.
        negative_term = (1 - y_true) * np.log(1 - probs + 1e-15)
        positive_term = y_true * np.log(probs + 1e-15)
        log_likelihood = negative_term + positive_term
        return np.mean(-log_likelihood)

In [None]:
#load files
def load_data(path):
    """Read the competition CSV files found in ``path``.

    File locations are built with ``os.path.join`` so ``path`` works with or
    without a trailing separator (plain string concatenation silently
    produced a wrong file name when the separator was missing).

    Args:
        path: Directory containing ``train_features.csv``,
            ``train_targets_scored.csv``, ``test_features.csv``,
            ``sample_submission.csv`` and ``train_drug.csv``.

    Returns:
        Tuple ``(train_feature, train_target, test_feature, sample_output,
        targets, features)`` where

        * ``train_target`` is the scored-targets table left-merged with the
          drug table on ``sig_id`` (so it carries the drug table's extra
          columns),
        * ``targets`` is every column of the scored-targets table except the
          first, taken *before* the merge,
        * ``features`` is every column of the train-features table except
          the first.
    """
    def _read(filename):
        # Single place where a file name is resolved against ``path``.
        return pd.read_csv(os.path.join(path, filename))

    train_feature = _read('train_features.csv')
    train_target = _read('train_targets_scored.csv')
    test_feature = _read('test_features.csv')
    sample_output = _read('sample_submission.csv')
    drug = _read('train_drug.csv')

    features = train_feature.columns[1:]
    # Capture the target names before the merge adds the drug columns.
    targets = train_target.columns[1:]
    train_target = train_target.merge(drug, on='sig_id', how='left')

    return train_feature, train_target, test_feature, sample_output, targets, features

In [None]:
#data pre-processing
def LabelEncoding(train_feature,test_feature):
    """Integer-encode the ``cp_type``, ``cp_time`` and ``cp_dose`` columns.

    One ``LabelEncoder`` per column is fitted on the training frame and then
    reused to transform the test frame, so both frames share the same
    value-to-code mapping.

    The encoded arrays are assigned to the columns directly. Wrapping them
    in a fresh ``DataFrame`` first (as was done before) makes pandas align
    on the index, which yields NaN for any frame that does not carry a
    default 0..n-1 index.

    Args:
        train_feature: Training feature frame; modified in place.
        test_feature: Test feature frame; modified in place.

    Returns:
        The same two frames, ``(train_feature, test_feature)``.
    """
    for column in ('cp_type', 'cp_time', 'cp_dose'):
        encoder = LabelEncoder()
        train_feature[column] = encoder.fit_transform(train_feature[column])
        test_feature[column] = encoder.transform(test_feature[column])

    return train_feature,test_feature

In [None]:
def _quantile_normalize(train_feature, test_feature, columns):
    """Map each listed column to a normal distribution, fitting on train only.

    Both frames are modified in place, one column at a time.
    """
    for col in columns:
        transformer = QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal")
        train_col = train_feature[col].values.reshape(-1, 1)
        test_col = test_feature[col].values.reshape(-1, 1)
        transformer.fit(train_col)

        train_feature[col] = transformer.transform(train_col).ravel()
        test_feature[col] = transformer.transform(test_col).ravel()


def _append_pca(train_feature, test_feature, columns, n_comp, prefix, pickle_path):
    """Fit a PCA on train+test ``columns`` and append its components to both frames.

    The fitted PCA object is pickled to ``pickle_path``. Returns the two
    widened frames; new columns are named ``f'{prefix}-{i}'``.
    """
    pca = PCA(n_components=n_comp, random_state=42)
    # The projection is fitted on the union of train and test rows.
    pca.fit(pd.concat([train_feature[columns], test_feature[columns]]))

    names = [f'{prefix}-{i}' for i in range(n_comp)]
    # Reuse the source index so the column-wise concat below cannot misalign.
    train_pca = pd.DataFrame(pca.transform(train_feature[columns]),
                             columns=names, index=train_feature.index)
    test_pca = pd.DataFrame(pca.transform(test_feature[columns]),
                            columns=names, index=test_feature.index)

    # Context manager guarantees the pickle file handle is closed.
    with open(pickle_path, 'wb') as handle:
        dump(pca, handle)

    return (pd.concat((train_feature, train_pca), axis=1),
            pd.concat((test_feature, test_pca), axis=1))


def PreProcess(train_feature,test_feature, n_comp_genes=600, n_comp_cells=50, var_threshold=0.85):
    """Quantile-normalise, add PCA components and drop low-variance columns.

    Steps, in order:

    1. Every ``g-*`` and ``c-*`` column is passed through a
       ``QuantileTransformer`` (normal output) fitted on the training
       column. This step writes into the frames passed by the caller.
    2. PCA components of the ``g-*`` columns (``pca_G-i``) and of the
       ``c-*`` columns (``pca_C-i``) are appended. The fitted PCA objects
       are written to ``gpca.pkl`` and ``cpca.pkl`` in the working directory.
    3. Every column other than ``sig_id``/``cp_type``/``cp_time``/``cp_dose``
       whose *training* variance is below ``var_threshold`` is removed from
       both frames. This includes the PCA columns added in step 2.

    Args:
        train_feature: Training feature frame.
        test_feature: Test feature frame with the same columns.
        n_comp_genes: Number of PCA components for the ``g-*`` columns.
        n_comp_cells: Number of PCA components for the ``c-*`` columns.
        var_threshold: Minimum training variance a column needs to be kept.

    Returns:
        ``(train_feature, test_feature)`` as new frames whose first four
        columns are the identifier/categorical columns.
    """
    GENES = [col for col in train_feature.columns if col.startswith('g-')]
    CELLS = [col for col in train_feature.columns if col.startswith('c-')]

    _quantile_normalize(train_feature, test_feature, GENES + CELLS)

    train_feature, test_feature = _append_pca(
        train_feature, test_feature, GENES, n_comp_genes, 'pca_G', 'gpca.pkl')
    train_feature, test_feature = _append_pca(
        train_feature, test_feature, CELLS, n_comp_cells, 'pca_C', 'cpca.pkl')

    id_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']
    c_n = [f for f in list(train_feature.columns) if f not in id_cols]
    # The mask is derived from the training frame only and reused for test.
    mask = (train_feature[c_n].var() >= var_threshold).values
    train_feature = pd.concat([train_feature[id_cols], train_feature[c_n].loc[:, mask]], axis=1)
    test_feature = pd.concat([test_feature[id_cols], test_feature[c_n].loc[:, mask]], axis=1)

    return train_feature,test_feature

In [None]:
def VarThreshold(train_feature,test_feature, threshold=0.85):
    """Drop columns whose training variance is below ``threshold``.

    The columns ``sig_id``, ``cp_type``, ``cp_time`` and ``cp_dose`` are
    always kept and placed first. Every other column is kept only when its
    variance in ``train_feature`` is at least ``threshold``; the same
    selection is then applied to ``test_feature``. The inputs are not
    modified.

    Args:
        train_feature: Training feature frame.
        test_feature: Test feature frame containing the same columns.
        threshold: Minimum training variance required to keep a column.
            Defaults to 0.85, the value that used to be hard-coded.

    Returns:
        ``(train_feature, test_feature)`` as new, column-filtered frames.
    """
    id_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']
    candidates = [f for f in train_feature.columns if f not in id_cols]

    # Variance is measured on the training frame only.
    variances = train_feature[candidates].var()
    kept = variances.index[(variances >= threshold).values].tolist()

    selected = id_cols + kept
    return train_feature[selected], test_feature[selected]

In [None]:
def Stratify_Drugs(scored, threshold, FOLDS, SEED, target_cols=None):
    """Assign a cross-validation fold to every row, stratified by drug.

    Drugs that occur at most ``threshold`` times are assigned to folds as a
    whole (every row of such a drug lands in the same fold); the split is
    stratified on the per-drug mean of the target columns. Rows of drugs
    that occur more than ``threshold`` times are split row by row,
    stratified on their own target values.

    Args:
        scored: Frame with ``sig_id``, ``drug_id`` and the target columns.
            A ``fold`` column (int8) is added to this frame **in place**.
        threshold: Occurrence count separating the two groups of drugs.
        FOLDS: Number of folds.
        SEED: ``random_state`` for both splitters.
        target_cols: Names of the target columns. When omitted, the
            module-level ``targets`` variable is used, as before.

    Returns:
        The same ``scored`` frame, now carrying the ``fold`` column.
    """
    if target_cols is None:
        # Backward compatibility: fall back to the notebook-level global.
        target_cols = targets
    target_cols = list(target_cols)

    # LOCATE DRUGS
    vc = scored.drug_id.value_counts()
    rare_drugs = vc.loc[vc<=threshold].index.sort_values()
    common_drugs = vc.loc[vc>threshold].index.sort_values()

    drug_to_fold = {}
    sig_to_fold = {}

    # STRATIFY DRUGS SEEN `threshold` TIMES OR LESS (one fold per drug)
    if len(rare_drugs) > 0:
        skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True,
                  random_state=SEED)
        tmp = scored.groupby('drug_id')[target_cols].mean().loc[rare_drugs]
        for fold,(idxT,idxV) in enumerate( skf.split(tmp,tmp[target_cols])):
            drug_to_fold.update({k:fold for k in tmp.index[idxV].values})

    # STRATIFY DRUGS SEEN MORE THAN `threshold` TIMES (one fold per row)
    if len(common_drugs) > 0:
        skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True,
                  random_state=SEED)
        # Index is reset so the positional split indices double as labels.
        tmp = scored.loc[scored.drug_id.isin(common_drugs)].reset_index(drop=True)
        for fold,(idxT,idxV) in enumerate( skf.split(tmp,tmp[target_cols])):
            sig_to_fold.update({k:fold for k in tmp.sig_id[idxV].values})

    # ASSIGN FOLDS: per-drug mapping first, then fill the rest per sig_id
    scored['fold'] = scored.drug_id.map(drug_to_fold)
    unassigned = scored.fold.isna()
    scored.loc[unassigned,'fold'] = scored.loc[unassigned,'sig_id'].map(sig_to_fold)
    scored.fold = scored.fold.astype('int8')

    return scored

In [None]:
def Train_Drug(MAX_EPOCH, FOLDS, train_feature,scored,test_feature, target_cols=None):
    """Train one TabNet model per pre-assigned fold and predict the test set.

    Fold membership is read from ``scored.fold``. For fold ``i`` the rows
    with ``fold == i`` are the validation set and all other rows are the
    training set. Rows of ``train_feature`` and ``scored`` are matched by
    position, so both frames must be in the same row order.

    Args:
        MAX_EPOCH: Upper bound on training epochs per fold.
        FOLDS: Number of folds to iterate over (``0 .. FOLDS-1``).
        train_feature: Training features; ``sig_id`` is dropped, every other
            column is fed to the model.
        scored: Frame holding the target columns and a ``fold`` column.
        test_feature: Test features with the same columns as
            ``train_feature``.
        target_cols: Names of the target columns. When omitted, the
            module-level ``targets`` variable is used, as before.

    Returns:
        ``np.ndarray`` with the sigmoid-transformed test predictions of
        every fold stacked along a new first axis.
    """
    if target_cols is None:
        # Backward compatibility: fall back to the notebook-level global.
        target_cols = targets
    target_cols = list(target_cols)

    features = [col for col in train_feature.columns if col != 'sig_id']

    # Materialise the arrays once instead of re-slicing frames on every fold.
    X_all = train_feature[features].values
    y_all = scored[target_cols].values
    X_test = test_feature[features].values
    fold_ids = scored.fold.values

    tabnet_params = dict(n_d=24, n_a=24, n_steps=1, gamma=1.3,
                     lambda_sparse=0, optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type='entmax',
                     scheduler_params=dict(mode="min",
                                           patience=5,
                                           min_lr=1e-5,
                                           factor=0.9,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=10,
                     )
    test_preds = []
    for i in range(FOLDS):
        is_val = fold_ids == i
        X_train, y_train = X_all[~is_val], y_all[~is_val]
        X_val, y_val = X_all[is_val], y_all[is_val]

        model = TabNetRegressor(**tabnet_params)
        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_val, y_val)],
                  eval_name = ["val"],
                  eval_metric = ["logits_ll"],
                  max_epochs=MAX_EPOCH,
                  patience=50, batch_size=1024, virtual_batch_size=128,
                  num_workers=1, drop_last=False,
                  # use binary cross entropy as this is not a regression problem
                  loss_fn=torch.nn.functional.binary_cross_entropy_with_logits)

        # The model emits raw scores; convert them to probabilities.
        y_test = model.predict(X_test)
        test_preds.append(1 / (1 + np.exp(-y_test)))

    test_preds_total = np.stack(test_preds)
    return test_preds_total
    

In [None]:
def Train_Simple(MAX_EPOCH,FOLDS,SEED,train_feature,train_target,test_feature, tabnet_params=None):
    """Train one TabNet model per multilabel-stratified fold and predict test.

    Unlike ``Train_Drug`` the folds are created here, directly from the
    target matrix, without looking at drug identity.

    Args:
        MAX_EPOCH: Upper bound on training epochs per fold.
        FOLDS: Number of folds.
        SEED: ``random_state`` of the fold splitter.
        train_feature: Training features; ``sig_id`` is dropped.
        train_target: Target frame. The bookkeeping columns ``sig_id``,
            ``drug_id`` and ``fold`` are removed when present, so the
            function works whether or not ``Stratify_Drugs`` has already
            added ``fold``.
        test_feature: Test features with the same columns as
            ``train_feature``.
        tabnet_params: Optional dict of keyword arguments for
            ``TabNetRegressor``. The default (``None``) builds the model
            with the library defaults, which is what this function has
            always done.
            NOTE(review): the hyper-parameter dict used by ``Train_Drug``
            was defined here but never applied — confirm whether the
            defaults are intended.

    Returns:
        ``np.ndarray`` with the sigmoid-transformed test predictions of
        every fold stacked along a new first axis.
    """
    features = [col for col in train_feature.columns if col != 'sig_id']
    train_feature = train_feature[features]
    train_target = train_target.drop(columns=["sig_id",'drug_id','fold'], errors='ignore')
    X_test = test_feature[features].values

    model_kwargs = dict(tabnet_params) if tabnet_params else {}

    # Materialise the arrays once instead of on every fold.
    X_all = train_feature.values
    y_all = train_target.values

    test_preds = []
    mskf = MultilabelStratifiedKFold(n_splits=FOLDS, random_state=SEED, shuffle=True)

    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train_feature, train_target)):
        print("FOLDS : ", fold_nb)

        X_train, y_train = X_all[train_idx, :], y_all[train_idx, :]
        X_val, y_val = X_all[val_idx, :], y_all[val_idx, :]
        model = TabNetRegressor(**model_kwargs)

        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_val, y_val)],
                  eval_name = ["val"],
                  eval_metric = ["logits_ll"],
                  max_epochs=MAX_EPOCH,
                  patience=20, batch_size=1024, virtual_batch_size=128,
                  num_workers=1, drop_last=False,
                  # use binary cross entropy as this is not a regression problem
                  loss_fn=torch.nn.functional.binary_cross_entropy_with_logits)

        # The model emits raw scores; convert them to probabilities.
        y_test = model.predict(X_test)
        test_preds.append(1 / (1 + np.exp(-y_test)))

    test_preds_total = np.stack(test_preds)
    return test_preds_total

In [None]:
# End-to-end pipeline: configure, load, preprocess, assign folds, train.
# The names bound here (notably `targets`, `train_target` and `pred`) are
# read by other cells and by functions that use them as globals.
threshold = 19 # drugs occurring at most this many times are assigned to folds per drug (see Stratify_Drugs)
FOLDS = 5 # number of cross-validation folds
SEED = 33 # random_state handed to the fold splitter
epochs = 500 # passed as MAX_EPOCH to the training function
# Load the CSV files; `targets` is also used as a global by Stratify_Drugs and Train_Drug
train_feature,train_target,test_feature,sample_output,targets,features = load_data('/kaggle/input/lish-moa/')
# Integer-encode cp_type / cp_time / cp_dose (encoders fitted on train)
train_feature,test_feature = LabelEncoding(train_feature,test_feature)
# Quantile-normalise, append PCA components, drop low-variance columns
train_feature,test_feature = PreProcess(train_feature,test_feature)
# Variance filter again; PreProcess already applies the same 0.85 cut-off
train_feature,test_feature = VarThreshold(train_feature,test_feature)
# Drug-aware fold assignment; adds a `fold` column to train_target in place
scored = Stratify_Drugs(train_target, threshold, FOLDS, SEED)
# Training with drug stratification; `pred` holds the stacked per-fold test predictions
pred = Train_Drug(epochs, FOLDS, train_feature,scored,test_feature)
# Alternative (disabled): training with plain multilabel stratification
#pred1 = Train_Simple(epochs,FOLDS,SEED,train_feature,train_target,test_feature)

In [None]:
# Build the submission by averaging the per-fold test predictions.
# `pred` is the stacked array returned by Train_Drug (fold axis first).
# Assumes the non-id columns of sample_output are in the same order as the
# target columns the model was trained on — TODO confirm.
sample_output1 = sample_output.copy()
#sample_output2 = sample_output.copy()
all_feat = [col for col in sample_output.columns if col != "sig_id"]
sample_output1[all_feat] = pred.mean(axis=0)
#sample_output2[all_feat] = pred1.mean(axis=0)
#sample_output = pd.concat([sample_output1, sample_output2]).groupby(level=0).mean()

# Zero every prediction for rows whose encoded cp_type is 0.
# NOTE(review): presumably code 0 is the control class after label encoding —
# verify against the raw cp_type values.
# A positional mask is used so the result does not depend on the two frames
# sharing index labels.
is_control = (test_feature['cp_type'] == 0).values
sample_output1.loc[is_control, all_feat] = 0
sample_output1.to_csv('submission.csv', index=False)