In [1]:
!cp ../input/moa-tabnet-train-copy/*.zip ./

In [2]:
!pip uninstall -y typing
!pip install ../input/pytorchtabnetpretraining/pytorch_tabnet-2.0.1-py3-none-any.whl

Found existing installation: typing 3.7.4.3
Uninstalling typing-3.7.4.3:
  Successfully uninstalled typing-3.7.4.3
Processing /kaggle/input/pytorchtabnetpretraining/pytorch_tabnet-2.0.1-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.1


In [3]:
import numpy as np 
import pandas as pd 
import os
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
import random
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
from tqdm import tqdm
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import QuantileTransformer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold, MultilabelStratifiedShuffleSplit
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [4]:
def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        
seed_everything(42)

# Data


In [5]:
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/moa-make-foldsmoa-make-folds/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/moa-make-foldsmoa-make-folds/train_targets_nonscored.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [6]:
#使用PCA制造出一部分特征
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [7]:
for col in (GENES + CELLS):
    transformer = QuantileTransformer(n_quantiles=115, random_state=0, output_distribution="normal")   #50 75 100 125 150 25 
    vec_len = len(train_features[col].values)
    vec_len_test = len(test_features[col].values)
    raw_vec = train_features[col].values.reshape(vec_len, 1)
    transformer.fit(raw_vec)

    train_features[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test_features[col] = transformer.transform(test_features[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [8]:
train = train_features.merge(train_targets_scored, on='sig_id')
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)
target = train[train_targets_scored.columns[:207]]

In [9]:
train = train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)

target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [10]:
folds = train.copy()
folds.head()

Unnamed: 0,sig_id,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,fold_42,fold_0,fold_1,fold_2,fold_3
0,id_000644bb2,24,D1,1.13477,0.908153,-0.416081,-0.967866,-0.254718,-1.015497,-1.364912,...,0,0,0,0,0,6,1,3,3,5
1,id_000779bfc,72,D1,0.1196,0.681885,0.272164,0.079996,1.204608,0.685581,0.314454,...,0,0,0,0,0,0,0,3,6,3
2,id_000a6266a,48,D1,0.780234,0.944877,1.422334,-0.132034,-0.00745,1.492153,0.23541,...,0,0,0,0,0,0,5,3,3,1
3,id_0015fd391,48,D1,-0.735342,-0.274628,-0.438384,0.759605,2.389946,-0.859146,-2.287594,...,0,0,0,0,0,6,4,3,1,2
4,id_001626bd3,72,D2,-0.452845,-0.4773,0.971824,0.970578,1.463978,-0.871062,-0.376199,...,0,0,0,0,0,1,6,5,3,3


In [11]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 1086)
(21948, 1086)
(3624, 875)
(21948, 207)
(3982, 207)


In [12]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
def process_data(data):
    
    data['cp_time'] = lb.fit_transform(data['cp_time'])
    data['cp_dose'] = lb.fit_transform(data['cp_dose'])
    return data

feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['fold_42','sig_id','fold_0','fold_1','fold_2','fold_3']]
len(feature_cols)

874

In [13]:
def evals(model, X, y, verbose=True):
    with torch.no_grad():
        y_preds = model.predict(X)
        y_preds = torch.clamp(y_preds, 0.0,1.0).detach().numpy()
    score = log_loss_multi(y, y_preds)
    #print("Logloss = ", score)
    return y_preds, score


def inference_fn(model, X ,verbose=True):
    with torch.no_grad():
        y_preds = model.predict( X )
        y_preds = torch.sigmoid(torch.as_tensor(y_preds)).numpy()
    return y_preds

def log_loss_score(actual, predicted,  eps=1e-15):
    p1 = actual * np.log(predicted+eps)
    p0 = (1-actual) * np.log(1-predicted+eps)
    loss = p0 + p1

    return -loss.mean()

def log_loss_multi(y_true, y_pred):
    M = y_true.shape[1]
    results = np.zeros(M)
    for i in range(M):
        results[i] = log_loss_score(y_true[:,i], y_pred[:,i])
    return results.mean()

def check_targets(targets):
    ### check if targets are all binary in training set
    
    for i in range(targets.shape[1]):
        if len(np.unique(targets[:,i])) != 2:
            return False
    return True

def auc_multi(y_true, y_pred):
    M = y_true.shape[1]
    results = np.zeros(M)
    for i in range(M):
        try:
            results[i] = roc_auc_score(y_true[:,i], y_pred[:,i])
        except:
            pass
    return results.mean()

In [14]:
class Config(object):
    def __init__(self):
        self.num_class = len(target_cols)
        self.verbose=False
        self.seed = 0
        self.device =  "cpu"
        self.EPOCHS = 300
        self.num_ensembling = 1
        # Parameters model
        self.cat_emb_dim=[1] * 2 #to choose
        self.cats_idx = list(range(2))
        self.cat_dims = [3,2]
        self.num_numericals= len(feature_cols)-2
        self.patience = 50
        self.batch_size=1024
        self.NFOLDS = 7
    
        # save
        self.save_name = "/kaggle/working/tabnet_raw_step1"
        
        self.strategy = "KFOLD" # 
cfg = Config()

In [15]:
from pytorch_tabnet.tab_model import TabNetRegressor

In [16]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import roc_auc_score, log_loss

class LogitsLogLoss(Metric):
    """
    LogLoss with sigmoid applied
    """

    def __init__(self):
        self._name = "logits_ll"
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """
        Compute LogLoss of predictions.

        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_score: np.ndarray
            Score matrix or vector

        Returns
        -------
            float
            LogLoss of predictions vs targets.
        """
        logits = 1 / (1 + np.exp(-y_pred))
        aux = (1-y_true)*np.log(1-logits+1e-15) + y_true*np.log(logits+1e-15)
        return np.mean(-aux)

In [17]:
def run_training(fold, seed):
    
    seed_everything(seed)
    train = process_data(folds)
    test_ = process_data(test)
    
    trn_idx = train[train[f'fold_{seed}'] != fold].index
    val_idx = train[train[f'fold_{seed}'] == fold].index
    
    train_df = train[train[f'fold_{seed}'] != fold].reset_index(drop=True)
    valid_df = train[train[f'fold_{seed}'] == fold].reset_index(drop=True)
    
    X_train, y_train = train_df[feature_cols].values, train_df[target_cols].values
    X_val, y_val = valid_df[feature_cols].values, valid_df[target_cols].values
            
    model = TabNetRegressor(n_d=24, n_a=64, n_steps=1, 
                            n_independent=1, n_shared=1,
                            gamma=1.2, lambda_sparse=0, 
                            cat_dims=cfg.cat_dims, 
                            cat_emb_dim=cfg.cat_emb_dim, 
                            cat_idxs=cfg.cats_idx, 
                            optimizer_fn=torch.optim.Adam,
                            optimizer_params=dict(lr=2e-2, weight_decay=1e-5), 
                            mask_type='entmax', 
                            device_name=cfg.device, 
                            scheduler_params=dict(mode='min', factor=0.1, patience=10), 
                            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau)
    

    name = cfg.save_name + f"_fold{fold}_{seed}.zip"
    model.load_model(name)
    
    preds = model.predict(X_val)
    valid_preds = torch.sigmoid(torch.as_tensor(preds)).detach().cpu().numpy()
    score = log_loss_multi(y_val, preds)
    
    oof = np.zeros((len(train), len(target_cols)))
    oof[val_idx] = valid_preds
    
    x_test = torch.as_tensor(test[feature_cols].values)
    predictions = model.predict(x_test)
    predictions = torch.sigmoid(torch.as_tensor(predictions)).detach().cpu().numpy()
    
    return oof, predictions

In [18]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))
    
    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)
        
        predictions += pred_ / NFOLDS
        oof += oof_
        
    return oof, predictions

In [19]:
# Averaging on multiple SEEDS
SEED = [42,0,1,2,3] #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

for seed in SEED:
    
    oof_, predictions_ = run_k_fold(cfg.NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)

train[target_cols] = oof
test[target_cols] = predictions

Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
Device used : cpu
Device used : cuda
D

In [20]:
len(target_cols)

206

In [21]:
valid_results = train_targets_scored.drop(columns=target_cols+['fold_42','fold_0','fold_1','fold_2','fold_3']).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
valid_results = valid_results.add_prefix('pre_')
valid_results.rename(columns={'pre_sig_id':'sig_id'}, inplace=True)

In [22]:
oof = train_targets_scored.drop(columns=['fold_42','fold_0','fold_1','fold_2','fold_3'],axis=1)
oof = oof.merge(valid_results, on=['sig_id'], how='left')
oof.to_csv('moa_nn_oof.csv', index=False)

In [23]:
oof.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,pre_tropomyosin_receptor_kinase_inhibitor,pre_trpv_agonist,pre_trpv_antagonist,pre_tubulin_inhibitor,pre_tyrosine_kinase_inhibitor,pre_ubiquitin_specific_protease_inhibitor,pre_vegfr_inhibitor,pre_vitamin_b,pre_vitamin_d_receptor_agonist,pre_wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0.000807,0.001157,0.002949,0.003137,0.002221,0.000777,0.001965,0.002452,0.000291,0.002128
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0.001034,0.001162,0.003155,0.000966,0.001398,0.000492,0.001672,0.002535,0.006897,0.002952
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0.000564,0.001255,0.001742,0.001424,0.01462,0.000629,0.064658,0.001145,0.00023,0.001541
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0.000582,0.018047,0.001754,0.197664,0.008973,0.000901,0.001898,0.00075,0.000658,0.000561
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0.000861,0.001019,0.00357,0.002447,0.001481,0.000676,0.000877,0.001993,0.000373,0.002469


In [24]:
score = 0
for i in (target_cols):
    score_ = log_loss(oof[i], oof['pre_'+i])
    score += score_ / 206
    
print("CV log_loss: ", score)

CV log_loss:  0.015665928517910092


In [25]:
auc_score = 0
for i in (target_cols):
    score_ = roc_auc_score(oof[i], oof['pre_'+i])
    score += score_ / target.shape[1]
    
print("CV AUC: ", score)

CV AUC:  0.6707558040207743


In [26]:
sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001091,0.00099,0.001917,0.016006,0.022213,0.00509,0.002669,0.00643,0.00035,...,0.000937,0.000786,0.002975,0.000965,0.000799,0.000615,0.000716,0.00216,0.003511,0.001535
1,id_001897cda,0.00057,0.00095,0.002104,0.00234,0.001574,0.001966,0.002473,0.010145,0.002058,...,0.000871,0.001378,0.003699,0.001091,0.007105,0.000663,0.011186,0.001294,0.0109,0.002685
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.001157,0.001207,0.001672,0.009913,0.014072,0.004203,0.002665,0.005191,0.000488,...,0.000669,0.00207,0.002777,0.021083,0.006821,0.000766,0.002139,0.001855,0.000905,0.00219
4,id_0027f1083,0.001734,0.001559,0.001958,0.019246,0.024183,0.004626,0.00475,0.002949,0.000631,...,0.000797,0.000975,0.003703,0.001108,0.001424,0.000743,0.001039,0.002154,0.000452,0.00182
