In [1]:
!pip uninstall -y typing
!pip install ../input/pytorchtabnetpretraining/pytorch_tabnet-2.0.1-py3-none-any.whl

Found existing installation: typing 3.7.4.3
Uninstalling typing-3.7.4.3:
  Successfully uninstalled typing-3.7.4.3
Processing /kaggle/input/pytorchtabnetpretraining/pytorch_tabnet-2.0.1-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.1


In [2]:
import numpy as np 
import pandas as pd 
import os
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
import random
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
from tqdm import tqdm
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import QuantileTransformer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold, MultilabelStratifiedShuffleSplit
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [3]:
def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        
seed_everything(42)

In [4]:
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/moa-make-foldsmoa-make-folds/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/moa-make-foldsmoa-make-folds/train_targets_nonscored.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [5]:
#使用PCA制造出一部分特征
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [6]:
for col in (GENES + CELLS):
    transformer = QuantileTransformer(n_quantiles=115, random_state=0, output_distribution="normal")   #50 75 100 125 150 25 
    vec_len = len(train_features[col].values)
    vec_len_test = len(test_features[col].values)
    raw_vec = train_features[col].values.reshape(vec_len, 1)
    transformer.fit(raw_vec)

    train_features[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test_features[col] = transformer.transform(test_features[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [7]:
train = train_features.merge(train_targets_scored, on='sig_id')
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)
target = train[train_targets_scored.columns[:207]]

In [8]:
train = train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)

target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [9]:
folds = train.copy()
folds.head()

Unnamed: 0,sig_id,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,fold_42,fold_0,fold_1,fold_2,fold_3
0,id_000644bb2,24,D1,1.13477,0.908153,-0.416081,-0.967866,-0.254718,-1.015497,-1.364912,...,0,0,0,0,0,6,1,3,3,5
1,id_000779bfc,72,D1,0.1196,0.681885,0.272164,0.079996,1.204608,0.685581,0.314454,...,0,0,0,0,0,0,0,3,6,3
2,id_000a6266a,48,D1,0.780234,0.944877,1.422334,-0.132034,-0.00745,1.492153,0.23541,...,0,0,0,0,0,0,5,3,3,1
3,id_0015fd391,48,D1,-0.735342,-0.274628,-0.438384,0.759605,2.389946,-0.859146,-2.287594,...,0,0,0,0,0,6,4,3,1,2
4,id_001626bd3,72,D2,-0.452845,-0.4773,0.971824,0.970578,1.463978,-0.871062,-0.376199,...,0,0,0,0,0,1,6,5,3,3


In [10]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 1086)
(21948, 1086)
(3624, 875)
(21948, 207)
(3982, 207)


In [11]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
def process_data(data):
    
    data['cp_time'] = lb.fit_transform(data['cp_time'])
    data['cp_dose'] = lb.fit_transform(data['cp_dose'])
    return data

feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['fold_42','sig_id','fold_0','fold_1','fold_2','fold_3']]
len(feature_cols)

874

In [12]:
def evals(model, X, y, verbose=True):
    with torch.no_grad():
        y_preds = model.predict(X)
        y_preds = torch.clamp(y_preds, 0.0,1.0).detach().numpy()
    score = log_loss_multi(y, y_preds)
    #print("Logloss = ", score)
    return y_preds, score


def inference_fn(model, X ,verbose=True):
    with torch.no_grad():
        y_preds = model.predict( X )
        y_preds = torch.sigmoid(torch.as_tensor(y_preds)).numpy()
    return y_preds

def log_loss_score(actual, predicted,  eps=1e-15): #eps=1e-15
    p1 = actual * np.log(predicted+eps)
    p0 = (1-actual) * np.log(1-predicted+eps)
    loss = p0 + p1

    return -loss.mean()

def log_loss_multi(y_true, y_pred):
    M = y_true.shape[1]
    results = np.zeros(M)
    for i in range(M):
        results[i] = log_loss_score(y_true[:,i], y_pred[:,i])
    return results.mean()

def check_targets(targets):
    ### check if targets are all binary in training set
    
    for i in range(targets.shape[1]):
        if len(np.unique(targets[:,i])) != 2:
            return False
    return True

def auc_multi(y_true, y_pred):
    M = y_true.shape[1]
    results = np.zeros(M)
    for i in range(M):
        try:
            results[i] = roc_auc_score(y_true[:,i], y_pred[:,i])
        except:
            pass
    return results.mean()

In [13]:
class Config(object):
    def __init__(self):
        self.num_class = len(target_cols)
        self.verbose=False
        self.seed = 0
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.EPOCHS = 300
        self.num_ensembling = 1
        # Parameters model
        self.cat_emb_dim=[1] * 2 #to choose
        self.cats_idx = list(range(2))
        self.cat_dims = [3,2]
        self.num_numericals= len(feature_cols)-2
        self.patience = 50
        self.batch_size=1024
        self.NFOLDS = 7
    
        # save
        self.save_name = "tabnet_raw_step1"
        
        self.strategy = "KFOLD" # 
cfg = Config()


In [14]:
from pytorch_tabnet.tab_model import TabNetRegressor

In [15]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import roc_auc_score, log_loss

class LogitsLogLoss(Metric):
    """
    LogLoss with sigmoid applied
    """

    def __init__(self):
        self._name = "logits_ll"
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """
        Compute LogLoss of predictions.

        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_score: np.ndarray
            Score matrix or vector

        Returns
        -------
            float
            LogLoss of predictions vs targets.
        """
        logits = 1 / (1 + np.exp(-y_pred))
        aux = (1-y_true)*np.log(1-logits+1e-15) + y_true*np.log(logits+1e-15)
        return np.mean(-aux)

In [16]:
def run_training(fold, seed):
    
    seed_everything(seed)
    train = process_data(folds)
    test_ = process_data(test)
    
    trn_idx = train[train[f'fold_{seed}'] != fold].index
    val_idx = train[train[f'fold_{seed}'] == fold].index
    
    train_df = train[train[f'fold_{seed}'] != fold].reset_index(drop=True)
    valid_df = train[train[f'fold_{seed}'] == fold].reset_index(drop=True)
    
    X_train, y_train = train_df[feature_cols].values, train_df[target_cols].values
    X_val, y_val = valid_df[feature_cols].values, valid_df[target_cols].values
            
    model = TabNetRegressor(n_d=24, n_a=64, n_steps=1, 
                            n_independent=1, n_shared=1,
                            gamma=1.2, lambda_sparse=0, 
                            cat_dims=cfg.cat_dims, 
                            cat_emb_dim=cfg.cat_emb_dim, 
                            cat_idxs=cfg.cats_idx, 
                            optimizer_fn=torch.optim.Adam,
                            optimizer_params=dict(lr=2e-2, weight_decay=1e-5), 
                            mask_type='entmax', 
                            device_name=cfg.device, 
                            scheduler_params=dict(mode='min', factor=0.1, patience=10), 
                            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau)
    
    model.fit(X_train=X_train, y_train=y_train, 
              eval_set=[(X_val, y_val)],
              eval_name = ["val"],
              eval_metric = ["logits_ll"],
              max_epochs=cfg.EPOCHS, 
              patience=cfg.patience, 
              batch_size=cfg.batch_size, 
              virtual_batch_size=128,
              num_workers=0, 
              drop_last=False, 
              loss_fn=torch.nn.functional.binary_cross_entropy_with_logits) 
    
    preds = model.predict(X_val)
    valid_preds = torch.sigmoid(torch.as_tensor(preds)).detach().cpu().numpy()
    score = log_loss_multi(y_val, preds)
    name = cfg.save_name + f"_fold{fold}_{seed}"
    model.save_model(name)
    
    oof = np.zeros((len(train), len(target_cols)))
    oof[val_idx] = valid_preds
    
    x_test = torch.as_tensor(test[feature_cols].values)
    predictions = model.predict(x_test)
    
    return oof, predictions

In [17]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))
    
    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)
        
        predictions += pred_ / NFOLDS
        oof += oof_
        
    return oof, predictions

In [18]:
# Averaging on multiple SEEDS
SEED = [42, 0, 1,2,3,4] #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

for seed in SEED:
    
    oof_, predictions_ = run_k_fold(cfg.NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)

train[target_cols] = oof
test[target_cols] = predictions

Device used : cuda
epoch 0  | loss: 0.39931 | val_logits_ll: 0.10102 |  0:00:03s
epoch 1  | loss: 0.03284 | val_logits_ll: 0.02737 |  0:00:04s
epoch 2  | loss: 0.02398 | val_logits_ll: 0.02206 |  0:00:06s
epoch 3  | loss: 0.02161 | val_logits_ll: 0.02114 |  0:00:07s
epoch 4  | loss: 0.02086 | val_logits_ll: 0.02078 |  0:00:09s
epoch 5  | loss: 0.02058 | val_logits_ll: 0.02052 |  0:00:11s
epoch 6  | loss: 0.02027 | val_logits_ll: 0.02036 |  0:00:12s
epoch 7  | loss: 0.02003 | val_logits_ll: 0.02021 |  0:00:14s
epoch 8  | loss: 0.0197  | val_logits_ll: 0.01983 |  0:00:16s
epoch 9  | loss: 0.0192  | val_logits_ll: 0.01977 |  0:00:17s
epoch 10 | loss: 0.01881 | val_logits_ll: 0.01915 |  0:00:19s
epoch 11 | loss: 0.01855 | val_logits_ll: 0.0191  |  0:00:20s
epoch 12 | loss: 0.01825 | val_logits_ll: 0.01916 |  0:00:22s
epoch 13 | loss: 0.01804 | val_logits_ll: 0.0188  |  0:00:24s
epoch 14 | loss: 0.01777 | val_logits_ll: 0.01894 |  0:00:25s
epoch 15 | loss: 0.01768 | val_logits_ll: 0.01862 |

KeyError: 'fold_4'

In [19]:
len(target_cols)

206

In [20]:
valid_results = train_targets_scored.drop(columns=target_cols+['fold_42','fold_0','fold_1','fold_2','fold_3']).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
valid_results = valid_results.add_prefix('pre_')
valid_results.rename(columns={'pre_sig_id':'sig_id'}, inplace=True)

In [21]:
oof = train_targets_scored.drop(columns=['fold_42','fold_0','fold_1','fold_2','fold_3'],axis=1)
oof = oof.merge(valid_results, on=['sig_id'], how='left')
oof.to_csv('moa_nn_oof.csv', index=False)

In [22]:
oof.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,pre_tropomyosin_receptor_kinase_inhibitor,pre_trpv_agonist,pre_trpv_antagonist,pre_tubulin_inhibitor,pre_tyrosine_kinase_inhibitor,pre_ubiquitin_specific_protease_inhibitor,pre_vegfr_inhibitor,pre_vitamin_b,pre_vitamin_d_receptor_agonist,pre_wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
score = 0
for i in (target_cols):
    score_ = log_loss(oof[i], oof['pre_'+i])
    score += score_ / 206
    
print("CV log_loss: ", score)

CV log_loss:  9.99200722162639e-16


In [24]:
auc_score = 0
for i in (target_cols):
    score_ = roc_auc_score(oof[i], oof['pre_'+i])
    score += score_ / target.shape[1]
    
print("CV AUC: ", score)

CV AUC:  0.995169082125608


In [25]:
sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub.to_csv('submission.csv', index=False)

KeyError: "['calcineurin_inhibitor', 'cholesterol_inhibitor', 'hsp_inhibitor', 'ribonucleoside_reductase_inhibitor', 'glutamate_inhibitor', 'laxative', 'tlr_antagonist', 'bacterial_cell_wall_synthesis_inhibitor', 'gonadotropin_receptor_agonist', 'flt3_inhibitor', 'antiviral', 'ikk_inhibitor', 'atp_synthase_inhibitor', 'aurora_kinase_inhibitor', 'fungal_squalene_epoxidase_inhibitor', 'anticonvulsant', 'nrf2_activator', 'dna_inhibitor', 'nicotinic_receptor_agonist', 'diuretic', 'ppar_receptor_antagonist', 'acetylcholine_receptor_agonist', 'hcv_inhibitor', 'glutamate_receptor_agonist', 'potassium_channel_antagonist', 'jak_inhibitor', '11-beta-hsd1_inhibitor', 'antihistamine', 'acetylcholinesterase_inhibitor', 'bacterial_30s_ribosomal_subunit_inhibitor', 'immunosuppressant', 'ras_gtpase_inhibitor', 'nfkb_inhibitor', 'bacterial_dna_gyrase_inhibitor', 'monoamine_oxidase_inhibitor', 'free_radical_scavenger', 'thrombin_inhibitor', 'sigma_receptor_antagonist', 'gamma_secretase_inhibitor', 'p-glycoprotein_inhibitor', 'glucocorticoid_receptor_agonist', 'autotaxin_inhibitor', 'trpv_agonist', 'antiprotozoal', 'transient_receptor_potential_channel_antagonist', 'mucolytic_agent', 'gaba_receptor_agonist', 'chloride_channel_blocker', 'corticosteroid_agonist', 'histamine_receptor_agonist', 'progesterone_receptor_antagonist', 'glutamate_receptor_antagonist', 'carbonic_anhydrase_inhibitor', 'lipoxygenase_inhibitor', 'prostaglandin_inhibitor', 'fatty_acid_receptor_agonist', 'vegfr_inhibitor', 'focal_adhesion_kinase_inhibitor', 'raf_inhibitor', 'dna_alkylating_agent', 'acat_inhibitor', 'opioid_receptor_antagonist', 'adrenergic_receptor_agonist', 'antifungal', 'acetylcholine_receptor_antagonist', 'atpase_inhibitor', 'bcr-abl_inhibitor', 'mek_inhibitor', 'opioid_receptor_agonist', 'tlr_agonist', 'alk_inhibitor', 'ppar_receptor_agonist', 'nitric_oxide_donor', 'integrin_inhibitor', 'fgfr_inhibitor', 'leukotriene_receptor_antagonist', 'hmgcr_inhibitor', 'rna_polymerase_inhibitor', 'radiopaque_medium', 'estrogen_receptor_antagonist', 'apoptosis_stimulant', 'tubulin_inhibitor', 'caspase_activator', 'estrogen_receptor_agonist', 'histone_lysine_methyltransferase_inhibitor', 'serotonin_receptor_agonist', 'pi3k_inhibitor', 'nitric_oxide_synthase_inhibitor', 'kit_inhibitor', 'catechol_o_methyltransferase_inhibitor', 'pdgfr_inhibitor', 'bacterial_membrane_integrity_inhibitor', 'gsk_inhibitor', 'tachykinin_antagonist', 'igf-1_inhibitor', 'cyclooxygenase_inhibitor', 'thymidylate_synthase_inhibitor', 'cannabinoid_receptor_antagonist', 'sodium_channel_inhibitor', 'dipeptidyl_peptidase_inhibitor', 'wnt_inhibitor', 'retinoid_receptor_antagonist', 'aromatase_inhibitor', 'hiv_inhibitor', 'egfr_inhibitor', 'erbb2_inhibitor', 'protein_kinase_inhibitor', 'sigma_receptor_agonist', 'src_inhibitor', 'bacterial_dna_inhibitor', 'progesterone_receptor_agonist', 'protein_phosphatase_inhibitor', 'topoisomerase_inhibitor', 'rho_associated_kinase_inhibitor', 'norepinephrine_reuptake_inhibitor', 'androgen_receptor_antagonist', 'steroid', 'atp-sensitive_potassium_channel_antagonist', 'atr_kinase_inhibitor', 'calcium_channel_blocker', 'cc_chemokine_receptor_antagonist', 'neuropeptide_receptor_antagonist', 'chk_inhibitor', 'hdac_inhibitor', 'histamine_receptor_antagonist', 'lxr_agonist', 'serotonin_reuptake_inhibitor', 'bacterial_50s_ribosomal_subunit_inhibitor', 'mdm_inhibitor', 'protein_synthesis_inhibitor', 'dihydrofolate_reductase_inhibitor', 'analgesic', 'p38_mapk_inhibitor', 'aldehyde_dehydrogenase_inhibitor', 'benzodiazepine_receptor_agonist', 'cytochrome_p450_inhibitor', 'tgf-beta_receptor_inhibitor', 'vitamin_d_receptor_agonist', 'tropomyosin_receptor_kinase_inhibitor', 'casein_kinase_inhibitor', 'ubiquitin_specific_protease_inhibitor', 'monoacylglycerol_lipase_inhibitor', 'gaba_receptor_antagonist', 'insulin_sensitizer', 'bcl_inhibitor', 'btk_inhibitor', 'mineralocorticoid_receptor_antagonist', 'pdk_inhibitor', 'phospholipase_inhibitor', 'elastase_inhibitor', 'beta_amyloid_inhibitor', 'sphingosine_receptor_agonist', 'tnf_inhibitor', 'pkc_inhibitor', 'antioxidant', 'protein_tyrosine_kinase_inhibitor', 'trpv_antagonist', 'potassium_channel_activator', 'cck_receptor_antagonist', 'vitamin_b', 'histone_lysine_demethylase_inhibitor', 'leukotriene_inhibitor', 'angiotensin_receptor_antagonist', 'dopamine_receptor_agonist', 'antibiotic', 'anesthetic_-_local', 'adenosine_receptor_agonist', 'bromodomain_inhibitor', 'cannabinoid_receptor_agonist', 'adenosine_receptor_antagonist', 'faah_inhibitor', 'proteasome_inhibitor', 'monopolar_spindle_1_kinase_inhibitor', 'imidazoline_receptor_agonist', 'akt_inhibitor', 'coagulation_factor_inhibitor', 'serotonin_receptor_antagonist', 'farnesyltransferase_inhibitor', 'bacterial_antifolate', 'chelating_agent', 'angiogenesis_inhibitor', '5-alpha_reductase_inhibitor', 'retinoid_receptor_agonist', 'syk_inhibitor', 'nitric_oxide_production_inhibitor', 'androgen_receptor_agonist', 'parp_inhibitor', 'membrane_integrity_inhibitor', 'prostanoid_receptor_antagonist', 'smoothened_receptor_antagonist', 'insulin_secretagogue', 'dopamine_receptor_antagonist', 'lipase_inhibitor', 'tyrosine_kinase_inhibitor', 'cholinergic_receptor_antagonist', 'mtor_inhibitor', 'adenylyl_cyclase_activator', 'ampk_activator', 'antiarrhythmic', 'orexin_receptor_antagonist', 'antimalarial', 'adrenergic_receptor_antagonist', 'atm_kinase_inhibitor', 'phosphodiesterase_inhibitor', 'cdk_inhibitor', 'anti-inflammatory'] not in index"

In [26]:
sub.head()

NameError: name 'sub' is not defined