In [1]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [2]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import QuantileTransformer

In [3]:
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/moa-make-foldsmoa-make-folds/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/moa-make-foldsmoa-make-folds/train_targets_nonscored.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [4]:
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [5]:
#归一化、
from sklearn.preprocessing import StandardScaler,MinMaxScaler
std = StandardScaler()
train_features[CELLS] = std.fit_transform(train_features[CELLS])
test_features[CELLS] = std.transform(test_features[CELLS])

std = StandardScaler()
train_features[GENES] = std.fit_transform(train_features[GENES])
test_features[GENES] = std.transform(test_features[GENES])

In [6]:
#RankGauss
for col in (GENES + CELLS):

    transformer = QuantileTransformer(n_quantiles=150,random_state=0, output_distribution="normal")  #30 50 75 100， 125，150
    vec_len = len(train_features[col].values)
    vec_len_test = len(test_features[col].values)
    raw_vec = train_features[col].values.reshape(vec_len, 1)
    transformer.fit(raw_vec)

    train_features[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test_features[col] = transformer.transform(test_features[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [7]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [8]:
train = train_features.merge(train_targets_scored, on='sig_id')
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)
target = train[train_targets_scored.columns[:207]]

In [9]:
train = train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)

target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [10]:
train_non_scored = train_features.merge(train_targets_nonscored, on='sig_id')
train_non_scored = train_non_scored[train_non_scored['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)
target_non_scored = train_non_scored[train_targets_nonscored.columns[:403]]
train_non_scored = train_non_scored.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)

In [11]:
target_non_scored_cols = target_non_scored.drop('sig_id', axis=1).columns.values.tolist()

In [12]:
folds = train.copy()
folds.head()

Unnamed: 0,sig_id,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,fold_42,fold_0,fold_1,fold_2,fold_3
0,id_000644bb2,24,D1,1.136019,0.907605,-0.415851,-0.967186,-0.254175,-1.016241,-1.367177,...,0,0,0,0,0,6,1,3,3,5
1,id_000779bfc,72,D1,0.119324,0.681886,0.272338,0.080009,1.204341,0.686643,0.314242,...,0,0,0,0,0,0,0,3,6,3
2,id_000a6266a,48,D1,0.780214,0.94554,1.425405,-0.132057,-0.006972,1.490785,0.235571,...,0,0,0,0,0,0,5,3,3,1
3,id_0015fd391,48,D1,-0.735392,-0.274322,-0.438155,0.759388,2.46055,-0.858765,-2.2789,...,0,0,0,0,0,6,4,3,1,2
4,id_001626bd3,72,D2,-0.452689,-0.477644,0.972584,0.970047,1.464425,-0.870447,-0.376373,...,0,0,0,0,0,1,6,5,3,3


In [13]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 1086)
(21948, 1086)
(3624, 875)
(21948, 207)
(3982, 207)


In [14]:
class MoADataset:
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float),
            'y' : torch.tensor(self.targets[idx, :], dtype=torch.float)            
        }
        return dct
    
class TestDataset:
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float)
        }
        return dct
    

In [15]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    model.train()
    final_loss = 0
    
    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
#         print(inputs.shape)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        
        
        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    
    return final_loss


def valid_fn(model, loss_fn, dataloader, device):
    model.eval()
    final_loss = 0
    valid_preds = []
    
    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        
        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    model.eval()
    preds = []
    
    for data in dataloader:
        inputs = data['x'].to(device)

        with torch.no_grad():
            outputs = model(inputs)
        
        preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    preds = np.concatenate(preds)
    
    return preds



In [16]:
import torch
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

class SmoothBCEwLogits(_WeightedLoss):
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        loss = F.binary_cross_entropy_with_logits(inputs, targets,self.weight)

        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()

        return loss

# Model

In [17]:
class Model(nn.Module):
    def __init__(self, num_features, num_targets, hidden_size):
        super(Model, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))
        
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.45)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))
        
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.1)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))
    
    def recalibrate_layer(self, layer):
        if(torch.isnan(layer.weight_v).sum() > 0):
            print ('recalibrate layer.weight_v')
            layer.weight_v = torch.nn.Parameter(torch.where(torch.isnan(layer.weight_v), torch.zeros_like(layer.weight_v), layer.weight_v))
            layer.weight_v = torch.nn.Parameter(layer.weight_v + 1e-7)

        if(torch.isnan(layer.weight).sum() > 0):
            print ('recalibrate layer.weight')
            layer.weight = torch.where(torch.isnan(layer.weight), torch.zeros_like(layer.weight), layer.weight)
            layer.weight += 1e-7
    
    def forward(self, x):
        x = self.batch_norm1(x)
        self.recalibrate_layer(self.dense1)
        x = F.leaky_relu(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        self.recalibrate_layer(self.dense2)
        x = F.leaky_relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        
        x = self.dense3(x)
        self.recalibrate_layer(self.dense3)
        return x
    
    
class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # true_dist = pred.data.clone()
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))    

# Preprocessing steps

In [18]:
def process_data(data):
    data = pd.get_dummies(data, columns=['cp_time','cp_dose'])
    return data

feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['fold_42','sig_id','fold_0','fold_1','fold_2','fold_3']]
len(feature_cols)

877

In [19]:
# HyperParameters
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 200
BATCH_SIZE = 784
LEARNING_RATE = 1e-2
WEIGHT_DECAY = 1e-5
NFOLDS = 7           
EARLY_STOPPING_STEPS = 30
EARLY_STOP = True

num_features=len(feature_cols)
num_targets=len(target_cols)
num_non_scored_targets=len(target_non_scored_cols)
hidden_size=700   # 1500 1300 1400 1000 900 800 1700  20000


# Single fold training

In [20]:
def run_training(fold, seed):
    
    seed_everything(seed)
    
    train = process_data(folds)
    test_ = process_data(test)
    
    trn_idx = train[train[f'fold_{seed}'] != fold].index
    val_idx = train[train[f'fold_{seed}'] == fold].index
    
    train_df = train[train[f'fold_{seed}'] != fold].reset_index(drop=True)
    valid_df = train[train[f'fold_{seed}'] == fold].reset_index(drop=True)
    
    x_train, y_train  = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid =  valid_df[feature_cols].values, valid_df[target_cols].values
    
    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    
    model.to(DEVICE)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    #scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.15, div_factor=1e3, max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    scheduler = torch.torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=7, eps=1e-08)
    
    loss_fn = nn.BCEWithLogitsLoss()
    loss_tr = SmoothBCEwLogits(smoothing =0.00)
    
    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0
   
    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf
    
    for epoch in range(EPOCHS):
        
        train_loss = train_fn(model, optimizer,scheduler, loss_tr, trainloader, DEVICE)
       # print(f"FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}")
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(f"FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}")
        scheduler.step(valid_loss)
        
        if valid_loss < best_loss:
            early_step=0
            best_loss = valid_loss
           # oof[val_idx] = valid_preds
            print(f'best_loss is: {best_loss}, save_model{fold}_{seed}_pth!')
            torch.save(model.state_dict(), f"FOLD{fold}_{seed}.pth")
        
        elif(EARLY_STOP == True):
            
            early_step += 1
            if (early_step >= early_stopping_steps):
                break
            
    
    #--------------------- PREDICTION---------------------
    del model
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,

    )
    
    model.load_state_dict(torch.load(f"FOLD{fold}_{seed}.pth"))
    model.to(DEVICE)
    
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    

    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
    oof[val_idx] = valid_preds
    print(f'load model_{fold}_{seed},valie_loss is {valid_loss}')
    
    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)
    
    return oof, predictions


In [21]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))
    
    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)
        
        predictions += pred_ / NFOLDS
        oof += oof_
        
    return oof, predictions

In [22]:
# Averaging on multiple SEEDS
SEED = [42, 0] #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

for seed in SEED:
    
    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)

train[target_cols] = oof
test[target_cols] = predictions


FOLD: 0, EPOCH: 0, valid_loss: 0.04767447616904974
best_loss is: 0.04767447616904974, save_model0_42_pth!
FOLD: 0, EPOCH: 1, valid_loss: 0.022641637828201056
best_loss is: 0.022641637828201056, save_model0_42_pth!
FOLD: 0, EPOCH: 2, valid_loss: 0.022541329730302095
best_loss is: 0.022541329730302095, save_model0_42_pth!
FOLD: 0, EPOCH: 3, valid_loss: 0.02105805790051818
best_loss is: 0.02105805790051818, save_model0_42_pth!
FOLD: 0, EPOCH: 4, valid_loss: 0.019458835944533348
best_loss is: 0.019458835944533348, save_model0_42_pth!
FOLD: 0, EPOCH: 5, valid_loss: 0.018745516426861286
best_loss is: 0.018745516426861286, save_model0_42_pth!
FOLD: 0, EPOCH: 6, valid_loss: 0.018458887934684753
best_loss is: 0.018458887934684753, save_model0_42_pth!
FOLD: 0, EPOCH: 7, valid_loss: 0.018153571523725986
best_loss is: 0.018153571523725986, save_model0_42_pth!
FOLD: 0, EPOCH: 8, valid_loss: 0.018131399992853403
best_loss is: 0.018131399992853403, save_model0_42_pth!
FOLD: 0, EPOCH: 9, valid_loss: 0

In [23]:
len(target_cols)

206

In [24]:
valid_results = train_targets_scored.drop(columns=target_cols+['fold_42','fold_0','fold_1','fold_2','fold_3']).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
valid_results = valid_results.add_prefix('pre_')
valid_results.rename(columns={'pre_sig_id':'sig_id'}, inplace=True)

In [25]:
oof = train_targets_scored.drop(columns=['fold_42','fold_0','fold_1','fold_2','fold_3'],axis=1)
oof = oof.merge(valid_results, on=['sig_id'], how='left')
oof.to_csv('moa_nn_oof.csv', index=False)

In [26]:
oof.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,pre_tropomyosin_receptor_kinase_inhibitor,pre_trpv_agonist,pre_trpv_antagonist,pre_tubulin_inhibitor,pre_tyrosine_kinase_inhibitor,pre_ubiquitin_specific_protease_inhibitor,pre_vegfr_inhibitor,pre_vitamin_b,pre_vitamin_d_receptor_agonist,pre_wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0.00027,0.000546,0.002515,0.00276,0.001421,0.000378,0.001013,0.002308,0.000326,0.002011
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0.001195,0.001772,0.002058,0.004516,0.003325,0.000365,0.002068,0.003208,0.001955,0.002418
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0.000291,0.001293,0.00224,0.001661,0.00484,0.000692,0.05458,0.000669,0.000502,0.001214
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0.000452,0.003896,0.001684,0.179265,0.00382,0.000479,0.001704,0.001057,0.000249,0.000342
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0.000938,0.001248,0.003526,0.003252,0.001624,0.000558,0.001656,0.002112,0.00056,0.001276


In [27]:
score = 0
for i in (target_cols):
    score_ = log_loss(oof[i], oof['pre_'+i])
    score += score_ / target.shape[1]
    
print("CV log_loss: ", score)

CV log_loss:  0.015564507895729994


In [28]:
sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub.to_csv('submission.csv', index=False)

In [29]:
sub.shape

(3982, 207)