In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the three competition tables plus the submission template, in the same
# order as the names on the left-hand side.
train_features, train_targets, test_features, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/test_features.csv',
        '../input/lish-moa/sample_submission.csv',
    )
)

In [None]:
def preprocess(df):
    """Encode the categorical treatment columns and drop the row identifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``sig_id``, ``cp_type`` and ``cp_dose``
        columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) without ``sig_id`` in which
        ``cp_type`` is encoded as ``trt_cp -> 0`` / ``ctl_vehicle -> 1`` and
        ``cp_dose`` as ``D1 -> 0`` / ``D2 -> 1``. Values outside these
        mappings become NaN.

    Raises
    ------
    KeyError
        If one of the three required columns is missing.
    """
    # Replace each column as a whole instead of writing through
    # ``df.loc[:, col] = ...``: the latter sets values in place on recent
    # pandas versions, which keeps the original ``object`` dtype and makes a
    # later ``DataFrame.to_numpy()`` return an object array.
    return df.drop(columns='sig_id').assign(
        cp_type=df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1}),
        cp_dose=df['cp_dose'].map({'D1': 0, 'D2': 1}),
    )


train_features = preprocess(train_features)
test_features = preprocess(test_features)

# Keep only the treated rows (cp_type == 0). Per the original author's note,
# the dropped rows are the ctl_vehicle ones (cp_type == 1): they have no MoA
# and all belong to drug cacb2b860.
is_treated = train_features['cp_type'] == 0
train_targets = (
    train_targets
    .loc[is_treated]
    .reset_index(drop=True)
    .drop(columns='sig_id')
)
train_features = train_features.loc[is_treated].reset_index(drop=True)
train_features.head()

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Variance-based feature selection over every column from position 3 onwards.
# Presumably positions 0-2 are cp_type, cp_time and cp_dose after preprocess();
# confirm against the CSV column order.
features_np = train_features.to_numpy()
vt_selector = VarianceThreshold(threshold=0.8)
vt_selector.fit(features_np[:, 3:])

# get_support() indexes into the sliced array, so shift by 3 to get positions
# in the full feature matrix. Columns 1 and 2 are always kept; column 0 is
# never part of the model input.
top_feats = [1, 2] + (vt_selector.get_support(indices=True) + 3).tolist()  # plain Python ints

print(len(top_feats))
# '\u2026' is the ellipsis character; the escape keeps the source immune to
# the encoding round-trip that had turned it into mojibake.
print(top_feats[:5], '\u2026', top_feats[-5:])
del features_np

In [None]:
def create_folds(num_starts, num_splits, data_dir='../input/lish-moa', drug_threshold=18):
    """Build drug-aware, multilabel-stratified fold assignments for repeated CV.

    Only ``trt_cp`` rows are considered. Drugs that occur at most
    ``drug_threshold`` times are assigned to a fold as a whole (stratified on
    the per-drug mean of the targets), so all their rows share one fold. Rows
    of more frequent drugs are stratified individually.

    Parameters
    ----------
    num_starts : int
        Number of repeats; repeat ``s`` uses ``random_state=s``.
    num_splits : int
        Number of folds per repeat.
    data_dir : str, optional
        Directory holding ``train_features.csv``, ``train_targets_scored.csv``
        and ``train_drug.csv``.
    drug_threshold : int, optional
        Largest row count for which a drug is still kept together in one fold.

    Returns
    -------
    numpy.ndarray
        ``int8`` array of shape ``(num_starts, n_treated_rows)``; entry
        ``[s, i]`` is the validation fold of the i-th ``trt_cp`` row (in file
        order) for repeat ``s``.
    """
    # LOAD FILES
    # Only cp_type is needed from the (wide) feature file, so skip the rest.
    cp_type = pd.read_csv(f'{data_dir}/train_features.csv', usecols=['cp_type'])['cp_type']
    scored = pd.read_csv(f'{data_dir}/train_targets_scored.csv')
    drug = pd.read_csv(f'{data_dir}/train_drug.csv')

    is_treated = cp_type == 'trt_cp'
    scored = scored.loc[is_treated, :]
    drug = drug.loc[is_treated, :]
    targets = scored.columns[1:]  # everything after sig_id
    scored = scored.merge(drug, on='sig_id', how='left')

    # LOCATE DRUGS
    vc = scored.drug_id.value_counts()
    vc1 = vc.loc[vc <= drug_threshold].index.sort_values()
    vc2 = vc.loc[vc > drug_threshold].index.sort_values()

    # Both inputs to the splitters are independent of the seed.
    rare_drug_means = scored.groupby('drug_id')[targets].mean().loc[vc1]
    frequent_rows = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop=True)

    folds = []
    for seed in range(num_starts):

        # STRATIFY RARE DRUGS: one fold per drug_id
        drug_to_fold = {}
        skf = MultilabelStratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)
        for fold, (idxT, idxV) in enumerate(skf.split(rare_drug_means, rare_drug_means[targets])):
            drug_to_fold.update({k: fold for k in rare_drug_means.index[idxV].values})

        # STRATIFY FREQUENT DRUGS: one fold per sig_id
        sig_to_fold = {}
        skf = MultilabelStratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)
        for fold, (idxT, idxV) in enumerate(skf.split(frequent_rows, frequent_rows[targets])):
            sig_to_fold.update({k: fold for k in frequent_rows.sig_id[idxV].values})

        # ASSIGN FOLDS: drug-level mapping first, row-level mapping for the rest.
        # Built as a standalone Series so the shared frame is never mutated.
        fold_of_row = scored.drug_id.map(drug_to_fold)
        unassigned = fold_of_row.isna()
        fold_of_row.loc[unassigned] = scored.loc[unassigned, 'sig_id'].map(sig_to_fold)
        folds.append(fold_of_row.astype('int8').values)

    return np.stack(folds)

In [None]:
def set_seed(seed):
    """Seed NumPy and PyTorch and switch cuDNN to deterministic mode.

    Parameters
    ----------
    seed : int
        Value passed to both ``np.random.seed`` and ``torch.manual_seed``.
    """
    # Random number generators.
    np.random.seed(seed)
    torch.manual_seed(seed)

    # cuDNN: deterministic kernels, no auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
# Cross-validation / training schedule.
N_FOLDS = 6     # folds per repeat
N_STARTS = 5    # repeats, each with its own seed
N_EPOCHS = 50   # epochs per fold
random_state = 42  # NOTE(review): not referenced in the visible cells

# Mini-batch sizes; validation/inference batches are four times larger.
batch_size = 128
val_batch_size = 4 * batch_size

# Target column names and their count.
targets = list(train_targets.columns)
ntargets = len(train_targets.columns)

# Binary cross-entropy on probabilities.
criterion = nn.BCELoss()

# Use the first GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class MoaModel(nn.Module):
    """Three-layer feed-forward network producing per-target probabilities.

    Each stage is BatchNorm1d -> Dropout -> weight-normalised Linear. The two
    hidden stages use ReLU and the output stage a sigmoid, so the result can
    be fed to ``nn.BCELoss`` directly.

    Parameters
    ----------
    num_columns : int
        Number of input features.
    num_targets : int, optional
        Number of output probabilities per sample (default 206).
    hidden_size : int, optional
        Width of both hidden layers (default 1048).
    """

    def __init__(self, num_columns, num_targets=206, hidden_size=1048):
        super().__init__()
        # Attribute names and creation order are kept as they were, so saved
        # checkpoints and seeded initialisation are unaffected.
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, hidden_size))

        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.5)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))

        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.5)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))

    def forward(self, x):
        """Map a ``(batch, num_columns)`` float tensor to ``(batch, num_targets)`` probabilities."""
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.relu(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        # torch.sigmoid rather than F.sigmoid, which some PyTorch releases
        # flag as deprecated.
        return torch.sigmoid(self.dense3(x))

In [None]:
# dataset class
class MoaDataset(Dataset):
    """Array-backed dataset yielding ``(features, target)`` pairs as float tensors.

    Parameters
    ----------
    df : numpy.ndarray
        2-D feature matrix (despite the name, it is indexed as
        ``df[:, feats_idx]``, so a NumPy array is expected).
    targets : array-like or None
        Target matrix with one row per sample; only used in ``'train'`` mode.
    feats_idx : sequence of int
        Column positions of ``df`` to keep.
    mode : {'train', 'test'}, optional
        ``'train'`` items are ``(features, targets)``; ``'test'`` items are
        ``(features, 0)``.

    Raises
    ------
    ValueError
        If ``mode`` is unknown, or the number of target rows does not match
        the number of feature rows in ``'train'`` mode.
    """

    _MODES = ('train', 'test')

    def __init__(self, df, targets, feats_idx, mode='train'):
        # Fail early: an unknown mode used to make __getitem__ return None.
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")
        self.mode = mode
        self.feats = feats_idx
        # Convert to float32 once instead of on every item access; this also
        # accepts object-dtype arrays that hold plain numbers.
        self.data = np.asarray(df[:, feats_idx], dtype=np.float32)
        if mode == 'train':
            self.targets = np.asarray(targets, dtype=np.float32)
            if len(self.targets) != len(self.data):
                raise ValueError(
                    f"got {len(self.data)} feature rows but {len(self.targets)} target rows"
                )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        features = torch.tensor(self.data[idx])
        if self.mode == 'train':
            return features, torch.tensor(self.targets[idx])
        # 'test' mode: constant placeholder so batches keep the (x, y) shape.
        return features, 0

In [None]:
folds_cv = create_folds(N_STARTS, N_FOLDS)

# Number of rows per fold for the first repeat. Labels and counts are paired
# positionally; indexing ``counts`` by the label itself is only correct when
# the labels happen to be exactly 0..k-1.
unique, counts = np.unique(folds_cv[0], return_counts=True)
for fold_label, fold_size in zip(unique, counts):
    print(fold_label, fold_size)
del unique
del counts

In [None]:
# Train one model per (repeat, fold) and checkpoint the weights of the epoch
# with the lowest validation loss.

# Loop-invariant: convert the frames to arrays once instead of four times per fold.
train_x_np = train_features.to_numpy()
train_y_np = train_targets.to_numpy()

for seed in range(N_STARTS):
    print(f'Train seed {seed}')
    set_seed(seed)

    for fold in range(N_FOLDS):
        te_idx = folds_cv[seed] == fold
        tr_idx = ~te_idx

        xtrain, ytrain = train_x_np[tr_idx], train_y_np[tr_idx]
        xval, yval = train_x_np[te_idx], train_y_np[te_idx]

        train_set = MoaDataset(xtrain, ytrain, top_feats)
        val_set = MoaDataset(xval, yval, top_feats)

        dataloaders = {
            'train': DataLoader(train_set, batch_size=batch_size, shuffle=True),
            'val': DataLoader(val_set, batch_size=val_batch_size, shuffle=False)
        }

        model = MoaModel(len(top_feats)).to(device)
        checkpoint_path = f'repeat:{seed}_Fold:{fold+1}.pt'
        optimizer = optim.Adam(model.parameters(), weight_decay=1e-5)
        # Cut the learning rate tenfold once the validation loss stalls for 3 epochs.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                         factor=0.1, patience=3,
                                                         eps=1e-4, verbose=True)
        best_loss = {'train': np.inf, 'val': np.inf}

        for epoch in range(N_EPOCHS):
            epoch_loss = {'train': 0.0, 'val': 0.0}

            for phase in ['train', 'val']:
                is_train = phase == 'train'
                model.train(is_train)  # train(False) is equivalent to eval()

                loss_sum = 0.0
                n_seen = 0

                for x, y in dataloaders[phase]:
                    x, y = x.to(device), y.to(device)

                    with torch.set_grad_enabled(is_train):
                        # Named batch_preds so the module-level ``preds``
                        # accumulator used for test predictions is not clobbered.
                        batch_preds = model(x)
                        loss = criterion(batch_preds, y)

                        if is_train:
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()

                    # Weight each batch mean by its size so the smaller last
                    # batch does not skew the epoch average.
                    loss_sum += loss.item() * x.size(0)
                    n_seen += x.size(0)

                epoch_loss[phase] = loss_sum / max(n_seen, 1)

            print("Epoch {}/{}   -   loss: {:5.5f}   -   val_loss: {:5.5f}".format(epoch+1, N_EPOCHS, epoch_loss['train'], epoch_loss['val']))

            scheduler.step(epoch_loss['val'])

            # epoch_loss is a fresh dict every epoch, so keeping a reference is safe.
            if epoch_loss['val'] < best_loss['val']:
                best_loss = epoch_loss
                torch.save(model.state_dict(), checkpoint_path)
                
                
                
                

In [None]:
# Test features as a plain array, plus zero-initialised result buffers.
test_np = test_features.to_numpy()

n_train_rows = len(train_features)
oof = np.zeros((n_train_rows, N_STARTS, ntargets))   # out-of-fold predictions per repeat
oof_targets = np.zeros((n_train_rows, ntargets))     # matching ground truth
preds = np.zeros((test_np.shape[0], ntargets))       # test predictions, accumulated over repeats

In [None]:
def mean_log_loss(y_true, y_pred):
    """Column-wise log loss summary over the module-level ``targets`` list.

    Parameters
    ----------
    y_true, y_pred : 2-D arrays
        Ground truth and predicted probabilities; column ``i`` corresponds to
        ``targets[i]``.

    Returns
    -------
    tuple
        ``(mean_loss, (worst_target, worst_loss), per_target_losses)`` where
        ``per_target_losses`` maps each target name to its log loss and
        ``worst_target`` is ``None`` if no loss is strictly greater than 0.
    """
    column_losses = [
        log_loss(y_true[:, col], y_pred[:, col].astype(float), labels=[0, 1])
        for col in range(len(targets))
    ]
    per_target = dict(zip(targets, column_losses))

    # First target with the strictly largest loss wins.
    worst_name, worst_value = None, 0.
    for name, value in zip(targets, column_losses):
        if value > worst_value:
            worst_name, worst_value = name, value

    return np.mean(column_losses), (worst_name, worst_value), per_target

In [None]:
# Out-of-fold (OOF) and test inference with the checkpoints saved during training.
# NOTE: this cell consumes the freshly allocated ``oof`` / ``preds`` buffers
# (``oof`` is collapsed to 2-D at the end, ``preds`` is accumulated in place),
# so re-run the allocation cell before re-running this one.
worst_targets_in_seed = []
all_targets_ll_in_seed = []

# Loop-invariant array views of the training data.
train_x_np = train_features.to_numpy()
train_y_np = train_targets.to_numpy()

for seed in range(N_STARTS):
    print(f"Inference for seed {seed}")
    # Indexed by ORIGINAL row position. Fold membership differs between
    # repeats, so predictions stored in fold-concatenation order cannot be
    # averaged across repeats.
    seed_oof = np.zeros((len(train_x_np), ntargets))
    seed_preds = np.zeros((len(test_np), ntargets, N_FOLDS))

    for fold in range(N_FOLDS):
        te_idx = folds_cv[seed] == fold
        xval = train_x_np[te_idx]
        yval = train_y_np[te_idx]

        val_set = MoaDataset(xval, yval, top_feats)
        test_set = MoaDataset(test_np, None, top_feats, mode='test')

        dataloaders = {
            'val': DataLoader(val_set, batch_size=val_batch_size, shuffle=False),
            'test': DataLoader(test_set, batch_size=val_batch_size, shuffle=False)
        }

        checkpoint_path = f'repeat:{seed}_Fold:{fold+1}.pt'
        model = MoaModel(len(top_feats)).to(device)
        # map_location lets GPU-trained checkpoints load on a CPU-only session.
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        model.eval()

        fold_outputs = {'val': [], 'test': []}
        with torch.no_grad():
            for phase in ['val', 'test']:
                for x, _ in dataloaders[phase]:
                    fold_outputs[phase].append(model(x.to(device)).cpu())

        # Loaders are unshuffled, so the concatenated batches line up with te_idx.
        seed_oof[te_idx] = torch.cat(fold_outputs['val'], dim=0).numpy()
        seed_preds[:, :, fold] = torch.cat(fold_outputs['test'], dim=0).numpy()

    seed_preds = np.mean(seed_preds, axis=2)

    # Evaluate once and reuse all three results.
    seed_score, seed_worst, seed_all_ll = mean_log_loss(train_y_np, seed_oof)
    print("Score for this seed {:5.5f}".format(seed_score))
    worst_targets_in_seed.append(seed_worst)
    all_targets_ll_in_seed.append(seed_all_ll)
    oof[:, seed, :] = seed_oof
    preds += seed_preds / N_STARTS

# Every repeat is now in original row order, so the ground truth is simply the
# target matrix and the repeats can be averaged row by row.
oof_targets = train_y_np
oof = np.mean(oof, axis=1)
print("Overall score is {:5.5f}".format(mean_log_loss(oof_targets, oof)[0]))

In [None]:
# Fill the submission template with the averaged test predictions, then zero
# every target for rows whose encoded cp_type is 1 before writing the file.
is_control = test_features['cp_type'] == 1
ss[targets] = preds
ss.loc[is_control, targets] = 0
ss.to_csv('submission.csv', index=False)