# Installing dependencies

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import os, re, random, gc
from tqdm import tqdm
from glob import glob

from datetime import datetime
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss
from sklearn.decomposition import PCA

from sklearn.preprocessing import QuantileTransformer, LabelEncoder

import torch 
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset

import warnings
warnings.filterwarnings('ignore')

# Config

In [None]:
class config:
    """Run settings shared by the training cells of this notebook."""

    # ---- Training ----
    num_folds = 5         # number of folds per K-fold split
    num_workers = 8       # DataLoader worker processes
    batch_size = 128
    num_epochs = 30

    # ---- LR scheduling ----
    # NOTE(review): neither value is read by the code visible in this file.
    step_scheduler = True
    lr = 1e-4

    # ---- Miscellaneous ----
    seed = 2020           # global seed handed to seed_everything
    verbose = True
    verbose_step = 5      # progress is printed every `verbose_step` steps

    seeds = [0, 42]       # one K-fold split is trained per entry

# Data manipulation

In [None]:
def encode_labels(df):
    """Replace the categorical columns of ``df`` with integer codes.

    ``cp_time``, ``cp_dose`` and ``cp_type`` are each label-encoded
    independently. The frame is modified in place and also returned.

    NOTE(review): the encoder is fitted on whatever frame is passed in, so
    codes only agree between two frames when both contain the same set of
    values in each column -- confirm for train vs. test.
    """
    for column in ('cp_time', 'cp_dose', 'cp_type'):
        df[column] = LabelEncoder().fit_transform(df[column])

    return df

In [None]:
DATA_PATH = '../input/lish-moa/'

# CSV files read from DATA_PATH.
TRAIN_FEATURES_PATH = f'{DATA_PATH}train_features.csv'
TEST_FEATURES_PATH = f'{DATA_PATH}test_features.csv'
TRAIN_TARGETS_PATH = f'{DATA_PATH}train_targets_scored.csv'

# Loading data: train features, train targets, then test features.
features_df, targets_df, test_features_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_FEATURES_PATH, TRAIN_TARGETS_PATH, TEST_FEATURES_PATH)
)

In [None]:
# Encoding features: categorical columns become integer codes in both frames.
features_df, test_features_df = (
    encode_labels(features_df),
    encode_labels(test_features_df),
)

# Remove control groups from features_df.
# NOTE(review): assumes the encoded cp_type value 0 marks control rows -- confirm.
features_df = features_df.loc[features_df['cp_type'].ne(0)]

# Keep exactly the target rows that survived the filter, then renumber both
# frames from 0 so row labels and positions coincide.
targets_df = targets_df.loc[features_df.index].reset_index(drop=True)
features_df = features_df.reset_index(drop=True)

assert len(features_df) == len(targets_df), 'Dataframes do not have the same length'

In [None]:
# Transforming feature distributions

# Column groups are picked by substring match on the column name.
gene_features = [name for name in features_df.columns if 'g-' in name]
cell_features = [name for name in features_df.columns if 'c-' in name]
cat_features = ['cp_type', 'cp_time', 'cp_dose']

for col in gene_features + cell_features:
    # One transformer per column: fitted on the training values only and then
    # applied unchanged to the matching test column.
    transformer = QuantileTransformer()

    raw_vec = features_df[col].values.reshape(-1, 1)
    transformer.fit(raw_vec)

    features_df[col] = transformer.transform(raw_vec).ravel()
    test_features_df[col] = transformer.transform(
        test_features_df[col].values.reshape(-1, 1)
    ).ravel()

In [None]:
# Preview the first rows of the transformed training features.
features_df.head()

# Utils

In [None]:
def seed_everything(seed):
    """Seed every random number generator used here, for reproducible runs.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all visible GPUs, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from run
    # to run, which works against the deterministic setting above.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, up front, with the global seed from config.
seed_everything(config.seed)

In [None]:
class AverageMeter(object):
    """Keeps the most recent value and the running weighted mean of a metric.

    Attributes (all reset to 0 by ``reset``):
        val: last value passed to ``update``.
        sum: weighted sum of all values seen.
        count: total weight seen.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard everything accumulated so far."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` (e.g. a batch size) and refresh ``avg``."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

# Dataset

### Dataset class

In [None]:
class MoADataset(Dataset):
    """Row-wise dataset over a feature frame and, when training, a target frame.

    Args:
        features: DataFrame whose every column is a numeric model input.
        targets: DataFrame of labels; only read when ``train`` is true. Its
            first column is skipped when building a sample.
        train: When true each sample also carries a ``'targets'`` tensor.
    """

    def __init__(self, features, targets=None, train=True):
        super().__init__()
        self.train = train
        self.features = features.values

        if self.train:
            self.targets = targets.values

    def __len__(self):
        return len(self.features)

    def __getitem__(self, item):
        """Return a dict with ``'features'`` and, in train mode, ``'targets'``."""
        row = self.features[item, :].astype(np.float32)
        sample = {'features': torch.tensor(row, dtype=torch.float)}

        if self.train:
            # Column 0 of the targets array is dropped before the float cast.
            labels = self.targets[item, 1:].astype(np.float32)
            sample['targets'] = torch.tensor(labels, dtype=torch.float)

        return sample

# Model

In [None]:
class BaselineModel(nn.Module):
    """Three-stage fully connected network producing 206 logits per row.

    Every stage is BatchNorm -> Dropout -> weight-normalised Linear; the first
    two stages end with a ReLU, the last one returns raw logits. The input
    width is the combined length of the module-level ``gene_features``,
    ``cell_features`` and ``cat_features`` lists.
    """

    def __init__(self):
        super(BaselineModel, self).__init__()

        self.num_features = len(gene_features + cell_features + cat_features)

        self.block1 = self._make_block(self.num_features, 2048, 0.2, activation=True)
        self.block2 = self._make_block(2048, 1024, 0.5, activation=True)
        self.block3 = self._make_block(1024, 206, 0.5, activation=False)

    @staticmethod
    def _make_block(in_dim, out_dim, drop_p, activation):
        """Build one BatchNorm/Dropout/Linear stage, optionally followed by ReLU."""
        layers = [
            nn.BatchNorm1d(in_dim),
            nn.Dropout(drop_p),
            nn.utils.weight_norm(nn.Linear(in_dim, out_dim)),
        ]
        if activation:
            layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def forward(self, inputs):
        """Run ``inputs`` through the three stages in order and return logits."""
        x = inputs
        for stage in (self.block1, self.block2, self.block3):
            x = stage(x)
        return x

# Fitter

In [None]:
class Fitter:
    """Runs the train/validate loop for one model on one (seed, fold) split.

    Checkpoints are written under ``base_dir``: ``last-checkpoint.bin`` after
    every training epoch and ``best-loss-fold-{fold}-seed-{seed}.bin`` whenever
    the validation loss improves. Every logged message is appended to
    ``log_path``.
    """

    def __init__(self, model, seed, fold, device, config):
        """Set up optimizer, LR scheduler, loss function and bookkeeping.

        Args:
            model: ``nn.Module`` to train. It is not moved to ``device`` here.
            seed: Seed identifier; only used in the best-checkpoint filename.
            fold: Fold index; used in the checkpoint filename and plot title.
            device: ``torch.device`` that batches are moved to.
            config: Object exposing ``num_epochs``, ``verbose`` and
                ``verbose_step``.
        """
        self.config = config
        self.model = model
        self.seed = seed
        self.device = device
        self.fold = fold

        self.epoch = 0

        self.history = {
            'train_history_loss': [],
            'val_history_loss': [],
        }

        self.base_dir = './'
        self.log_path = f'{self.base_dir}/log.txt'

        self.best_loss = float('inf')

        # NOTE(review): Adam runs with its default learning rate here;
        # `config.lr` is not consulted -- confirm that this is intended.
        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            weight_decay=1e-5
        )

        # NOTE(review): `verbose` is reported as deprecated by recent PyTorch
        # releases -- confirm against the installed version.
        self.scheduler = lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='min',
            factor=0.1,
            patience=3,
            eps=1e-4,
            verbose=True
        )

        self.criterion = nn.BCEWithLogitsLoss().to(self.device)
        self.log(f'Fitter prepared. Training on {self.device}')

    def fit(self, train_loader, valid_loader):
        """Train for ``config.num_epochs`` epochs, validating after each one.

        Args:
            train_loader: Iterable of batches with ``'features'``/``'targets'``.
            valid_loader: Same structure, used for validation.

        Returns:
            The validation predictions of the epoch with the lowest validation
            loss, i.e. the ones produced by the weights stored in the
            best-loss checkpoint. If no epoch improved on ``best_loss`` the
            predictions of the last epoch are returned; ``None`` when no epoch
            was run.
        """
        best_oof = None
        last_oof = None

        for _ in range(self.config.num_epochs):

            if self.config.verbose:
                lr = self.optimizer.param_groups[0]['lr']
                timestamp = datetime.utcnow().isoformat()
                self.log(f'\n{timestamp}\nLR: {lr}\n')

            t = time.time()
            train_loss = self.train_one_epoch(train_loader)
            self.history['train_history_loss'].append(train_loss.avg)

            self.log(f'[RESULT]: Train. Epoch: {self.epoch}, '
                     f'loss: {train_loss.avg:.5f}, '
                     f'time: {(time.time() - t):.5f}')
            self.save(f'{self.base_dir}/last-checkpoint.bin')

            t = time.time()
            val_loss, last_oof = self.validation_one_epoch(valid_loader)
            self.history['val_history_loss'].append(val_loss.avg)

            self.log(f'[RESULT]: Val. Epoch: {self.epoch}, '
                     f'val_loss: {val_loss.avg:.5f}, '
                     f'time: {(time.time() - t):.5f}')

            self.scheduler.step(val_loss.avg)

            if val_loss.avg < self.best_loss:
                self.best_loss = val_loss.avg
                # Keep the predictions that belong to the checkpoint saved
                # below, so the OOF score describes the weights used later.
                best_oof = last_oof
                self.save(f'{self.base_dir}/best-loss-fold-{str(self.fold)}-seed-{str(self.seed)}.bin')

            self.epoch += 1

        return best_oof if best_oof is not None else last_oof

    def train_one_epoch(self, train_loader):
        """Run one optimisation pass over ``train_loader``.

        Returns:
            An ``AverageMeter`` holding the sample-weighted mean training loss.
        """
        self.model.train()

        loss_score = AverageMeter()

        t = time.time()

        for step, data in enumerate(train_loader):
            if self.config.verbose and step % self.config.verbose_step == 0:
                print(
                    f'Train Step {step}/{len(train_loader)}, '
                    f'loss: {loss_score.avg:.5f}, '
                    f'time: {(time.time() - t):.5f}', end='\r'
                )

            features = data['features'].to(self.device)
            targets = data['targets'].to(self.device).float()

            batch_size = features.shape[0]

            # Clear gradients by dropping them instead of zero-filling.
            for p in self.model.parameters():
                p.grad = None

            outputs = self.model(features)

            loss = self.criterion(outputs, targets)
            loss.backward()

            loss_score.update(loss.detach().item(), batch_size)

            self.optimizer.step()

        return loss_score

    def validation_one_epoch(self, valid_loader):
        """Evaluate the model on ``valid_loader`` without gradient tracking.

        Returns:
            Tuple ``(loss_meter, predictions)``: an ``AverageMeter`` with the
            sample-weighted mean loss and the sigmoid outputs of all batches
            concatenated in loader order.
        """
        self.model.eval()

        preds = []

        loss_score = AverageMeter()

        t = time.time()

        for step, data in enumerate(valid_loader):
            if self.config.verbose and step % self.config.verbose_step == 0:
                print(
                    f'Val Step {step}/{len(valid_loader)}, '
                    f'loss: {loss_score.avg:.5f}, '
                    f'time: {(time.time() - t):.5f}', end='\r'
                )

            features = data['features'].to(self.device)
            targets = data['targets'].to(self.device).float()

            batch_size = features.shape[0]

            with torch.no_grad():
                outputs = self.model(features)
                loss = self.criterion(outputs, targets)
                loss_score.update(loss.detach().item(), batch_size)

                preds.append(
                    torch.sigmoid(outputs).detach().cpu().numpy()
                )

        return loss_score, np.concatenate(preds)

    def save(self, path):
        """Write model, optimizer, scheduler and bookkeeping state to ``path``.

        The model is switched to eval mode as a side effect.
        """
        self.model.eval()

        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'best_loss': self.best_loss,
            'epoch': self.epoch,
            'history': self.history,
        }, path)

    def load(self, path):
        """Restore the state written by ``save`` from ``path``.

        Tensors are mapped onto ``self.device`` while loading. ``epoch`` is
        set to the stored epoch plus one.
        """
        checkpoint = torch.load(path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        # Restore the attribute that `fit` actually compares against.
        self.best_loss = checkpoint['best_loss']
        self.epoch = checkpoint['epoch'] + 1
        self.history = checkpoint['history']

    def log(self, message):
        """Print ``message`` when verbose and always append it to the log file."""
        if self.config.verbose:
            print(message)
        with open(self.log_path, 'a+') as logger:
            logger.write(f'{message}\n')

    def print_history(self):
        """Plot the recorded train/validation losses and mark the minimum."""
        train_hist = self.history['train_history_loss']
        val_hist = self.history['val_history_loss']

        if not val_hist:
            # Nothing recorded yet, so there is no minimum to mark.
            return

        plt.figure(figsize=(15,5))

        # The x axis follows the recorded history, so the plot still works
        # when the number of logged epochs differs from config.num_epochs.
        plt.plot(
            np.arange(len(train_hist)),
            train_hist,
            '-o',
            label='Train loss',
            color='#ff7f0e'
        )

        plt.plot(
            np.arange(len(val_hist)),
            val_hist,
            '-o',
            label='Val loss',
            color='#1f77b4'
        )

        x = np.argmin(val_hist)
        y = np.min(val_hist)

        plt.ylim(0, 0.03)

        xdist = plt.xlim()[1] - plt.xlim()[0]
        ydist = plt.ylim()[1] - plt.ylim()[0]

        plt.scatter(x, y, s=200, color='#1f77b4')

        plt.text(
            x-0.03*xdist,
            y-0.13*ydist,
            'min loss\n%.5f'%y,
            size=14
        )

        plt.ylabel('Loss', size=14)
        plt.xlabel('Epoch', size=14)

        plt.title(f'FOLD {self.fold + 1}',size=18)

        plt.legend(loc=3)
        plt.show()

# Engine

In [None]:
# Out-of-fold predictions: rows x 206 targets x one slice per seed.
# The last axis must match the number of seeds exactly, because the slices
# are averaged later and an unused all-zero slice would shrink that mean.
oof_preds = np.zeros((len(features_df), 206, len(config.seeds)))

device = torch.device(
    'cuda' if torch.cuda.is_available() else 'cpu'
)

# Column order of the model input.
all_features = cat_features + gene_features + cell_features

In [None]:
# The stratifier inputs do not depend on the seed, so build them once.
X = features_df[all_features].values
y = targets_df.values

for i, seed in enumerate(config.seeds):
    kfold = MultilabelStratifiedKFold(config.num_folds, shuffle=True, random_state=seed)

    for fold, (trn_, val_) in enumerate(kfold.split(X, y)):

        # Model: a fresh network for every (seed, fold) pair
        model = BaselineModel().to(device)

        # Data: pick the fold's rows and renumber them from 0
        fold_features = features_df[all_features]
        X_train, X_valid = (
            fold_features.loc[rows].reset_index(drop=True) for rows in (trn_, val_)
        )
        y_train, y_valid = (
            targets_df.loc[rows].reset_index(drop=True) for rows in (trn_, val_)
        )

        # Dataset
        train_dataset = MoADataset(X_train, y_train)
        valid_dataset = MoADataset(X_valid, y_valid)

        # Dataloader: only shuffling and drop_last differ between the two
        shared_loader_args = dict(
            batch_size=config.batch_size,
            num_workers=config.num_workers,
            pin_memory=True,
        )
        train_loader = torch.utils.data.DataLoader(
            train_dataset, shuffle=True, drop_last=True, **shared_loader_args
        )
        valid_loader = torch.utils.data.DataLoader(
            valid_dataset, shuffle=False, drop_last=False, **shared_loader_args
        )

        # Fitter: train, then store this fold's validation predictions in the
        # slice that belongs to the current seed
        fitter = Fitter(model, seed, fold, device, config)

        y_oof = fitter.fit(train_loader, valid_loader)
        oof_preds[val_, :, i] = y_oof

        fitter.print_history()

# CV score

In [None]:
# Collapse the last axis: average the out-of-fold predictions over its slices.
oof_preds = oof_preds.mean(axis=2)

# Target column names: every column of targets_df except the id column.
target_cols = targets_df.columns.tolist()
target_cols.remove('sig_id')

In [None]:
y_true = targets_df[target_cols].values
num_targets = y_true.shape[1]

# Mean of the per-target log losses, accumulated one target column at a time.
oof_score = 0
for i in range(oof_preds.shape[1]):
    oof_score += log_loss(y_true[:, i], oof_preds[:, i]) / num_targets

In [None]:
# Report the out-of-fold cross-validation score.
print('OOF CV score', oof_score)

# Inference

In [None]:
class TestConfig:
    """Settings used by the inference cells."""

    # ---- Verbosity ----
    verbose = True
    verbose_step = 1    # progress is printed every `verbose_step` steps

    # ---- Data loader ----
    # Keyword arguments unpacked into the DataLoader built for inference.
    data_loader_params = {
        'batch_size': 128,
        'num_workers': 8,
        'pin_memory': False,
        'drop_last': False,
        'shuffle': False,
    }

In [None]:
# Every checkpoint in the working directory except the rolling
# last-checkpoint file (raises ValueError if that file is not present).
MODEL_PATHS = glob('./*.bin')
del MODEL_PATHS[MODEL_PATHS.index('./last-checkpoint.bin')]

In [None]:
# Row positions of the control samples in the test set.
# test_features_df['cp_type'] has already been replaced by integer codes in
# encode_labels, so comparing that column with the string 'ctl_vehicle' never
# matches. The raw labels are therefore re-read from the CSV; the test frame
# is never filtered or re-ordered, so the row positions line up.
_raw_cp_type = pd.read_csv(TEST_FEATURES_PATH, usecols=['cp_type'])['cp_type']
ctl_indices = _raw_cp_type[_raw_cp_type == 'ctl_vehicle'].index

In [None]:
class Predictor:
    """Runs a trained model over a feature frame and returns probabilities.

    Args:
        model: Network whose outputs are passed through a sigmoid.
        df: Feature DataFrame wrapped in a ``MoADataset`` at predict time.
        device: ``torch.device`` that batches are moved to.
        config: Object exposing ``verbose``, ``verbose_step`` and
            ``data_loader_params``.
    """

    def __init__(self, model, df, device, config):
        self.model, self.df = model, df
        self.device, self.config = device, config

        self.base_dir = './'
        self.log_path = f'{self.base_dir}/log.txt'

        self._log(f'Predictor prepared. Predicting on {self.device}.')

    def predict(self):
        """Build the loader for ``self.df`` and return the predicted probabilities."""
        if self.config.verbose:
            timestamp = datetime.utcnow().isoformat()
            self._log(f'\n{timestamp}\n')

        started = time.time()

        loader = torch.utils.data.DataLoader(
            MoADataset(self.df, train=False),
            **self.config.data_loader_params,
        )

        probabilities = self._predict_one_loader(loader)
        self._log(f'Inference done. Time: {(time.time() - started):.5f}')

        return probabilities

    def _predict_one_loader(self, test_loader):
        """Return the sigmoid outputs for every batch of ``test_loader`` as one array."""
        self.model.eval()

        started = time.time()
        rows = []

        for step, batch in enumerate(test_loader):
            if self.config.verbose and step % self.config.verbose_step == 0:
                print(
                    f'Inference Step {step}/{len(test_loader)}, '
                    f'time: {(time.time() - started):.5f}', end='\r'
                )

            features = batch['features'].to(self.device)

            with torch.no_grad():
                outputs = self.model(features)

                # Each batch is flattened to nested Python lists, so the final
                # array is assembled from plain floats.
                rows.extend(
                    torch.sigmoid(outputs).data.cpu().numpy().tolist()
                )

        return np.array(rows)

    def _log(self, message):
        """Print ``message`` when verbose and always append it to the log file."""
        if self.config.verbose:
            print(message)
        with open(self.log_path, 'a+') as logger:
            logger.write(f'{message}\n')

In [None]:
# Collects one prediction array per saved model.
final_predictions = []

In [None]:
# The target device is the same for every checkpoint.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for i, model_path in enumerate(MODEL_PATHS):
    print(f'Predicting with model #{i+1}')

    model = BaselineModel().to(device)

    # Checkpoints are deserialised onto the CPU; load_state_dict then copies
    # the weights into the model on `device`.
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model_state_dict'])

    # Inference
    predictor = Predictor(model, test_features_df[all_features], device, TestConfig)
    predictions = predictor.predict()

    final_predictions.append(predictions)

In [None]:
# Ensemble: element-wise mean over the prediction arrays of all models.
probabilities = np.mean(final_predictions, axis=0)
print(probabilities.shape)

In [None]:
# Force every prediction to 0 for the test rows listed in ctl_indices.
probabilities[ctl_indices, :] = 0

In [None]:
# Load the sample submission so its layout can be reused for the output file.
sub_df = pd.read_csv('../input/lish-moa/sample_submission.csv')
sub_df.head()

In [None]:
# Overwrite every column after the first with the averaged probabilities.
# The slice `1:` is positional, so it belongs with `.iloc`; `.loc` is
# label-based and the column labels here are strings, which recent pandas
# versions reject for an integer slice.
sub_df.iloc[:, 1:] = probabilities

In [None]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv('submission.csv', index=False)