In [None]:
import sys
sys.path.append('../input/iterativestratification')

import numpy as np
import random
import pandas as pd
import os
import copy
import gc
import time

import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from tqdm.notebook import tqdm

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from sklearn.pipeline import Pipeline



import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.loss import _WeightedLoss

import warnings
warnings.filterwarnings('ignore')

# List the competition input directory. The result is discarded (this is not
# the last expression of the cell), so the call only has an effect if the
# directory is missing, in which case it raises.
os.listdir('../input/lish-moa')

# Use the fully-qualified option name. pandas resolves option names by regex
# search over all registered options; on newer pandas the bare 'max_columns'
# also matches 'styler.render.max_columns' and raises OptionError.
pd.set_option('display.max_columns', 2000)

In [None]:
# --- Raw competition tables --------------------------------------------------
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')
drug = pd.read_csv('/kaggle/input/lish-moa/train_drug.csv')

# --- Training frame: features + scored targets + drug id, keyed on sig_id -----
train = train_features.merge(train_targets_scored, on='sig_id')
train = train.merge(drug, on="sig_id")

# Number of rows sharing each row's drug_id (counted before control rows are dropped).
train["drug_id_cnt"] = train["drug_id"].map(train["drug_id"].value_counts().to_dict())

# --- Drop control-vehicle rows from both train and test -----------------------
train = train.loc[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
test = test_features.loc[test_features['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)

# --- Column groups -----------------------------------------------------------
# Every scored-target column (everything in the targets table except the key).
target_cols = train_targets_scored.drop(columns='sig_id').columns.tolist()

# Feature columns whose names start with 'g-' and 'c-' respectively.
GENES = [name for name in train_features.columns if name.startswith('g-')]
CELLS = [name for name in train_features.columns if name.startswith('c-')]

# All raw feature columns except targets and bookkeeping columns.
feature_cols = [
    name
    for name in train_features.columns
    if name not in target_cols and name not in ['fold', 'sig_id', 'drug_id']
]


In [None]:
# Preview the first rows of the merged training frame.
train.head()

In [None]:
SEED = 1337
FOLDS = 5

# --- Split drugs by how many training rows they have --------------------------
# Drugs with exactly 6, 12 or 18 rows are assigned to folds as whole drugs;
# every other drug has its rows spread over the folds individually.
drug_counts = train['drug_id'].value_counts()
is_grouped = drug_counts.isin([6, 12, 18])
grouped_drugs = drug_counts[is_grouped].index.sort_values()
spread_drugs = drug_counts[~is_grouped].index.sort_values()

# --- Drug-level stratification (counts of 6 / 12 / 18) ------------------------
# Stratify on each drug's mean target vector, then give the whole drug one fold.
drug_to_fold = {}
splitter = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
drug_targets = train.groupby('drug_id')[target_cols].mean().loc[grouped_drugs]
for fold_no, (_, held_out) in enumerate(splitter.split(drug_targets, drug_targets[target_cols])):
    for drug_name in drug_targets.index[held_out].values:
        drug_to_fold[drug_name] = fold_no

# --- Row-level stratification (all remaining drugs) ---------------------------
sig_to_fold = {}
splitter = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
spread_rows = train.loc[train['drug_id'].isin(spread_drugs)].reset_index(drop=True)
for fold_no, (_, held_out) in enumerate(splitter.split(spread_rows, spread_rows[target_cols])):
    for sig in spread_rows['sig_id'].values[held_out]:
        sig_to_fold[sig] = fold_no

# --- Write the fold column ----------------------------------------------------
# Drug-level assignment first; rows still missing a fold take the row-level one.
train['fold'] = train['drug_id'].map(drug_to_fold)
unassigned = train['fold'].isna()
train.loc[unassigned, 'fold'] = train.loc[unassigned, 'sig_id'].map(sig_to_fold)
train['fold'] = train['fold'].astype('int8')



In [None]:
# Fold id of every training row, followed by the number of rows in each fold.
folds = train["fold"]
folds.value_counts()

In [None]:
# One [train_positions, validation_positions] pair per fold, as positional
# indices into `train`: validation is the rows of fold f, training is the rest.
kf = [
    [np.where(folds.values != f)[0], np.where(folds.values == f)[0]]
    for f in range(FOLDS)
]

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

class MoADataset:
    """Map-style dataset yielding one row of gene / cell / extra features.

    Parameters
    ----------
    genes, cells, features : 2-D arrays indexed as ``arr[idx, :]``; ``genes``
        defines the number of rows.
    sample_weights : optional 1-D array, one value per row. Defaults to zeros.
    targets : optional 2-D array, one row per sample. Defaults to a single
        all-zero column (used when no labels exist, e.g. for test data).

    ``__getitem__`` returns a dict of float32 tensors with the keys
    ``genes``, ``cells``, ``features``, ``sample_weights`` and ``y``.
    """

    def __init__(self, genes, cells, features, sample_weights=None, targets=None):
        n_rows = len(genes)
        self.genes = genes
        self.cells = cells
        self.features = features
        # Fill in zero placeholders when weights / labels are not supplied.
        self.sample_weights = np.zeros(n_rows) if sample_weights is None else sample_weights
        self.targets = np.zeros((n_rows, 1)) if targets is None else targets

    def __len__(self):
        return self.genes.shape[0]

    @staticmethod
    def _as_float(values):
        """Copy ``values`` into a new float32 tensor."""
        return torch.tensor(values, dtype=torch.float)

    def __getitem__(self, idx):
        return {
            'genes': self._as_float(self.genes[idx, :]),
            'cells': self._as_float(self.cells[idx, :]),
            'features': self._as_float(self.features[idx, :]),
            'sample_weights': self._as_float(self.sample_weights[idx]),
            'y': self._as_float(self.targets[idx, :]),
        }


class FE():
    """Builds the model input frame: gene/cell columns plus one-hot categoricals.

    ``fit`` learns a one-hot encoding for ``cp_time`` and ``cp_dose``;
    ``transform`` returns the ``GENES`` and ``CELLS`` columns followed by the
    one-hot columns. Relies on the module-level ``GENES``, ``CELLS`` and
    ``target_cols`` lists.

    Attributes
    ----------
    ohe : dict mapping column name -> fitted ``OneHotEncoder`` (set by ``fit``).
    fnames : list of the one-hot column names produced by the most recent
        ``transform`` call.
    """

    def __init__(self):
        self.fnames = []
        return

    @staticmethod
    def _make_encoder():
        """Return a dense-output ``OneHotEncoder`` on old and new scikit-learn."""
        try:
            # scikit-learn >= 1.2 spells the argument `sparse_output`.
            return OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older releases only know `sparse`.
            return OneHotEncoder(sparse=False)

    @staticmethod
    def _encoded_names(encoder):
        """Return the encoder's output names, preferring the current API."""
        if hasattr(encoder, "get_feature_names_out"):
            return encoder.get_feature_names_out()
        # `get_feature_names` only exists on older scikit-learn releases.
        return encoder.get_feature_names()

    def fit(self, df):
        """Fit one encoder per categorical column of ``df`` and return ``self``."""
        self.ohe = {}
        for f in ["cp_time", "cp_dose"]:
            self.ohe[f] = self._make_encoder().fit(df[f].values.reshape(-1, 1))
        return self

    def transform(self, df, is_train=True):
        """Return ``(X, y)`` for ``df``.

        ``X`` holds the ``GENES`` and ``CELLS`` columns followed by the one-hot
        columns and shares ``df``'s index. ``y`` is ``df[target_cols]`` when
        ``is_train`` is true, otherwise ``None``.
        """
        y = df[target_cols] if is_train else None

        parts = [df[GENES + CELLS].copy()]
        fnames = []
        for f, encoder in self.ohe.items():
            encoded = encoder.transform(df[f].values.reshape(-1, 1))
            oh = pd.DataFrame(
                encoded,
                columns=[f"{f}_{c}" for c in self._encoded_names(encoder)],
                # Share df's index so the column-wise concat lines rows up
                # positionally even when df does not have a default RangeIndex.
                index=df.index,
            )
            parts.append(oh)
            fnames.extend(oh.columns)

        X = pd.concat(parts, axis=1)

        # Overwrite rather than append: appending on every call left duplicate
        # names in `fnames` once transform() had run for both train and test.
        self.fnames = fnames

        return X, y
    
# Learn the categorical encodings on train and test stacked together so both
# see the same set of categories, then build the model input frames.
fe = FE()
fe.fit(pd.concat([train, test]))

# Y_test is None because test rows carry no targets.
X_train, Y_train = fe.transform(train, is_train=True)
X_test, Y_test = fe.transform(test, is_train=False)


In [None]:
def metric(y, p):
    """Mean column-wise binary log loss.

    Parameters
    ----------
    y : 2-D numpy array of 0/1 labels, shape (n_samples, n_targets).
    p : 2-D numpy array of predicted probabilities, same shape as ``y``.

    Returns
    -------
    The binary cross-entropy of each column, averaged over all columns.
    Predictions are clipped to [1e-15, 1 - 1e-15] before taking logs.
    """
    clipped = np.clip(p, 1e-15, 1 - 1e-15)
    n_targets = p.shape[1]
    per_target = (
        -np.mean(
            y[:, j] * np.log(clipped[:, j])
            + (1 - y[:, j]) * np.log(1 - clipped[:, j])
        )
        for j in range(n_targets)
    )
    return sum(per_target) / n_targets

# Uniform per-row weight. It is handed to MoADataset in the training loop, but
# the loss computed there does not use it.
train["sample_weight"] = 1



def get_activation(act):
    """Return a new activation module for the given name.

    Parameters
    ----------
    act : one of ``"leaky_relu"``, ``"relu"`` or ``"prelu"``.

    Returns
    -------
    A freshly constructed ``nn.Module`` (a new instance on every call, so
    learnable activations such as PReLU are never shared between layers).

    Raises
    ------
    ValueError
        If ``act`` is not a supported name. Previously this case fell through
        and returned ``None``, which only failed later inside ``nn.Sequential``.
    """
    activations = {
        "leaky_relu": nn.LeakyReLU,
        "relu": nn.ReLU,
        "prelu": nn.PReLU,
    }
    try:
        activation_cls = activations[act]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown activation {act!r}; expected one of {sorted(activations)}"
        ) from None
    return activation_cls()
    
def get_bn(bn, l):
    """Return ``nn.BatchNorm1d(l)`` when ``bn`` equals True, else a pass-through.

    The pass-through is ``nn.Identity()``, so the caller's layer list keeps the
    same length whether or not batch norm is enabled.
    """
    enabled = (bn == True)
    return nn.BatchNorm1d(l) if enabled else nn.Identity()

class Model(nn.Module):
    """Two-branch MLP: one branch for gene inputs, one for cell inputs.

    Each branch is (optional BatchNorm) -> weight-normed Linear -> activation
    -> (optional BatchNorm) -> Dropout. The branch outputs are concatenated and
    passed through a head that emits ``num_targets`` raw logits (no sigmoid).

    Input widths come from the module-level ``GENES`` and ``CELLS`` lists, and
    ``forward`` moves its inputs to the module-level ``DEVICE``.
    """

    def __init__(self, num_targets):
        super().__init__()

        n_genes = len(GENES)
        n_cells = len(CELLS)
        gene_units = 128
        cell_units = 256
        merged_units = gene_units + cell_units

        # Gene branch: BatchNorm on the input only, LeakyReLU activation.
        self.genes = nn.Sequential(
            get_bn(True, n_genes),
            nn.utils.weight_norm(nn.Linear(n_genes, gene_units, bias=True)),
            get_activation("leaky_relu"),
            get_bn(False, gene_units),
            nn.Dropout(0.2749655379470675),
        )

        # Cell branch: BatchNorm after the activation only, PReLU activation.
        self.cells = nn.Sequential(
            get_bn(False, n_cells),
            nn.utils.weight_norm(nn.Linear(n_cells, cell_units, bias=True)),
            get_activation("prelu"),
            get_bn(True, cell_units),
            nn.Dropout(0.059365935529546235),
        )

        # Head over the concatenated branch outputs; returns logits.
        self.out = nn.Sequential(
            nn.utils.weight_norm(nn.Linear(merged_units, merged_units, bias=False)),
            get_activation("prelu"),
            nn.BatchNorm1d(merged_units),
            nn.Dropout(0.33205409463214886),
            nn.utils.weight_norm(nn.Linear(merged_units, num_targets, bias=True)),
        )

    def forward(self, data):
        """Compute logits for a batch dict with ``"genes"`` and ``"cells"`` tensors.

        Any other keys in ``data`` (such as ``"features"``) are ignored.
        """
        gene_input = data["genes"].to(DEVICE)
        cell_input = data["cells"].to(DEVICE)

        gene_repr = self.genes(gene_input)
        cell_repr = self.cells(cell_input)

        merged = torch.cat([gene_repr, cell_repr], dim=1)
        return self.out(merged)
    

    


# ---------------------------------------------------------------------------
# Training configuration
# ---------------------------------------------------------------------------
EPOCHS = 29
BATCH_SIZE = 512

NFOLDS = 5
NBAGS = 5  # models trained (and averaged) per fold

# NOTE(review): this flag used to be chosen by `len(test) > 3624`, but both
# branches assigned False, so fitting on all rows has always been disabled.
FULL_FIT = False

num_features = X_train.shape[1]
num_targets = Y_train.shape[1]

# Seed every RNG used below (scaler random_state draws, weight initialisation,
# dropout, DataLoader shuffling) so that a fresh Restart & Run All repeats the
# same run. Bags still differ from each other because the streams keep advancing.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Out-of-fold predictions for every training row, and one test-prediction
# array per (fold, bag) model.
oof = np.zeros((len(train), num_targets))
p_test = []

# One-hot column names recorded by the feature builder.
FNAMES = fe.fnames

loss_fn = nn.BCEWithLogitsLoss()

# ---------------------------------------------------------------------------
# Cross-validated, bagged training
# ---------------------------------------------------------------------------
for fold, (trn_idx, val_idx) in enumerate(kf):

    if FULL_FIT:
        trn_idx = np.arange(len(X_train))

    preds_bag = []

    for bag in range(NBAGS):

        x_train = X_train.iloc[trn_idx].reset_index(drop=True).copy()
        y_train = Y_train.values[trn_idx].copy()
        x_valid = X_train.iloc[val_idx].reset_index(drop=True).copy()
        y_valid = Y_train.values[val_idx].copy()
        x_test = X_test.copy()

        # Gene features: rank-gauss transform fitted on all train + test rows.
        pipe = Pipeline([('scaler', QuantileTransformer(n_quantiles=1000, random_state=np.random.randint(10_000), output_distribution="normal"))])
        pipe.fit(np.concatenate([X_train[GENES], X_test[GENES]], axis=0))
        for frame in (x_train, x_valid, x_test):
            frame[GENES] = pipe.transform(frame[GENES])

        # Cell features: robust (median / IQR) scaling fitted on all train + test rows.
        pipe = Pipeline([('scaler', RobustScaler(quantile_range=(25.0, 75.0)))])
        pipe.fit(np.concatenate([X_train[CELLS], X_test[CELLS]], axis=0))
        for frame in (x_train, x_valid, x_test):
            frame[CELLS] = pipe.transform(frame[CELLS])

        train_dataset = MoADataset(x_train[GENES].values, x_train[CELLS].values, x_train[FNAMES].values, train["sample_weight"].values[trn_idx], y_train)
        valid_dataset = MoADataset(x_valid[GENES].values, x_valid[CELLS].values, x_valid[FNAMES].values, train["sample_weight"].values[val_idx], y_valid)
        test_dataset = MoADataset(x_test[GENES].values, x_test[CELLS].values, x_test[FNAMES].values)
        trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
        testloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

        model = Model(
            num_targets=num_targets
        )
        model.to(DEVICE)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.04647353847564317, weight_decay=8.087569236449597e-06)

        # The scheduler is stepped once per batch: warm up for the first half
        # of all optimisation steps, then cosine decay.
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=len(trainloader)*EPOCHS//2, num_training_steps=len(trainloader)*EPOCHS)

        # Validation predictions of the most recent epoch (no best-epoch
        # selection is performed; the final epoch is what gets bagged).
        last_preds = None

        for epoch in range(EPOCHS):

            start_time = time.time()

            # ---- train ----
            model.train()
            train_loss = 0

            progress_bar = tqdm(trainloader, total=len(trainloader), disable=True)
            for data in progress_bar:
                optimizer.zero_grad()
                outputs = model(data)
                loss = loss_fn(outputs, data['y'].to(DEVICE))

                loss.backward()
                optimizer.step()
                scheduler.step()

                train_loss += loss.item() / len(trainloader)
                progress_bar.set_description(f"loss: {loss.item():.2f}")

            # ---- validate ----
            model.eval()
            valid_loss = 0
            valid_preds = []

            # No gradients are needed here; without no_grad every batch's
            # autograd graph stays alive until the predictions are detached.
            with torch.no_grad():
                for data in validloader:
                    outputs = model(data)
                    loss = loss_fn(outputs, data["y"].to(DEVICE))

                    valid_loss += loss.item() / len(validloader)
                    valid_preds.append(outputs.sigmoid())

            valid_preds = torch.cat(valid_preds).cpu().numpy()

            valid_score = metric(y_valid, valid_preds)

            print(f"Fold {fold} Epoch {epoch} {time.time()-start_time:.2f}s train_loss: {train_loss:.5f}, val_loss: {valid_loss:.5f}, val_score: {valid_score:.5f}")

            last_preds = valid_preds

        print()

        preds_bag.append(last_preds)

        # Score of the running bag average on this fold's validation rows.
        bag_score = metric(y_valid, np.mean(preds_bag, axis=0))
        print(f"Bag {bag} val_score: {bag_score:.5f}")
        print()

        # ---- predict test with this bag's final model ----
        model.eval()
        test_preds = []
        with torch.no_grad():
            for data in testloader:
                test_preds.append(model(data).sigmoid())
        test_preds = torch.cat(test_preds).cpu().numpy()
        p_test.append(test_preds)

    # Out-of-fold prediction for this fold = mean over its bags.
    oof[val_idx] = np.mean(preds_bag, axis=0)

    print()

In [None]:
# Mean column-wise log loss of the out-of-fold predictions against the training targets.
metric(Y_train.values, oof)

In [None]:
# import scipy as sp
# from functools import partial


# init=[0]*oof.shape[1]

# #init = 1/np.mean(Y_train.values, axis=0)

# def _loss(coef, X, y):
#     X = X+coef
#     #print(coef)
#     loss = metric(y,X)
#     print(loss)
#     return loss

# loss_partial = partial(_loss, X=oof, y=Y_train.values)
# initial_coef = np.array(init).reshape(-1)
# coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead', options={"maxiter": 100})

In [None]:
# Average the per-model test predictions into one array.
# The training loop leaves `p_test` as a list of arrays; guarding on that makes
# this cell safe to re-run (an unguarded second run would average the already
# averaged array over its rows and collapse it to 1-D).
if isinstance(p_test, list):
    p_test = np.mean(p_test, axis=0)

In [None]:
# Write the averaged predictions into the target columns of `test`, which only
# holds the non-control test rows (control rows were filtered out at load time).
test[target_cols] = p_test


In [None]:
# Start from the sample submission's ids, left-join the predictions on sig_id,
# and fill rows that have no prediction (those absent from `test`) with 0.
sub = (
    sample_submission
    .drop(columns=target_cols)
    .merge(test[['sig_id'] + target_cols], on='sig_id', how='left')
    .fillna(0)
)
sub.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
sub.head()