In [None]:
!ls ../input

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import os
import time
import pickle
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.decomposition import PCA, FastICA, TruncatedSVD, FactorAnalysis, DictionaryLearning, LatentDirichletAllocation
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

try:
    from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
except:
    from transformers import WarmupLinearSchedule, WarmupCosineSchedule

import warnings
warnings.filterwarnings('ignore')

# ---- Run configuration -------------------------------------------------
DEBUG = False  # when True, the override block below shrinks folds/epochs/seeds
IS_TRAIN = False  # True: fit preprocessing and train; False: load cached artifacts and predict
DATA_PATH = '../input/lish-moa'
# Directory that holds the preprocessing pickle and the per-fold model weights.
CACHE_PATH = '../input/mlp1-torch-weights1109'
if not os.path.exists(CACHE_PATH):
    os.mkdir(CACHE_PATH)
# NOTE: computed before the DEBUG override of CACHE_PATH below, so this path
# always points into the non-debug cache directory.
data_process_file = f'{CACHE_PATH}/data_process.pkl'

# ---- Training hyper-parameters -----------------------------------------
EPOCHS = 25
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7  # default number of CV folds per seed
EARLY_STOPPING_STEPS = 10  # non-improving epochs tolerated when EARLY_STOP is on
EARLY_STOP = False  # gates the early-stopping branch of the training loop
hidden_size = 2048  # forwarded to the model constructor as `hidden_size`
# FEAT_SAMPLE_RATE = 0.95
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
# SEED = [0]
SEED = [0, 1, 2, 3, 4, 5, 6]  # one full CV run per seed; results are averaged

# Debug mode: tiny run written to a separate `debug` sub-directory.
if DEBUG:
    NFOLDS = 2
    EPOCHS = 2
    SEED = [0]
    CACHE_PATH = f'{CACHE_PATH}/debug'
    if not os.path.exists(CACHE_PATH):
        os.mkdir(CACHE_PATH)

def seed_everything(seed=42):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random``, NumPy and PyTorch (including the CUDA
    generator), exports ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for reseed in seeders:
        reseed(seed)
    torch.backends.cudnn.deterministic = True

def save_pickle(dic, save_path):
    """Pickle ``dic`` to the file at ``save_path``, overwriting it.

    Args:
        dic: Any picklable object (a dict of fitted transformers in this script).
        save_path: Destination file path; opened in binary write mode.
    """
    with open(save_path, 'wb') as sink:
        pickle.Pickler(sink).dump(dic)


def load_pickle(load_path):
    """Read and return the object pickled at ``load_path``.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.

    Args:
        load_path: Path of a pickle file; opened in binary read mode.

    Returns:
        The unpickled object.
    """
    with open(load_path, 'rb') as source:
        return pickle.load(source)

def score_fn(y_true, y_pred):
    """Return the mean of the per-column binary log losses.

    The number of columns is taken from ``y_true`` itself rather than from
    the module-level ``target_cols``, so the function also works for label
    matrices with a different width. ``labels=[0, 1]`` is passed explicitly
    so a column that contains only one class does not make ``log_loss``
    raise.

    Args:
        y_true: 2-D array of 0/1 labels, indexed as ``y_true[:, column]``.
        y_pred: 2-D array of predicted probabilities with the same layout.

    Returns:
        The average of the column-wise log losses.
    """
    n_targets = y_true.shape[1]
    losses = [
        log_loss(y_true[:, idx], y_pred[:, idx], labels=[0, 1])
        for idx in range(n_targets)
    ]
    return np.mean(losses)

# Fix all RNGs once before any data processing; each CV run re-seeds later.
seed_everything(seed=42)

# Columns read from the feature CSVs: the id, the three treatment descriptors
# (cp_type / cp_time / cp_dose) and a hand-listed subset of the `g-*` and
# `c-*` columns. The list is a fixed literal; how it was chosen is not shown
# in this file.
selected_cols = ['sig_id','cp_type','cp_time','cp_dose'
                     ,'g-0','g-2','g-3','g-5','g-6','g-7','g-8'
,'g-9','g-11','g-12','g-13','g-15','g-16','g-17','g-18','g-20','g-21'
,'g-22','g-24','g-25','g-26','g-27','g-28','g-29','g-30','g-31','g-32'
,'g-33','g-34','g-36','g-37','g-38','g-39','g-41','g-42','g-43','g-45'
,'g-47','g-48','g-49','g-50','g-51','g-52','g-53','g-54','g-55','g-56'
,'g-57','g-58','g-60','g-61','g-62','g-63','g-65','g-66','g-67','g-68'
,'g-69','g-70','g-71','g-72','g-73','g-75','g-76','g-77','g-78','g-79'
,'g-80','g-81','g-83','g-84','g-85','g-86','g-87','g-89','g-90','g-91'
,'g-92','g-93','g-94','g-96','g-97','g-98','g-100','g-101','g-102','g-103'
,'g-104','g-105','g-106','g-107','g-108','g-109','g-110','g-111','g-112'
,'g-113','g-114','g-115','g-116','g-117','g-118','g-119','g-120','g-121'
,'g-122','g-123','g-124','g-125','g-126','g-127','g-129','g-130','g-131'
,'g-132','g-133','g-134','g-135','g-136','g-137','g-138','g-139','g-140'
,'g-141','g-142','g-143','g-144','g-146','g-147','g-148','g-149','g-150'
,'g-151','g-152','g-154','g-156','g-157','g-158','g-160','g-161','g-162'
,'g-163','g-164','g-165','g-166','g-167','g-169','g-170','g-172','g-173'
,'g-174','g-175','g-177','g-178','g-179','g-180','g-181','g-183','g-184'
,'g-185','g-186','g-187','g-188','g-189','g-190','g-192','g-194','g-195'
,'g-196','g-199','g-200','g-202','g-203','g-205','g-206','g-207','g-208'
,'g-209','g-210','g-211','g-212','g-215','g-216','g-217','g-218','g-219'
,'g-221','g-222','g-224','g-225','g-226','g-227','g-228','g-229','g-230'
,'g-231','g-233','g-235','g-236','g-237','g-238','g-239','g-240','g-241'
,'g-242','g-243','g-245','g-246','g-247','g-248','g-250','g-251','g-252'
,'g-253','g-254','g-255','g-256','g-257','g-258','g-260','g-262','g-263'
,'g-265','g-267','g-268','g-269','g-270','g-272','g-273','g-274','g-276'
,'g-279','g-280','g-283','g-284','g-285','g-286','g-287','g-291','g-292'
,'g-293','g-294','g-296','g-297','g-298','g-299','g-300','g-301','g-302'
,'g-303','g-305','g-306','g-307','g-308','g-309','g-310','g-312','g-313'
,'g-314','g-317','g-318','g-319','g-321','g-322','g-323','g-324','g-325'
,'g-326','g-327','g-328','g-329','g-330','g-331','g-332','g-335','g-336'
,'g-337','g-338','g-340','g-341','g-342','g-343','g-344','g-346','g-347'
,'g-348','g-349','g-350','g-352','g-353','g-354','g-355','g-356','g-357'
,'g-358','g-359','g-360','g-361','g-362','g-363','g-365','g-366','g-367'
,'g-368','g-369','g-371','g-372','g-373','g-374','g-375','g-376','g-377'
,'g-379','g-380','g-381','g-382','g-383','g-384','g-385','g-386','g-387'
,'g-388','g-389','g-390','g-391','g-392','g-394','g-395','g-396','g-397'
,'g-398','g-400','g-402','g-403','g-404','g-405','g-407','g-408','g-409'
,'g-410','g-411','g-412','g-414','g-415','g-416','g-417','g-418','g-419'
,'g-420','g-421','g-422','g-423','g-424','g-425','g-426','g-427','g-428'
,'g-429','g-430','g-431','g-432','g-433','g-434','g-435','g-438','g-439'
,'g-440','g-441','g-442','g-443','g-444','g-445','g-446','g-447','g-449'
,'g-450','g-451','g-453','g-454','g-455','g-456','g-457','g-458','g-459'
,'g-460','g-461','g-462','g-463','g-465','g-466','g-468','g-469','g-470'
,'g-471','g-472','g-473','g-474','g-475','g-476','g-479','g-480','g-482'
,'g-483','g-484','g-485','g-486','g-488','g-489','g-491','g-492','g-493'
,'g-497','g-498','g-499','g-500','g-502','g-503','g-504','g-506','g-507'
,'g-508','g-509','g-510','g-511','g-513','g-514','g-515','g-516','g-518'
,'g-520','g-522','g-523','g-524','g-525','g-526','g-527','g-528','g-529'
,'g-530','g-531','g-533','g-534','g-535','g-536','g-537','g-538','g-539'
,'g-540','g-541','g-542','g-543','g-544','g-546','g-547','g-550','g-551'
,'g-552','g-553','g-554','g-555','g-556','g-557','g-558','g-559','g-560'
,'g-561','g-562','g-563','g-564','g-566','g-567','g-568','g-569','g-570'
,'g-571','g-572','g-574','g-577','g-578','g-579','g-580','g-583','g-584'
,'g-587','g-588','g-589','g-590','g-592','g-593','g-594','g-595','g-596'
,'g-597','g-598','g-599','g-600','g-602','g-604','g-605','g-606','g-608'
,'g-609','g-610','g-611','g-612','g-613','g-614','g-616','g-619','g-620'
,'g-622','g-624','g-627','g-628','g-629','g-630','g-631','g-632','g-634'
,'g-635','g-636','g-639','g-640','g-641','g-642','g-643','g-644','g-646'
,'g-647','g-648','g-649','g-651','g-652','g-655','g-656','g-657','g-658'
,'g-659','g-660','g-661','g-663','g-664','g-665','g-666','g-667','g-669'
,'g-671','g-672','g-673','g-674','g-675','g-677','g-678','g-679','g-681'
,'g-682','g-683','g-684','g-685','g-686','g-688','g-689','g-691','g-692'
,'g-693','g-694','g-696','g-697','g-698','g-699','g-700','g-701','g-702'
,'g-704','g-705','g-706','g-708','g-709','g-710','g-711','g-712','g-713'
,'g-714','g-720','g-722','g-724','g-725','g-726','g-727','g-728','g-729'
,'g-731','g-733','g-734','g-735','g-736','g-737','g-738','g-739','g-740'
,'g-741','g-742','g-743','g-744','g-745','g-746','g-747','g-748','g-749'
,'g-750','g-751','g-752','g-753','g-755','g-756','g-757','g-758','g-759'
,'g-760','g-761','g-762','g-763','g-764','g-766','g-767','g-768','g-769'
,'g-771','c-0','c-5','c-6','c-7','c-8','c-9','c-10','c-12','c-13','c-15'
,'c-18','c-20','c-22','c-24','c-25','c-26','c-30','c-33','c-34','c-36'
,'c-37','c-38','c-41','c-44','c-45','c-46','c-47','c-48','c-50','c-51'
,'c-52','c-54','c-56','c-57','c-58','c-59','c-60','c-62','c-63','c-64'
,'c-65','c-66','c-67','c-69','c-70','c-71','c-72','c-73','c-75','c-76'
,'c-77','c-79','c-80','c-81','c-83','c-85','c-86','c-87','c-89','c-92'
,'c-93','c-95','c-96','c-98','c-99']

# Load the raw frames. In training mode `data_features` is the training
# feature table and a fresh `data_process_dict` is started; in inference mode
# `data_features` is the test feature table and the dict is read back from
# the pickle at `data_process_file`.
if IS_TRAIN:
    data_features = pd.read_csv(f'{DATA_PATH}/train_features.csv')[selected_cols]
    test_features = pd.read_csv(f'{DATA_PATH}/test_features.csv')[selected_cols]
    train_targets_scored = pd.read_csv(f'{DATA_PATH}/train_targets_scored.csv')
    train_targets_nonscored = pd.read_csv(f'{DATA_PATH}/train_targets_nonscored.csv')
    print(f'Train: {len(data_features)}')

    # Every column of the scored-targets table except the id is a target.
    target_cols = [col for col in train_targets_scored.columns if col != 'sig_id']
    # Collects everything fitted on the training data (target names, fitted
    # transformers, category levels) so inference can reuse it.
    # NOTE(review): no save_pickle call for this dict is visible in this part
    # of the file — confirm where it is persisted.
    data_process_dict = {}
    data_process_dict['target_cols'] = target_cols

    # from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
    #
    # weights_arr = np.array(len(data_features) / train_targets_scored[target_cols].sum(axis=0))
    # scaler = MinMaxScaler((1., 3.))
    # scaler.fit(weights_arr.reshape(-1, 1))
    # weights_arr_out = scaler.transform(weights_arr.reshape(-1, 1))
    # weights_arr_out = weights_arr_out.reshape(-1)

else:
    data_features = pd.read_csv(f'{DATA_PATH}/test_features.csv')[selected_cols]
    sample_submission = pd.read_csv(f'{DATA_PATH}/sample_submission.csv')
    print(f'Test: {len(data_features)}')

    data_process_dict = load_pickle(data_process_file)
    target_cols = data_process_dict['target_cols']

id_col = 'sig_id'
# Split the selected feature columns by name prefix.
GENES = [col for col in data_features.columns if col.startswith('g-')]
CELLS = [col for col in data_features.columns if col.startswith('c-')]
# Running list of model input columns; the steps below append to it.
feature_cols = GENES + CELLS

########################################################################################################################
##### process data

#RankGauss
# One QuantileTransformer (normal output distribution) per gene/cell column.
# Transformers are fitted only in training mode and stored in
# `data_process_dict`; inference reuses the stored ones.
vec_len = len(data_features)
if IS_TRAIN:
    rankGauss_dict = {}
    for col in tqdm(GENES + CELLS, desc='GENES + CELLS cols rankgauss fitting'):
        transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
        # The transformer expects a 2-D input, hence the (n_rows, 1) reshape.
        transformer.fit(data_features[col].values.reshape(vec_len, 1))
        rankGauss_dict[col] = transformer
    data_process_dict['rankGauss_dict'] = rankGauss_dict
else:
    rankGauss_dict = data_process_dict['rankGauss_dict']

# Overwrite each column in place with its transformed values.
for col in tqdm(GENES + CELLS, desc='GENES + CELLS cols rankgauss transforming'):
    raw_vec = data_features[col].values.reshape(vec_len, 1)
    # Flatten the (n_rows, 1) result back to a 1-D vector before assignment.
    data_features[col] = rankGauss_dict[col].transform(raw_vec).reshape(1, vec_len)[0]

# The six sections below share one pattern: in training mode a decomposition
# is fitted on the gene (50 components) or cell (15 components) columns and
# stored in `data_process_dict`; in both modes the stored object is then read
# back and used to transform `data_features`. The resulting component columns
# are appended to `data_features` and their names to `feature_cols`.
# `pd.concat(..., axis=1)` aligns on the index, so this relies on
# `data_features` and the freshly built `data2` having matching row indexes.

### PCA features + Existing features
# GENES
n_comp = 50
if IS_TRAIN:
    pca = PCA(n_components=n_comp, random_state=42)
    pca.fit(data_features[GENES])
    data_process_dict['g_pca'] = pca
pca = data_process_dict['g_pca']
data2 = pca.transform(data_features[GENES])

data2 = pd.DataFrame(data2, columns=[f'pca_G-{i}' for i in range(n_comp)])
feature_cols.extend([f'pca_G-{i}' for i in range(n_comp)])

data_features = pd.concat((data_features, data2), axis=1)

#CELLS
n_comp = 15
if IS_TRAIN:
    pca = PCA(n_components=n_comp, random_state=42)
    pca.fit(data_features[CELLS])
    data_process_dict['c_pca'] = pca
pca = data_process_dict['c_pca']
data2 = pca.transform(data_features[CELLS])

data2 = pd.DataFrame(data2, columns=[f'pca_C-{i}' for i in range(n_comp)])
feature_cols.extend([f'pca_C-{i}' for i in range(n_comp)])

data_features = pd.concat((data_features, data2), axis=1)

### TruncatedSVD features + Existing features
# GENES
n_comp = 50
if IS_TRAIN:
    svd = TruncatedSVD(n_components=n_comp, random_state=42)
    svd.fit(data_features[GENES])
    data_process_dict['g_svd'] = svd
svd = data_process_dict['g_svd']
data2 = svd.transform(data_features[GENES])

data2 = pd.DataFrame(data2, columns=[f'TruncatedSVD_G-{i}' for i in range(n_comp)])
feature_cols.extend([f'TruncatedSVD_G-{i}' for i in range(n_comp)])

data_features = pd.concat((data_features, data2), axis=1)

#CELLS
n_comp = 15
if IS_TRAIN:
    svd = TruncatedSVD(n_components=n_comp, random_state=42)
    svd.fit(data_features[CELLS])
    data_process_dict['c_svd'] = svd
svd = data_process_dict['c_svd']
data2 = svd.transform(data_features[CELLS])

data2 = pd.DataFrame(data2, columns=[f'TruncatedSVD_C-{i}' for i in range(n_comp)])
feature_cols.extend([f'TruncatedSVD_C-{i}' for i in range(n_comp)])

data_features = pd.concat((data_features, data2), axis=1)

### FactorAnalysis features + Existing features
# GENES
n_comp = 50
if IS_TRAIN:
    fa = FactorAnalysis(n_components=n_comp, random_state=42)
    fa.fit(data_features[GENES])
    data_process_dict['g_fa'] = fa
fa = data_process_dict['g_fa']
data2 = fa.transform(data_features[GENES])

data2 = pd.DataFrame(data2, columns=[f'FA_G-{i}' for i in range(n_comp)])
feature_cols.extend([f'FA_G-{i}' for i in range(n_comp)])

data_features = pd.concat((data_features, data2), axis=1)

#CELLS
n_comp = 15
if IS_TRAIN:
    fa = FactorAnalysis(n_components=n_comp, random_state=42)
    fa.fit(data_features[CELLS])
    data_process_dict['c_fa'] = fa
fa = data_process_dict['c_fa']
data2 = fa.transform(data_features[CELLS])

data2 = pd.DataFrame(data2, columns=[f'FA_C-{i}' for i in range(n_comp)])
feature_cols.extend([f'FA_C-{i}' for i in range(n_comp)])

data_features = pd.concat((data_features, data2), axis=1)

# One-hot encode `cp_time` and `cp_dose`. The category levels are recorded in
# training mode and reused at inference, so both modes produce the same
# indicator columns.
if IS_TRAIN:
    cate_dict = {}
    for col in ['cp_time', 'cp_dose']:
        cate_dict[col] = []
        for mod in data_features[col].unique():
            cate_dict[col].append(mod)
    data_process_dict['cate_dict'] = cate_dict
cate_dict = data_process_dict['cate_dict']

for col in cate_dict:
    for mod in cate_dict[col]:
        # The indicator column is named after the level itself (str(mod)),
        # without the source column name as a prefix.
        data_features[str(mod)] = (data_features[col] == mod).astype(int)
        feature_cols.append(str(mod))

########################################################################################################################
##### prepare models
class MoADataset:
    """Map-style dataset over in-memory feature (and optional target) arrays.

    Each item is a dict holding a float tensor under ``'x'`` and, when
    ``isTrain`` is true, a float tensor under ``'y'``.

    Args:
        features: 2-D array-like indexed as ``features[idx, :]``.
        targets: 2-D array-like indexed as ``targets[idx, :]``; only read
            when ``isTrain`` is true, so it may be ``None`` otherwise.
        isTrain: Whether items should carry their targets.
    """

    def __init__(self, features, targets, isTrain=True):
        self.features, self.targets, self.isTrain = features, targets, isTrain

    def __len__(self):
        """Return the number of rows in ``features``."""
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of float tensors."""
        sample = {'x': torch.tensor(self.features[idx, :], dtype=torch.float)}
        if not self.isTrain:
            return sample
        sample['y'] = torch.tensor(self.targets[idx, :], dtype=torch.float)
        return sample

def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module to train; switched to training mode.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional LR scheduler stepped once per batch when truthy.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        dataloader: Iterable of dicts with ``'x'`` and ``'y'`` tensors; must
            support ``len()``.
        device: Device the batches are moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        optimizer.zero_grad()
        features = batch['x'].to(device)
        labels = batch['y'].to(device)
        batch_loss = loss_fn(model(features), labels)
        batch_loss.backward()
        optimizer.step()
        # The scheduler advances per batch, not per epoch.
        if scheduler:
            scheduler.step()
        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)

def valid_fn(model, loss_fn, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    The whole loop runs under ``torch.no_grad()`` so no autograd graph is
    built for validation batches, matching ``inference_fn``.

    Args:
        model: Module to evaluate; switched to eval mode.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        dataloader: Iterable of dicts with ``'x'`` and ``'y'`` tensors; must
            support ``len()``.
        device: Device the batches are moved to.

    Returns:
        A tuple ``(mean_batch_loss, predictions)`` where ``predictions`` is a
        NumPy array of sigmoid-activated outputs for all batches, concatenated
        in iteration order.
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            final_loss += loss.item()
            valid_preds.append(outputs.sigmoid().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    """Return sigmoid-activated predictions of ``model`` for every batch.

    Args:
        model: Module to run; switched to eval mode.
        dataloader: Iterable of dicts with an ``'x'`` tensor.
        device: Device the batches are moved to.

    Returns:
        NumPy array of predictions, concatenated in iteration order.
    """
    model.eval()

    with torch.no_grad():
        chunks = [
            model(batch['x'].to(device)).sigmoid().detach().cpu().numpy()
            for batch in dataloader
        ]

    return np.concatenate(chunks)

from torch.nn.modules.loss import _WeightedLoss
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are pulled towards 0.5 before the loss is computed:
    ``t * (1 - smoothing) + 0.5 * smoothing``.

    Args:
        weight: Optional rescaling weight forwarded to
            ``F.binary_cross_entropy_with_logits``.
        reduction: ``'mean'``, ``'sum'``, or any other value to get the
            unreduced element-wise loss.
        smoothing: Smoothing factor in ``[0, 1)``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return smoothed targets; ``n_labels`` is accepted but not used."""
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE loss of ``inputs`` (logits) vs ``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss so that the requested reduction is
        # applied here. With the functional default ('mean') the value was
        # already a scalar and 'sum' / 'none' had no effect.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
                                                  reduction='none')

        if self.reduction == 'sum':
            return loss.sum()
        if self.reduction == 'mean':
            return loss.mean()
        return loss

def MultilabelStratifiedKFold_run(
        model_fn,
        train_feat,
        train_labels,
        test_feat,
        seed,
        model_name,
        input_dim,
        output_dim,
        nfolds=NFOLDS,
        verbose=0,
        isTrain=IS_TRAIN,
):
    """Train one model per CV fold, or average the saved fold models on test data.

    Training mode (``isTrain`` true): rows are split with
    ``MultilabelStratifiedKFold`` on a one-hot encoding of k-means clusters of
    ``train_labels``. For each fold a fresh model is trained, the weights with
    the lowest validation loss are saved to
    ``{CACHE_PATH}/{model_name}_SEED{seed}_FOLD{fold}.pth``, and those weights
    produce the out-of-fold predictions.

    Inference mode: the saved weights of every fold are loaded and their
    predictions on ``test_feat`` are averaged.

    Args:
        model_fn: Callable building a model from ``num_features``,
            ``num_targets`` and ``hidden_size`` keyword arguments.
        train_feat: Training feature array (training mode only).
        train_labels: Training label array (training mode only).
        test_feat: Feature array to predict on (inference mode only).
        seed: Seed for the RNGs and the fold split; part of the weight file name.
        model_name: Prefix of the weight file names.
        input_dim: Number of model input features.
        output_dim: Number of model outputs.
        nfolds: Number of CV folds / saved fold models.
        verbose: When truthy, print per-epoch losses during training.
        isTrain: Selects training mode (true) or inference mode (false).

    Returns:
        Training mode: out-of-fold predictions, shape ``(len(train_feat), output_dim)``.
        Inference mode: fold-averaged predictions, shape ``(len(test_feat), output_dim)``.
    """
    seed_everything(seed)
    if isTrain:
        train_pred = np.zeros((len(train_feat), output_dim))
        kf = MultilabelStratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed)
        # Stratify on cluster membership of the label rows rather than on the
        # raw multilabel matrix.
        cluster_num = 250
        kmeans = KMeans(n_clusters=cluster_num, random_state=0).fit(train_labels)
        label_cluster = np.eye(cluster_num)[kmeans.predict(train_labels)]
        for _fold, (trn_idx, val_idx) in enumerate(kf.split(train_feat, label_cluster)):
            start_time = time.time()

            train_dataset = MoADataset(train_feat[trn_idx], train_labels[trn_idx], True)
            valid_dataset = MoADataset(train_feat[val_idx], train_labels[val_idx], True)

            trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
            validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

            model = model_fn(
                num_features=input_dim,
                num_targets=output_dim,
                hidden_size=hidden_size,
            )

            model.to(DEVICE)

            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
            # The schedule is stepped once per batch inside train_fn, hence
            # steps_per_epoch=len(trainloader).
            scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                                      max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))

            loss_fn = SmoothBCEwLogits(smoothing=1e-3)

            early_stopping_steps = EARLY_STOPPING_STEPS
            early_step = 0

            best_loss = np.inf

            model_weights = f"{CACHE_PATH}/{model_name}_SEED{seed}_FOLD{_fold}.pth"
            for epoch in range(EPOCHS):

                train_loss = train_fn(model, optimizer, scheduler, loss_fn, trainloader, DEVICE)
                valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
                if verbose:
                    print(f"SEED: {seed}, FOLD: {_fold}, EPOCH: {epoch:3}, train_loss: {train_loss:.5f}, valid_loss: {valid_loss:.5f}, "
                          f"time: {(time.time() - start_time) / 60:.2f}min")

                if valid_loss < best_loss:
                    # Checkpoint whenever the validation loss improves.
                    best_loss = valid_loss
                    torch.save(model.state_dict(), model_weights)

                elif EARLY_STOP == True:
                    # NOTE: early_step is never reset, so this counts
                    # non-improving epochs cumulatively, not consecutively.
                    early_step += 1
                    if early_step >= early_stopping_steps:
                        break

            # Out-of-fold predictions come from the best checkpoint.
            model.load_state_dict(torch.load(model_weights, map_location=DEVICE))
            train_pred[val_idx] = inference_fn(model, validloader, DEVICE)
        return train_pred

    test_pred = np.zeros((len(test_feat), output_dim))
    # The test data is the same for every fold, so build the loader once.
    testdataset = MoADataset(test_feat, None, False)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    for _fold in range(nfolds):
        # Build the architecture the caller asked for rather than whatever
        # `Model` happens to be bound to globally.
        model = model_fn(
            num_features=input_dim,
            num_targets=output_dim,
            hidden_size=hidden_size,
        )
        model.to(DEVICE)
        model_weights = f"{CACHE_PATH}/{model_name}_SEED{seed}_FOLD{_fold}.pth"
        # map_location lets weights saved on GPU load on a CPU-only machine.
        model.load_state_dict(torch.load(model_weights, map_location=DEVICE))

        test_pred += inference_fn(model, testloader, DEVICE) / nfolds
    return test_pred

# ########################################################################################################################
# ##### make meta features
# print('Making nonscored feat...')
# if IS_TRAIN:
#     nonscored_drop_cols = [id_col]
#     for col in [var for var in train_targets_nonscored.columns if var != 'sig_id']:
#         if train_targets_nonscored[col].sum() == 0:
#             nonscored_drop_cols.append(col)
#     nonscored_cols = [col for col in train_targets_nonscored.columns if col not in nonscored_drop_cols]
#     data_process_dict['nonscored_cols'] = nonscored_cols
# else:
#     nonscored_cols = data_process_dict['nonscored_cols']
# print('nonscored_cols: ', len(nonscored_cols))
# non_scored_feat = data_features[feature_cols].values
# if IS_TRAIN:
#     non_scored_labels = train_targets_nonscored[nonscored_cols].values
#
# if IS_TRAIN:
#     data_meta_feat = MultilabelStratifiedKFold_run(
#         train_feat=non_scored_feat,
#         train_labels=non_scored_labels,
#         test_feat=None,
#         seed=0,
#         model_name='nonscored_model',
#         input_dim=len(feature_cols),
#         output_dim=len(nonscored_cols),
#         nfolds=5,
#         verbose=0,
#         isTrain=IS_TRAIN,
#     )
#     oof_score = score_fn(y_true=non_scored_labels, y_pred=data_meta_feat)
#     print(f'Nonscored OOF score={oof_score:.5f}')
# else:
#     data_meta_feat, _ = MultilabelStratifiedKFold_run(
#         train_feat=None,
#         train_labels=None,
#         test_feat=non_scored_feat,
#         seed=0,
#         model_name='nonscored_model',
#         input_dim=len(feature_cols),
#         output_dim=len(nonscored_cols),
#         nfolds=5,
#         verbose=0,
#         isTrain=IS_TRAIN,
#     )
# for idx, col in enumerate(tqdm(nonscored_cols, desc='Appending unscored feat')):
#     data_features[col] = data_meta_feat[:, idx]
#     feature_cols.append(col)
# del data_meta_feat

########################################################################################################################
##### prepare data and train
# Build the final modelling frame. Rows with cp_type == 'ctl_vehicle' are
# dropped in both modes; in training mode the scored targets are joined on
# `sig_id` first so labels stay aligned with the remaining rows.
if IS_TRAIN:
    data = data_features.merge(train_targets_scored, on='sig_id')
    data = data[data['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)

    data_labels = data[target_cols].values
else:
    data = data_features[data_features['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)

# external_feature_chosen = pd.read_csv('../input/feature_selecting/chosen_feat_df.csv').top_feat_cols.values.tolist()
# feature_cols = [col for col in feature_cols if col in external_feature_chosen]

# NOTE: this replaces the `feature_cols` list accumulated above with the
# column names stored in the `top_feat_cols` column of an external CSV.
feature_cols = pd.read_csv('../input/feature-selecting1105/chosen_feat_df.csv').top_feat_cols.values.tolist()

# Feature matrix handed to the models, in `feature_cols` order.
data_feat = data[feature_cols].values

In [None]:
# MLP1 OOF log_loss:  0.014429373656645358 7seed7cv LB 0.01834
# Directory the MLP1 fold weights and oof_pred.csv are read from.
CACHE_PATH = '../input/mlp1-torch-weights1109'
# Passed to the model constructor as `hidden_size` by the CV runner.
hidden_size = 2048
class Model(nn.Module):
    """MLP with three weight-normalised linear layers.

    Each layer is preceded by batch normalisation; the second and third are
    also preceded by dropout. One shared ``PReLU`` follows the first two
    linear layers. Several modules (``dropout1`` and the alternative
    activations) are created but not used in ``forward``.

    Args:
        num_features: Input width.
        num_targets: Output width (logits).
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, num_features, num_targets, hidden_size):
        super().__init__()
        weight_normed = nn.utils.weight_norm

        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = weight_normed(nn.Linear(num_features, hidden_size))

        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.4)
        self.dense2 = weight_normed(nn.Linear(hidden_size, hidden_size))

        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.4)
        self.dense3 = weight_normed(nn.Linear(hidden_size, num_targets))

        # Activation modules; only PReLU is used in forward().
        self.Relu = nn.ReLU(inplace=True)
        self.PReLU = nn.PReLU()
        self.LeakyReLU = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        self.GeLU = nn.GELU()
        self.RReLU = nn.RReLU()

    def forward(self, x):
        """Return logits for the batch ``x``."""
        # First block: dropout1 exists but is not applied here.
        hidden = self.PReLU(self.dense1(self.batch_norm1(x)))

        hidden = self.dropout2(self.batch_norm2(hidden))
        hidden = self.PReLU(self.dense2(hidden))

        hidden = self.dropout3(self.batch_norm3(hidden))
        return self.dense3(hidden)


# Seed-averaged predictions for MLP1: one CV run per seed, each contributing
# an equal share to `npy_pred1`.
oof_pred1 = pd.read_csv(f'{CACHE_PATH}/oof_pred.csv')[target_cols].values
npy_pred1 = np.zeros((len(data), len(target_cols)))
for seed in SEED:
    # Training takes (features, labels); inference takes features only.
    run_inputs = (
        dict(train_feat=data_feat, train_labels=data_labels, test_feat=None)
        if IS_TRAIN else
        dict(train_feat=None, train_labels=None, test_feat=data_feat)
    )
    seed_npy_pred = MultilabelStratifiedKFold_run(
        model_fn=Model,
        seed=seed,
        model_name='mlp',
        input_dim=len(feature_cols),
        output_dim=len(target_cols),
        nfolds=NFOLDS,
        verbose=1,
        isTrain=IS_TRAIN,
        **run_inputs,
    )
    npy_pred1 += seed_npy_pred / len(SEED)

In [None]:
# MLP2 OOF log_loss:  0.0144581231992106 7seed7cv LB ???
# Directory the MLP2 fold weights and oof_pred.csv are read from.
CACHE_PATH = '../input/mlp2-torch-weights1110'
# Passed to the model constructor by the CV runner; the MLP2 class below
# sets its layer widths itself and does not read it.
hidden_size = 1024

class Model(nn.Module):
    """Two-path gated MLP.

    A side path multiplies a tanh branch by an RReLU branch computed from the
    raw input and refines the product with ``l1_nn``. The main path projects
    the input with ``l1``, then multiplies a tanh branch by an RReLU branch of
    that projection. Both results are concatenated and mapped to logits by
    ``f_layer``.

    Args:
        num_features: Input width.
        num_targets: Output width (logits).
        hidden_size: Accepted for interface compatibility; the layer widths
            are fixed inside the constructor.
    """

    def __init__(self, num_features, num_targets, hidden_size):
        super().__init__()

        main_width, gate_width, side_width = 1024, 1024, 512

        def gated_branch(in_dim, out_dim, drop, activation):
            # BatchNorm -> Dropout -> bias-free Linear -> activation.
            return nn.Sequential(
                nn.BatchNorm1d(in_dim),
                nn.Dropout(drop),
                nn.Linear(in_dim, out_dim, bias=False),
                activation)

        # Side path, fed by the raw input.
        self.l1_tanh = gated_branch(num_features, side_width, 0.2, nn.Tanh())
        self.l1_sigm = gated_branch(num_features, side_width, 0.2, nn.RReLU())

        self.l1_nn = nn.Sequential(
            nn.BatchNorm1d(side_width),
            nn.LayerNorm(side_width),
            nn.Dropout(0.5),
            nn.Linear(side_width, side_width, bias=False),
            nn.RReLU())

        # Main path.
        self.l1 = nn.Sequential(
            nn.BatchNorm1d(num_features),
            nn.LayerNorm(num_features),
            nn.Dropout(0.2),
            nn.Linear(num_features, main_width),
            nn.PReLU(main_width))

        self.tanh = gated_branch(main_width, gate_width, 0.5, nn.Tanh())
        self.sigm = gated_branch(main_width, gate_width, 0.5, nn.RReLU())

        # Head over the concatenated main and side representations.
        self.f_layer = nn.Sequential(
            nn.BatchNorm1d(gate_width + side_width),
            nn.Dropout(0.3),
            nn.RReLU(),
            nn.Linear(gate_width + side_width, num_targets)
        )

    def forward(self, x):
        """Return logits for the batch ``x``."""
        side = self.l1_nn(self.l1_tanh(x) * self.l1_sigm(x))

        trunk = self.l1(x)
        trunk = self.tanh(trunk) * self.sigm(trunk)

        return self.f_layer(torch.cat((trunk, side), dim=1))

# Seed-averaged predictions for MLP2: one CV run per seed, each contributing
# an equal share to `npy_pred2`.
oof_pred2 = pd.read_csv(f'{CACHE_PATH}/oof_pred.csv')[target_cols].values
npy_pred2 = np.zeros((len(data), len(target_cols)))
for seed in SEED:
    # Training takes (features, labels); inference takes features only.
    run_inputs = (
        dict(train_feat=data_feat, train_labels=data_labels, test_feat=None)
        if IS_TRAIN else
        dict(train_feat=None, train_labels=None, test_feat=data_feat)
    )
    seed_npy_pred = MultilabelStratifiedKFold_run(
        model_fn=Model,
        seed=seed,
        model_name='mlp',
        input_dim=len(feature_cols),
        output_dim=len(target_cols),
        nfolds=NFOLDS,
        verbose=1,
        isTrain=IS_TRAIN,
        **run_inputs,
    )
    npy_pred2 += seed_npy_pred / len(SEED)

In [None]:
# MLP3 OOF log_loss:  0.014490372476255158 7seed7cv LB ???
# Directory the MLP3 fold weights and oof_pred.csv are read from.
CACHE_PATH = '../input/mlp3-torch-weights1110'
# Passed to the model constructor by the CV runner; the MLP3 class below
# hard-codes its layer widths and does not read it.
hidden_size = 1024

class Model(nn.Module):
    """Four-layer MLP with 2048-wide hidden layers.

    The input is batch-normalised and passed through dropout. Each of the
    three hidden stages is Linear -> shared PReLU -> BatchNorm -> Dropout,
    followed by a final linear layer producing logits.

    Args:
        num_features: Input width.
        num_targets: Output width (logits).
        hidden_size: Accepted for interface compatibility; the hidden width
            is fixed at 2048.
    """

    def __init__(self, num_features, num_targets, hidden_size):
        super().__init__()
        self.BN0 = nn.BatchNorm1d(num_features)
        self.dropout0 = nn.Dropout(0.2)

        drop_rate = 0.4
        widths = [num_features, 2048, 2048, 2048]
        # Register dense{i} / BN{i} / dropout{i} for the three hidden stages.
        for stage in range(1, len(widths)):
            setattr(self, f'dense{stage}', nn.Linear(widths[stage - 1], widths[stage]))
            setattr(self, f'BN{stage}', nn.BatchNorm1d(widths[stage]))
            setattr(self, f'dropout{stage}', nn.Dropout(drop_rate))

        self.dense4 = nn.Linear(2048, num_targets)

        self.Relu = nn.ReLU(inplace=True)  # created but not used in forward()
        self.PReLU = nn.PReLU()

    def forward(self, inp):
        """Return logits for the batch ``inp``."""
        x = self.dropout0(self.BN0(inp))

        hidden_stages = (
            (self.dense1, self.BN1, self.dropout1),
            (self.dense2, self.BN2, self.dropout2),
            (self.dense3, self.BN3, self.dropout3),
        )
        for dense, norm, drop in hidden_stages:
            x = drop(norm(self.PReLU(dense(x))))

        return self.dense4(x)

# Seed-averaged predictions for MLP3: one CV run per seed, each contributing
# an equal share to `npy_pred3`.
oof_pred3 = pd.read_csv(f'{CACHE_PATH}/oof_pred.csv')[target_cols].values
npy_pred3 = np.zeros((len(data), len(target_cols)))
for seed in SEED:
    # Training takes (features, labels); inference takes features only.
    run_inputs = (
        dict(train_feat=data_feat, train_labels=data_labels, test_feat=None)
        if IS_TRAIN else
        dict(train_feat=None, train_labels=None, test_feat=data_feat)
    )
    seed_npy_pred = MultilabelStratifiedKFold_run(
        model_fn=Model,
        seed=seed,
        model_name='mlp',
        input_dim=len(feature_cols),
        output_dim=len(target_cols),
        nfolds=NFOLDS,
        verbose=1,
        isTrain=IS_TRAIN,
        **run_inputs,
    )
    npy_pred3 += seed_npy_pred / len(SEED)

In [None]:
# LSTM OOF log_loss:  0.014677774191343845 7seed7cv LB 0.01854
# Directory the LSTM fold weights and oof_pred.csv are read from.
CACHE_PATH = '../input/lstm-torch1116-weights'

class Model(nn.Module):
    """Bidirectional-LSTM classifier applied to a flat feature vector.

    Each sample is batch-normalised, wrapped as a length-1 sequence, run
    through a single bidirectional LSTM layer, pooled over the sequence axis
    and projected to ``num_targets`` outputs. No final activation is applied.

    Args:
        num_features: Size of the input feature vector.
        num_targets: Number of outputs produced by the final linear layer.
        hidden_size: Accepted for signature compatibility with the other
            models in this notebook; the value is overridden with 200 below.
    """

    def __init__(self, num_features, num_targets, hidden_size):
        super(Model, self).__init__()
        # The constructor argument is intentionally ignored: the LSTM width is
        # fixed at 200 (100 per direction).
        hidden_size = 200
        # NOTE: ``dropout`` only acts between stacked LSTM layers, so with the
        # default single layer it has no effect on the output (PyTorch warns
        # about this). It is left in place so construction stays identical.
        self.Lstm = nn.LSTM(num_features, hidden_size // 2, bidirectional=True, batch_first=True, dropout=0.2)

        self.batch_norm1 = nn.BatchNorm1d(num_features)

        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.4)
        self.dense3 = nn.Linear(hidden_size, num_targets)

        # Only RReLU and ``dropout`` are used in forward(). The remaining
        # activations are kept because nn.PReLU owns a learnable weight that
        # appears in the state_dict; presumably saved checkpoints contain it.
        self.Relu = nn.ReLU(inplace=True)
        self.PReLU = nn.PReLU()
        self.LeakyReLU = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        self.GeLU = nn.GELU()
        self.RReLU = nn.RReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return the ``dense3`` output for a ``(batch, num_features)`` input."""
        x = self.batch_norm1(x)
        # (batch, features) -> (batch, 1, features): one time step per sample.
        inp = x.unsqueeze(-2)

        self.Lstm.flatten_parameters()
        lstm, _ = self.Lstm(inp)

        # Max over the sequence axis. The previously computed mean-pool was
        # never consumed, so that dead computation has been dropped.
        max_pool, _ = torch.max(lstm, 1)

        pooled_output = self.dropout(max_pool)
        pooled_output = self.RReLU(pooled_output)

        x = self.batch_norm3(pooled_output)
        x = self.dropout3(x)
        x = self.dense3(x)

        return x

# Averaging on multiple SEEDS
# Out-of-fold predictions for this model come from the current CACHE_PATH.
oof_pred4 = pd.read_csv(f'{CACHE_PATH}/oof_pred.csv')[target_cols].values
npy_pred4 = np.zeros((len(data), len(target_cols)))

for seed in SEED:
    # Training hands the data over as train_feat/train_labels with no test
    # set; inference hands the same features over as test_feat instead.
    # NOTE(review): model_name stays 'mlp' even though this cell defines an
    # LSTM -- presumably it is part of the saved-weights naming; confirm
    # before changing it.
    seed_npy_pred = MultilabelStratifiedKFold_run(
        model_fn=Model,
        train_feat=data_feat if IS_TRAIN else None,
        train_labels=data_labels if IS_TRAIN else None,
        test_feat=None if IS_TRAIN else data_feat,
        seed=seed,
        model_name='mlp',
        input_dim=len(feature_cols),
        output_dim=len(target_cols),
        nfolds=NFOLDS,
        verbose=1,
        isTrain=IS_TRAIN,
    )
    # Each seed contributes an equal share of the averaged prediction.
    npy_pred4 += seed_npy_pred / len(SEED)

In [None]:
# CNN model. Recorded results: OOF log_loss 0.014720567819513208
# (7 seeds x 7 CV folds), LB 0.01862.
# Rebind CACHE_PATH to this model's directory: the OOF predictions for this
# model are read from f'{CACHE_PATH}/oof_pred.csv' later in this cell.
# NOTE(review): presumably MultilabelStratifiedKFold_run also resolves its
# saved weights through CACHE_PATH -- confirm against its definition.
CACHE_PATH = '../input/cnn-torch1120-weights'

class SpatialDropout(nn.Dropout2d):
    """Dropout that removes whole feature channels of an ``(N, T, K)`` tensor.

    The input is rearranged so that the last axis ``K`` sits in the channel
    position expected by ``nn.Dropout2d``; a dropped feature is therefore
    zeroed across the entire ``T`` axis of a sample.
    """

    def forward(self, x):
        # (N, T, K) -> (N, T, 1, K) -> (N, K, 1, T): K becomes the channel axis.
        channels_first = x.unsqueeze(2).transpose(1, 3)
        masked = super().forward(channels_first)
        # Undo the rearrangement: (N, K, 1, T) -> (N, T, 1, K) -> (N, T, K).
        return masked.transpose(1, 3).squeeze(2)

class SEModule(nn.Module):
    """Channel gate: scale ``x`` by a sigmoid of its pooled, convolved summary.

    The input is average-pooled to length 1, passed through ``conv0`` and a
    sigmoid, and the result is multiplied onto ``x`` (broadcast over the
    length axis).

    Args:
        in_channels: Channel count of the input tensor.
        out_channels: Channel count produced by ``conv0``; it has to broadcast
            against ``x`` in the final multiplication.
        kernel_size: Kernel size shared by all three convolutions.
        padding: Padding shared by all three convolutions.
        reduction: Channel reduction factor of the ``conv1``/``conv2`` pair.
    """

    def __init__(self, in_channels, out_channels, kernel_size, padding, reduction=2):
        super(SEModule, self).__init__()
        self.conv0 = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, padding=padding)
        # conv1/conv2 (a reduce-then-expand pair) are not used by forward();
        # they are kept so the module's parameters and their creation order
        # stay unchanged.
        self.conv1 = nn.Conv1d(in_channels, in_channels//reduction, kernel_size=kernel_size, padding=padding)
        self.conv2 = nn.Conv1d(in_channels//reduction, out_channels, kernel_size=kernel_size, padding=padding)

    def forward(self, x):
        """Return ``x`` scaled by the gate; the input tensor is left untouched."""
        s = F.adaptive_avg_pool1d(x, 1)
        s = self.conv0(s)
        # Multiply out of place: the former ``x *= ...`` overwrote the caller's
        # tensor and fails under autograd when ``x`` is a leaf requiring grad.
        return x * torch.sigmoid(s)

class Model(nn.Module):
    """Multi-kernel 1-D convolutional classifier over a flat feature vector.

    The features are treated as the channels of a length-1 signal. Four
    convolutions (kernel sizes 1, 3, 5 and 15) each produce 128 values per
    sample; these, together with the element-wise product of the first two
    branches, are concatenated, normalised and projected to ``num_targets``.

    ``hidden_size`` is accepted for signature compatibility and is not used.
    ``batch_norm1``, the fifth convolution group and the standalone
    activation/dropout modules are constructed but not used in ``forward``;
    they are kept so the module's parameters and their creation order stay
    unchanged.
    """

    def __init__(self, num_features, num_targets, hidden_size):
        super().__init__()

        conv_size = 128
        dropout_rate = 0.2

        def same_conv(kernel_size):
            # Odd kernel with padding kernel_size // 2 keeps the length as is.
            return nn.Conv1d(
                num_features,
                conv_size,
                kernel_size=kernel_size,
                stride=1,
                dilation=1,
                padding=kernel_size // 2,
                padding_mode='replicate',
            )

        self.batch_norm1 = nn.BatchNorm1d(num_features)

        # Modules are created in a fixed order (conv, norm, dropout per
        # group) so that weight initialisation consumes the RNG identically.
        self.conv1d_1 = same_conv(1)
        self.batch_norm_conv_1 = nn.BatchNorm1d(conv_size)
        self.dropout_conv_1 = nn.Dropout(dropout_rate)

        self.conv1d_2 = same_conv(3)
        self.batch_norm_conv_2 = nn.BatchNorm1d(conv_size)
        self.dropout_conv_2 = nn.Dropout(dropout_rate)

        self.conv1d_3 = same_conv(5)
        self.batch_norm_conv_3 = nn.BatchNorm1d(conv_size)
        self.dropout_conv_3 = nn.Dropout(dropout_rate)

        self.conv1d_4 = same_conv(15)
        self.batch_norm_conv_4 = nn.BatchNorm1d(conv_size)
        self.dropout_conv_4 = nn.Dropout(dropout_rate)

        self.conv1d_5 = same_conv(31)
        self.batch_norm_conv_5 = nn.BatchNorm1d(conv_size)
        self.dropout_conv_5 = nn.Dropout(dropout_rate)

        # Head over the five concatenated 128-wide blocks.
        self.batch_norm3 = nn.BatchNorm1d(conv_size * 5)
        self.dropout3 = nn.Dropout(0.5)
        self.dense3 = nn.Linear(conv_size * 5, num_targets)

        self.ReLU = nn.ReLU(inplace=True)
        self.PReLU = nn.PReLU()
        self.LeakyReLU = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        self.RReLU = nn.RReLU()
        self.dropout = nn.Dropout(0.2)
        self.spatial_dropout = SpatialDropout(0.2)

    def forward(self, x):
        """Return the ``dense3`` output for a ``(batch, num_features)`` input."""
        # (batch, features) -> (batch, features, 1): features act as channels.
        signal = x.unsqueeze(-1)

        branches = (
            (self.conv1d_1, self.batch_norm_conv_1, self.dropout_conv_1),
            (self.conv1d_2, self.batch_norm_conv_2, self.dropout_conv_2),
            (self.conv1d_3, self.batch_norm_conv_3, self.dropout_conv_3),
            (self.conv1d_4, self.batch_norm_conv_4, self.dropout_conv_4),
        )
        # Each branch: conv -> ReLU -> batch norm -> dropout, then drop the
        # trailing length-1 axis. Branches are evaluated in order 1..4.
        x1, x2, x3, x4 = [
            drop(norm(F.relu(conv(signal)))).squeeze(-1)
            for conv, norm, drop in branches
        ]

        # Multiplicative interaction between the two smallest kernels.
        cross1 = x1 * x2

        conv_output = torch.cat((x1, x2, x3, x4, cross1), 1)

        return self.dense3(self.dropout3(self.batch_norm3(conv_output)))

# Averaging on multiple SEEDS
# Out-of-fold predictions for this model come from the current CACHE_PATH.
oof_pred5 = pd.read_csv(f'{CACHE_PATH}/oof_pred.csv')[target_cols].values
npy_pred5 = np.zeros((len(data), len(target_cols)))

for seed in SEED:
    # Training hands the data over as train_feat/train_labels with no test
    # set; inference hands the same features over as test_feat instead.
    # NOTE(review): model_name stays 'mlp' even though this cell defines a
    # CNN -- presumably it is part of the saved-weights naming; confirm
    # before changing it.
    seed_npy_pred = MultilabelStratifiedKFold_run(
        model_fn=Model,
        train_feat=data_feat if IS_TRAIN else None,
        train_labels=data_labels if IS_TRAIN else None,
        test_feat=None if IS_TRAIN else data_feat,
        seed=seed,
        model_name='mlp',
        input_dim=len(feature_cols),
        output_dim=len(target_cols),
        nfolds=NFOLDS,
        verbose=1,
        isTrain=IS_TRAIN,
    )
    # Each seed contributes an equal share of the averaged prediction.
    npy_pred5 += seed_npy_pred / len(SEED)

In [None]:
# Weighted blend of the five models' out-of-fold predictions
# (weights 0.3 / 0.25 / 0.2 / 0.15 / 0.1).
oof_pred = (
    oof_pred1 * 0.3
    + oof_pred2 * 0.25
    + oof_pred3 * 0.2
    + oof_pred4 * 0.15
    + oof_pred5 * 0.1
)

train_targets_scored = pd.read_csv(f'{DATA_PATH}/train_targets_scored.csv')
y_true = train_targets_scored[target_cols].values

# Mean of the per-target log losses, accumulated one target column at a time.
score = 0
for true_col, pred_col in zip(y_true.T, oof_pred.T):
    score += log_loss(true_col, pred_col) / len(target_cols)

print("OOF log_loss: ", score)

In [None]:
# OOF log_loss:  0.014349538866134236

In [None]:
# Weighted blend of the five models' predictions
# (weights 0.3 / 0.25 / 0.2 / 0.15 / 0.1).
npy_pred = (
    npy_pred1 * 0.3
    + npy_pred2 * 0.25
    + npy_pred3 * 0.2
    + npy_pred4 * 0.15
    + npy_pred5 * 0.1
)

test_pred = pd.DataFrame(npy_pred, columns=target_cols)
test_pred[id_col] = data[id_col].values

# Left-join onto the sample submission so the output keeps exactly its rows;
# any target value missing after the join is written as 0.
# NOTE(review): the join hard-codes 'sig_id' while the column above is created
# as id_col, so this relies on id_col == 'sig_id'.
submission_ids = sample_submission.drop(columns=target_cols)
predicted = test_pred[['sig_id'] + target_cols]
sub = submission_ids.merge(predicted, on='sig_id', how='left').fillna(0)

sub.to_csv('submission.csv', index=False)
print(sub.shape)