# Соревнование "Mechanisms of Action (MoA) Prediction"

### Александр Чернышёв

## Импорт библиотек

In [None]:
import os
import sys
import time
import copy
import functools
import platform
import random
import tempfile
from argparse import Namespace
from pathlib import Path

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from catboost import CatBoostClassifier

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
# import torchsummary

## Устанавливаем сиды для генераторов случайных чисел

In [None]:
dtype = torch.float32
# Use the first GPU when one is available; otherwise fall back to the CPU so the
# notebook can still be executed (slowly) on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
seed = 123456


def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic mode with autotuning disabled.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # NOTE: this only influences interpreters started after this point;
    # the hash seed of the current process is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels between runs, so keep it off.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=seed)

## Считываем данные

In [None]:
# On a Windows host the CSV files are read from ./data/; everywhere else they
# are read from ../input/lish-moa/.
if 'windows' in platform.platform().lower():
    data_path = './data/'
else:
    data_path = '../input/lish-moa/'

In [None]:
# Read the three input tables from data_path; the file names are listed in the
# same order as the variables they are unpacked into.
train_features, test_features, train_targets_scored = (
    pd.read_csv(data_path + file_name)
    for file_name in ('train_features.csv', 'test_features.csv', 'train_targets_scored.csv')
)

In [None]:
# Print the pandas summary (column dtypes, non-null counts, memory usage) of the training features.
train_features.info()

In [None]:
# Notebook display: (rows, columns) of the training features, training targets and test features.
train_features.shape, train_targets_scored.shape, test_features.shape

In [None]:
# Notebook display: preview of the raw training feature table.
train_features

## Класс датасета для обучения нейронных сетей

In [None]:
class MoADataset(Dataset):
    """In-memory dataset that pairs feature rows with target rows.

    Args:
        dtype: torch dtype used for both the feature and the target tensor.
        features: array-like of features, or a pandas DataFrame/Series.
        targets: array-like or pandas object with targets. When ``None`` a
            vector of ``-1`` placeholders (one per feature row) is stored,
            which is what inference uses.
        feature_columns: optional column selection; applied only when
            ``features`` is a pandas object.
    """

    def __init__(self, dtype, features, targets=None, feature_columns=None):
        pandas_types = (pd.DataFrame, pd.Series)

        self.dtype = dtype
        self.feature_columns = feature_columns

        if isinstance(features, pandas_types):
            selected = features if feature_columns is None else features[feature_columns]
            features = selected.values
        self.features = torch.tensor(features, dtype=dtype)

        if targets is None:
            # Dummy target so that __getitem__ always yields an (x, y) pair.
            targets = -np.ones(self.features.shape[0])
        elif isinstance(targets, pandas_types):
            targets = targets.values
        self.targets = torch.tensor(targets, dtype=dtype)

    def __getitem__(self, i):
        sample, label = self.features[i], self.targets[i]
        return sample, label

    def __len__(self):
        return self.features.size(0)

## Предобработка данных

In [None]:
def prepare_data(features, targets=None, OHE=True, scale=True, scaler=StandardScaler()):
    """Turn a raw feature table (and optionally its targets) into NumPy arrays.

    Rows whose ``cp_type`` equals ``'ctl_vehicle'`` are dropped before any
    further processing.

    Args:
        features: DataFrame with at least ``sig_id``, ``cp_type``, ``cp_time``
            and ``cp_dose`` columns.
        targets: optional DataFrame with a ``sig_id`` column, row-aligned with
            ``features``. Its presence also decides whether ``scaler`` is fitted.
        OHE: when true, ``cp_type`` and ``cp_dose`` are replaced by 0/1 floats.
        scale: when true, ``cp_time`` is cast to float and every float64 column
            is passed through ``scaler``.
        scaler: scaler object. The default instance is created once, when the
            function is defined, and is therefore shared by all calls: a call
            with ``targets`` fits it and a later call without ``targets``
            reuses that fit.

    Returns:
        ``(feature_values, target_values)`` when ``targets`` is given,
        otherwise ``(feature_values, zero_moa_mask)`` where the mask is a
        boolean array over the rows of the *input* ``features`` marking the
        dropped ``'ctl_vehicle'`` rows.
    """
    is_control = (features.cp_type == 'ctl_vehicle').values
    keep_rows = ~is_control
    prepared = features[keep_rows].copy()

    if scale:
        prepared['cp_time'] = prepared['cp_time'].astype(float)
        numeric_columns = prepared.dtypes == 'float64'
        if targets is not None:
            # Only a call that carries targets (i.e. training data) fits the scaler.
            scaler.fit(prepared.loc[:, numeric_columns])
        prepared.loc[:, numeric_columns] = scaler.transform(prepared.loc[:, numeric_columns])

    # TODO: train and test might end up being processed differently here!
    if OHE:
        prepared['cp_type'] = (prepared['cp_type'] == 'ctl_vehicle').astype(float)
        prepared['cp_dose'] = (prepared['cp_dose'] == 'D2').astype(float)

    feature_values = prepared.drop(columns=['sig_id']).values

    if targets is None:
        return feature_values, is_control

    target_values = targets.copy()[keep_rows].drop(columns=['sig_id']).values
    return feature_values, target_values

In [None]:
### Разбивать надо сначала, а потом уже обрабатывать

# train_features_tr, train_features_val, train_targets_scored_tr, train_targets_scored_val = \
#     train_test_split(train_features, train_targets_scored, test_size=0.2, random_state=seed, shuffle=True)


# train_features_tr, train_targets_scored_tr, train_zero_moa_mask_tr = \
#     prepare_data(train_features_tr, train_targets_scored_tr, OHE=False)
# train_features_val, train_targets_scored_val, train_zero_moa_mask_val = \
#     prepare_data(train_features_val, train_targets_scored_val, OHE=False)
# train_features_enc, train_targets_scored_enc, train_zero_moa_mask_enc = \
#     prepare_data(train_features, train_targets_scored, OHE=False)

# test_features_enc, test_zero_moa_mask_enc = prepare_data(test_features, OHE=False)


# for i, target in enumerate(train_targets_scored_tr.columns):
#     assert train_targets_scored_tr[target].unique().shape[0] == 2, (i, target)


# train_features_tr

In [None]:
# Training data: targets are supplied, so prepare_data also fits the scaler here.
train_features_enc, train_targets_scored_enc = prepare_data(
    features=train_features,
    targets=train_targets_scored,
)

# Test data: no targets, so the second return value is the 'ctl_vehicle' row mask.
test_features_enc, test_zero_moa_mask = prepare_data(features=test_features, OHE=True)


# Notebook display of the encoded training feature matrix.
train_features_enc

In [None]:
### OLD CODE ###

# def prepare_data(features, targets=None):
#     # TODO: Здесь могут быть проблемы с тем, что тест и трейн обработаются по-разному!
#     features_enc = pd.get_dummies(features, columns=['cp_type', 'cp_dose']).drop(columns=['sig_id'])
# #     feature_columns = features_enc.drop(columns=['sig_id']).columns.values

#     if targets is None:
#         return features_enc  # , feature_columns

#     targets_enc = targets.drop(columns=['sig_id'])  # .columns.values
#     return features_enc, targets_enc  # , feature_columns, target_columns

# train_features_enc, train_targets_scored_enc = prepare_data(train_features, train_targets_scored)
# test_features_enc = prepare_data(test_features)

# (
#     train_features_tr, train_features_val,
#     train_targets_scored_tr, train_targets_scored_val
# ) = train_test_split(train_features_enc, train_targets_scored_enc, test_size=0.2,
#                      random_state=seed, shuffle=True)

## Нейронные сети

### Создание датасетов и даталоадеров

In [None]:
# Value passed as `num_workers` to every DataLoader created in this notebook.
num_workers = 0

# TODO: write a prediction function (the cp_type == 'ctl_vehicle' case has to be handled properly)

# test_dataset = MoADataset(dtype, test_features_enc)
# test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

# train_dataset = MoADataset(dtype, train_features_tr, train_targets_scored_tr)
# val_dataset = MoADataset(dtype, train_features_val, train_targets_scored_val)

# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
# val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

### Модель

In [None]:
class MoAModel(nn.Module):
    """Feed-forward network that maps a feature vector to one logit per target.

    The stack is: input batch-norm and dropout(0.2), then two hidden blocks of
    weight-normalised Linear -> ReLU -> BatchNorm1d -> Dropout (widths
    ``2 * num_hidden_features`` and ``num_hidden_features``), then a
    weight-normalised output Linear. No activation is applied to the output.

    Args:
        device: device the layers are moved to and inputs are sent to.
        dtype: dtype inputs are cast to in ``forward``.
        num_in_features: size of the input vector.
        num_hidden_features: width of the second hidden layer (the first one
            is twice as wide).
        num_out_features: number of output logits.
        dropout_rate: dropout probability used after both hidden blocks.
    """

    def __init__(self, device, dtype, num_in_features, num_hidden_features, num_out_features, dropout_rate=0.5):
        super().__init__()

        self.device = device
        self.dtype = dtype

        wide = 2 * num_hidden_features
        narrow = num_hidden_features

        def dense(n_in, n_out):
            # Linear layer wrapped in weight normalisation.
            return nn.utils.weight_norm(nn.Linear(n_in, n_out))

        # Layers are created in the same order in which they are applied.
        layers = [
            nn.BatchNorm1d(num_in_features),
            nn.Dropout(0.2),
            dense(num_in_features, wide),
            nn.ReLU(),
            nn.BatchNorm1d(wide),
            nn.Dropout(dropout_rate),

            dense(wide, narrow),
            nn.ReLU(),
            nn.BatchNorm1d(narrow),
            nn.Dropout(dropout_rate),

            dense(narrow, num_out_features),
        ]
        self.net = nn.Sequential(*layers).to(self.device)

    def forward(self, x):
        """Move ``x`` to the model's device/dtype and return the raw logits."""
        inputs = x.to(self.device, self.dtype)
        return self.net(inputs)

### Функции для обучения и вывода процесса обучения на экран

In [None]:
def print_results(cur_results, mode, cur_iter, print_every):
    """Print one progress line for the current epoch or iteration.

    Args:
        cur_results: object exposing ``epoch`` (zero-based) and, for the
            'train'/'val' modes, the ``train_loss`` / ``val_loss`` sequences.
        mode: ``'train'``, ``'test'`` or anything else (treated as validation).
        cur_iter: iteration index inside the epoch.
        print_every: the string ``'summary'`` to print the mean loss of the
            epoch, or an integer N to print the latest loss on every N-th
            iteration (nothing is printed on the other iterations).
    """
    is_summary = print_every == 'summary'
    if not is_summary and cur_iter % print_every != 0:
        return

    if is_summary:
        prefix = (
            f'Summary: epoch {cur_results.epoch + 1:3}, '
            f'mode {mode:6}, '
        )
    else:
        prefix = (
            f'Epoch {cur_results.epoch + 1:3}, '
            f'mode {mode:6}, '
            f'iter {cur_iter:5}, '
        )
    print(prefix, end='')

    # Test mode has no loss to report, so only the line is terminated.
    if mode == 'test':
        print()
        return

    history = cur_results.train_loss if mode == 'train' else cur_results.val_loss
    shown = np.mean(history) if is_summary else history[-1]
    print(
        f'loss {shown:12.5f}, '
    )

In [None]:
@functools.total_ordering
class ModelWithScore:
    """A deep-copied snapshot of a model, ordered by the score it achieved.

    Instances compare by ``score`` only, which allows ``min``/``max``/sorting
    over a list of snapshots. Comparing with an object of another type yields
    ``NotImplemented`` instead of raising ``AttributeError``.

    Args:
        score: comparable value attached to the snapshot.
        model: object to snapshot; it is deep-copied so that later changes to
            the original do not affect the stored copy.
    """

    # Equality is based on a mutable attribute, so instances are deliberately
    # unhashable (this is also what defining __eq__ alone implies).
    __hash__ = None

    def __init__(self, score, model):
        self.score = score
        self.model = copy.deepcopy(model)

    def __eq__(self, other):
        if not isinstance(other, ModelWithScore):
            return NotImplemented
        return self.score == other.score

    def __lt__(self, other):
        if not isinstance(other, ModelWithScore):
            return NotImplemented
        return self.score < other.score

    def __repr__(self):
        return f'Model of type "{type(self.model).__name__}" with score {self.score}'

# ModelWithScore(0.1, [1, 2, 3]) < ModelWithScore(0.5, [1, 2, 3, 4, 5])
# ModelWithScore(0.1, [1, 2, 3]) == ModelWithScore(0.1, [1, 2, 3, 4, 5])

In [None]:
def train_NN(model, criterion, optimizer, lr_scheduler,
             max_epoch, print_every,
             train_dataloader, val_dataloader, test_features_enc, test_zero_moa_mask):
    """Train ``model`` for ``max_epoch`` epochs and collect the training history.

    After every epoch the model is evaluated on ``val_dataloader``, a snapshot
    is stored, only the three snapshots with the lowest validation loss are
    kept, and test predictions are produced with ``predict``.

    Args:
        model: network to train (updated in place).
        criterion: loss called as ``criterion(scores, targets)``.
        optimizer: optimizer built over ``model``'s parameters.
        lr_scheduler: stored in the result only.
            NOTE(review): it is never stepped here (the step call was already
            disabled in the original code), so the learning rate stays constant.
        max_epoch: number of epochs to run.
        print_every: stored in the result only; it does not affect the output.
        train_dataloader: yields ``(x, y)`` training batches.
        val_dataloader: yields ``(x, y)`` validation batches.
        test_features_enc: encoded test features forwarded to ``predict``.
        test_zero_moa_mask: mask forwarded to ``predict`` as ``zero_moa_mask``.

    Returns:
        ``argparse.Namespace`` holding the per-epoch lists ``epochs``,
        ``train_loss``, ``val_loss`` and ``test_preds``, the list
        ``best_models`` (at most three ``ModelWithScore`` snapshots) and the
        arguments this function was called with.
    """
    # History
    results = Namespace(
        epochs=[],
        train_loss=[],
        val_loss=[],
        test_preds=[],
        best_models=[],
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        max_epoch=max_epoch,
        # Taken from the optimizer rather than from a module-level `lr`, so the
        # function does not depend on a global being defined and in sync.
        init_lr=optimizer.param_groups[0]['lr'],
        print_every=print_every,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        test_features_enc=test_features_enc
    )

    for epoch in tqdm(range(max_epoch), desc='Epoch'):
        results.epochs.append(epoch)

        # Training
        model.train()

        train_loss = 0
        for x, y in train_dataloader:
            optimizer.zero_grad()

            scores = model(x)
            # Targets follow the model output, whatever device the model lives on.
            loss = criterion(scores, y.to(scores.device))

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
        train_loss /= len(train_dataloader)
        results.train_loss.append(train_loss)

        # Validation
        model.eval()

        val_loss = 0
        with torch.no_grad():
            for x, y in val_dataloader:
                scores = model(x)
                val_loss += criterion(scores, y.to(scores.device)).item()
        val_loss /= len(val_dataloader)
        results.val_loss.append(val_loss)

        # Keep only the three snapshots with the lowest validation loss.
        results.best_models.append(ModelWithScore(val_loss, model))
        if len(results.best_models) > 3:
            results.best_models.remove(max(results.best_models))

        print(f'Epoch {epoch + 1:2}, train loss {train_loss:7.5f}, val loss {val_loss:7.5f}')

        # Test predictions
        test_preds = predict(model, test_features_enc=test_features_enc, zero_moa_mask=test_zero_moa_mask)
        results.test_preds.append(test_preds)

    return results

### Функция предсказания модели

In [None]:
def predict(model, test_features=None, test_features_enc=None, zero_moa_mask=None):
    """Build a submission-shaped DataFrame with the model's probabilities.

    Args:
        model: network returning logits; it is switched to eval mode.
        test_features: raw test feature table. When given, it is encoded with
            ``prepare_data`` and overrides the next two arguments.
        test_features_enc: already encoded test features.
        zero_moa_mask: boolean mask over the rows of the submission file that
            marks the rows which were excluded from ``test_features_enc``.

    Returns:
        The contents of ``sample_submission.csv`` where, from the second
        column on, unmasked rows hold sigmoid probabilities and masked rows
        hold zeros.
    """
    if test_features is not None:
        test_features_enc, zero_moa_mask = prepare_data(test_features, OHE=True)

    loader = DataLoader(
        MoADataset(dtype, test_features_enc),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )

    model.eval()
    batch_probs = []
    with torch.no_grad():
        # The dataset yields a dummy target for inference; it is ignored.
        for batch, _ in loader:
            batch_probs.append(torch.sigmoid(model(batch)).cpu())
    probs = torch.cat(batch_probs)

    answer = pd.read_csv(data_path + 'sample_submission.csv')
    answer.iloc[~zero_moa_mask, 1:] = probs
    answer.iloc[zero_moa_mask, 1:] = 0
    return answer

### Константы для датасета

In [None]:
# Network input/output sizes are taken from the number of columns of the encoded arrays.
num_in_features = train_features_enc.shape[1]
num_out_features = train_targets_scored_enc.shape[1]

# Hidden width handed to MoAModel as `num_hidden_features`.
num_hidden_features = 512 * 2 # 1024

In [None]:
# model = MoAModel('cpu', dtype, num_in_features, num_hidden_features, num_out_features)
# predict(model, test_features)
# # torchsummary.summary(model);

In [None]:
# Passed through to train_NN as `print_every`.
print_every = 5

# Number of training epochs per fold and the Adam learning rate used by create_model.
max_epoch = 45
lr = 1e-3

# Binary cross-entropy on raw logits; the sigmoid is applied separately in predict().
criterion = nn.BCEWithLogitsLoss()

In [None]:
def create_model():
    """Create a fresh model together with its optimizer and LR scheduler.

    Uses the module-level ``device``, ``dtype``, layer sizes and ``lr``.

    Returns:
        Tuple ``(model, optimizer, lr_scheduler)``: a ``MoAModel``, an Adam
        optimizer with weight decay 1e-5 and a ``StepLR`` scheduler
        (step size 26, gamma 0.1) bound to that optimizer.
    """
    net = MoAModel(
        device=device,
        dtype=dtype,
        num_in_features=num_in_features,
        num_hidden_features=num_hidden_features,
        num_out_features=num_out_features,
    )
    adam = optim.Adam(net.parameters(), lr=lr, weight_decay=1e-5)
    step_lr = optim.lr_scheduler.StepLR(adam, step_size=26, gamma=0.1)
    return net, adam, step_lr

In [None]:
# model = MoAModel(device, dtype, num_in_features, num_hidden_features, num_out_features)

# optimizer = optim.Adam(model.parameters(), lr=lr)
# lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=1/3)

### Обучение по фолдам

In [None]:
# Batch size (128) used by every DataLoader in this notebook.
batch_size = 2 ** 7

# Number of cross-validation folds; also the number of saved files read back at inference time.
n_folds = 7

In [None]:
def train_NN_by_folds(n_folds, models_path):
    """Train one network per cross-validation fold and save its best snapshots.

    The module-level ``train_features_enc`` / ``train_targets_scored_enc``
    arrays are split with ``MultilabelStratifiedKFold``. For every fold a new
    model is created and trained with ``train_NN``, and the ``best_models``
    list of that run is written to
    ``{models_path}/best_models_kfold_{fold}.npy``.

    Args:
        n_folds: number of folds.
        models_path: output directory. It is created here and must not exist
            yet, so earlier results cannot be overwritten by accident.

    Returns:
        List with the model of every fold in its state after the last epoch.
        (The original built this list but discarded it.)

    Raises:
        FileExistsError: if ``models_path`` already exists.
    """
    Path(models_path).mkdir(parents=True, exist_ok=False)

    # NOTE: the split uses its own fixed random_state rather than the global seed.
    kfold = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=517)

    models = []

    for i, (train, val) in enumerate(tqdm(
        kfold.split(train_features_enc, train_targets_scored_enc),
        total=kfold.n_splits
    )):
        print()
        print(f'\tFold {i + 1}')
        x_tr = train_features_enc[train]
        y_tr = train_targets_scored_enc[train]
        x_val = train_features_enc[val]
        y_val = train_targets_scored_enc[val]

        train_dataset = MoADataset(dtype, x_tr, y_tr)
        val_dataset = MoADataset(dtype, x_val, y_val)

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

        model, optimizer, lr_scheduler = create_model()

        result = train_NN(model, criterion, optimizer, lr_scheduler,
                          max_epoch, print_every,
                          train_dataloader, val_dataloader, test_features_enc, test_zero_moa_mask)

        # The snapshots are arbitrary Python objects, so NumPy stores them pickled.
        np.save(f'{models_path}/best_models_kfold_{i + 1}', result.best_models)

        models.append(model)

    return models

In [None]:
# Directory that holds the saved best_models_kfold_*.npy files read by the inference loop.
models_path = '../input/nn-top3ensemble-with-cv7/nn-models/02/'  # 'nn-models/01/'

# Training is disabled here; uncomment to retrain (models_path must then be a new, writable directory).
# train_NN_by_folds(n_folds, models_path)

In [None]:
# Collect the test predictions of every saved snapshot of every fold.
all_preds = []

for i in tqdm(range(n_folds)):
    # SECURITY NOTE: allow_pickle=True unpickles arbitrary objects; only load files from a trusted source.
    results = np.load(f'{models_path}/best_models_kfold_{i + 1}.npy', allow_pickle=True)

    for best_model in results:
        preds = predict(best_model.model, test_features_enc=test_features_enc, zero_moa_mask=test_zero_moa_mask)
        # Drop the first (id) column and keep only the numeric prediction matrix.
        all_preds.append(preds.iloc[:, 1:].values)

In [None]:
# Reuse the last prediction frame as a template (it carries the id column and the header) ...
answer = preds.copy()
# ... and overwrite its prediction columns with the element-wise mean over all collected predictions.
answer.iloc[:, 1:] = np.array(all_preds).mean(axis=0)

In [None]:
# Notebook display: preview of the averaged submission table.
answer

In [None]:
# Write the submission file without the DataFrame index column.
answer.to_csv('submission.csv', index=False)