# Соревнование "Mechanisms of Action (MoA) Prediction"

### Александр Чернышёв

## Импорт библиотек

In [None]:
import os
import sys
import time
import platform
import random
import tempfile
from argparse import Namespace
from pathlib import Path

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from catboost import CatBoostClassifier

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
# import torchsummary

## Устанавливаем сиды для генераторов случайных чисел

In [None]:
# dtype = torch.float32
# device = torch.device('cuda:0')
seed = 123456


def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and puts cuDNN into deterministic, non-autotuning mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only influences hash randomisation of Python processes started after
    # this point; the running interpreter has already fixed its hash seed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU rather than only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=seed)

## Считываем данные

In [None]:
# Data lives in ./data/ when running on Windows; otherwise it is read from
# ../input/lish-moa/ (presumably the Kaggle input directory).
running_on_windows = 'windows' in platform.platform().lower()
data_path = '../input/lish-moa/' if not running_on_windows else './data/'

In [None]:
# Load the three competition tables from `data_path` in one pass.
train_features, test_features, train_targets_scored = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train_features.csv', 'test_features.csv', 'train_targets_scored.csv')
)

In [None]:
train_features.info()

In [None]:
train_features.shape, train_targets_scored.shape, test_features.shape

In [None]:
train_features

## Обработка случая `cp_type == 'ctl_vehicle'`

In [None]:
# Fraction of training rows whose cp_type is 'ctl_vehicle'.
(train_features.cp_type == 'ctl_vehicle').mean()

In [None]:
# Display the training rows whose cp_type is 'ctl_vehicle'.
train_features[train_features.cp_type == 'ctl_vehicle']

## Предобработка данных

In [None]:
def prepare_data(features, targets=None, OHE=True, scaler=StandardScaler(), apply_scaling=False):
    """Build model inputs from a raw feature table.

    Rows with ``cp_type == 'ctl_vehicle'`` are removed from the returned
    features; their positions are reported through the returned mask.

    Args:
        features: Raw feature frame containing ``sig_id``, ``cp_type`` and
            ``cp_dose`` columns. It is not modified.
        targets: Optional target frame with a ``sig_id`` column. When given,
            ``scaler`` is (re)fitted on the float64 columns of ``features``
            (all rows, control rows included).
        OHE: If true, one-hot encode ``cp_type`` and ``cp_dose``.
        scaler: Scaler fitted during calls that pass ``targets``. NOTE: the
            default instance is created once, when the function is defined,
            and is shared by every call that does not pass its own scaler.
            That sharing is what lets a later call without ``targets`` reuse
            statistics fitted by an earlier call with ``targets``.
        apply_scaling: If true, the float64 columns of the returned features
            are transformed with ``scaler``. Defaults to ``False`` because
            the previous implementation scaled a frame that was never
            returned, so callers have always received unscaled values.

    Returns:
        ``(features_enc, zero_moa_mask)`` when ``targets`` is ``None``,
        otherwise ``(features_enc, targets_enc, zero_moa_mask)``.
        ``targets_enc`` keeps ALL rows (control rows are not removed); index
        it with ``~zero_moa_mask`` to align it with ``features_enc``.
    """
    zero_moa_mask = (features.cp_type == 'ctl_vehicle').values

    float_mask = features.dtypes == 'float64'
    if targets is not None:
        scaler.fit(features.loc[:, float_mask])

    # Explicit copy so that the assignment below never touches the caller's frame.
    features_enc = features[~zero_moa_mask].copy()
    if apply_scaling:
        features_enc.loc[:, float_mask] = scaler.transform(features_enc.loc[:, float_mask])

    # TODO: train and test may end up encoded differently here, because
    # get_dummies only creates columns for categories present in the given frame.
    if OHE:
        features_enc = pd.get_dummies(features_enc, columns=['cp_type', 'cp_dose'])
    features_enc = features_enc.drop(columns=['sig_id'])

    if targets is None:
        return features_enc, zero_moa_mask

    targets_enc = targets.drop(columns=['sig_id'])
    return features_enc, targets_enc, zero_moa_mask

In [None]:
# OHE=False keeps cp_type / cp_dose as raw categorical columns.
(
    train_features_enc,
    train_targets_scored_enc,
    train_zero_moa_mask_enc,
) = prepare_data(train_features, train_targets_scored, OHE=False)

train_features_enc

## Градиентные бустинги

### Обучение по фолдам

In [None]:
# Number of cross-validation folds; also the number of models_kfold_<k>
# directories read when the saved models are loaded.
n_folds = 7

In [None]:
def train_GB(x_tr, y_tr, x_val, y_val, models_path=None, models_path_exist_ok=False, iterations=20, verbose=None):
    """Train one CatBoost binary classifier per target column and save each one.

    Args:
        x_tr: Training features. The first three columns are passed to
            CatBoost as categorical features.
        y_tr: Training targets, one column per target.
        x_val: Validation features used as CatBoost's ``eval_set``.
        y_val: Validation targets with the same columns as ``y_tr``.
        models_path: Directory the models are written to, one file per target
            named after the target. When ``None``, a fresh ``models-*``
            directory is created in the current working directory.
        models_path_exist_ok: Whether an already existing ``models_path`` is
            acceptable. Ignored when ``models_path`` is ``None``.
        iterations: Number of boosting iterations per classifier.
        verbose: Forwarded to CatBoost; a truthy value also enables
            per-target progress printing.

    Returns:
        Dict mapping target name to its fitted ``CatBoostClassifier``.
        Targets with at most one distinct value in ``y_tr`` are skipped and
        therefore absent from the dict.

    Raises:
        FileExistsError: If ``models_path`` exists and
            ``models_path_exist_ok`` is false.
    """
    start_time = time.time()

    if models_path is None:
        # mkdtemp returns a plain path string and leaves the directory in
        # place; a TemporaryDirectory object is not path-like and removes the
        # directory as soon as the object is cleaned up.
        models_path = tempfile.mkdtemp(prefix='models-', dir='./')
        models_path_exist_ok = True
    print(f'Models path is {models_path}')

    Path(models_path).mkdir(parents=True, exist_ok=models_path_exist_ok)

    all_cls = {}

    all_targets = y_tr.columns.values.tolist()
    for i, target in enumerate(tqdm(all_targets)):
        if verbose:
            print('\t', i + 1, target)

        # A classifier cannot be fitted on a single-class target.
        if np.unique(y_tr[target]).shape[0] <= 1:
            print(f'Target {target} skipped')
            continue

        cls = CatBoostClassifier(iterations=iterations, loss_function='Logloss',
                                 task_type='GPU', devices='0', verbose=verbose)

        cls.fit(x_tr, y_tr[target], cat_features=[0, 1, 2], eval_set=(x_val, y_val[target]))

        cls.save_model(f'{models_path}/{target}')

        all_cls[target] = cls

        cur_time = time.time()
        if verbose:
            # Average wall-clock time per target processed so far.
            print(f'\tLEARNING TIME: {(cur_time - start_time) / (i + 1):0.1f} sec')

    end_time = time.time()

    print(f'Total running time: {end_time - start_time:0.1f} sec')

    return all_cls

In [None]:
def train_all_GBs(first_fold=4):
    """Train the per-target classifiers for each cross-validation fold.

    Uses the module-level ``train_features_enc``, ``train_targets_scored_enc``,
    ``train_zero_moa_mask_enc``, ``n_folds`` and ``seed``. Models of fold
    ``k`` (1-based) are written to ``models_kfold_<k>/``.

    Args:
        first_fold: Zero-based index of the first fold to train. Earlier
            folds are skipped. The default of 4 keeps the previous hard-coded
            behaviour (presumably a resume after folds 1-4 had been trained);
            pass 0 to train every fold.

    Returns:
        List with one ``{target: classifier}`` dict per trained fold, in fold
        order.
    """
    kfold = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    # train_features_enc no longer contains the control rows, so the targets
    # must be filtered the same way before positional fold indices are used.
    targets = train_targets_scored_enc[~train_zero_moa_mask_enc]

    folds = list(kfold.split(train_features_enc, targets))[first_fold:]

    all_cls = []

    for i, (train, val) in enumerate(tqdm(folds), start=first_fold):
        print()
        print(f'\tFold {i + 1}')
        x_tr = train_features_enc.iloc[train]
        y_tr = targets.iloc[train]
        x_val = train_features_enc.iloc[val]
        y_val = targets.iloc[val]

        cur_all_cls = train_GB(x_tr, y_tr, x_val, y_val, models_path=f'models_kfold_{i + 1}/', verbose=False)
        all_cls.append(cur_all_cls)

    return all_cls

In [None]:
# all_cls = train_all_GBs()

### Загрузка моделей

In [None]:
all_models_path = '../input/model-gb-with-cv-for-moa-prediction'

# Build one {model file name -> loaded classifier} dict per fold, in fold order.
all_cls = []
for fold_idx in range(n_folds):
    models_path = f'{all_models_path}/models_kfold_{fold_idx + 1}/'
    all_cls.append({
        model_name: CatBoostClassifier().load_model(f'{models_path}/{model_name}')
        for model_name in os.listdir(models_path)
    })

### Предсказания

In [None]:
def predict(all_cls, test_features):
    """Average the per-fold classifiers into a submission table.

    Args:
        all_cls: List of ``{target: classifier}`` dicts, one per fold. A
            target may be missing from some or all dicts.
        test_features: Raw test feature frame; it is passed through
            ``prepare_data`` with ``OHE=False``.

    Returns:
        DataFrame read from ``sample_submission.csv`` (under the module-level
        ``data_path``) with every target column filled in: the mean
        positive-class probability over the folds that have a model for the
        target, and ``1e-15`` for rows with ``cp_type == 'ctl_vehicle'``.
        Rows are assumed to be in the same order as ``test_features``.
    """
    # Value written where no prediction is made.
    min_prob = 1e-15

    test_features_enc, test_zero_moa_mask_enc = prepare_data(test_features, OHE=False)

    answer = pd.read_csv(data_path + 'sample_submission.csv')

    for target in tqdm(answer.iloc[:, 1:].columns.values):
        probs = [
            cur_all_cls[target].predict_proba(test_features_enc)[:, 1]
            for cur_all_cls in all_cls
            if target in cur_all_cls
        ]

        if probs:
            answer.loc[~test_zero_moa_mask_enc, target] = np.mean(probs, axis=0)
        else:
            # No fold has a model for this target; averaging an empty list
            # would write NaN into the submission.
            # NOTE(review): assumes such targets were skipped in training
            # because they were all-zero - confirm.
            answer.loc[~test_zero_moa_mask_enc, target] = min_prob

    answer.iloc[test_zero_moa_mask_enc, 1:] = min_prob

    return answer

In [None]:
answer = predict(all_cls, test_features)

In [None]:
answer.to_csv('submission.csv', index=False)

In [None]:
### Обучение моделей

# models_path = 'models-3'

# Path(models_path).mkdir(parents=True, exist_ok=True)

# all_cls = {}
# for model_path in os.listdir(models_path):
#     target = model_path[8:]
#     model_path = f'{models_path}/{model_path}'
#     cls = CatBoostClassifier().load_model(model_path)
#     all_cls[target] = cls

# all_targets = train_targets_scored_tr.columns.values

# TRAIN = False

# if TRAIN:
#     start_time = time.time()

#     all_cls = {}

#     for i, target in tqdm(enumerate(all_targets[len(all_cls):])):
#         print('\t', i + 1, target)

#         cls = CatBoostClassifier(iterations=20, loss_function='Logloss', task_type="GPU", devices='0')
#         cls.fit(train_features_tr, train_targets_scored_tr[target], cat_features=[0, 1, 2],
#                 eval_set=(train_features_val, train_targets_scored_val[target]))
#         cls.save_model(f'{models_path}/CBC_for_{target}')
# #         all_cls.append(cls)
#         all_cls[target] = cls

#         cur_time = time.time()
#         print(f'\tLEARNING TIME: {(cur_time - start_time) / (i + 1):0.1f} sec')

#     end_time = time.time()

#     print(end_time - start_time)