In [None]:
import os
import time
import platform
import random
from argparse import Namespace
from pathlib import Path

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
# import torchsummary

In [None]:
# Global run configuration shared by the rest of the notebook.
dtype = torch.float32
# Fall back to the CPU so the notebook still runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
seed = 12345


def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and asks cuDNN for deterministic kernels.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    random.seed(seed)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this only
    # affects Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels between runs; keep it off.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=seed)

In [None]:
# Pick the dataset directory: a local ./data/ folder when the platform
# string mentions Windows, otherwise the ../input/lish-moa/ directory.
if 'windows' in platform.platform().lower():
    data_path = './data/'
else:
    data_path = '../input/lish-moa/'

In [None]:
# Load the raw competition tables; `data_path` already ends with a slash.
train_features = pd.read_csv(f'{data_path}train_features.csv')
test_features = pd.read_csv(f'{data_path}test_features.csv')
train_targets_scored = pd.read_csv(f'{data_path}train_targets_scored.csv')

In [None]:
# Print pandas' concise summary of the training feature frame.
train_features.info()

In [None]:
# Show the shapes of the three loaded frames side by side as the cell output.
train_features.shape, train_targets_scored.shape, test_features.shape

In [None]:
# Preview only the first rows instead of rendering the whole frame.
train_features.head()

In [None]:
def prepare_data(features, targets=None, OHE=True, categories=None):
    """Drop the ``sig_id`` identifier and optionally one-hot encode categoricals.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table. Must contain ``sig_id`` and, when ``OHE`` is true,
        the ``cp_type`` and ``cp_dose`` columns.
    targets : pandas.DataFrame, optional
        Target table containing ``sig_id``. When given, it is returned with
        that column removed.
    OHE : bool, default True
        Whether to one-hot encode ``cp_type`` and ``cp_dose`` with
        ``pd.get_dummies``.
    categories : dict, optional
        Mapping ``column name -> list of all category values``. When given,
        the listed columns are cast to a categorical with exactly these
        categories before encoding, so separately processed frames (e.g.
        train and test) get identical dummy columns even if a value is
        missing from one of them. Without it, each call only produces
        columns for the values present in that frame, so train and test can
        end up with different columns.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        ``features_enc`` alone when ``targets`` is ``None``, otherwise
        ``(features_enc, targets_enc)``. The inputs are not modified.
    """
    ohe_columns = ['cp_type', 'cp_dose']

    features_enc = features
    if OHE:
        if categories:
            # Pin the category set so unobserved values still get a column.
            pinned = {
                column: pd.Categorical(features_enc[column], categories=categories[column])
                for column in ohe_columns
                if column in categories
            }
            features_enc = features_enc.assign(**pinned)  # assign() returns a copy
        features_enc = pd.get_dummies(features_enc, columns=ohe_columns)
    features_enc = features_enc.drop(columns=['sig_id'])

    if targets is None:
        return features_enc

    targets_enc = targets.drop(columns=['sig_id'])
    return features_enc, targets_enc

In [None]:
# Strip the id column from every table; one-hot encoding is switched off,
# presumably because CatBoost is handed the categorical columns directly
# through `cat_features` -- confirm.
train_features_enc, train_targets_scored_enc = prepare_data(
    features=train_features,
    targets=train_targets_scored,
    OHE=False,
)
test_features_enc = prepare_data(features=test_features, OHE=False)

In [None]:
# Hold out 20% of the rows for validation; features and targets are split
# together so their rows stay aligned, and the split is seeded.
train_features_tr, train_features_val, train_targets_scored_tr, train_targets_scored_val = train_test_split(
    train_features_enc,
    train_targets_scored_enc,
    test_size=0.2,
    shuffle=True,
    random_state=seed,
)

In [None]:
# Sanity check: every target column of the training split must contain
# exactly two distinct values, i.e. both classes are present.
for target in train_targets_scored_tr.columns:
    n_unique = train_targets_scored_tr[target].unique().shape[0]
    assert n_unique == 2, f'{target}: expected 2 distinct values, found {n_unique}'

In [None]:
# Preview only the first rows of the training split instead of the whole frame.
train_features_tr.head()

In [None]:
# Make sure the local "models" directory exists; the training cell saves
# its CatBoost models under models/. Safe to re-run (exist_ok=True).
Path("models").mkdir(parents=True, exist_ok=True)

In [None]:
# Directory the prediction loop lists and loads saved models from.
# NOTE(review): the training cell saves to the local 'models/' directory,
# not here, so models trained in this run are not the ones loaded for
# prediction -- confirm this is intended.
models_path = '../input/model-gb-for-moa-prediction/'

In [None]:
# Switch for the training cell below: when False the `if TRAIN:` block is
# skipped and no models are fitted in this run.
TRAIN = False

In [None]:
if TRAIN:
    # Fit one binary CatBoost classifier per target column and save each
    # one to models/CBC_for_<target>.
    start_time = time.time()

    all_cls = []

    # Slicing by len(all_cls) skips targets that already have a model in
    # the list. tqdm wraps the columns (not the enumerate object) so the
    # progress bar knows the total number of targets.
    for i, target in enumerate(tqdm(train_targets_scored_tr.columns[len(all_cls):])):
        print('\t', i + 1, target)

        cls = CatBoostClassifier(iterations=20, loss_function='Logloss', task_type="GPU", devices='0')
        # NOTE: fitting uses the full encoded training set, not the
        # train/validation split. cat_features holds positional column
        # indices -- presumably cp_type, cp_time and cp_dose once sig_id
        # is dropped; confirm against the feature frame.
        cls.fit(train_features_enc, train_targets_scored_enc[target], cat_features=[0, 1, 2])#, eval_set=(train_features_val, train_targets_scored_val[target]))
        cls.save_model(f'models/CBC_for_{target}')
        all_cls.append(cls)

        # Running average of wall-clock seconds per target fitted so far.
        cur_time = time.time()
        print(f'\tLEARNING TIME: {(cur_time - start_time) / (i + 1):0.1f} sec')

    end_time = time.time()

In [None]:
# end_time - start_time

In [None]:
# Start from the sample submission; the prediction loop overwrites its
# target columns and the result is written out as submission.csv.
answer = pd.read_csv(data_path + 'sample_submission.csv')
# Preview only the first rows instead of rendering the whole frame.
answer.head()

In [None]:
# Preview the target names recovered from the saved model file names by
# stripping the 8-character 'CBC_for_' prefix. This lists the directory
# itself, so it does not depend on a loop variable that only exists after
# the prediction cell has run.
[file_name[8:] for file_name in sorted(os.listdir(models_path))][:5]

In [None]:
# from sklearn.metrics import log_loss

# all_preds = []

# File-name prefix of saved models; must match the name passed to
# save_model in the training cell.
MODEL_PREFIX = 'CBC_for_'

# Only consider files that follow the naming scheme, in a deterministic
# order, so stray files in the directory are never loaded as models or
# written into the submission as bogus columns.
model_files = sorted(
    file_name for file_name in os.listdir(models_path)
    if file_name.startswith(MODEL_PREFIX)
)

for model_path in tqdm(model_files):
    # The target name is whatever follows the prefix in the file name.
    target = model_path[len(MODEL_PREFIX):]

    cls = CatBoostClassifier().load_model(os.path.join(models_path, model_path))

    # Column 1 of predict_proba is taken as the positive-class probability.
    y_pred = cls.predict_proba(test_features_enc)[:, 1]
    answer[target] = y_pred


# for i, target in tqdm(enumerate(train_targets_scored_tr.columns)):
#     cls = all_cls[i]

#     all_preds.append(y_pred[:, 1])
#     y_true = train_targets_scored_val[target]
#     loss = log_loss(y_true, y_pred, labels=[0, 1])
# 
#     print(f'{i + 1:3} {target:50} {loss:0.6f}')

In [None]:
# Write the filled-in submission frame to submission.csv without the
# DataFrame index column.
answer.to_csv('submission.csv', index=False)

In [None]:
# all_preds = np.array(all_preds)
# all_preds = all_preds.T

# all_preds.shape

In [None]:
# all_preds[all_preds < 1e-15] = 1e-15
# all_preds[all_preds > 1 - 1e-15] = 1 - 1e-15

In [None]:
# def log_loss_my(y_true, y_pred):
#     LL = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
#     return -LL.mean()

# log_loss_my(train_targets_scored_val.values, all_preds)

# target_losses = []

# for i, target in tqdm(enumerate(train_targets_scored_tr.columns)):
#     y_pred = all_preds[:, i]
#     y_true = train_targets_scored_val[target]
#     loss = log_loss_my(y_true.values, y_pred)
    
#     target_losses.append([target, loss])

#     if loss > 0.02:
#         print(f'{i + 1:3} {target:50} {loss:0.6f}')

In [None]:
# target_losses = pd.DataFrame(target_losses, columns=['target', 'loss'])

In [None]:
# # target_losses
# sorted_target_losses = target_losses.sort_values('loss', ascending=False)

In [None]:
# sorted_target_losses.head(50)

In [None]:
# all_cls[1].feature_importances_

In [None]:
# answer.iloc[:, 1:] = all_preds
# answer