In [None]:
# Turn off the Jedi backend of the IPython tab-completer.
%config Completer.use_jedi = False

In [None]:
# Load the autoreload extension; mode 2 re-imports all modules before each cell is executed,
# so edits to imported .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import platform

# On Windows, point the 'comspec' environment variable at PowerShell and echo the value back.
if platform.system() == 'Windows':
    os.environ.update(comspec='powershell')
    print(os.environ.get('comspec'))

In [None]:
# ! pip install kaggle --user
# ! kaggle competitions download -c lish-moa -p ./
# ! 7z x lish-moa.zip -olish-moa -y
# ! unzip -o lish-moa.zip -d lish-moa 

In [None]:
# Root directory of the competition CSV files; every read_csv call joins onto this path.
# The commented-out value is an alternative local directory.
# base_path = './lish-moa/'
base_path = '../input/lish-moa/'

In [None]:
# List the dataset directory as a sanity check.
# NOTE(review): the path is repeated literally here instead of reusing `base_path`; keep both in sync.
# ! ls 'lish-moa/'
! ls '../input/lish-moa/'

In [None]:
from typing import List, Union, Optional

import sys
import copy
import itertools
import contextlib
import concurrent
from collections import defaultdict

import numpy as np

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.model_selection import KFold

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

import torch
from torch.utils.data import TensorDataset, DataLoader

import catalyst.dl
from catalyst.runners import SupervisedRunner
from catalyst.callbacks.metric import BatchMetricCallback
from catalyst.metrics.accuracy import multi_label_accuracy

sys.path.append('./../input/moautils')
from utils import hyperopt_optimize, kfold_score, train, SimpleModel, _getMultiLabelAccuracyCallback

In [None]:
# cuDNN benchmark mode lets cuDNN pick algorithms by timing them; it is enabled here alongside fixed seeds.
torch.backends.cudnn.benchmark = True
torch.manual_seed(123)
torch.cuda.random.manual_seed(123)

# Set proper device for computations
dtype, cuda_device_id = torch.float32, 0

# Expose only the selected GPU to this process; an empty value hides every GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = '' if cuda_device_id is None else str(cuda_device_id)

# `device` is always a torch.device (previously it was a plain str on the CUDA branch and a
# torch.device on the CPU branch). The CUDA branch always uses index 0, as the original did.
if cuda_device_id is not None and torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print(dtype, device)

In [None]:
# Submission template file.
sample_submission = pd.read_csv(os.path.join(base_path, 'sample_submission.csv'))

In [None]:
# Training feature table (one row per 'sig_id').
train_features = pd.read_csv(os.path.join(base_path, 'train_features.csv'))

In [None]:
# Non-scored training targets.
train_targets_nonscored = pd.read_csv(os.path.join(base_path, 'train_targets_nonscored.csv'))

In [None]:
# Scored training targets; these become the label matrix `y` further down.
train_targets_scored = pd.read_csv(os.path.join(base_path, 'train_targets_scored.csv'))

In [None]:
# Test feature table used to produce the submission.
test_features = pd.read_csv(os.path.join(base_path, 'test_features.csv'))

In [None]:
# Mapping from 'sig_id' to 'drug_id'; used to build drug-grouped folds.
train_drug = pd.read_csv(os.path.join(base_path, 'train_drug.csv'))

In [None]:
# Preview 10 randomly sampled rows of each loaded frame (output differs between runs: no seed is passed).
sample_submission.sample(10)

In [None]:
train_features.sample(10)

In [None]:
train_targets_nonscored.sample(10)

In [None]:
train_targets_scored.sample(10)

In [None]:
train_drug.sample(10)

In [None]:
# Drug ids left out of the group-size statistics below.
# NOTE(review): the name suggests these are the placebo/control compounds ("placedo" is a typo
# for "placebo"); the name is kept because other cells reference it.
placedo_drug_id = ['cacb2b860', '292ab2c28']

# Number of rows per drug, without the excluded ids, with 'drug_id' restored as a regular column.
drug_group_sizes = (
    train_drug
    .groupby('drug_id')
    .count()
    .drop(index=placedo_drug_id)
    .reset_index()
)

# Distinct group sizes and how many drugs have each size.
np.unique(drug_group_sizes['sig_id'], return_counts=True)

In [None]:
class KFoldGroupStratifier:
    """K-fold splitter that keeps every row of a group (e.g. a drug) on the same side of a fold.

    The distinct group keys are split with ``KFold``; each fold of keys is then translated back
    into row positions of ``group_mapping``. Despite the class name, no label stratification is
    performed: folds are formed over group keys only.

    Parameters
    ----------
    shuffle : bool
        Forwarded to ``KFold``.
    n_splits : int
        Number of folds, forwarded to ``KFold``.
    group_mapping : pandas.DataFrame
        Frame with one row per sample containing the ``group_key`` and ``target_key`` columns.
        The yielded indices are positions in this frame.
    group_key : str
        Column holding the group identifier.
    target_key : str
        Column holding the sample identifier.
    skip_group_key : iterable
        Group keys to leave out entirely; their rows appear in neither train nor test indices.
    random_state : int or None
        Seed forwarded to ``KFold`` when ``shuffle`` is true, for reproducible folds.
    """

    def __init__(self, shuffle, n_splits, group_mapping, group_key='drug_id', target_key='sig_id',
                 skip_group_key=(), random_state=None):
        self.shuffle = shuffle
        self.n_splits = n_splits
        self.group_key = group_key
        self.target_key = target_key
        self.group_mapping = group_mapping
        self.skip_group_key = set(skip_group_key)
        self.random_state = random_state

        self.groups = group_mapping.groupby(group_key).groups
        self.groups_key = np.array([key for key in self.groups.keys() if key not in self.skip_group_key])

    def _row_positions(self, group_keys):
        """Return the positions in ``group_mapping`` of all samples belonging to ``group_keys``."""
        in_groups = self.group_mapping[self.group_key].isin(group_keys)
        sample_ids = self.group_mapping.loc[in_groups, self.target_key]
        return np.flatnonzero(self.group_mapping[self.target_key].isin(sample_ids).to_numpy())

    def split(self, *args, **kwargs):
        """Yield ``(train_indices, test_indices)`` position arrays, one pair per fold.

        Positional and keyword arguments are accepted and ignored. A fresh ``KFold`` is built on
        every call, so the splitter can be iterated more than once.
        """
        base_splitter = KFold(
            n_splits=self.n_splits,
            shuffle=self.shuffle,
            # KFold rejects a seed when shuffling is off, so only forward it when it has an effect.
            random_state=self.random_state if self.shuffle else None,
        )
        for base_train_indices, base_test_indices in base_splitter.split(self.groups_key):
            # Train first, test second: the order scikit-learn splitters use.
            yield (
                self._row_positions(self.groups_key[base_train_indices]),
                self._row_positions(self.groups_key[base_test_indices]),
            )

In [None]:
# Smoke-run of the grouped splitter: print the pair of index arrays it yields for each of the 7 folds.
fold_splitter = KFoldGroupStratifier(
    group_mapping=train_drug,
    n_splits=7,
    shuffle=True,
    skip_group_key=placedo_drug_id,
)
for train_indices, test_indices in fold_splitter.split(train_features):
    print(train_indices, test_indices)

In [None]:
# The frames are combined by row position later on, so their 'sig_id' columns must match row for row.
id_frame_pairs = (
    (train_drug, train_targets_scored),
    (train_features, train_targets_scored),
    (train_targets_scored, train_targets_nonscored),
)
for left_frame, right_frame in id_frame_pairs:
    ids_match = left_frame['sig_id'] == right_frame['sig_id']
    print('ID`s aligned: ', np.mean(ids_match) == 1.0)

In [None]:
# Scored and non-scored targets side by side; the second frame's 'sig_id' is dropped so the id appears once.
nonscored_without_id = train_targets_nonscored.drop(columns=['sig_id'])
train_all_targets = pd.concat([train_targets_scored, nonscored_without_id], axis=1)

In [None]:
# Histograms of the number of active labels per sample, one panel per target set.
# 'sig_id' is dropped before summing: a row-wise sum over a frame that still contains the string
# id column only works on pandas versions that silently discard non-numeric columns.
label_count_panels = [
    ('Non-scored targets', train_targets_nonscored.drop(columns=['sig_id']).sum(axis=1)),
    ('Scored targets', train_targets_scored.drop(columns=['sig_id']).sum(axis=1)),
    ('All targets', train_all_targets.drop(columns=['sig_id']).sum(axis=1)),
]

fig, axes = plt.subplots(1, 3, figsize=(21, 7))

for ax, (panel_title, labels_per_sample) in zip(axes, label_count_panels):
    sns.distplot(labels_per_sample, ax=ax, kde=False, bins=20)
    # Label each panel so the figure is readable on its own.
    ax.set(title=panel_title, xlabel='active labels per sample', ylabel='number of samples')

plt.show()

In [None]:
def _at_most_one_label(targets):
    """Return a boolean Series that is True for rows of `targets` with zero or one active label.

    'sig_id' is removed before summing so the row-wise sum only sees the numeric target columns
    (summing the string id column is only tolerated by pandas versions that drop it silently).
    """
    return targets.drop(columns=['sig_id'], errors='ignore').sum(axis=1) <= 1


def _report_label_share(mask, template):
    """Print the share, count and total of True entries in `mask` using `template`."""
    print(template.format(np.mean(mask), np.sum(mask), mask.shape[0]))


# Note: the "<= 1" test means "at most one label", so unlabeled rows are counted as well.
has_one_label_nonscored = _at_most_one_label(train_targets_nonscored)
_report_label_share(has_one_label_nonscored, 'Has one labels nonscored:\t{0:.4f}\t{1:d}/{2:d}')

has_one_label_scored = _at_most_one_label(train_targets_scored)
_report_label_share(has_one_label_scored, 'Has one labels scored:\t\t{0:.4f}\t{1:d}/{2:d}')

has_one_label_all = _at_most_one_label(train_all_targets)
_report_label_share(has_one_label_all, 'Has one labels all:\t\t{0:.4f}\t{1:d}/{2:d}')

In [None]:
# Columns that are one-hot encoded; everything else is used as a numeric feature as-is.
categorical_columns = ['cp_type', 'cp_time', 'cp_dose']

# dtype is pinned to float64: without it newer pandas emits boolean dummy columns, and a frame
# mixing float and bool columns converts to an object array that torch.tensor cannot consume.
train_encoded = pd.get_dummies(
    train_features.drop(['sig_id'], axis=1), columns=categorical_columns, dtype=np.float64
)
X = train_encoded.to_numpy()
y = train_targets_scored.drop(['sig_id'], axis=1).to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

subX = pd.get_dummies(
    test_features.drop(['sig_id'], axis=1), columns=categorical_columns, dtype=np.float64
)
# Train and test are encoded independently, so force the test frame onto the training column
# layout: a category missing from the test set becomes an all-zero column instead of shifting
# every following feature out of position.
subX = subX.reindex(columns=train_encoded.columns, fill_value=0.0)

In [None]:
# Both loaders share the same settings: batches of 1024 in fixed order (no shuffling), 6 worker processes.
_loader_options = dict(batch_size=1024, shuffle=False, num_workers=6)

# Full training set (features and targets) as float32 tensors.
train_ds = TensorDataset(
    torch.tensor(X, dtype=torch.float32),
    torch.tensor(y, dtype=torch.float32),
)
train_dl = DataLoader(train_ds, **_loader_options)

# Test features only; each batch is a 1-tuple.
sub_ds = TensorDataset(torch.tensor(subX.to_numpy(), dtype=torch.float32))
sub_dl = DataLoader(sub_ds, **_loader_options)

In [None]:
# Configuration bundle handed to the training helpers imported from `utils`.
# Each 'class'/'kwargs' pair names a component and its constructor arguments.
# NOTE(review): how the helpers consume these entries is defined in `utils`, not in this notebook.
training_kwargs = dict(
    dataloader=dict(batch_size=128, num_workers=0),
    model={
        'class': SimpleModel,
        # Input and output widths follow the feature matrix and the label matrix.
        'kwargs': dict(n_in=X.shape[1], n_classes=y.shape[1], hidden_size=2048, dropout_rate=0.5),
    },
    loss={
        'class': torch.nn.BCEWithLogitsLoss,
        'kwargs': dict(),
    },
    optimizer={
        'class': torch.optim.Adam,
        'kwargs': dict(lr=1e-2, weight_decay=1e-5),
    },
    scheduler={
        'class': torch.optim.lr_scheduler.ReduceLROnPlateau,
        'kwargs': dict(factor=0.1, patience=3, eps=1e-4),
    },
    catalyst=dict(
        callbacks=[
            # _getMultiLabelAccuracyCallback  (currently disabled)
        ],
        num_epochs=50,
        verbose=False,
        logdir='./logs',
        load_best_on_end=True,
    ),
    dtype=dtype,
    cuda_device_id=0,
)

In [None]:
# test_dl, train_dl, model, optimizer, scheduler, runner, _, _ = train(
#     train_data=(X, y), test_data=(X_test[:2], y_test[:2]), training_kwargs=training_kwargs
# )

In [None]:
# Drug-grouped fold generator; no drug ids are skipped here (empty skip collection).
batch_generator = KFoldGroupStratifier(
    group_mapping=train_drug, n_splits=7, shuffle=True, skip_group_key={}
)
# batch_generator = None

# Metrics evaluated on raw model outputs (logits) against the 0/1 targets.
fold_metrics = {
    # Mean binary cross-entropy computed from logits.
    'Loss': lambda pred, y: torch.nn.functional.binary_cross_entropy_with_logits(pred, y),
    # Share of individual label decisions (threshold 0.5 on the sigmoid) that match the target.
    'MultiLabelAccuracy': lambda pred, y: ((torch.sigmoid(pred) > 0.5) == y).to(torch.float32).mean(),
}

# NOTE(review): the generator above is built with n_splits=7 while n_splits=5 is passed here;
# which value kfold_score honours when a batch_generator is supplied is decided inside `utils` - confirm.
results, results_mean, models = kfold_score(
    data=(X, y),
    training_kwargs=training_kwargs,
    metrics=fold_metrics,
    n_splits=5,
    get_models=True,
    batch_generator=batch_generator,
)

In [None]:
# Display the second value returned by kfold_score (presumably the metrics averaged over folds - confirm in `utils`).
results_mean

In [None]:
# Score the averaged ensemble on the full training set: summed binary cross-entropy of the mean
# predicted probability, divided by the number of (sample, label) entries.
if not models:
    raise ValueError('`models` is empty: nothing to evaluate')

train_final_score = 0.0
n_scored_entries = 0

# Move every model to the target device and switch it to eval mode once, not once per batch.
ensemble = [member.to(dtype=dtype, device=device) for member in models]
for member in ensemble:
    member.eval()

with torch.no_grad():
    # Loop variables are deliberately not called `x`/`y`: `y` is the notebook-level target array
    # and must not be overwritten by the last batch.
    for batch_features, batch_targets in train_dl:
        batch_features = batch_features.to(device=device, dtype=dtype)

        preds = None
        for member in ensemble:
            member_probs = torch.sigmoid(member(batch_features).cpu())
            preds = member_probs if preds is None else preds + member_probs

        train_final_score += torch.nn.BCELoss(reduction='sum')(preds / len(ensemble), batch_targets).item()
        n_scored_entries += batch_targets.numel()

train_final_score /= n_scored_entries
print('Final Score: {0:.4f}'.format(train_final_score))

In [None]:
# Prediction matrix: one row per submission row, one column per target (all columns but the first).
n_sub_rows, n_sub_targets = sample_submission.shape[0], sample_submission.shape[1] - 1
sub_result = np.zeros([n_sub_rows, n_sub_targets])

with torch.no_grad():
    for model in models:
        model = model.to(dtype=dtype, device=device)
        model.eval()

        # Batches arrive in dataset order, so a running offset maps each batch onto its rows.
        row_start = 0
        for (batch, ) in sub_dl:
            row_stop = row_start + batch.shape[0]
            logits = model(batch.to(device=device, dtype=dtype)).cpu()
            sub_result[row_start:row_stop] += torch.sigmoid(logits).numpy()
            row_start = row_stop

# Turn the per-model sum of probabilities into the ensemble mean.
sub_result /= len(models)

In [None]:
# Overwrite the predictions of rows whose one-hot 'cp_type_ctl_vehicle' flag is set with zeros
# (presumably because control-vehicle samples carry no positive targets - confirm).
is_control_vehicle = (subX['cp_type_ctl_vehicle'] == 1).to_numpy()
sub_result[is_control_vehicle] = 0.000

In [None]:
# Write submission.csv: 'sig_id' followed by one probability column per target, 4 decimals each.
# The target column names come from the submission template instead of a hand-maintained header
# literal; the prediction matrix was sized from that same template, so the widths agree.
# NOTE(review): assumes the template's columns after the first are the target names in the
# required order, as in the previously hard-coded header - confirm against sample_submission.csv.
submission = pd.DataFrame(sub_result, columns=sample_submission.columns[1:])

# Ids are taken from test_features, row for row (insert raises if the row counts differ).
submission.insert(0, 'sig_id', test_features['sig_id'].to_numpy())

submission.to_csv('submission.csv', index=False, float_format='%.4f')

In [None]:
# Read the file that was just written back in and display it as a final sanity check.
pd.read_csv('submission.csv')