https://www.kaggle.com/gogo827jz/rapids-svm-on-gpu-6000-models-in-1-hour

In [None]:
# Workaround for "OSError: [Errno 28] No space left on device"
# https://www.kaggle.com/getting-started/45288
%env JOBLIB_TEMP_FOLDER=/tmp

# Even with this setting the run still fails: at most 20GB of output files can be saved, so 5 seeds fail. One seed writes roughly 8GB of output.
# https://www.kaggle.com/product-feedback/155538
# https://www.kaggle.com/docs/notebooks#technical-specifications

In [None]:
# Offline installation of RAPIDS (cuML) from a Kaggle input dataset.
import warnings, sys
# Silence every warning for the rest of the notebook.
warnings.filterwarnings("ignore")

# Thanks to Chris's RAPIDS dataset, it only takes around 1 min to install offline
# Copy the RAPIDS 0.15.0 archive next to the other conda environments and unpack it there.
!cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment's directories to the import search path.
# Each statement prepends, so the final search order is:
# lib, lib/python3.7, lib/python3.7/site-packages, then the previous sys.path.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost.so into the base conda lib directory.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

# Importing cuml and printing its version confirms the installation is usable.
import cuml
print('RAPIDS',cuml.__version__)

In [None]:
import os
import gc
import pickle
import joblib
import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from tqdm.notebook import tqdm
from time import time

In [None]:
import os
import random as rn
import numpy as np


def set_seed(seed=0):
    """Seed the random number generators used by this notebook.

    Exports ``seed`` as the ``PYTHONHASHSEED`` environment variable, then
    seeds Python's ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int, default 0
        Value passed to every seeding call.
    """
    os.environ["PYTHONHASHSEED"] = f"{seed}"

    # Same order as before: the stdlib generator first, NumPy second.
    for seeder in (rn.seed, np.random.seed):
        seeder(seed)

In [None]:
from sklearn.metrics import log_loss


def score(Y, Y_pred):
    """Return the mean column-wise binary log loss.

    Parameters
    ----------
    Y : pandas.DataFrame
        True binary targets, one column per class.
    Y_pred : pandas.DataFrame
        Predicted probabilities, with columns in the same positions as ``Y``.

    Returns
    -------
    float
        Average over all columns of ``log_loss`` evaluated with
        ``labels=[0, 1]``. Passing the labels lets columns that contain a
        single class still be scored.
    """
    n_targets = Y.shape[1]

    per_target = [
        log_loss(Y.iloc[:, k], Y_pred.iloc[:, k], labels=[0, 1])
        for k in range(n_targets)
    ]

    return np.mean(per_target)

In [None]:
import sys
sys.path.append('../input/iterativestratification')

import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold


class MultilabelGroupStratifiedKFold(_BaseKFold):
    """Multilabel-stratified K-fold that keeps "regular" groups in one fold.

    Groups containing exactly 6, 12 or 18 rows are assigned to a fold as a
    whole, stratified on the group's mean target vector. Rows that belong to
    any other group are stratified individually on their own targets.
    """

    def __init__(self, n_splits=5, random_state=None, shuffle=False):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    def _iter_test_indices(self, X=None, Y=None, groups=None):
        """Yield the positional test indices of each fold in turn.

        Parameters
        ----------
        X : ignored
        Y : pandas.DataFrame
            Multilabel targets, indexed like ``groups``.
        groups : pandas.Series
            Group label of every row.

        Yields
        ------
        numpy.ndarray
            Positions of the rows assigned to fold ``0 .. n_splits - 1``.
        """
        splitter = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        # Partition the group labels by group size.
        group_sizes = groups.value_counts()
        is_regular = group_sizes.isin([6, 12, 18])
        regular_index = group_sizes[is_regular].index.sort_values()
        irregular_index = group_sizes[~is_regular].index.sort_values()

        # Regular groups: one row per group (mean targets), one fold per group.
        group_targets = Y.groupby(groups).mean().loc[regular_index]
        group_to_fold = {
            group: fold
            for fold, (_, test_pos) in enumerate(
                splitter.split(group_targets, group_targets)
            )
            for group in group_targets.index[test_pos]
        }

        # Rows of irregular groups: every sample gets its own fold.
        loose_targets = Y.loc[groups.isin(irregular_index)]
        sample_to_fold = {
            sample: fold
            for fold, (_, test_pos) in enumerate(
                splitter.split(loose_targets, loose_targets)
            )
            for sample in loose_targets.index[test_pos]
        }

        # Rows of regular groups inherit the group's fold. The rest are NaN
        # after the map and are filled from the per-sample assignment.
        folds = groups.map(group_to_fold)
        unassigned = folds.isna()
        folds[unassigned] = folds[unassigned].index.map(sample_to_fold).values

        for fold in range(self.n_splits):
            yield np.where(folds == fold)[0]

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


class ClippedFeatures(BaseEstimator, TransformerMixin):
    """Clip each column to quantile bounds learned during ``fit``.

    Parameters
    ----------
    copy : bool, default True
        If True, ``transform`` works on a copy of its input. Otherwise the
        input frame is clipped in place.
    high : float, default 0.99
        Quantile used as the upper clipping bound.
    low : float, default 0.01
        Quantile used as the lower clipping bound.
    """

    def __init__(self, copy=True, high=0.99, low=0.01):
        self.copy = copy
        self.high = high
        self.low = low

    def fit(self, X, y=None):
        """Store the ``low`` and ``high`` quantiles of ``X`` and return self."""
        self.data_max_ = X.quantile(q=self.high)
        self.data_min_ = X.quantile(q=self.low)

        return self

    def transform(self, X):
        """Return ``X`` with every column clipped to the fitted bounds."""
        clipped = X.copy() if self.copy else X

        # axis=1 aligns the fitted bounds with the columns of the frame.
        clipped.clip(
            lower=self.data_min_, upper=self.data_max_, axis=1, inplace=True
        )

        return clipped

In [None]:
#import pandas as pd
#
#
#def compute_row_statistics(X, prefix=""):
#    Xt = pd.DataFrame()
#
#    for agg_func in [
#        # "min",
#        # "max",
#        "mean",
#        "std",
#        "kurtosis",
#        "skew",
#    ]:
#        Xt[f"{prefix}{agg_func}"] = X.agg(agg_func, axis=1)
#
#    return Xt

# Data Preparation

In [None]:
# Load the competition's training tables, all indexed by `sig_id`.
dtype = {"cp_type": "category", "cp_dose": "category"}
index_col = "sig_id"

train_features = pd.read_csv(
    "../input/lish-moa/train_features.csv", dtype=dtype, index_col=index_col
)
# Only the numeric columns are used as model inputs; the categorical ones
# (`cp_type`, `cp_dose`) stay available on `train_features`.
X = train_features.select_dtypes("number")
Y_nonscored = pd.read_csv(
    "../input/lish-moa/train_targets_nonscored.csv", index_col=index_col
)
Y = pd.read_csv("../input/lish-moa/train_targets_scored.csv", index_col=index_col)
# `read_csv(squeeze=True)` is deprecated since pandas 1.4 and removed in 2.0.
# Squeezing the single remaining column after the read gives the same Series
# on every pandas version.
groups = pd.read_csv(
    "../input/lish-moa/train_drug.csv", index_col=index_col
).squeeze("columns")

# Target columns to train on and report (narrowed when DEBUG is enabled).
columns = Y.columns

In [None]:
# Learn the clipping bounds on the training features, then clip them.
clipped_features = ClippedFeatures()
X = clipped_features.fit(X).transform(X)

# Persist the fitted clipper so the same bounds can be applied to the test set.
with open("clipped_features.pkl", mode="wb") as fh:
    pickle.dump(clipped_features, fh)
# For the sake of ensembling, neither row statistics nor the nonscored targets are added
#c_prefix = "c-"
#g_prefix = "g-"
#c_columns = X.columns.str.startswith(c_prefix)
#g_columns = X.columns.str.startswith(g_prefix)
#X_c = compute_row_statistics(X.loc[:, c_columns], prefix=c_prefix)
#X_g = compute_row_statistics(X.loc[:, g_columns], prefix=g_prefix)
#X = pd.concat([X, X_c, X_g], axis=1)

In [None]:
# Run configuration: seeds to train in this session and the number of CV folds.
# The alternative seed lists below are kept so the seeds can be split across runs.
seeds = [0, 1]
#seeds = [2, 3]
#seeds = [4]
n_splits = 5

# DEBUG restricts training to a handful of targets and a single seed.
#DEBUG = True
DEBUG = False
if DEBUG:
    columns = [
        "atp-sensitive_potassium_channel_antagonist",  # only 1 positive label
        "erbb2_inhibitor",  # only 1 positive label
        "antiarrhythmic",  # only 6 positive labels
        "aldehyde_dehydrogenase_inhibitor",  # only 7 positive labels
#        "lipase_inhibitor",  # only 12 positive labels
#        "sphingosine_receptor_agonist",  # only 25 positive labels
#        "igf-1_inhibitor",  # only 37 positive labels
#        "potassium_channel_activator",  # only 55 positive labels
#        "potassium_channel_antagonist",  # only 98 positive labels
#        "dopamine_receptor_agonist",  # only 121 positive labels
#        "nfkb_inhibitor",  # 832 positive labels
#        "cyclooxygenase_inhibitor",  # 435 positive labels
#        "dna_inhibitor",  # 402 positive labels
#        "glutamate_receptor_antagonist",  # 367 positive labels
#        "tubulin_inhibitor",  # 316 positive labels
#        "pdgfr_inhibitor",  # 297 positive labels
#        "calcium_channel_blocker",  # 281 positive labels
        "flt3_inhibitor",  # 279 positive labels
        "progesterone_receptor_agonist",  # 119 positive labels
        "hdac_inhibitor",  # 106 positive labels
    ]
    # Keep only the selected targets so later cells train on the reduced set.
    Y = Y[columns]

    seeds = [0]
    n_splits = 5
    print(f"DEBUG: {DEBUG}")

In [None]:
# Dataset dimensions: rows/features of X and the width of each target table.
train_size, n_features = X.shape
n_classes_nonscored = Y_nonscored.shape[1]
n_classes = Y.shape[1]

# CuML SVM Models

In [None]:
# from sklearn.svm import SVC, SVR
from cuml.svm import SVC, SVR


# Train one binary SVM per (seed, fold, target) and collect out-of-fold (OOF)
# predictions averaged over the seeds.
#
# The OOF matrix is accumulated positionally in a float NumPy array.
# `Y_pred[col][val_idx] += ...` is chained indexing: it only writes back to the
# frame through pandas' legacy item cache and is silently lost under
# Copy-on-Write. It also relied on an implicit int -> float upcast.
oof = np.zeros(Y.shape, dtype=float)
n_seeds = len(seeds)

# One row per (seed, fold): number of positive training labels of every target.
counts = []
for i in tqdm(seeds):
    print(f"------------ seed:{i} ------------")
    set_seed(seed=i)

    cv = MultilabelGroupStratifiedKFold(n_splits=n_splits, random_state=i, shuffle=True)

    for j, (trn_idx, val_idx) in enumerate(cv.split(X, Y, groups)):
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        Y_train_targets = Y.iloc[trn_idx].values

        targets_counts = []

        for tar, tar_col in enumerate(Y.columns):
            Y_train = Y_train_targets[:, tar]
            n_positive = Y_train.sum()

            if n_positive >= 5:
                model = SVC(C=10, cache_size=2000, probability=True)
                model.fit(X_train, Y_train)
                oof[val_idx, tar] += model.predict_proba(X_val)[:, 1] / n_seeds

                # The test-prediction cell reloads models by this exact file name.
                joblib.dump(model, f"model_seed_{i}_fold_{j}_{tar_col}.jlb", compress=True)
            else:
                # Fewer than 5 positives in the training fold: no model is
                # trained and the training-fold positive rate is used instead.
                oof[val_idx, tar] += Y_train.mean() / n_seeds

            targets_counts.append(n_positive)

        counts.append(targets_counts)

counts = np.array(counts)
assert counts.shape == (
    len(seeds) * n_splits,
    n_classes,
), f"countsのshapeおかしい. {counts.shape}"

Y_pred = pd.DataFrame(oof, index=Y.index, columns=Y.columns)
# Rows with cp_type == "ctl_vehicle" are forced to zero for every target.
Y_pred[train_features["cp_type"] == "ctl_vehicle"] = 0.0

with open("counts.pkl", "wb") as f:
    pickle.dump(counts, f)

with open("Y_pred.pkl", "wb") as f:
    pickle.dump(Y_pred[columns], f)

score(Y[columns], Y_pred[columns])

# pkl check

In [None]:
# Read back the per-fold positive-label counts to check the pickle round-trips.
path = r"counts.pkl"
with open(path, mode="rb") as fh:
    counts = pickle.load(fh)
counts

In [None]:
# Read back the out-of-fold predictions to check the pickle round-trips.
path = r"Y_pred.pkl"
with open(path, mode="rb") as fh:
    Y_pred = pickle.load(fh)
Y_pred

# predict test

In [None]:
# Load the test features and clip them with the bounds fitted on the training set.
test_features = pd.read_csv(
    "../input/lish-moa/test_features.csv", dtype=dtype, index_col=index_col
)
X_test = test_features.select_dtypes("number")

with open("./clipped_features.pkl", "rb") as fh:
    clipped_features = pickle.load(fh)
X_test = clipped_features.transform(X_test)

# Every (seed, fold) pair contributes with equal weight to the test ensemble.
n_models = len(seeds) * n_splits
test_acc = np.zeros((X_test.shape[0], len(columns)))

for i in seeds:
    for j in range(n_splits):
        for tar, tar_col in enumerate(Y.columns):
            m_path = f"model_seed_{i}_fold_{j}_{tar_col}.jlb"

            if os.path.exists(m_path):
                model = joblib.load(m_path)
                test_acc[:, tar] += model.predict_proba(X_test)[:, 1] / n_models
            else:
                # No model file was saved for this (seed, fold, target), so the
                # mean of the target's Y_pred column is used as a constant.
                # NOTE(review): training falls back to the training-fold mean
                # of the labels instead; confirm this difference is intended.
                test_acc[:, tar] += Y_pred.iloc[:, tar].mean() / n_models

Y_test_pred = pd.DataFrame(test_acc, columns=columns, index=test_features.index)
# Rows with cp_type == "ctl_vehicle" are forced to zero for every target.
Y_test_pred[test_features["cp_type"] == "ctl_vehicle"] = 0.0

print(Y_test_pred.shape)
display(Y_test_pred)

Y_test_pred.to_csv("submission.csv")