In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath /kaggle/working.
for folder, _subfolders, files in os.walk('/kaggle/working'):
    for file_name in files:
        full_path = os.path.join(folder, file_name)
        print(full_path)
        

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.metrics import log_loss
import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.backend import clear_session

In [None]:

'''All code come from : https://github.com/trent-b/iterative-stratification'''

"""This file includes multilabel cross validators based on an implementation of
the Iterative Stratification algorithm described in the following paper:
Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-
Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M. (eds)
Machine Learning and Knowledge Discovery in Databases. ECML PKDD 2011. Lecture
Notes in Computer Science, vol 6913. Springer, Berlin, Heidelberg.

From scikit-learn 0.19.0, StratifiedKFold, RepeatedStratifiedKFold, and
StratifiedShuffleSplit were copied and modified, retaining compatibility
with scikit-learn.

Attribution to authors of scikit-learn/model_selection/_split.py under BSD 3 clause:
    Alexandre Gramfort <alexandre.gramfort@inria.fr>,
    Gael Varoquaux <gael.varoquaux@normalesup.org>,
    Olivier Grisel <olivier.grisel@ensta.org>,
    Raghav RV <rvraghav93@gmail.com>
"""

# Author: Trent J. Bradberry <trentjason@hotmail.com>
# License: BSD 3 clause

import numpy as np

from sklearn.utils import check_random_state
from sklearn.utils.validation import _num_samples, check_array
from sklearn.utils.multiclass import type_of_target

from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, \
    BaseShuffleSplit, _validate_shuffle_split


def IterativeStratification(labels, r, random_state):
    """Assign every sample to a fold with the Iterative Stratification algorithm.

    Implements the algorithm described in the following paper:
    Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of
    Multi-Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M.
    (eds) Machine Learning and Knowledge Discovery in Databases. ECML PKDD
    2011. Lecture Notes in Computer Science, vol 6913. Springer, Berlin,
    Heidelberg.

    Parameters
    ----------
    labels : 2-D array, one row per sample and one column per label
        It is summed per column and also used as a boolean mask over the
        label columns, so it is presumably a boolean indicator matrix (the
        callers in this file cast ``y`` with ``dtype=bool`` first).
    r : 1-D array of float
        Desired proportion of the samples for each fold; its length is the
        number of folds.
    random_state : object exposing ``choice``
        Only used to break ties (callers pass the result of
        ``check_random_state``).

    Returns
    -------
    test_folds : ndarray of int, shape (n_samples,)
        Index of the fold each sample was assigned to.
    """

    n_samples = labels.shape[0]
    test_folds = np.zeros(n_samples, dtype=int)

    # Calculate the desired number of examples at each subset
    c_folds = r * n_samples

    # Calculate the desired number of examples of each label at each subset
    c_folds_labels = np.outer(r, labels.sum(axis=0))

    # True for every sample that has not been assigned to a fold yet
    labels_not_processed_mask = np.ones(n_samples, dtype=bool)

    while np.any(labels_not_processed_mask):
        # Find the label with the fewest (but at least one) remaining examples,
        # breaking ties randomly
        num_labels = labels[labels_not_processed_mask].sum(axis=0)

        # Handle case where only all-zero labels are left by distributing
        # across all folds as evenly as possible (not in original algorithm but
        # mentioned in the text). (By handling this case separately, some
        # code redundancy is introduced; however, this approach allows for
        # decreased execution time when there are a relatively large number
        # of all-zero labels.)
        if num_labels.sum() == 0:
            sample_idxs = np.where(labels_not_processed_mask)[0]

            for sample_idx in sample_idxs:
                # Fold(s) that still want the most examples overall
                fold_idx = np.where(c_folds == c_folds.max())[0]

                if fold_idx.shape[0] > 1:
                    fold_idx = fold_idx[random_state.choice(fold_idx.shape[0])]

                # NOTE(review): when there is a single candidate, fold_idx is
                # still a length-1 array here and the assignment relies on
                # NumPy converting it to a scalar; newer NumPy releases appear
                # to deprecate that conversion -- confirm before upgrading.
                test_folds[sample_idx] = fold_idx
                c_folds[fold_idx] -= 1

            # Every remaining sample has been placed, so the outer loop is done
            break

        # Rarest label that still has at least one unassigned positive example
        label_idx = np.where(num_labels == num_labels[np.nonzero(num_labels)].min())[0]
        if label_idx.shape[0] > 1:
            label_idx = label_idx[random_state.choice(label_idx.shape[0])]

        # Unassigned samples that are positive for the chosen label
        sample_idxs = np.where(np.logical_and(labels[:, label_idx].flatten(), labels_not_processed_mask))[0]

        for sample_idx in sample_idxs:
            # Find the subset(s) with the largest number of desired examples
            # for this label, breaking ties by considering the largest number
            # of desired examples, breaking further ties randomly
            label_folds = c_folds_labels[:, label_idx]
            fold_idx = np.where(label_folds == label_folds.max())[0]

            if fold_idx.shape[0] > 1:
                temp_fold_idx = np.where(c_folds[fold_idx] ==
                                         c_folds[fold_idx].max())[0]
                fold_idx = fold_idx[temp_fold_idx]

                if temp_fold_idx.shape[0] > 1:
                    fold_idx = fold_idx[random_state.choice(temp_fold_idx.shape[0])]

            test_folds[sample_idx] = fold_idx
            labels_not_processed_mask[sample_idx] = False

            # Update desired number of examples
            # (labels[sample_idx] selects the label columns of this sample)
            c_folds_labels[fold_idx, labels[sample_idx]] -= 1
            c_folds[fold_idx] -= 1

    return test_folds


class MultilabelStratifiedKFold(_BaseKFold):
    """Multilabel stratified K-Folds cross-validator
    Provides train/test indices to split multilabel data into train/test sets.
    This cross-validation object is a variation of KFold that returns
    stratified folds for multilabel data. The folds are made by preserving
    the percentage of samples for each label.
    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.
    shuffle : boolean, optional
        Whether to shuffle each stratification of the data before splitting
        into batches.
    random_state : int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Unlike StratifiedKFold that only uses random_state
        when ``shuffle`` == True, this multilabel implementation
        always uses the random_state since the iterative stratification
        algorithm breaks ties randomly.
    Examples
    --------
    >>> from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0)
    >>> mskf.get_n_splits(X, y)
    2
    >>> print(mskf)  # doctest: +NORMALIZE_WHITESPACE
    MultilabelStratifiedKFold(n_splits=2, random_state=0, shuffle=False)
    >>> for train_index, test_index in mskf.split(X, y):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [0 3 4 6] TEST: [1 2 5 7]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    Notes
    -----
    Train and test sizes may be slightly different in each fold.
    See also
    --------
    RepeatedMultilabelStratifiedKFold: Repeats Multilabel Stratified K-Fold
    n times.
    """

    def __init__(self, n_splits=3, shuffle=False, random_state=None):
        # ``shuffle`` and ``random_state`` are keyword-only in recent
        # scikit-learn releases, so forward them by name; older releases
        # accept the same keywords.
        super(MultilabelStratifiedKFold, self).__init__(
            n_splits, shuffle=shuffle, random_state=random_state)

    def _make_test_folds(self, X, y):
        """Return, for every sample, the index of the fold it is tested in."""
        y = np.asarray(y, dtype=bool)
        type_of_target_y = type_of_target(y)

        if type_of_target_y != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(type_of_target_y))

        num_samples = y.shape[0]

        rng = check_random_state(self.random_state)
        indices = np.arange(num_samples)

        if self.shuffle:
            rng.shuffle(indices)
            y = y[indices]

        # Every fold is asked to hold an equal share of the samples.
        r = np.asarray([1 / self.n_splits] * self.n_splits)

        test_folds = IterativeStratification(labels=y, r=r, random_state=rng)

        # Undo the shuffle so the fold ids line up with the caller's row order.
        return test_folds[np.argsort(indices)]

    def _iter_test_masks(self, X=None, y=None, groups=None):
        """Yield one boolean test mask per fold."""
        test_folds = self._make_test_folds(X, y)
        for i in range(self.n_splits):
            yield test_folds == i

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.
        y : array-like, shape (n_samples, n_labels)
            The target variable for supervised learning problems.
            Multilabel stratification is done based on the y labels.
        groups : object
            Always ignored, exists for compatibility.
        Returns
        -------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(MultilabelStratifiedKFold, self).split(X, y, groups)


class RepeatedMultilabelStratifiedKFold(_RepeatedSplits):
    """Repeated Multilabel Stratified K-Fold cross validator.
    Repeats Multilabel Stratified K-Fold n times with different randomization
    in each repetition.
    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.
    n_repeats : int, default=10
        Number of times cross-validator needs to be repeated.
    random_state : None, int or RandomState, default=None
        Random state to be used to generate random state for each
        repetition as well as randomly breaking ties within the iterative
        stratification algorithm.
    Examples
    --------
    >>> from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> rmskf = RepeatedMultilabelStratifiedKFold(n_splits=2, n_repeats=2,
    ...     random_state=0)
    >>> for train_index, test_index in rmskf.split(X, y):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    ...
    TRAIN: [0 3 4 6] TEST: [1 2 5 7]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    TRAIN: [0 1 4 5] TEST: [2 3 6 7]
    TRAIN: [2 3 6 7] TEST: [0 1 4 5]
    See also
    --------
    RepeatedStratifiedKFold: Repeats (Non-multilabel) Stratified K-Fold
    n times.
    """
    def __init__(self, n_splits=5, n_repeats=10, random_state=None):
        # ``n_repeats`` and ``random_state`` are keyword-only in recent
        # scikit-learn releases, so forward them by name; older releases
        # accept the same keywords. ``n_splits`` is passed through to the
        # wrapped MultilabelStratifiedKFold.
        super(RepeatedMultilabelStratifiedKFold, self).__init__(
            MultilabelStratifiedKFold, n_repeats=n_repeats,
            random_state=random_state, n_splits=n_splits)


class MultilabelStratifiedShuffleSplit(BaseShuffleSplit):
    """Multilabel Stratified ShuffleSplit cross-validator
    Provides train/test indices to split data into train/test sets.
    This cross-validation object is a merge of MultilabelStratifiedKFold and
    ShuffleSplit, which returns stratified randomized folds for multilabel
    data. The folds are made by preserving the percentage of each label.
    Note: like the ShuffleSplit strategy, multilabel stratified random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.
    Parameters
    ----------
    n_splits : int, default 10
        Number of re-shuffling & splitting iterations.
    test_size : float, int, None, optional
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. By default, the value is set to 0.1.
        The default will change in version 0.21. It will remain 0.1 only
        if ``train_size`` is unspecified, otherwise it will complement
        the specified ``train_size``.
    train_size : float, int, or None, default is None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Unlike StratifiedShuffleSplit that only uses
        random_state when ``shuffle`` == True, this multilabel implementation
        always uses the random_state since the iterative stratification
        algorithm breaks ties randomly.
    Examples
    --------
    >>> from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> msss = MultilabelStratifiedShuffleSplit(n_splits=3, test_size=0.5,
    ...    random_state=0)
    >>> msss.get_n_splits(X, y)
    3
    >>> print(msss)       # doctest: +ELLIPSIS
    MultilabelStratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5,
                                     train_size=None)
    >>> for train_index, test_index in msss.split(X, y):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    TRAIN: [2 3 6 7] TEST: [0 1 4 5]
    TRAIN: [1 2 5 6] TEST: [0 3 4 7]
    Notes
    -----
    Train and test sizes may be slightly different from desired due to the
    preference of stratification over perfectly sized folds.
    """

    def __init__(self, n_splits=10, test_size="default", train_size=None,
                 random_state=None):
        # ``test_size``, ``train_size`` and ``random_state`` are keyword-only
        # in recent scikit-learn releases, so forward everything by name;
        # older releases accept the same keywords.
        super(MultilabelStratifiedShuffleSplit, self).__init__(
            n_splits=n_splits, test_size=test_size, train_size=train_size,
            random_state=random_state)

    def _iter_indices(self, X, y, groups=None):
        """Yield ``n_splits`` pairs of (train indices, test indices)."""
        n_samples = _num_samples(X)
        y = check_array(y, ensure_2d=False, dtype=None)
        y = np.asarray(y, dtype=bool)
        type_of_target_y = type_of_target(y)

        if type_of_target_y != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(
                    type_of_target_y))

        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                                  self.train_size)

        n_samples = y.shape[0]
        rng = check_random_state(self.random_state)
        y_orig = y.copy()

        # Two "folds": index 0 receives the train share, index 1 the test share.
        r = np.array([n_train, n_test]) / (n_train + n_test)

        for _ in range(self.n_splits):
            indices = np.arange(n_samples)
            rng.shuffle(indices)
            y = y_orig[indices]

            test_folds = IterativeStratification(labels=y, r=r, random_state=rng)

            # Undo the shuffle, then pick the samples assigned to fold 1.
            test_idx = test_folds[np.argsort(indices)] == 1
            test = np.where(test_idx)[0]
            train = np.where(~test_idx)[0]

            yield train, test

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.
        y : array-like, shape (n_samples, n_labels)
            The target variable for supervised learning problems.
            Multilabel stratification is done based on the y labels.
        groups : object
            Always ignored, exists for compatibility.
        Returns
        -------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(MultilabelStratifiedShuffleSplit, self).split(X, y, groups)

In [None]:
# Load the train and test feature tables of the lish-moa dataset.
train_x = pd.read_csv('../input/lish-moa/train_features.csv')
test_x = pd.read_csv('../input/lish-moa/test_features.csv')

In [None]:
# Load the scored training targets; later cells drop its 'sig_id' column before use.
train_y = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

In [None]:
data = pd.concat([train_x, test_x], axis=0) # stack the train and test feature frames row-wise
data = data.reset_index() # rebuild a fresh 0..N-1 index; the old index is kept as an 'index' column

In [None]:
# Names of the columns whose label starts with 'c-' and with 'g-', in frame order.
c_cols = [name for name in data.columns if name.startswith('c-')]
g_cols = [name for name in data.columns if name.startswith('g-')]

In [None]:
# Probabilities are clipped into [P_MIN, P_MAX] before the log loss is taken.
somthing_rate = 1e-15
P_MIN = somthing_rate
P_MAX = 1 - P_MIN


def loss_fn(yt, yp):
    """Return the log loss of predictions ``yp`` against targets ``yt``.

    ``yp`` is first clipped to ``[P_MIN, P_MAX]``; the label set is fixed
    to ``[0, 1]``.
    """
    clipped = np.clip(yp, a_min=P_MIN, a_max=P_MAX)
    return log_loss(yt, clipped, labels=[0, 1])

In [None]:
# New frame built from `data` without the 'sig_id' and 'index' columns
# (`drop` returns a new object, so `data` itself is left untouched).
train = data.drop(columns=['sig_id', 'index'])

상관관계(correlation)는 PCA를 통해 이미 줄여 주었다. 참고: https://m.blog.naver.com/PostView.nhn?blogId=pmw9440&logNo=221478954228&proxyReferer=https:%2F%2Fwww.google.com%2F
이 링크에 따르면 상관관계가 높은 변수들은 PCA로 축약하거나 제거하는 것이 좋다고 한다.
그렇다면 g-37과 g-50만 남기고 나머지는 제거하는 것이 좋지 않을까?

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encoder instances reused by the following cells.
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder()

In [None]:
cp_dose_num = pd.DataFrame(label_encoder.fit_transform(train['cp_dose']), columns=['cp_dose_num']) # label-encode cp_dose
cp_type_num = pd.DataFrame(label_encoder.fit_transform(train['cp_type']), columns=['cp_type_num']) # label-encode cp_type (re-fits the same encoder)

In [None]:
train = pd.concat([cp_dose_num, cp_type_num, train], axis=1) # prepend the encoded cp_dose_num and cp_type_num columns to train
train = train.drop(['cp_type', 'cp_dose'], axis=1) # drop the original 'cp_type' and 'cp_dose' columns
train.head()

In [None]:
# One-hot encode cp_time (the encoder expects a 2-D column, hence the reshape).
cp_time_onehot = pd.DataFrame(onehot_encoder.fit_transform(train['cp_time'].to_numpy().reshape(-1, 1)).toarray())

# Prefix the one-hot column names with 'cp_time_onehot_'.
cp_time_onehot = cp_time_onehot.add_prefix('cp_time_onehot_')

In [None]:
# Prepend the one-hot cp_time columns and drop the original cp_time column.
train = pd.concat([cp_time_onehot, train], axis=1)
train = train.drop(['cp_time'], axis=1)

# Gauss Rank Scaler
PCA 이전에 scaler를 먼저 적용해 보자. 구현 출처: https://github.com/aldente0630/gauss_rank_scaler/blob/master/gauss_rank_scaler.py

In [None]:
import numpy as np
from joblib import Parallel, delayed
from scipy.interpolate import interp1d
from scipy.special import erf, erfinv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted


class GaussRankScaler(BaseEstimator, TransformerMixin):
    """Transform features by scaling each feature to a normal distribution.

    Parameters
    ----------
    epsilon : float, optional, default 1e-4
        A small amount added to the lower bound or subtracted
        from the upper bound. This value prevents infinite number
        from occurring when applying the inverse error function.
    copy : boolean, optional, default True
        If False, try to avoid a copy and do inplace scaling instead.
        This is not guaranteed to always work inplace; e.g. if the data is
        not a NumPy array, a copy may still be returned.
    n_jobs : int or None, optional, default None
        Number of jobs to run in parallel.
        ``None`` means 1 and ``-1`` means using all processors.
    interp_kind : str or int, optional, default 'linear'
        Specifies the kind of interpolation as a string
        ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
        'previous', 'next', where 'zero', 'slinear', 'quadratic' and 'cubic'
        refer to a spline interpolation of zeroth, first, second or third
        order; 'previous' and 'next' simply return the previous or next value
        of the point) or as an integer specifying the order of the spline
        interpolator to use.
    interp_copy : bool, optional, default False
        If True, the interpolation function makes internal copies of x and y.
        If False, references to `x` and `y` are used.

    Attributes
    ----------
    interp_func_ : list
        The interpolation function for each feature in the training set.
    """

    def __init__(self, epsilon=1e-4, copy=True, n_jobs=None, interp_kind='linear', interp_copy=False):
        self.epsilon = epsilon
        self.copy = copy
        self.interp_kind = interp_kind
        self.interp_copy = interp_copy
        self.fill_value = 'extrapolate'
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit interpolation function to link rank with original data for future scaling
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to fit interpolation function for later scaling along the features axis.
        y
            Ignored
        """
        # Rejecting NaN/inf is check_array's default, so the finiteness keyword
        # (renamed in newer scikit-learn releases) is deliberately not passed.
        X = check_array(X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES)

        self.interp_func_ = Parallel(n_jobs=self.n_jobs)(delayed(self._fit)(x) for x in X.T)
        return self

    def _fit(self, x):
        """Build the value -> scaled-rank interpolator for one feature column."""
        x = self.drop_duplicates(x)
        rank = np.argsort(np.argsort(x))
        bound = 1.0 - self.epsilon
        factor = np.max(rank) / 2.0 * bound
        # Map ranks onto [-bound, bound], i.e. strictly inside erfinv's domain.
        scaled_rank = np.clip(rank / factor - bound, -bound, bound)
        return interp1d(
            x, scaled_rank, kind=self.interp_kind, copy=self.interp_copy, fill_value=self.fill_value)

    def transform(self, X, copy=None):
        """Scale the data with the Gauss Rank algorithm
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _transform(self, i, x):
        """Apply the fitted interpolator of feature ``i`` followed by erfinv."""
        # Values outside the fitted range are extrapolated and can leave
        # (-1, 1), where erfinv yields NaN/inf; clip back to the same bound
        # that is used at fit time.
        bound = 1.0 - self.epsilon
        return erfinv(np.clip(self.interp_func_[i](x), -bound, bound))

    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._inverse_transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _inverse_transform(self, i, x):
        """Map Gaussian-scaled values of feature ``i`` back to the original scale."""
        # Swap the fitted interpolator's axes to go from scaled rank to value.
        inv_interp_func = interp1d(self.interp_func_[i].y, self.interp_func_[i].x, kind=self.interp_kind,
                                   copy=self.interp_copy, fill_value=self.fill_value)
        return inv_interp_func(erf(x))

    @staticmethod
    def drop_duplicates(x):
        """Return ``x`` with repeated values removed, keeping first occurrences in order."""
        is_unique = np.zeros_like(x, dtype=bool)
        is_unique[np.unique(x, return_index=True)[1]] = True
        return x[is_unique]

In [None]:
# Gauss-rank scale every column from position 5 onward and re-attach the
# first five columns unchanged.
# NOTE(review): assumes the first five columns are the encoded categorical
# columns (cp_time one-hots, cp_dose_num, cp_type_num) -- confirm.
gauss_scaler = GaussRankScaler()
train_gauss = gauss_scaler.fit_transform(train.iloc[:,5:])
train_gauss = pd.DataFrame(train_gauss, columns=train.columns[5:]) # shape noted by the author: (27796, 974)
train = pd.concat([train.iloc[:,:5], train_gauss], axis=1) # shape noted by the author: (27796, 979)

# PCA
* gene columns : 772
* cell columns : 100

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca_g = PCA(n_components=100) # 100 principal components for the g- columns
pca_c = PCA(n_components=2) # 2 principal components for the c- columns

In [None]:
train_pca_g = pca_g.fit_transform(train[g_cols]) # fit and apply the PCA on the g_cols columns
train_pca_c = pca_c.fit_transform(train[c_cols]) # fit and apply the PCA on the c_cols columns

In [None]:
# Append the PCA components as extra columns named 'pca_g_<i>' and 'pca_c_<i>'.
train = pd.concat([train, pd.DataFrame(train_pca_g).add_prefix('pca_g_'), 
                  pd.DataFrame(train_pca_c).add_prefix('pca_c_')], axis=1)

VarianceThreshold 특성제거는 별로

In [None]:
from sklearn.cluster import KMeans #클러스터화 하여서 그 features를 추가해보기

def fe_cluster(train, test, n_clusters_g = 22, n_clusters_c = 4, SEED = 42):
    """Append one-hot K-Means cluster memberships for the gene and cell features.

    One K-Means model is fitted on the ``g_cols`` columns and another on the
    ``c_cols`` columns of ``train`` and ``test`` stacked together. The
    cluster id of every row is then one-hot encoded into ``clusters_g_<k>``
    and ``clusters_c_<k>`` columns.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing at least the ``g_cols`` and ``c_cols`` columns.
        They are not modified; new frames are returned.
    n_clusters_g, n_clusters_c : int
        Number of clusters for the gene and the cell features.
    SEED : int
        ``random_state`` handed to ``KMeans``.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Copies of the inputs with the one-hot cluster columns appended. Both
        frames always get one column per cluster id, even if a cluster has
        no member in one of them.
    """

    features_g = g_cols
    features_c = c_cols

    def create_cluster(train, test, features, kind = 'g', n_clusters = n_clusters_g):
        # Copy first: the callers' frames may be slices of a larger frame and
        # must not be written to.
        train = train.copy()
        test = test.copy()
        data = pd.concat([train[features], test[features]], axis = 0)
        kmeans = KMeans(n_clusters = n_clusters, random_state = SEED).fit(data)

        n_train = train.shape[0]
        column = f'clusters_{kind}'
        # A categorical with the full list of cluster ids makes get_dummies
        # emit the same columns for both frames, so stacking them again later
        # cannot introduce NaN-filled columns.
        cluster_ids = list(range(n_clusters))
        train[column] = pd.Categorical(kmeans.labels_[:n_train], categories = cluster_ids)
        test[column] = pd.Categorical(kmeans.labels_[n_train:], categories = cluster_ids)
        train = pd.get_dummies(train, columns = [column])
        test = pd.get_dummies(test, columns = [column])
        return train, test

    train, test = create_cluster(train, test, features_g, kind = 'g', n_clusters = n_clusters_g)
    train, test = create_cluster(train, test, features_c, kind = 'c', n_clusters = n_clusters_c)
    return train, test

# Split the combined frame back into its first len(train_x) rows and the
# remaining rows (columns from position 5 on), then add the cluster one-hots.
train_Kmeans = train.iloc[:len(train_x),5:] 
test_Kmeans = train.iloc[len(train_x):,5:] 
train_Kmeans, test_Kmeans = fe_cluster(train_Kmeans, test_Kmeans )

In [None]:
# Re-stack the two clustered parts row-wise and put the first five columns back in front.
train = pd.concat([train.iloc[:, :5], pd.concat([train_Kmeans, test_Kmeans], axis = 0)], axis = 1)
train # shape noted by the author: (27796, 1005)

In [None]:
# Row-wise mean of the g_cols and of the c_cols features, prepended as two
# summary columns. The keys follow the order of the concatenated series so
# that each column is labelled with the feature group it was computed from.
means = pd.concat([train[g_cols].mean(axis=1), train[c_cols].mean(axis=1)],
                   keys=['g_mean', 'c_mean'], axis=1)

train = pd.concat([means,train], axis=1) # shape noted by the author: (27796, 1007)

# Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
selection_train_y = train_y.drop(['sig_id'], axis=1)
n_labelled_rows = len(train_x)

# One row of ANOVA F-scores per target label, computed on the labelled rows only.
selected_features = np.array([
    list(
        SelectKBest(f_classif, k = 'all')
        .fit(train.iloc[:n_labelled_rows], selection_train_y[label])
        .scores_
    )
    for label in selection_train_y.columns
])

# Average each feature's score over all labels. The averages differ a lot
# between features, so their median is used as the cut-off: a feature is kept
# only when its mean score is above that median.
mean_scores = np.mean(selected_features, axis=0)
select_standard = np.median(mean_scores)
selected_features = mean_scores > select_standard
train = train.iloc[:, selected_features]
train

# NN models

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows (the first len(train_x) rows of train) for validation.
X_train, X_val, y_train, y_val = train_test_split(train.iloc[:len(train_x)], 
                                                  train_y.drop(['sig_id'], axis=1), 
                                                  test_size=0.2, random_state=224)

In [None]:
def build_model(hidden_layers, neurons, dropout_rate, input_shape, n_outputs=206):
    """Build and compile a feed-forward multi-label classifier.

    Every hidden block is BatchNormalization -> Dropout -> weight-normalised
    Dense with swish activation; the Dense width is halved from one block to
    the next. The head is BatchNormalization followed by a weight-normalised
    sigmoid Dense layer.

    Parameters
    ----------
    hidden_layers : int
        Number of hidden blocks.
    neurons : int
        Width of the first hidden Dense layer; block ``i`` uses
        ``neurons // 2**i`` units.
    dropout_rate : float
        Dropout rate applied in every hidden block.
    input_shape : int or tuple
        Shape of one input sample, passed straight to ``keras.layers.Input``.
    n_outputs : int, default 206
        Number of sigmoid output units.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with AdamW and label-smoothed binary cross-entropy.
    """
    model = tf.keras.Sequential([tf.keras.layers.Input(input_shape)])

    for i in range(hidden_layers):
        model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.Dropout(dropout_rate))
        # Floor division: each successive hidden layer is half as wide.
        model.add(tfa.layers.WeightNormalization(tf.keras.layers.Dense(neurons // 2**i, activation='swish')))

    #============ Final Layer =================
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tfa.layers.WeightNormalization(tf.keras.layers.Dense(n_outputs, activation="sigmoid")))

    # `learning_rate` is the documented argument name; `lr` is only a deprecated alias.
    model.compile(optimizer=tfa.optimizers.AdamW(learning_rate = 1e-3, weight_decay = 1e-5, clipvalue = 756),
                  loss=BinaryCrossentropy(label_smoothing=0.001))

    return model

In [None]:
# Two hidden blocks, first Dense width 412; input width taken from X_train.
layer2_model = build_model(2, 412, 0.5012546298076606, X_train.shape[1]) #2 layer model

In [None]:
# Three hidden blocks, first Dense width 824; input width taken from X_train.
layer3_model = build_model(3, 824, 0.5012546298076606, X_train.shape[1] )

In [None]:
# Four hidden blocks, first Dense width 1600; input width taken from X_train.
layer4_model = build_model(4, 1600, 0.5012546298076606, X_train.shape[1] )

In [None]:
# Callbacks shared by every fit call in this notebook; all of them monitor val_loss.
# Multiply the learning rate by 0.2 after 4 epochs without improvement (floor 1e-6).
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.2, min_lr=1e-6, patience=4, verbose=1, mode='auto')
# Stop after 10 epochs without improvement and restore the best weights.
early = EarlyStopping(monitor="val_loss", mode="min", restore_best_weights=True, patience= 10, verbose = 1)

# Best weights are written to one fixed path, so every model that is fitted
# with this callback writes to the same file.
checkpoint_path = 'model.weights'
cb_checkpt = ModelCheckpoint(checkpoint_path, monitor = 'val_loss', verbose = 1, save_best_only = True, 
                             save_weights_only=True, mode = 'auto')

In [None]:
def fit_model(model):
    """Train ``model`` in place on the notebook's hold-out split.

    Uses the global ``X_train``/``y_train`` for fitting, ``X_val``/``y_val``
    for validation and the shared callbacks. Returns ``None``.
    """
    shared_callbacks = [early, reduce_lr_loss, cb_checkpt]
    model.fit(
        X_train,
        y_train,
        validation_data = (X_val, y_val),
        epochs = 100,
        batch_size = 64,
        callbacks = shared_callbacks,
    )

In [None]:
# Train the 2-block model on the hold-out split.
fit_model(layer2_model)

In [None]:
# Train the 3-block model on the hold-out split.
fit_model(layer3_model)

In [None]:
# Train the 4-block model on the hold-out split.
fit_model(layer4_model)

1. tfa.Lookahead 사용 --> val_loss 0.0185 prediction 0.1934 실패 원래대로하자
2. Kmeans 사용 --> loss 0.0167 val_loss 0.0184, prediction 
3. Gauss Scaler까지 사용 --> loss 0.016 val_loss 0.0181 최소

In [None]:
# Rows after the first len(train_x) ones, i.e. the rows that came from test_x.
test = train.iloc[len(train_x):]

# Ensemble

blending - https://3months.tistory.com/486 --> blending은 0.20 정도로 성능이 낮아졌다

In [None]:
###blending###
# Predictions of each base model on the validation split and on the test rows.

val_pred1 = layer2_model.predict(X_val)
test_pred1 = layer2_model.predict(test)

val_pred2 = layer3_model.predict(X_val)
test_pred2 = layer3_model.predict(test)


val_pred3 = layer4_model.predict(X_val)
test_pred3 = layer4_model.predict(test)

In [None]:
# Blender inputs: the original features with the three models' predictions appended column-wise.
blend_val = np.concatenate((X_val.to_numpy(), val_pred1, val_pred2, val_pred3), axis = 1)
blend_test = np.concatenate((test.to_numpy(), test_pred1, test_pred2, test_pred3), axis = 1)

In [None]:
# Blender network sized to the widened blend_val input.
final_model = build_model(3, 824, 0.5012546298076606, blend_val.shape[1] )

In [None]:
# Fit the blender on the validation-split features, holding out 10% of them for validation.
final_model.fit(blend_val, y_val, epochs = 100, validation_split=0.1, callbacks = [early, reduce_lr_loss, cb_checkpt])

cv(Kfold) 기반 stacking ensemble - https://lsjsj92.tistory.com/559?category=853217

In [None]:
# A separate train_test_split is pointless here: stacking uses all labelled rows.
pre_stack_X_train = train.iloc[:len(train_x)]
pre_stack_y_train = train_y.drop(['sig_id'], axis=1)
pre_stack_test = train.iloc[len(train_x):]

In [None]:
#https://github.com/trent-b/iterative-stratification
def get_stacking_data(model, X_train, y_train, X_test, n_folds=3):
    """Produce K-fold predictions of ``model`` to use as stacking features.

    The rows of ``X_train`` are split with ``MultilabelStratifiedKFold``.
    For every fold the model is trained on the training part and used to
    predict the held-out part and ``X_test``.

    Before each fold the model's weights and learning rate are reset to the
    values they had when this function was called. Without the reset the
    model would keep training across folds, so later folds would predict
    rows they had already been trained on, and a learning rate lowered by
    ``ReduceLROnPlateau`` would carry over. For genuinely out-of-fold
    predictions, pass a freshly built (untrained) model.

    Parameters
    ----------
    model : compiled Keras model
        Trained in place; its state after the call is that of the last fold.
    X_train, y_train : pandas.DataFrame
        Features and multi-label targets, indexed positionally with ``iloc``.
    X_test : array-like
        Rows to predict with every fold's model.
    n_folds : int, default 3
        Number of folds.

    Returns
    -------
    train_fold_predict : ndarray, shape (len(X_train), n_labels)
        Held-out prediction for every training row.
    test_predict_mean : ndarray, shape (len(X_test), n_labels)
        Test predictions averaged over the folds.
    """
    stk = MultilabelStratifiedKFold(n_splits=n_folds)
    train_fold_predict = np.zeros((X_train.shape[0], y_train.shape[1]))
    test_predict = np.zeros((X_test.shape[0], y_train.shape[1], n_folds))
    print("model : ", model.__class__.__name__)

    # Snapshot of the incoming state, restored at the start of every fold.
    # NOTE(review): optimizer slot variables (e.g. Adam moments) are not
    # reset here -- rebuild the model per fold if that matters.
    initial_weights = model.get_weights()
    initial_lr = tf.keras.backend.get_value(model.optimizer.learning_rate)

    for cnt, (train_index, valid_index) in enumerate(stk.split(X_train, y_train)):
        model.set_weights(initial_weights)
        tf.keras.backend.set_value(model.optimizer.learning_rate, initial_lr)

        X_train_ = X_train.iloc[train_index]
        y_train_ = y_train.iloc[train_index]
        X_validation = X_train.iloc[valid_index]

        model.fit(X_train_, y_train_, batch_size = 64,
                    epochs = 100,
                    validation_split = 0.2,
                    callbacks = [early, reduce_lr_loss, cb_checkpt])

        train_fold_predict[valid_index, :] = model.predict(X_validation)
        test_predict[:,:,cnt] = model.predict(X_test)

    # Average the per-fold test predictions over the fold axis.
    test_predict_mean = np.mean(test_predict, axis=2)

    return train_fold_predict, test_predict_mean
        

In [None]:
# K-fold stacking features (training-row and test-row predictions) from each base model.
stack_pred1, stack_test1 = get_stacking_data(layer2_model, pre_stack_X_train, pre_stack_y_train, pre_stack_test)
stack_pred2, stack_test2 = get_stacking_data(layer3_model, pre_stack_X_train, pre_stack_y_train, pre_stack_test)
stack_pred3, stack_test3 = get_stacking_data(layer4_model, pre_stack_X_train, pre_stack_y_train, pre_stack_test)

In [None]:
stack_train = np.concatenate((stack_pred1, stack_pred2,stack_pred3), axis = 1) # training input of the final model (author's note: (19051, 618))
stack_test = np.concatenate((stack_test1, stack_test2,stack_test3), axis=1) # test input of the final model, predicted for the submission

In [None]:
# Meta-model over the stacked predictions; rebinds final_model from the blending section.
final_model = build_model(2, 306, 0.5012546298076606, stack_train.shape[1])

In [None]:
# Fit the meta-model on the stacked predictions, holding out 20% of them for validation.
final_model.fit(stack_train, pre_stack_y_train, epochs=100, validation_split = 0.2,
                    callbacks = [early, reduce_lr_loss, cb_checkpt])

# Submission

In [None]:
# Final predictions for the test rows.
pred = final_model.predict(stack_test)

In [None]:
# Name the prediction columns after the target columns (all but the first
# column of train_y) and put the test 'sig_id' column in front.
pred_df = pd.DataFrame(pred, columns=train_y.columns[1:])
submmission_df = pd.concat([test_x['sig_id'], pred_df], axis=1)

In [None]:
submmission_df.to_csv('submission.csv', index=False) # export the submission as a CSV file without the index

In [None]:
# Read the written file back to display it as a sanity check.
pd.read_csv('submission.csv')