In [None]:
"""This file includes multilabel cross validators based on an implementation of
the Iterative Stratification algorithm described in the following paper:
Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-
Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M. (eds)
Machine Learning and Knowledge Discovery in Databases. ECML PKDD 2011. Lecture
Notes in Computer Science, vol 6913. Springer, Berlin, Heidelberg.

From scikit-learn 0.19.0, StratifiedKFold, RepeatedStratifiedKFold, and
StratifiedShuffleSplit were copied and modified, retaining compatibility
with scikit-learn.

Attribution to authors of scikit-learn/model_selection/_split.py under BSD 3 clause:
    Alexandre Gramfort <alexandre.gramfort@inria.fr>,
    Gael Varoquaux <gael.varoquaux@normalesup.org>,
    Olivier Grisel <olivier.grisel@ensta.org>,
    Raghav RV <rvraghav93@gmail.com>
"""

# Author: Trent J. Bradberry <trentjason@hotmail.com>
# License: BSD 3 clause

import numpy as np

from sklearn.utils import check_random_state
from sklearn.utils.validation import _num_samples, check_array
from sklearn.utils.multiclass import type_of_target

from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, \
    BaseShuffleSplit, _validate_shuffle_split


def _break_ties_randomly(candidates, random_state):
    """Return one entry of ``candidates`` as a Python int.

    A random draw is made only when there is more than one candidate, so the
    random number stream is consumed exactly when a tie has to be broken.
    """
    if candidates.shape[0] > 1:
        return int(candidates[random_state.choice(candidates.shape[0])])
    return int(candidates[0])


def IterativeStratification(labels, r, random_state):
    """Assign every sample to a fold with the Iterative Stratification
    algorithm described in the following paper:
    Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of
    Multi-Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M.
    (eds) Machine Learning and Knowledge Discovery in Databases. ECML PKDD
    2011. Lecture Notes in Computer Science, vol 6913. Springer, Berlin,
    Heidelberg.

    Parameters
    ----------
    labels : array-like, shape (n_samples, n_labels)
        Label indicator matrix. It is converted to a boolean array, so
        integer 0/1 matrices and nested lists are accepted as well.
    r : array-like, shape (n_folds,)
        Desired proportion of the samples in each fold.
    random_state : numpy.random.RandomState
        Generator used to break ties; only its ``choice`` method is called.

    Returns
    -------
    test_folds : ndarray of int, shape (n_samples,)
        Index of the fold each sample was assigned to.
    """
    # The per-sample update below relies on boolean-mask indexing; with an
    # integer 0/1 matrix the same expression would index columns 0 and 1.
    labels = np.asarray(labels, dtype=bool)
    r = np.asarray(r, dtype=float)

    n_samples = labels.shape[0]
    test_folds = np.zeros(n_samples, dtype=int)

    # Calculate the desired number of examples at each subset
    c_folds = r * n_samples

    # Calculate the desired number of examples of each label at each subset
    c_folds_labels = np.outer(r, labels.sum(axis=0))

    labels_not_processed_mask = np.ones(n_samples, dtype=bool)

    while np.any(labels_not_processed_mask):
        # Find the label with the fewest (but at least one) remaining examples,
        # breaking ties randomly
        num_labels = labels[labels_not_processed_mask].sum(axis=0)

        # Handle case where only all-zero labels are left by distributing
        # across all folds as evenly as possible (not in original algorithm but
        # mentioned in the text). (By handling this case separately, some
        # code redundancy is introduced; however, this approach allows for
        # decreased execution time when there are a relatively large number
        # of all-zero labels.)
        if num_labels.sum() == 0:
            for sample_idx in np.where(labels_not_processed_mask)[0]:
                fold_idx = _break_ties_randomly(
                    np.where(c_folds == c_folds.max())[0], random_state)

                test_folds[sample_idx] = fold_idx
                c_folds[fold_idx] -= 1

            break

        rarest_count = num_labels[np.nonzero(num_labels)].min()
        label_idx = _break_ties_randomly(
            np.where(num_labels == rarest_count)[0], random_state)

        sample_idxs = np.where(
            np.logical_and(labels[:, label_idx], labels_not_processed_mask))[0]

        for sample_idx in sample_idxs:
            # Find the subset(s) with the largest number of desired examples
            # for this label, breaking ties by considering the largest number
            # of desired examples, breaking further ties randomly
            label_folds = c_folds_labels[:, label_idx]
            candidates = np.where(label_folds == label_folds.max())[0]

            if candidates.shape[0] > 1:
                remaining = c_folds[candidates]
                candidates = candidates[remaining == remaining.max()]

            # A plain int is stored: assigning a one-element array to a single
            # array slot depends on array-to-scalar conversion, which NumPy
            # has deprecated.
            fold_idx = _break_ties_randomly(candidates, random_state)

            test_folds[sample_idx] = fold_idx
            labels_not_processed_mask[sample_idx] = False

            # Update desired number of examples
            c_folds_labels[fold_idx, labels[sample_idx]] -= 1
            c_folds[fold_idx] -= 1

    return test_folds


class MultilabelStratifiedKFold(_BaseKFold):
    """K-Folds cross-validator that stratifies multilabel targets.

    A KFold variant producing train/test indices for multilabel data. Folds
    are built with iterative stratification so that each label keeps roughly
    the same share of samples in every fold.

    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.
    shuffle : boolean, optional
        Whether to shuffle the samples before they are stratified into folds.
    random_state : int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Unlike StratifiedKFold, which only uses random_state
        when ``shuffle`` == True, this implementation always uses it because
        the iterative stratification algorithm breaks ties randomly.

    Examples
    --------
    >>> from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0)
    >>> mskf.get_n_splits(X, y)
    2
    >>> print(mskf)  # doctest: +NORMALIZE_WHITESPACE
    MultilabelStratifiedKFold(n_splits=2, random_state=0, shuffle=False)
    >>> for train_index, test_index in mskf.split(X, y):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [0 3 4 6] TEST: [1 2 5 7]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]

    Notes
    -----
    Train and test sizes may be slightly different in each fold.

    See also
    --------
    RepeatedMultilabelStratifiedKFold: Repeats Multilabel Stratified K-Fold
    n times.
    """

    def __init__(self, n_splits=3, shuffle=False, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle,
                         random_state=random_state)

    def _make_test_folds(self, X, y):
        """Return, for every sample in its original order, its test fold."""
        y = np.asarray(y, dtype=bool)
        target_kind = type_of_target(y)
        if target_kind != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(target_kind))

        rng = check_random_state(self.random_state)
        order = np.arange(y.shape[0])
        if self.shuffle:
            rng.shuffle(order)
            y = y[order]

        # Every fold is asked to hold the same share of the samples.
        fold_ratios = np.full(self.n_splits, 1 / self.n_splits)
        shuffled_folds = IterativeStratification(
            labels=y, r=fold_ratios, random_state=rng)

        # argsort(order) is the inverse permutation: it undoes the shuffle.
        return shuffled_folds[np.argsort(order)]

    def _iter_test_masks(self, X=None, y=None, groups=None):
        """Yield one boolean test mask per fold."""
        fold_of_sample = self._make_test_folds(X, y)
        for fold in range(self.n_splits):
            yield fold_of_sample == fold

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Only ``y`` is needed to build the folds, so
            ``np.zeros(n_samples)`` may be passed as a placeholder for ``X``.
        y : array-like, shape (n_samples, n_labels)
            The target variable for supervised learning problems.
            Multilabel stratification is done based on the y labels.
        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super().split(X, y, groups)


class RepeatedMultilabelStratifiedKFold(_RepeatedSplits):
    """Repeated Multilabel Stratified K-Fold cross validator.

    Runs Multilabel Stratified K-Fold ``n_repeats`` times, using a different
    randomization for each repetition.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.
    n_repeats : int, default=10
        Number of times cross-validator needs to be repeated.
    random_state : None, int or RandomState, default=None
        Random state used to derive the random state of each repetition and
        to break ties randomly inside the iterative stratification algorithm.

    Examples
    --------
    >>> from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> rmskf = RepeatedMultilabelStratifiedKFold(n_splits=2, n_repeats=2,
    ...     random_state=0)
    >>> for train_index, test_index in rmskf.split(X, y):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    ...
    TRAIN: [0 3 4 6] TEST: [1 2 5 7]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    TRAIN: [0 1 4 5] TEST: [2 3 6 7]
    TRAIN: [2 3 6 7] TEST: [0 1 4 5]

    See also
    --------
    RepeatedStratifiedKFold: Repeats (Non-multilabel) Stratified K-Fold
    n times.
    """
    def __init__(self, n_splits=5, n_repeats=10, random_state=None):
        # The splitter class itself is handed over; n_splits is forwarded to
        # it as a keyword argument.
        super().__init__(
            MultilabelStratifiedKFold,
            n_repeats=n_repeats,
            random_state=random_state,
            n_splits=n_splits,
        )


class MultilabelStratifiedShuffleSplit(BaseShuffleSplit):
    """Multilabel Stratified ShuffleSplit cross-validator

    Produces train/test indices for randomized splits of multilabel data. It
    combines MultilabelStratifiedKFold with ShuffleSplit: every split is a
    fresh random two-way partition in which the percentage of each label is
    preserved.

    Note: like the ShuffleSplit strategy, multilabel stratified random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Parameters
    ----------
    n_splits : int, default 10
        Number of re-shuffling & splitting iterations.
    test_size : float, int, None, optional
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. By default, the value is set to 0.1.
        The default will change in version 0.21. It will remain 0.1 only
        if ``train_size`` is unspecified, otherwise it will complement
        the specified ``train_size``.
    train_size : float, int, or None, default is None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. It is always used, because the iterative
        stratification algorithm breaks ties randomly.

    Examples
    --------
    >>> from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> msss = MultilabelStratifiedShuffleSplit(n_splits=3, test_size=0.5,
    ...    random_state=0)
    >>> msss.get_n_splits(X, y)
    3
    >>> print(msss)       # doctest: +ELLIPSIS
    MultilabelStratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5,
                                     train_size=None)
    >>> for train_index, test_index in msss.split(X, y):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    TRAIN: [2 3 6 7] TEST: [0 1 4 5]
    TRAIN: [1 2 5 6] TEST: [0 3 4 7]

    Notes
    -----
    Train and test sizes may be slightly different from desired due to the
    preference of stratification over perfectly sized folds.
    """

    def __init__(self, n_splits=10, test_size="default", train_size=None,
                 random_state=None):
        super().__init__(n_splits=n_splits, test_size=test_size,
                         train_size=train_size, random_state=random_state)

    def _iter_indices(self, X, y, groups=None):
        """Yield ``(train, test)`` index arrays, one pair per split."""
        n_samples = _num_samples(X)
        y = np.asarray(check_array(y, ensure_2d=False, dtype=None), dtype=bool)
        target_kind = type_of_target(y)
        if target_kind != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(
                    target_kind))

        # The requested sizes are validated against the number of rows in X.
        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                                  self.train_size)

        n_samples = y.shape[0]
        rng = check_random_state(self.random_state)
        labels = y.copy()

        # Fold 0 receives the training share, fold 1 the test share.
        ratios = np.array([n_train, n_test]) / (n_train + n_test)

        for _ in range(self.n_splits):
            order = np.arange(n_samples)
            rng.shuffle(order)

            folds = IterativeStratification(
                labels=labels[order], r=ratios, random_state=rng)

            # Map fold assignments back to the original sample order.
            in_test = folds[np.argsort(order)] == 1
            yield np.where(~in_test)[0], np.where(in_test)[0]

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Only ``y`` is needed to build the folds, so
            ``np.zeros(n_samples)`` may be passed as a placeholder for ``X``.
        y : array-like, shape (n_samples, n_labels)
            The target variable for supervised learning problems.
            Multilabel stratification is done based on the y labels.
        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super().split(X, y, groups)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:

# Locations of the competition CSV files; all of them live in one directory.
_DATA_DIR = "/kaggle/input/lish-moa"

TEST_FEATURES_PATH = _DATA_DIR + "/test_features.csv"
TRAIN_FEATURES_PATH = _DATA_DIR + "/train_features.csv"
TRAIN_TARGETS_PATH = _DATA_DIR + "/train_targets_scored.csv"
TRAIN_TARGETS_NONSCORED_PATH = _DATA_DIR + "/train_targets_nonscored.csv"
SAMPLE_SUB_PATH = _DATA_DIR + "/sample_submission.csv"

In [None]:
def _read_sorted(path):
    """Read a CSV, order its rows by ``sig_id`` and renumber them from 0.

    The index is reset because later cells mix positional operations (NumPy
    arrays, ``iloc``) with label-aligned ones (column-wise ``pd.concat``);
    both only agree when the index equals the row position.
    """
    return pd.read_csv(path).sort_values(by='sig_id').reset_index(drop=True)


# Every frame is put in the same sig_id order so that rows line up across
# features, targets and the submission template.
test_features_df = _read_sorted(TEST_FEATURES_PATH)
train_features_df = _read_sorted(TRAIN_FEATURES_PATH)
train_targets_df = _read_sorted(TRAIN_TARGETS_PATH)
train_targets_nonscored_df = _read_sorted(TRAIN_TARGETS_NONSCORED_PATH)
sample_sub_df = _read_sorted(SAMPLE_SUB_PATH)

In [None]:
# Display the (rows, columns) shape of the training features.
train_features_df.shape

In [None]:
# Preview the first rows of the scored training targets.
train_targets_df.head()

In [None]:
# Show how often each value occurs in one example target column.
train_targets_df['5-alpha_reductase_inhibitor'].value_counts()

In [None]:
# Number of entries equal to 1 in every target column, sorted ascending.
# 'sig_id' is the row identifier, not a target, so it is left out; counting
# with a comparison also yields 0 (instead of a KeyError) for a column that
# contains no 1 at all.
value_counts_arr = np.sort(
    (train_targets_df.drop(columns=['sig_id']) == 1).sum(axis=0).to_numpy()
)

print(value_counts_arr)

In [None]:
# Histogram of the per-target positive counts computed above.
matplotlib.rcParams['figure.figsize'] = [10, 5]

plt.hist(value_counts_arr, bins=50, facecolor='g', alpha=0.75)
plt.xlabel("Number of 1's")
plt.ylabel("Number of classes")
plt.title("Value Counts of 1's in classes")
plt.show()

In [None]:
# Display only the object-dtype columns of the training features.
train_features_df.select_dtypes(include=[object])

In [None]:
# List the distinct values taken by the cp_dose column.
train_features_df.cp_dose.unique()

In [None]:
### Dealing w/ categorical features


# Encode training categorical features

# train_features_df = pd.get_dummies(train_features_df, columns=['cp_dose', 'cp_time'])
# test_features_df = pd.get_dummies(test_features_df, columns=['cp_dose', 'cp_time'])


In [None]:
# Preview the last rows of the scored training targets.
train_targets_df.tail()

In [None]:
# p = 23814
# train_targets_df.loc[p] = train_targets_df[train_targets_df.sig_id =='id_53b38e3be'].values[0]
# train_targets_df.loc[p+1] = train_targets_df[train_targets_df.sig_id =='id_dc2606109'].values[0]
# train_targets_df.loc[p, 'sig_id']  = 'id_53b10e3be'
# train_targets_df.loc[p+1, 'sig_id']  = 'id_53b50e2be'

# train_features_df.loc[p] = train_features_df[train_features_df.sig_id =='id_53b38e3be'].values[0]
# train_features_df.loc[p+1] = train_features_df[train_features_df.sig_id =='id_dc2606109'].values[0]
# train_features_df.loc[p, 'sig_id']  = 'id_53b10e3be'
# train_features_df.loc[p+1, 'sig_id']  = 'id_53b50e2be'

In [None]:
# Strip the identifier column: X / X_test hold the model inputs and y the
# targets.
X, X_test, y = (
    frame.drop(columns=['sig_id'])
    for frame in (train_features_df, test_features_df, train_targets_df)
)

In [None]:
#rank gauss
from sklearn.preprocessing import QuantileTransformer

# The gene ('g-') and cell ('c-') column lists are built here because this
# cell runs before the cells that otherwise define `genes` and `cells`; on a
# top-to-bottom run they would not exist yet.
genes = [col for col in X.columns if col.startswith('g-')]
cells = [col for col in X.columns if col.startswith('c-')]

# Map each gene/cell feature to a normal distribution via its quantiles,
# fitted on the training rows only.
transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
transformer.fit(X[genes+cells])

In [None]:
# Replace the gene and cell columns of both sets with their transformed
# values, using the transformer fitted above.
for frame in (X, X_test):
    frame.loc[:, genes+cells] = transformer.transform(frame[genes+cells])


In [None]:
from sklearn.decomposition import PCA

In [None]:
def pca_number(X_genes, X_test_genes, X_cells, X_test_cells, X, X_test, n_gene=200, n_cells=10):
    """Append PCA components of the gene and cell features to X and X_test.

    One PCA is fitted on the stacked train+test gene columns and another on
    the stacked train+test cell columns. The resulting components are added
    as new columns named ``pca_gene<i>`` and ``pca_cells<i>``.

    Parameters
    ----------
    X_genes, X_test_genes : DataFrame
        Gene columns of the train and test set.
    X_cells, X_test_cells : DataFrame
        Cell columns of the train and test set.
    X, X_test : DataFrame
        Frames the new columns are appended to; they must have as many rows
        as the corresponding gene/cell frames.
    n_gene : int, default=200
        Number of PCA components kept for the gene columns.
    n_cells : int, default=10
        Number of PCA components kept for the cell columns.

    Returns
    -------
    New_x, New_xtest : DataFrame
        ``X`` and ``X_test`` with the PCA columns appended.
    """
    def _project(train_block, test_block, n_components, prefix):
        # Fit on train and test rows together, then cut the result back into
        # its two parts by row position.
        pca = PCA(n_components=n_components)
        projected = pca.fit_transform(pd.concat([train_block, test_block]))
        names = [prefix + str(i) for i in range(n_components)]
        n_train = len(train_block)
        # The component frames reuse the index of X / X_test: the column-wise
        # concat below aligns on index labels, so a fresh 0..n-1 index would
        # misplace rows whenever X is not indexed that way.
        train_part = pd.DataFrame(projected[:n_train], columns=names, index=X.index)
        test_part = pd.DataFrame(projected[n_train:], columns=names, index=X_test.index)
        return train_part, test_part

    gene_train, gene_test = _project(X_genes, X_test_genes, n_gene, 'pca_gene')
    cell_train, cell_test = _project(X_cells, X_test_cells, n_cells, 'pca_cells')

    New_x = pd.concat([X, gene_train, cell_train], axis=1)
    New_xtest = pd.concat([X_test, gene_test, cell_test], axis=1)

    return New_x, New_xtest

In [None]:
# with PCA features
# Gene columns are recognised by their 'g-' name prefix.
features = X.columns
genes = [name for name in features if name.startswith('g-')]

In [None]:
# Cell columns are recognised by their 'c-' name prefix.
cells = [name for name in features if name.startswith('c-')]

In [None]:
from sklearn.cluster import KMeans
n_genes=35
n_cells=5
# Cluster the train and test rows together, once on the gene columns and once
# on the cell columns, and store each row's cluster label as a new feature.
# pd.concat is used to stack the frames because DataFrame.append no longer
# exists in current pandas releases.
kmeans = KMeans(n_genes)
data = kmeans.fit(pd.concat([X[genes], X_test[genes]]))
# Labels come back in stacking order: training rows first, test rows after.
X['kmeans_g'] = data.labels_[:X.shape[0]]
X_test['kmeans_g'] = data.labels_[X.shape[0]:]


kmeans = KMeans(n_cells)
data = kmeans.fit(pd.concat([X[cells], X_test[cells]]))
X['kmeans_c'] = data.labels_[:X.shape[0]]
X_test['kmeans_c'] = data.labels_[X.shape[0]:]


In [None]:
# Bare expression: evaluates to a tuple of the engineered feature names and
# assigns nothing.
'kmeans_g', 'kmeans_c','g_sum', 'g_mean', 'g_std', 'g_kurt','g_skew', 'c_sum','c_mean', 'c_std', 'c_kurt', 'c_skew', 'gc_sum','gc_mean','gc_std','gc_kurt','gc_skew','g_sum_cp_dose','g_mean_cp_dose','g_std_cp_dose','c_sum_cp_dose', 'c_mean_cp_dose', 'c_std_cp_dose'

In [None]:
def _dose_level_feature(frame, columns, stat):
    """Return a per-row feature that only depends on the row's ``cp_dose``.

    ``stat`` ('sum', 'mean' or 'std') is first computed per dose group for
    every column in ``columns`` and then once more across those columns,
    giving one number per dose. The dose labels are read from the groupby
    result itself instead of being assumed to be exactly 'D1' and 'D2'.
    """
    per_dose = getattr(frame[columns + ['cp_dose']].groupby(['cp_dose']), stat)()
    collapsed = getattr(per_dose.values, stat)(axis=1)
    return frame['cp_dose'].map(dict(zip(per_dose.index, collapsed)))


def _add_summary_features(frame):
    """Add row-wise and dose-level summary statistics to ``frame`` in place.

    Columns are created in a fixed order because a later cell selects them
    by position.
    """
    # Row-wise statistics over the gene, cell and combined column groups.
    for prefix, columns in (('g', genes), ('c', cells), ('gc', genes + cells)):
        block = frame[columns]
        frame[prefix + '_sum'] = block.sum(axis = 1)
        frame[prefix + '_mean'] = block.mean(axis = 1)
        frame[prefix + '_std'] = block.std(axis = 1)
        frame[prefix + '_kurt'] = block.kurtosis(axis = 1)
        frame[prefix + '_skew'] = block.skew(axis = 1)

    # Dose-level statistics, computed separately for each frame.
    for prefix, columns in (('g', genes), ('c', cells)):
        for stat in ('sum', 'mean', 'std'):
            frame[prefix + '_' + stat + '_cp_dose'] = _dose_level_feature(frame, columns, stat)


for df in X, X_test:
    _add_summary_features(df)
    

In [None]:
# One-hot encode the dose and treatment-time columns of both sets.
X, X_test = (
    pd.get_dummies(frame, columns=['cp_dose', 'cp_time'])
    for frame in (X, X_test)
)

In [None]:
# Independent copies of the gene and cell column blocks; they are the inputs
# of the PCA step.
X_genes, X_test_genes = X[genes].copy(), X_test[genes].copy()
#cells
X_cells, X_test_cells = X[cells].copy(), X_test[cells].copy()

In [None]:
import random
import torch
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and asks cuDNN for deterministic behaviour.

    Parameters
    ----------
    seed : int, default=42
        Seed value shared by all generators.
    """
    random.seed(seed)
    # Hash randomisation is fixed when an interpreter starts, so this only
    # takes effect in processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every CUDA device, not only the current one; it is a no-op when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick kernels by timing them, which can vary
    # between runs.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

In [None]:
# Append 650 gene and 50 cell PCA components to the train and test frames.
X, X_test = pca_number(
    X_genes=X_genes,
    X_test_genes=X_test_genes,
    X_cells=X_cells,
    X_test_cells=X_test_cells,
    X=X,
    X_test=X_test,
    n_gene=650,
    n_cells=50,
)

In [None]:
# Preview the training features after the PCA columns were appended.
X.head()

In [None]:
# Preview the raw test features (this frame still holds cp_type).
test_features_df.head()

In [None]:
# True for test rows whose cp_type is 'ctl_vehicle'; their predictions are
# set to zero after training.
control_mask = test_features_df['cp_type'].eq('ctl_vehicle')

In [None]:
# y = y[X['cp_type']!='ctl_vehicle'].reset_index(drop=True).to_numpy()
# X = X[X['cp_type']!='ctl_vehicle'].reset_index(drop=True)
# X_test = X_test[X_test['cp_type']!='ctl_vehicle'].reset_index(drop=True)

In [None]:
# cp_type is not used as a model input; remove it from both sets.
X, X_test = (frame.drop(columns=['cp_type']) for frame in (X, X_test))

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Variance filter with threshold 0.8, fitted on the train and test rows
# stacked together.
var_thresh = VarianceThreshold(0.8)

# pd.concat stacks the frames because DataFrame.append no longer exists in
# current pandas releases.
data = pd.concat([X, X_test])
var_thresh.fit(data)

In [None]:
# One boolean per column of X: True where the column passed the variance
# filter. Kept as a plain list so it can be edited before it is applied.
selected_columns=list(var_thresh.get_support())

In [None]:
# Names of the one-hot columns created for cp_dose and cp_time.
cats = [
    'cp_dose_D1',
    'cp_dose_D2',
    'cp_time_24',
    'cp_time_48',
    'cp_time_72',
]

In [None]:
# Names of the engineered features: cluster labels, row-wise statistics and
# dose-level statistics.
extras = [
    'kmeans_g', 'kmeans_c',
    'g_sum', 'g_mean', 'g_std', 'g_kurt', 'g_skew',
    'c_sum', 'c_mean', 'c_std', 'c_kurt', 'c_skew',
    'gc_sum', 'gc_mean', 'gc_std', 'gc_kurt', 'gc_skew',
    'g_sum_cp_dose', 'g_mean_cp_dose', 'g_std_cp_dose',
    'c_sum_cp_dose', 'c_mean_cp_dose', 'c_std_cp_dose',
]

In [None]:
# Number of columns that passed the variance filter.
sum(selected_columns)

In [None]:
# Total number of columns the filter was evaluated on.
len(selected_columns)

In [None]:
# Keep the engineered and one-hot columns whatever their variance. They are
# picked by name (`extras` + `cats`, 28 names) instead of by the fixed
# position slice 872:900, so the rule no longer depends on the column layout.
# NOTE(review): assumes positions 872..899 held exactly these 28 columns.
_forced_columns = set(extras + cats)
selected_columns = [
    bool(keep) or (column in _forced_columns)
    for keep, column in zip(selected_columns, X.columns)
]

In [None]:
# Selected-column views of both sets, stored under separate names so that X
# and X_test are left untouched by this cell.
t, t1 = X.loc[:, selected_columns], X_test.loc[:, selected_columns]


In [None]:
# Restrict both sets to the selected columns.
X, X_test = (frame.loc[:, selected_columns] for frame in (X, X_test))


In [None]:
# Preview the training features after column selection.
X.head()

In [None]:
# Display the (rows, columns) shape of the final training features.
X.shape

In [None]:
# Display the shapes of the targets and of the final test features.
y.shape, X_test.shape

In [None]:
!pip install transformers &> /dev/null

!git clone https://github.com/fastai/fastai &> /dev/null
!pip install -e "fastai[dev]" &> /dev/null
!pip install fastai2 &> /dev/null

In [None]:
from fastai.tabular.all import *
from fastai.tabular.data import *


In [None]:
import torch
import torch.nn as nn

import torch.nn.functional as F

class Model(nn.Module):
    """Feed-forward network with three weight-normalised linear layers.

    Every linear layer is preceded by batch normalisation; the second and
    third are additionally preceded by dropout. The first two linear layers
    are followed by a leaky ReLU, the last one returns raw logits.

    Parameters
    ----------
    num_features : int
        Size of the input vector.
    num_targets : int
        Number of outputs (logits).
    hidden_size : int
        Width of the two hidden layers.
    dropout : float, default=0.25
        Dropout probability used before the second and third linear layers.
    """
    def __init__(self, num_features, num_targets, hidden_size, dropout=0.25):
        super().__init__()
        # Attribute names are kept unchanged: they are the keys of the
        # state dict, so saved weights continue to load.
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(dropout)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))

        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(dropout)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))

    def forward(self, x):
        """Return the logits for a batch ``x`` whose last dimension is
        ``num_features``."""
        x = self.batch_norm1(x)
        x = F.leaky_relu(self.dense1(x))

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.leaky_relu(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        # No activation here: the loss function expects logits.
        x = self.dense3(x)

        return x

In [None]:
class MoaDataset(torch.utils.data.Dataset):
    """Dataset pairing feature rows with their target rows.

    Parameters
    ----------
    X : DataFrame
        Features, one row per sample.
    y : array-like
        Targets, one entry (or row) per sample.
    """
    def __init__(self, X, y):
        # Converted to float32 once, up front. A frame mixing float and
        # boolean columns yields an object array from ``.values``, which
        # torch.tensor cannot convert row by row.
        self.X = np.asarray(X.values, dtype=np.float32)
        self.y = np.asarray(y, dtype=np.float32)

    def __getitem__(self, idx):
        """Return ``(features, targets)`` for sample ``idx`` as float32
        tensors."""
        return (torch.tensor(self.X[idx], dtype=torch.float), torch.tensor(self.y[idx], dtype=torch.float))

    def __len__(self):
        return len(self.y)

In [None]:
class MoaTestDataset(torch.utils.data.Dataset):
    """Dataset of feature rows paired with a dummy zero target.

    Parameters
    ----------
    X : DataFrame
        Features, one row per sample.
    """
    def __init__(self, X):
        # Converted to float32 once, up front. A frame mixing float and
        # boolean columns yields an object array from ``.values``, which
        # torch.tensor cannot convert row by row.
        self.X = np.asarray(X.values, dtype=np.float32)
        # Placeholder targets so items have the same (x, y) form as the
        # training dataset.
        self.y = np.zeros(len(X), dtype=np.float32)

    def __getitem__(self, idx):
        """Return ``(features, 0.0)`` for sample ``idx`` as float32
        tensors."""
        return (torch.tensor(self.X[idx], dtype=torch.float), torch.tensor(self.y[idx], dtype=torch.float))

    def __len__(self):
        return len(self.X)

In [None]:

# test_dls = DataLoaders.from_dsets(test_dls, bs=128)

In [None]:
@delegates(torch.optim.AdamW.__init__)
def pytorch_adamw(param_groups, **kwargs):
    """Build a ``torch.optim.AdamW`` wrapped in ``OptimWrapper``.

    Every entry of ``param_groups`` becomes one optimizer parameter group
    that carries all of ``kwargs`` as its options.
    """
    groups = []
    for ps in param_groups:
        group = {'params': ps}
        group.update(kwargs)
        groups.append(group)
    return OptimWrapper(torch.optim.AdamW(groups))

In [None]:
# Optimizer factory: pytorch_adamw with lr=1e-3 and weight_decay=1e-5 bound
# in advance; it is passed to the Learner as opt_func.
CustomAdamW =partial(pytorch_adamw, lr=1e-3, weight_decay=1e-5)

In [None]:
# Convert the target frame to a NumPy array; the CV loop indexes it by
# position. Running this cell a second time fails, because an ndarray has no
# to_numpy().
y = y.to_numpy()

In [None]:
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F


In [None]:
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are moved towards 0.5 before the loss is computed:
    ``t * (1 - smoothing) + 0.5 * smoothing``.

    Parameters
    ----------
    weight : Tensor or None, default=None
        Optional rescaling weight passed on to
        ``F.binary_cross_entropy_with_logits``.
    reduction : {'mean', 'sum', 'none'}, default='mean'
        How the element-wise losses are combined. Any value other than
        'mean' or 'sum' returns the unreduced loss tensor.
    smoothing : float, default=0.0
        Smoothing amount; must satisfy ``0 <= smoothing < 1``.
    """
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The parent constructor already stores `weight` and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return the smoothed targets. ``n_labels`` is accepted but not
        used."""
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Return the smoothed BCE loss of logits ``inputs`` against
        ``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # The element-wise loss is requested explicitly. With the default
        # reduction the call would already return the mean, so 'sum' and
        # 'none' could never take effect.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
                                                  reduction='none')
        if self.reduction == 'sum':
            return loss.sum()
        if self.reduction == 'mean':
            return loss.mean()
        return loss

In [None]:
import gc
from fastai.data.core import DataLoaders

# Cross-validated training: one model per (seed, fold) pair. Test predictions
# are summed here and averaged when the submission is written.
test_preds = np.zeros((test_features_df.shape[0], y.shape[1]))
losses = []
SEED = [0]
skf = MultilabelStratifiedKFold(n_splits = 5)

for i in SEED:
    seed_everything(i)
    for fold, (train_index, valid_index) in enumerate(skf.split(X, y)):
        # Release memory left over from the previous fold.
        gc.collect()
        torch.cuda.empty_cache()
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        valid_dataset = MoaDataset(X_valid, y_valid)
        train_dataset = MoaDataset(X_train, y_train)
        dls = DataLoaders.from_dsets(train_dataset, valid_dataset, bs=128)

        dls = dls.cuda()
        # A fresh model is built for every fold.
        model = Model(
        num_features=X.shape[1],
        num_targets=y.shape[1],
        hidden_size=2000,
        )
        cbs=SaveModelCallback()
        model.to('cuda')
        loss_func = SmoothBCEwLogits(smoothing=0.001)

        learner = Learner(dls, model, loss_func = loss_func, opt_func=CustomAdamW)
        # NOTE(review): fastai v2 documents these fit_one_cycle arguments as
        # `lr_max` and `div`; confirm the installed version accepts
        # `max_lr` / `div_factor`.
        learner.fit_one_cycle(100, cbs=[EarlyStoppingCallback(monitor='valid_loss', patience=5),cbs],pct_start=0.1, div_factor=1000,
                                              max_lr=1e-3)
        # Lowest value of the second recorded metric column over all epochs.
        log_loss = np.array(learner.recorder.values)[:,1].min()
        learner.load('./model')
        # NOTE(review): relies on predict() leaving the raw model output in
        # `learner.pred` - confirm against the installed fastai version.
        learner.predict(torch.tensor(X_test.values, dtype=torch.float))
        preds = learner.pred
        test_preds += preds.sigmoid().detach().cpu().numpy()
        losses.append(log_loss)


# Mean of the per-run best losses (one entry per seed and fold).
print(np.mean(losses))
# Control rows get an all-zero prediction.
test_preds[control_mask] = 0

In [None]:
# test_preds holds the sum of the predictions of every trained model, one per
# (seed, fold) pair; dividing by that count gives the average.
n_models = len(SEED) * skf.get_n_splits()
sample_sub_df.iloc[:,1:] = test_preds / n_models

sample_sub_df.to_csv('submission.csv', index=False)