In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Dataset

In [None]:
import json
import time
import re
import random
import datetime
import pickle
import gc
import warnings

import numpy as np
import pandas as pd
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm

# sklearn
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA, TruncatedSVD, FactorAnalysis
from sklearn.cluster import KMeans

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.loss import _WeightedLoss

# tensorflow
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from tensorflow.keras import layers, regularizers, Sequential, backend, callbacks, optimizers, metrics, losses

# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Raise pandas' display limits so wide frames and long cell values are truncated later.
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 300)
# Show floats with three decimal places.
pd.options.display.float_format = '{:.3f}'.format
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:

import os
import sys
# The iterative-stratification package is imported from an attached input
# directory, so that directory has to be on the import path first.
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


In [None]:
# Every competition file is read from the same directory, built from one
# absolute prefix so that no read depends on the current working directory.
path = "/kaggle/"
data_dir = path + "input/lish-moa/"
df_Train = pd.read_csv(data_dir + "train_features.csv")
df_Test = pd.read_csv(data_dir + "test_features.csv")
df_Train_target_nonscored = pd.read_csv(data_dir + "train_targets_nonscored.csv")
df_Train_target_scored = pd.read_csv(data_dir + "train_targets_scored.csv")
sample_submission = pd.read_csv(data_dir + "sample_submission.csv")


# All columns after the first one ('sig_id') are scored target labels.
target_cols = df_Train_target_scored.columns[1:]

In [None]:
class Config():
    """Single place for the run switches, feature toggles and hyper-parameters.

    All values are class attributes; the config-dump cell at the end of the
    notebook iterates ``Config.__dict__`` in definition order, so the order of
    the attributes below is also the order in the written report.
    """
    # setting
    is_debug = False       # True shrinks epochs / seeds / folds in the constants cell for a quick smoke run
    is_kaggle = True       # True writes submission.csv and puts the config dump in './' instead of 'score/'
    is_pretrain = False    # not read anywhere in the code shown in this notebook

    # features
    do_variancethreshold = False   # apply variance_threshold() to the numeric columns
    do_kmeans = False              # not read anywhere in the code shown in this notebook
    do_filter = True               # drop 'ctl_vehicle' rows and the 'cp_type' column (data_filter)
    do_feature_squared = True      # add squared versions of the cell ('c-') columns
    do_feature_stats = True        # add row-wise sum/mean/std/kurtosis/skew features
    do_feature_pca = True          # add PCA components of gene and cell columns
    do_feature_svd = True          # add TruncatedSVD components of gene and cell columns
    do_feature_fa = True           # add FactorAnalysis components of gene and cell columns

    # constant
    seed = 42                      # default seed for seed_everything() and the decomposition / rankgauss helpers
    n_gene_comp = 70               # number of PCA/SVD/FA components computed from gene ('g-') columns
    n_cell_comp = 10               # number of PCA/SVD/FA components computed from cell ('c-') columns
    n_gene_kmeans_cluster = 30     # not read anywhere in the code shown in this notebook
    n_cell_kmeans_cluster = 5      # not read anywhere in the code shown in this notebook
    n_variance_threshold = 0.7     # threshold passed to variance_threshold()
    scaler = 'Rankgauss' # one of 'Rankgauss', 'Standard', 'Robust', 'MinMax'; any other value skips scaling

    # HyperParameters
    epochs = 80                    # copied into EPOCHS
    seed_avg = [0, 101, 202, 303 ,404, 999]   # seeds whose models are averaged; copied into SEED_AVG
    nfold = 7                      # number of CV folds; copied into NFOLDS
    verbose = 0                    # copied into VERBOSE
    lr = 1e-3                      # Adam learning rate; copied into LEARNING_RATE
    weight_decay = 1e-5            # copied into WEIGHT_DECAY, which is not used further in the code shown
    batch_size = 128               # copied into BATCH_SIZE

# Shared instance read by the rest of the notebook.
config = Config()

In [None]:
def HighestCorrelation(corrmat):
    """Return the ten largest absolute entries just below the top ``n`` ones.

    The matrix is flattened to a Series of absolute values and sorted in
    ascending order. With ``n = corrmat.shape[0]``, the last ``n`` entries are
    skipped and the ten entries before them are returned (fewer if the matrix
    is too small).
    """
    n_rows = corrmat.shape[0]
    ranked = corrmat.abs().unstack().sort_values(kind="quicksort")
    window_start = -(n_rows + 10)
    window_stop = -n_rows
    return ranked[window_start:window_stop]

def data_filter(train, test):
    """Remove control rows from both frames.

    Rows whose ``cp_type`` equals ``'ctl_vehicle'`` are dropped, the index is
    renumbered from zero, and the ``cp_type`` column itself is removed.
    Returns the filtered ``(train, test)`` pair; the inputs are not modified.
    """
    def _without_controls(frame):
        keep = frame['cp_type'] != 'ctl_vehicle'
        return frame.loc[keep].reset_index(drop=True).drop(columns='cp_type')

    return _without_controls(train), _without_controls(test)

def one_hot_encoder(df, cols):
    """One-hot encode each column in ``cols`` with sklearn's ``OneHotEncoder``.

    For every column the encoded indicator columns are appended to ``df`` as
    ``<col>_ohe_<i>`` and the original column is dropped.

    Args:
        df: frame holding the columns to encode.
        cols: iterable of column names to encode.

    Returns:
        A new DataFrame with the encoded columns in place of the originals.
    """
    for col in cols:
        # Keep the encoder's default sparse output and densify it here: the
        # `sparse=False` keyword is not accepted by newer scikit-learn releases.
        encoded = OneHotEncoder().fit_transform(df[[col]]).toarray()
        # Share df's index so the column-wise concat lines rows up by position
        # even when df does not carry a 0..n-1 index.
        ohe_df = pd.DataFrame(encoded, index=df.index).add_prefix(col + '_ohe_')
        # Join the encoded columns onto the original frame.
        df = pd.concat([df, ohe_df], axis=1)
        # Drop the column that has just been encoded.
        df = df.drop(col, axis=1)
    return df

def feature_stats(df):
    """Append row-wise distribution statistics to ``df`` in place.

    For the gene columns (``GENES``), the cell columns (``CELLS``) and both
    together, adds the row sum, mean, standard deviation, kurtosis and skew as
    ``g-*``, ``c-*`` and ``gc-*`` columns. Reads the module-level ``GENES`` and
    ``CELLS`` lists and returns the same (mutated) frame.
    """
    # (column-name suffix, pandas reducer name), in the order columns are added.
    reducers = (
        ('sum', 'sum'),
        ('mean', 'mean'),
        ('std', 'std'),
        ('kurt', 'kurtosis'),
        ('skew', 'skew'),
    )
    column_groups = (('g', GENES), ('c', CELLS), ('gc', GENES + CELLS))

    for prefix, columns in column_groups:
        for suffix, reducer in reducers:
            df.loc[:, f'{prefix}-{suffix}'] = getattr(df[columns], reducer)(axis=1)
    return df


def feature_pca(df, col_list, n_comp, col_type='g', seed=config.seed):
    """Append PCA components of ``col_list`` to ``df``.

    Args:
        df: frame containing the columns in ``col_list``.
        col_list: names of the columns the PCA is fitted on.
        n_comp: number of components to keep.
        col_type: prefix for the new columns, named ``{col_type}-pca_{i}``.
        seed: ``random_state`` passed to ``PCA``.

    Returns:
        A new DataFrame: ``df`` with the ``n_comp`` component columns added.
    """
    components = PCA(n_components=n_comp, random_state=seed).fit_transform(df[col_list])
    pca_df = pd.DataFrame(
        components,
        columns=[f'{col_type}-pca_{i}' for i in range(n_comp)],
        # Share df's index: concat(axis=1) aligns on index labels, so a fresh
        # 0..n-1 index would misalign rows whenever df is indexed differently.
        index=df.index,
    )
    return pd.concat([df, pca_df], axis=1)

def feature_svd(df, col_list, n_comp, col_type='g', seed=config.seed):
    """Append TruncatedSVD components of ``col_list`` to ``df``.

    Args:
        df: frame containing the columns in ``col_list``.
        col_list: names of the columns the decomposition is fitted on.
        n_comp: number of components to keep.
        col_type: prefix for the new columns, named ``{col_type}-svd_{i}``.
        seed: ``random_state`` passed to ``TruncatedSVD``.

    Returns:
        A new DataFrame: ``df`` with the ``n_comp`` component columns added.
    """
    components = TruncatedSVD(n_components=n_comp, random_state=seed).fit_transform(df[col_list])
    svd_df = pd.DataFrame(
        components,
        columns=[f'{col_type}-svd_{i}' for i in range(n_comp)],
        # Share df's index: concat(axis=1) aligns on index labels, so a fresh
        # 0..n-1 index would misalign rows whenever df is indexed differently.
        index=df.index,
    )
    return pd.concat([df, svd_df], axis=1)

def feature_fa(df, col_list, n_comp, col_type='g', seed=config.seed):
    """Append FactorAnalysis components of ``col_list`` to ``df``.

    Args:
        df: frame containing the columns in ``col_list``.
        col_list: names of the columns the factor analysis is fitted on.
        n_comp: number of factors to keep.
        col_type: prefix for the new columns, named ``{col_type}-fa_{i}``.
        seed: ``random_state`` passed to ``FactorAnalysis``.

    Returns:
        A new DataFrame: ``df`` with the ``n_comp`` factor columns added.
    """
    factors = FactorAnalysis(n_components=n_comp, random_state=seed).fit_transform(df[col_list])
    fa_df = pd.DataFrame(
        factors,
        columns=[f'{col_type}-fa_{i}' for i in range(n_comp)],
        # Share df's index: concat(axis=1) aligns on index labels, so a fresh
        # 0..n-1 index would misalign rows whenever df is indexed differently.
        index=df.index,
    )
    return pd.concat([df, fa_df], axis=1)

def feature_squared(df, cols_list):
    """Add a ``<name>_squared`` column for every column named in ``cols_list``.

    The frame is modified in place and also returned.
    """
    for name in cols_list:
        squared_name = f'{name}_squared'
        df.loc[:, squared_name] = df[name].pow(2)
    return df


def variance_threshold(df, n):
    """Drop the columns of ``df`` whose variance does not exceed ``n``.

    Args:
        df: numeric DataFrame.
        n: threshold passed to sklearn's ``VarianceThreshold``.

    Returns:
        A DataFrame with only the surviving columns, keeping their original
        names and the original row index.
    """
    selector = VarianceThreshold(threshold=n)
    kept_values = selector.fit_transform(df)
    # fit_transform returns a bare array; recover the surviving column names
    # from the selector's boolean mask so callers can still address by name.
    kept_columns = df.columns[selector.get_support()]
    return pd.DataFrame(kept_values, columns=kept_columns, index=df.index)


def rankgauss(df, cols, seed=config.seed):
    """Quantile-transform each column in ``cols`` to a normal distribution.

    A separate ``QuantileTransformer`` (100 quantiles, normal output) is fitted
    per column and its output is written back into ``df``. The frame is
    modified in place and also returned.
    """
    for col in cols:
        # sklearn expects a 2-D (n_samples, 1) array for a single feature.
        column_2d = df[col].values.reshape(len(df[col].values), 1)
        quantile_tf = QuantileTransformer(
            n_quantiles=100,
            random_state=seed,
            output_distribution="normal",
        )
        quantile_tf.fit(column_2d)
        df[col] = quantile_tf.transform(column_2d).ravel()

    return df



def feature_engineering(train_features, test_features):
    """Build the model-ready train and test feature frames.

    Train and test rows are stacked so that every transformation is fitted on
    both together, then split apart again by position. Steps, each gated by
    the named ``config`` setting:

    1. ``do_filter``: drop ``ctl_vehicle`` rows and the ``cp_type`` column.
    2. ``do_feature_stats`` / ``do_feature_squared``: row statistics and
       squared cell features.
    3. ``do_feature_pca`` / ``do_feature_svd`` / ``do_feature_fa``:
       decomposition components of gene and cell columns.
    4. ``do_variancethreshold``: variance filter on the numeric columns.
    5. ``scaler``: scale the numeric columns ('Rankgauss', 'Standard',
       'Robust' or 'MinMax'; any other value leaves them unscaled).
    6. One-hot encode the categorical columns.

    Side effect: sets the module-level ``GENES`` and ``CELLS`` lists, which
    ``feature_stats`` reads.

    Args:
        train_features: raw training feature table.
        test_features: raw test feature table.

    Returns:
        ``(train, test)`` DataFrames, each with a fresh 0..n-1 index, holding
        ``sig_id``, the one-hot columns and the numeric columns.
    """
    global GENES, CELLS

    GENES = [col for col in train_features.columns if col.startswith('g-')]
    CELLS = [col for col in train_features.columns if col.startswith('c-')]

    cat_columns = ['cp_time', 'cp_dose']

    # filter
    if config.do_filter:
        print('do filter')
        train, test = data_filter(train_features, test_features)
    else:
        # Without the filter the inputs are used as they are. The string
        # column 'cp_type' is then still present, so treat it as categorical
        # to keep it out of the numeric scaling below.
        train, test = train_features, test_features
        if 'cp_type' in train.columns:
            cat_columns = cat_columns + ['cp_type']

    # Remember where the train rows end so the stacked frame can be split back.
    n_train = len(train)

    df = pd.concat([train, test])
    df = df.reset_index(drop=True)

    # Stats feature
    if config.do_feature_stats:
        print('do feature_stats')
        df = feature_stats(df)

    # squared
    if config.do_feature_squared:
        print('do feature_squared')
        df = feature_squared(df, CELLS)

    # PCA feature
    if config.do_feature_pca:
        print('do feature_pca')
        df = feature_pca(df, GENES, n_comp=config.n_gene_comp, col_type='g')
        df = feature_pca(df, CELLS, n_comp=config.n_cell_comp, col_type='c')

    # SVD feature
    if config.do_feature_svd:
        print('do feature_svd')
        df = feature_svd(df, GENES, n_comp=config.n_gene_comp, col_type='g')
        df = feature_svd(df, CELLS, n_comp=config.n_cell_comp, col_type='c')

    # FA feature
    if config.do_feature_fa:
        print('do feature_fa')
        df = feature_fa(df, GENES, n_comp=config.n_gene_comp, col_type='g')
        df = feature_fa(df, CELLS, n_comp=config.n_cell_comp, col_type='c')

    cat_df = df[['sig_id'] + cat_columns]
    num_df = df.drop(['sig_id'] + cat_columns, axis=1)

    # VarianceThreshold
    if config.do_variancethreshold:
        print('do variancethreshold')
        num_df = variance_threshold(num_df, n=config.n_variance_threshold)

    if config.scaler == 'Rankgauss':
        print('do Rankgauss')
        # Scale num_df itself: it is the frame concatenated into the result
        # below, so transforming `df` here would throw the scaling away.
        num_df = rankgauss(num_df, num_df.columns.tolist())

    elif config.scaler == 'Standard':
        print('do Standard')
        sscaler = StandardScaler()
        num_df.iloc[:, :] = sscaler.fit_transform(num_df)

    elif config.scaler == 'Robust':
        print('do Robust')
        rscaler = RobustScaler()
        num_df.iloc[:, :] = rscaler.fit_transform(num_df)

    elif config.scaler == 'MinMax':
        print('do MinMax')
        mmscaler = MinMaxScaler()
        num_df.iloc[:, :] = mmscaler.fit_transform(num_df)

    # one-hot-encode
    cat_df = one_hot_encoder(cat_df, cat_columns)

    df = pd.concat([cat_df, num_df], axis=1)

    # train & test
    train = df.iloc[:n_train, :].reset_index(drop=True)
    test = df.iloc[n_train:, :].reset_index(drop=True)

    return train, test



In [None]:
# Run the feature-engineering pipeline on the raw train and test feature tables.
train, test = feature_engineering(df_Train, df_Test)


In [None]:
# Names of the scored targets (the values the model predicts) and of the
# non-scored targets (kept for pre-training); 'sig_id' is the key, not a target.
target_cols = df_Train_target_scored.columns.drop('sig_id').tolist()
target_cols_non_scored = df_Train_target_nonscored.columns.drop('sig_id').tolist()
# Every engineered column except the row identifier is a model input.
feature_cols = [col for col in train.columns if col != 'sig_id']

# Attach the scored targets to the feature rows and keep a label frame
# ('sig_id' plus targets) in the same row order.
train = pd.merge(train, df_Train_target_scored, on='sig_id')
target = train[df_Train_target_scored.columns]

# pretrain
train_non_scored = pd.merge(train[['sig_id'] + feature_cols], df_Train_target_nonscored, on='sig_id')
target_non_scored = train_non_scored[df_Train_target_nonscored.columns]

In [None]:
# Display the training frame after the scored targets have been merged in.
train

In [None]:
def seed_everything(seed=config.seed):
    """Seed Python, NumPy, TensorFlow and PyTorch with the same value.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The remaining libraries each expose a one-argument seeding function.
    library_seeders = (
        np.random.seed,
        tf.random.set_seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for set_seed in library_seeders:
        set_seed(seed)
    torch.backends.cudnn.deterministic = True


# Seed once up front with the default from the config.
seed_everything()

In [None]:
# Function to calculate the mean log loss of the targets including clipping
def mean_log_loss(y_true, y_pred):
    """Mean of the per-target log losses.

    Args:
        y_true: 2-D array of binary labels, one column per target.
        y_pred: 2-D array of predicted probabilities, same shape as ``y_true``.

    Returns:
        The average over columns of ``log_loss(y_true[:, j], y_pred[:, j])``,
        with predictions clipped to ``[1e-15, 1 - 1e-15]`` first.
    """
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    # Take the number of targets from the data itself rather than from the
    # module-level `target_cols`, so any label matrix can be scored.
    n_targets = y_true.shape[1]
    per_target_losses = [log_loss(y_true[:, j], y_pred[:, j]) for j in range(n_targets)]
    return np.mean(per_target_losses)


def create_model_3l(feature_len, target_len):
    """Build and compile the three-hidden-layer MLP.

    Each hidden stage is BatchNormalization -> Dropout -> weight-normalised
    Dense(relu); a final BatchNormalization -> Dropout feeds a weight-normalised
    sigmoid output layer. The model is compiled with Adam wrapped in Lookahead
    (learning rate read from the module-level ``LEARNING_RATE`` at call time),
    label-smoothed binary cross-entropy as the loss, and plain binary
    cross-entropy as the tracked metric.

    Args:
        feature_len: number of input features.
        target_len: number of output targets.

    Returns:
        The compiled ``tf.keras`` model.
    """
    # (dropout rate applied before the layer, units of the dense layer).
    # NOTE(review): the exact values look like the result of a hyper-parameter
    # search -- confirm before changing them.
    hidden_stages = (
        (0.4914099166744246, 1159),
        (0.18817607797795838, 960),
        (0.12542057776853896, 1811),
    )
    output_dropout = 0.20175242230280122

    # Keras expects a shape tuple; `(feature_len)` without the trailing comma
    # is a plain int.
    inp = tf.keras.layers.Input(shape=(feature_len,))
    x = inp
    for dropout_rate, units in hidden_stages:
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dropout(dropout_rate)(x)
        x = tfa.layers.WeightNormalization(tf.keras.layers.Dense(units, activation='relu'))(x)

    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(output_dropout)(x)
    out = tfa.layers.WeightNormalization(tf.keras.layers.Dense(target_len, activation='sigmoid'))(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    opt = tf.optimizers.Adam(learning_rate=LEARNING_RATE)
    opt = tfa.optimizers.Lookahead(opt, sync_period=10)
    model.compile(
        optimizer=opt,
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.0015),
        # Reported as 'binary_crossentropy'; the training callbacks monitor
        # its validation counterpart 'val_binary_crossentropy'.
        metrics=[tf.keras.metrics.BinaryCrossentropy()],
    )
    return model

In [None]:

# Optimiser and batching settings come straight from the config.
LEARNING_RATE = config.lr
WEIGHT_DECAY = config.weight_decay
BATCH_SIZE = config.batch_size

# Debug mode swaps in a much smaller training schedule for a quick smoke run.
EPOCHS = 3 if config.is_debug else config.epochs
SEED_AVG = [0, 101] if config.is_debug else config.seed_avg
NFOLDS = 3 if config.is_debug else config.nfold
VERBOSE = 2 if config.is_debug else config.verbose

In [None]:
def run_training(tr_idx, va_idx, fold, seed):
    """Train one model on one CV fold and predict validation and test rows.

    Reads the module-level ``train``, ``test``, ``feature_cols`` and
    ``target_cols``.

    Args:
        tr_idx: positional row indices of ``train`` used for fitting.
        va_idx: positional row indices of ``train`` used for validation.
        fold: fold number; accepted for the caller's bookkeeping, not used here.
        seed: seed applied to all libraries before the model is built.

    Returns:
        ``(oof, predictions)``: ``oof`` has one row per ``train`` row and is
        zero everywhere except the validation rows, which hold this model's
        predictions; ``predictions`` holds its predictions for every ``test`` row.
    """
    # SEED
    seed_everything(seed)

    # train & validation
    train_df = train.iloc[tr_idx]
    valid_df = train.iloc[va_idx]
    x_train, y_train = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[target_cols].values

    # Drop graph state left over from the previous fold's model.
    K.clear_session()
    model = create_model_3l(len(feature_cols), len(target_cols))
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_binary_crossentropy',
        mode='min',
        patience=10,
        restore_best_weights=True,
        verbose=2,
    )
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_binary_crossentropy',
        mode='min',
        factor=0.3,
        patience=3,
        verbose=2,
    )

    model.fit(
        x_train, y_train,
        validation_data=(x_valid, y_valid),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[early_stopping, reduce_lr],
        # Honour the configured verbosity (2 in debug mode) instead of a fixed level.
        verbose=VERBOSE,
    )

    # Only the validation rows are filled in; the caller sums the folds.
    oof = np.zeros((len(train), len(target_cols)))
    oof[va_idx] = model.predict(x_valid)
    predictions = model.predict(test[feature_cols].values)

    return oof, predictions

In [None]:
# Show the list of seeds the training loop below iterates over.
SEED_AVG

In [None]:

%%time

# Accumulators for the seed-averaged out-of-fold and test predictions.
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

# Label matrix without the string 'sig_id' column: used both to stratify the
# folds and to score the out-of-fold predictions.
true_labels = target[target_cols]

for seed in SEED_AVG:
    print(f'============== Training SEED is {seed} ==============')
    start = time.time()

    oof_fold = np.zeros((len(train), len(target_cols)))
    predictions_fold = np.zeros((len(test), len(target_cols)))

    # CV
    mskf = MultilabelStratifiedKFold(n_splits=NFOLDS, random_state=seed, shuffle=True)

    # split() is a generator, so tell tqdm how many folds to expect.
    fold_splits = tqdm(mskf.split(X=train, y=true_labels), total=NFOLDS)
    for i_fold, (tr_idx, va_idx) in enumerate(fold_splits):

        oof_, pred_ = run_training(tr_idx, va_idx, i_fold, seed)

        # Each fold fills only its own validation rows, so summing the folds
        # gives a complete OOF matrix; test predictions are averaged over folds.
        oof_fold += oof_
        predictions_fold += pred_ / NFOLDS

    oof_score = mean_log_loss(true_labels.values, oof_fold)
    print(f'seed: {seed} fold mean log loss score is {oof_score}')

    # Average over seeds.
    oof += oof_fold / len(SEED_AVG)
    predictions += predictions_fold / len(SEED_AVG)

    elapsed_time = time.time() - start
    print(f'SEED: {seed} Elapsed_time:{elapsed_time:.4f} sec')

seed_log_loss = mean_log_loss(true_labels.values, oof)
print(f'Our out of folds log loss for our seed blend model is {seed_log_loss}')

print(f'==================== Training END ====================')

In [None]:


# Keep the out-of-fold predictions in their own frame instead of writing them
# into `train`: overwriting the label columns there would make a re-run of the
# training cell fit on predictions rather than on the true targets.
oof_df = pd.DataFrame(oof, columns=target_cols)
oof_df.insert(0, 'sig_id', train['sig_id'].values)
test[target_cols] = predictions
print(len(target_cols))

# Left-join onto the full scored-target table; sig_ids that are not in `train`
# have no prediction and get 0 for every target.
valid_results = df_Train_target_scored.drop(columns=target_cols).merge(oof_df, on='sig_id', how='left').fillna(0)
print(df_Train_target_scored.shape, valid_results.shape)

In [None]:

y_true = df_Train_target_scored[target_cols].values
y_pred = valid_results[target_cols].values

# Average the per-target log loss over the scored targets only. `target` also
# carries the 'sig_id' column, so dividing by its width would understate the score.
per_target_scores = [log_loss(y_true[:, i], y_pred[:, i]) for i in range(len(target_cols))]
score = float(np.mean(per_target_scores))

print("CV log_loss: ", score)

In [None]:
# Show the Kaggle flag; the cells below use it to decide whether to write
# submission.csv and where to put the config dump.
config.is_kaggle

In [None]:

if config.is_kaggle:
    # Start from the sample submission's ids, attach the test predictions, and
    # fill ids that have no prediction with 0 before writing the file.
    submission_ids = sample_submission.drop(columns=target_cols)
    test_predictions = test[['sig_id'] + target_cols]
    sub = submission_ids.merge(test_predictions, on='sig_id', how='left').fillna(0)
    sub.to_csv('submission.csv', index=False)

In [None]:
# One "key: value" line per Config attribute, followed by the CV score.
config_str = ''.join(
    f'{key}: {value}\n'
    for key, value in Config.__dict__.items()
    if not key.startswith('__')
)

config_str += 'CV log_loss:' + str(score)
# Timestamp (month, day, hour, minute) used in the output file name.
data_str = datetime.datetime.now().strftime("%m%d%H%M")

config_dir = './' if config.is_kaggle else 'score/'
# The 'score/' directory may not exist yet; create it so open() cannot fail on it.
os.makedirs(config_dir, exist_ok=True)

with open(config_dir + str(round(score, 7)) + '_' + data_str + '_config_score.txt', mode='w') as f:
    f.write(config_str)