In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from functools import partial
from collections import defaultdict

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split, Subset

import catalyst
from catalyst import dl

import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import KFold
from sklearn.feature_selection import VarianceThreshold

import os
import sys

import plotly.express as px

# Seed shared by every stochastic step configured in this cell's notebook.
RANDOM_SEED = 42

# Choose the data location from the environment: inside a Kaggle kernel the
# competition files are read from /kaggle/input, otherwise from ./data.
if not os.path.exists("/kaggle"):
    DATA_DIRECTORY = "data"
else:
    DATA_DIRECTORY = "/kaggle/input/lish-moa"

    # The iterstrat import below is resolved from this directory on Kaggle.
    sys.path.append("/kaggle/input/iterative-stratification/iterative-stratification-master/")

    # Print every file available under the input directory.
    for folder, _, names in os.walk('/kaggle/input'):
        for name in names:
            print(os.path.join(folder, name))

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def file_path(filename):
    """Return the path of `filename` inside the configured DATA_DIRECTORY."""
    # DATA_DIRECTORY is only read here, so no `global` declaration is needed.
    return os.path.join(DATA_DIRECTORY, filename)

# Seed torch's and NumPy's global random number generators with RANDOM_SEED.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [None]:
# Load the three training tables in sig_id order.  sort_values() keeps the
# original index labels, so the index is reset afterwards: later cells mix
# positional operations (concat(axis=1) with freshly built frames) and
# label-aligned ones (boolean masks), and both must agree on which row is which.
train = pd.read_csv(file_path("train_features.csv")).sort_values(by='sig_id').reset_index(drop=True)
targets = pd.read_csv(file_path("train_targets_scored.csv")).sort_values(by='sig_id').reset_index(drop=True)
df_target_ns = pd.read_csv(file_path("train_targets_nonscored.csv")).sort_values(by='sig_id').reset_index(drop=True)
test = pd.read_csv(file_path("test_features.csv"))

# Features and targets are paired row by row downstream, so fail loudly here
# if the files do not describe the same samples.
assert train['sig_id'].equals(targets['sig_id']), "train features and scored targets are misaligned"
assert train['sig_id'].equals(df_target_ns['sig_id']), "train features and non-scored targets are misaligned"

# Baseline submission: every target column is filled with its mean over the training targets.
submission = test[['sig_id']].assign(**targets.iloc[:, 1:].mean())

In [None]:
# True for test rows that are not vehicle controls; reused at the end of the
# notebook to place the model predictions into `submission`.
mask = test['cp_type'] != 'ctl_vehicle'
# Predictions for vehicle-control rows are forced to 0.  iloc is positional, so
# it is given a plain boolean array rather than an index-carrying Series.
submission.iloc[(~mask).values, 1:] = 0

In [None]:
# Extra names for the loaded frames (plain references, not copies).
df_train, df_test, df_target_s = train, test, targets

In [None]:
# Quantile transformer shared by the train and test preprocessing calls below.
transformer = QuantileTransformer(n_quantiles=100,random_state=42, output_distribution="normal")


def preprocess(df, fit=True):
    """Encode the dosing columns and quantile-transform the g-/c- features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding `cp_time`, `cp_dose` and the `g-*` / `c-*` columns.
    fit : bool, default True
        When True the module-level `transformer` is (re)fitted on `df` before
        transforming it.  Pass False to reuse the mapping learned by a previous
        call, so that test data is transformed exactly like the training data
        instead of being normalised against its own distribution.

    Returns
    -------
    pandas.DataFrame
        A transformed copy.  `df` itself is left untouched, which keeps the
        cell re-runnable (mapping `cp_time` a second time would yield NaN).
    """
    df = df.copy()
    df['cp_time'] = df['cp_time'].map({24:1, 48:2, 72:3})
    df['cp_dose'] = df['cp_dose'].map({'D1':0, 'D2':1})
    g_features = [col for col in df.columns if col.startswith('g-')]
    c_features = [col for col in df.columns if col.startswith('c-')]
    feature_cols = g_features + c_features
    if feature_cols:
        # QuantileTransformer works column by column, so a single fit over all
        # columns replaces the former one-fit-per-column loop.
        raw = df[feature_cols]
        if fit:
            transformer.fit(raw)
        df[feature_cols] = transformer.transform(raw)
    return df


X = preprocess(df_train)                # learns the quantile mapping on train
X_test = preprocess(df_test, fit=False)  # applies that same mapping to test


y = df_target_s.drop('sig_id', axis=1)
y0 =  df_target_ns.drop('sig_id', axis=1)

In [None]:
# Please see reference 3 for this part
n_comp = 0.95  # share of variance each PCA has to retain


def _pca_frames(train_df, test_df, columns, prefix, n_components, seed=42):
    """Fit one PCA on the stacked train+test `columns` and split the result.

    Returns two frames (train, test) whose columns are named f'{prefix}{i}' and
    whose indexes match the inputs, so an axis=1 concat pairs rows by position.
    """
    stacked = pd.concat([train_df[columns], test_df[columns]])
    components = PCA(n_components, random_state=seed).fit_transform(stacked)
    names = [f'{prefix}{i}' for i in range(components.shape[1])]
    n_train = train_df.shape[0]
    train_part = pd.DataFrame(components[:n_train], columns=names, index=train_df.index)
    test_part = pd.DataFrame(components[n_train:], columns=names, index=test_df.index)
    return train_part, test_part


# PCA components of the g- features, appended as pca_g-* columns.
g_features = [cols for cols in X.columns if cols.startswith('g-')]
train2, test2 = _pca_frames(X, X_test, g_features, 'pca_g-', n_comp)
X = pd.concat((X, train2), axis=1)
X_test = pd.concat((X_test, test2), axis=1)

# Same for the c- features, appended as pca_c-* columns.
c_features = [cols for cols in X.columns if cols.startswith('c-')]
train2, test2 = _pca_frames(X, X_test, c_features, 'pca_c-', n_comp)
X = pd.concat((X, train2), axis=1)
X_test = pd.concat((X_test, test2), axis=1)

# Variance filter over every column after the first four, fitted on train and
# test stacked together.  pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
var_thresh = VarianceThreshold(0.8)
data = pd.concat([X, X_test])
data_transformed = var_thresh.fit_transform(data.iloc[:, 4:])

train_features_transformed = data_transformed[ : X.shape[0]]
test_features_transformed = data_transformed[-X_test.shape[0] : ]

# Rebuild both frames as the four identifier columns followed by the surviving
# features; the latter end up with integer column names 0..k-1.
id_columns = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']

X = pd.concat(
    [pd.DataFrame(X[id_columns].values, columns=id_columns),
     pd.DataFrame(train_features_transformed)],
    axis=1,
)

X_test = pd.concat(
    [pd.DataFrame(X_test[id_columns].values, columns=id_columns),
     pd.DataFrame(test_features_transformed)],
    axis=1,
)

display(X.head(2))
print(X.shape)
display(X_test.head(2))
print(X_test.shape)

In [None]:
from sklearn.cluster import KMeans
def fe_cluster(train, test, n_clusters_g = 35, n_clusters_c = 5, SEED = 239):
    """Add one-hot KMeans cluster memberships to `train` and `test`.

    Two KMeans models are fitted on train and test stacked together: one on the
    columns at positions 4..775 (kind 'g', `n_clusters_g` clusters) and one on
    positions 776..875 (kind 'c', `n_clusters_c` clusters).  Each produces the
    columns `clusters_<kind>_<label>`.

    Both outputs always carry the full set of `n_clusters` indicator columns,
    even when a cluster has no member in one of the two frames; encoding the
    frames independently would otherwise give train and test different columns.

    Returns the pair (train, test) of encoded frames.  As a side effect the
    input frames receive the intermediate `clusters_<kind>` label columns.
    """
    # NOTE(review): the groups are selected by fixed column positions, not by
    # name - confirm these ranges still match the intended g-/c- features after
    # the upstream feature selection.
    features_g = list(train.columns[4:776])
    features_c = list(train.columns[776:876])

    def create_cluster(train, test, features, kind = 'g', n_clusters = n_clusters_g):
        """Cluster on `features` and one-hot encode the labels in both frames."""
        data = pd.concat([train[features], test[features]], axis = 0)
        kmeans = KMeans(n_clusters = n_clusters, random_state = SEED).fit(data)
        n_train = train.shape[0]
        column = f'clusters_{kind}'
        # A categorical with every possible label makes get_dummies emit the
        # same indicator columns for train and test.
        all_labels = list(range(n_clusters))
        train[column] = pd.Categorical(kmeans.labels_[:n_train], categories = all_labels)
        test[column] = pd.Categorical(kmeans.labels_[n_train:], categories = all_labels)
        train = pd.get_dummies(train, columns = [column])
        test = pd.get_dummies(test, columns = [column])
        return train, test

    train, test = create_cluster(train, test, features_g, kind = 'g', n_clusters = n_clusters_g)
    train, test = create_cluster(train, test, features_c, kind = 'c', n_clusters = n_clusters_c)
    return train, test

# Add the cluster indicator columns, then preview both frames.
X, X_test = fe_cluster(X, X_test)
for frame in (X, X_test):
    display(frame.head(2))
    print(frame.shape)

In [None]:
def fe_stats(train, test):
    """Append row-wise summary statistics to `train` and `test` in place.

    Three column groups are summarised: 'g' (columns at positions 4..775 of
    `train`), 'c' (positions 776..875) and 'gc' (both together).  For each group
    the sum, mean, std, kurtosis and skew across the group's columns are stored
    as `<group>_sum`, `<group>_mean`, `<group>_std`, `<group>_kurt` and
    `<group>_skew`.  Returns the same two (mutated) frames.
    """
    g_cols = list(train.columns[4:776])
    c_cols = list(train.columns[776:876])

    groups = (('g', g_cols), ('c', c_cols), ('gc', g_cols + c_cols))
    # (column suffix, DataFrame method) in the order the columns are appended.
    reducers = (
        ('sum', 'sum'),
        ('mean', 'mean'),
        ('std', 'std'),
        ('kurt', 'kurtosis'),
        ('skew', 'skew'),
    )

    for frame in (train, test):
        for prefix, cols in groups:
            subset = frame[cols]
            for suffix, method in reducers:
                frame[f'{prefix}_{suffix}'] = getattr(subset, method)(axis = 1)

    return train, test

# Add the row-wise statistics, then preview both frames.
X, X_test = fe_stats(X, X_test)
for frame in (X, X_test):
    display(frame.head(2))
    print(frame.shape)

In [None]:
# Keep only compound-treated rows.  The mask is converted to a plain array so
# it is applied by position: X and the target frames do not necessarily share
# index labels, and a label-aligned boolean Series could then select the wrong
# target rows.  A length mismatch raises instead of being silently reindexed.
treated_train = (X['cp_type'] == 'trt_cp').values
y0 = y0[treated_train].reset_index(drop = True)
y = y[treated_train].reset_index(drop = True)
X = X[treated_train].reset_index(drop = True).drop(['cp_type','sig_id'], axis=1)

treated_test = (X_test['cp_type'] == 'trt_cp').values
X_test = X_test[treated_test].reset_index(drop = True).drop(['cp_type','sig_id'], axis=1)

print('New data shape', X.shape)

In [None]:
def basic_preprocess(X, y=None, delete_veh=True, pca=None):
    """Drop vehicle-control rows and encode the dosing columns, in place.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature frame with `sig_id`, `cp_type`, `cp_time`, `cp_dose`.
    y : pandas.DataFrame, optional
        Target frame with the same row order as `X` and a `sig_id` column.
    delete_veh : bool, default True
        Remove the rows whose `cp_type` is 'ctl_vehicle' from `X` (and `y`).
    pca : unused; kept so existing calls keep working.

    Returns
    -------
    `X`, or `(X, y)` when `y` is given.  Both inputs are modified in place:
    indexes are reset, `sig_id`/`cp_type` are dropped, `cp_dose` becomes -1/+1
    (D2 -> +1) and `cp_time` is replaced by the -1/+1 indicators `cp_time1`
    (24) and `cp_time2` (48).
    """
    if delete_veh:
        vehicle_rows = np.where(X['cp_type'] == 'ctl_vehicle')[0]
    else:
        vehicle_rows = np.array([], dtype=int)

    if y is not None:
        # np.where yields positions, drop() expects labels: translate through
        # each frame's own index so non-default indexes are handled correctly.
        y.drop(index=y.index[vehicle_rows], inplace=True)
        y.reset_index(drop=True, inplace=True)
        y.drop(columns='sig_id', inplace=True)

    X.drop(index=X.index[vehicle_rows], inplace=True)
    X.reset_index(drop=True, inplace=True)
    X.drop(columns=['cp_type', 'sig_id'], inplace=True)
    # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
    X['cp_dose'] = ((X['cp_dose'] == 'D2').astype(int) - 0.5) * 2
    X['cp_time1'] = ((X['cp_time'] == 24).astype(int) - 0.5) * 2
    X['cp_time2'] = ((X['cp_time'] == 48).astype(int) - 0.5) * 2
    X.drop(columns='cp_time', inplace=True)

    if y is not None:
        return X, y
    return X

def preprocess(train, targets, test, pca=None, scaler=None, label_smoothing=0, threshold=0.2):
    """Alternative preprocessing pipeline built on top of basic_preprocess.

    NOTE(review): this reuses the name of an earlier `preprocess` definition in
    this notebook and replaces it once this cell has run; the only call to this
    version (below the function) is commented out.

    Parameters
    ----------
    train, targets, test : pandas.DataFrame
        Passed through basic_preprocess first, which modifies them in place.
    pca : truthy / falsy
        Only its truthiness is used: when truthy a fresh ``PCA()`` is created,
        fitted on the train 'g-' (then 'c-') columns and applied to test; a
        VarianceThreshold(threshold) fitted on train then selects components,
        which replace the original columns as 'gp-<i>' / 'cp-<i>'.
    scaler : object with fit_transform / transform, optional
        Fitted on the train 'g-' (then 'c-') columns and applied to test.
    label_smoothing : float
        When non-zero, `targets` is clipped in place to
        [label_smoothing, 1 - label_smoothing].  Shadows the module-level
        `label_smoothing` inside this function.
    threshold : float
        Argument given to VarianceThreshold in the pca branch.

    Returns
    -------
    tuple (train, targets, test)
    """
    train, targets = basic_preprocess(train, targets)
    test = basic_preprocess(test)
    
    if pca:
        # The argument is only a flag; the estimator actually used is created here.
        pca = PCA()
        # Column mask computed on train and reused for test, so both frames are
        # assumed to have identical column layouts at this point.
        mask = train.columns.str.startswith('g-')
        train.loc[:, mask] = pca.fit_transform(train.loc[:, mask])
        test.loc[:, mask] = pca.transform(test.loc[:, mask])
        
        variance_thresholder = VarianceThreshold(threshold)
        train_features = variance_thresholder.fit_transform(train.loc[:, mask])
        train.drop(columns=train.columns[mask], inplace=True)
        # Index-on-index merge: relies on the fresh 0..n-1 index of the new frame
        # matching train's index.
        train = train.merge(pd.DataFrame(train_features, columns=[f'gp-{i}' for i in range(train_features.shape[1])]), left_index=True, right_index=True)
        test_features = variance_thresholder.transform(test.loc[:, mask])
        test.drop(columns=test.columns[mask], inplace=True)
        test = test.merge(pd.DataFrame(test_features, columns=[f'gp-{i}' for i in range(test_features.shape[1])]), left_index=True, right_index=True)

        # Same procedure for the 'c-' columns; the mask is recomputed because the
        # column layout changed above.
        mask = train.columns.str.startswith('c-')
        train.loc[:, mask] = pca.fit_transform(train.loc[:, mask])
        test.loc[:, mask] = pca.transform(test.loc[:, mask])
        
        variance_thresholder = VarianceThreshold(threshold)
        train_features = variance_thresholder.fit_transform(train.loc[:, mask])
        train.drop(columns=train.columns[mask], inplace=True)
        train = train.merge(pd.DataFrame(train_features, columns=[f'cp-{i}' for i in range(train_features.shape[1])]), left_index=True, right_index=True)
        test_features = variance_thresholder.transform(test.loc[:, mask])
        test.drop(columns=test.columns[mask], inplace=True)
        test = test.merge(pd.DataFrame(test_features, columns=[f'cp-{i}' for i in range(test_features.shape[1])]), left_index=True, right_index=True)
        
    if scaler:
        # NOTE(review): if the pca branch ran, the 'g-'/'c-' columns were already
        # replaced by 'gp-'/'cp-' columns, so these masks select nothing -
        # confirm whether scaler is meant to be combined with pca.
        mask = train.columns.str.startswith('g-')
        train.loc[:, mask] = scaler.fit_transform(train.loc[:, mask])
        test.loc[:, mask] = scaler.transform(test.loc[:, mask])

        mask = train.columns.str.startswith('c-')
        train.loc[:, mask] = scaler.fit_transform(train.loc[:, mask])
        test.loc[:, mask] = scaler.transform(test.loc[:, mask])
    
    if label_smoothing:
        targets.clip(label_smoothing, 1 - label_smoothing, inplace=True)
    
    return train, targets, test

# Clipping bound: default `label_clip` of CrafterModel and the bound used to
# clamp the averaged predictions later in the notebook.
label_smoothing = 3e-4
# train, targets, test = preprocess(train, targets, test, pca=False, label_smoothing=0)
# print(train.shape)

In [None]:
# Convert the prepared frames to float32 tensors.  np.float64 replaces the
# np.float alias, which NumPy 1.24 removed.
X = torch.tensor(X.values.astype(np.float64), dtype=torch.float)
y = torch.tensor(y.values.astype(np.float64), dtype=torch.float)
X_t = torch.tensor(X_test.values.astype(np.float64), dtype=torch.float)

dataset = TensorDataset(X, y)
# The test set has no labels; all-zero targets of matching width are attached
# so it has the same (features, targets) structure as `dataset`.
test_dataset = TensorDataset(X_t, torch.zeros(X_t.shape[0], y.shape[1], dtype=y.dtype))

In [None]:
class CrafterModel(nn.Module):
    """Three-layer feed-forward network that outputs one logit per target.

    Every layer is BatchNorm1d -> (Dropout, except before the first layer) ->
    weight-normalised Linear, followed by ELU on all but the last layer.  The
    hidden widths are 1536 and 2048.

    Parameters
    ----------
    input_size : int
        Number of input features.
    output_size : int
        Number of target columns (width of the last layer).
    label_clip : float
        When non-zero, output logits are clamped to
        [logit(label_clip), -logit(label_clip)], i.e. predicted probabilities
        stay inside [label_clip, 1 - label_clip].  0 disables the clamp.
    """

    def __init__(self, input_size, output_size, label_clip=label_smoothing):
        import math  # only needed for the clamp bound below

        super().__init__()
        self.model = nn.Sequential()
        if label_clip:
            # Kept as a plain float rather than a tensor: it needs no device
            # handling and is accepted by torch.clamp on any input tensor.
            self.clip_min = math.log(label_clip) - math.log1p(-label_clip)
        else:
            self.clip_min = -float("inf")

        hidden_sizes = [1536, 2048, output_size]
        # The first entry is never applied because layer 1 skips dropout.
        dropouts = [0.2, 0.55, 0.55]
        assert len(hidden_sizes) == len(dropouts)
        for i, (hidden_size, dropout) in enumerate(zip(hidden_sizes, dropouts), start=1):
            self.model.add_module(f"batch_norm{i}", nn.BatchNorm1d(input_size))
            if i != 1:
                self.model.add_module(f"dropout{i}", nn.Dropout(dropout))
            linear_layer = torch.nn.utils.weight_norm(nn.Linear(input_size, hidden_size))
            self.model.add_module(f"linear{i}", linear_layer)

            if i != len(hidden_sizes):
                self.model.add_module(f"activation{i}", nn.ELU())

            # The next layer consumes this layer's output.
            input_size = hidden_size

    def forward(self, input):
        """Return the (optionally clamped) logits for a batch of feature rows."""
        # Call the module instead of .forward() so registered hooks still run.
        input = self.model(input)
        input = torch.clamp(input, self.clip_min, -self.clip_min)
        return input
    
class CustomRunner(dl.SupervisedRunner):
    """Supervised runner whose batch step reports a BCE-with-logits loss."""

    loss = nn.BCEWithLogitsLoss(reduction='mean')
    # logit(1e-15): lower bound applied to the logits before computing the loss.
    min_pred = float(np.log(1e-15) - np.log(1 - 1e-15))

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _handle_batch(self, batch):
        """Run the model on `batch['features']` and record the batch loss."""
        y_pred = self.model(batch['features'])
        # The bounds live in logit space, so the range is symmetric around 0:
        # the upper bound is -min_pred.  The former `1 - min_pred` mirrored the
        # probability-space expression 1 - eps and made the range lopsided.
        y_pred = torch.clamp(y_pred, CustomRunner.min_pred, -CustomRunner.min_pred)
        loss = CustomRunner.loss(y_pred, batch['targets'])
        self.batch_metrics.update({"loss": loss})
        
class EpochMetricSaverCallback(dl.Callback):
    """Callback that keeps a per-epoch history of the runner's epoch metrics.

    `metrics` optionally restricts which metric names are recorded; with None,
    every key present in `runner.epoch_metrics` is recorded.  The history is
    available as `metrics_log`, mapping metric name -> list of epoch values.
    """

    def __init__(self, metrics=None):
        super().__init__(dl.CallbackOrder.Logging)
        self.metrics_log = defaultdict(list)
        self.metrics = metrics

    def on_epoch_end(self, runner):
        """Append the current epoch's value of each tracked metric."""
        tracked = runner.epoch_metrics if self.metrics is None else self.metrics
        for name in tracked:
            self.metrics_log[name].append(runner.epoch_metrics[name])

In [None]:
def train_model(train_dataset, valid_dataset, num_epochs=50):
    """Train a fresh CrafterModel on one split and return it with its history.

    The model (built with label_clip=0) is optimised with Adam and a
    ReduceLROnPlateau schedule by a CustomRunner writing to ./logs, with
    EarlyStoppingCallback(10) and load_best_on_end=True.

    Returns
    -------
    (model, metrics_log) where metrics_log is the per-epoch metric history
    collected by EpochMetricSaverCallback.
    """
    loaders = dict(
        train=DataLoader(train_dataset, batch_size=128, shuffle=True),
        valid=DataLoader(valid_dataset, batch_size=1024, shuffle=False),
    )

    # Layer widths are read off the first (features, targets) sample.
    first_sample = train_dataset[0]
    n_features = first_sample[0].shape[0]
    n_targets = first_sample[1].shape[0]

    net = CrafterModel(n_features, n_targets, label_clip=0)
    adam = torch.optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-5)
    plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
        adam, mode='min', factor=0.1, patience=3, eps=1e-4
    )
    history = EpochMetricSaverCallback()

    CustomRunner().train(
        model=net,
        optimizer=adam,
        scheduler=plateau,
        loaders=loaders,
        num_epochs=num_epochs,
        logdir="./logs",
        callbacks=[dl.EarlyStoppingCallback(10), history],
        verbose=True,
        load_best_on_end=True,
    )

    return net, history.metrics_log

def predict(model, dataset):
    """Return sigmoid probabilities of `model` for every row of `dataset`.

    Batches of 1024 rows are run through a SupervisedRunner; the 'logits' of
    each batch are passed through a sigmoid, moved to the CPU and concatenated
    into a single tensor.
    """
    loader = DataLoader(dataset, batch_size=1024)
    batches = dl.SupervisedRunner().predict_loader(model=model, loader=loader)
    return torch.cat([torch.sigmoid(batch['logits']).to('cpu') for batch in batches])

In [None]:
# Per-fold results: trained models, test predictions, metric histories and the
# best validation loss of each fold.
models, predictions, metric_logs, valid_losses = [], [], [], []

seeds = [42]
n_splits = 5

for seed in seeds:
    folds = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_idx, valid_idx in tqdm(folds.split(dataset, y), total=n_splits):
        train_dataset, valid_dataset = Subset(dataset, train_idx), Subset(dataset, valid_idx)

        model, metric_log = train_model(train_dataset, valid_dataset)
        model = model.to('cpu')
        models.append(model)
        metric_logs.append(metric_log)

        val_loss = min(metric_log['valid_loss'])
        print(f"VAL LOSS: {val_loss}")
        valid_losses.append(val_loss)

        predictions.append(predict(model, test_dataset))

# Stack the per-fold predictions along a new leading dimension.
predictions = torch.stack(predictions)

In [None]:
# Summary statistics of the best validation loss recorded for each fold.
pd.DataFrame(valid_losses).describe()

count	5.000000  
mean	0.016427  
std	0.000232  
min	0.016092  
25%	0.016293  
50%	0.016495  
75%	0.016624  
max	0.016632  

In [None]:
# Average the stacked per-fold test predictions (dim 0 indexes the folds).
total_results = predictions.mean(dim=0)
# Clip in place to [label_smoothing, 1 - label_smoothing].
total_results.clamp_(label_smoothing, 1 - label_smoothing)

Best train loss: 0.0144  
Best valid loss: 0.01672   
Real loss: 0.01894  
num_epochs: 15  

In [None]:
# Write the averaged predictions into the non-control rows of the submission.
# The tensor is converted to a NumPy array explicitly instead of relying on
# pandas to coerce a torch.Tensor during assignment.
submission.iloc[np.where(mask)[0], 1:] = total_results.detach().cpu().numpy()

In [None]:
# Save the submission without the index column; floats are written with 16 decimal places.
submission.to_csv("submission.csv", index=False, float_format='%.16f')