In [None]:
import warnings
# Silence every warning for the rest of the notebook (applies to all categories and modules).
warnings.filterwarnings("ignore")

In [None]:
import sys
# INSTALL RAPIDS
# Copy the RAPIDS archive from the input dataset next to the conda envs and unpack it there
# (tar's verbose file listing is discarded).
!cp ../input/rapids/rapids.0.16.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment to the import path. Each line pushes to the front,
# so the directory added last ("/opt/conda/envs/rapids/lib") is searched first.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the bundled libxgboost.so into the base conda lib directory.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
# Install the iterative-stratification package from the attached input dataset (local path, quiet output).
!pip install -q /kaggle/input/iterative-stratification/iterative-stratification-master/

In [None]:
from fastai.basics import *

In [None]:
datapath = Path("/kaggle/input/lish-moa")

In [None]:
datapath.ls().map(lambda o: o.name)

In [None]:
train_features_df = pd.read_csv(datapath/'train_features.csv')
test_features_df = pd.read_csv(datapath/'test_features.csv')
train_targets_scored_df = pd.read_csv(datapath/'train_targets_scored.csv')
train_targets_nonscored_df = pd.read_csv(datapath/'train_targets_nonscored.csv')

In [None]:
train_features_df.shape, test_features_df.shape, train_targets_scored_df.shape, train_targets_nonscored_df.shape

In [None]:
nonscored_cols = list(train_targets_nonscored_df.columns[1:])
scored_cols = list(train_targets_scored_df.columns[1:])

In [None]:
len(nonscored_cols), len(scored_cols)

In [None]:
assert set(nonscored_cols).intersection(scored_cols) == set()

In [None]:
# Preview the first rows of the train features.
train_features_df.head()

In [None]:
# Preview the first rows of the test features.
test_features_df.head()

In [None]:
# Preview the first rows of the scored targets.
train_targets_scored_df.head()

In [None]:
# Histogram of the per-target mean (positive rate). Select all rows and skip the first column,
# matching the `.iloc[:,1:]` used for the target counts; `.iloc[1:]` dropped the first *row* instead.
train_targets_scored_df.iloc[:, 1:].mean().hist()

In [None]:
train_target_distrib = dict(train_targets_scored_df.iloc[:,1:].sum())

In [None]:
train_target_distrib;

In [None]:
[(k,train_target_distrib[k]) for k in train_target_distrib if train_target_distrib[k] == 1]

In [None]:
n_train, n_test = len(train_features_df), len(test_features_df)
# Stack train on top of test with a fresh 0..n-1 index so preprocessing sees both at once.
train_test_features_df = pd.concat([train_features_df, test_features_df]).reset_index(drop=True)
# Feature groups are identified by their column-name prefix.
g_cols = [name for name in train_test_features_df.columns if name.startswith("g-")]
c_cols = [name for name in train_test_features_df.columns if name.startswith("c-")]
gc_cols = g_cols + c_cols

len(g_cols), len(c_cols), len(gc_cols)

### Feature Distributions

In [None]:
from seaborn import distplot

In [None]:
def plot_dist():
    fig, axes = plt.subplots(2, 5, figsize=(15,6))
    axes = axes.flatten()
    for c, ax in zip(list(np.random.choice(c_cols, 5))+list(np.random.choice(g_cols,5)), axes): distplot(train_test_features_df[c], ax=ax)

In [None]:
plot_dist()

### Config

Useful for hyperparameter search

In [None]:
# All tunable switches and hyperparameters of the pipeline, exposed as attributes.
_param_defaults = {
    # feature transforms applied to the g-/c- columns
    "do_rank_gauss": True,

    "do_quantile_tfms": False,
    "n_quantiles": 100,

    # optional dimensionality-reduction features (component count = n_columns / reduction factor)
    "do_pca": False,
    "pca_reduction_factor": 3,

    "do_umap": False,
    "umap_reduction_factor": 3,
    "umap_n_neighbors": 150,

    # KNN encoding of the non-scored targets
    "do_knn_encoding": True,
    "knn_k": 100,

    # pretraining on the non-scored targets (tl_*) and training on the scored targets (ft_*)
    "do_transfer_learning": False,
    "tl_n_epochs": 30,
    "tl_smoothing": 0.001,
    "tl_lr": 0.001,

    "ft_n_epochs": 30,
    "ft_smoothing": 0.001,
    "ft_lr": 0.001,

    # tabular model: number of hidden layers, their width and dropout probability
    "model_n_layers": 2,
    "model_layer_width": 512,
    "model_ps": 0.25,
}
ParamConfig = SimpleNamespace(**_param_defaults)

### RankGauss

In [None]:
import cupy as cp
from cupyx.scipy.special import erfinv
import cudf as gd

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.special import erfinv as sp_erfinv

In [None]:
def to_rankgauss(x):
    """Map the values of `x` onto a Gaussian shape through their ranks (RankGauss), on the CPU.

    https://medium.com/rapids-ai/gauss-rank-transformation-is-100x-faster-with-rapids-and-cupy-7c947e3397da

    x: 1-d pandas Series or array-like of sortable values.
    Returns the transformed values; a Series input yields a Series with the same index and name,
    anything else yields a numpy array.
    """
    values = np.asarray(x)
    # rank of every element: 0 for the smallest ... n-1 for the largest
    ranks = values.argsort().argsort().astype(np.float64)
    top_rank = ranks.max() if ranks.size else 0.0
    if top_rank == 0:
        # empty or single-element input: nothing to rank, avoid dividing by zero
        scaled = np.zeros_like(ranks)
    else:
        scaled = (ranks/top_rank - 0.5)*2 # scale to [-1,1]
    epsilon = 1e-6
    # keep strictly inside (-1,1) so erfinv stays finite; numpy clip, since the data lives on the CPU
    scaled = np.clip(scaled, -1+epsilon, 1-epsilon)
    transformed = sp_erfinv(scaled) # map to gaussian
    if isinstance(x, pd.Series): return pd.Series(transformed, index=x.index, name=x.name)
    return transformed

In [None]:
if ParamConfig.do_rank_gauss:
    # transform each g-/c- feature independently, over train and test together
    for feature in gc_cols:
        ranked = to_rankgauss(train_test_features_df[feature])
        train_test_features_df[feature] = ranked

### QuantileTransform

In [None]:
from sklearn.preprocessing import QuantileTransformer

In [None]:
if ParamConfig.do_quantile_tfms:
    # fit on train+test jointly and map every g-/c- feature to a normal output distribution
    transformer = QuantileTransformer(
        n_quantiles=ParamConfig.n_quantiles,
        output_distribution="normal",
        random_state=0,
    )
    gc_transformed = transformer.fit_transform(train_test_features_df[gc_cols])
    train_test_features_df[gc_cols] = gc_transformed

### Plot Dist

In [None]:
# Re-draw a random sample of feature distributions after the transforms above.
plot_dist()

In [None]:
# Sizes of the g-, c- and combined feature groups.
len(g_cols), len(c_cols), len(gc_cols)

In [None]:
# Display the combined train+test feature table.
train_test_features_df

In [None]:
# Row counts for every (cp_type, cp_time, cp_dose) combination.
train_test_features_df.groupby(['cp_type', 'cp_time', 'cp_dose'])[['sig_id']].count()

In [None]:
# Number of distinct sig_id values across train+test.
len(train_test_features_df['sig_id'].unique())

In [None]:
# ctl_vehicle rows have no MoA, so every SCORED target column should sum to 0 over them.
# The default (inner) merge keeps only rows whose sig_id appears in the scored targets table.
all(train_test_features_df.merge(train_targets_scored_df, on='sig_id').query("cp_type == 'ctl_vehicle'")[scored_cols].sum()==0)

In [None]:
# Same check for the NON-SCORED target columns of ctl_vehicle rows.
all(train_test_features_df.merge(train_targets_nonscored_df, on='sig_id').query("cp_type == 'ctl_vehicle'")[nonscored_cols].sum()==0)

In [None]:
# Display the test features as originally loaded (the transforms above ran on train_test_features_df).
test_features_df

### PCA

In [None]:
from cuml.decomposition import PCA as cumlPCA

In [None]:
%%time
if ParamConfig.do_pca:
    # add pca features
    pca_feats = []
    colnames = []
    for name, cols in [("gene", g_cols), ("cell", c_cols)]:
        n_comp = int(len(cols)/ParamConfig.pca_reduction_factor)
        pca = cumlPCA(n_components=n_comp, iterated_power=500)
        pca_feat = pca.fit_transform(train_test_features_df[cols])
        pca_feats += [pca_feat]
        colnames += [f"pca_{name}_{i}" for i in range(n_comp)]
        
    pca_feats = np.hstack(pca_feats)
    train_test_features_df[colnames] = pca_feats

In [None]:
train_test_features_df

### UMAP

In [None]:
from cuml.manifold import UMAP as cumlUMAP

In [None]:
%%time
# add umap features
if ParamConfig.do_umap:
    umap_feats = []
    colnames = []
    for name, cols in [("gene", g_cols), ("cell", c_cols)]:
        n_comp = int(len(cols)/ParamConfig.umap_reduction_factor)
        umap = cumlUMAP(n_components=n_comp, n_neighbors=ParamConfig.umap_n_neighbors)
        umap_feat = umap.fit_transform(train_test_features_df[cols])
        umap_feats += [umap_feat]
        colnames += [f"umap_{name}_{i}" for i in range(n_comp)]

    umap_feats = np.hstack(umap_feats)
    train_test_features_df[colnames] = umap_feats

In [None]:
train_test_features_df

### One-hot Encode

In [None]:
for ohe_col in ['cp_type', 'cp_time', 'cp_dose']:
    vals = train_test_features_df[ohe_col].unique()
    # integer code per category, numbered in the order `unique()` returns them
    cat2code = {cat: code for code, cat in enumerate(vals)}
    encoded = train_test_features_df[ohe_col].map(cat2code)

    if len(vals) != 2:
        # one indicator column per category: row `code` of the identity matrix is its one-hot vector
        col = [f"{ohe_col}s_{o}" for o in vals]
        train_test_features_df[col] = np.eye(len(vals))[encoded]
    else:
        # two categories: a single 0/1 column is enough
        col = ohe_col+"s"
        train_test_features_df[col] = encoded

In [None]:
train_test_features_df = train_test_features_df.drop("cp_types", 1)

In [None]:
train_test_features_df

### KNN Non-Scored Target Encoding

In [None]:
from cuml.neighbors import KNeighborsRegressor as cumlKNN

In [None]:
if ParamConfig.do_knn_encoding:
    knn = cumlKNN(n_neighbors=ParamConfig.knn_k)
    merged_nonscored_train_test_df = train_test_features_df.merge(train_targets_nonscored_df, on='sig_id', how='left')
    unique_cp_time, unique_cp_dose = merged_nonscored_train_test_df['cp_time'].unique(), merged_nonscored_train_test_df['cp_dose'].unique()

In [None]:
if ParamConfig.do_knn_encoding:
    # create pairwise groups
    knn_groups = []
    for i in unique_cp_time:
        for j in unique_cp_dose: knn_groups.append((i,j))

    # initialize knn feature cols
    knn_cols = [f"knn{ParamConfig.knn_k}_{i}" for i in range(len(nonscored_cols))]
    for c in knn_cols: train_test_features_df[c] = 0

In [None]:
if ParamConfig.do_knn_encoding:
    
    for time, dose in knn_groups:

        # filter data by time and dose group
        X = merged_nonscored_train_test_df.query(f"cp_time == '{time}' & cp_dose == '{dose}'")[gc_cols]
        y = merged_nonscored_train_test_df.query(f"cp_time == '{time}' & cp_dose == '{dose}'")[nonscored_cols]

        # find corresponding indexes from dataframe
        idxs = array(list(X.index))
        train_idxs = idxs[np.where((y.isna().sum(1) == 0))[0]]
        test_idxs = idxs[np.where((y.isna().sum(1) != 0))[0]]

        # fit KNN
        X_train, y_train, X_test, y_test = X.loc[train_idxs], y.loc[train_idxs], X.loc[test_idxs], y.loc[test_idxs]
        knn.fit(X_train, y_train)

        # predict and put encoded features
        train_preds, test_preds = knn.predict(X_train), knn.predict(X_test)
        train_test_features_df.loc[train_idxs, knn_cols] = train_preds
        train_test_features_df.loc[test_idxs, knn_cols] = test_preds

In [None]:
train_test_features_df

### New train, test 

In [None]:
train_features_df = train_test_features_df[:n_train].reset_index(drop=True)
test_features_df = train_test_features_df[n_train:].reset_index(drop=True)
assert np.sum(train_features_df.isna()).sum() == 0
assert np.sum(test_features_df.isna()).sum() == 0

In [None]:
train_features_df.shape, test_features_df.shape

In [None]:
train_features_df.head()

In [None]:
test_features_df.head()

In [None]:
train_features_df.shape

### Loss/Metric/Model

In [None]:
from fastai.tabular.all import *

In [None]:
# Loss
@log_args
class LabelSmoothingBCEWithLogits(Module):
    "BCE-with-logits loss on smoothed targets (0 -> `eps`, 1 -> `1-eps`); `thresh` is the cut-off used by `decodes`"
    y_int = True
    # `thresh` is stored as well: `decodes` reads `self.thresh`, which previously was never set
    def __init__(self, eps:float=0.1, thresh:float=0.5): store_attr('eps,thresh')

    def forward(self, output, target):
        # move each target `eps` towards the opposite class
        target = target + self.eps*(1-2*target)
        return F.binary_cross_entropy_with_logits(output, target)

    def decodes(self, x):    return x>self.thresh
    def activation(self, x): return torch.sigmoid(x)

In [None]:
# Metric
def clipped_bce(inp, targ):
    "Mean binary log loss of logits `inp` against `targ`, with probabilities clipped to [1e-15, 1-1e-15]"
    # Clip in float64: in float32 `1-1e-15` rounds to exactly 1.0, which made the upper clip a no-op
    probs = torch.clamp(inp.double().sigmoid(), 1e-15, 1-1e-15)
    # cast back so the reported value keeps the float32 dtype it had before
    return F.binary_cross_entropy(probs, targ.double()).float()
# Wrap the function as a fastai metric; the training callbacks monitor it under the name 'clipped_bce'.
metric = AvgMetric(clipped_bce)

### Pretrained Model (Non-Scored)

In [None]:
cat_names = []
cont_names = L(list(train_features_df.columns[4:]))
y_names = nonscored_cols

In [None]:
merged_nonscored_train_df = train_features_df.merge(train_targets_nonscored_df, on='sig_id', how='left')

In [None]:
logspath = Path("logs")
if not logspath.exists(): logspath.mkdir()

In [None]:
# drop control perturbation
ctl_sig_ids = merged_nonscored_train_df.query("cp_type=='ctl_vehicle'")['sig_id'].values
trn_df = merged_nonscored_train_df[~merged_nonscored_train_df['sig_id'].isin(ctl_sig_ids)]

In [None]:
trn_df

In [None]:
trn_df.shape

In [None]:
procs = []
# Build the pretraining loaders from `trn_df` (control perturbations dropped), as the scored model does;
# the unfiltered `merged_nonscored_train_df` was passed before, leaving `trn_df` unused.
dls = TabularDataLoaders.from_df(trn_df,
                                 procs=procs,
                                 cat_names=cat_names, 
                                 cont_names=cont_names, 
                                 y_names=y_names,
                                 valid_idx=[], # no validation rows: train on all of trn_df
                                 bs=128)

In [None]:
tl_learner = tabular_learner(dls,
                             layers=ParamConfig.model_n_layers*[ParamConfig.model_layer_width],
                             config = {"ps": ParamConfig.model_n_layers*[ParamConfig.model_ps]},
                             n_out=len(nonscored_cols), 
                             loss_func=LabelSmoothingBCEWithLogits(ParamConfig.tl_smoothing),
                             metrics=[metric])

In [None]:
tl_learner.model

In [None]:
if ParamConfig.do_transfer_learning: tl_learner.fit_flat_cos(ParamConfig.tl_n_epochs, lr=ParamConfig.tl_lr)

In [None]:
if ParamConfig.do_transfer_learning:  tl_learner.save("pretrained_tabular", with_opt=False);

### Scored Model: Data & CV Folds

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
merged_scored_train_df = train_features_df.merge(train_targets_scored_df, on='sig_id', how='left')
# drop control perturbation
ctl_sig_ids = merged_scored_train_df.query("cp_type=='ctl_vehicle'")['sig_id'].values
trn_df = merged_scored_train_df[~merged_scored_train_df['sig_id'].isin(ctl_sig_ids)]

In [None]:
y_names = scored_cols

In [None]:
N_FOLDS = 10
mskf = MultilabelStratifiedKFold(n_splits=N_FOLDS)

In [None]:
sig_ids = trn_df['sig_id'].values
cv_idxs = list(mskf.split(sig_ids, y=trn_df[y_names]))

In [None]:
cvpath = Path("cv_sig_ids")
if not cvpath.exists(): cvpath.mkdir()
for i, idxs in enumerate(cv_idxs): pd.to_pickle(sig_ids[idxs[1]], cvpath/f'sig_ids_fold{i}.pkl')

In [None]:
merged_scored_train_df.head()

### Scored Model

In [None]:
# Load the pretrained weights onto the default device; only needed when transfer learning is on.
# Presumably this is the file written by `tl_learner.save("pretrained_tabular", ...)` — verify the path.
if ParamConfig.do_transfer_learning: pretrained_statedict = torch.load("models/pretrained_tabular.pth", map_location=default_device())

In [None]:
def load_top_layers(learner, pretrained_statedict):
    """Copy into `learner.model` every tensor of `pretrained_statedict` whose name and shape match a parameter.

    Parameters missing from the state dict, or with a different shape, are left untouched.
    Raises `Exception` if nothing was loaded while `ParamConfig.do_transfer_learning` is set.
    """
    n_loaded = 0
    for name, param in learner.model.named_parameters():
        source = pretrained_statedict.get(name)
        # explicit skip conditions instead of a bare `except`, so genuine errors are no longer hidden
        if source is None or source.shape != param.shape: continue
        with torch.no_grad(): param.copy_(source)
        print(f"Loaded {name}")
        n_loaded += 1

    if n_loaded == 0:
        print("No parameter is loaded")
        if ParamConfig.do_transfer_learning: raise Exception("Transfer learning is set True but no parameter loaded!")
    else:
        print(f"Total of loaded params: {n_loaded}")

In [None]:
for FOLD in range(N_FOLDS):
    # positions within trn_df of the rows held out for this fold
    valid_sig_ids = pd.read_pickle(cvpath/f'sig_ids_fold{FOLD}.pkl')
    valid_idxs = np.where(trn_df.sig_id.isin(valid_sig_ids))[0]

    procs = []
    dls = TabularDataLoaders.from_df(trn_df, procs=procs, cat_names=cat_names, cont_names=cont_names,
                                     y_names=y_names, valid_idx=valid_idxs, bs=128)

    # same architecture as the pretraining model, with one output per scored target
    hidden_sizes = ParamConfig.model_n_layers*[ParamConfig.model_layer_width]
    dropouts = ParamConfig.model_n_layers*[ParamConfig.model_ps]
    learner = tabular_learner(dls, layers=hidden_sizes, config={"ps": dropouts}, n_out=len(scored_cols),
                              loss_func=LabelSmoothingBCEWithLogits(ParamConfig.ft_smoothing), metrics=[metric])

    if ParamConfig.do_transfer_learning: load_top_layers(learner, pretrained_statedict)

    # track the lowest clipped_bce, stop early with patience 5, and log every epoch to CSV
    fold_cbs = [
        SaveModelCallback(monitor='clipped_bce', fname=f'tabular_fold{FOLD}', comp=np.less),
        EarlyStoppingCallback(monitor='clipped_bce', patience=5, comp=np.less),
        CSVLogger(fname=f"logs/tabular_logs_fold{FOLD}.csv"),
    ]
    learner.fit_flat_cos(ParamConfig.ft_n_epochs, lr=ParamConfig.ft_lr, cbs=fold_cbs)

    learner.export(f'models/tabular_fold{FOLD}_export.pkl')

### Validation

In [None]:
fold_metrics = [pd.read_csv(logspath/f'tabular_logs_fold{FOLD}.csv')['clipped_bce'].min() for FOLD in range(10)]

In [None]:
np.mean(fold_metrics), np.std(fold_metrics)

### Inference

In [None]:
modelspath = Path("models")

In [None]:
fold_preds = []
for i in range(10):
    learner = load_learner(modelspath/f'tabular_fold{i}_export.pkl')
    test_dl = learner.dls.test_dl(test_features_df)
    preds, _ = learner.get_preds(dl=test_dl)
    fold_preds += [preds]
preds = torch.stack(fold_preds)

In [None]:
mean_preds = preds.mean(0)

In [None]:
sub_df = pd.DataFrame(mean_preds, columns=test_dl.y_names)
sub_df['sig_id'] = test_dl.items['sig_id']
sub_df = sub_df[['sig_id']+test_dl.y_names]

In [None]:
ctl_sig_ids = test_features_df.query("cp_type == 'ctl_vehicle'")['sig_id'].values

In [None]:
sub_df.loc[sub_df.sig_id.isin(ctl_sig_ids), test_dl.y_names] = 0

In [None]:
sub_df.to_csv("submission.csv", index=False)