In [None]:
!pip install /kaggle/input/iterative-stratification016py3noneanywhl/iterative_stratification-0.1.6-py3-none-any.whl

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity
from matplotlib.pyplot import figure
from sklearn.decomposition import PCA
import fastai
from fastai.tabular.all import *
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
# Load the training features, test features and scored training targets (in that order).
train_feat, test_feat, train_targ_scored = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lish-moa/train_features.csv',
        '/kaggle/input/lish-moa/test_features.csv',
        '/kaggle/input/lish-moa/train_targets_scored.csv',
    )
)

In this notebook we present an approach based on learning embeddings for gene expression data mainly through using the fastai library and explaining code snippets throughout the post. ***Find blog posts about other aspects of the competition in [machine-learnink][3]. There will be more updates on the webpage soon.***

**Initial idea**
 
  According to some posts in the discussion forums ([post][1] and [preprocessing][2]), the informative gene expression values are either significantly negative or positive. This means that much of the gene expression data in between might contain little information or just be noise.
 
  I first compare the gene expression values with random noise drawn from a Gaussian distribution. The random signal in the plot below is generated from a Gaussian distribution based on the description provided in the discussions in this [post][1]. In addition, we average over a sample of 1000 different generated random signals.
    
[1]: <https://www.kaggle.com/c/lish-moa/discussion/184005#1034211> "discussionPost"
[2]: <https://clue.io/connectopedia/glossary#Q> "Preprocessing"
[3]: <https://machine-learnink.com/> "blogPost"

In [None]:
# Gene-expression columns sit at positions 4..775 of train_features (772 columns).
sorted_rows = train_feat.iloc[:, 4:776].values.copy()
sorted_rows.sort()  # in place: each row is sorted in ascending order
n_genes = sorted_rows.shape[1]

# Reference signal: Gaussian noise put through a robust z-score,
#   (x - median) / (1.4826 * MAD),
# where MAD is the *median* absolute deviation (the 1.4826 constant belongs to the median,
# not the mean, absolute deviation). Each draw is sorted and the sorted curves are averaged.
# A local generator keeps the figure reproducible without touching the global RNG state.
rng = np.random.default_rng(0)
examples = rng.standard_normal((1000, n_genes))
centered = examples - np.median(examples, axis=1, keepdims=True)
mad = np.median(np.abs(centered), axis=1, keepdims=True)
examples = np.sort(centered / (1.4826 * mad), axis=1)
vals = examples.mean(axis=0)

# Per-rank mean and standard deviation of the sorted gene-expression values.
mu = np.mean(sorted_rows, axis=0)
stds = np.std(sorted_rows, axis=0)

fig = plt.figure(num=None, figsize=(15, 8), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(121)
ranks = np.arange(n_genes)

# Labels are attached to the artists directly so the legend cannot drift out of sync
# with the plotting order.
ax.plot(ranks, vals, color='k', label='Gaussian noise (gn)')
ax.plot(ranks, mu, color='b', label='Sorted average gene expression values (sg)')
ax.fill_between(ranks, mu - stds, mu + stds, linestyle='-', color='b', alpha=0.3, label='sg ± 1 std')
ax.plot(ranks, vals - mu, color='r', label='Difference between gn and sg')

# Limits are set on the axes that actually holds the curves.
ax.set_xlim(0, n_genes)
ax.set_ylim(-4, 4)
ax.set_xlabel('Rank of value within a sample')
ax.set_ylabel('Value')
ax.legend();

## Processing and Model Structure 
**Gene Expression Data**
- Based on the noise level in gene expression data, we first try to train a network based on the most significant gene expression values (n). 
- We first sort the gene expression data and take the n largest and smallest values, as well as the corresponding gene **indices** (relevant_gene_indices, relevant_gene_values).
- We find a concrete value for n through hyperparameter tuning. We used the [hyperopt][1] library for all hyperparameter tuning. In TODO, we provide the script that finds the best parameter settings.
- Small note: As the control compounds always have zero active MoAs, we exclude them from training.

**Cell Viability Data** 

We used principal component analysis (PCA) as a preprocessing step for the cell viability data. For this, we fit PCA on the combined train and test cell viability data and then project the cell data onto the learned components (these become the `cc` columns). The number of components (nc) is another hyperparameter that we optimize over.

[1]: <https://github.com/hyperopt/hyperopt> "Hyperparameter Tuning"

In [None]:
def compute_cell_pca(X_train, X_test, nc, random_state=None):
    """Fit a whitened PCA on the cell-viability columns of train and test combined.

    Args:
        X_train: DataFrame holding the training features; columns whose name contains 'c-' are used.
        X_test: DataFrame holding the test features; columns whose name contains 'c-' are used.
        nc: number of principal components to keep.
        random_state: optional seed forwarded to `PCA`, for reproducible fits when a
            randomized solver is selected. Defaults to None (previous behaviour).

    Returns:
        The fitted `PCA` object.
    """
    cell_train = X_train.filter(like='c-')
    cell_test = X_test.filter(like='c-')
    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent.
    cell_data = pd.concat([cell_train, cell_test], ignore_index=True)
    pca = PCA(n_components=nc, whiten=True, random_state=random_state)
    pca.fit(cell_data.values)
    return pca

def preprocess_wpca(data, pca, n):
    """Build the model input frame from raw features.

    For every row the `n` smallest and `n` largest gene-expression values are kept, together
    with the positions (within the 'g-' columns) they came from, and the cell-viability
    columns are replaced by their projection through `pca`.

    Args:
        data: DataFrame with 'sig_id', 'cp_time', 'cp_dose', and columns whose names contain
            'g-' (gene expression) and 'c-' (cell viability).
        pca: object exposing `transform(array) -> array`, applied to the 'c-' columns.
        n: number of lowest and of highest gene values to keep per row (2*n in total).

    Returns:
        DataFrame with columns sig_id, cp_time, cp_dose, gi-0..gi-(2n-1) (gene positions),
        gv-0..gv-(2n-1) (matching gene values) and cc-0..cc-(k-1) (PCA components),
        indexed like `data`.

    NOTE(review): downstream, fastai's `Categorify` appears to build a separate vocabulary for
    each gi-* column, so the same gene position may receive different codes in different
    columns — confirm this matches the intent of sharing one embedding table.
    """
    # Split the input into metadata, gene expression and cell viability.
    extra_data = data[['sig_id', 'cp_time', 'cp_dose']]
    gene_values = data.filter(like='g-').values
    cell_data = data.filter(like='c-')

    # Project the cell data once and reuse the result for both the values and the column count.
    cell_components = pca.transform(cell_data.values)
    cell_df = pd.DataFrame(
        cell_components,
        index=cell_data.index,
        columns=[f'cc-{num}' for num in range(cell_components.shape[1])]
    )

    # Positions of the n lowest and n highest gene values of each row.
    # The upper slice starts at `n_genes - n` rather than `-n` so that n == 0 selects nothing
    # (`[-0:]` would select every column).
    n_genes = gene_values.shape[1]
    sorted_gene_indices = np.argsort(gene_values, axis=1, kind='stable')
    relevant_gene_indices = np.concatenate(
        (sorted_gene_indices[:, :n], sorted_gene_indices[:, n_genes - n:]), axis=1
    )
    relevant_gene_values = np.take_along_axis(gene_values, relevant_gene_indices, axis=1)

    gene_index_df = pd.DataFrame(relevant_gene_indices, index=data.index, columns=[f'gi-{num}' for num in range(2*n)])
    gene_value_df = pd.DataFrame(relevant_gene_values, index=data.index, columns=[f'gv-{num}' for num in range(2*n)])

    return pd.concat([extra_data, gene_index_df, gene_value_df, cell_df], axis=1)

In [None]:
# Preprocessing hyperparameters: number of PCA components for the cell data, number of
# lowest/highest gene values kept per sample, and the label-smoothing amount.
npca = 66
n = 4
epsilon = 0.001
pca = compute_cell_pca(train_feat, test_feat, npca)

# Keep only the treated samples; control-vehicle rows always have all-zero labels.
filtered_train_feat = train_feat.loc[train_feat['cp_type'] == 'trt_cp']

# Preprocess, attach the targets by sig_id, then discard the identifier.
train_df = (
    preprocess_wpca(filtered_train_feat, pca, n)
    .merge(train_targ_scored, on='sig_id')
    .drop(columns=['sig_id'])
)

# Categorical inputs: cp_time, cp_dose and the gene-position columns
# ('cp_type' is skipped because it is constant in the training data).
cat_names = ['cp_time', 'cp_dose'] + [col for col in train_df.columns if 'gi-' in col]
# Continuous inputs: the selected gene values followed by every column whose name contains
# 'c-' (this substring match is what picks up the 'cc-*' PCA columns).
cont_names = [col for col in train_df.columns if 'gv-' in col] + [col for col in train_df.columns if 'c-' in col]
# Targets: every column of the scored targets except the identifier.
y_names = [col for col in train_targ_scored.columns if col != 'sig_id']

# Label smoothing: 0 -> epsilon, 1 -> 1 - epsilon.
train_df[y_names] = (
    train_df[y_names]
    .mask(lambda targets: targets == 0, epsilon)
    .mask(lambda targets: targets == 1, 1 - epsilon)
)

In [None]:
# Preview the first three rows of the preprocessed training frame.
train_df.head(3)

## fast.ai Tabular model construction 

The `train_df` dataframe contains the `gi` columns which correspond to the indices of the lowest and highest sorted gene values and their corresponding gene values `gv`. We treat the indices as <em>categorical variables</em> and use a learned embedding to incorporate them into our model. The idea of treating some continuous variables as categorical variables is explained in the fastai webpage [fast-ai][1]. We therefore have the indices, cp_time, cp_dose as categorical variables and the gene values and the (principal components of the) cell data as continuous variables. We want to learn embeddings for the categorical variables, but crucially, we want to use the same embedding for each of the gene indices to avoid an explosion of trainable parameters. (This was mentioned in some of the posts as to how to modify the tabular model to have the same embeddings at different layers. I hope it will be useful for some of you :-) ). 

If we use the fastai.tabular model directly, the model learns a different vector for the same index in the columns `gi-0` to `gi-n`, whereas we probably want the embedding vector for index 531 to be the same for all the columns as it corresponds to the same gene expression value. For this, we copy the fastai tabular model and make some modifications. The original tabular model is found here [tabular-model][2]. 

The additional parameters in the modified version of the TabularModel (we call it `MyTabularModel`) are: 

    1. rep_emb_szs: the tuple which is the size of the repeating embedding (number of genes, embedding size of interest) 
    2. rep_emb_reps: number of times that we want to repeat the embedding (i.e. we want this to be repeated 2n times, which is the number of significant gene values).

To handle the repeated embedding, we construct `self.rep_embed` by passing `rep_emb_szs` as input (`self.rep_embed = Embedding(*rep_emb_szs)`). The total embedding width is then updated to be the sum of the embedding sizes for `cp_dose` and `cp_time` and the repeated embeddings (`n_emb = sum(e.embedding_dim for e in self.embeds) + rep_emb_reps * self.rep_embed.embedding_dim`). Apart from this, the code is a straight copy of the fastai tabular model and consists of an additional layer of dropout on the embeddings, batch normalization for the continuous variables followed by a number of `LinBnDrop` layers as usual.

The second place we need to add modifications is in the forward function, where we keep things simple by assuming that the repeated embedding dimensions are always the last `rep_emb_reps` columns of the `x_cat` batch. Since in our case the non-repeated embeddings are small, we don't bother to apply dropout to the non-repeated embeddings (as this should allow us to use larger values for `embed_p` without suddenly forgetting the duration of a sample for no good reason).

In detail, after the second line of `forward`, the list `x` contains the embeddings for the categorical variables `cp_dose` and `cp_time`, which is a list of tensors with shape `batchSize x 3`. The variable `r` contains the flattened embeddings of the gene indices. This is a tensor with shape `batchSize x 80`. The flattening is necessary, since the result of applying `self.rep_embed` is a tensor with shape `batchSize x 8 x 10` and we want to concatenate all embeddings into a single tensor of shape `batchSize x 86`. That's exactly what we do in the fifth line, where we store the concatenated embeddings in `x`.

[1]: <https://www.fast.ai/2018/04/29/categorical-embeddings/> "fastai"
[2]: <https://github.com/fastai/fastai/blob/master/fastai/tabular/model.py#L28> "tabularModel"

In [None]:
class MyTabularModel(Module):
    """Tabular model in which a block of categorical columns shares one embedding table.

    The first `len(emb_szs)` categorical columns each get their own embedding; all remaining
    categorical columns are looked up in a single shared embedding (`rep_embed`).

    Args:
        emb_szs: iterable of (num_embeddings, embedding_dim) pairs, one per individually
            embedded categorical column.
        rep_emb_szs: (num_embeddings, embedding_dim) of the shared embedding.
        rep_emb_reps: number of categorical columns that use the shared embedding.
        n_cont: number of continuous input columns.
        out_sz: number of outputs.
        layers: sizes of the hidden layers (list or tuple).
        ps: dropout probability per hidden layer (scalar, list or tuple); defaults to 0.
        embed_p: dropout probability applied to the shared (repeated) embeddings only.
        y_range: optional (low, high); if given a `SigmoidRange` is appended to the network.
        use_bn: use batch norm in the hidden `LinBnDrop` layers.
        bn_final: also use batch norm in the final layer (only when `use_bn` is set).
        bn_cont: batch-normalise the continuous inputs.
        act_cls: activation used after every hidden layer.
    """
    def __init__(self, emb_szs, rep_emb_szs, rep_emb_reps, n_cont, out_sz, layers, ps=None, embed_p=0.,
                 y_range=None, use_bn=True, bn_final=False, bn_cont=True, act_cls=nn.ReLU(inplace=True)):
        # Normalise to plain lists: both are concatenated with lists below, which would
        # fail for tuples.
        layers = list(layers)
        ps = ifnone(ps, [0]*len(layers))
        ps = list(ps) if is_listy(ps) else [ps]*len(layers)
        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni,nf in emb_szs])
        self.rep_embed = Embedding(*rep_emb_szs)
        self.emb_drop = nn.Dropout(embed_p)
        self.bn_cont = nn.BatchNorm1d(n_cont) if bn_cont else None
        # Total embedding width: individual embeddings plus `rep_emb_reps` copies of the shared one.
        n_emb = sum(e.embedding_dim for e in self.embeds) + rep_emb_reps * self.rep_embed.embedding_dim
        self.n_emb,self.n_cont = n_emb,n_cont
        sizes = [n_emb + n_cont] + layers + [out_sz]
        # Every layer but the last gets the activation; the last gets none.
        actns = [act_cls for _ in range(len(sizes)-2)] + [None]
        _layers = [LinBnDrop(sizes[i], sizes[i+1], bn=use_bn and (i!=len(actns)-1 or bn_final), p=p, act=a)
                       for i,(p,a) in enumerate(zip(ps+[0.],actns))]

        if y_range is not None: _layers.append(SigmoidRange(*y_range))
        self.layers = nn.Sequential(*_layers)

    def forward(self, x_cat, x_cont=None):
        """Embed the categorical columns, append the continuous ones and run the MLP.

        Column `i` of `x_cat` feeds `self.embeds[i]`; every column from `len(self.embeds)`
        onwards is looked up in the shared embedding.
        """
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            # Shared lookup on the remaining columns, with the last two dims flattened into one.
            r = torch.flatten(self.rep_embed(x_cat[:,len(self.embeds):]), start_dim=-2)
            # Dropout is applied to the shared embeddings only, not to the individual ones.
            r = self.emb_drop(r)

            x = torch.cat(x + [r], 1)
        if self.n_cont != 0:
            if self.bn_cont is not None: x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        return self.layers(x)

## Fit the model 
   **Parameter initialization**

In [None]:
# (num_embeddings, embedding_dim) for the two individually embedded categorical
# columns, cp_time and cp_dose (cp_type is not a model input).
emb_szs = [(4, 3), (3, 3)]

# Width of the shared embedding used for the gene-index columns.
embedding_size = 10

# Rows of the shared embedding table: one per gene column plus one extra slot
# (presumably the code fastai's Categorify reserves for missing values — confirm).
num_genes = len(train_feat.filter(like='g-').columns) + 1
rep_emb_szs = (num_genes, embedding_size)

# Hidden-layer widths of the fully connected part of the network.
layers = [288, 448, 352, 192]

In [None]:
def random_seed(seed_value, use_cuda=False):
    """Seed the NumPy, PyTorch and Python RNGs with `seed_value`.

    When `use_cuda` is true, the CUDA generators are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking disabled.
    """
    # CPU-side generators: NumPy, PyTorch, Python's `random`.
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)

    if not use_cuda:
        return

    # GPU-side generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # cuDNN settings required for deterministic results.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

One of the difficulties with the data is that the classes are highly imbalanced. Therefore, when splitting the data into training and validation sets, all the samples with a given active MoA can end up in the training set, or all in the validation set. In order to have an even split in terms of number of active and non-active MoAs, we use `MultilabelStratifiedKFold` from the iterative-stratification package, where we compute the validation set error for K folds.

The final predictions are then the average over the predictions for each fold.

In [None]:
random_seed(42)

mskf = MultilabelStratifiedKFold(n_splits=10, shuffle=True, random_state=0)

predictions = []  # one tensor of test predictions per fold
predictions_tabnet = []  # not filled in this cell; kept so the name stays defined

# Stratify on the hard 0/1 labels. The targets in `train_df` have been label-smoothed
# (epsilon / 1 - epsilon), so they are no longer the binary indicator matrix that the
# multilabel stratifier needs in order to balance active/inactive labels across folds.
strat_labels = (train_df[y_names] > 0.5).astype(int)

# The preprocessed test features do not depend on the fold, so build them once.
test_df = preprocess_wpca(test_feat, pca, n)

for train_idx, valid_idx in mskf.split(train_df[cat_names + cont_names], strat_labels):
    splits = (L(list(train_idx)), L(list(valid_idx)))

    # Create dataloader
    # NOTE(review): Categorify appears to build one vocabulary per categorical column, so the
    # gi-* columns may not share a common gene -> code mapping — confirm.
    to = TabularPandas(train_df, procs = Categorify,
                       cat_names = cat_names,
                       cont_names = cont_names,
                       y_names = y_names,
                       y_block = MultiCategoryBlock(encoded=True, vocab=y_names),
                       splits = splits)
    dls = to.dataloaders(bs=96)

    # Positional arguments: emb_szs, rep_emb_szs, rep_emb_reps, n_cont, out_sz, layers.
    # Every categorical column beyond the individually embedded ones uses the shared embedding.
    model = MyTabularModel(
        emb_szs,
        rep_emb_szs,
        len(cat_names) - len(emb_szs),
        len(cont_names),
        len(y_names),
        layers,
        embed_p=0.5
    )
    # `y_range` is not a `Learner`/`TabularLearner` constructor argument (it belongs to the
    # model), so it is not passed here. The model is left without a SigmoidRange head so that
    # it keeps producing the raw scores the learner's default loss for this y_block expects
    # (presumably a with-logits BCE — confirm before adding `y_range` to the model).
    mylearn = TabularLearner(dls, model)

    # Find a good learning rate
    lr_min, lr_steep = mylearn.lr_find(show_plot=False, suggestions=True)
    lr = (lr_min + lr_steep) / 2

    # Fit the model; fastai v2 names the peak learning rate `lr_max` (`max_lr` was the v1 name).
    mylearn.fit_one_cycle(10, lr_max=lr, cbs=SaveModelCallback(monitor='valid_loss'))

    # Get predictions using the tabular model
    test_dl = mylearn.dls.test_dl(test_df)

    test_preds, _ = mylearn.get_preds(dl=test_dl)
    predictions.append(test_preds)


### Create submission

In [None]:
# Average the per-fold test predictions element-wise, then attach the sample ids.
res = torch.stack(predictions).mean(dim=0)
merged = pd.concat(
    [test_feat[['sig_id']], pd.DataFrame(res, columns=y_names)],
    axis=1,
)

In [None]:
# Set control entries to zero manually
df_sigId = test_feat[test_feat['cp_type']=='ctl_vehicle'][['sig_id']].reset_index(drop=True)
inds = merged.index[merged['sig_id'].isin(df_sigId.sig_id)].tolist()
merged.iloc[inds, 1:] = 0 

In [None]:
# Write the submission file without the DataFrame index column.
merged.to_csv('submission.csv', index=False)