In [None]:
import numpy as np
import pandas as pd

# Introduction
This notebook will broadly follow the approach laid out in chapter 9 of "Deep Learning for Coders with fastai and PyTorch," titled "Tabular Modeling Deep Dive." Our initial goal is to get a deep learning model running and able to produce a valid submission as quickly as possible, after which we can do some more thoughtful EDA and iterate on the model.

In [None]:
from fastai.tabular.all import *
# Directory that holds the competition CSV files read in the next cell.
path = Path('../input/lish-moa')
# List the directory contents to confirm the expected files are present.
path.ls()

In [None]:
# Read every competition table from `path` in one pass; the file names below
# are listed in the same order as the variables they are unpacked into.
train_features, test_features, train_targets, train_drugs, sub = (
    pd.read_csv(path/csv_name)
    for csv_name in (
        'train_features.csv',
        'test_features.csv',
        'train_targets_scored.csv',
        'train_drug.csv',
        'sample_submission.csv',
    )
)

From the documentation:

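>Features for the training set. Features `g-` signify gene expression data, and `c-` signify cell viability data. `cp_type` indicates samples treated with a compound (`cp_vehicle`) or with a control perturbation (`ctrl_vehicle`); control perturbations have no MoAs; `cp_time` and `cp_dose` indicate treatment duration (24, 48, 72 hours) and dose (high or low).

Note: the quoted documentation does not match the values actually stored in the `cp_type` column of the CSV files, which are `trt_cp` (compound) and `ctl_vehicle` (control). This is why the submission code later in this notebook filters on `ctl_vehicle`.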

In [None]:
# Compare the (rows, columns) of the feature table and the target table.
train_features.shape, train_targets.shape

# Data Preprocessing

In [None]:
categorical = ['cp_type', 'cp_time', 'cp_dose']
# Every column of the scored-targets table except the row id is a label.
dep_var = [col for col in train_targets.columns if col != 'sig_id']
# Exclude the id, the categoricals AND the labels: once the labels have been
# attached to `train_features` (below), re-running this cell would otherwise
# pull them into `continuous` and leak the targets into the model inputs.
non_feature_cols = {'sig_id', *categorical, *dep_var}
continuous = [col for col in train_features.columns if col not in non_feature_cols]

# Attach the labels by joining on `sig_id` instead of relying on both files
# being in the same row order. Dropping any previously attached label columns
# first keeps the cell idempotent; `validate` raises if an id is duplicated.
train_features = (
    train_features
    .drop(columns=dep_var, errors='ignore')
    .merge(train_targets[['sig_id'] + dep_var], on='sig_id', how='left', validate='one_to_one')
)
assert train_features[dep_var].notna().all().all(), 'some sig_id values have no matching targets'
train_features.shape

In [None]:
# We have appended the targets to the features dataframe.
# Peek at rows 1-4 of four trailing columns (positions -5 through -2; the
# slice stops before the last column) to confirm the targets are present.
train_features.iloc[1:5,-5:-1]

In [None]:
# Number of rows that go into the training partition: 80% of the labelled
# data, rounded to the nearest whole row.
train_fraction = 0.8
n_train = np.rint(len(train_features) * train_fraction).astype(int)
n_train

In [None]:
# Shuffle the row positions with a fixed seed so the train/validation split
# (and therefore every downstream loss value) is reproducible after a kernel
# restart. Change SPLIT_SEED to draw a different split.
SPLIT_SEED = 42
indices = np.random.default_rng(SPLIT_SEED).permutation(train_features.shape[0])
# The first `n_train` shuffled positions train the model; the remainder is held
# out. Note `test_idx` indexes labelled rows of `train_features`, not the
# competition test set loaded into `test_features`.
train_idx, test_idx = indices[:n_train], indices[n_train:]
# Passed to TabularPandas as its `splits` argument.
splits = (list(train_idx), list(test_idx))

In [None]:
# Preprocessing steps handed to TabularPandas, in this order.
procs = [FillMissing, Categorify, Normalize]
# Bundle the labelled frame with its column roles and the train/validation split.
data = TabularPandas(
    train_features,
    procs=procs,
    cat_names=categorical,
    cont_names=continuous,
    y_names=dep_var,
    splits=splits,
)

In [None]:
# Row counts of the training and validation subsets produced by `splits`.
len(data.train), len(data.valid)

In [None]:
# Display up to 10 rows of the prepared tabular data as a visual check.
data.show(10)

## Dataloaders

In [None]:
# Build the training/validation dataloaders with a batch size of 512.
dls = data.dataloaders(bs=512)

# Set Up Learner

In [None]:
# Tabular model with hidden layers of 600 and 300 units, trained with a
# binary-cross-entropy-with-logits loss over the target columns.
learn = tabular_learner(
    dls,
    layers=[600, 300],
    loss_func=BCEWithLogitsLossFlat(),
)

In [None]:
# Run the learning-rate finder to help choose the rate used for training below.
learn.lr_find()

# Fit Model

In [None]:
# Train for 5 epochs with the one-cycle schedule, peaking at a learning rate of 5e-3.
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3)

In [None]:
# Display the loss function attached to the learner.
learn.loss_func

In [None]:
# Plot the losses recorded during the training run above.
learn.recorder.plot_loss()

# Get Predictions for Submission
We follow the guide to setting up a test set here: https://forums.fast.ai/t/a-brief-guide-to-test-sets-in-v2-you-can-do-labelled-now-too/57054

In [None]:
# Build a dataloader over the unlabelled test features from the training
# dataloaders (presumably re-applying the same preprocessing — see the guide
# linked above to confirm).
test_dl = dls.test_dl(test_features)

In [None]:
# Display 5 rows from the test dataloader as a visual check.
test_dl.show(5)

In [None]:
# Predict on the test dataloader; the second returned value is discarded.
preds,_ = learn.get_preds(dl=test_dl)

In [None]:
# Inspect the shape and dtype of the predictions before converting them.
preds.shape, preds.dtype

In [None]:
# Wrap the predictions in a DataFrame and cast every column to pandas' "float"
# dtype. No column names are given here, so the frame keeps default labels.
preds_df = pd.DataFrame(preds).astype("float")

In [None]:
# Preview the first rows of the predictions. `head` must be called: the bare
# attribute only displays the bound method's repr, not a table.
preds_df.head()

In [None]:
# Work on a copy so the sample submission loaded into `sub` is left untouched
# and this cell starts from the same state every time it is re-run.
submission = sub.copy()
# Fill the target columns with the predictions; this assumes `preds_df` rows
# are in the same order as the rows of `sub` and its columns follow `dep_var`.
submission[dep_var] = preds_df
# Control-vehicle rows are forced to zero for every target: per the data
# description quoted earlier, control perturbations have no MoAs.
submission.loc[test_features['cp_type']=='ctl_vehicle', dep_var] = 0

In [None]:
# Preview the first rows of the finished submission table.
submission.head()

In [None]:
# Write the submission file; index=False leaves the row index out of the CSV.
submission.to_csv('submission.csv', index=False)