In [None]:
import numpy as np
import pandas as pd

# Introduction
In [this notebook](https://www.kaggle.com/dliend/fastai-tabular-learner-moa-challenge), we used a basic fastai `TabularLearner` to generate predictions for this challenge. We did not attempt substantial hyperparameter tuning (aside from tweaking the learning rate).

In an earlier notebook, we implemented grid search and discussed a few other ideas for improving performance. In this notebook, we try to implement those ideas.

In [None]:
from fastai.tabular.all import *
# Directory holding the competition CSV files, given relative to the notebook's
# working directory (presumably the Kaggle input mount -- confirm when running elsewhere).
path = Path('../input/lish-moa')
# Show the directory contents as the cell output.
path.ls()

In [None]:
# Read every competition table in one pass over the file names; the names on
# the left are bound in the same order as the file names listed below.
train_features, test_features, train_targets, train_drugs, sub = (
    pd.read_csv(path/csv_name)
    for csv_name in (
        'train_features.csv',
        'test_features.csv',
        'train_targets_scored.csv',
        'train_drug.csv',
        'sample_submission.csv',
    )
)

From the documentation:

>Features for the training set. Features `g-` signify gene expression data, and `c-` signify cell viability data. `cp_type` indicates samples treated with a compound (`cp_vehicle`) or with a control perturbation (`ctrl_vehicle`); control perturbations have no MoAs; `cp_time` and `cp_dose` indicate treatment duration (24, 48, 72 hours) and dose (high or low).

Note that the quoted documentation spells the control label `ctrl_vehicle`, whereas the code in this notebook matches on the value `ctl_vehicle`.

# Data Preprocessing

In [None]:
# Column roles for the tabular model.
categorical = ['cp_type', 'cp_time', 'cp_dose']
# Every column of the scored-targets table except the sample id is a target.
dep_var = [col for col in train_targets.columns if col != 'sig_id']
# Continuous inputs are whatever remains once the categorical columns, the id
# and the targets are removed. The targets must be excluded explicitly: they
# are attached to `train_features` a few lines below, so re-running this cell
# would otherwise pick them up as input features (label leakage).
not_continuous = {'sig_id', *categorical, *dep_var}
continuous = [col for col in train_features.columns if col not in not_continuous]
# Attach the targets to the feature frame. The assignment aligns on the row
# index, not on `sig_id` -- assumes both CSVs list the samples in the same
# order (TODO confirm).
train_features[dep_var] = train_targets[dep_var]
# Dropping controls
# train_features = train_features[train_features['cp_type'] != 'ctl_vehicle']
train_features.shape

In [None]:
# Shape of the subset of rows whose `cp_type` is the control label.
train_features.loc[train_features['cp_type'].eq('ctl_vehicle')].shape

In [None]:
# Row count of the 80% training portion, rounded to the nearest integer.
n_train = np.rint(0.8 * len(train_features)).astype(int)
n_train

In [None]:
# Shuffle the row positions with a fixed seed so the split is reproducible,
# then cut the shuffled order at `n_train`.
indices = np.random.RandomState(seed=1234321).permutation(len(train_features))
train_idx, test_idx = np.split(indices, [n_train])
# (training rows, held-out rows), each as a plain list of positions.
splits = (list(train_idx), list(test_idx))

In [None]:
# Preprocessing steps handed to fastai: fill missing values, encode the
# categorical columns, normalize the continuous ones.
procs = [FillMissing, Categorify, Normalize]
# Tabular dataset using the seeded train/validation split built above.
data = TabularPandas(
    train_features,
    procs=procs,
    cat_names=categorical,
    cont_names=continuous,
    y_names=dep_var,
    splits=splits,
)

In [None]:
# Sizes of the training and validation subsets, in that order.
tuple(map(len, (data.train, data.valid)))

# Fit Model
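Based on the results of a previously-run grid search, we will use the following:
* batch size 1024
* lr .005
* At least 20 epochs (maybe more)
* weight decay of 0.1

**Note:** the training cells below do not currently use exactly these values. The model with a validation split is trained with batch size 64, lr .003, 17 epochs and weight decay 3.8; the model trained on all data uses batch size 1024, lr .005, 16 epochs and weight decay 0.01.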

In [None]:
# Release GPU memory cached by PyTorch before training.
# NOTE(review): the next cell begins with the same call, so this cell looks redundant.
torch.cuda.empty_cache() 

In [None]:
# Free cached GPU memory, then build the loaders and train the model that is
# evaluated on the held-out split.
# NOTE(review): batch size, layer widths, weight decay, epoch count and learning
# rate here differ from the values listed in the "Fit Model" text -- confirm
# which set is intended.
torch.cuda.empty_cache()
dls = data.dataloaders(bs=64)
learn = tabular_learner(
    dls,
    layers=[15000, 1600],
    loss_func=BCEWithLogitsLossFlat(),
    wd=3.8,
)
learn.fit_one_cycle(n_epoch=17, lr_max=.003)

In [None]:
#%%script false 
# train model on ALL data
# No `splits` argument is passed, so no rows are held out for validation here.
# The permutation/split variables that used to be recomputed in this cell were
# never handed to TabularPandas; they only overwrote the seeded validation
# split (`n_train`, `indices`, `train_idx`, `test_idx`, `splits`) defined
# earlier, so they have been removed.
alldata = TabularPandas(train_features, procs=procs, cat_names=categorical,
                    cont_names=continuous, y_names=dep_var)
alldata_dl = alldata.dataloaders(1024)

In [None]:
#%%script false
# Learner fitted on the all-data loaders.
# NOTE(review): layer widths, weight decay, epochs and learning rate differ
# from the `learn` model trained with a validation split, so that model's
# validation results do not describe this one.
learn_alldata = tabular_learner(
    alldata_dl,
    layers=[1600, 400],
    loss_func=BCEWithLogitsLossFlat(),
    wd=0.01,
)
learn_alldata.fit_one_cycle(n_epoch=16, lr_max=.005)

# Get Predictions for Submission
We follow the guide to setting up a test set here: https://forums.fast.ai/t/a-brief-guide-to-test-sets-in-v2-you-can-do-labelled-now-too/57054

In [None]:
%%script false 
test_dl = dls.test_dl(test_features)
preds,_ = learn.get_preds(dl=test_dl)
preds_df = pd.DataFrame(preds).astype("float")
submission = sub
submission[dep_var] = preds_df
submission.loc[test_features['cp_type']=='ctl_vehicle', dep_var] = 0

In [None]:
#%%script false 
# Predictions from the model trained on all rows.
test_dl = alldata_dl.test_dl(test_features)
preds,_ = learn_alldata.get_preds(dl=test_dl)
preds_df = pd.DataFrame(preds).astype("float")
# Copy the template instead of aliasing it: with `submission_alldata = sub`
# every submission frame built from `sub` would be the same object, and the
# sample-submission template itself would be overwritten in place.
submission_alldata = sub.copy()
submission_alldata[dep_var] = preds_df
# Control perturbations have no MoAs, so all of their targets are set to 0.
submission_alldata.loc[test_features['cp_type']=='ctl_vehicle', dep_var] = 0

In [None]:
#%%script false 
# Write the submission file without the index column. The commented-out line
# is the alternative that would write the validation-split model's predictions.
#submission.to_csv('submission.csv', index=False)
submission_alldata.to_csv('submission.csv', index=False)