In [None]:
from fastai.tabular.all import *
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.preprocessing import QuantileTransformer

In [None]:
# Load the competition inputs: per-sample features, the scored targets, and the
# sample -> drug_id mapping that is merged in later to build the 'drug' label.
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
drug_df = pd.read_csv('../input/lish-moa/train_drug.csv')

In [None]:
# Minor trick - expose the categorical features a second time as continuous inputs.
# cp_dose_cont: the second character of each cp_dose label, parsed as an integer.
train_features['cp_dose_cont'] = train_features['cp_dose'].apply(lambda dose: int(dose[1]))
# cp_time_cont: a plain copy of cp_time.
train_features['cp_time_cont'] = train_features['cp_time']

In [None]:
# Column roles are picked by position.
# NOTE(review): assumes columns 0-1 are sig_id / cp_type and columns 2-3 are
# cp_time / cp_dose - verify against the CSV header.
features = list(train_features.columns)
cat_names = features[2:4]   # fed to the model as categorical inputs
cont_names = features[4:]   # everything after them, including the *_cont copies

In [None]:
# Every target column except the first one (presumably the sig_id key - verify).
y_names = list(train_targets.columns[1:])

In [None]:
# Left-join the scored targets onto the features by sample id, then preview the result.
train_df = pd.merge(train_features, train_targets, how='left', on='sig_id')
train_df.head()

In [None]:
# Keep only rows whose cp_type is not 'ctl_vehicle' (the control rows) and
# renumber the index from 0.
is_treated = train_df['cp_type'] != 'ctl_vehicle'
train_df = train_df.loc[is_treated].reset_index(drop=True)

In [None]:
# Display (rows, columns) of train_df at this point in the notebook.
train_df.shape

In [None]:
# Build the classification target 'drug': a drug keeps its own drug_id when it has
# more than 19 samples; every other drug is pooled into 'misc'.
# NOTE(review): the original comment says this yields the 8 most frequent drugs
# plus 'misc' - that depends on the data, confirm with the value counts below.
train_df = train_df.merge(drug_df)
# vc = number of samples that share the row's drug_id
train_df['vc'] = train_df.groupby('drug_id')['sig_id'].transform('count')
is_frequent = train_df['vc'] > 19
train_df.loc[is_frequent, 'drug'] = train_df.loc[is_frequent, 'drug_id']
train_df.loc[train_df['vc'] <= 19, 'drug'] = 'misc'

In [None]:
# Samples per 'drug' class, ordered by class label.
train_df['drug'].value_counts().sort_index()

In [None]:
#fastai transform implementing Rank Gauss
class RankGauss(TabularProc):
    "Map the continuous columns to a normal distribution with sklearn's `QuantileTransformer`"
    def setups(self, to:Tabular,n_quantiles=200):
        # Remember which columns to transform, fit on the `train` subset only,
        # then apply the fitted transformer to `to`.
        self.cont_names=to.cont_names
        self.transformer = QuantileTransformer(n_quantiles=n_quantiles, output_distribution="normal")
        train_conts=to.train[self.cont_names]
        self.transformer.fit(train_conts)
        return self(to)

    def encodes(self, to:Tabular):
        # Overwrite the continuous columns of `to` with their transformed values.
        transformed=self.transformer.transform(to[self.cont_names])
        to[self.cont_names]=transformed
        return to

In [None]:
# The only pre-processing steps handed to TabularPandas: Categorify for the
# categorical columns and the RankGauss proc for the continuous columns.
procs = [Categorify,RankGauss]

In [None]:
# Per-class weights inversely proportional to class frequency (1000 / count), as a
# stand-in for oversampling. The result is a column tensor, one row per class,
# ordered by class label.
drug_counts = train_df['drug'].value_counts().sort_index()
weights = 1000 / tensor(drug_counts.to_numpy()).unsqueeze(1).float()
# The last class in sorted order is reset to weight 1.
# NOTE(review): presumably this is 'misc' (it must sort after every drug_id) - verify.
weights[-1] = 1
weights

In [None]:
#Custom loss function with label smoothing and class weights
class WeightedLoss(nn.Module):
    """Label-smoothed cross entropy in which every sample is scaled by its class weight.

    Args:
        eps: label-smoothing factor; 0 gives plain cross entropy.
        weights: tensor holding one weight per class, indexed by class id. Any
            shape that flattens to ``n_classes`` values works, e.g. ``(n_classes, 1)``.
    """
    def __init__(self, eps=0.001,weights=weights):
        super().__init__()
        self.eps = eps
        self.weights=weights.cuda() if torch.cuda.is_available() else weights

    def forward(self, output, target):
        """Return the mean over the batch of ``weight[target_i] * smoothed_ce_i``.

        Args:
            output: logits of shape ``(N, C)``.
            target: class indices of shape ``(N,)`` or ``(N, 1)``.
        """
        c = output.size()[-1]
        # Flatten to (N,). squeeze() would collapse a batch of one to a 0-d tensor,
        # which nll_loss rejects for 2-D input.
        target = target.long().view(-1)
        log_preds = F.log_softmax(output, dim=-1)
        smoothing = -log_preds.sum(dim=-1)
        nll = F.nll_loss(log_preds, target, reduction='none')
        unweighted_loss = smoothing*self.eps/c + (1-self.eps)*nll

        # One weight per sample, shape (N,), on the same device as the logits.
        # Looking the weights up with a trailing dimension (e.g. (N, 1, 1)) would
        # broadcast against the (N,) loss into an N x N grid whose mean is just
        # mean(loss) * mean(weight), i.e. no per-sample weighting at all.
        sample_weights = self.weights.to(output.device).reshape(-1)[target]
        weighted_loss = unweighted_loss*sample_weights
        return weighted_loss.mean()

In [None]:
### SETUP TRAINING###
# Batch size and network shape: num_layers hidden layers of hidden_units each.
bs = 50
num_layers = 2
hidden_units = 525
layers = [hidden_units for _ in range(num_layers)]

# The same dropout probability is used after every hidden layer and on the embeddings.
dropout_rate = 0.3
config = tabular_config(
    ps=[dropout_rate for _ in range(num_layers)],
    embed_p=dropout_rate,
    act_cls=nn.LeakyReLU(negative_slope=0.1, inplace=True),
)

# Optimisation settings passed to fit_one_cycle in the training loop.
lr = 1e-2
wd = 0.15
epochs = 28

loss_func = WeightedLoss(eps=5e-4, weights=weights)
opt_func = Lamb
metrics = [accuracy]
# This single callback instance is handed to every learner built in the training loop.
cbs = [SaveModelCallback()]

In [None]:
### Training loop ###
# Repeated stratified CV: for each of n_runs shuffles, train one model per fold and
# add its validation-fold class probabilities (divided by n_runs) into valid_df, so
# every row ends up with the average of its out-of-fold predictions.
valid_df=train_df.copy()
# One pred{k} column per class of the 'drug' target, instead of a hard-coded count.
n_classes=train_df['drug'].nunique()
pred_cols=[f'pred{i}' for i in range(n_classes)]
# Float zeros: probabilities are accumulated below, and integer columns would rely
# on an implicit int -> float upcast during the in-place addition.
valid_df[pred_cols]=0.0
n_splits=7
n_runs=3
for i in range(n_runs):
    # Seeded with the run index: folds are reproducible across kernel restarts yet
    # still differ from one run to the next.
    mskf=MultilabelStratifiedKFold(n_splits=n_splits,shuffle=True,random_state=i)
    # Stratify jointly on cp_time, cp_dose and all scored targets.
    for split in mskf.split(train_df,train_df[['cp_time','cp_dose']+y_names]):
        splits=[split[0].tolist(),split[1].tolist()]
        to=TabularPandas(train_df,procs=procs, cat_names=cat_names, cont_names=cont_names, y_names='drug',
             y_block=CategoryBlock(),splits=splits)
        dls = to.dataloaders(bs=bs,val_bs=10000)
        learn = tabular_learner(dls, layers=layers,
                        config=config,
                        cbs=cbs,
                        loss_func=loss_func,
                        opt_func = opt_func,
                        metrics = metrics)
        learn.fit_one_cycle(epochs,lr_max=lr,wd=wd)
        # The checkpoint name combines the run index and the first validation row of the fold.
        learn.save(f'frequent_drugs_{i}_{split[1][0]}')
        # Softmax probabilities for the validation fold.
        pred=learn.get_preds(act=partial(F.softmax,dim=1))[0]
        # splits[1] holds positions; they are used as labels here, which assumes
        # train_df / valid_df carry a default 0..n-1 index.
        valid_df.loc[splits[1],pred_cols]+=pred.numpy()/n_runs

In [None]:
# Write valid_df (the training rows plus the accumulated pred* columns) to CSV
# without the index column.
valid_df.to_csv('drug_predictions.csv',index=False)