# Ensemble of fastai tabular learner and TabNet:

* **tabular_learner** : **Uses embeddings for the categorical variables, followed by linear layers. [https://docs.fast.ai/tabular.learner]**
* **TabNet** : **This is an adaptation of TabNet (Attention-based network for tabular data) [https://arxiv.org/pdf/1908.07442.pdf]**

In [None]:
from fastai.tabular.all import *
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fastai.callback import *
from tqdm.notebook import tqdm
from ml_stratifiers import MultilabelStratifiedKFold

In [None]:
# Show at most 50 rows and 50 columns when a DataFrame is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 50)

In [None]:
# Dataset directory; handed to TabularDataLoaders.from_df as `path` in get_data.
path = Path('../input/lish-moa')

In [None]:
# All competition CSVs live in one directory: name it once instead of
# repeating the literal for every file.
data_dir = Path('/kaggle/input/lish-moa')

test_features = pd.read_csv(data_dir / 'test_features.csv')
train_features = pd.read_csv(data_dir / 'train_features.csv')
train_targets_scored = pd.read_csv(data_dir / 'train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(data_dir / 'train_targets_nonscored.csv')
sample_submission = pd.read_csv(data_dir / 'sample_submission.csv')

In [None]:
# Preview the first rows of the training features.
train_features.head()

In [None]:
# Preview the first rows of the scored targets.
train_targets_scored.head()

In [None]:
# Names of the target columns: every column after the first one (the id).
cols = list(train_targets_scored.columns[1:])

In [None]:
# Number of distinct sig_id values in the training features.
train_features.sig_id.nunique()

In [None]:
# Row count for each cp_type value.
train_features.cp_type.value_counts()

In [None]:
# Row count for each cp_time value.
train_features.cp_time.value_counts()

In [None]:
# Row count for each cp_dose value.
train_features.cp_dose.value_counts()

In [None]:
# Ten targets with the fewest positives. Only the target columns are summed:
# summing the whole frame would also "sum" the sig_id strings (concatenating
# every id) just to discard that entry afterwards.
train_targets_scored.iloc[:, 1:].sum().sort_values().head(10)

In [None]:
# Fifty targets with the most positives. Only the target columns are summed:
# summing the whole frame would also "sum" the sig_id strings (concatenating
# every id) just to discard that entry afterwards.
train_targets_scored.iloc[:, 1:].sum().sort_values().tail(50)

* Let's try a separate model for these in the next version

In [None]:
# Names of the 24 targets with the most positives. The sum is restricted to
# the target columns so the sig_id strings are never concatenated.
train_targets_scored.iloc[:, 1:].sum().sort_values().tail(50).index[-24:]

* **atp-sensitive_potassium_channel_antagonist, erbb2_inhibitor** each have only one True value, so we can keep their predictions at zero

In [None]:
# One row per training sample: features joined with their scored targets on sig_id.
trn_df = train_features.merge(train_targets_scored,on='sig_id',how='left')

In [None]:
# Scored targets, read through the relative path; passed to make_folds as `scored`.
# NOTE(review): looks like the same file already loaded as train_targets_scored — confirm.
targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

In [None]:
def make_folds(folds = 5, random_state = 0, stratify = True, scored = None, drug_threshold = 18):
    """Assign every training sample to a cross-validation fold, drug-aware.

    Drugs with at most ``drug_threshold`` samples are split at drug level
    (all samples of such a drug land in the same fold), using the per-drug
    mean of each target as the stratification signal. Samples of the
    remaining, more frequent drugs are split individually.

    Parameters
    ----------
    folds : int
        Number of folds.
    random_state : int
        Seed handed to the splitter.
    stratify : bool
        Use ``MultilabelStratifiedKFold`` when true, ``KFold`` otherwise.
    scored : DataFrame or None
        Scored targets with ``sig_id`` as first column; read from disk when
        omitted. The frame passed in is not modified (``merge`` returns a
        new frame).
    drug_threshold : int
        Largest sample count for which a drug is still split at drug level.

    Returns
    -------
    DataFrame
        Columns ``sig_id`` and ``fold`` (int8), one row per row of ``scored``.
    """
    drug = pd.read_csv('../input/lish-moa/train_drug.csv')
    if scored is None:
        scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    targets = scored.columns[1:]
    scored = scored.merge(drug, on='sig_id', how='left')

    def _new_splitter():
        # A fresh, identically seeded splitter for each of the two splits.
        if stratify:
            return MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
        # NOTE(review): KFold is not imported anywhere in the visible source,
        # so this branch presumably raises NameError — confirm and import it.
        return KFold(n_splits=folds, shuffle=True, random_state=random_state)

    # LOCATE DRUGS
    vc = scored.drug_id.value_counts()
    vc1 = vc.loc[vc <= drug_threshold].index.sort_values()
    vc2 = vc.loc[vc > drug_threshold].index.sort_values()

    # STRATIFY RARE DRUGS (drug_id -> fold)
    dct1 = {}
    tmp = scored.groupby('drug_id')[targets].mean().loc[vc1]
    for fold, (_, idxV) in enumerate(_new_splitter().split(tmp, tmp[targets])):
        dct1.update(dict.fromkeys(tmp.index[idxV].values, fold))

    # STRATIFY FREQUENT DRUGS (sig_id -> fold)
    dct2 = {}
    tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop=True)
    for fold, (_, idxV) in enumerate(_new_splitter().split(tmp, tmp[targets])):
        dct2.update(dict.fromkeys(tmp.sig_id[idxV].values, fold))

    # ASSIGN FOLDS: drug-level mapping first, sample-level mapping for the rest
    scored['fold'] = scored.drug_id.map(dct1)
    scored.loc[scored.fold.isna(),'fold'] = scored.loc[scored.fold.isna(),'sig_id'].map(dct2)
    scored.fold = scored.fold.astype('int8')

    return scored[['sig_id','fold']].copy()

In [None]:
FOLDS = 10
SEED = 42
ff = make_folds(folds=FOLDS, random_state=SEED, stratify=True, scored=targets)

# Attach the folds by sig_id instead of by row position, so a difference in
# row order between trn_df and the fold table cannot silently misassign folds.
trn_df['kfold'] = trn_df['sig_id'].map(ff.set_index('sig_id')['fold'])
if trn_df['kfold'].isna().any():
    raise ValueError('some training rows have no fold assignment')

In [None]:
# Working copy of the training frame; the cells below transform it and get_data reads it.
df = trn_df.copy()

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# sig_id values of the test rows whose cp_type is 'ctl_vehicle'.
sig_ids = test_features.loc[test_features['cp_type'] == 'ctl_vehicle', 'sig_id'].values

In [None]:
# Number of target columns.
len(cols)

In [None]:
# Categorical inputs; every other feature column except the id is continuous.
cat_names = ['cp_type', 'cp_time', 'cp_dose']
non_cont = {'sig_id', *cat_names}
cont_names = [name for name in train_features.columns if name not in non_cont]

In [None]:
def sig(x):
    """Squash ``x`` into the open interval (0, 100) with a scaled logistic curve.

    Computes ``100 / (1 + exp(-x / 5))`` through NumPy, so scalars, arrays
    and DataFrames are all transformed element-wise.
    """
    return 100/(1+np.exp(-x/5))

In [None]:
# Apply the sig transform to every continuous training feature.
df[cont_names]  = sig(df[cont_names]) 

In [None]:
# Apply the same sig transform to the continuous test features.
test_features[cont_names] = sig(test_features[cont_names])

In [None]:
def get_data(fold):
    """Build the fastai tabular DataLoaders that hold out one CV fold.

    Rows of the global ``df`` whose ``kfold`` equals ``fold`` are passed as
    the validation indices; batch size is 64.
    """
    holdout_idx = df.loc[df.kfold == fold].index
    return TabularDataLoaders.from_df(
        df,
        path=path,
        procs=[Categorify, FillMissing, Normalize],
        cat_names=cat_names,
        cont_names=cont_names,
        y_names=cols,
        valid_idx=holdout_idx,
        bs=64,
    )
    

In [None]:
# Train one tabular_learner per fold and collect its test-set predictions.
test_sc = []
lr = 9e-3
lr_range = slice(lr/(2.6**4), lr)

for fold in tqdm(range(FOLDS)):
    name = 'best_model_' + str(fold)
    dls = get_data(fold)

    learn = tabular_learner(
        dls,
        layers=[1024, 512, 512, 256],
        y_range=(0,1),
        loss_func=BCELossFlat(),
        model_dir='/kaggle/working/',
    )

    # Checkpoint whenever the validation loss reaches a new minimum.
    cb = SaveModelCallback(monitor='valid_loss', fname=name, mode='min')
    learn.fit_one_cycle(10, lr_range, cbs=cb)

    # Predict the test set with the saved checkpoint.
    learn.load(name)
    test_dl = learn.dls.test_dl(test_features)
    sub = learn.get_preds(dl=test_dl)
    test_sc.append(sub[0].numpy())

    learn.export('/kaggle/working/'+name+'.pkl')

# Stack the per-fold predictions along a new leading axis.
test_sc = np.array(test_sc)

In [None]:
# Average the tabular_learner predictions over the fold axis.
avg_prds = np.mean(test_sc, axis=0)

In [None]:
# Start from the sample submission and fill in the averaged predictions.
submission = sample_submission.copy()
submission[cols] = avg_prds

# Rows whose test sample is a control vehicle get zero for every target.
ctl_ids = test_features.loc[test_features['cp_type'] == 'ctl_vehicle', 'sig_id']
ctl_rows = submission['sig_id'].isin(ctl_ids)
submission.loc[ctl_rows, train_targets_scored.columns[1:]] = 0

# These two targets are always predicted as zero.
submission['atp-sensitive_potassium_channel_antagonist'] = 0
submission['erbb2_inhibitor'] = 0

In [None]:
# Preview the first rows of the tabular_learner submission.
submission.head()

In [None]:
# Write the tabular_learner-only submission, without the index column.
submission.to_csv('submission_tabular.csv',index=False)

In [None]:
import sys

# Add both input directories to the module search path so the packages
# imported in the next cell can be found.
sys.path.extend(['../input/pytorch-tabnet', '../input/fastai-tabnet'])

## Let's Build a TabNet Model

In [None]:
from fastai.basics import *
from pytorch_tabnet import *
from fast_tabnet.core import *

## TabNet model, one per fold (`FOLDS` = 10)

In [None]:
# Train one TabNet model per fold and collect its test-set predictions.
test_sc_tab = []
lr = 9e-3
lr_range = slice(lr/(2.6**4), lr)
opt_func = partial(Adam, wd=0.01, eps=1e-5)

for fold in tqdm(range(FOLDS)):
    name = 'best_model_tabnet_' + str(fold)
    dls = get_data(fold)

    emb_szs = get_emb_sz(dls)
    model = TabNetModel(emb_szs, len(dls.cont_names), dls.c, n_d=8, n_a=32, n_steps=1)
    learn = Learner(
        dls,
        model,
        BCEWithLogitsLossFlat(),
        opt_func=opt_func,
        lr=lr,
        model_dir='/kaggle/working/',
    )

    # Checkpoint whenever the validation loss reaches a new minimum.
    cb = SaveModelCallback(monitor='valid_loss', fname=name, mode='min')
    learn.fit_one_cycle(30, lr_range, cbs=cb)

    # Predict the test set with the saved checkpoint.
    learn.load(name)
    test_dl = learn.dls.test_dl(test_features)
    sub = learn.get_preds(dl=test_dl)
    test_sc_tab.append(sub[0].numpy())

    learn.export('/kaggle/working/'+name+'.pkl')

# Stack the per-fold predictions along a new leading axis.
test_sc_tab = np.array(test_sc_tab)

In [None]:
# Average the TabNet predictions over the fold axis.
avg_prds_tab = np.mean(test_sc_tab, axis=0)

In [None]:
# Start from the sample submission and fill in the averaged TabNet predictions.
submission_tab = sample_submission.copy()
submission_tab[cols] = avg_prds_tab

# Rows whose test sample is a control vehicle get zero for every target.
ctl_ids = test_features.loc[test_features['cp_type'] == 'ctl_vehicle', 'sig_id']
ctl_rows = submission_tab['sig_id'].isin(ctl_ids)
submission_tab.loc[ctl_rows, train_targets_scored.columns[1:]] = 0

# These two targets are always predicted as zero.
submission_tab['atp-sensitive_potassium_channel_antagonist'] = 0
submission_tab['erbb2_inhibitor'] = 0

In [None]:
# Write the TabNet-only submission, without the index column.
submission_tab.to_csv('submission_tabnet.csv',index=False)

## Combining both fastai tabular learner and TabNet predictions

In [None]:
# Unweighted mean of the two models' averaged predictions.
final_prds = np.stack((avg_prds, avg_prds_tab)).mean(axis=0)

In [None]:
# Weighted blend: 75% tabular_learner, 25% TabNet.
w_tabular, w_tabnet = 0.75, 0.25
fin_wt_prds = w_tabular*avg_prds + w_tabnet*avg_prds_tab

In [None]:
# Start from the sample submission and fill in the weighted blend.
submission_fin = sample_submission.copy()
submission_fin[cols] = fin_wt_prds

# Rows whose test sample is a control vehicle get zero for every target.
ctl_ids = test_features.loc[test_features['cp_type'] == 'ctl_vehicle', 'sig_id']
ctl_rows = submission_fin['sig_id'].isin(ctl_ids)
submission_fin.loc[ctl_rows, train_targets_scored.columns[1:]] = 0

# These two targets are always predicted as zero.
submission_fin['atp-sensitive_potassium_channel_antagonist'] = 0
submission_fin['erbb2_inhibitor'] = 0

In [None]:
results = submission_fin.copy()

# Clip every target column with one plain assignment. The previous per-column
# `results[cl].clip(..., inplace=True)` acted on a selected column, which does
# not write back to `results` when pandas Copy-on-Write is active.
# NOTE(review): clipping also lifts the zeros set on submission_fin (control
# rows and the two always-zero targets) up to 0.0002 — confirm that is intended.
results[cols] = results[cols].clip(0.0002, 0.999)

results.to_csv('submission.csv',index=False)