In [None]:
# Install fastai
!pip uninstall fastai -y
!pip install /kaggle/input/fast-v2-offline/dataclasses-0.6-py3-none-any.whl
!pip install /kaggle/input/fast-v2-offline/torch-1.6.0-cp37-cp37m-manylinux1_x86_64.whl
!pip install /kaggle/input/fast-v2-offline/torchvision-0.7.0-cp37-cp37m-manylinux1_x86_64.whl
!pip install /kaggle/input/fast-v2-offline/fastcore-1.0.1-py3-none-any.whl
!pip install /kaggle/input/fast-v2-offline/fastai-2.0.8-py3-none-any.whl

### Hyperparameters

In [None]:
# Hidden-layer widths handed to the tabular learner via `layers=`.
LAYERS = [1024, 512, 256]
# Number of stratified CV folds; one model is trained per fold and the
# per-fold test predictions are averaged.
FOLDS = 30
# Label-smoothing amount intended for the training targets.
LABEL_SMOOTH_FACTOR = 0.001

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
import pandas as pd
import matplotlib.pyplot as plt
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import torch
import torch.nn as nn
from fastai.tabular.all import *
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import os
import math 

In [None]:
# Training inputs: the feature table plus the scored and non-scored target tables.
train_features, train_targets_scored, train_targets_nonscored = map(
    pd.read_csv,
    (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/train_targets_nonscored.csv',
    ),
)

# Inference inputs: the test feature table and the submission template.
test_features, submission = map(
    pd.read_csv,
    (
        '../input/lish-moa/test_features.csv',
        '../input/lish-moa/sample_submission.csv',
    ),
)

In [None]:
# Drop the control-vehicle rows (cp_type == "ctl_vehicle") from the training features.
is_treated = train_features["cp_type"].ne("ctl_vehicle")
train_features = train_features.loc[is_treated]

In [None]:
# Get rid of the cp_type column, which is not used as a model feature.
# The test frame is stored under a new name because the original `test_features`
# (still carrying cp_type) is needed later to locate the control rows.
train_features = train_features.drop(columns=["cp_type"])
test_features_new = test_features.drop(columns=["cp_type"])

In [None]:
# Targets: every scored-target column except the row identifier.
y_names = list(train_targets_scored.columns[train_targets_scored.columns != "sig_id"])
# The single categorical feature.
cat_names = ['cp_dose']
# Continuous features: all remaining feature columns (identifier excluded).
cont_names = [col for col in train_features.columns if col != "sig_id" and col not in cat_names]

In [None]:
# Attach the scored targets to the (already filtered) training features by sample id;
# the inner join keeps only samples present in both tables.
training_data = pd.merge(train_features, train_targets_scored, on="sig_id", how="inner")

### Stratified KFold

In [None]:
# Stratified K-fold assignment based on the multilabel targets.
# The targets are selected by name (`y_names`) instead of a hard-coded column
# offset, so the cell stays correct if the number of feature columns changes and
# when it is re-run after the "kfold" column already exists.
mskf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=1982)
fold_ids = pd.Series(-1, index=training_data.index, dtype="int64")
for fold_, (trn_, val_) in enumerate(mskf.split(X=training_data, y=training_data[y_names])):
    # The splitter yields positional row indices, so assign positionally
    # rather than by index label.
    fold_ids.iloc[val_] = fold_
# Integer fold id per row (the original label-based assignment produced floats).
training_data["kfold"] = fold_ids

In [None]:
 def training(fold):
     """Train one model with fold `fold` held out and return its test-set predictions.

     Rows of the global `training_data` whose "kfold" differs from `fold` are used
     for fitting (with a further random 80/20 train/validation split for fastai).
     The rows belonging to `fold` are scored afterwards and the loss is printed.

     Args:
         fold: Fold id, as stored in the "kfold" column of `training_data`.

     Returns:
         The predictions tensor (first element of `learn.get_preds`) for
         `test_features_new`.
     """
     fold_data = training_data[training_data["kfold"] != fold].drop("kfold", axis=1)
     splits = RandomSplitter(valid_pct=0.2)(range_of(fold_data))

     # Label smoothing by clipping: targets of 0 become LABEL_SMOOTH_FACTOR and
     # targets of 1 become 1 - LABEL_SMOOTH_FACTOR. Columns are addressed by name
     # instead of a positional "-206" slice, and are replaced wholesale so the
     # integer target columns become float.
     fold_data[y_names] = fold_data[y_names].clip(LABEL_SMOOTH_FACTOR, 1 - LABEL_SMOOTH_FACTOR)

     tabularPandas = TabularPandas(
         fold_data,
         y_names=y_names,
         cat_names=cat_names,
         cont_names=cont_names,
         procs=[Categorify, FillMissing, Normalize],
         splits=splits,
     )
     dataLoader = tabularPandas.dataloaders(bs=128)

     # Model options (dropout, activation) go through `config`; `tabular_learner`
     # forwards any other keyword to `Learner`, whose loss argument is named
     # `loss_func`. One dropout value is supplied per hidden layer.
     learn = tabular_learner(
         dataLoader,
         layers=LAYERS,
         y_range=(0, 1),
         config=tabular_config(ps=[0.3] * len(LAYERS), act_cls=nn.ReLU()),
         loss_func=nn.BCELoss(),
         opt_func=Adam,
         wd=1e-1,
     )
     learn.fit_one_cycle(20)

     # Testing on the fold's holdout set - find the CV metric.
     fold_holdout_data = training_data[training_data["kfold"] == fold].drop(["kfold"], axis=1)
     holdout_dl = learn.dls.test_dl(fold_holdout_data)
     smooth_holdout_prediction = learn.get_preds(dl=holdout_dl)[0]

     # Push predictions away from 0.5 by the smoothing amount, then clamp to [0, 1].
     # NOTE(review): the test predictions returned below are not shifted the same
     # way -- confirm that this difference is intended.
     holdout_prediction = torch.where(
         smooth_holdout_prediction < 0.5,
         smooth_holdout_prediction - LABEL_SMOOTH_FACTOR,
         smooth_holdout_prediction + LABEL_SMOOTH_FACTOR,
     )
     holdout_prediction = torch.clamp(holdout_prediction, 0.0, 1.0)

     # Score against the original (unsmoothed) targets of the held-out rows.
     holdout_targets = torch.tensor(fold_holdout_data[y_names].values, dtype=torch.float64)
     metric = nn.BCELoss()
     loss = metric(holdout_prediction.double(), holdout_targets)
     print (f"Fold:{fold} - Holdout set metric - {loss}")

     # Prediction on test data
     test_dl = learn.dls.test_dl(test_features_new)
     prediction = learn.get_preds(dl=test_dl)
     return prediction[0]

### Train

In [None]:
# Average the per-fold test predictions and write them into the submission frame.
# The number of target columns is taken from the submission template (every
# column after the first) instead of being hard-coded.
n_targets = submission.shape[1] - 1
predictions = torch.zeros([submission.shape[0], n_targets], dtype=torch.float64)
for i in range(FOLDS):
    preds = training(i)/FOLDS
    predictions += preds

# Hand pandas a NumPy array rather than a torch tensor so the cells receive plain
# floats. NOTE(review): this assumes the submission's target columns are in the
# same order as `y_names` -- confirm.
submission.iloc[:, 1:] = predictions.numpy()

### Post Processing

In [None]:
# https://www.kaggle.com/c/lish-moa/discussion/180165
# Force every target of control-vehicle test rows to zero.
# Row positions are computed from a boolean mask so they are valid for `.iloc`
# regardless of the frame's index labels, and a scalar is assigned so no
# hard-coded target count or array broadcasting is needed (an empty selection
# is a no-op).
vehicle_mask = (test_features["cp_type"] == "ctl_vehicle").to_numpy()
vehicle_indices = vehicle_mask.nonzero()[0].tolist()
submission.iloc[vehicle_indices, 1:] = 0.0

In [None]:
# Write the final predictions to submission.csv; index=False keeps the pandas
# row index out of the file.
submission.to_csv('submission.csv', index=False)