In [None]:
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn import preprocessing
from sklearn.preprocessing import QuantileTransformer
from torch.utils.data import Dataset,DataLoader

In [None]:
import torch.nn.functional as F

In [None]:
import optuna
import plotly as pl

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
import lightgbm as lgb

In [None]:
# Device string used across the notebook: first CUDA GPU when available, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [None]:
# Read the competition tables from ../input/lish-moa: feature matrices first,
# then the scored / non-scored training targets and the sample submission.
train_fea = pd.read_csv('../input/lish-moa/train_features.csv')
test_fea = pd.read_csv('../input/lish-moa/test_features.csv')
train_tar_sco = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_tar_nonsco = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
def seed_everything(seed=1062):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and asks cuDNN for deterministic kernels.

    Args:
        seed: integer seed applied to every generator.
    """
    # The original seeded NumPy twice and never seeded Python's `random`.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; a silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=1062)

In [None]:
class Sparsemax(nn.Module):
    """Sparsemax activation: a projection onto the probability simplex.

    Like softmax, each slice along ``dim`` is non-negative and sums to one,
    but entries can be exactly zero.

    Args:
        dim: dimension along which sparsemax is applied (defaults to the last).
    """
    def __init__(self, dim=None):
        super(Sparsemax, self).__init__()
        self.dim = -1 if dim is None else dim

    def forward(self, input):
        """Apply sparsemax along ``self.dim`` and return a tensor of the input's shape."""
        # Bring the target dimension to the front, then flatten everything
        # else so the work happens on a 2-D (rows, logits) view.
        input = input.transpose(0, self.dim)
        original_size = input.size()
        input = input.reshape(input.size(0), -1)
        input = input.transpose(0, 1)
        dim = 1

        number_of_logits = input.size(dim)

        # Shift by the row maximum (same translation trick as softmax).
        input = input - torch.max(input, dim=dim, keepdim=True)[0].expand_as(input)
        zs = torch.sort(input=input, dim=dim, descending=True)[0]
        # Build 1..K on the input's own device rather than on a notebook-global
        # device, so the module works wherever its input lives.
        ks = torch.arange(start=1, end=number_of_logits + 1, step=1,
                          device=input.device, dtype=input.dtype).view(1, -1)
        ks = ks.expand_as(zs)

        # Support size k: largest index whose bound exceeds the running sum.
        bound = 1 + ks * zs
        cumulative_sum_zs = torch.cumsum(zs, dim)
        is_gt = torch.gt(bound, cumulative_sum_zs).to(input.dtype)
        k = torch.max(is_gt * ks, dim, keepdim=True)[0]
        zs_sparse = is_gt * zs
        # Threshold tau; entries below it are clipped to zero.
        taus = (torch.sum(zs_sparse, dim, keepdim=True) - 1) / k
        taus = taus.expand_as(input)
        self.output = torch.max(torch.zeros_like(input), input - taus)
        output = self.output
        # Undo the flatten / transpose performed at the top.
        output = output.transpose(0, 1)
        output = output.reshape(original_size)
        output = output.transpose(0, self.dim)
        return output
    def backward(self, grad_output):
        """Manual gradient w.r.t. the 2-D view cached by the last ``forward``.

        NOTE(review): autograd does not call a method named ``backward`` on an
        ``nn.Module``; gradients flow through the ops in ``forward``. This is
        only reachable when invoked explicitly.

        Args:
            grad_output: gradient with the same (rows, logits) shape as ``self.output``.
        """
        dim = 1
        nonzeros = torch.ne(self.output, 0)
        # keepdim=True keeps the per-row mean as a column so it expands across
        # logits; without it the reduction broadcasts along the wrong axis.
        support_mean = torch.sum(grad_output * nonzeros, dim=dim, keepdim=True) / torch.sum(nonzeros, dim=dim, keepdim=True)
        self.grad_input = nonzeros * (grad_output - support_mean.expand_as(grad_output))
        return self.grad_input

In [None]:
def initialize_non_glu(module,inp_dim,out_dim):
    """Xavier-normal initialise ``module.weight`` in place.

    The gain is ``sqrt((inp_dim + out_dim) / sqrt(4 * inp_dim))``.

    Args:
        module: object exposing a ``weight`` tensor (e.g. ``nn.Linear``).
        inp_dim: input width of the layer.
        out_dim: output width of the layer.
    """
    fan_total = inp_dim+out_dim
    xavier_gain = np.sqrt(fan_total/np.sqrt(4*inp_dim))
    torch.nn.init.xavier_normal_(module.weight, gain=xavier_gain)
    
class GBN(nn.Module):
    """Batch normalisation applied separately to chunks of the batch.

    Args:
        inp: number of features to normalise.
        vbs: target chunk ("virtual batch") size.
        momentum: momentum passed to the underlying ``nn.BatchNorm1d``.
    """
    def __init__(self,inp,vbs=128,momentum=0.01):
        super().__init__()
        self.bn = nn.BatchNorm1d(inp,momentum=momentum)
        self.vbs = vbs
    def forward(self,x):
        """Normalise ``x`` chunk by chunk along dim 0 and concatenate the results."""
        # At least one chunk, so batches smaller than vbs are normalised whole.
        n_chunks = max(1,x.size(0)//self.vbs)
        normalised = []
        for part in torch.chunk(x,n_chunks,0):
            normalised.append(self.bn(part))
        return torch.cat(normalised,0)

class GLU(nn.Module):
    """Gated linear unit: linear layer -> ghost batch norm -> value * sigmoid(gate).

    Args:
        inp_dim: input width (used only when ``fc`` is not supplied).
        out_dim: output width; the linear layer produces ``2 * out_dim`` features.
        fc: optional pre-built linear layer to use instead of creating one.
        vbs: virtual batch size forwarded to ``GBN``.
    """
    def __init__(self,inp_dim,out_dim,fc=None,vbs=128):
        super().__init__()
        # Reuse the caller's layer when given; otherwise create a private one.
        self.fc = fc if fc else nn.Linear(inp_dim,out_dim*2)
        self.bn = GBN(out_dim*2,vbs=vbs)
        self.od = out_dim
    def forward(self,x):
        """Return the first ``out_dim`` features gated by the sigmoid of the rest."""
        hidden = self.bn(self.fc(x))
        value = hidden[:,:self.od]
        gate = hidden[:,self.od:]
        return value*torch.sigmoid(gate)
    

class FeatureTransformer(nn.Module):
    """Stack of GLU blocks: optional shared blocks followed by independent ones.

    Args:
        inp_dim: width of the incoming features.
        out_dim: width produced by every GLU block.
        shared: sequence of pre-built linear layers wrapped by the shared GLU
            blocks, or ``None`` / empty for no shared blocks.
        n_ind: number of independent GLU blocks.
        vbs: virtual batch size forwarded to every GLU.
    """
    def __init__(self,inp_dim,out_dim,shared,n_ind,vbs=128):
        super().__init__()
        first = True
        self.shared = nn.ModuleList()
        if shared:
            self.shared.append(GLU(inp_dim,out_dim,shared[0],vbs=vbs))
            first= False
            for fc in shared[1:]:
                self.shared.append(GLU(out_dim,out_dim,fc,vbs=vbs))
        else:
            self.shared = None
        self.independ = nn.ModuleList()
        if first:
            # No shared block: the first independent GLU must map
            # inp_dim -> out_dim (the original referenced an undefined `inp`).
            self.independ.append(GLU(inp_dim,out_dim,vbs=vbs))
        for _ in range(int(first), n_ind):
            self.independ.append(GLU(out_dim,out_dim,vbs=vbs))
        # True when independ[0] changes the feature width; it then cannot take
        # part in a residual sum.
        self.first_independ_projects = first
        # Residual scaling factor sqrt(0.5), kept as a Python float so it is
        # not tied to any particular device.
        self.scale = 0.5 ** 0.5
    def forward(self,x):
        """Run shared blocks, then independent blocks, with scaled residuals."""
        if self.shared:
            x = self.shared[0](x)
            for glu in self.shared[1:]:
                x = torch.add(x, glu(x))
                x = x*self.scale
        residual_blocks = self.independ
        if self.first_independ_projects:
            # Width changes here, so apply the block directly (no residual).
            x = self.independ[0](x)
            residual_blocks = self.independ[1:]
        for glu in residual_blocks:
            x = torch.add(x, glu(x))
            x = x*self.scale
        return x
class AttentionTransformer(nn.Module):
    """Produce a feature mask from the attention features of the previous step.

    Args:
        inp_dim: width of the attention features.
        out_dim: width of the mask (one entry per input feature).
        relax: relaxation factor, stored as ``self.r``.
        vbs: virtual batch size forwarded to ``GBN``.
    """
    def __init__(self,inp_dim,out_dim,relax,vbs=128):
        super().__init__()
        self.fc = nn.Linear(inp_dim,out_dim)
        self.bn = GBN(out_dim,vbs=vbs)
#         self.smax = Sparsemax()
        # Stored as a plain float so it is not tied to a notebook-global device.
        self.r = float(relax)
    def forward(self,a,priors):
        """Return ``sigmoid(bn(fc(a)) * priors)``.

        NOTE(review): the original followed this with
        ``priors = priors*(self.r-mask)``, which only rebound the local name,
        so the caller's ``priors`` was never updated and ``relax`` had no
        effect on the output. The returned mask is unchanged here; carrying
        updated priors between steps would need a change to the return value
        and should be decided deliberately.
        """
        a = self.bn(self.fc(a))
        mask = torch.sigmoid(a*priors)
        return mask

class DecisionStep(nn.Module):
    """One decision step: mask the input features, then transform them.

    Args:
        inp_dim: width of the input features.
        n_d: width of the decision part of the step output.
        n_a: width of the attention part of the step output.
        shared: linear layers shared across steps (forwarded to ``FeatureTransformer``).
        n_ind: number of independent GLU blocks.
        relax: relaxation factor forwarded to ``AttentionTransformer``.
        vbs: virtual batch size.
    """
    def __init__(self,inp_dim,n_d,n_a,shared,n_ind,relax,vbs=128):
        super().__init__()
        step_width = n_d+n_a
        self.fea_tran = FeatureTransformer(inp_dim,step_width,shared,n_ind,vbs)
        self.atten_tran = AttentionTransformer(n_a,inp_dim,relax,vbs)
    def forward(self,x,a,priors):
        """Return ``(transformed_features, mask_entropy)`` for this step."""
        mask = self.atten_tran(a,priors)
        # Mean of -mask*log(mask); the epsilon keeps log() finite at zero.
        entropy = ((-1)*mask*torch.log(mask+1e-10)).mean()
        transformed = self.fea_tran(x*mask)
        return transformed,entropy

class TabNet(nn.Module):
    """TabNet-style network built from a first feature transformer and decision steps.

    Args:
        inp_dim: number of input features.
        final_out_dim: number of outputs of the final linear layer.
        n_d: width of the decision part of each step output.
        n_a: width of the attention part of each step output.
        n_shared: number of linear layers shared across all steps (0 for none).
        n_ind: number of independent GLU blocks per feature transformer.
        n_steps: total number of steps; ``n_steps - 1`` ``DecisionStep`` modules are built.
        relax: relaxation factor forwarded to every decision step.
        vbs: virtual batch size used by every ghost batch-norm layer.
    """
    def __init__(self,inp_dim,final_out_dim,n_d=64,n_a=64,n_shared=2,n_ind=2,n_steps=5,relax=1.2,vbs=128):
        super().__init__()
        if n_shared>0:
            # Each shared layer outputs 2*(n_d+n_a) features because the GLU
            # that wraps it splits them into value and gate halves.
            self.shared = nn.ModuleList()
            self.shared.append(nn.Linear(inp_dim,2*(n_d+n_a)))
            for _ in range(n_shared-1):
                self.shared.append(nn.Linear(n_d+n_a,2*(n_d+n_a)))
        else:
            self.shared=None
        # vbs is now forwarded here as well; previously the first transformer
        # silently used the default of 128 regardless of the argument.
        self.first_step = FeatureTransformer(inp_dim,n_d+n_a,self.shared,n_ind,vbs)
        self.steps = nn.ModuleList()
        for _ in range(n_steps-1):
            self.steps.append(DecisionStep(inp_dim,n_d,n_a,self.shared,n_ind,relax,vbs))
        self.fc = nn.Linear(n_d,final_out_dim)
        self.bn = nn.BatchNorm1d(inp_dim)
        self.n_d = n_d
    def forward(self,x):
        """Return ``(outputs, loss)``.

        ``outputs`` is the final linear layer applied to the sum of the
        ReLU-ed decision parts of every step; ``loss`` is a 1-element tensor
        holding the sum of the per-step values returned by the decision steps.
        """
        x = self.bn(x)
        # Only the attention part of the first transformer output is used.
        x_a = self.first_step(x)[:,self.n_d:]
        # Allocate accumulators directly on the input's device.
        loss = torch.zeros(1,device=x.device)
        out = torch.zeros(x.size(0),self.n_d,device=x.device)
        # NOTE(review): `priors` is created once and passed unchanged to every
        # step; nothing in this method updates it between steps.
        priors = torch.ones(x.shape,device=x.device)
        for step in self.steps:
            x_te,l = step(x,x_a,priors)
            out += F.relu(x_te[:,:self.n_d])
            x_a = x_te[:,self.n_d:]
            loss += l
        return self.fc(out),loss

In [None]:
class TabNetWithEmbed(nn.Module):
    """``TabNet`` followed by a sigmoid, plus two embedding tables.

    The embeddings are created and registered, but ``forward`` does not use
    them; the input goes straight into the wrapped ``TabNet``.

    Args mirror ``TabNet`` and are forwarded to it unchanged.
    """
    def __init__(self,inp_dim,final_out_dim,n_d=64,n_a=64,n_shared=2,n_ind=2,n_steps=5,relax=1.2,vbs=128):
        super().__init__()
        self.tabnet = TabNet(inp_dim,final_out_dim,n_d,n_a,n_shared,n_ind,n_steps,relax,vbs)
        # Embedding tables with 2 and 3 categories, one output dimension each.
        self.emb1 = nn.Embedding(2,1)
        self.emb3 = nn.Embedding(3,1)
        self.cat_embed = [self.emb1, self.emb3]

    def forward(self,x):
        """Return ``(sigmoid(tabnet_output), tabnet_loss)``."""
        raw,aux_loss = self.tabnet(x)
        probs = torch.sigmoid(raw)
        return probs,aux_loss

In [None]:
class DrugData(Dataset):
    """Dataset pairing rows of a feature array with rows of a target array.

    Args:
        df: 2-D NumPy array of features, indexed as ``df[idx, :]``.
        out: NumPy array of targets, indexed as ``out[idx]``.
        train: when True, targets are softened to 0.00001 / 0.99999.
    """

    def __init__(self,df,out,train=True):
        self.df = df
        self.out = out
        self.train=train

    def __len__(self):
        return len(self.df)

    def __getitem__(self,idx):
        """Return ``(features, targets)`` for row ``idx`` as float32 tensors."""
        labels = self.out[idx].reshape(-1)
        if self.train:
            # Zeros become 0.00001 and every other value becomes 0.99999.
            labels = np.where(labels==0,0.00001,0.99999)
        features = torch.from_numpy(self.df[idx,:]).float()
        return features,torch.tensor(labels).float()

In [None]:
# Integer codes for the three categorical columns (same codes as before).
CATEGORY_CODES = {
    'cp_dose': {'D1':0,'D2':1},
    'cp_time': {24:0,72:1,48:2},
    'cp_type': {'trt_cp':0,'ctl_vehicle':1},
}

# Assign the encoded column back instead of calling
# `frame[col].replace(..., inplace=True)`: that is chained in-place
# modification of a temporary Series, which pandas warns about and which no
# longer updates the frame once Copy-on-Write is active.
for frame in (train_fea, test_fea):
    for col, codes in CATEGORY_CODES.items():
        # astype('int64') fails loudly if a value had no code (map gives NaN).
        frame[col] = frame[col].map(codes).astype('int64')

train_fea = train_fea.drop(columns='sig_id')
test_fea = test_fea.drop(columns='sig_id')

In [None]:
# NOTE(review): presumably positional indices of categorical columns; neither
# name is referenced by the visible active code - confirm before relying on them.
cat_col = [0, 2]
num_col = train_fea.shape[1]

In [None]:
# Scored targets as a plain array, without the sig_id identifier column.
train_tar = train_tar_sco.drop(columns=['sig_id']).to_numpy()

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

In [None]:
# Stack train and test rows so the transforms below see both sets.
data = pd.concat([train_fea,test_fea],ignore_index=True)
# Select columns by their 'g-' / 'c-' prefix. The previous `'g' in x` test
# matched any column whose name merely contained the letter g.
g = [name for name in data.columns if name.startswith('g-')]
c = [name for name in data.columns if name.startswith('c-')]

In [None]:
# Map every g- and c- column to a normal distribution with its own
# QuantileTransformer, fitted on that single column.
for col in g + c:
    sel = QuantileTransformer(n_quantiles=100,random_state=0,output_distribution='normal')
    column_values = data[col].to_numpy().reshape(-1,1)
    data[col] = sel.fit_transform(column_values)

In [None]:
# Append PCA components of the c- and g- column groups as extra features.
# random_state pins the decomposition so it does not depend on the global
# NumPy RNG state. String column names keep `data` free of duplicate and
# mixed-type (int vs str) labels, which newer scikit-learn versions reject
# when a DataFrame is passed to an estimator.
pca_c = PCA(n_components=20,random_state=0)
extra_c = pd.DataFrame(pca_c.fit_transform(data[c]),
                       columns=['pca_c_%d' % i for i in range(20)])
pca_g = PCA(n_components=208,random_state=0)
extra_g = pd.DataFrame(pca_g.fit_transform(data[g]),
                       columns=['pca_g_%d' % i for i in range(208)])
data = pd.concat((data,extra_c,extra_g),axis=1)

In [None]:
# from umap import UMAP

In [None]:
# umap_c = UMAP(random_state=256,n_components=15)
# extra_c = pd.DataFrame(umap_c.fit_transform(data[c]))
# umap_g = UMAP(random_state=256,n_components=50)
# extra_g = pd.DataFrame(umap_g.fit_transform(data[g]))
# data = pd.concat((data,extra_c,extra_g),axis=1)

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Drop low-variance columns (threshold 0.65) from everything after the first
# three columns, then re-attach those three columns in front.
vt = VarianceThreshold(0.65)
dat = data.iloc[:,:3]
# Pass a plain array: at this point `data` can carry both string and integer
# column labels, which newer scikit-learn versions refuse to validate.
selected = vt.fit_transform(data.iloc[:,3:].to_numpy())
data = pd.concat((dat,pd.DataFrame(selected)),axis=1)

In [None]:
# The first len(train_fea) rows of `data` are training rows; the rest are test rows.
n_train_rows = len(train_fea)
train_df = data.iloc[:n_train_rows].to_numpy()
test_df = data.iloc[n_train_rows:].to_numpy()

In [None]:
# 5-fold splitter with the library defaults spelled out (no shuffling).
kfold = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

In [None]:
# Binary cross-entropy on probabilities, averaged over all elements.
loss_func = nn.BCELoss(reduction='mean')

In [None]:
# ra = np.arange(3,train_df.shape[1])
# np.random.shuffle(ra)
# train_df[:,3:] = train_df[:,ra]
# test_df[:,3:] = test_df[:,ra]

In [None]:
# Trial 97 finished with value: 0.015035743787155454 and parameters: {'batch_size': 8, 'nd': 8, 'na': 4, 'shared': 1, 'indep': 2, 'steps': 5, 'relax': 2.5, 'vbs': 7, 'lr': 0.008107344577671696}. Best is trial 97 with value: 0.015035743787155454.
# 0.015003892647780077 and parameters: {'batch_size': 9, 'nd': 8, 'na': 8, 'shared': 1, 'indep': 2, 'steps': 2, 'relax': 1, 'vbs': 7, 'lr': 0.009373664964478735}. Best is trial 95 with value: 0.015003892647780077.

In [None]:
# divisor=0

In [None]:
# seed_everything(1006)
# submission.iloc[:,1:]=0
# for train,test in kfold.split(train_df,np.zeros(len(train_df))):
#     batch_size=512
#     sparse_constant = 0
#     model = TabNetWithEmbed(train_df.shape[1]-3,train_tar.shape[1],n_d=256,n_a=256,n_shared=1,n_ind=2,n_steps=2,relax=1,vbs=128)
#     model.to(device)
#     torch.cuda.empty_cache()
#     optimizer = optim.Adam(model.parameters(),lr=0.009373664964478735,weight_decay=0.00001)
#     sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer,factor=0.1,patience=1,verbose=True)
#     train_dataset = DrugData(train_df[train],train_tar[train])
#     valid_dataset = DrugData(train_df[test],train_tar[test],False)
#     train_loader = DataLoader(train_dataset,batch_size=batch_size,num_workers=4,shuffle=True)
#     valid_loader = DataLoader(valid_dataset,batch_size=batch_size,num_workers=4,shuffle=True)
#     losses=[]
#     norm = []
#     t = time.time()
#     for x in range(22):
#         train_loss=0.
#         grad_norm_sum = 0.
#         for inp,tar in train_loader:
#             model.zero_grad()
#             out,l = model(inp[:,3:].to(device))
#             loss = loss_func(out,tar.to(device))#+l*sparse_constant
#             loss.backward()
#             optimizer.step()
# #             sched.step()
#             train_loss+=loss.item()*tar.size(0)
#         valid_loss=0.
#         v=0
#         for inp,tar in valid_loader:
#             v+=1
#             out,_ = model(inp[:,3:].to(device))
#             loss = loss_func(out,tar.to(device))

#             valid_loss+= loss.item()*tar.size(0)
#         losses.append(valid_loss/len(valid_dataset))
#         print('%d epoch, %.8f valid_loss, %.8f training_loss %fsec time'% (x+1,losses[-1],train_loss/len(train_dataset),(time.time()-t)))
#         sched.step(losses[-1])
#         t = time.time()
#     print("completed training one fold -------------- ")
#     model.eval()
#     divisor += 1/losses[-1]
#     submission.iloc[:,1:] += model(torch.from_numpy(test_df[:,3:]).float())[0].data.cpu().numpy()/losses[-1]

In [None]:
# Random hold-out split: each row goes to the training part with probability 0.85.
seed_everything(1006)
msk = np.random.rand(len(train_df))<0.85
train_inp, train_targ = train_df[msk], train_tar[msk]
valid_inp, valid_targ = train_df[~msk], train_tar[~msk]

In [None]:
import gc

In [None]:
def objective(trail):
    """Optuna objective: train one TabNet configuration and score it on the hold-out set.

    Hyper-parameters (batch size, layer widths, number of shared / independent
    blocks, steps, relax, virtual batch size, learning rate, weight decay) are
    sampled from ``trail``. The model is trained for 35 epochs on
    ``train_inp`` / ``train_targ`` and evaluated on ``valid_inp`` /
    ``valid_targ`` after every epoch.

    Args:
        trail: the Optuna trial object used to sample hyper-parameters.

    Returns:
        The lowest per-epoch validation loss, or 99.99 if the trial raised
        an exception (for example running out of GPU memory).
    """
    model=None
    seed_everything(1006)
    try:
#         sparse_constant = trail.suggest_loguniform('sparse_const',0.000001,0.001)
        batch_size= 2**trail.suggest_int('batch_size',8,12)
        train_dataset = DrugData(train_inp,train_targ)
        valid_dataset = DrugData(valid_inp,valid_targ,False)
        train_loader = DataLoader(train_dataset,batch_size=batch_size,num_workers=4,shuffle=True)
        # Evaluation does not depend on sample order, so no shuffling is needed.
        valid_loader = DataLoader(valid_dataset,batch_size=batch_size,num_workers=4,shuffle=False)
        # The first three columns are skipped (inp[:,3:] below), hence shape[1]-3.
        model = TabNetWithEmbed(train_df.shape[1]-3,train_tar.shape[1],n_d=2**trail.suggest_int('nd',4,9),
                                n_a=2**trail.suggest_int('na',4,9),n_shared=trail.suggest_int('shared',1,6),
                                n_ind=trail.suggest_int('indep',2,4),n_steps=trail.suggest_int('steps',2,7),
                                relax=trail.suggest_categorical('relax',[1,1.2,1.5,2,2.3,2.5]),vbs=2**trail.suggest_int('vbs',4,7))
        model.to(device)
        torch.cuda.empty_cache()
        optimizer = optim.Adam(model.parameters(),lr=trail.suggest_loguniform('lr',3e-4,0.01),weight_decay=trail.suggest_loguniform('wd',0.000005,0.00005))
        sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer,factor=0.1,patience=3,verbose=True)
        losses=[]
        for epoch in range(35):
            model.train()
            for inp,tar in train_loader:
                model.zero_grad()
                out,_ = model(inp[:,3:].to(device))
                loss = loss_func(out,tar.to(device))#+l*sparse_constant
                loss.backward()
                optimizer.step()

            # Validate in eval mode and without autograd: otherwise batch-norm
            # layers normalise with (and update their running statistics
            # from) validation batches, and a graph is built for every batch.
            model.eval()
            valid_loss=0.
            with torch.no_grad():
                for inp,tar in valid_loader:
                    out,_ = model(inp[:,3:].to(device))
                    loss = loss_func(out,tar.to(device))
                    # Weight by batch size so the epoch value is a per-sample mean.
                    valid_loss+= loss.item()*tar.size(0)
            losses.append(valid_loss/len(valid_dataset))
            sched.step(losses[-1])
        return min(losses)
    except Exception as err:
        # Best-effort: a failed configuration gets a large loss so the study
        # keeps going. The error is reported instead of silently swallowed,
        # and KeyboardInterrupt is no longer caught.
        print('Trial failed with %s: %s' % (type(err).__name__, err))
        model=None
        gc.collect()
        torch.cuda.empty_cache()
        return 99.99

In [None]:
# import joblib

In [None]:
# !cp ../input/drug-prediction/optuna.db ./optuna.dbff

In [None]:
# A fixed study name is what makes load_if_exists meaningful: without it
# Optuna generates a fresh unique name on every call, so a previous study in
# the SQLite file is never resumed.
# NOTE(review): studies already stored under auto-generated names are not
# picked up by this name - confirm whether an existing study should be loaded.
STUDY_NAME = 'tabnet_moa'
study = optuna.create_study(study_name=STUDY_NAME,direction='minimize',storage='sqlite:///./optuna.dbff',load_if_exists=True)

In [None]:
# Run 200 trials of the objective.
study.optimize(objective, n_trials=200)

 Trial 8 finished with value: 0.01601797854527831 and parameters: {'sparse_const': 5.569518821520305e-06, 'batch_size': 10, 'nd': 8, 'na': 8, 'shared': 4, 'indep': 2, 'steps': 6, 'relax': 1.5, 'vbs': 6}. Best is trial 8 with value: 0.01601797854527831.
 Trial 24 finished with value: 0.015906228683888912 and parameters: {'sparse_const': 2.0155530946123628e-05, 'batch_size': 11, 'nd': 9, 'na': 8, 'shared': 3, 'indep': 3, 'steps': 7, 'relax': 2.5, 'vbs': 7}. Best is trial 24 with value: 0.015906228683888912
 Trial 2 finished with value: 0.01621854634040915 and parameters: {'sparse_const': 0.00013392320643434687, 'batch_size': 11, 'nd': 9, 'na': 8, 'shared': 5, 'indep': 2, 'steps': 7, 'relax': 1.5, 'vbs': 5}. Best is trial 2 with value: 0.01621854634040915.
 Trial 64 finished with value: 0.015986942599541212 and parameters: {'sparse_const': 7.667690166509668e-05, 'batch_size': 11, 'nd': 9, 'na': 9, 'shared': 4, 'indep': 3, 'steps': 4, 'relax': 3, 'vbs': 5}. Best is trial 64 with value: 0.015986942599541212.
 Trial 80 finished with value: 0.01593467401459108 and parameters: {'sparse_const': 6.937792007865797e-05, 'batch_size': 11, 'nd': 9, 'na': 9, 'shared': 4, 'indep': 2, 'steps': 4, 'relax': 2, 'vbs': 5}. Best is trial 80 with value: 0.01593467401459108.

[I 2020-11-08 14:37:58,675] Trial 3 finished with value: 0.015910927270087877 and parameters: {'batch_size': 10, 'nd': 8, 'na': 6, 'shared': 1, 'indep': 3, 'steps': 7, 'relax': 3.5, 'vbs': 6, 'lr': 0.000844953649205203}. Best is trial 3 with value: 0.015910927270087877.
[I 2020-11-08 14:59:41,919] Trial 13 finished with value: 0.015747105775683584 and parameters: {'batch_size': 11, 'nd': 8, 'na': 5, 'shared': 2, 'indep': 3, 'steps': 3, 'relax': 3.5, 'vbs': 6, 'lr': 0.009918007325458078}. Best is trial 13 with value: 0.015747105775683584.
[I 2020-11-08 15:08:50,521] Trial 21 finished with value: 0.015424192366516902 and parameters: {'batch_size': 10, 'nd': 7, 'na': 4, 'shared': 1, 'indep': 4, 'steps': 3, 'relax': 1.5, 'vbs': 6, 'lr': 0.009473372341704749}. Best is trial 21 with value: 0.015424192366516902.
[I 2020-11-08 15:12:08,382] Trial 23 finished with value: 0.015248623840681005 and parameters: {'batch_size': 9, 'nd': 7, 'na': 4, 'shared': 1, 'indep': 4, 'steps': 3, 'relax': 1.5, 'vbs': 6, 'lr': 0.0078097190001649875}. Best is trial 23 with value: 0.015248623840681005.

In [None]:
# for col in range(train_tar.shape[1]):
#     gc.collect()
#     t = time.time()
# #     for train,test in kfold.split(train_df,train_tar[:,col]):
# #         model  = mutual_info_classif()
# # #         tdset = lgb.Dataset(train_df[:,3:][train],train_tar[:,col][train])
# # #         vdset = lgb.Dataset(train_df[:,3:][test],train_tar[:,col][test])
# #         model.fit(train_df[:,3:][train],train_tar[:,col][train])
# #         submission.iloc[:,col+1] += model.predict(test_df[:,3:])
# #         features.iloc[col,:] += mutual.feature_importance()
# #           features.iloc
#     features.iloc[col,:] = mutual_info_classif(train_df[:,3:],train_tar[:,col])
#     print('Trained %d label in time: %t'%(col,(time.time()-t)/60))

In [None]:
# submission.iloc[:,1:] = submission.iloc[:,1:]/divisor

In [None]:
# submission = submission.copy()

In [None]:
# submission.to_csv('submission.csv',index=False)