In [10]:
import sys
sys.path.append('../input/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [11]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns
import json
import math
import networkx as nx
from tqdm import tqdm
import time

import torchvision

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils.prune
import warnings
warnings.filterwarnings('ignore')

def random_init(m, init_func=torch.nn.init.xavier_uniform_):
    if isinstance(m, nn.Linear) or isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d):
        init_func(m.weight.data)
        if m.bias is not None:
            m.bias.data.zero_()



class ResBlock(nn.Module):
    def __init__(self, in_hidden, hidden, activation=nn.ReLU, device='cuda'):
        super().__init__()
        self.device = device
        self.linear1 = nn.Sequential(
            nn.utils.weight_norm(nn.Linear(in_hidden, hidden)),
        )
        self.linear2 = nn.Sequential(
            nn.BatchNorm1d(hidden),
            nn.utils.weight_norm(nn.Linear(hidden, in_hidden)),
        )
        self.shortcut = nn.Sequential(
            nn.utils.weight_norm(nn.Linear(in_hidden, hidden)),
            nn.BatchNorm1d(in_hidden)
        )
        self.linear1.apply(random_init).to(DEVICE)
        self.linear2.apply(random_init).to(DEVICE)
        
    def forward(self, x):
        out = F.relu(self.linear1(x))
        out = self.linear2(out)
        out += self.shortcut(x)
        out = F.relu(out)

        return out

    

def seed_everything(seed=1903):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
class MoADataset:
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float),
        }
        # dct['x'] += dct['x'] + (0.1**0.5)*torch.randn(dct['x'].shape)
        return dct
    
class TestDataset:
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float)
        }
        return dct
    
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device, l1_reg=0):
    model.train()
    final_loss = 0
    
    for data in dataloader:
        optimizer.zero_grad()
        inputs = data['x'].to(device)
        #print(inputs.shape)
        outputs, kl = model(inputs)
        kl_div = torch.mean(kl)

        loss_recon = loss_fn(outputs, inputs)
        print(loss_recon)
        print(kl_div)
        loss = loss_recon + kl_div
        #regularization_loss = 0
        #for param in model.parameters():
        #    regularization_loss += torch.sum(torch.abs(param))
        #loss += l1_reg * regularization_loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    
    return final_loss


def valid_fn(model, loss_fn, dataloader, device):
    model.eval()
    final_loss = 0
    valid_preds = []
    
    for data in dataloader:
        inputs = data['x'].to(device)
        outputs, _ = model(inputs)
        loss = loss_fn(outputs, inputs)
        
        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    

    return final_loss, valid_preds


def inference_fn(model, dataloader, device):
    model.eval()
    preds = []
    
    for data in dataloader:
        inputs = data['x'].to(device)

        with torch.no_grad():
            outputs = model(inputs)
        
        preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    preds = np.concatenate(preds)
    
    return preds


class Model(nn.Module):
    def __init__(self, num_features, num_targets, hidden_size, n_res=1):
        super(Model, self).__init__()        
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size).apply(random_init))

        self.batch_norm2 = nn.ModuleList()
        self.dropout2 = nn.ModuleList()
        self.dense2 = nn.ModuleList()

        for res in range(n_res):
            self.batch_norm2.append(nn.BatchNorm1d(hidden_size).to(DEVICE))
            self.dropout2.append(nn.Dropout(0.2).to(DEVICE))
            self.dense2.append(ResBlock(hidden_size, hidden_size, device=DEVICE).apply(random_init))
                
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.25)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets).apply(random_init))
        
        self.activation = nn.ReLU()
                
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.activation(self.dense1(x))
        for batch_norm, dropout, dense in zip(self.batch_norm2,
                                              self.dropout2, 
                                              self.dense2):
            x = batch_norm(x)
            x = dropout(x)
            x = dense(x)
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        return x

    

def prune_model(model):
    parameters_to_prune = []
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            module.weight = torch.nn.Parameter(module.weight)
            parameters_to_prune += [[module, 'weight']]
    torch.nn.utils.prune.global_unstructured(
        parameters_to_prune,
        pruning_method=torch.nn.utils.prune.L1Unstructured,
        amount=0.1,
    )
    
    
def process_data(data):
    
    data = pd.get_dummies(data, columns=['cp_time','cp_dose'])
    
    return data

seed_everything(seed=1903)

In [12]:
import torch.nn.functional as F
import math

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


class Stochastic(nn.Module):
    """
    Base stochastic layer that uses the
    reparametrization trick [Kingma 2013]
    to draw a sample from a distribution
    parametrised by mu and log_var.
    """
    def reparametrize(self, mu, log_var):
        epsilon = Variable(torch.randn(mu.size()), requires_grad=False)

        if mu.is_cuda:
            epsilon = epsilon.cuda()

        # log_std = 0.5 * log_var
        # std = exp(log_std)
        std = log_var.mul(0.5).exp_()

        # z = std * epsilon + mu
        z = mu.addcmul(std, epsilon)

        return z

class GaussianSample(Stochastic):
    """
    Layer that represents a sample from a
    Gaussian distribution.
    """
    def __init__(self, in_features, out_features):
        super(GaussianSample, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.mu = nn.Linear(in_features, out_features)
        self.log_var = nn.Linear(in_features, out_features)

    def forward(self, x):
        mu = self.mu(x)
        log_var = F.softplus(self.log_var(x))

        return self.reparametrize(mu, log_var), mu, log_var


def log_standard_gaussian(x):
    """
    Evaluates the log pdf of a standard normal distribution at x.
    :param x: point to evaluate
    :return: log N(x|0,I)
    """
    return torch.sum(-0.5 * math.log(2 * math.pi) - x ** 2 / 2, dim=-1)


def log_gaussian(x, mu, log_var):
    """
    Returns the log pdf of a normal distribution parametrised
    by mu and log_var evaluated at x.
    :param x: point to evaluate
    :param mu: mean of distribution
    :param log_var: log variance of distribution
    :return: log N(x|µ,σ)
    """
    log_pdf = - 0.5 * torch.log(2 * torch.tensor(math.pi, requires_grad=True)) - log_var / 2 - (x - mu)**2 / (2 * torch.exp(log_var))
    return torch.sum(log_pdf, dim=-1)
    

class PlanarNormalizingFlow(nn.Module):
    """
    Planar normalizing flow [Rezende & Mohamed 2015].
    Provides a tighter bound on the ELBO by giving more expressive
    power to the approximate distribution, such as by introducing
    covariance between terms.
    """
    def __init__(self, in_features):
        super(PlanarNormalizingFlow, self).__init__()
        self.u = nn.Parameter(torch.randn(in_features))
        self.w = nn.Parameter(torch.randn(in_features))
        self.b = nn.Parameter(torch.ones(1))

    def forward(self, z):
        # Create uhat such that it is parallel to w
        uw = torch.dot(self.u, self.w)
        muw = -1 + F.softplus(uw)
        uhat = self.u + (muw - uw) * torch.transpose(self.w, 0, -1) / torch.sum(self.w ** 2)

        # Equation 21 - Transform z
        zwb = torch.mv(z, self.w) + self.b

        f_z = z + (uhat.view(1, -1) * torch.tanh(zwb).view(-1, 1))

        # Compute the Jacobian using the fact that
        # tanh(x) dx = 1 - tanh(x)**2
        psi = (1 - torch.tanh(zwb)**2).view(-1, 1) * self.w.view(1, -1)
        psi_u = torch.mv(psi, uhat)

        # Return the transformed output along
        # with log determninant of J
        logdet_jacobian = torch.log(torch.abs(1 + psi_u) + 1e-8)

        return f_z, logdet_jacobian


class HFlow(nn.Module):
    def __init__(self):
        super(HFlow, self).__init__()

    def forward(self, v, z):
        '''
        :param v: batch_size (B) x latent_size (L)
        :param z: batch_size (B) x latent_size (L)
        :return: z_new = z - 2* v v_T / norm(v,2) * z
        '''
        # v * v_T
        vvT = torch.bmm(v.unsqueeze(2), v.unsqueeze(1) )  # v * v_T : batch_dot( B x L x 1 * B x 1 x L ) = B x L x L
        # v * v_T * z
        vvTz = torch.bmm(vvT, z.unsqueeze(2) ).squeeze(2) # A * z : batchdot( B x L x L * B x L x 1 ).squeeze(2) = (B x L x 1).squeeze(2) = B x L
        # calculate norm ||v||^2
        norm_sq = torch.sum(v * v, 1) # calculate norm-2 for each row : B x 1
        norm_sq = norm_sq.expand(v.size(1), norm_sq.size(0) ) # expand sizes : B x L
        # calculate new z
        z_new = z - 2 * vvTz / norm_sq.transpose(1, 0) # z - 2 * v * v_T  * z / norm2(v)
        return z_new


class linIAF(nn.Module):
    def __init__(self, z_dim):
        super().__init__()
        self.z_dim = z_dim

    def forward(self, l, z):
        '''
        :param L: batch_size (B) x latent_size^2 (L^2)
        :param z: batch_size (B) x latent_size (L)
        :return: z_new = L*z
        '''
        # L->tril(L)
        l_matrix = l.view(-1, self.z_dim, self.z_dim)  # resize to get B x L x L
        lt_mask = torch.tril(torch.ones(self.z_dim, self.z_dim), -1)  # lower-triangular mask matrix (1s in lower triangular part)
        I = Variable(torch.eye(self.z_dim, self.z_dim).expand(l_matrix.size(0), self.z_dim, self.z_dim))
        if self.cuda:
            lt_mask = lt_mask.cuda()
            I = I.cuda()
        lt_mask = Variable(lt_mask)
        lt_mask = lt_mask.unsqueeze(0).expand(l_matrix.size(0), self.z_dim, self.z_dim)  # 1 x L x L -> B x L x L
        lt = torch.mul(l_matrix, lt_mask) + I  # here we get a batch of lower-triangular matrices with ones on diagonal

        # z_new = L * z
        z_new = torch.bmm(lt, z.unsqueeze(2)).squeeze(2)  # B x L x L * B x L x 1 -> B x L

        return z_new


class CombinationL(nn.Module):
    def __init__(self, z_dim, n_combination):
        super().__init__()
        self.z_dim = z_dim
        self.n_combination = n_combination

    def forward(self, l, y):
        '''
        :param l: batch_size (B) x latent_size^2 * n_combination (L^2 * C)
        :param y: batch_size (B) x n_combination (C)
        :return: l_combination = y * L
        '''
        # calculate combination of Ls
        l_tensor = l.view(-1, self.z_dim ** 2, self.n_combination)  # resize to get B x L^2 x C
        y = y.unsqueeze(1).expand(y.size(0), self.z_dim ** 2, y.size(1))  # expand to get B x L^2 x C
        l_combination = torch.sum(l_tensor * y, 2).squeeze()
        return l_combination


class NormalizingFlows(nn.Module):
    """
    Presents a sequence of normalizing flows as a torch.nn.Module.
    """
    def __init__(self, in_features, n_flows=1, h_last_dim=None, flow_type=PlanarNormalizingFlow):
        self.h_last_dim = h_last_dim
        self.flows = []
        self.flows_a = []
        self.n_flows = n_flows
        self.flow_type = "nf"
        for i, features in enumerate(reversed(in_features)):
            self.flows += [nn.ModuleList([flow_type(features).cuda() for _ in range(n_flows)])]

        super(NormalizingFlows, self).__init__()

    def forward(self, z, i=0):
        log_det_jacobian = []
        flows = self.flows
        for flow in flows[i]:
            z, j = flow(z)
            log_det_jacobian.append(j)
        return z, sum(log_det_jacobian)


class HouseholderFlow(nn.Module):
    """
    Presents a sequence of normalizing flows as a torch.nn.Module.
    """
    def __init__(self, in_features, auxiliary, n_flows=1, h_last_dim=None, flow_type=HFlow, flow_flavour="hf"):
        super(HouseholderFlow, self).__init__()
        self.flow_flavour = flow_flavour
        self.v_layers = [[] for _ in range(len(in_features))]
        self.n_flows = n_flows
        self.flow_type = "hf"
        flows = []
        for i, features in enumerate(reversed(in_features)):
            flows += [flow_type().cuda()]
            v_layers = [nn.Linear(h_last_dim, features)] + [nn.Linear(features, features) for _ in range(n_flows)]
            self.v_layers[i] = nn.ModuleList(v_layers)
        if not auxiliary:
            self.flows = nn.ModuleList(flows)
        else:
            self.flows_a = nn.ModuleList(flows)

    def forward(self, z, h_last, auxiliary=False):
        self.cuda()
        v = {}
        z = {'0': z, '1': None}
        # Householder Flow:
        if self.n_flows > 0:
            v['1'] = self.v_layers[0][0].cuda()(h_last)
            if not auxiliary:
                z['1'] = self.flows[0](v['1'], z['0'])
            else:
                z['1'] = self.flows_a[0](v['1'], z['0'])

            for j in range(1, self.n_flows):
                v[str(j + 1)] = self.v_layers[0][j].cuda()(v[str(j)])
                if not auxiliary:
                    z[str(j + 1)] = self.flows[0](v[str(j + 1)], z[str(j)])
                else:
                    z[str(j + 1)] = self.flows_a[0](v[str(j + 1)], z[str(j)])

        return z[str(j + 1)]


class ccLinIAF(nn.Module):
    def __init__(self, in_features, n_flows=1, h_last_dim=None, flow_flavour="ccLinIAF", auxiliary=False, flow_type=linIAF):
        super().__init__()
        self.n_combination = n_flows
        self.n_flows = n_flows
        self.flow_flavour = flow_flavour
        flows = []
        combination_l = []
        encoder_y = []
        encoder_L = []

        for i, features in enumerate(list(reversed(in_features))):
            flows += [flow_type(features).cuda()]
            combination_l += [CombinationL(features, self.n_combination)]
            encoder_y += [nn.Linear(h_last_dim, self.n_combination)]
            encoder_L += [nn.Linear(h_last_dim, (features ** 2) * self.n_combination)]
        if not auxiliary:
            self.flows = nn.ModuleList(flows)
            self.combination_l = nn.ModuleList(combination_l)
            self.encoder_y = nn.ModuleList(encoder_y)
            self.encoder_L = nn.ModuleList(encoder_L)
        else:
            self.flows_a = nn.ModuleList(flows)
            self.combination_l_a = nn.ModuleList(combination_l)
            self.encoder_y_a = nn.ModuleList(encoder_y)
            self.encoder_L_a = nn.ModuleList(encoder_L)

        self.cuda()

    def forward(self, z, h_last, auxiliary=False, k=0):
        z = {'0': z, '1': None}
        if not auxiliary:
            l = self.encoder_L[k](h_last)
            y = F.softmax(self.encoder_y[k](h_last), dim=0)
            l_combination = self.combination_l[k](l, y)
            z['1'] = self.flows[k](l_combination, z['0'])
        else:
            l = self.encoder_L_a[k](h_last)
            y = F.softmax(self.encoder_y_a[k](h_last), dim=0)
            l_combination = self.combination_l_a[k](l, y)
            z['1'] = self.flows_a[k](l_combination, z['0'])

        return z['1']
    
    
    
class Stochastic(nn.Module):
    """
    Base stochastic layer that uses the
    reparametrization trick [Kingma 2013]
    to draw a sample from a distribution
    parametrised by mu and log_var.
    """

    def reparameterize(self, mu, log_var):
        epsilon = Variable(torch.randn(mu.size()), requires_grad=False)

        if mu.is_cuda:
            epsilon = epsilon.cuda()

        # log_std = 0.5 * log_var
        # std = exp(log_std)
        std = log_var.mul(0.5).exp_()

        # z = std * epsilon + mu
        z = mu.addcmul(std, epsilon)

        return z


class GaussianSample(Stochastic):
    """
    Layer that represents a sample from a
    Gaussian distribution.
    """

    def __init__(self, in_features, out_features):
        super(GaussianSample, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.mu = nn.Linear(in_features, out_features)
        self.log_var = nn.Linear(in_features, out_features)

    def forward(self, x):
        mu = self.mu(x)
        log_var = F.softplus(self.log_var(x))

        return self.reparameterize(mu, log_var), mu, log_var


class ResBlock(nn.Module):
    def __init__(self, in_channel, channel, activation=nn.ReLU, device='cuda'):
        super().__init__()
        self.device = device
        self.linear = nn.Sequential(
            activation(),
            nn.Linear(in_channel, channel, 3, padding=1),
            activation(),
            nn.Linear(channel, in_channel, 1),
        )
        self.conv.apply(random_init)
    def forward(self, input):
        out = self.conv.to(self.device)(input)
        out += input

        return out


class ResBlockDeconv(nn.Module):
    def __init__(self, in_channel, channel, activation=nn.ReLU, device='cuda'):
        super().__init__()
        self.device = device
        self.conv = nn.Sequential(
            activation(),
            nn.ConvTranspose1d(in_channel, channel, 1),
            activation(),
            nn.ConvTranspose1d(channel, in_channel, 3, padding=1),
        )
        self.conv.apply(random_init)

    def forward(self, input):
        out = self.conv.to(self.device)(input)
        out += input

        return out


class Autoencoder(torch.nn.Module):
    def __init__(self,
                 z_dim,
                 batchnorm,
                 activation=torch.nn.GELU,
                 flow_type="nf",
                 n_flows=2,
                 n_res=3,
                 gated=True,
                 has_dense=True,
                 resblocks=False,
                 ):
        super(Autoencoder, self).__init__()

        if torch.cuda.is_available():
            device = 'cuda'
        else:
            device = 'cpu'

        self.device = device
        self.bns = []
        self.bns_deconv = []
        self.GaussianSample = GaussianSample(z_dim, z_dim).to(device)
        self.activation = activation()

        self.n_res = n_res

        self.has_dense = has_dense
        self.batchnorm = batchnorm
        self.a_dim = None
        self.dense1 = torch.nn.Linear(in_features=772, out_features=z_dim)
        self.dense2 = torch.nn.Linear(in_features=z_dim, out_features=772)
        self.dense1_bn = nn.BatchNorm1d(num_features=z_dim)
        self.dense2_bn = nn.BatchNorm1d(num_features=772)
        self.dropout = nn.Dropout(0.5)
        self.bns = nn.ModuleList(self.bns)
        self.bns_deconv = nn.ModuleList(self.bns_deconv)
        self.flow_type = flow_type
        self.n_flows = n_flows
        if self.flow_type == "nf":
            self.flow = NormalizingFlows(in_features=[z_dim], n_flows=n_flows)
        if self.flow_type == "hf":
            self.flow = HouseholderFlow(in_features=[z_dim], auxiliary=False, n_flows=n_flows, h_last_dim=z_dim)
        if self.flow_type == "iaf":
            self.flow = IAF(z_dim, n_flows=n_flows, num_hidden=n_flows, h_size=z_dim, forget_bias=1., conv1d=False)
        if self.flow_type == "ccliniaf":
            self.flow = ccLinIAF(in_features=[z_dim], auxiliary=False, n_flows=n_flows, h_last_dim=z_dim)
        if self.flow_type == "o-sylvester":
            self.flow = SylvesterFlows(in_features=[z_dim], flow_flavour='o-sylvester', n_flows=1, h_last_dim=None)

    def random_init(self, init_func=torch.nn.init.kaiming_uniform_):

        for m in self.modules():
            if isinstance(m, nn.Linear) or isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d):
                init_func(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()

    def _kld(self, z, q_param, h_last=None, p_param=None):
        if len(z.shape) == 1:
            z = z.view(1, -1)
        if (self.flow_type == "nf") and self.n_flows > 0:
            (mu, log_var) = q_param
            f_z, log_det_z = self.flow(z)
            qz = log_gaussian(z, mu, log_var) - sum(log_det_z)
            z = f_z
        elif (self.flow_type == "iaf") and self.n_flows > 0:
            (mu, log_var) = q_param
            f_z, log_det_z = self.flow(z, h_last)
            qz = log_gaussian(z, mu, log_var) - sum(log_det_z)
            z = f_z
        elif (self.flow_type in ['hf', 'ccliniaf']) and self.n_flows > 0:
            (mu, log_var) = q_param
            f_z = self.flow(z, h_last)
            qz = log_gaussian(z, mu, log_var)
            z = f_z
        elif self.flow_type in ["o-sylvester", "h-sylvester", "t-sylvester"] and self.n_flows > 0:
            mu, log_var, r1, r2, q_ortho, b = q_param
            f_z = self.flow(z, r1, r2, q_ortho, b)
            qz = log_gaussian(z, mu, log_var)
            z = f_z
        else:
            (mu, log_var) = q_param
            qz = log_gaussian(z, mu, log_var)
            print(qz)
        if p_param is None:
            pz = log_standard_gaussian(z)
        else:
            (mu, log_var) = p_param
            pz = log_gaussian(z, mu, log_var)

        kl = qz - pz

        return kl

    def encoder(self, x):
        z = self.dense1(x)
        z = self.activation(z)
        if self.batchnorm:
            if z.shape[0] != 1:
                z = self.dense1_bn(z)
        z = self.dropout(z)
        return z

    def decoder(self, z):
        if self.has_dense:
            z = self.dense2(z)
            z = self.activation(z)
            if self.batchnorm:
                if z.shape[0] != 1:
                    z = self.dense2_bn(z)
            z = self.dropout(z)

        x = torch.sigmoid(z)
        return x

    def forward(self, x):

        x = self.encoder(x)
        z, mu, log_var = self.GaussianSample(x)

        # Kullback-Leibler Divergence
        kl = self._kld(z, (mu, log_var), x)
        print(kl)
        if len(z.shape) == 1:
            z = z.unsqueeze(0)
        rec = self.decoder(z)
        return rec, kl

    def sample(self, z, y=None):
        """
        Given z ~ N(0, I) generates a sample from
        the learned distribution based on p_θ(x|z).
        :param z: (torch.autograd.Variable) Random normal variable
        :return: (torch.autograd.Variable) generated sample
        """
        return self.decoder(z)

    def get_parameters(self):
        for name, param in self.named_parameters():
            if param.requires_grad:
                print(name, param.data.shape)

    def get_total_parameters(self):
        for name, param in self.named_parameters():
            if param.requires_grad:
                print(name, param.data.shape)

In [13]:
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [14]:
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [15]:
# GENES
n_comp = 29

# CELLS
n_comp = 4

In [16]:
lr = 1e-4
l1 = 0.
l2 = 0.
batch_size = 128
mc = 1 # seems to be a problem when mc > 1 for display only, results seem good
iw = 1 # seems to be a problem when iw > 1 for display only, results seem good

# Neurons layers
h_dims = [128, 64]
z_dim = 29

# number of flows
n_combinations = 20 #could be just 1 with number_of_flows?
number_of_flows = 8
num_elements = 3

is_example = True
DEVICE = 'cuda'

In [17]:
train = train_features[GENES]
test = test_features[GENES]

In [18]:
model = Autoencoder(flow_type='vanilla', z_dim=z_dim, batchnorm=16, n_flows=number_of_flows)
model.to(DEVICE)

Autoencoder(
  (GaussianSample): GaussianSample(
    (mu): Linear(in_features=29, out_features=29, bias=True)
    (log_var): Linear(in_features=29, out_features=29, bias=True)
  )
  (activation): GELU()
  (dense1): Linear(in_features=772, out_features=29, bias=True)
  (dense2): Linear(in_features=29, out_features=772, bias=True)
  (dense1_bn): BatchNorm1d(29, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dense2_bn): BatchNorm1d(772, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (bns): ModuleList()
  (bns_deconv): ModuleList()
)

In [19]:
BATCH_SIZE = 16
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
EPOCHS = 20
EARLY_STOPPING_STEPS = 11
EARLY_STOP = True
L1_REG = 1

In [20]:
train_dataset = MoADataset(train.values)
valid_dataset = MoADataset(test.values)
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)



optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.05, div_factor=1.5e3, 
                                          max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))

loss_fn = nn.MSELoss()

early_stopping_steps = EARLY_STOPPING_STEPS
early_step = 0

best_loss = np.inf

for epoch in range(EPOCHS):

    train_loss = train_fn(model, optimizer,scheduler, loss_fn, trainloader, DEVICE, L1_REG)
    print(f"EPOCH: {epoch}, train_loss: {train_loss}")
    valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
    print(f"EPOCH: {epoch}, valid_loss: {valid_loss}")

    if valid_loss < best_loss:

        best_loss = valid_loss
        torch.save(model.state_dict(), f"ae_.pth")

    elif(EARLY_STOP == True):

        early_step += 1
        if (early_step >= early_stopping_steps):
            break


x_test = test_[feature_cols].values
testdataset = TestDataset(x_test)
testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

model = Model(
    num_features=num_features,
    num_targets=num_targets,
    hidden_size=hidden_size,
    n_res=1
)
prune_model(model)
model.load_state_dict(torch.load(f"ae_.pth"))
model.to(DEVICE)

predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
predictions = inference_fn(model, testloader, DEVICE)


tensor([-49.1840, -52.0087, -55.9794, -51.7899, -54.1948, -46.8981, -49.7072,
        -50.1945, -50.5662, -50.9372, -56.4687, -48.7947, -49.4632, -53.7878,
        -54.4238, -53.2916], device='cuda:0', grad_fn=<SumBackward1>)
tensor([11.9052, 10.7871, 49.0063,  6.6295, 21.1392, 12.4708, 16.2063, 10.7895,
        21.2600, -3.2598, 19.8677, 21.9045,  8.4237, 23.5325, 31.4090, 19.8248],
       device='cuda:0', grad_fn=<SubBackward0>)
tensor(0.8770, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(17.6185, device='cuda:0', grad_fn=<MeanBackward0>)


RuntimeError: Function SubBackward0 returned an invalid gradient at index 0 - expected type TensorOptions(dtype=float, device=cpu, layout=Strided, requires_grad=false) but got TensorOptions(dtype=float, device=cuda:0, layout=Strided, requires_grad=false) (validate_outputs at /opt/conda/conda-bld/pytorch_1591914855613/work/torch/csrc/autograd/engine.cpp:484)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x4e (0x7f43ddc52b5e in /opt/anaconda/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x2ae2414 (0x7f440bad9414 in /opt/anaconda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #2: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x548 (0x7f440badaf48 in /opt/anaconda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #3: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x3d2 (0x7f440badced2 in /opt/anaconda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #4: torch::autograd::Engine::thread_init(int) + 0x39 (0x7f440bad5549 in /opt/anaconda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #5: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x7f440f025638 in /opt/anaconda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #6: <unknown function> + 0xc819d (0x7f4455fa319d in /opt/anaconda/lib/python3.7/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6)
frame #7: <unknown function> + 0x93e9 (0x7f44589d13e9 in /usr/lib/libpthread.so.0)
frame #8: clone + 0x43 (0x7f44588ff293 in /usr/lib/libc.so.6)
