An implementation of the swap-noise denoising autoencoder (DAE) used in the winning Porto Seguro submission. In a future kernel I will use the hidden-layer activations as features to obtain a prediction.

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
#sys.path.append('../..')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sn


from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import make_pipeline,make_union

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



import warnings
warnings.filterwarnings('ignore')

from joblib import dump, load

In [None]:
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` in place to smaller dtypes.

    For every non-empty numeric column:

    * non-finite entries are detected; NaN is replaced by ``min - 1`` (an
      otherwise unused sentinel) and the column name is recorded,
    * a column whose values are all finite and integer-valued is cast to the
      narrowest (un)signed integer dtype that holds its range; if none fits
      the column is left unchanged,
    * every other numeric column is cast to ``float32``.

    Non-numeric columns (strings, categories, ...) are left untouched.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    (pandas.DataFrame, list)
        The same frame and the list of columns that contained non-finite
        values.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in props.columns:
        column = props[col]
        if column.empty or not pd.api.types.is_numeric_dtype(column):
            continue

        values = column.to_numpy(dtype=np.float64)

        # Integers do not support NA, therefore NA needs to be filled.
        if not np.isfinite(values).all():
            NAlist.append(col)
            column = column.fillna(column.min() - 1)
            # Assign back explicitly: a chained in-place fillna may act on a
            # temporary copy and leave the frame unchanged.
            props[col] = column
            values = column.to_numpy(dtype=np.float64)

        # Element-wise integrality test; summing the differences would let
        # positive and negative fractional parts cancel out.
        is_integer = np.isfinite(values).all() and (values == np.floor(values)).all()

        if is_integer:
            # Bounds are re-read after the fill so the sentinel is covered.
            lo, hi = int(column.min()), int(column.max())
            candidates = unsigned_types if lo >= 0 else signed_types
            for candidate in candidates:
                info = np.iinfo(candidate)
                if info.min <= lo and hi <= info.max:
                    props[col] = column.astype(candidate)
                    break
        else:
            # Make float datatypes 32 bit
            props[col] = column.astype(np.float32)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist


In [None]:

def train_short_form_loader(feature_file,target_file,extra_target_file=None):
    '''Load the feature and target CSV files and join them on ``sig_id``.

    Both files are shrunk with ``reduce_mem_usage``. When
    ``extra_target_file`` is given, its columns are merged into the targets
    on ``sig_id`` first. The targets are then left-joined with the features,
    ``sig_id`` becomes the index, and ``cp_type`` / ``cp_dose`` are converted
    to categoricals.

    Returns the joined frame and the list of target column names (every
    target-file column except the first one).
    '''
    feature_frame = pd.read_csv(feature_file)
    target_frame = pd.read_csv(target_file)
    feature_frame, _ = reduce_mem_usage(feature_frame)
    target_frame, _ = reduce_mem_usage(target_frame)

    if extra_target_file is not None:
        extra_frame, _ = reduce_mem_usage(pd.read_csv(extra_target_file))
        target_frame = pd.merge(target_frame, extra_frame, on ='sig_id')

    # The first column is the key; everything after it is a target.
    target_names = target_frame.columns[1:].to_list()

    joined = target_frame.merge(feature_frame, how="left", on="sig_id")
    joined = joined.set_index("sig_id")

    for categorical in ("cp_type", "cp_dose"):
        joined[categorical] = joined[categorical].astype('category')

    return joined, target_names



def test_short_form_loader(feature_file):
    '''Load a feature CSV file (no targets) indexed by ``sig_id``.

    The frame is shrunk with ``reduce_mem_usage``, ``sig_id`` becomes the
    index, and ``cp_type`` / ``cp_dose`` are converted to categoricals.
    '''
    raw_features, _ = reduce_mem_usage(pd.read_csv(feature_file))

    # set_index returns a new frame, so the loaded one is not modified.
    indexed = raw_features.set_index("sig_id")

    for categorical in ("cp_type", "cp_dose"):
        indexed[categorical] = indexed[categorical].astype('category')

    return indexed


In [None]:
#from tools.loaders import train_short_form_loader, test_short_form_loader

In [None]:
# Directory the input CSV files are read from (note the trailing slash).
input_directory = '../input/lish-moa/'

In [None]:
# Joined targets + features indexed by sig_id, plus the list of target names.
train,target_cols = train_short_form_loader(input_directory +'train_features.csv',input_directory+'train_targets_scored.csv')
# Test features only, indexed by sig_id.
test = test_short_form_loader(input_directory +"test_features.csv")


In [None]:
# Feature groups, selected by column-name prefix.
GENES = [name for name in train.columns if name.startswith('g-')]
CELLS = [name for name in train.columns if name.startswith('c-')]

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer

In [None]:
from sklearn.compose import make_column_transformer,ColumnTransformer

In [None]:
def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch with ``seed``.

    Also switches cuDNN to deterministic mode.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

# CV folds

In [None]:
def multifold_indexer(train,target_columns,n_splits=10,random_state=12347,**kwargs):
    """Return a copy of ``train`` with an integer ``kfold`` column.

    The fold of each row is the index of the ``MultilabelStratifiedKFold``
    split in which that row is part of the validation indices. Stratification
    uses ``train[target_columns]``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing at least ``target_columns``. It is not modified.
    target_columns : list
        Label columns used for the stratification.
    n_splits, random_state, **kwargs
        Forwarded to ``MultilabelStratifiedKFold``.
    """
    mskf = MultilabelStratifiedKFold(n_splits=n_splits,random_state=random_state,**kwargs)

    # Collect fold ids in a positional array and attach them by name, so the
    # result does not depend on where a 'kfold' column sits in the frame
    # (an existing 'kfold' column is simply overwritten).
    fold_of_row = np.zeros(len(train), dtype=int)
    for f, (_, v_idx) in enumerate(mskf.split(X=train, y=train[target_columns])):
        fold_of_row[v_idx] = f

    folds = train.copy()
    folds['kfold'] = fold_of_row
    return folds


# Dataset Classes

In [None]:
class MoADataset:
    """Map-style dataset pairing a feature matrix with a target matrix.

    Both inputs are indexed as ``array[idx, :]``; each item is a dict with
    float tensors under ``'x'`` (features) and ``'y'`` (targets).
    """

    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        feature_row = torch.tensor(self.features[idx, :], dtype=torch.float)
        target_row = torch.tensor(self.targets[idx, :], dtype=torch.float)
        return {'x': feature_row, 'y': target_row}
    

    
class TestDataset:
    """Map-style dataset over a feature matrix only (no targets).

    Each item is a dict holding the selected row as a float tensor under
    ``'x'``.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        feature_row = torch.tensor(self.features[idx, :], dtype=torch.float)
        return {'x': feature_row}
    

We define a dataset that applies swap noise: some entries of each sample are replaced by the values another sample has in the same columns.

In [None]:
class MoASwapDataset:
    """Dataset of (swap-noised sample, clean sample) pairs.

    ``features`` is a 2-D array; ``noise`` is the per-column probability that
    an entry is replaced by the value another, randomly drawn row has in the
    same column. Items are dicts with the corrupted row under ``'x'`` and the
    untouched row under ``'y'``, both as float tensors.
    """

    def __init__(self, features, noise):
        self.features = features
        self.noise = noise

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        clean = self.features[idx, :]
        # Corrupt a copy so the stored feature matrix is never modified.
        corrupted = self.swap_sample(clean.copy())
        return {
            'x': torch.tensor(corrupted, dtype=torch.float),
            'y': torch.tensor(clean, dtype=torch.float),
        }

    def swap_sample(self, sample):
        """Apply swap noise to ``sample`` in place and return it.

        A 2-D ``sample`` is treated as a batch: one donor row is drawn per
        batch row, and each batch row gets its own column mask. Any other
        shape is treated as a single row with one donor row and one mask.
        """
        n_rows = self.features.shape[0]
        n_cols = self.features.shape[1]

        if len(sample.shape) != 2:
            donor = np.random.randint(0, n_rows, size=1)
            swap_mask = np.random.rand(n_cols) < self.noise
            sample[swap_mask] = self.features[donor, swap_mask]
            return sample

        n_batch = sample.shape[0]
        # Donor rows are drawn up front, the masks one batch row at a time.
        donors = np.random.randint(0, n_rows, size=n_batch)
        for position, donor in enumerate(donors):
            swap_mask = np.random.rand(n_cols) < self.noise
            sample[position, swap_mask] = self.features[donor, swap_mask]
        return sample

        
    

In [None]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Train ``model`` for one pass over ``dataloader``.

    Parameters
    ----------
    model : torch.nn.Module
    optimizer : torch.optim.Optimizer
    scheduler : LR scheduler or None
        Stepped after every batch, unless it is a ``ReduceLROnPlateau``
        (or a subclass), whose ``step`` requires a metric. ``None`` disables
        scheduling.
    loss_fn : callable
        Called as ``loss_fn(outputs, targets)``.
    dataloader : sized iterable
        Yields dicts with ``'x'`` (inputs) and ``'y'`` (targets) tensors.
    device : str or torch.device

    Returns
    -------
    float
        Mean of the per-batch losses.
    """
    model.train()
    final_loss = 0

    # isinstance (not class equality) so subclasses of ReduceLROnPlateau are
    # not stepped here without the metric they need.
    step_every_batch = scheduler is not None and not isinstance(
        scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau
    )

    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        if step_every_batch:
            scheduler.step()

        final_loss += loss.item()

    final_loss /= len(dataloader)

    return final_loss


def valid_fn(model, scheduler, loss_fn, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Parameters
    ----------
    model : torch.nn.Module
    scheduler : LR scheduler or None
        If it is a ``ReduceLROnPlateau`` (or a subclass) it is stepped once
        with the mean validation loss; any other value is ignored.
    loss_fn : callable
        Called as ``loss_fn(outputs, targets)``.
    dataloader : sized iterable
        Yields dicts with ``'x'`` (inputs) and ``'y'`` (targets) tensors.
    device : str or torch.device

    Returns
    -------
    (float, numpy.ndarray)
        Mean of the per-batch losses and the concatenated, sigmoid-activated
        model outputs.
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    # No backward pass happens here, so autograd bookkeeping is switched off.
    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            final_loss += loss.item()
            valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        scheduler.step(final_loss)

    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    """Run ``model`` in eval mode over ``dataloader`` and collect predictions.

    Each batch is a dict whose ``'x'`` tensor is moved to ``device``. The
    return value is the sigmoid of the model outputs for all batches,
    concatenated into one NumPy array.
    """
    model.eval()

    def predict_batch(batch):
        features = batch['x'].to(device)
        with torch.no_grad():
            raw = model(features)
        return raw.sigmoid().detach().cpu().numpy()

    collected = [predict_batch(batch) for batch in dataloader]
    return np.concatenate(collected)
   
    

# Transformers definition

In [None]:
from sklearn.base import BaseEstimator,TransformerMixin

class CatIntMapper( BaseEstimator, TransformerMixin ):
    """Stateless transformer mapping the values of one column to integers.

    ``col`` is the column name and ``dicti`` the value -> integer mapping.
    The mapped column replaces the original one and is appended as the last
    column of the returned frame. An ``AssertionError`` is raised when the
    column holds a value that is not a key of ``dicti``.
    """

    def __init__(self ,col,dicti):
        self.col = col
        self.dicti = dicti

    def fit(self, X, y = None):
        # Nothing to learn: the mapping is supplied by the caller.
        return self

    def fit_transform( self, X, y = None  ):
        return self._remap(X)

    def transform( self, X):
        return self._remap(X)

    def _remap(self, X):
        """Return ``X`` with ``self.col`` dropped and re-appended as ints."""
        source = X[self.col]
        assert  source.isin(self.dicti.keys()).all()

        encoded = source.map(self.dicti).astype(int).rename(self.col)
        remaining = X.drop(self.col,axis=1)
        return pd.concat([remaining, encoded],axis=1)

class NamedOutTWrapper( BaseEstimator, TransformerMixin ):
    """Apply a transformer to some columns of a DataFrame and return a DataFrame.

    Parameters
    ----------
    transformer : object with ``fit`` / ``transform`` / ``fit_transform``
        Wrapped transformer; it only ever sees ``X[columns]``.
    columns : list
        Names of the columns handed to the transformer.
    inplace : bool, default False
        If True the transformed values replace ``columns`` (the transformer
        must return as many columns as it received); the replaced columns
        are moved to the end of the frame. If False the outputs are appended
        as new columns named ``<TransformerClass><prefix><i>``.
    prefix : str, default '_'
        Separator between the transformer class name and the output index.
    """

    def __init__(self,transformer,columns,inplace=False,prefix='_' ):

        self.transformer = transformer
        # Stored under the constructor argument's own name: BaseEstimator's
        # get_params() (used by clone() and repr()) looks parameters up by
        # that name.
        self.columns = columns
        self.inplace =  inplace
        self.prefix = prefix
        self.transformer_name = self._get_transformer_name()

    @property
    def cols(self):
        """Alias of ``columns`` kept for code that reads or writes ``.cols``."""
        return self.columns

    @cols.setter
    def cols(self, value):
        self.columns = value

    def fit(self, X, y = None):
        self.transformer = self.transformer.fit(X[self.columns] , y )
        return self

    def fit_transform( self, X, y = None  ):
        transformed_columns = self.transformer.fit_transform(X[self.columns] , y )
        return self._assemble(X, transformed_columns)

    def transform( self, X):
        transformed_columns = self.transformer.transform(X[self.columns]  )
        return self._assemble(X, transformed_columns)

    def _assemble(self, X, transformed_columns):
        """Combine ``X`` with the transformer output according to ``inplace``."""
        transformed_columns = np.asarray(transformed_columns)

        if self.inplace:
            # Build a fresh frame instead of assigning into a slice of X.
            out = pd.DataFrame(transformed_columns, index=X.index, columns=list(self.columns))
            return pd.concat([X.drop(self.columns,axis=1),out],axis=1)

        names = [
            self.transformer_name + self.prefix + str(i)
            for i in range(transformed_columns.shape[1])
        ]
        out = pd.DataFrame(transformed_columns, index=X.index, columns=names)
        return pd.concat([X,out],axis=1)

    def _get_transformer_name(self):
        """Return the class name of the wrapped transformer."""
        return type(self.transformer).__name__


class IdentityTransformer:
    '''Dummy transformer used as a filler: every method hands back its input.'''

    def fit(self, X, y = None):
        # Nothing to learn.
        return self

    def fit_transform( self, X, y = None  ):
        unchanged = X
        return unchanged

    def transform( self, X):
        unchanged = X
        return unchanged

class SuppressControls( BaseEstimator, TransformerMixin ):
    """Stateless transformer removing the control rows.

    Rows whose ``cp_type`` equals ``'ctl_vehicle'`` are dropped, and the
    ``cp_type`` column is then removed from the result.
    """

    def __init__(self ):
        pass

    def fit(self, X, y = None):
        # Nothing to learn.
        return self

    def fit_transform( self, X, y = None  ):
        return self._without_controls(X)

    def transform( self, X):
        return self._without_controls(X)

    @staticmethod
    def _without_controls(X):
        """Return the non-control rows of ``X`` without the ``cp_type`` column."""
        is_treated = X['cp_type']!='ctl_vehicle'
        return X.loc[is_treated].drop('cp_type', axis=1)



In [None]:
# Experiment tag; reassigned in a later cell before training starts.
exp_name="torch_DAE_rankgauss_test"

In [None]:
import sys
sys.path.append('../../input/iterative-stratification/iterative-stratification-master')
sys.path.append('../..')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import make_pipeline,make_union


In [None]:

# Integer encodings for the categorical columns.
# NOTE(review): none of these mappers is part of the transformers_list that is
# actually used below (they only appear in the commented-out list).
map_controls = CatIntMapper('cp_type',{'ctl_vehicle': 0, 'trt_cp': 1})    

map_dose = CatIntMapper('cp_dose',{'D1': 1, 'D2': 0})    
map_time = CatIntMapper('cp_time',{24: 0, 48: 1, 72: 2})    


In [None]:
# Reload the raw training features, replacing the joined `train` frame built
# earlier (it is rebuilt again in the training cell).
train = pd.read_csv(f'{input_directory}/train_features.csv')

In [None]:
# Quantile-transform the gene and cell columns to a normal output
# distribution; inplace=True replaces the original columns with the result.
Rankg_g_tansform =  NamedOutTWrapper( QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal"),columns= GENES+CELLS,inplace=True)

In [None]:
# 20 PCA components of the gene columns, appended as new columns named with
# the '_g' prefix. Not part of the transformers_list used below.
PCA_g_tansform =  NamedOutTWrapper(PCA(20),columns= GENES,prefix ='_g' )

In [None]:
# 20 PCA components of the cell columns, appended as new columns named with
# the '_c' prefix. Not part of the transformers_list used below.
PCA_c_tansform =  NamedOutTWrapper(PCA(20),columns= CELLS,prefix ='_c' )

In [None]:
#transformers_list=[map_controls,map_dose,map_time,PCA_g_tansform,PCA_c_tansform,Rankg_g_tansform]

# Model definition

The model has two forward modes: one returns the reconstruction and is used to train the layers; the other returns the hidden-layer activations, which can then be used as features.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnDropper( BaseEstimator, TransformerMixin ):
    """Stateless transformer that removes the columns ``cols`` from a frame."""

    def __init__( self, cols ):
        self.cols=cols

    def fit( self, X, y = None ):
        # Nothing to learn.
        return self

    def transform( self, X, y = None ):
        """Return ``X`` without the configured columns (``y`` is ignored)."""
        remaining = X.drop(columns=self.cols)
        return remaining


In [None]:

# Drops all three categorical columns. The pipeline defined below builds its
# own ColumnDropper instead of using this instance.
CatDropper =ColumnDropper(cols=['cp_type','cp_time','cp_dose'])

In [None]:
class Model(nn.Module):
    """Autoencoder-style MLP with layers sized
    ``num_features -> hidden_size -> hidden_size2 -> hidden_size -> num_targets``.

    All linear layers are weight-normalised. ``batch_norm2`` and
    ``batch_norm3`` are applied before ``dense2`` and ``dense3``.
    ``batch_norm1`` is created (and so is part of the state dict) but is not
    applied in ``forward``. The ``drop_rate*`` arguments are accepted but
    unused.
    """

    def __init__(self, num_features, num_targets, hidden_size,hidden_size2,drop_rate1=0.2,drop_rate2=0.5,drop_rate3=0.8):
        super().__init__()

        def normed_linear(n_in, n_out):
            return nn.utils.weight_norm(nn.Linear(n_in, n_out))

        # Layers are created in the same order as before so that parameter
        # initialisation and state-dict ordering do not change.
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = normed_linear(num_features, hidden_size)

        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dense2 = normed_linear(hidden_size, hidden_size2)

        self.batch_norm3 = nn.BatchNorm1d(hidden_size2)
        self.dense3 = normed_linear(hidden_size2, hidden_size)

        self.dense4 = normed_linear(hidden_size, num_targets)

    def forward(self, x,mode='DAE'):
        """Run the network.

        With ``mode == 'DAE'`` the output of the last layer is returned; any
        other ``mode`` returns the three hidden activations
        ``(x1, x2, x3)`` instead.
        """
        x1 = F.relu(self.dense1(x))
        x2 = F.relu(self.dense2(self.batch_norm2(x1)))
        x3 = F.relu(self.dense3(self.batch_norm3(x2)))
        out = self.dense4(x3)

        if mode != 'DAE':
            return x1,x2,x3
        return out
    
#     def forwardh2(self, x):
#       #  x = self.batch_norm1(x)
#        # x1 = self.dropout1(x1)
#         x = F.relu(self.dense1(x))
        
#         return x
    
#     def forwardh3(self, x):
#       #  x = self.batch_norm1(x)
#        # x1 = self.dropout1(x1)
#         x = F.relu(self.dense1(x))
        
#         return x

In [None]:
def initialize_from_past_model(model,past_model_file, freeze_first_layer=False):
    """Initialise ``model`` from a saved state dict, tolerating partial overlap.

    Only the entries of the checkpoint whose keys also exist in ``model`` are
    used; parameters missing from the checkpoint keep their current values.
    The ``dense3`` entries (``bias``, ``weight_g``, ``weight_v``) are
    truncated along their first dimension to the size ``model`` expects, so a
    checkpoint with a larger ``dense3`` can be reused.

    Parameters
    ----------
    model : torch.nn.Module
        Model updated in place.
    past_model_file : path or file-like
        Checkpoint written with ``torch.save(state_dict, ...)``.
    freeze_first_layer : bool, default False
        If True, every parameter whose name contains the character ``'1'``
        (e.g. ``dense1.*``, ``batch_norm1.*``) gets ``requires_grad = False``.
    """
    # Load on the CPU so a checkpoint saved on a GPU can be read anywhere;
    # load_state_dict copies the values onto the model's own device.
    pretrained_dict = torch.load(past_model_file, map_location='cpu')
    model_dict = model.state_dict()

    for key in ('dense3.bias', 'dense3.weight_g', 'dense3.weight_v'):
        if key in pretrained_dict and key in model_dict:
            wanted_rows = model_dict[key].shape[0]
            pretrained_dict[key] = pretrained_dict[key][:wanted_rows]

    # 1. filter out unnecessary keys
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    # 2. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    # 3. load the merged state dict; loading only the filtered checkpoint
    #    would fail for every key the checkpoint does not provide.
    model.load_state_dict(model_dict)

    if freeze_first_layer:
        for name, parameter in model.named_parameters():
            if '1' in name:
                parameter.requires_grad = False
        
    

# Pipeline

In [None]:
# Preprocessing steps, in order: drop control rows (and cp_type),
# quantile-transform the gene/cell columns in place, drop cp_time and cp_dose.
transformers_list=[SuppressControls(),Rankg_g_tansform,ColumnDropper(cols=['cp_time','cp_dose'])]

In [None]:

# Global training configuration, read by run_training and the training cell.
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 200
BATCH_SIZE = 1024 #640
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 1e-8
NFOLDS = 10
# Early stopping is only active when EARLY_STOP is True.
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False
# NOTE(review): GAMMA is not referenced by any code visible in this notebook.
GAMMA=0.5

#num_features=len(feature_cols)
#num_targets=len(target_cols)
# Widths of the hidden layers handed to Model.
hidden_size=1100
hidden_size2=1300

# ReduceLROnPlateau settings.
PATIENCE = 5
FACTOR = 0.5
THRESHOLD = 1e-4

In [None]:
#exp_name =  "test_DAE_all_together"
# Experiment tag embedded in the checkpoint file names written by run_training.
exp_name = 'test_DAE_0.3_all_together'

In [None]:
def run_training(X_train,X_valid,X_test,fold, seed,verbose=False,**kwargs):
    """Train the denoising autoencoder for one fold and checkpoint the best epoch.

    Both ``X_train`` and ``X_valid`` are wrapped in ``MoASwapDataset`` with a
    swap-noise rate of 0.35; the model is trained with MSE loss to
    reconstruct the clean rows. Whenever the validation loss improves, the
    model's state dict is saved to ``FOLD{fold}_{exp_name}.pth``.

    Reads the module-level settings ``BATCH_SIZE``, ``hidden_size``,
    ``hidden_size2``, ``DEVICE``, ``LEARNING_RATE``, ``WEIGHT_DECAY``,
    ``PATIENCE``, ``FACTOR``, ``THRESHOLD``, ``EPOCHS``, ``EARLY_STOP``,
    ``EARLY_STOPPING_STEPS`` and ``exp_name``.

    Parameters
    ----------
    X_train, X_valid : numpy.ndarray
        2-D feature matrices for training and validation.
    X_test : numpy.ndarray
        Accepted for interface compatibility; currently unused.
    fold : int
        Fold number, used in log lines and in the checkpoint file name.
    seed : int
        Passed to ``seed_everything`` before anything else happens.
    verbose : bool, default False
        If truthy, losses are printed every 10th epoch.
    **kwargs
        Forwarded to ``Model``.

    Returns
    -------
    None
    """
    seed_everything(seed)

    train_dataset = MoASwapDataset(X_train, 0.35)
    valid_dataset = MoASwapDataset(X_valid, 0.35)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Autoencoder: the output width equals the input width.
    model = Model(
        num_features= X_train.shape[1] ,
        num_targets=  X_train.shape[1],
        hidden_size=hidden_size,hidden_size2=hidden_size2,**kwargs
    )
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,patience=PATIENCE,factor=FACTOR,
                                                threshold=THRESHOLD,verbose=True)

    loss_tr = nn.MSELoss()
    loss_val = nn.MSELoss()

    early_step = 0
    best_loss = np.inf

    for epoch in range(EPOCHS):
        # Logical test: the previous bitwise `verbose & ...` silently
        # disabled logging for truthy non-bool values such as verbose=2.
        log_this_epoch = bool(verbose) and epoch % 10 == 0

        train_loss = train_fn(model, optimizer,scheduler, loss_tr, trainloader, DEVICE)
        if log_this_epoch:
            print(f"FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}")
        valid_loss, _ = valid_fn(model,scheduler, loss_val, validloader, DEVICE)
        if log_this_epoch:
            print(f"FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}")

        if valid_loss < best_loss:
            best_loss = valid_loss
            # An improvement restarts the patience window, so only
            # consecutive non-improving epochs count towards early stopping.
            early_step = 0
            torch.save(model.state_dict(), f"FOLD{fold}_{exp_name}.pth")
        elif EARLY_STOP:
            early_step += 1
            if early_step >= EARLY_STOPPING_STEPS:
                break

    # Release the model (and cached GPU memory) before the next fold starts.
    del model
    torch.cuda.empty_cache()
    return


In [None]:
def run_k_fold(folds,target_cols,test,transformers_list,NFOLDS, seed,verbose=False,max_folds=7,**kwargs):
    """Fit the preprocessing pipeline and train one autoencoder per fold.

    For every fold the pipeline built from ``transformers_list`` is fitted on
    the training part, written to ``pipeline_val1_fold<fold>.joblib`` and
    read back, and then applied to the training and validation parts before
    ``run_training`` is called.

    Parameters
    ----------
    folds : pandas.DataFrame
        Training frame with the target columns and a ``kfold`` column.
    target_cols : list
        Names of the target columns (excluded from the features).
    test : pandas.DataFrame
        Already-preprocessed test frame; its feature columns are handed to
        ``run_training``.
    transformers_list : list
        Transformers passed to ``make_pipeline``. The same instances are
        re-fitted for each fold.
    NFOLDS : int
        Number of folds present in ``folds['kfold']``.
    seed : int
        Forwarded to ``run_training``.
    verbose : bool, default False
        Forwarded to ``run_training``.
    max_folds : int, default 7
        Upper bound on the number of folds that are actually trained.
    **kwargs
        Forwarded to ``run_training`` (and from there to ``Model``).

    Returns
    -------
    None
    """
    train = folds
    # Built once: membership tests against a set instead of rebuilding
    # `target_cols + ['kfold']` for every column.
    non_feature_cols = set(target_cols) | {'kfold'}

    for fold in range(min(NFOLDS, max_folds)):
        train_df = train[train['kfold'] != fold]
        valid_df = train[train['kfold'] == fold]

        feature_cols = [col for col in train_df.columns if col not in non_feature_cols]

        pipeline_val = make_pipeline(*transformers_list)

        X_train, y_train = train_df[feature_cols], train_df[target_cols]
        X_valid = valid_df[feature_cols]

        pipeline_val.fit(X_train,y_train)

        ###############################
        #### SAVE/LOAD PREPROCESSING #######
        dump(pipeline_val,'pipeline_val1_fold%i.joblib'%fold)
        pipeline_val = load('pipeline_val1_fold%i.joblib'%fold)
        print('SAVE/LOAD PIPELINE_VAL_FOLD%i'%fold)
        print(pipeline_val)
        ###############################

        X_train = pipeline_val.transform(X_train)

        # The pipeline may add or drop columns, so the feature list is
        # recomputed from its output.
        feature_cols = [col for col in X_train.columns if col not in non_feature_cols]

        X_train = X_train.values
        X_valid = pipeline_val.transform(X_valid).values
        X_test = test[feature_cols].values

        run_training(X_train,X_valid,X_test,fold, seed,verbose,**kwargs)

    return

In [None]:
# Extra keyword arguments forwarded through run_k_fold and run_training to Model.
params ={}

In [None]:
# Training

In [None]:
# Averaging on multiple SEEDS

#SEED = [0,12347,565657,123123,78591]
# Seeds to run; only the first one is used because the loop below breaks
# after one iteration.
SEED = [0]
train,target_cols = train_short_form_loader(f'{input_directory}/train_features.csv',f'{input_directory}/train_targets_scored.csv')
test = test_short_form_loader(f"{input_directory}/test_features.csv")

# Train and test rows are stacked so the autoencoder sees both; the test rows
# have no target columns, so the resulting NaN targets are filled with 0.
train = pd.concat([train,test])
train[target_cols]= train[target_cols].fillna(0)
pipeline_test = make_pipeline(*transformers_list)
pipeline_test.fit(train)

###############################
#### SAVE/LOAD PREPROCESSING #######
dump(pipeline_test,'pipeline_test1.joblib')
pipeline_test = load('pipeline_test1.joblib')
print('SAVE/LOAD PIPELINE_TEST')
print(pipeline_test)
###############################

# From here on `test` is the preprocessed frame (control rows removed).
test = pipeline_test.transform(test)
    

# Placeholders: run_k_fold returns nothing and the accumulation lines below
# are commented out, so both arrays stay all-zero.
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

for seed in SEED:
   
    folds = multifold_indexer(train,target_cols,n_splits=NFOLDS)
    #oof_, predictions_ =
    run_k_fold(folds,target_cols,test,transformers_list,NFOLDS, seed,verbose=True,**params)
   # oof += oof_ / len(SEED)
   # predictions += predictions_ / len(SEED)
    break
#train[target_cols] = oof
# Writes the (all-zero) placeholder predictions into the test frame.
test[target_cols] = predictions
