# Description
* This notebook started as an identical copy of @Vitalii Mokin's great notebook https://www.kaggle.com/vbmokin/moa-pytorch-rankgauss-pca-nn-upgrade-3d-visual/notebook
* To keep the notebook short, the helper functions are imported from the `moalib` dataset.

# Upgrade 
* Commit 9: Added features generated by a CNN, taken from https://www.kaggle.com/martintosstorff/moa-dimensionalityreduction-for-applying-cnn
* Commit 11: Removed the CNN-generated features because they led to overfitting

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk every file under the Kaggle input directory. The body is a deliberate
# no-op: the print that would list each path is commented out below, so the
# loop only traverses the tree and produces no output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        continue
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys
sys.path.append('../input/moalib')
sys.path.append('../input/iterativestratification')

from rankgauss import quantile_transform_dataframe
from pca import add_PC_to_dataframe
from featureselection import select_features
from cvfolds import get_folds
from main import run_training
from main import run_predicition
from preprocess import process_data
from models import Model
from seed import seed_everything
from transferlearning import FineTuneScheduler
import torch
import torch.nn as nn
from sklearn.metrics import log_loss
from sklearn.feature_selection import VarianceThreshold
import torch.nn.functional as F
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
#import_modules()

# Dataloading

In [None]:
# Load the competition CSV files from the read-only Kaggle input directory.
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
# Every column label after the first four — presumably the numeric feature
# columns; verify against the CSV header.
contnames = train_features.columns[4:]

# Target tables; both are joined to the features on 'sig_id' further down.
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
# Template whose 'sig_id' column is kept when the submission file is built.
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
# Partition the feature columns by name prefix: 'g-' columns and 'c-' columns.
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [None]:
# Preview the first rows of the raw training features.
train_features.head()

# Quantile transformation (RankGauss)

In [None]:
# Apply the quantile transform to the 'g-' and 'c-' columns of both frames.
# The return value is discarded, so the helper presumably modifies
# train_features / test_features in place — confirm in moalib's rankgauss module.
quantile_transform_dataframe(train_features, test_features, GENES + CELLS)

In [None]:
# Preview the training features again after the quantile-transform call.
train_features.head()

# PCA

In [None]:
# Number of principal components requested for the 'g-' and 'c-' column groups.
n_comp_GENES = 463
n_comp_CELLS = 60
# Both frames are rebound to the helper's return values; presumably these are
# the input frames with the component columns added — confirm in moalib's pca module.
train_features, test_features = add_PC_to_dataframe(train_features, test_features, n_comp_GENES, n_comp_CELLS, GENES, CELLS)

In [None]:
# Preview the first five rows of the frame returned by add_PC_to_dataframe.
train_features.head(5)

# Feature selection

In [None]:
# Threshold handed to select_features; the name suggests a variance cut-off —
# confirm in moalib's featureselection module.
VarianceThreshold_for_FS = 0.9
# `slected_feats` (sic) is used later to index train_features.columns[4:];
# the spelling is kept because other cells reference this exact name.
train, test, target, target_cols, slected_feats = select_features(train_features, test_features, train_targets_scored, VarianceThreshold_for_FS)
#train, test, target, target_cols = select_original_or_PCA_features(train_features, test_features, train_targets_scored, False)

In [None]:
#target_cols = [x for x in train_targets_scored.columns if x != 'sig_id']
# Auxiliary targets: every column of the non-scored targets table except the id.
aux_target_cols = [x for x in train_targets_nonscored.columns if x != 'sig_id']
# Scored targets first, auxiliary targets after.
# Assumes target_cols is a list (list + list concatenation) — TODO confirm.
all_target_cols = target_cols + aux_target_cols

# Output sizes for the scored-only head, the auxiliary set, and the combined head.
num_targets = len(target_cols)
num_aux_targets = len(aux_target_cols)
num_all_targets = len(all_target_cols)

In [None]:
# Attach the non-scored target columns to the training frame, matching rows on
# 'sig_id'. pandas' default join is 'inner', so rows without a match are dropped.
train = train.merge(train_targets_nonscored, on='sig_id')

In [None]:
# Print the labels picked out by `slected_feats` from the columns after the
# first four. A plain loop is used instead of a list comprehension so the cell
# does not build, and the notebook does not echo, a throw-away list of None.
for f in train_features.columns[4:][slected_feats]:
    print(f)

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
# Build the cross-validation fold assignment. The training code later filters
# this data on a 'kfold' column after passing it through process_data;
# presumably get_folds is what adds that column — confirm in moalib's cvfolds module.
folds = get_folds(train, target)

In [None]:
# Sanity check: show the (rows, columns) shape of each frame used below.
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

In [None]:
# Model inputs are every column of the processed fold frame that is neither a
# target (scored or non-scored) nor a bookkeeping column ('kfold', 'sig_id').
# The exclusions are gathered in a set so each column costs one O(1) lookup,
# rather than two scans of Python lists; column order is preserved.
_non_feature_cols = set(all_target_cols) | {'kfold', 'sig_id'}
feature_cols = [c for c in process_data(folds).columns if c not in _non_feature_cols]
len(feature_cols)

# Model scaling

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """Feed-forward network: three weight-normalised linear layers with batch norm and dropout.

    Layout: BatchNorm -> Linear -> leaky ReLU -> BatchNorm -> Dropout -> Linear
    -> leaky ReLU -> BatchNorm -> Dropout -> Linear. The last layer returns raw
    scores; no activation is applied to them here.

    Args:
        num_features: width of the input vectors.
        num_targets: width of the output vectors.
        hidden_size: width of both hidden layers.
        Dropout_Model: dropout probability used before the 2nd and 3rd linear layers.
    """

    def __init__(self, num_features, num_targets, hidden_size, Dropout_Model):
        super().__init__()
        # Stored as single-element lists, exactly as before, in case other code
        # reads these attributes in that form.
        self.hidden_size = [hidden_size]
        self.dropout_value = [Dropout_Model]

        self.batch_norm_0 = nn.BatchNorm1d(num_features)
        self.dense_0 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

        self.batch_norm_1 = nn.BatchNorm1d(hidden_size)
        self.dropout_1 = nn.Dropout(Dropout_Model)
        self.dense_1 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))

        self.batch_norm_last = nn.BatchNorm1d(hidden_size)
        self.dropout_last = nn.Dropout(Dropout_Model)
        self.dense_last = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))

    @staticmethod
    def _without_nan(tensor):
        """Return a new tensor with NaN entries set to 0 and 1e-7 added to every entry."""
        cleaned = torch.where(torch.isnan(tensor), torch.zeros_like(tensor), tensor)
        return cleaned + 1e-7

    def recalibrate_layer(self, layer):
        """Repair NaN values in ``layer``'s weights so the next forward pass is finite.

        ``weight_v`` (the direction parameter created by ``weight_norm``) is
        repaired in place. Replacing it with a new ``nn.Parameter`` would leave
        any optimizer built earlier updating the old, orphaned tensor, so the
        repaired weights would silently stop training.

        Args:
            layer: a linear layer, normally one wrapped by ``nn.utils.weight_norm``.
        """
        weight_v = getattr(layer, 'weight_v', None)
        if weight_v is not None and torch.isnan(weight_v).any():
            print ('recalibrate layer.weight_v')
            with torch.no_grad():
                weight_v.copy_(self._without_nan(weight_v))

        if torch.isnan(layer.weight).any():
            print ('recalibrate layer.weight')
            if isinstance(layer.weight, nn.Parameter):
                # A registered parameter: keep the same object for the optimizer.
                with torch.no_grad():
                    layer.weight.copy_(self._without_nan(layer.weight))
            else:
                # Under weight_norm, ``weight`` is a derived plain tensor, so
                # rebinding the attribute is enough.
                layer.weight = self._without_nan(layer.weight)

    def forward(self, x):
        """Run the network on a batch ``x`` and return the raw output scores."""
        x = self.batch_norm_0(x)
        self.recalibrate_layer(self.dense_0)
        x = F.leaky_relu(self.dense_0(x))

        x = self.batch_norm_1(x)
        x = self.dropout_1(x)
        self.recalibrate_layer(self.dense_1)
        x = F.leaky_relu(self.dense_1(x))

        x = self.batch_norm_last(x)
        x = self.dropout_last(x)
        self.recalibrate_layer(self.dense_last)
        x = self.dense_last(x)

        return x

# Training

In [None]:
from seed import seed_everything
from preprocess import process_data
from datasets import MoADataset
from datasets import TestDataset
from labelsmoothing import SmoothBCEwLogits
from training import train_fn
from training import valid_fn
from training import inference_fn
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn

def run_training(fold, seed, folds, model, feature_cols, target_cols, target, BATCH_SIZE, DEVICE, LEARNING_RATE, WEIGHT_DECAY, EPOCHS, EARLY_STOPPING_STEPS, EARLY_STOP):
    """Train ``model`` with ``fold`` held out and return its out-of-fold predictions.

    Each time the validation loss improves, the weights are saved to
    ``FOLD{fold}_{seed}.pth`` and the validation predictions are stored.

    Args:
        fold: value of the 'kfold' column that marks the validation rows.
        seed: passed to ``seed_everything``; also part of the checkpoint name.
        folds: data passed through ``process_data``; the result must have a 'kfold' column.
        model: network to train; moved to ``DEVICE`` here.
        feature_cols: input column names.
        target_cols: label column names.
        target: table whose column count, minus the first column, sets the width
            of the returned array.
        BATCH_SIZE: batch size for both data loaders.
        DEVICE: device string handed to the model and the train/valid helpers.
        LEARNING_RATE: ``lr`` given to Adam. The OneCycleLR scheduler below is
            built with its own fixed max_lr=1e-2 and div_factor=1e3.
        WEIGHT_DECAY: Adam weight decay.
        EPOCHS: maximum number of epochs.
        EARLY_STOPPING_STEPS: number of non-improving epochs tolerated when
            ``EARLY_STOP`` is truthy.
        EARLY_STOP: enables the early-stopping counter.

    Returns:
        Array of shape (rows of processed data, number of targets). Only the
        validation rows are filled; all other rows stay zero.
    """
    seed_everything(seed)

    train = process_data(folds)

    # Compute the hold-out mask once and derive both partitions from it.
    is_valid = train['kfold'] == fold
    # Index labels in the full frame, used to place predictions in `oof`
    # (this relies on `train` having a default 0..n-1 index).
    val_idx = train[is_valid].index

    train_df = train[~is_valid].reset_index(drop=True)
    valid_df = train[is_valid].reset_index(drop=True)

    x_train, y_train = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[target_cols].values

    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                              max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))

    # Validation uses plain BCE; training uses the label-smoothed variant.
    loss_fn = nn.BCEWithLogitsLoss()
    loss_tr = SmoothBCEwLogits(smoothing=0.001)

    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0

    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf

    for epoch in range(EPOCHS):

        train_loss = train_fn(model, optimizer, scheduler, loss_tr, trainloader, DEVICE)
        print(f"FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}")
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(f"FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}")

        if valid_loss < best_loss:
            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(), f"FOLD{fold}_{seed}.pth")

        elif EARLY_STOP:
            # NOTE(review): the counter is never reset after an improvement, so
            # this stops after EARLY_STOPPING_STEPS non-improving epochs in
            # total, not in a row. Left unchanged; confirm that is intended.
            early_step += 1
            if early_step >= early_stopping_steps:
                break

    return oof



def run_predicition(fold, model, test, feature_cols, DEVICE, BATCH_SIZE, checkpoint_seed=None):
    """Load the checkpoint ``FOLD{fold}_{seed}.pth`` into ``model`` and predict on ``test``.

    Args:
        fold: fold number in the checkpoint file name.
        model: network whose architecture matches the saved state dict.
        test: test data; passed through ``process_data`` before use.
        feature_cols: input column names.
        DEVICE: device for the model and for mapping the loaded tensors.
        BATCH_SIZE: batch size for the test data loader.
        checkpoint_seed: seed in the checkpoint file name. When omitted, the
            module-level variable ``seed`` is read, as before, so existing
            callers keep working; a NameError is raised if it does not exist.

    Returns:
        Whatever ``inference_fn`` returns for the test loader.
    """
    # Name the seed explicitly when given; otherwise keep the original
    # reliance on the global `seed`.
    ckpt_seed = seed if checkpoint_seed is None else checkpoint_seed

    test_ = process_data(test)
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    # map_location lets a checkpoint written on a GPU be read on a CPU-only session.
    state_dict = torch.load(f"FOLD{fold}_{ckpt_seed}.pth", map_location=DEVICE)
    model.load_state_dict(state_dict)
    model.to(DEVICE)

    predictions = inference_fn(model, testloader, DEVICE)

    return predictions

In [None]:
def run_training(fold, seed):
    """Two-stage training for one fold: pre-train on all targets, then fine-tune on scored targets.

    Stage 1 ('ALL_TARGETS') trains a model on scored + non-scored targets.
    Stage 2 ('SCORED_ONLY') reloads the best stage-1 checkpoint, swaps the top
    layer through ``FineTuneScheduler.copy_without_top`` and trains on the
    scored targets only. Each stage saves its best weights to
    ``{tag_name}_FOLD{fold}_{seed}.pth``.

    Configuration is read from module-level names: ``folds``, ``feature_cols``,
    ``target_cols``, ``all_target_cols``, the model sizes, ``BATCH_SIZE``,
    ``EPOCHS``, ``DEVICE``, ``PCT_START`` and the per-stage dictionaries
    ``WEIGHT_DECAY``, ``MAX_LR`` and ``DIV_FACTOR``.

    Args:
        fold: value of the 'kfold' column that marks the validation rows.
        seed: passed to ``seed_everything``; also part of the checkpoint names.

    Returns:
        Out-of-fold predictions of the fine-tuned model: shape (rows of the
        processed data, number of scored targets), filled only on the
        validation rows.
    """
    seed_everything(seed)

    train = process_data(folds)

    # Compute the hold-out mask once and derive both partitions from it.
    is_valid = train['kfold'] == fold
    # Index labels in the full frame, used to place predictions in `oof`.
    val_idx = train[is_valid].index

    train_df = train[~is_valid].reset_index(drop=True)
    valid_df = train[is_valid].reset_index(drop=True)

    def build_all_targets_model():
        """Create a fresh network with the wide (scored + non-scored) output head."""
        return Model(
            num_features=num_features,
            num_targets=num_all_targets,
            hidden_size=hidden_size,
            Dropout_Model=Dropout_Model)

    def train_model(model, tag_name, target_cols_now, fine_tune_scheduler=None):
        """Train ``model`` on ``target_cols_now``; return OOF predictions for this stage."""
        x_train, y_train = train_df[feature_cols].values, train_df[target_cols_now].values
        x_valid, y_valid = valid_df[feature_cols].values, valid_df[target_cols_now].values

        train_dataset = MoADataset(x_train, y_train)
        valid_dataset = MoADataset(x_valid, y_valid)

        trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=WEIGHT_DECAY[tag_name])
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                                  steps_per_epoch=len(trainloader),
                                                  pct_start=PCT_START,
                                                  div_factor=DIV_FACTOR[tag_name],
                                                  max_lr=MAX_LR[tag_name],
                                                  epochs=EPOCHS)

        # Validation uses plain BCE; training uses the label-smoothed variant.
        loss_fn = nn.BCEWithLogitsLoss()
        loss_tr = SmoothBCEwLogits(smoothing=0.001)

        oof = np.zeros((len(train), len(target_cols_now)))
        best_loss = np.inf

        for epoch in range(EPOCHS):
            if fine_tune_scheduler is not None:
                fine_tune_scheduler.step(epoch, model)

            train_loss = train_fn(model, optimizer, scheduler, loss_tr, trainloader, DEVICE)
            valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
            print(f"SEED: {seed}, FOLD: {fold}, {tag_name}, EPOCH: {epoch}, train_loss: {train_loss:.6f}, valid_loss: {valid_loss:.6f}")

            # A NaN validation loss means training diverged; stop this stage
            # and keep the last good checkpoint.
            if np.isnan(valid_loss):
                break

            if valid_loss < best_loss:
                best_loss = valid_loss
                oof[val_idx] = valid_preds
                torch.save(model.state_dict(), f"{tag_name}_FOLD{fold}_{seed}.pth")

        return oof

    fine_tune_scheduler = FineTuneScheduler(EPOCHS)

    # The three models below are built in the original order so that the
    # random-number stream consumed by weight initialisation is unchanged.
    pretrained_model = build_all_targets_model()
    pretrained_model.to(DEVICE)

    # Train on scored + nonscored targets
    train_model(pretrained_model, 'ALL_TARGETS', all_target_cols)

    # Load the pretrained model with the best loss
    pretrained_model = build_all_targets_model()
    pretrained_model.load_state_dict(
        torch.load(f"ALL_TARGETS_FOLD{fold}_{seed}.pth", map_location=DEVICE))
    pretrained_model.to(DEVICE)

    model_new = build_all_targets_model()
    model_new.load_state_dict(pretrained_model.state_dict())
    # Copy model without the top layer
    final_model = fine_tune_scheduler.copy_without_top(pretrained_model, model_new, num_targets, DEVICE)

    # Fine-tune the model on scored targets only
    oof = train_model(final_model, 'SCORED_ONLY', target_cols, fine_tune_scheduler)

    return oof


def run_predicition(fold, model, test, feature_cols, DEVICE, BATCH_SIZE, checkpoint_seed=None):
    """Load ``SCORED_ONLY_FOLD{fold}_{seed}.pth`` into ``model`` and predict on ``test``.

    Args:
        fold: fold number in the checkpoint file name.
        model: network whose architecture matches the saved state dict.
        test: test data; passed through ``process_data`` before use.
        feature_cols: input column names.
        DEVICE: device for the model and for mapping the loaded tensors.
        BATCH_SIZE: batch size for the test data loader.
        checkpoint_seed: seed in the checkpoint file name. When omitted, the
            module-level variable ``seed`` is read, as before, so existing
            callers keep working; a NameError is raised if it does not exist.

    Returns:
        Whatever ``inference_fn`` returns for the test loader.
    """
    # Name the seed explicitly when given; otherwise keep the original
    # reliance on the global `seed`.
    ckpt_seed = seed if checkpoint_seed is None else checkpoint_seed

    test_ = process_data(test)
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    # map_location lets a checkpoint written on a GPU be read on a CPU-only session.
    state_dict = torch.load(f"SCORED_ONLY_FOLD{fold}_{ckpt_seed}.pth", map_location=DEVICE)
    model.load_state_dict(state_dict)
    model.to(DEVICE)

    predictions = inference_fn(model, testloader, DEVICE)

    return predictions

In [None]:
# Global settings read by the training / prediction code in this notebook.
# NOTE(review): a later cell reassigns EPOCHS (to 28) and WEIGHT_DECAY (to a
# per-stage dict); when cells run in order, those later values are the ones used.
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 25
BATCH_SIZE = 128
# LEARNING_RATE, EARLY_STOPPING_STEPS and EARLY_STOP are arguments of the
# 14-parameter run_training only; the two-argument version does not read them.
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
# Number of folds iterated per seed.
NFOLDS = 7
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False
# Dropout probability passed to Model(...).
Dropout_Model = 0.25

# Network dimensions passed to Model(...).
num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size = 1550

In [None]:
# Settings for the two-stage training (pre-train on all targets, then fine-tune
# on scored targets). This rebinds DEVICE, EPOCHS and BATCH_SIZE.
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 28
BATCH_SIZE = 128

# Per-stage optimizer / OneCycleLR values, looked up by the stage's tag_name
# ('ALL_TARGETS' or 'SCORED_ONLY') inside run_training.
WEIGHT_DECAY = {'ALL_TARGETS': 1e-5, 'SCORED_ONLY': 3e-6}
MAX_LR = {'ALL_TARGETS': 1e-2, 'SCORED_ONLY': 3e-3}
DIV_FACTOR = {'ALL_TARGETS': 1e3, 'SCORED_ONLY': 1e2}
# OneCycleLR pct_start, shared by both stages.
PCT_START = 0.1

In [None]:
import os.path
def has_model(fold, seed="", tag_name=""):
    """Return True if a checkpoint file for ``fold`` exists in the working directory.

    The file name follows the pattern used when checkpoints are saved:
    ``FOLD{fold}_{seed}.pth``, or ``{tag_name}_FOLD{fold}_{seed}.pth`` when a
    stage tag is given.

    Args:
        fold: fold number.
        seed: seed part of the file name. The default empty string reproduces
            the historical lookup of ``FOLD{fold}_.pth``.
        tag_name: optional stage prefix such as 'SCORED_ONLY' or 'ALL_TARGETS'.
    """
    prefix = f"{tag_name}_" if tag_name else ""
    return os.path.isfile(f"{prefix}FOLD{fold}_{seed}.pth")

In [None]:
import gc

In [None]:
def run_k_fold(NFOLDS, seed):
    """Train and predict on every fold for one seed.

    Reads ``train``, ``test``, ``target_cols``, ``feature_cols``, the model
    sizes, ``DEVICE`` and ``BATCH_SIZE`` from module scope.

    Args:
        NFOLDS: number of folds; folds 0 .. NFOLDS-1 are processed in order.
        seed: forwarded to ``run_training``. It is not forwarded to
            ``run_predicition``.

    Returns:
        (oof, predictions): the sum of the per-fold out-of-fold arrays, and the
        mean of the per-fold test predictions.
    """
    n_scored = len(target_cols)
    oof_sum = np.zeros((len(train), n_scored))
    test_mean = np.zeros((len(test), n_scored))

    for fold_id in range(NFOLDS):
        # Release memory held by the previous fold's objects before starting.
        gc.collect()

        fold_oof = run_training(fold_id, seed)

        # Fresh network with the scored-only head, to receive the checkpoint.
        predictor = Model(
            num_features=num_features,
            num_targets=num_targets,
            hidden_size=hidden_size,
            Dropout_Model=Dropout_Model,
        )
        fold_pred = run_predicition(fold_id, predictor, test, feature_cols, DEVICE, BATCH_SIZE)

        # Test predictions are averaged over folds; OOF arrays are summed.
        test_mean += fold_pred / NFOLDS
        oof_sum += fold_oof

    return oof_sum, test_mean
# Average the out-of-fold and test predictions over several seeds, then report
# the cross-validated log loss.

SEED = [0,1,2,3,4,5,6]


oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))
# The loop variable must stay named `seed` and live at module level:
# run_predicition is called without a seed argument and builds its checkpoint
# file name from this global.
for seed in SEED:
    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)


train[target_cols] = oof
test[target_cols] = predictions

# Align predictions with the full scored-target table by sig_id; ids that are
# absent from `train` get a prediction of 0 for every target.
valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

# Mean column-wise log loss. The sum has len(target_cols) terms, so that is the
# divisor; `target.shape[1]` would also count any non-target column of `target`
# and understate the score.
score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / len(target_cols)


print("hidden_size: ", hidden_size)
print("CV log_loss: ", score)

In [None]:
#train[target_cols] = oof
#test[target_cols] = predictions

# Evaluation

In [None]:
# Align predictions with the full scored-target table by sig_id; ids that are
# absent from `train` get a prediction of 0 for every target.
valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

# Mean column-wise log loss. The sum has len(target_cols) terms, so that is the
# divisor; `target.shape[1]` would also count any non-target column of `target`
# and understate the score.
score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / len(target_cols)

print("CV log_loss: ", score)

# Submission

In [None]:
# Start from sample_submission's non-target columns and attach the predictions
# by sig_id. The left join keeps every submission row; ids that are missing from
# `test` end up with 0 for every target.
sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index=False)