In [None]:
# V2 : apply Rank Gauss, del controlled
# V3 : apply Rank Gauss only
# V5 : True Rank Gauss

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import log_loss, confusion_matrix, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic, non-autotuned kernel selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only influences Python processes started after this point; the hash
    # seed of the already-running interpreter cannot be changed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every GPU, not just the current one. Safe to call without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different convolution algorithms from run to run,
    # which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

In [None]:
# Read the competition CSV files from ../input/lish-moa.
# NOTE(review): train_targets_scored, train_targets_nonscored and
# sample_submission are not referenced again in the visible part of this
# notebook — confirm whether a later cell needs them.
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
# Split the training feature columns by name prefix, keeping the column order
# of train_features: 'g-' columns go to GENES, 'c-' columns go to CELLS.
_all_columns = train_features.columns.tolist()
GENES = list(filter(lambda name: name.startswith('g-'), _all_columns))
CELLS = list(filter(lambda name: name.startswith('c-'), _all_columns))

In [None]:
import numpy as np
from scipy.special import erfinv as sp_erfinv


def rank_gauss(data, columns=None, epsilon=1e-6):
    """Rank-Gauss transform the selected columns of ``data`` in place.

    For every column: take the ordinal rank of each row, rescale the ranks
    linearly to [-1, 1], clip them to [-1 + epsilon, 1 - epsilon] and map them
    through ``sqrt(2) * erfinv``.

    Ranks come from a double ``argsort``, so tied values receive distinct
    consecutive ranks instead of a shared one.

    Args:
        data: DataFrame containing the columns to transform. It is modified
            in place.
        columns: Names of the columns to transform. Defaults to
            ``GENES + CELLS``.
        epsilon: Margin kept away from -1 and 1 so that ``erfinv`` stays
            finite.

    Returns:
        The same ``data`` object that was passed in.
    """
    if columns is None:
        columns = GENES + CELLS

    for k in tqdm(columns):
        ranks = data.loc[:, k].argsort().argsort()
        top_rank = ranks.max()
        if not top_rank > 0:
            # Zero or one row: there is no ordering to encode, and dividing
            # by a top rank of 0 would fill the column with NaN.
            data.loc[:, k] = 0.0
            continue
        scaled = (ranks / top_rank - 0.5) * 2
        scaled = np.clip(scaled, -1 + epsilon, 1 - epsilon)
        data.loc[:, k] = sp_erfinv(scaled) * np.sqrt(2)
    return data


# rank_gauss writes the transformed columns back into the frame it receives and
# returns that same frame, so each assignment rebinds the name to the same object.
# Train and test are transformed separately: ranks are computed within each
# frame's own rows.
train_features = rank_gauss(train_features)
test_features = rank_gauss(test_features)

In [None]:
# Autoencoder inputs: the gene and cell columns of every train / test row.
# NOTE(review): rows are not filtered on cp_type here; an earlier variant that
# dropped 'ctl_vehicle' rows (with reset_index(drop=True)) was disabled —
# confirm that keeping those rows is intended.
data_train = train_features.loc[:, GENES + CELLS]
data_test = test_features.loc[:, GENES + CELLS]

In [None]:
# Display the number of cell ('c-') and gene ('g-') feature columns, in that order.
len(CELLS), len(GENES)

In [None]:
class Config:
    """Hyper-parameters shared by the notebook's training cells."""
    # NOTE(review): not referenced in the visible part of this notebook.
    NUM_FOLDS = 5
    # NOTE(review): not referenced in the visible part of this notebook;
    # seed_everything is called with 42, not with this value.
    SEED = 718
    # Batch size of the shuffled training DataLoaders.
    TRAIN_BATCH_SIZE = 128
    # Batch size of the validation and reconstruction DataLoaders.
    VALID_BATCH_SIZE = 128
    # Upper bound on training epochs; the training loops may stop earlier.
    EPOCHS = 100
    

class AverageMeter(object):
    """Keeps the most recent value of a metric and its running weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the last value, mean, weighted sum and sample count back to 0."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed over ``n`` samples and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
        
class DenoisingAutoEncoder(nn.Module):
    """Fully connected autoencoder with a three-layer encoder and a mirrored decoder.

    The encoder narrows the input to ``hidden_channels``, then
    ``hidden_channels // 2``, then ``hidden_channels // 4``, each followed by
    an in-place ReLU. The decoder widens back through the same sizes and ends
    in a plain linear layer with no output activation.

    The module itself adds no noise to its input; ``forward`` only applies the
    encoder and then the decoder.

    Args:
        in_out_channels: Width of both the input and the reconstruction.
        hidden_channels: Width of the first encoder layer.
    """

    def __init__(self, in_out_channels, hidden_channels):
        super().__init__()
        half_width = hidden_channels // 2
        bottleneck_width = hidden_channels // 4

        # Encoder layers are built before decoder layers so the parameters are
        # created, and therefore randomly initialised, in the same order.
        self.encoder = nn.Sequential(
            nn.Linear(in_out_channels, hidden_channels),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_channels, half_width),
            nn.ReLU(inplace=True),
            nn.Linear(half_width, bottleneck_width),
            nn.ReLU(inplace=True),
        )

        self.decoder = nn.Sequential(
            nn.Linear(bottleneck_width, half_width),
            nn.ReLU(inplace=True),
            nn.Linear(half_width, hidden_channels),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_channels, in_out_channels),
        )

    def forward(self, x):
        """Encode ``x`` to the bottleneck and decode it back to the input width."""
        return self.decoder(self.encoder(x))
        
        
class DAEDataset:
    """Map-style dataset yielding float32 tensors for the autoencoder.

    Args:
        ids: Indexable collection of input rows.
        ys: Optional indexable collection of target rows, indexed in parallel
            with ``ids``.
    """

    def __init__(self, ids, ys=None):
        self.ids = ids
        self.ys = ys

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, item):
        """Return ``{'ids': ...}``, plus ``'targets'`` when ``ys`` was given."""
        sample = {'ids': torch.tensor(self.ids[item], dtype=torch.float32)}
        if self.ys is None:
            return sample
        sample['targets'] = torch.tensor(self.ys[item], dtype=torch.float32)
        return sample
        
        
def loss_fn(logits, targets):
    """Return the mean squared error between ``logits`` and ``targets``."""
    return nn.MSELoss()(logits, targets)


def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch's ``"ids"`` tensor is fed to the model and the output is scored
    against the batch's ``"targets"`` tensor with ``loss_fn``. This matches
    ``test_fn``, which also feeds ``"ids"``.

    Args:
        data_loader: Iterable of batches, each a dict holding ``"ids"`` and
            ``"targets"`` tensors.
        model: Module to optimise; switched to training mode.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Unused; kept so existing call sites keep working.

    Returns:
        float: Mean training loss over all samples seen, with batch losses
        weighted by batch size.
    """
    model.train()
    losses = AverageMeter()
    tk0 = tqdm(data_loader, total=len(data_loader))

    for d in tk0:
        ids = d["ids"].to(device, dtype=torch.float32)
        targets = d["targets"].to(device, dtype=torch.float32)

        model.zero_grad()

        outputs = model(ids)

        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        # Only the scalar loss is tracked; per-batch outputs are not retained,
        # so memory use does not grow with the number of batches.
        losses.update(loss.item(), targets.size(0))
        tk0.set_postfix(loss=losses.avg)

    return losses.avg


def valid_fn(data_loader, model, device, scheduler=None):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Each batch's ``"ids"`` tensor is fed to the model and the output is scored
    against the batch's ``"targets"`` tensor with ``loss_fn``.

    Args:
        data_loader: Iterable of batches, each a dict holding ``"ids"`` and
            ``"targets"`` tensors.
        model: Module to evaluate; switched to eval mode.
        device: Device the batch tensors are moved to.
        scheduler: Unused; kept so existing call sites keep working.

    Returns:
        float: Mean loss over all samples seen, with batch losses weighted by
        batch size. This is the epoch average, not the loss of the final
        batch.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()
    losses = AverageMeter()
    tk0 = tqdm(data_loader, total=len(data_loader))

    with torch.no_grad():
        for d in tk0:
            ids = d["ids"].to(device, dtype=torch.float32)
            targets = d["targets"].to(device, dtype=torch.float32)

            outputs = model(ids)
            loss = loss_fn(outputs, targets)

            losses.update(loss.item(), targets.size(0))
            tk0.set_postfix(loss=losses.avg)

    if losses.count == 0:
        # An average of 0 here would look like a perfect score to the caller.
        raise ValueError("valid_fn received an empty data_loader")

    return losses.avg


def test_fn(data_loader, model, device):
    """Run ``model`` over ``data_loader`` and return all outputs as one array.

    Args:
        data_loader: Iterable of batches, each a dict holding an ``"ids"`` tensor.
        model: Module to run; switched to eval mode.
        device: Device the batch tensors are moved to.

    Returns:
        NumPy array of the per-batch model outputs concatenated along axis 0.
    """
    model.eval()

    batch_outputs = []
    with torch.no_grad():
        progress = tqdm(data_loader, total=len(data_loader))
        for batch in progress:
            features = batch["ids"].to(device, dtype=torch.float32)
            reconstruction = model(features)
            batch_outputs.append(reconstruction.cpu().detach().numpy())

    return np.concatenate(batch_outputs, 0)

In [None]:
# GENES

In [None]:
# Gene autoencoder: inputs and reconstruction targets are the same matrix.
genes_train_values = data_train[GENES].values
train_dataset = DAEDataset(ids=genes_train_values, ys=genes_train_values)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=Config.TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
)

# NOTE(review): the validation set is built from data_test (the test features).
genes_test_values = data_test[GENES].values
val_dataset = DAEDataset(ids=genes_test_values, ys=genes_test_values)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=Config.VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

model = DenoisingAutoEncoder(len(GENES), 512).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): T_max=30 while training may run for up to Config.EPOCHS epochs —
# confirm the learning-rate behaviour past epoch 30 is intended.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=30, eta_min=1e-6
)

In [None]:
# Early-stopping bookkeeping. p is reset to 0 on every improvement and then
# incremented at the end of the same epoch, so training stops after `patience`
# consecutive epochs without a new best validation loss.
p = 0
# Infinity guarantees the first finite validation loss is checkpointed, so the
# checkpoint file always exists when it is loaded afterwards.
min_loss = float("inf")
patience = 10

for epoch in range(1, Config.EPOCHS + 1):

    print("Starting {} epoch...".format(epoch))

    train_fn(train_loader, model, optimizer, device)
    # Store a plain float so min_loss never keeps a device tensor alive and
    # prints as a number.
    val_loss = float(valid_fn(val_loader, model, device))
    scheduler.step()

    if val_loss < min_loss:
        min_loss = val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), 'GENES_Reconstruction_model.pth')
        print("save model at min valid loss={} on epoch={}".format(min_loss, best_epoch))
        p = 0

    if p > 0:
        print(f'val loss is not updated while {p} epochs of training')
    p += 1
    if p > patience:
        print('Early Stopping')
        break

In [None]:
# List the working directory to confirm the checkpoint file was written.
# Uses os.listdir rather than a bare `ls`, which only works through IPython automagic.
sorted(os.listdir('.'))

In [None]:
# Reconstruct the gene features of train and test with the best checkpoint.
ae_genes_train_dataset = DAEDataset(
            ids=data_train[GENES].values,
        )

ae_genes_test_dataset = DAEDataset(
            ids=data_test[GENES].values,
        )

# shuffle=False keeps the reconstructions row-aligned with data_train / data_test.
ae_genes_train_loader = torch.utils.data.DataLoader(
            ae_genes_train_dataset,
            shuffle=False,
            batch_size=Config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True
        )

ae_genes_test_loader = torch.utils.data.DataLoader(
            ae_genes_test_dataset,
            shuffle=False,
            batch_size=Config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True
        )

# map_location keeps the load working when the checkpoint was written on a
# different device than the one currently in use (e.g. saved on GPU, loaded on CPU).
model.load_state_dict(torch.load('GENES_Reconstruction_model.pth', map_location=device))
ae_genes_train_reconstruction = test_fn(ae_genes_train_loader, model, device)
ae_genes_test_reconstruction = test_fn(ae_genes_test_loader, model, device)

print(ae_genes_train_reconstruction.shape)
print(ae_genes_test_reconstruction.shape)

autoencoder_error = mean_squared_error(data_test[GENES].values, ae_genes_test_reconstruction)
print("GENES reconstruction error is " + str(autoencoder_error))

In [None]:
# CELLS

In [None]:
# Cell autoencoder: inputs and reconstruction targets are the same matrix.
cells_train_values = data_train[CELLS].values
train_dataset = DAEDataset(ids=cells_train_values, ys=cells_train_values)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=Config.TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
)

# NOTE(review): the validation set is built from data_test (the test features).
cells_test_values = data_test[CELLS].values
val_dataset = DAEDataset(ids=cells_test_values, ys=cells_test_values)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=Config.VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

model = DenoisingAutoEncoder(len(CELLS), 64).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): T_max=30 while training may run for up to Config.EPOCHS epochs —
# confirm the learning-rate behaviour past epoch 30 is intended.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=30, eta_min=1e-6
)

In [None]:
# Early-stopping bookkeeping. p is reset to 0 on every improvement and then
# incremented at the end of the same epoch, so training stops after `patience`
# consecutive epochs without a new best validation loss.
p = 0
# Infinity guarantees the first finite validation loss is checkpointed, so the
# checkpoint file always exists when it is loaded afterwards.
min_loss = float("inf")
patience = 10

for epoch in range(1, Config.EPOCHS + 1):

    print("Starting {} epoch...".format(epoch))

    train_fn(train_loader, model, optimizer, device)
    # Store a plain float so min_loss never keeps a device tensor alive and
    # prints as a number.
    val_loss = float(valid_fn(val_loader, model, device))
    scheduler.step()

    if val_loss < min_loss:
        min_loss = val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), 'CELLS_Reconstruction_model.pth')
        print("save model at min valid loss={} on epoch={}".format(min_loss, best_epoch))
        p = 0

    if p > 0:
        print(f'val loss is not updated while {p} epochs of training')
    p += 1
    if p > patience:
        print('Early Stopping')
        break

In [None]:
# Reconstruct the cell features of train and test with the best checkpoint.
ae_cells_train_dataset = DAEDataset(
            ids=data_train[CELLS].values,
        )

ae_cells_test_dataset = DAEDataset(
            ids=data_test[CELLS].values,
        )

# shuffle=False keeps the reconstructions row-aligned with data_train / data_test.
ae_cells_train_loader = torch.utils.data.DataLoader(
            ae_cells_train_dataset,
            shuffle=False,
            batch_size=Config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True
        )

ae_cells_test_loader = torch.utils.data.DataLoader(
            ae_cells_test_dataset,
            shuffle=False,
            batch_size=Config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True
        )

# map_location keeps the load working when the checkpoint was written on a
# different device than the one currently in use (e.g. saved on GPU, loaded on CPU).
model.load_state_dict(torch.load('CELLS_Reconstruction_model.pth', map_location=device))
ae_cells_train_reconstruction = test_fn(ae_cells_train_loader, model, device)
ae_cells_test_reconstruction = test_fn(ae_cells_test_loader, model, device)

print(ae_cells_train_reconstruction.shape)
print(ae_cells_test_reconstruction.shape)

autoencoder_error = mean_squared_error(data_test[CELLS].values, ae_cells_test_reconstruction)
print("CELLS reconstruction error is " + str(autoencoder_error))

In [None]:
# List the working directory to confirm both checkpoint files were written.
# Uses os.listdir rather than a bare `ls`, which only works through IPython automagic.
sorted(os.listdir('.'))