In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import os
import torch
import random
import torch.nn as nn

import torch.optim as optim

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.express as px

from collections import OrderedDict
from  torch.nn.utils import clip_grad_norm_
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer

In [None]:
def seed_everything(seed=0):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU plus all CUDA devices).

    Args:
        seed: Integer seed applied to every generator. Defaults to 0, which
            is the value the notebook has always used.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # The CUDA calls are kept explicit; PyTorch ignores them when no GPU is present.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


seed_everything()

In [None]:
# Read the train and test feature tables from the Kaggle input directory.
train_features_df = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
test_features_df = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')

# Group the model inputs by column-name prefix: 'g-' columns first, then 'c-' columns.
gcolumns = list(filter(lambda name: name.startswith('g-'), train_features_df.columns))
ccolumns = list(filter(lambda name: name.startswith('c-'), train_features_df.columns))
feature_columns = [*gcolumns, *ccolumns]

print('Number Of gene expression features:', len(gcolumns))
print('Number Of Cell Viable Features:', len(ccolumns))
print('Total Number Of features:', len(feature_columns))

In [None]:
# Drop the control-vehicle rows (cp_type == 'ctl_vehicle') from both tables.
train_features_df = train_features_df[train_features_df.cp_type != 'ctl_vehicle'].copy()
test_features_df = test_features_df[test_features_df.cp_type != 'ctl_vehicle'].copy()

# Stack train and test features into one table for the autoencoder.
# ignore_index=True gives the result a fresh 0..n-1 index: each CSV carries
# its own row labels, so without it the labels would repeat across the two
# halves and would no longer match row positions.
features_df = pd.concat(
    [train_features_df[feature_columns], test_features_df[feature_columns]],
    ignore_index=True,
)
print(features_df.shape)

In [None]:
class DAEDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(input, target)`` pairs for autoencoder training.

    Both elements of a pair hold the same feature row; any corruption of the
    input is left to the caller.

    Args:
        df: DataFrame of numeric features, one sample per row.
        alpha: Stored on the instance; not used by the dataset itself.
        phase: Stored on the instance; not used by the dataset itself.
    """

    def __init__(self, df, alpha=None, phase='train'):
        self.phase = phase
        # Fixed-seed shuffle of a private copy, so the row order is reproducible.
        self.df = df.copy().sample(frac=1.0, random_state=42)
        self.alpha = alpha
        # Convert once up front: a per-sample DataFrame.iloc lookup plus a
        # Series -> tensor conversion is far slower than indexing a tensor.
        # Later edits to ``self.df`` are therefore not reflected in the samples.
        self._features = torch.tensor(self.df.values, dtype=torch.float32)

    def __getitem__(self, idx):
        """Return ``(x_in, x)``: two independent float32 copies of row ``idx``."""
        x = self._features[idx].clone()
        x_in = x.clone()
        return (x_in, x)

    def __len__(self):
        """Number of rows in the dataset."""
        return self._features.shape[0]

In [None]:
# First cut: hold out 20% of all rows as the validation set.
X_train_full, X_val = train_test_split(features_df,
                                       test_size=0.2,
                                       shuffle=True,
                                       random_state=42)

# Second cut: hold out 20% of the remaining rows as the test set.
X_train, X_test = train_test_split(X_train_full,
                                   test_size=0.2,
                                   shuffle=True,
                                   random_state=42)

# Use the frames returned by train_test_split directly. Looking the rows up
# again with features_df.iloc[<index labels>] is only correct when every
# label equals its row position; otherwise it silently picks different rows
# and the three sets can overlap.
train_df = X_train.copy()
val_df = X_val.copy()
test_df = X_test.copy()

# The three sets must partition features_df.
assert len(train_df) + len(val_df) + len(test_df) == len(features_df)

print('Train :', train_df.shape)
print('Val Set:', val_df.shape)
print('Test Set:', test_df.shape)

In [None]:
# Hyper-parameters for the autoencoder and its training run.
# Input and output widths both equal the number of feature columns, because
# the network reconstructs its own input.
autoencoder_params = dict(
    epochs=200,
    batch_size=512,
    lr=0.001,
    weight_decay=1e-5,
    input_size=features_df.shape[1],
    output_size=features_df.shape[1],
    drop_out=0.5,
    code_size=1000,
    alpha=0.5,
)

In [None]:
class MOAAutoencoder(nn.Module):
    """Fully connected autoencoder over the feature columns.

    Layout: input -> 2000 -> code -> 2000 -> output. Every hidden Linear
    layer is followed by BatchNorm1d, Dropout and PReLU; the output layer is
    a plain Linear. Input/output widths, the code width and the dropout
    probability are read from the module-level ``autoencoder_params`` dict
    when an instance is constructed.

    Sub-module names inside ``self.autoencoder`` are kept stable so that
    state dicts saved earlier still load.
    """

    def __init__(self):
        super().__init__()
        hidden_size = 2000
        code_size = autoencoder_params['code_size']
        drop_p = autoencoder_params['drop_out']

        self.autoencoder = nn.Sequential(OrderedDict([
            ('encoder_layer1', nn.Linear(autoencoder_params['input_size'], hidden_size)),
            ('encoder_layer1_bn', nn.BatchNorm1d(hidden_size)),
            ('encoder_layer1_dropout', nn.Dropout(p=drop_p)),
            ('encoder_layer1_activation', nn.PReLU()),

            ('code_layer', nn.Linear(hidden_size, code_size)),
            ('code_layer_bn', nn.BatchNorm1d(code_size)),
            ('code_layer_dropout', nn.Dropout(p=drop_p)),
            ('code_layer_activation', nn.PReLU()),

            ('decoder_layer2', nn.Linear(code_size, hidden_size)),
            ('decoder_layer2_bn', nn.BatchNorm1d(hidden_size)),
            ('decoder_layer2_dropout', nn.Dropout(p=drop_p)),
            ('decoder_layer2_activation', nn.PReLU()),

            ('output_layer', nn.Linear(hidden_size, autoencoder_params['output_size']))
        ]))

    def forward(self, x):
        """Return the reconstruction of ``x``.

        Defined as ``forward`` rather than ``__call__``: overriding
        ``nn.Module.__call__`` skips the hook machinery that PyTorch wraps
        around ``forward``. ``model(x)`` keeps working exactly as before.
        """
        return self.autoencoder(x)


def train_one_epoch(train_dataloader, criterion, optimizer, scheduler, model,
                    num_swap_cols=175, max_grad_norm=1000):
    """Run one training epoch with swap-noise corruption of the inputs.

    For every batch, up to ``num_swap_cols`` randomly chosen feature columns
    of ``X`` are shuffled across the rows of the batch (in place), and the
    model is trained to reproduce the clean target ``y`` from that corrupted
    input. The scheduler is stepped once per batch.

    Args:
        train_dataloader: Sized iterable of ``(X, y)`` batches.
        criterion: Loss called as ``criterion(prediction, y)``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: LR scheduler stepped after every optimizer step.
        model: Network being trained.
        num_swap_cols: Number of columns corrupted per batch; all columns are
            corrupted when the batch has fewer than this. 0 disables the noise.
        max_grad_norm: Gradient-norm ceiling passed to ``clip_grad_norm_``.

    Returns:
        Mean batch loss over the epoch, as a Python float.
    """
    epoch_loss = 0.0
    model.train()
    for X, y in train_dataloader:
        # Swap noise: each selected column keeps its values but loses the
        # pairing between value and sample.
        for col_id in torch.randperm(X.shape[1])[:num_swap_cols].tolist():
            X[:, col_id] = X[torch.randperm(X.shape[0]), col_id]

        yout = model(X)
        loss_ = criterion(yout, y)

        optimizer.zero_grad()
        loss_.backward()
        clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()

        epoch_loss += loss_.item()
    epoch_loss /= len(train_dataloader)
    return epoch_loss

def validate(val_dataloader, criterion, model):
    """Compute the mean batch loss of ``model`` over ``val_dataloader``.

    The model is switched to eval mode and no gradients are tracked.

    Args:
        val_dataloader: Sized iterable of ``(X, y)`` batches.
        criterion: Loss called as ``criterion(prediction, y)``.
        model: Network to evaluate.

    Returns:
        Mean batch loss as a Python float, the same type that
        ``train_one_epoch`` reports, so the two can be printed, compared and
        plotted together.
    """
    val_loss = 0.0
    model.eval()
    with torch.no_grad():
        for X, y in val_dataloader:
            # .item() keeps the accumulator a plain float instead of a 0-d tensor.
            val_loss += criterion(model(X), y).item()
    val_loss /= len(val_dataloader)
    return val_loss

In [None]:
def train_model(model):
    """Train ``model`` on ``train_df`` and track the loss on ``val_df``.

    Uses Adam with a OneCycleLR schedule stepped per batch. Whenever the
    validation loss improves, the model's state dict is written to
    ``epoch<N>.pth`` in the working directory. The best score and epoch are
    printed at the end and the loss curves are plotted.

    Args:
        model: The network to train; it is updated in place.
    """
    print('Training the Model')

    best_score = np.inf
    best_epoch = -1
    train_losses_ = []
    val_losses_ = []

    # Seed once, before training starts. Re-seeding at the top of every epoch
    # would replay the same batch order, swap noise and dropout masks in each
    # epoch, so the model would only ever see one corruption of the data.
    seed_everything()

    # Datasets and loaders do not change between epochs, so build them once.
    train_dataset = DAEDataset(train_df, alpha=autoencoder_params['alpha'], phase='train')
    val_dataset = DAEDataset(val_df, phase='validation')
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=autoencoder_params['batch_size'],
        shuffle=True,
        pin_memory=True,
        num_workers=4,
    )
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=autoencoder_params['batch_size'],
        shuffle=False,
        pin_memory=True,
        num_workers=4,
    )

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.002, weight_decay=autoencoder_params['weight_decay'])
    # The schedule length must equal the number of scheduler.step() calls:
    # one per batch, for every epoch.
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer,
                                              max_lr=0.002,
                                              div_factor=100,
                                              epochs=autoencoder_params['epochs'],
                                              steps_per_epoch=len(train_dataloader)
                                              )

    for i in range(autoencoder_params['epochs']):
        epoch_loss = train_one_epoch(train_dataloader, criterion, optimizer, scheduler, model)
        # float() keeps the history and the best score as plain numbers,
        # whether validate() hands back a float or a 0-d tensor.
        val_loss = float(validate(val_dataloader, criterion, model))
        print('Epoch:{} | Epoch Train Loss:{} | Validation Loss: {}'.format(i+1, epoch_loss, val_loss))

        train_losses_.append(epoch_loss)
        val_losses_.append(val_loss)
        if val_loss < best_score:
            best_score = val_loss
            best_epoch = i+1
            torch.save(model.state_dict(), "epoch{}.pth".format(i+1))
    print('Best SCore:{} | Best Epoch:{}'.format(best_score, best_epoch))

    # The first five epochs are left out of the curves.
    plt.plot(train_losses_[5:], label='train')
    plt.plot(val_losses_[5:], label='validation')
    plt.xlabel('epoch (offset by 5)')
    plt.ylabel('MSE loss')
    plt.legend()
    plt.show()

In [None]:
# Build a freshly initialised autoencoder and run the full training loop on it.
autoencoder=MOAAutoencoder()
train_model(autoencoder)

In [None]:
# Learning-rate range test: train a throwaway model while multiplying the
# learning rate by `gamma` after every batch, recording (lr, loss) pairs
# until the rate reaches `max_lr` or five epochs have passed.
#
# The probe uses its own model object so that running this cell does not
# rebind `autoencoder`.
lr_finder_model = MOAAutoencoder()
criterion = nn.MSELoss()
init_lr = 0.0001
max_lr = 0.1
gamma = 1.05

optimizer = optim.Adam(lr_finder_model.parameters(), lr=init_lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=gamma)

lrs = []
batch_loss = []

train_dataset = DAEDataset(train_df, alpha=autoencoder_params['alpha'], phase='train')
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=autoencoder_params['batch_size'],
    shuffle=True,
    pin_memory=True,
    num_workers=4,
)

lr_finder_model.train()
reached_max_lr = False
for _ in range(5):
    for X, y in train_dataloader:
        # Read the rate the optimizer will actually use for this step.
        current_lr = optimizer.param_groups[0]['lr']
        if current_lr >= max_lr:
            reached_max_lr = True
            break

        optimizer.zero_grad()
        ypred = lr_finder_model(X)
        loss_ = criterion(ypred, y)
        loss_.backward()

        optimizer.step()
        scheduler.step()

        lrs.append(current_lr)
        batch_loss.append(loss_.item())
    # Leave the epoch loop as well; otherwise every remaining epoch would
    # start a new pass over the data just to stop again on its first batch.
    if reached_max_lr:
        break

In [None]:
# Number of recorded steps, then the last learning rate and its batch loss.
print(len(lrs))
print(lrs[-1], batch_loss[-1])

# The recorded learning rates grow by a constant factor per step, so a
# logarithmic x-axis spreads them evenly instead of crowding them near zero.
plt.plot(lrs, batch_loss)
plt.xscale('log')
plt.xlabel('learning rate')
plt.ylabel('batch training loss')
plt.title('Learning-rate range test')
plt.show()

In [None]:
# Score the model currently bound to `autoencoder` on the test split.
# NOTE(review): confirm `autoencoder` is the trained model at this point and
# not a model created by a later experiment cell.
criterion = nn.MSELoss()
test_dataset = DAEDataset(test_df, phase='test')
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=autoencoder_params['batch_size'],
    shuffle=False,
    pin_memory=True,
    num_workers=4,
)
test_loss = validate(test_dataloader, criterion, autoencoder)

print(test_loss)

In [None]:
# Preview the first rows of the test split.
test_df.head()