In [1]:
import os
GPU_id = 0
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)
    
from mpnn_model.common import * 
from mpnn_model.common_constants import * 
from mpnn_model.dataset import TensorBatchDataset, BatchDataBunch, BatchDataLoader
from mpnn_model.data_collate import tensor_collate_rnn
from mpnn_model.GaussRank import GaussRankMap
from mpnn_model.helpers import load_cfg
from mpnn_model.model import Net 
from mpnn_model.train_loss import train_criterion, lmae_criterion
from mpnn_model.callback import get_reverse_frame, lmae, LMAE
from mpnn_model.radam import * 
from mpnn_model.build_predictions import do_test 
from mpnn_model.helpers import * 

# Fast ai
from fastai.tabular import *
from fastai.basic_data import DataBunch
from fastai.basic_data import *
from fastai.callbacks import SaveModelCallback
from fastai import *


import cudf as gd
import numpy as np 
import pandas as pd 

from torch.utils.dlpack import from_dlpack
import torch
from torch import _utils
from fastai.torch_core import to_device
import torch.nn.functional as F 

from timeit import default_timer as timer
from datetime import datetime
from time import time 
from functools import partial
import glob 
import warnings
warnings.filterwarnings("ignore") 

matplotlib.get_backend :  module://ipykernel.pylab.backend_inline


In [2]:
from scripts.train_type import *
from mpnn_model.common_constants import COUPLING_TYPE

In [3]:
model_dict = { '1JHC': 'lmae', '2JHC': 'lmae', '3JHC': 'lmae', '3JHH': 'lmae',
             '1JHN': 'mlmae' , '2JHN':'mlmae' , '3JHN':'mlmae', '2JHH':'mlmae'}

In [4]:
COUPLING_TYPE

['1JHC', '2JHC', '3JHC', '1JHN', '2JHN', '3JHN', '2JHH', '3JHH']

In [5]:
NUM_TARGET =  1   

In [6]:
cfg ='/rapids/notebooks/srabhi/champs-2019/CherKeng_solution/fastai_code/experiments/MPNN_RNN_MAE_WO_GAUSSRANK_SINGLE_TYPE.yaml'
fold = 1
type_='3JHH'
COUPLING_MAX = COUPLING_MAX_DICT[type_]

In [7]:
cfg = load_cfg(cfg)
DATA_DIR = cfg['dataset']['input_path']
normalize = cfg['dataset']['normalize']
gaussrank=  cfg['dataset']['gaussrank']
model_name = cfg['train']['model_name']  
model_name = model_name+ '_fold_%s' %fold 
batch_size = cfg['train']['batch_size']
predict_type = cfg['train']['predict_type']
loss_name = cfg['train']['loss_name']
predict_type = cfg['model']['regression']['predict_type']
epochs = cfg['train']['epochs']
max_lr = cfg['train']['max_lr']
device = cfg['train']['device']

<h1> Dataset </h1>

In [8]:
%%time
test= pd.read_csv(DATA_DIR+'/csv/test.csv')
id_test = test.id.values
mol_test = test.molecule_name.values

print('\n Load Train/Validation features for fold %s' %fold)
validation = gd.read_parquet(DATA_DIR +'/rnn_parquet/fold_%s/%s/validation.parquet'%(fold, type_))
train = gd.read_parquet(DATA_DIR +'/rnn_parquet/fold_%s/%s/train.parquet' %(fold, type_))

print('\n Get In-memory Tensor ')

# Convert train to tensors 
num_nodes_tensor = from_dlpack(train['num_nodes'].to_dlpack()).long()
num_edges_tensor = from_dlpack(train['num_edge'].to_dlpack()).long()
num_coupling_tensor = from_dlpack(train['num_coupling'].to_dlpack()).long()

node_cols = [i for i in train.columns if re.compile("^node_[0-9]+").findall(i)]
nodes_matrix = from_dlpack(train[node_cols].to_dlpack()).type(torch.float32)

edge_cols = [i for i in train.columns if re.compile("^edge_[0-9]+").findall(i)]
edges_matrix = from_dlpack(train[edge_cols].to_dlpack()).type(torch.float32)

coupling_cols = [i for i in train.columns if re.compile("^coupling_[0-9]+").findall(i)]
coupling_matrix = from_dlpack(train[coupling_cols].to_dlpack()).type(torch.float32)

mol_train = train.molecule_name.unique().to_pandas().values
train_dataset = TensorBatchDataset(mol_train, 
                                   tensors=[nodes_matrix, edges_matrix, coupling_matrix,
                                            num_nodes_tensor, num_edges_tensor, num_coupling_tensor], 
                                    batch_size=batch_size,
                                   collate_fn=tensor_collate_rnn,
                                   COUPLING_MAX=COUPLING_MAX,
                                    mode='train',
                                    csv='train')
# convert validation to tensors 
num_nodes_tensor = from_dlpack(validation['num_nodes'].to_dlpack()).long()
num_edges_tensor = from_dlpack(validation['num_edge'].to_dlpack()).long()
num_coupling_tensor = from_dlpack(validation['num_coupling'].to_dlpack()).long()

node_cols = [i for i in validation.columns if re.compile("^node_[0-9]+").findall(i)]
nodes_matrix = from_dlpack(validation[node_cols].to_dlpack()).type(torch.float32)

edge_cols = [i for i in validation.columns if re.compile("^edge_[0-9]+").findall(i)]
edges_matrix = from_dlpack(validation[edge_cols].to_dlpack()).type(torch.float32)

coupling_cols = [i for i in validation.columns if re.compile("^coupling_[0-9]+").findall(i)]
coupling_matrix = from_dlpack(validation[coupling_cols].to_dlpack()).type(torch.float32)


mol_valid = validation.molecule_name.unique().to_pandas().values
valid_dataset = TensorBatchDataset(mol_valid, 
                                   tensors=[nodes_matrix, edges_matrix, coupling_matrix,
                                            num_nodes_tensor, num_edges_tensor, num_coupling_tensor], 
                                    batch_size=batch_size,
                                   collate_fn=tensor_collate_rnn,
                                   COUPLING_MAX=COUPLING_MAX,
                                    mode='train',
                                    csv='train')

del train 
del validation 

data = BatchDataBunch.create(train_dataset, valid_dataset, device=device, bs=batch_size)


 Load Train/Validation features for fold 1

 Get In-memory Tensor 
CPU times: user 30.6 s, sys: 13.1 s, total: 43.7 s
Wall time: 1min 35s


# Model 

In [9]:
pretrain_model = model_dict[type_]
freeze_cycle = 1
unfreeze_cycle = 1 

In [11]:
if not gaussrank: 
    net = torch.load('pre_trained_models/coupling_%s_%s_fold_%s_wo_gaussrank.pth'%(type_, pretrain_model, fold))
else: 
    net = torch.load('pre_trained_models/coupling_%s_%s_fold_%s_gaussrank.pth'%(type_, pretrain_model, fold))
        
# load grm : 
data_dir = DATA_DIR + '/rnn_parquet'
file = glob.glob(data_dir+'/fold_%s/'%fold+'%s/*.csv'%type_)[0]     
coupling_order = [type_]
mapping_frames = [pd.read_csv(file)]  
grm = GaussRankMap(mapping_frames, coupling_order)


############################------------- Fine tune training ---------------################################
optal = partial(RAdam)
learn =  Learner(data,
                 net,
                 metrics=None,
                 opt_func=optal,
                 callback_fns=partial(LMAE,
                                    grm=grm,
                                    predict_type=predict_type,
                                    normalize_coupling=normalize, 
                                    coupling_rank=gaussrank))

learn.loss_func = lmae_criterion

learn.split([[learn.model.preprocess,learn.model.message_function, learn.model.update_function, learn.model.readout],
             [learn.model.rnn_attention],[learn.model.dense_layer, learn.model.predict]])

learn.lr_range(slice(1e-3))

learn.freeze()
learn.fit_one_cycle(freeze_cycle, callbacks=[SaveModelCallback(learn,
                                                 every='improvement',
                                                 monitor='LMAE', 
                                                 name=cfg['train']['model_name']+'_fold_%s_frozen_type_%s_'%(fold, type_),
                                                 mode='min')])

learn.unfreeze()
learn.fit_one_cycle(unfreeze_cycle, max_lr=max_lr, callbacks=[SaveModelCallback(learn,
                                                 every='improvement',
                                                 monitor='LMAE', 
                                                 name=cfg['train']['model_name']+'_fold_%s_pretrained_%s_'%(fold, type_),
                                                 mode='min')])

epoch,train_loss,valid_loss,LMAE,time
0,0.78377,0.739121,0.760839,01:27


Better model found at epoch 0 with LMAE value: 0.7608386278152466.


epoch,train_loss,valid_loss,LMAE,time
0,-0.165126,-0.24794,-0.225796,02:40


Better model found at epoch 0 with LMAE value: -0.22579647600650787.


<h1> Predictions </h1>

In [30]:
valid_dataset = TensorBatchDataset(mol_valid, 
                                tensors=[nodes_matrix, edges_matrix, coupling_matrix,
                                        num_nodes_tensor, num_edges_tensor, num_coupling_tensor], 
                                batch_size=batch_size,
                                collate_fn=tensor_collate_rnn,
                                COUPLING_MAX=COUPLING_MAX,
                                mode='test',
                                csv='train')

valid_loader = BatchDataLoader(valid_dataset, 
                               shuffle=False, 
                               pin_memory=False, 
                               drop_last=False, 
                               device='cuda')

print('\n Compute predictions for validation data at fold %s\n' %fold)  
valid_loss, reverse_frame, contributions, molecule_representation = do_test(learn.model,
                                                                       valid_loader,
                                                                       1,
                                                                       1,
                                                                       predict_type,
                                                                       grm,
                                                                       normalize=normalize,
                                                                       gaussrank=gaussrank)


val_loss = valid_loss[-3]
print('\nValidation loss is : %s' %val_loss)

print('\nSave model to disk')
torch.save(learn.model, 'models/' + cfg['train']['model_name'] + '_fold_%s_final_save.pth'%fold)

print('load test data')
torch.cuda.empty_cache()
test = gd.read_parquet(DATA_DIR +'/rnn_parquet/test_%s.parquet'%type_)
num_nodes_tensor = from_dlpack(test['num_nodes'].to_dlpack())
num_edges_tensor = from_dlpack(test['num_edge'].to_dlpack())
num_coupling_tensor = from_dlpack(test['num_coupling'].to_dlpack())
node_cols = [i for i in test.columns if re.compile("^node_[0-9]+").findall(i)]
nodes_matrix = from_dlpack(test[node_cols].to_dlpack())
nodes_matrix = from_dlpack(test[node_cols].to_dlpack()).type(torch.float32)
edge_cols = [i for i in test.columns if re.compile("^edge_[0-9]+").findall(i)]
edges_matrix = from_dlpack(test[edge_cols].to_dlpack()).type(torch.float32)
coupling_cols = [i for i in test.columns if re.compile("^coupling_[0-9]+").findall(i)]
coupling_matrix = from_dlpack(test[coupling_cols].to_dlpack()).type(torch.float32)

mol_test  = test.molecule_name.unique().to_pandas().values
#batch_node, batch_edge, batch_coupling, batch_graussrank, batch_num_node, batch_num_edge, batch_num_coupling
del test

test_dataset = TensorBatchDataset(mol_test, 
                                tensors=[nodes_matrix, edges_matrix, coupling_matrix,
                                         num_nodes_tensor, num_edges_tensor, num_coupling_tensor], 
                                batch_size=batch_size,
                                collate_fn=tensor_collate_rnn,
                                COUPLING_MAX=COUPLING_MAX,
                                mode='test',
                                csv='test')

test_loader = BatchDataLoader(test_dataset, 
                               shuffle=False, 
                               pin_memory=False, 
                               drop_last=False, 
                               device='cuda')

print('\n Compute predictions for test data at fold %s\n' %fold)
test_loss, preds_fold_test, contributions, molecule_representation = do_test(learn.model,
                                                                       valid_loader,
                                                                       1,
                                                                       1,
                                                                       predict_type,
                                                                       grm,
                                                                       normalize=normalize,
                                                                       gaussrank=gaussrank)


 Compute predictions for validation data at fold 1

   147689/       1     147689.00   0 hr 00 min

predict
build preds frame
Compute lmae per type

Validation loss is : -0.2508597426602501

Save model to disk
load test data

 Compute predictions for test data at fold 1

   147689/       1     147689.00   0 hr 00 min

predict
build preds frame
Compute lmae per type


In [31]:
reverse_frame.head(4)

Unnamed: 0,scalar_coupling_constant,type_ind,id,true_scalar_coupling_constant
0,11.331904,7,111.0,3.42929
1,8.175097,7,112.0,12.3307
2,9.798114,7,117.0,12.3318
3,4.748343,7,118.0,3.42409


# Save predictions 

In [34]:
OUT_DIR = cfg['dataset']['output_path']
num_output = cfg['model']['regression']['num_output']
if num_output == 1:
    out_dir = OUT_DIR + '/submit/scalar_output/'
    # init preditions arrays 
    pred_cv = np.zeros( cfg['train']['train_shape'])
    pred_sub = np.zeros(cfg['train']['test_shape'])

elif num_output == 5:
    out_dir = OUT_DIR + '/submit/multi_output/'
    pred_cv = np.zeros((cfg['train']['train_shape'], 5))
    pred_sub = np.zeros((cfg['train']['test_shape'], 5))

In [36]:
print('\n Save Validation frame' )
clock = "{}".format(datetime.now()).replace(' ','-').replace(':','-').split('.')[0]
output_name = out_dir + '/cv_%s_%s_%.4f_type_%s_fold_%s.csv.gz'%(clock, pretrain_model, val_loss, type_, fold)
reverse_frame.to_csv(output_name, index=False,compression='gzip')

# save test predictions 
print('\n Save Test frame' )
clock = "{}".format(datetime.now()).replace(' ','-').replace(':','-').split('.')[0]
output_name = out_dir + '/sub_%s_%s_%.4f_type_%s_fold_%s.csv.gz'%(clock, pretrain_model, val_loss, type_, fold)
preds_fold_test.to_csv(output_name, index=False,compression='gzip')

net=None
torch.cuda.empty_cache()


 Save Validation frame

 Save Test frame
