In [1]:
# Restrict this process to a single GPU. The variable is set before the
# GPU-backed libraries below are imported — presumably so that they only ever
# see this device (confirm if the import order is changed).
import os
GPU_id = 0
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)

# NOTE(review): later cells use `re`, `torch`, `pd` and `datetime` without an
# explicit import in this cell; presumably they arrive through the wildcard
# imports below — confirm before trimming any of them.
from fastai.basic_train import *
from fastai.callbacks import SaveModelCallback
from functools import partial
from torch.utils.dlpack import from_dlpack
import cudf as gd
import warnings
import glob 


# Project-local MPNN package: datasets, model, losses, metrics and helpers.
from mpnn_model.common import * 
from mpnn_model.common_constants import * 
from mpnn_model.dataset import TensorBatchDataset, BatchDataBunch, BatchDataLoader
from mpnn_model.data_collate import tensor_collate_baseline
from mpnn_model.GaussRank import GaussRankMap
from mpnn_model.helpers import load_cfg
from mpnn_model.model import Net 
from mpnn_model.train_loss import train_criterion, lmae_criterion
from mpnn_model.callback import get_reverse_frame, lmae, LMAE
from mpnn_model.radam import * 
from mpnn_model.build_predictions import do_test 
from mpnn_model.helpers import * 

# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings("ignore") 



matplotlib.get_backend :  module://ipykernel.pylab.backend_inline


## Load config file 

In [2]:
# Experiment configuration: the YAML file is loaded through the project helper
# and indexed like a nested dict by the cells below.
CFG_PATH = '/rapids/notebooks/srabhi/champs-2019/CherKeng_solution/fastai_code/experiments/MPNN_EMBED_TYPE_LMAE_WO_GAUSSRANK.yaml'
cfg = load_cfg(CFG_PATH)

In [3]:
# Forwarded as `COUPLING_MAX` to every TensorBatchDataset built in this notebook.
# NOTE(review): presumably the largest number of couplings in one molecule — confirm.
COUPLING_MAX = 136

<center> <h1> Dataset </h1> </center>

In [4]:
# Cross-validation fold handled by this run, and the input data root from the config.
fold = 1
DATA_DIR = cfg['dataset']['input_path']

In [5]:
%%time
validation = gd.read_parquet(DATA_DIR +'/parquet/fold_%s/validation.parquet'%fold)
train = gd.read_parquet(DATA_DIR +'/parquet/fold_%s/train.parquet' %fold)

CPU times: user 27.9 s, sys: 12.8 s, total: 40.7 s
Wall time: 1min 34s


In [6]:
# Batch size from the config; passed to the datasets and the data bunch below.
batch_size = cfg['train']['batch_size']

- Convert the train and validation dataframes to tensors

In [7]:
def select_columns(frame, prefix):
    """Return the columns of ``frame`` whose name starts with ``<prefix>_<digits>``.

    The pattern is compiled once per call instead of once per column, and the
    frame's own column order is preserved.
    """
    pattern = re.compile("^%s_[0-9]+" % prefix)
    return [name for name in frame.columns if pattern.search(name)]


def frame_to_tensors(frame):
    """Convert one dataframe into the six tensors given to TensorBatchDataset.

    Columns are handed from cuDF to PyTorch through DLPack. The three count
    columns are cast to int64 and the three feature matrices to float32.

    Returns:
        (nodes, edges, couplings, num_nodes, num_edges, num_couplings), i.e.
        the order in which the dataset receives them.
    """
    num_nodes = from_dlpack(frame['num_nodes'].to_dlpack()).long()
    num_edges = from_dlpack(frame['num_edge'].to_dlpack()).long()
    num_coupling = from_dlpack(frame['num_coupling'].to_dlpack()).long()
    nodes = from_dlpack(frame[select_columns(frame, 'node')].to_dlpack()).type(torch.float32)
    edges = from_dlpack(frame[select_columns(frame, 'edge')].to_dlpack()).type(torch.float32)
    coupling = from_dlpack(frame[select_columns(frame, 'coupling')].to_dlpack()).type(torch.float32)
    return nodes, edges, coupling, num_nodes, num_edges, num_coupling


def build_train_mode_dataset(molecules, tensors):
    """Wrap molecule names and their tensors in a 'train'-mode TensorBatchDataset."""
    return TensorBatchDataset(molecules,
                              tensors=list(tensors),
                              batch_size=batch_size,
                              collate_fn=tensor_collate_baseline,
                              COUPLING_MAX=COUPLING_MAX,
                              mode='train',
                              csv='train')


# --- train split ---
train_tensors = frame_to_tensors(train)
mol_train = train.molecule_name.unique().to_pandas().values
train_dataset = build_train_mode_dataset(mol_train, train_tensors)
# The dataset keeps its own references; drop the dataframe and the temporary tuple.
del train, train_tensors

# --- validation split ---
print('** Convert validation tensors **\n')
# These module-level names are kept on purpose: the prediction section further
# down rebuilds the validation dataset from them.
node_cols, edge_cols, coupling_cols = [select_columns(validation, prefix)
                                       for prefix in ('node', 'edge', 'coupling')]
(nodes_matrix, edges_matrix, coupling_matrix,
 num_nodes_tensor, num_edges_tensor, num_coupling_tensor) = frame_to_tensors(validation)
mol_valid = validation.molecule_name.unique().to_pandas().values
valid_dataset = build_train_mode_dataset(mol_valid,
                                         [nodes_matrix, edges_matrix, coupling_matrix,
                                          num_nodes_tensor, num_edges_tensor, num_coupling_tensor])
del validation

** Convert validation tensors **



In [8]:
# Bundle the train and validation datasets into the data bunch given to the Learner.
data = BatchDataBunch.create(
    train_dataset,
    valid_dataset,
    device='cuda',
    bs=batch_size,
)

<center> <h1> MPNN model </h1></center>


In [9]:
# Bounds handed to the network as `y_range`.
# NOTE(review): presumably the min/max target value seen in training — confirm.
y_min, y_max = -36.2186, 204.8800
net = Net(cfg, y_range=[y_min, y_max])

## Init Fastai Learner

In [10]:
#### Init Fastai learner
# Settings read from the experiment config.
loss_name = cfg['train']['loss_name']
num_output = cfg['model']['regression']['num_output']
predict_type = cfg['model']['regression']['predict_type']
gaussrank = cfg['dataset']['gaussrank']
print('\tCriterion: %s\n'%(loss_name))

### Get GaussRank mapping
print('\n Load GaussRank mapping')
data_dir = DATA_DIR + '/rnn_parquet'
normalize = cfg['dataset']['normalize']
files = glob.glob(data_dir+'/fold_%s/'%fold+'*.csv')
# One slot per coupling type; each mapping file fills the slot given by the
# integer at the end of its name.
mapping_frames = ['']*8
coupling_order = ['']*8

for file in files:
    # The file name is split on '_': the third field is the coupling type and
    # the last field is '<order>.csv'.
    name_parts = os.path.basename(file).split('_')
    type_ = name_parts[2]
    # Remove the extension with splitext: str.strip('.csv') strips a *set of
    # characters* from both ends rather than the '.csv' suffix.
    order = int(os.path.splitext(name_parts[-1])[0])
    coupling_order[order] = type_
    mapping_frames[order] = pd.read_csv(file)

grm = GaussRankMap(mapping_frames, coupling_order)

# Optimizer factory handed to the Learner.
optal = partial(RAdam)

learn =  Learner(data,
                 net.cuda(),
                 metrics=None,
                 opt_func=optal,
                 callback_fns=partial(LMAE,
                                    grm=grm,
                                    predict_type=predict_type,
                                    normalize_coupling=normalize,
                                    coupling_rank=gaussrank))

# The training criterion is bound to the loss settings taken from the config.
learn.loss_func = partial(train_criterion,
                          criterion=loss_name,
                          num_output=num_output,
                          gaussrank=gaussrank,
                          pred_type=predict_type)

print('\tTraining loss: %s\n'%(learn.loss_func))

#### fit one cycle
epochs = cfg['train']['epochs']
max_lr = cfg['train']['max_lr']

	Criterion: lmae_embed_type


 Load GaussRank mapping
	Training loss: functools.partial(<function train_criterion at 0x7f1506f25a60>, criterion='lmae_embed_type', num_output=1, gaussrank=False, pred_type=False)



## Fit_one_cycle 

In [11]:
# Save the weights each time the monitored 'LMAE' metric improves (mode='min').
best_model_cb = SaveModelCallback(
    learn,
    every='improvement',
    monitor='LMAE',
    name=cfg['train']['model_name']+'_fold_%s'%fold,
    mode='min',
)

# NOTE(review): runs a single cycle at lr 0.005; the `epochs` and `max_lr`
# values read from the config earlier are not used here — confirm this is intended.
learn.fit_one_cycle(1, 0.005, callbacks=[best_model_cb])

epoch,train_loss,valid_loss,LMAE,time
0,0.637416,0.584647,0.428507,03:00


Better model found at epoch 0 with LMAE value: 0.4285072386264801.


<h1> <center> Build predictions </center></h1>

In [12]:
# Release the unused GPU memory cached by PyTorch before building predictions.
torch.cuda.empty_cache()

In [14]:
# Rebuild the validation dataset in 'test' mode from the validation tensors
# that are still bound at module level.
valid_tensors = [nodes_matrix, edges_matrix, coupling_matrix,
                 num_nodes_tensor, num_edges_tensor, num_coupling_tensor]
valid_dataset = TensorBatchDataset(
    mol_valid,
    tensors=valid_tensors,
    batch_size=batch_size,
    collate_fn=tensor_collate_baseline,
    COUPLING_MAX=COUPLING_MAX,
    mode='test',
    csv='train',
)

# Loader that keeps dataset order and keeps the last partial batch.
loader_options = dict(shuffle=False, pin_memory=False, drop_last=False, device='cuda')
valid_loader = BatchDataLoader(valid_dataset, **loader_options)

# `total_samples` is read right after this call — presumably it is what sets it.
valid_dataset.get_total_samples()
print('compute the validation predictions ')
valid_loss, reverse_frame, contributions, molecule_representation = do_test(
    learn.model,
    valid_loader,
    valid_dataset.total_samples,
    1,
    predict_type,
    grm,
    normalize=normalize,
    gaussrank=gaussrank,
)



compute the validation predictions 
  1164554/ 1164554     1.00   0 hr 00 min

predict
build preds frame
Compute lmae per type


In [16]:
# Header of the validation report; each entry is printed with its own trailing newline.
report_header = [
    '\n',
    '|------------------------------------ VALID ------------------------------------------------|\n',
    '| 1JHC,   2JHC,   3JHC,   1JHN,   2JHN,   3JHN,   2JHH,   3JHH  |  loss  mae log_mae | fold |\n',
    '|-------------------------------------------------------------------------------------------|\n',
]
for line in report_header:
    print(line)

# One row: the first 11 entries of `valid_loss` followed by the fold id.
report_row = '|%+0.3f, %+0.3f, %+0.3f, %+0.3f, %+0.3f, %+0.3f, %+0.3f, %+0.3f | %+5.3f %5.2f %+0.2f |  %s   |\n'
print(report_row % (*valid_loss[:11], fold))



|------------------------------------ VALID ------------------------------------------------|

| 1JHC,   2JHC,   3JHC,   1JHN,   2JHN,   3JHN,   2JHH,   3JHH  |  loss  mae log_mae | fold |

|-------------------------------------------------------------------------------------------|

|+0.693, +0.496, +0.810, +0.658, +0.272, -0.027, -0.012, +0.539 | +0.594  1.60 +0.43 |  1   |



## Test data 

In [19]:
# Re-read the batch size and the input data root from the config for the test section.
batch_size = cfg['train']['batch_size']
DATA_DIR = cfg['dataset']['input_path']

In [23]:
print('load test data')
torch.cuda.empty_cache()
test = gd.read_parquet(DATA_DIR +'/parquet/test.parquet')

# Count columns, cast to int64 the same way as in the train/validation conversion.
num_nodes_tensor = from_dlpack(test['num_nodes'].to_dlpack()).long()
num_edges_tensor = from_dlpack(test['num_edge'].to_dlpack()).long()
num_coupling_tensor = from_dlpack(test['num_coupling'].to_dlpack()).long()

# Feature columns are selected by name prefix; each pattern is compiled once
# instead of once per column.
node_pattern = re.compile("^node_[0-9]+")
edge_pattern = re.compile("^edge_[0-9]+")
coupling_pattern = re.compile("^coupling_[0-9]+")

node_cols = [col for col in test.columns if node_pattern.search(col)]
# Converted once: a second DLPack export of the same columns would only cost
# time and GPU memory.
nodes_matrix = from_dlpack(test[node_cols].to_dlpack()).type(torch.float32)
edge_cols = [col for col in test.columns if edge_pattern.search(col)]
edges_matrix = from_dlpack(test[edge_cols].to_dlpack()).type(torch.float32)
coupling_cols = [col for col in test.columns if coupling_pattern.search(col)]
coupling_matrix = from_dlpack(test[coupling_cols].to_dlpack()).type(torch.float32)

mol_test  = test.molecule_name.unique().to_pandas().values
# The tensors hold the data now; drop the dataframe.
del test

test_dataset = TensorBatchDataset(mol_test,
                                tensors=[nodes_matrix, edges_matrix, coupling_matrix,
                                         num_nodes_tensor, num_edges_tensor, num_coupling_tensor],
                                batch_size=batch_size,
                                collate_fn=tensor_collate_baseline,
                                COUPLING_MAX=COUPLING_MAX,
                                mode='test',
                                csv='test')

# Loader that keeps dataset order and keeps the last partial batch.
test_loader = BatchDataLoader(test_dataset,
                               shuffle=False,
                               pin_memory=False,
                               drop_last=False,
                               device='cuda')

print('\n Compute predictions for test data at fold %s\n' %fold)
# NOTE(review): `normalize=False` here, whereas the validation call passes the
# config value `normalize` — confirm the difference is intended.
test_loss, preds_fold_test, contributions, molecule_representation = do_test(learn.model,
                                                                       test_loader,
                                                                       cfg['train']['test_shape'],
                                                                       1,
                                                                       predict_type,
                                                                       grm,
                                                                       normalize=False,
                                                                       gaussrank=gaussrank)

load test data

 Compute predictions for test data at fold 1

  2505542/ 2505542     1.00   0 hr 00 min

predict
build preds frame
Compute lmae per type


## Save validation and test frames 

In [None]:
# The last entry of `valid_loss` is used as the score embedded in the file name.
val_loss = valid_loss[-1]
print('\n Save Validation frame' )
# Take the output root from the config, like the test-frame cell does, instead
# of a hard-coded absolute path.
out_dir = cfg['dataset']['output_path']
# Timestamp formatted as YYYY-MM-DD-HH-MM-SS.
clock = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
output_name = out_dir + '/submit/scalar_output/cv_%s_%s_%.4f_fold_%s.csv.gz'%(clock, loss_name, val_loss, fold)
# Create the target folder if needed so the write cannot fail on a missing
# directory after the training run.
os.makedirs(os.path.dirname(output_name), exist_ok=True)
reverse_frame.to_csv(output_name, index=False,compression='gzip')

In [None]:
# save test predictions
print('\n Save Test frame' )
# Recompute the validation score here so this cell does not depend on the
# validation-saving cell having been run first.
val_loss = valid_loss[-1]
out_dir =   cfg['dataset']['output_path']
# Timestamp formatted as YYYY-MM-DD-HH-MM-SS.
clock = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
output_name = out_dir + '/submit/scalar_output/sub_%s_%s_%.4f_fold_%s.csv.gz'%(clock, loss_name, val_loss, fold)
# Create the target folder if needed so the write cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_name), exist_ok=True)
preds_fold_test.to_csv(output_name, index=False,compression='gzip')