In [1]:
import os
GPU_id = 2
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)

from fastai.basic_train import *
from fastai.callbacks import SaveModelCallback
from functools import partial
import cudf as gd
import warnings
import glob 
import gzip
from torch.utils.dlpack import from_dlpack

from mpnn_model.common import * 
from mpnn_model.common_constants import * 
from mpnn_model.dataset import TensorBatchDataset, BatchDataBunch, BatchDataLoader
from mpnn_model.data_collate import tensor_collate_rnn
from mpnn_model.GaussRank import GaussRankMap
from mpnn_model.helpers import load_cfg
from mpnn_model.model import Net 
from mpnn_model.train_loss import train_criterion, lmae_criterion
from mpnn_model.callback import get_reverse_frame, lmae, LMAE
from mpnn_model.radam import * 
from mpnn_model.build_predictions import do_test 
from mpnn_model.helpers import * 

warnings.filterwarnings("ignore") 


matplotlib.get_backend :  module://ipykernel.pylab.backend_inline


## Load config file 

In [2]:
# Load the experiment configuration with load_cfg; later cells index it like a nested dict
# (e.g. cfg['dataset']['input_path'], cfg['train']['batch_size']).
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
cfg = load_cfg('/rapids/notebooks/srabhi/champs-2019/CherKeng_solution/fastai_code/experiments/MPNN_RNN_PREDICT_TYPE_LMAE_GAUSSRANK_BOOTSTRAP.yaml')

In [3]:
# Value forwarded as the COUPLING_MAX argument of every TensorBatchDataset built in this notebook.
COUPLING_MAX = 136

In [9]:
# Fold index: selects the fold_<n> directory for the GaussRank CSVs and the train/validation parquet files.
fold = 1

### Load the GaussRank mapping (GRM) transformer

In [10]:
print('\n Load GaussRank mapping for fold %s' %fold)
data_dir = DATA_DIR + '/rnn_parquet'
files = glob.glob(data_dir+'/fold_%s/'%fold+'*.csv')
if not files:
    # Fail early: otherwise GaussRankMap would be built from the '' placeholders below.
    raise FileNotFoundError('No GaussRank mapping CSV found in %s/fold_%s/' % (data_dir, fold))

# One slot per coupling type; each file name carries the slot index it belongs to.
mapping_frames = ['']*8
coupling_order = ['']*8

for file in files:
    # Remove the directory and the extension before splitting on '_'.
    # (str.strip('.csv') removes any of the characters '.', 'c', 's', 'v' from
    # both ends, not the '.csv' suffix, so it is not a safe way to drop it.)
    stem = os.path.splitext(os.path.basename(file))[0]
    fields = stem.split('_')
    type_ = fields[2]        # third '_'-separated field: coupling type name
    order = int(fields[-1])  # last field: slot index of this coupling type
    coupling_order[order] = type_
    mapping_frames[order] = pd.read_csv(file)
grm = GaussRankMap(mapping_frames, coupling_order)


 Load GaussRank mapping for fold 1


<center> <h1> Data set : </h1> </center>

In [11]:
from time import time
def build_test_data(fold, grm, coupling_frame, molecule_edge, molecule_node, coupling_max=136):
    """Build one wide row per test molecule: node, edge and flattened coupling features.

    Parameters
    ----------
    fold : unused; kept so existing calls keep working.
    grm : object exposing ``convert_df(df, from_coupling=True)``; its result is
        stored as the ``gaussrank_coupling`` column.
    coupling_frame : pandas.DataFrame with one row per coupling. Must contain
        ``molecule_name``, ``num_coupling``, ``coupling_dim`` and every column
        listed in ``coupling_cols`` below except ``gaussrank_coupling``.
        Side effect: a ``gaussrank_coupling`` column is added to it in place.
    molecule_edge, molecule_node : pandas.DataFrame keyed by ``molecule_name``.
    coupling_max : number of coupling slots per molecule in the output
        (default 136). Molecules with fewer couplings are zero padded.

    Returns
    -------
    pandas.DataFrame
        ``molecule_node`` left-joined with ``molecule_edge`` and with the
        per-molecule coupling frame, whose columns are ``molecule_name``,
        ``num_coupling``, ``coupling_dim`` and ``coupling_0`` ...
        ``coupling_{coupling_max * 21 - 1}``.

    Raises
    ------
    ValueError
        If a molecule has more than ``coupling_max`` couplings.
    """
    # Transform test predictions to gaussrank.
    # Explicit copy so the renaming/mapping below never writes into a view of coupling_frame.
    df_test = coupling_frame[['coupling_type', 'scalar_coupling_constant']].copy()
    df_test.columns = ['type', 'scalar_coupling_constant']

    # Reverse type mapping
    df_test['type'] = df_test['type'].map(REVERSE_COUPLING_TYPE)

    # Apply the GaussRank mapping
    t0 = time()
    transformed_test = grm.convert_df(df_test, from_coupling=True)
    coupling_frame['gaussrank_coupling'] = transformed_test
    print('\nGetting gaussrank transformation for test data took %s seconds\n' %(time()-t0))

    # Build molecule coupling frame for fold
    coupling_cols = ['atom_index_0', 'atom_index_1', 'coupling_type', 'scalar_coupling_constant', 'gaussrank_coupling',
                    'fc', 'sd', 'pso', 'dso', 'id',
                    'path_index_0', 'path_index_1', 'path_index_2','path_index_3',
                    'path_btype_0', 'path_btype_1', 'path_btype_2',
                    'path_a_num_0', 'path_a_num_1', 'path_a_num_2', 'path_a_num_3']
    shared_cols = ['molecule_name', 'num_coupling', 'coupling_dim']
    features_per_coupling = len(coupling_cols)
    total_cols = coupling_max * features_per_coupling

    # Flatten all couplings of a molecule into one row-major vector.
    tmp = coupling_frame.groupby('molecule_name').apply(lambda x: x[coupling_cols].values.reshape(-1))
    molecule_coupling = pd.DataFrame(tmp.values.tolist())

    # Pad every molecule to coupling_max slots. The width is derived from the
    # data instead of assuming a fixed number of missing columns.
    if molecule_coupling.shape[1] > total_cols:
        raise ValueError('A molecule has %d coupling values but coupling_max=%d allows only %d'
                         % (molecule_coupling.shape[1], coupling_max, total_cols))
    molecule_coupling = molecule_coupling.reindex(columns=range(total_cols), fill_value=0.0).fillna(0.0)

    molecule_coupling['molecule_name'] = tmp.index
    molecule_coupling = molecule_coupling.merge(coupling_frame[shared_cols].drop_duplicates(), on='molecule_name', how='left')

    # Move the three shared columns (currently last) to the front, then name the features.
    cols = molecule_coupling.columns.tolist()
    new_cols = cols[-3:] + cols[:-3]
    molecule_coupling = molecule_coupling[new_cols]
    molecule_coupling.columns = ['molecule_name', 'num_coupling', 'coupling_dim'] + ['coupling_%s'%i for i in range(total_cols)]

    node_edge_frame = pd.merge(molecule_node, molecule_edge, on='molecule_name', how='left')
    general_stack_frame =  pd.merge(node_edge_frame, molecule_coupling, on='molecule_name', how='left')
    return general_stack_frame

In [12]:
# Root directory of the input data, taken from the loaded config.
DATA_DIR = cfg['dataset']['input_path']
# Gzipped CSV of earlier test-set predictions; the next cell merges it into the test
# coupling frame on 'id' to provide pseudo labels.
# NOTE(review): absolute, machine-specific path.
best_pred_file_path = '/datasets/trivago/champs-2019/output/sub_2019-08-27-17-20-58_lmae_-2.3194.csv.gz'
# NOTE(review): repeats the fold value already assigned in an earlier cell.
fold = 1

In [13]:
%%time
print('get test pseudo labels')
test = pd.read_csv(DATA_DIR + '/csv/test.csv')
id_test, mol_test = test.id.values, test.molecule_name.values

# Read each precomputed frame and keep only the rows of molecules present in test.csv.
parquet_dir = DATA_DIR + '/parquet'
molecule_edge = (
    pd.read_parquet(parquet_dir + '/molecule_edge.parquet')
    .loc[lambda frame: frame.molecule_name.isin(mol_test)]
)
molecule_node = (
    pd.read_parquet(parquet_dir + '/molecule_node.parquet')
    .loc[lambda frame: frame.molecule_name.isin(mol_test)]
)
coupling_frame = (
    pd.read_parquet(parquet_dir + '/baseline_rnn_coupling_frame.parquet')
    .loc[lambda frame: frame.molecule_name.isin(mol_test)]
)

# Earlier test predictions, joined on the coupling id, act as the pseudo labels.
with gzip.open(best_pred_file_path) as pred_file:
    best_stack_test = pd.read_csv(pred_file)
coupling_frame = coupling_frame.merge(best_stack_test, how='left', on='id')

test_frame = build_test_data(fold, grm, coupling_frame, molecule_edge, molecule_node)

print('\n Load Train/Validation features for fold %s' %fold)
fold_dir = DATA_DIR + '/rnn_parquet/fold_%s' % fold
validation = gd.read_parquet(fold_dir + '/validation.parquet')
train = gd.read_parquet(fold_dir + '/train.parquet')

# Move the pandas test frame to cudf and append it to the training rows.
print('\n Define new train with test observations' )
test = gd.from_pandas(test_frame)
train = gd.concat([train, test])
del test

get test pseudo labels

Getting gaussrank transformation for test data took 5.167094469070435 seconds


 Load Train/Validation features for fold 1

 Define new train with test observations
CPU times: user 2min 38s, sys: 1min 31s, total: 4min 10s
Wall time: 6min 5s


In [14]:
# Batch size from the config; passed to the datasets and the data bunch built below.
batch_size = cfg['train']['batch_size']

- Convert the train and validation dataframes to tensors

In [15]:
def _frame_to_tensors(frame):
    """Convert one cudf feature frame to torch tensors through DLPack.

    Returns ``(molecule_names, tensors)`` where ``tensors`` is the list
    ``[nodes, edges, couplings, num_nodes, num_edges, num_coupling]`` in the
    order given to TensorBatchDataset. Feature matrices are cast to float32,
    the three count columns to int64.
    """
    def _count(column):
        return from_dlpack(frame[column].to_dlpack()).long()

    def _matrix(prefix):
        # Compile once per feature family instead of once per column.
        pattern = re.compile('^%s_[0-9]+' % prefix)
        columns = [name for name in frame.columns if pattern.match(name)]
        return from_dlpack(frame[columns].to_dlpack()).type(torch.float32)

    num_nodes = _count('num_nodes')
    num_edges = _count('num_edge')
    num_coupling = _count('num_coupling')
    nodes = _matrix('node')
    edges = _matrix('edge')
    couplings = _matrix('coupling')
    molecules = frame.molecule_name.unique().to_pandas().values
    return molecules, [nodes, edges, couplings, num_nodes, num_edges, num_coupling]


mol_train, train_tensors = _frame_to_tensors(train)
train_dataset = TensorBatchDataset(mol_train,
                                tensors=train_tensors,
                                batch_size=batch_size,
                                collate_fn=tensor_collate_rnn,
                                COUPLING_MAX=COUPLING_MAX,
                                mode='train',
                                csv='train')
# The dataset keeps its own reference to the tensors; drop the frame and the extra name.
del train, train_tensors

# convert validation to tensors
print('** Convert validation tensors **\n')
mol_valid, valid_tensors = _frame_to_tensors(validation)
# Keep the validation tensors under these names: the prediction cell further down
# rebuilds the validation dataset from them.
(nodes_matrix, edges_matrix, coupling_matrix,
 num_nodes_tensor, num_edges_tensor, num_coupling_tensor) = valid_tensors
valid_dataset = TensorBatchDataset(mol_valid,
                                tensors=valid_tensors,
                                batch_size=batch_size,
                                collate_fn=tensor_collate_rnn,
                                COUPLING_MAX=COUPLING_MAX,
                                mode='train',
                                csv='train')
del validation

** Convert validation tensors **



In [16]:
# Bundle the train and validation datasets into the object handed to the fastai Learner below.
data = BatchDataBunch.create(train_dataset, valid_dataset, device='cuda', bs=batch_size)

<center> <h1> MPNN + 3-seq RNN model </h1></center>


In [17]:
# Build the model from the config.
# NOTE(review): the y_range bounds are hard-coded; presumably the min/max of the
# scalar coupling constant in the training data - confirm and consider moving to cfg.
net = Net(cfg, y_range=[-36.2186, 204.8800])

## Init Fastai Learner

In [18]:
#### Init Fastai learner
loss_name = cfg['train']['loss_name']
num_output = cfg['model']['regression']['num_output']
predict_type = cfg['model']['regression']['predict_type']
gaussrank = cfg['dataset']['gaussrank']
print('\tCriterion: %s\n'%(loss_name))

### Get GaussRank mapping
print('\n Load GaussRank mapping')
data_dir = DATA_DIR + '/rnn_parquet'
normalize = cfg['dataset']['normalize']
files = glob.glob(data_dir+'/fold_%s/'%fold+'*.csv')
if not files:
    # Fail early: otherwise GaussRankMap would be built from the '' placeholders below.
    raise FileNotFoundError('No GaussRank mapping CSV found in %s/fold_%s/' % (data_dir, fold))

# One slot per coupling type; each file name carries the slot index it belongs to.
mapping_frames = ['']*8
coupling_order = ['']*8

for file in files:
    # Remove the directory and the extension before splitting on '_'.
    # (str.strip('.csv') removes any of the characters '.', 'c', 's', 'v' from
    # both ends, not the '.csv' suffix, so it is not a safe way to drop it.)
    stem = os.path.splitext(os.path.basename(file))[0]
    fields = stem.split('_')
    type_ = fields[2]        # third '_'-separated field: coupling type name
    order = int(fields[-1])  # last field: slot index of this coupling type
    coupling_order[order] = type_
    mapping_frames[order] = pd.read_csv(file)

grm = GaussRankMap(mapping_frames, coupling_order)

# Optimizer factory handed to the Learner.
optal = partial(RAdam)

# The LMAE callback is constructed by fastai with the GaussRank map and the
# target-related flags read from the config above.
learn =  Learner(data,
                 net.cuda(),
                 metrics=None,
                 opt_func=optal,
                 callback_fns=partial(LMAE,
                                    grm=grm,
                                    predict_type=predict_type,
                                    normalize_coupling=normalize,
                                    coupling_rank=gaussrank))

# Replace the Learner's loss with the project criterion bound to the config options.
learn.loss_func = partial(train_criterion,
                          criterion=loss_name,
                          num_output=num_output,
                          gaussrank=gaussrank,
                          pred_type=predict_type)

print('\tTraining loss: %s\n'%(learn.loss_func))

#### fit one cycle
# NOTE(review): these two values are read here but the training cell below
# passes literal values to fit_one_cycle instead.
epochs = cfg['train']['epochs']
max_lr = cfg['train']['max_lr']

	Criterion: lmaeo2ceha


 Load GaussRank mapping
	Training loss: functools.partial(<function train_criterion at 0x7fbd7af209d8>, criterion='lmaeo2ceha', num_output=1, gaussrank=True, pred_type=True)



## Train with `fit_one_cycle`

In [19]:
# Checkpoint under "<model_name>_fold_<fold>" each time the monitored LMAE value improves (lower is better).
checkpoint_name = cfg['train']['model_name']+'_fold_%s'%fold
checkpoint_cb = SaveModelCallback(
    learn,
    every='improvement',
    monitor='LMAE',
    name=checkpoint_name,
    mode='min',
)

# NOTE(review): the two positional arguments are literals here, not the
# epochs / max_lr values read from cfg in the previous cell.
learn.fit_one_cycle(1, 0.005, callbacks=[checkpoint_cb])

epoch,train_loss,valid_loss,LMAE,time
0,-0.467079,-0.958322,0.869475,05:14


Better model found at epoch 0 with LMAE value: 0.8694748207331626.


<h1> <center> Build predictions </center></h1>

In [20]:
# Release cached, unused GPU memory held by PyTorch's allocator before building predictions.
torch.cuda.empty_cache()

In [21]:
# Rebuild the validation dataset from the tensors created earlier, this time with mode='test'.
valid_tensors = [
    nodes_matrix, edges_matrix, coupling_matrix,
    num_nodes_tensor, num_edges_tensor, num_coupling_tensor,
]
valid_dataset = TensorBatchDataset(
    mol_valid,
    tensors=valid_tensors,
    batch_size=batch_size,
    collate_fn=tensor_collate_rnn,
    COUPLING_MAX=COUPLING_MAX,
    mode='test',
    csv='train',
)

# Deterministic pass over the data: no shuffling and no dropped last batch.
valid_loader = BatchDataLoader(
    valid_dataset,
    shuffle=False,
    pin_memory=False,
    drop_last=False,
    device='cuda',
)

# Called before reading valid_dataset.total_samples below.
valid_dataset.get_total_samples()
print('compute the validation predictions ')
valid_outputs = do_test(
    learn.model,
    valid_loader,
    valid_dataset.total_samples,
    1,
    predict_type,
    grm,
    normalize=normalize,
    gaussrank=gaussrank,
)
valid_loss, reverse_frame, contributions, molecule_representation = valid_outputs



compute the validation predictions 
  1164554/ 1164554     1.00   0 hr 00 min

predict
compute the reverse frame
Compute lmae per type


In [22]:
# Fixed part of the validation report: blank line, title rule, column names, closing rule.
report_header = (
    '\n',
    '|------------------------------------ VALID ------------------------------------------------|\n',
    '| 1JHC,   2JHC,   3JHC,   1JHN,   2JHN,   3JHN,   2JHH,   3JHH  |  loss  mae log_mae | fold |\n',
    '|-------------------------------------------------------------------------------------------|\n',
)
for header_line in report_header:
    print(header_line)

# Value row: the first eleven entries of valid_loss (eight per-type scores, then
# loss, mae and log_mae, per the column header above) followed by the fold index.
row_template = '|%+0.3f, %+0.3f, %+0.3f, %+0.3f, %+0.3f, %+0.3f, %+0.3f, %+0.3f | %+5.3f %5.2f %+0.2f |  %s   |\n'
print(row_template % (*valid_loss[:11], fold))



|------------------------------------ VALID ------------------------------------------------|

| 1JHC,   2JHC,   3JHC,   1JHN,   2JHN,   3JHN,   2JHH,   3JHH  |  loss  mae log_mae | fold |

|-------------------------------------------------------------------------------------------|

|+1.488, +0.308, +0.943, +1.892, +0.492, +0.037, +0.732, +1.064 | +2.917  2.83 +0.87 |  1   |



## Test data 

In [23]:
# Re-read the input directory and batch size from the config for the test-set pass.
# NOTE(review): both names already hold these values from earlier cells.
DATA_DIR = cfg['dataset']['input_path']
batch_size = cfg['train']['batch_size']

In [24]:
print('load test data')
torch.cuda.empty_cache()
test = gd.read_parquet(DATA_DIR +'/rnn_parquet/test.parquet')

# Per-molecule counts, cast to int64 exactly like the train/validation tensors.
num_nodes_tensor = from_dlpack(test['num_nodes'].to_dlpack()).long()
num_edges_tensor = from_dlpack(test['num_edge'].to_dlpack()).long()
num_coupling_tensor = from_dlpack(test['num_coupling'].to_dlpack()).long()

# Feature matrices: each is converted once (the previous duplicate conversion of the
# node columns was immediately overwritten) and cast to float32.
node_cols = [i for i in test.columns if re.compile("^node_[0-9]+").findall(i)]
nodes_matrix = from_dlpack(test[node_cols].to_dlpack()).type(torch.float32)
edge_cols = [i for i in test.columns if re.compile("^edge_[0-9]+").findall(i)]
edges_matrix = from_dlpack(test[edge_cols].to_dlpack()).type(torch.float32)
coupling_cols = [i for i in test.columns if re.compile("^coupling_[0-9]+").findall(i)]
coupling_matrix = from_dlpack(test[coupling_cols].to_dlpack()).type(torch.float32)

mol_test  = test.molecule_name.unique().to_pandas().values
del test

test_dataset = TensorBatchDataset(mol_test,
                                tensors=[nodes_matrix, edges_matrix, coupling_matrix,
                                         num_nodes_tensor, num_edges_tensor, num_coupling_tensor],
                                batch_size=batch_size,
                                collate_fn=tensor_collate_rnn,
                                COUPLING_MAX=COUPLING_MAX,
                                mode='test',
                                csv='test')

# Deterministic pass over the data: no shuffling and no dropped last batch.
test_loader = BatchDataLoader(test_dataset,
                               shuffle=False,
                               pin_memory=False,
                               drop_last=False,
                               device='cuda')

print('\n Compute predictions for test data at fold %s\n' %fold)
# The third argument comes from cfg['train']['test_shape'] here, whereas the
# validation pass reads it from the dataset (total_samples).
# NOTE(review): normalize is forced to False here while the validation pass uses
# the config value - confirm this difference is intended.
test_loss, preds_fold_test, contributions, molecule_representation = do_test(learn.model,
                                                                       test_loader,
                                                                       cfg['train']['test_shape'],
                                                                       1,
                                                                       predict_type,
                                                                       grm,
                                                                       normalize=False,
                                                                       gaussrank=gaussrank)

load test data

 Compute predictions for test data at fold 1

  2505542/ 2505542     1.00   0 hr 00 min

predict
compute the reverse frame
Compute lmae per type


## Save validation and test frames 

In [None]:
# The last entry of valid_loss is embedded in the output file names (also by the test-frame cell).
val_loss = valid_loss[-1]
print('\n Save Validation frame' )
# NOTE(review): hard-coded output directory; the test-frame cell reads it from cfg instead.
out_dir = '/rapids/notebooks/srabhi/champs-2019/output'
# Timestamp with second resolution, e.g. 2019-08-27-17-20-58.
clock = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
name_fields = (clock, loss_name, val_loss, fold)
output_name = out_dir + '/submit/scalar_output/cv_%s_%s_%.4f_fold_%s.csv.gz' % name_fields
reverse_frame.to_csv(output_name, compression='gzip', index=False)

In [None]:
# Write the test-set predictions of this fold as a gzipped CSV.
print('\n Save Test frame' )
out_dir = cfg['dataset']['output_path']
# Timestamp with second resolution, e.g. 2019-08-27-17-20-58.
clock = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
# The file name carries the validation score (val_loss, set in the previous cell) and the fold.
name_fields = (clock, loss_name, val_loss, fold)
output_name = out_dir + '/submit/scalar_output/sub_%s_%s_%.4f_fold_%s.csv.gz' % name_fields
preds_fold_test.to_csv(output_name, compression='gzip', index=False)