In [1]:
from build_predictions import * 
from GaussRank import GaussRankMap
import pandas as pd 
from create_parquet import *
from data import *
import warnings
warnings.filterwarnings("ignore") 

matplotlib.get_backend :  module://ipykernel.pylab.backend_inline


In [2]:
# Root directory of the input data. Later cells build paths by plain string
# concatenation (e.g. DATA_DIR+'parquet/...'), so the trailing slash is required.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
DATA_DIR='/rapids/notebooks/srabhi/champs-2019/input/'

In [3]:
# Display the list of coupling type names. COUPLING_TYPE is not defined in this
# notebook; presumably it is provided by one of the star imports in the first cell.
COUPLING_TYPE

['1JHC', '2JHC', '3JHC', '1JHN', '2JHN', '3JHN', '2JHH', '3JHH']

In [4]:
# Node frame: loaded once here and used as a global inside save_cv_data,
# where it is joined with the edge frame on 'molecule_name'.
molecule_node = pd.read_parquet(DATA_DIR+'parquet/molecule_node.parquet')

# Edge frame: loaded once here and used as a global inside save_cv_data (right side of the node/edge join).
molecule_edge = pd.read_parquet(DATA_DIR+'parquet/molecule_edge.parquet')

##### Get the three frames needed for building the train / validation coupling stacking 

# Build train / validation GaussRank values for each fold 

In [11]:
from time import time
def save_cv_data(type_, fold):
    """Build and save the train / validation stacking frames of one coupling type for one fold.

    Steps:
      1. Read the baseline coupling frame, keep only the rows whose
         ``coupling_type`` equals ``type_`` and recompute ``num_coupling`` as the
         number of remaining couplings per molecule.
      2. Left-join the shortest-path columns onto the coupling rows.
      3. Split the rows into train / validation (fold split files) and test
         (molecules listed in ``csv/test.csv``).
      4. Fit a ``GaussRankMap`` on the train targets, convert the validation
         targets with it, and set the test value to 0.
      5. Flatten all couplings of a molecule into one wide row, join it with the
         global ``molecule_node`` / ``molecule_edge`` frames and write the train
         and validation parquet files plus the GaussRank mapping CSVs under
         ``DATA_DIR/rnn_parquet/fold_<fold>/<type name>/``.

    Relies on names defined outside the function: ``DATA_DIR``, ``COUPLING_TYPE``,
    ``REVERSE_COUPLING_TYPE``, ``GaussRankMap``, ``molecule_node``,
    ``molecule_edge``, ``pd``, ``np``, ``os`` and ``time``.

    Parameters
    ----------
    type_ : int
        Index into ``COUPLING_TYPE``; also the value matched against the
        ``coupling_type`` column of the coupling frame.
    fold : int or str
        Fold identifier, interpolated into the split file names and the output
        directory name.

    Returns
    -------
    pandas.DataFrame
        The rows of the stacked node / edge / coupling frame whose
        ``molecule_name`` appears in ``csv/test.csv``. This frame is not written
        to disk here.
    """
    print('type %s : %s' %(type_, COUPLING_TYPE[type_]))
    # coupling frame of the type
    coupling_frame = pd.read_csv(DATA_DIR+'parquet/baseline_coupling_frame.csv')
    cols = ['molecule_name', 'num_coupling', 'coupling_dim', 'atom_index_0', 'atom_index_1', 'coupling_type', 'scalar_coupling',
           'fc', 'sd', 'pso', 'dso', 'id']
    coupling_frame = coupling_frame[cols]
    # Explicit copy: the frame is modified right below, and assigning into a
    # filtered slice is a chained assignment.
    coupling_frame = coupling_frame[coupling_frame.coupling_type == type_].copy()
    # num_coupling must now count only the couplings of this type per molecule.
    new_num_coupling = dict(coupling_frame.groupby('molecule_name').count()['num_coupling'])
    coupling_frame['num_coupling'] = coupling_frame.molecule_name.map(new_num_coupling)

    # shortest path (built from DATA_DIR instead of a duplicated absolute path)
    shortest_path_frame = pd.read_csv(DATA_DIR+'shortest_path.csv')
    cols = [ 'id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'path_index_0', 'path_index_1',
           'path_index_2', 'path_index_3', 'path_btype_0', 'path_btype_1',
           'path_btype_2', 'path_a_num_0', 'path_a_num_1', 'path_a_num_2',
           'path_a_num_3']
    shortest_path_frame = shortest_path_frame[cols]
    coupling_frame = pd.merge(coupling_frame, shortest_path_frame, on=['id', 'molecule_name', 'atom_index_0', 'atom_index_1'], how='left')
    print(coupling_frame.shape)
    coupling_max = coupling_frame.num_coupling.max()
    print('max coupling: %s' %coupling_max)
    print('fold: %s' %fold)

    # Molecule names of each split.
    # NOTE(review): np.load(allow_pickle=True) can execute arbitrary code when
    # given an untrusted file; this assumes the split files are generated locally.
    split_train = 'train_split_by_mol_hash.%s.npy'%fold
    split_valid = 'valid_split_by_mol_hash.%s.npy'%fold
    id_train_ = np.load(DATA_DIR + '/split/%s'%split_train,allow_pickle=True)
    id_valid_ = np.load(DATA_DIR + '/split/%s'%split_valid,allow_pickle=True)
    csv = 'test'
    df = pd.read_csv(DATA_DIR + '/csv/%s.csv'%csv)
    id_test_ = df.molecule_name.unique()

    # Copies, because a 'gaussrank_coupling' column is added to each split below.
    train = coupling_frame[coupling_frame.molecule_name.isin(id_train_)].copy()
    validation = coupling_frame[coupling_frame.molecule_name.isin(id_valid_)].copy()
    test = coupling_frame[coupling_frame.molecule_name.isin(id_test_)].copy()

    # Get GaussRank of coupling values
    t0 = time()
    grm = GaussRankMap()
    # The GaussRankMap calls below receive frames with columns 'type' and 'scalar_coupling_constant'.
    target_names = {'coupling_type': 'type', 'scalar_coupling': 'scalar_coupling_constant'}
    df_train = train[['coupling_type', 'scalar_coupling']].rename(columns=target_names)
    df_valid = validation[['coupling_type', 'scalar_coupling']].rename(columns=target_names)
    # Reverse type mapping
    df_train['type'] = df_train['type'].map(REVERSE_COUPLING_TYPE)
    df_valid['type'] = df_valid['type'].map(REVERSE_COUPLING_TYPE)
    # fit grm on the training targets only, then apply it to the validation targets
    transformed_training = grm.fit_training(df_train, reset=True)
    transformed_validation = grm.convert_df(df_valid, from_coupling=True)
    validation['gaussrank_coupling'] = transformed_validation
    train['gaussrank_coupling'] = transformed_training
    print('Getting gaussrank transformation for train/validation data took %s seconds' %(time()-t0))
    print(grm.coupling_order)
    # The test split gets a constant placeholder value.
    test['gaussrank_coupling'] = 0

    general_coupling_frame = pd.concat([train, validation, test.fillna(0.0)])

    # Build molecule coupling frame for fold
    coupling_cols = ['atom_index_0', 'atom_index_1', 'coupling_type', 'scalar_coupling', 'gaussrank_coupling',
                    'fc', 'sd', 'pso', 'dso', 'id',
                    'path_index_0', 'path_index_1', 'path_index_2','path_index_3',
                    'path_btype_0', 'path_btype_1', 'path_btype_2',
                    'path_a_num_0', 'path_a_num_1', 'path_a_num_2', 'path_a_num_3']

    shared_cols = ['molecule_name', 'num_coupling', 'coupling_dim']

    # One flat vector per molecule: all of its couplings' values laid end to end.
    # Molecules with fewer couplings than the widest one are padded with 0.0.
    flat_couplings = general_coupling_frame.groupby('molecule_name').apply(lambda x: x[coupling_cols].values.reshape(-1))
    molecule_coupling = pd.DataFrame(flat_couplings.values.tolist()).fillna(0.0)
    molecule_coupling['molecule_name'] = flat_couplings.index
    molecule_coupling = molecule_coupling.merge(general_coupling_frame[shared_cols].drop_duplicates(), on='molecule_name', how='left')

    # Move the shared columns (appended last by the steps above) to the front.
    cols = molecule_coupling.columns.tolist()
    n_shared = len(shared_cols)
    new_cols = cols[-n_shared:] + cols[:-n_shared]
    molecule_coupling = molecule_coupling[new_cols]
    print(molecule_coupling.shape)
    # Width is derived from coupling_cols rather than a hard-coded 21.
    molecule_coupling.columns = shared_cols + ['coupling_%s'%i for i in range(coupling_max*len(coupling_cols))]


    node_edge_frame = pd.merge(molecule_node, molecule_edge, on='molecule_name', how='left')
    general_stack_frame = pd.merge(node_edge_frame, molecule_coupling, on='molecule_name', how='inner')

    train_frame = general_stack_frame[general_stack_frame.molecule_name.isin(id_train_)]
    validation_frame = general_stack_frame[general_stack_frame.molecule_name.isin(id_valid_)]
    test_frame = general_stack_frame[general_stack_frame.molecule_name.isin(id_test_)]


    type_str = COUPLING_TYPE[type_]
    os.makedirs(DATA_DIR+'rnn_parquet/fold_%s'%fold+'/%s'%type_str, exist_ok=True)

    validation_frame.to_parquet(DATA_DIR+'rnn_parquet/fold_%s'%fold+'/%s/validation.parquet'%type_str)
    train_frame.to_parquet(DATA_DIR +'rnn_parquet/fold_%s'%fold+ '/%s/train.parquet'%type_str)
    # save mapping (one CSV per entry of grm.coupling_order / grm.training_maps)
    for i, (str_type_, frame) in enumerate(zip(grm.coupling_order, grm.training_maps)):
        frame.to_csv(DATA_DIR +'rnn_parquet/fold_%s'%fold+'/%s/mapping_%s_order_%s.csv'%(str_type_, str_type_, i), index=False)
    return test_frame


In [None]:
# Per-type coupling maximum, keyed by coupling type name. The '3JHH' entry (36)
# matches the "max coupling: 36" printed by the run further down.
# NOTE(review): not referenced by any other cell visible in this notebook;
# save_cv_data recomputes the maximum from the data instead.
COUPLING_MAX_DICT = {'1JHC': 20, '2JHC': 36, '3JHC': 66, '1JHN': 8, '2JHN': 12, '3JHN': 18, '3JHH': 36, '2JHH': 19 }

In [12]:
# Build and save the train / validation data for every (coupling type, fold) pair.
# The original outer `for` line was missing its trailing colon (SyntaxError).
for type_ in range(8):
    for fold in range(4):
        test_frame = save_cv_data(type_, fold)
    # The test parquet path depends only on the coupling type, so the original
    # overwrote the same file on every fold and only the last fold's frame
    # survived. Writing once after the fold loop leaves the same file on disk.
    test_frame.to_parquet(DATA_DIR +'/rnn_parquet/test_%s.parquet'%COUPLING_TYPE[type_])

type 7 : 3JHH
(908046, 23)
max coupling: 36
fold: 0
Getting gaussrank transformation for train/validation data took 0.636885404586792 seconds
['3JHH']
(118341, 759)
type 7 : 3JHH
(908046, 23)
max coupling: 36
fold: 1
Getting gaussrank transformation for train/validation data took 0.6745212078094482 seconds
['3JHH']
(118341, 759)
type 7 : 3JHH
(908046, 23)
max coupling: 36
fold: 2
Getting gaussrank transformation for train/validation data took 0.6809020042419434 seconds
['3JHH']
(118341, 759)
type 7 : 3JHH
(908046, 23)
max coupling: 36
fold: 3
Getting gaussrank transformation for train/validation data took 0.6988716125488281 seconds
['3JHH']
(118341, 759)
