In [1]:
from create_parquet import *
from data import *
import warnings
warnings.filterwarnings("ignore") 

matplotlib.get_backend :  module://ipykernel.pylab.backend_inline


## Build node frame from graphs 

In [2]:
def get_node_from_graph(molecule_file): 
    '''
      - molecule file:  path to %molecule_name.pickle
    Returns: 
      Convert the pickled graph to a padded vector with all the molecule information 
    '''
    molecule_name = molecule_file.split('/')[-1].strip('.pickle')
    graph = read_pickle_from_file(molecule_file)
    molecule_name = graph.molecule_name
    node_feats = np.concatenate(graph.node,-1)
    num_node, node_dim = node_feats.shape 
    node = pd.DataFrame(node_feats)
    node.columns = ['symbol', 'acceptor', 'donor', 'aromatic', 'hybridization', 'num_h', 'atomic',]
    node['num_nodes'] = num_node
    node['node_dim'] = node_dim
    node['molecule_name'] = molecule_name
    node['atom_index'] = list(range(num_node))
    return  node

In [3]:
get_node_from_graph('/rapids/notebooks/srabhi/champs-2019/input/structure/graph2/dsgdb9nsd_133885.pickle')

Unnamed: 0,symbol,acceptor,donor,aromatic,hybridization,num_h,atomic,num_nodes,node_dim,molecule_name,atom_index
0,2.0,0.0,0.0,0.0,3.0,2.0,6.0,16,7,dsgdb9nsd_133885,0
1,3.0,0.0,1.0,0.0,3.0,0.0,7.0,16,7,dsgdb9nsd_133885,1
2,2.0,0.0,0.0,0.0,3.0,1.0,6.0,16,7,dsgdb9nsd_133885,2
3,2.0,0.0,0.0,0.0,3.0,1.0,6.0,16,7,dsgdb9nsd_133885,3
4,2.0,0.0,0.0,0.0,3.0,1.0,6.0,16,7,dsgdb9nsd_133885,4
5,4.0,1.0,0.0,0.0,3.0,0.0,8.0,16,7,dsgdb9nsd_133885,5
6,2.0,0.0,0.0,0.0,3.0,0.0,6.0,16,7,dsgdb9nsd_133885,6
7,2.0,0.0,0.0,0.0,3.0,1.0,6.0,16,7,dsgdb9nsd_133885,7
8,2.0,0.0,0.0,0.0,3.0,1.0,6.0,16,7,dsgdb9nsd_133885,8
9,1.0,0.0,0.0,0.0,0.0,0.0,1.0,16,7,dsgdb9nsd_133885,9


In [21]:
from parallel_process import parallel_process
files = glob.glob('/rapids/notebooks/srabhi/champs-2019/input/structure/graph2/*.pickle')
frames = parallel_process(files, get_node_from_graph)

100%|██████████| 131k/131k [02:24<00:00, 905it/s]    
130772it [00:00, 411905.84it/s]


In [22]:
node_frame = pd.concat(frames)

In [24]:
node_frame.to_csv('/rapids/notebooks/srabhi/champs-2019/input/parquet/baseline_node_frame.csv', index=False)

In [42]:
node_frame.head(2)

Unnamed: 0,symbol,acceptor,donor,aromatic,hybridization,num_h,atomic,num_nodes,node_dim,molecule_name,atom_index
0,2.0,0.0,0.0,0.0,3.0,3.0,6.0,21,7,dsgdb9nsd_101594,0
1,3.0,0.0,1.0,0.0,2.0,1.0,7.0,21,7,dsgdb9nsd_101594,1


## Build coupling dataframe from graph

In [37]:
def get_coupling_from_graph(molecule_file): 
    '''
      - molecule file:  path to %molecule_name.pickle
    Returns: 
      Convert the pickled graph to a padded vector with all the molecule information 
    '''
    molecule_name = molecule_file.split('/')[-1].strip('.pickle')
    graph = read_pickle_from_file(molecule_file)
    molecule_name = graph.molecule_name
    
    coupling_feats = np.concatenate([graph.coupling.index, graph.coupling.type.reshape(-1, 1), 
                               graph.coupling.value.reshape(-1,1), graph.coupling.contribution,
                               graph.coupling.id.reshape(-1,1)], -1)

    num_coupling, coupling_dim = coupling_feats.shape

    #change to cudf
    coupling = pd.DataFrame(coupling_feats)
    coupling.columns = ['atom_index_0', 'atom_index_1', 'coupling_type', 'scalar_coupling', 'fc', 'sd', 'pso', 'dso', 'id']
    coupling['num_coupling'] = num_coupling
    coupling['coupling_dim'] = coupling_dim
    coupling['molecule_name'] = molecule_name
    return  coupling

In [43]:
get_coupling_from_graph('/rapids/notebooks/srabhi/champs-2019/input/structure/graph2/dsgdb9nsd_103915.pickle').head(2)

Unnamed: 0,atom_index_0,atom_index_1,coupling_type,scalar_coupling,fc,sd,pso,dso,id,num_coupling,coupling_dim,molecule_name
0,9.0,0.0,0.0,82.3191,80.4588,0.170039,0.892564,0.797695,3622270.0,72,9,dsgdb9nsd_103915
1,9.0,1.0,1.0,-2.13186,-2.26579,0.091737,-0.046631,0.088824,3622271.0,72,9,dsgdb9nsd_103915


In [44]:
from parallel_process import parallel_process
files = glob.glob('/rapids/notebooks/srabhi/champs-2019/input/structure/graph2/*.pickle')
frames = parallel_process(files, get_coupling_from_graph)

100%|██████████| 131k/131k [02:17<00:00, 950it/s]    
130772it [00:00, 332063.91it/s]


In [46]:
coupling_frame = pd.concat(frames)

In [48]:
coupling_frame.head(3)

Unnamed: 0,atom_index_0,atom_index_1,coupling_type,scalar_coupling,fc,sd,pso,dso,id,num_coupling,coupling_dim,molecule_name
0,9.0,0.0,0.0,84.5056,83.1179,0.133123,0.548422,0.706185,3528934.0,64,9,dsgdb9nsd_101594
1,9.0,1.0,4.0,-0.42038,-0.454646,-0.012691,0.098126,-0.051169,3528935.0,64,9,dsgdb9nsd_101594
2,9.0,2.0,2.0,-0.017699,0.076249,0.003486,0.026211,-0.123645,3528936.0,64,9,dsgdb9nsd_101594


In [49]:
coupling_frame.to_csv('/rapids/notebooks/srabhi/champs-2019/input/parquet/baseline_coupling_frame.csv', index=False)

## Build edge frame from graph 

In [None]:
from data import *
def get_edge_from_graph(molecule_file): 
    '''
      - molecule file:  path to %molecule_name.pickle
    Returns: 
      Convert the pickled graph to a padded vector with all the molecule information 
    '''
    molecule_name = molecule_file.split('/')[-1].strip('.pickle')
    graph = read_pickle_from_file(molecule_file)
    molecule_name = graph.molecule_name
    edge_feats = np.concatenate(graph.edge,-1)
    edge_feats = np.concatenate([graph.edge_index, edge_feats], -1)
    num_edge, edge_dim = edge_feats.shape 
    infor = [molecule_name, num_edge, edge_dim]
    edge = pd.DataFrame(edge_feats)
    edge.columns = ['atom_index_0', 'atom_index_1', 'edge_type', 'distance', 'angle']
    edge['molecule_name'] = molecule_name
    edge['num_edge'] = num_edge
    edge['edge_dim'] = edge_dim
    return  edge

In [None]:
files = glob.glob('/rapids/notebooks/srabhi/champs-2019/input/structure/graph2/*.pickle')
#t = get_edge_from_graph(molecule_file+molecule_name+'.pickle')

In [None]:
from parallel_process import parallel_process
frames = parallel_process(files, get_edge_from_graph)

In [None]:
edge_frame = pd.concat(frames)

In [None]:
edge_frame.to_csv('/rapids/notebooks/srabhi/champs-2019/input/parquet/baseline_edge_frame.csv', index=False)