# Splitting Train and Test Sets based on clusters

The motivation for this experiment is to make sure our predictive model is not biased toward the chemicals in the training set. To do so, we take a list of chemicals clustered by similarity, find their neighbors in the graph, calculate each cluster's weight relative to the overall graph, and then use these weights to separate the training and test sets, so that each cluster of chemicals appears in only one of the two sets.

### Create dataframe from csv file

In [1]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
import pybel
import networkx as nx
import os
from tqdm import tqdm_notebook as tqdm

In [2]:
# Read the clustered-chemicals table from the "resources" directory one level up.
clusters_csv_path = os.path.join(os.pardir, "resources", 'Clustered_chemicals.csv')
clustered_chemicals = pd.read_csv(clusters_csv_path)

### Splitting training and testing sets based on weighted clusters

In [3]:
# Load the full graph from its pickle file in the "resources" directory.
full_graph_path = os.path.join(os.pardir, "resources", "fullgraph_with_chemsim.pickle")
full_graph = pybel.from_pickle(full_graph_path)

In [4]:
# Map every cluster label to the list of PubChem IDs assigned to it.
# Grouping on the labels that are actually present (instead of assuming they
# are exactly 1..N) keeps this correct for 0-based or non-contiguous labels,
# and builds the mapping in a single pass over the table.
clusters_dict = (
    clustered_chemicals
    .groupby('Cluster')['PubchemID']
    .apply(list)
    .to_dict()
)

In this step, we build, for each cluster, a list of its chemicals and their neighbors in the full graph.

In [5]:
# For every cluster, collect the graph nodes of its chemicals together with
# the neighbors of those chemicals in the full graph.
subgraphs_dict = {}
for cluster, chemicals in tqdm(clusters_dict.items()):
    chemicals_subgraph = []
    for chemical in chemicals:
        # format the chemical as a BEL node to match the nodes in the full graph
        chemical = pybel.dsl.Abundance(namespace='pubchem', name=str(chemical))
        # ignore chemicals not in the graph
        if chemical not in full_graph.nodes():
            continue
        chemicals_subgraph.append(chemical)
        chemicals_subgraph.extend(full_graph.neighbors(chemical))
    # ignore clusters for which no chemical was found in the graph; this check
    # has to run once per cluster (outer loop), after all chemicals were visited
    if not chemicals_subgraph:
        continue
    # dict.fromkeys removes duplicates while keeping first-seen order
    subgraphs_dict[cluster] = list(dict.fromkeys(chemicals_subgraph))

HBox(children=(IntProgress(value=0, max=1319), HTML(value='')))




Now, we calculate the weight of each cluster by counting the number of edges in its subgraph and dividing it by the number of edges in the full graph.

In [6]:
# Weight of a cluster = number of edges in the subgraph induced by its nodes,
# divided by the number of edges in the full graph.
fullgraph_edges = len(full_graph.edges())
cluster_weights = {
    cluster_id: len(full_graph.subgraph(cluster_nodes).edges()) / fullgraph_edges
    for cluster_id, cluster_nodes in tqdm(subgraphs_dict.items())
}

HBox(children=(IntProgress(value=0, max=1319), HTML(value='')))




Next, we create a CSV file from the previous clusters file that contains each chemical, its cluster, and its cluster's weight.

In [7]:
# Add a 'weight' column initialised to None for every row; rows whose cluster
# has a computed weight are filled in afterwards, the rest stay None.
clustered_chemicals['weight'] = None

In [8]:
# Copy each cluster's weight onto every row that belongs to that cluster.
for cluster_id in cluster_weights:
    in_cluster = clustered_chemicals['Cluster'] == cluster_id
    clustered_chemicals.loc[in_cluster, 'weight'] = cluster_weights[cluster_id]

In [9]:
# Persist the chemicals together with their cluster and cluster weight.
clusters_weights_path = os.path.join(os.pardir, "resources", 'clusters_weights.csv')
clustered_chemicals.to_csv(clusters_weights_path)

Create a weighted edge list

In [10]:
# Reload the weights table, keeping PubChem IDs as strings so they can be used
# directly as node names.
weights_csv_path = os.path.join(os.pardir, "resources",'clusters_weights.csv')
weights_df = pd.read_csv(
    weights_csv_path,
    index_col=False,
    dtype={'PubchemID': str},
)

In [11]:
# Map each chemical's BEL node (pubchem Abundance) to the weight in its row.
weights = {
    pybel.dsl.Abundance(namespace='pubchem', name=record['PubchemID']): record['weight']
    for _, record in weights_df.iterrows()
}

In [12]:
# Load the node-id mapping of the full graph.
# The dtype keys must name columns that exist in the file: the columns read
# from this table are 'namespace', 'identifier' and 'name' (plus 'node_id').
# Forcing them to str keeps purely numeric identifiers (e.g. PubChem IDs) as
# strings, consistent with how chemical node names are built elsewhere in
# this notebook (str(...) / dtype={'PubchemID': str}).
# NOTE(review): despite the .tsv extension the file is parsed with the default
# comma separator - confirm that this matches the file on disk.
mapping_path = os.path.join(os.pardir, "resources", 'fullgraph_nodes_mapping.tsv')
mapping_df = pd.read_csv(
    mapping_path,
    index_col=False,
    dtype={'namespace': str, 'identifier': str, 'name': str},
)

In [13]:
# Preview the first five rows of the node mapping.
mapping_df.head(n=5)

Unnamed: 0,node_id,namespace,identifier,name
0,1,pubchem,85,(3-carboxy-2-hydroxypropyl)-trimethylazanium
1,2,umls,C0000729,Abdominal cramps
2,3,umls,C0000737,Abdominal pain
3,4,umls,C0687713,Gastrointestinal pain
4,5,umls,C0002418,Amblyopia


In [14]:
# Build a lookup from BEL node to its numeric node id. The node type depends
# on the namespace: pubchem -> Abundance, umls -> Pathology (keyed by the
# 'name' column rather than 'identifier'), anything else -> Protein.
mapping_dict = {}
for _, record in mapping_df.iterrows():
    namespace = record['namespace']
    if namespace == 'pubchem':
        node = pybel.dsl.Abundance(namespace=namespace, name=record['identifier'])
    elif namespace == 'umls':
        node = pybel.dsl.Pathology(namespace=namespace, name=record['name'])
    else:
        node = pybel.dsl.Protein(namespace=namespace, name=record['identifier'])
    mapping_dict[node] = record['node_id']


In [15]:
# Write every edge of the full graph as "source id, target id, weight".
# The weight is looked up by the edge's source node; edges whose source has no
# entry in `weights` get 0.0.
weighted_edgelist_path = os.path.join(os.pardir, "resources", 'fullgraph_weighted.edgelist')
with open(weighted_edgelist_path, 'w') as outputfile:
    outputfile.write('Source\tTarget\tWeight\n')
    for source, target in full_graph.edges():
        edge_weight = weights[source] if source in weights else 0.0
        outputfile.write(
            '%s\t%s\t%f\n' % (mapping_dict[source], mapping_dict[target], edge_weight)
        )

In [16]:
# Read the tab-separated weighted edge list back in as a DataFrame.
weighted_edgelist_file = os.path.join(os.pardir, "resources",'fullgraph_weighted.edgelist')
weighted_edgelist = pd.read_csv(weighted_edgelist_file, sep='\t')

Split training and testing sets based on weights (each weight corresponds to a cluster)

In [17]:
# Group-aware shuffle split: rows sharing the same 'Weight' value form one
# group, and a group is never divided between the two sets. Only the first of
# the generated splits is used.
splitter = GroupShuffleSplit(test_size=.40, n_splits=2, random_state = 7)
split_iter = splitter.split(weighted_edgelist, groups=weighted_edgelist['Weight'])
train_inds, test_inds = next(split_iter)

training = weighted_edgelist.iloc[train_inds]
testing = weighted_edgelist.iloc[test_inds]

In [18]:

training_path = os.path.join(os.pardir, "resources", 'training_edgelist.edgelist')
testing_path = os.path.join(os.pardir, "resources",'testing_edgelist.edgelist')

# Dump the (source, target) pairs of each split as a plain two-column edge list.
for split_df, split_path in ((training, training_path), (testing, testing_path)):
    with open(split_path, 'w') as split_file:
        split_file.writelines(
            '%d\t%d\n' % (row['Source'], row['Target'])
            for _, row in split_df.iterrows()
        )

# Reload both splits as networkx graphs so they can be reconciled.
G_train = nx.read_edgelist(training_path)
G_test = nx.read_edgelist(testing_path)

# Any test edge touching a node that the training graph has not seen is added
# to the training graph, so the training graph ends up containing every node
# of the test graph.
for u, v in G_test.edges():
    if u not in G_train.nodes() or v not in G_train.nodes():
        G_train.add_edge(u, v)

# Drop from the test graph every edge that is (now) present in the training
# graph, so the two edge sets do not overlap.
for u, v in G_train.edges():
    if G_test.has_edge(u, v):
        G_test.remove_edge(u, v)

# Overwrite the edge-list files with the reconciled graphs (no edge data).
nx.write_edgelist(G_train, training_path, data = False)
nx.write_edgelist(G_test, testing_path, data = False)