In [None]:
import pandas as pd
import os
from pyrheadb.RheaDB import RheaDB
from pyrheadb.ReactionNetwork import ReactionNetwork
# Rhea database handle; later cells read its `df_chebi_cmpname` and
# `rhea_reaction_long_format_smiles_chebi` tables.
rdb = RheaDB()

In [None]:
from rdkit import Chem

In [None]:
def get_inchikey(smiles):
    """
    Convert a SMILES string into its InChIKey.

    :param smiles: SMILES string; missing values (NaN, None) and any other
        non-string input are tolerated
    :return: InChIKey string, or None when the input is not a string or the
        molecule object produced by RDKit is falsy (parse failure)
    """
    # Missing SMILES usually arrive from pandas as float NaN, but may also be
    # None; neither can be handed to RDKit, so guard on "is a string" instead
    # of only on "is a float".
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None
    return Chem.MolToInchiKey(mol)

In [None]:
# Derive an InChIKey for every compound from its SMILES
# (get_inchikey yields None where no key can be produced).
chebi_smiles = rdb.df_chebi_cmpname['smiles']
rdb.df_chebi_cmpname['Inchikey'] = chebi_smiles.map(get_inchikey)

In [None]:
# The compound table's SMILES column is dropped before merging so the merge
# does not produce clashing 'smiles' columns (presumably the long-format table
# carries its own 'smiles' column — verify against RheaDB).
# errors='ignore' keeps this cell re-runnable once the column is already gone.
rdb.df_chebi_cmpname.drop(columns=['smiles'], inplace=True, errors='ignore')
# Inner merge (the pandas default) on 'chebiid': participants without a row in
# the compound table are not carried over.
rhea_reaction_long_format_smiles_chebi = rdb.rhea_reaction_long_format_smiles_chebi.merge(rdb.df_chebi_cmpname, on='chebiid')

In [None]:
def build_network_from_long_format_table(long_format_reaction_participant_table, hub_compounds_from='from_count', idgroup='chebiid', hub_list_path=None):
    """
    Remove hub compounds and build a table of identifier pairs that occur on
    different sides of the same reaction.

    :param long_format_reaction_participant_table: DataFrame with one row per
        reaction participant; must contain the columns 'MASTER_ID', 'chebiid',
        'reaction_side', 'smiles' and the column named by ``idgroup``
    :param hub_compounds_from: 'from_count' (a chebiid with 100 or more rows is
        treated as a hub) or 'from_list' (hub ids are read from a file)
    :param idgroup: name of the identifier column used to label both pair ends
    :param hub_list_path: file with one hub id per line (without the 'CHEBI:'
        prefix), used only for 'from_list'. Defaults to
        'biochemical_assumptions/rhea_chebi_hub.tsv' next to the defining
        module, or under the current working directory when ``__file__`` is
        not defined (as in a notebook kernel)
    :return: DataFrame with one row per unordered identifier pair: columns
        '<idgroup>pair', '<idgroup>1', '<idgroup>2', plus every other merged
        column aggregated into a set
    :raises ValueError: if ``hub_compounds_from`` is not a supported mode
    """
    participants = long_format_reaction_participant_table
    counts_chebiids = participants['chebiid'].value_counts().rename_axis('chebiid').reset_index(name='counts')

    # Identify non-hub compounds (hubs are always decided on 'chebiid',
    # whatever identifier is used to label the pairs).
    if hub_compounds_from == 'from_count':
        non_hub_mask = counts_chebiids['counts'] < 100
    elif hub_compounds_from == 'from_list':
        if hub_list_path is None:
            # __file__ does not exist in a notebook; fall back to the cwd
            # instead of failing with NameError.
            module_file = globals().get('__file__')
            base_dir = os.path.dirname(os.path.abspath(module_file)) if module_file else os.getcwd()
            hub_list_path = os.path.join(base_dir, 'biochemical_assumptions', 'rhea_chebi_hub.tsv')
        with open(hub_list_path) as f:
            # Skip blank lines so they do not become a bare 'CHEBI:' entry.
            hub_compounds = ['CHEBI:' + line.strip() for line in f if line.strip()]
        non_hub_mask = ~counts_chebiids['chebiid'].isin(hub_compounds)
    else:
        raise ValueError(
            f"hub_compounds_from must be 'from_count' or 'from_list', got {hub_compounds_from!r}"
        )
    df_non_hubs = counts_chebiids.loc[non_hub_mask, ['chebiid']]

    # Remove hub compounds from the network
    participants = participants.merge(df_non_hubs, on='chebiid', how='inner')
    # Pair every participant with every participant of the same reaction
    df_m = participants.merge(participants, on='MASTER_ID')

    # Keep only pairs whose members sit on different reaction sides
    # (the original code labelled this step "drop transports").
    df_m = df_m[df_m['reaction_side_x'] != df_m['reaction_side_y']]

    id_x, id_y = f'{idgroup}_x', f'{idgroup}_y'
    pair_col = f'{idgroup}pair'
    # Rows with a missing identifier cannot be ordered or joined; drop them.
    # The explicit copy avoids assigning a column on a filtered slice.
    df_m = df_m.dropna(subset=[id_x, id_y]).copy()

    # Order-independent pair label: smaller identifier first, joined by '.'.
    x_first = df_m[id_x] <= df_m[id_y]
    first = df_m[id_x].where(x_first, df_m[id_y])
    second = df_m[id_y].where(x_first, df_m[id_x])
    df_m[pair_col] = first + '.' + second

    df_m = df_m.drop(columns=['reaction_side_x', id_x, 'smiles_x',
                              'reaction_side_y', id_y, 'smiles_y'])
    # Collapse duplicate pairs; every remaining column becomes a set of values.
    grouped = df_m.groupby(pair_col, as_index=False, dropna=False).agg(lambda values: set(values))
    grouped[f'{idgroup}1'] = grouped[pair_col].apply(lambda pair: pair.split('.')[0])
    grouped[f'{idgroup}2'] = grouped[pair_col].apply(lambda pair: pair.split('.')[1])

    return grouped

In [None]:
# Keep the segment of each InChIKey before the first '-' (the '14l' name
# presumably refers to the 14-letter first InChIKey block — confirm).
# The .str accessor propagates missing keys instead of raising AttributeError:
# get_inchikey returns None for compounds without a usable SMILES, and the
# merge on 'chebiid' does not remove those rows.
rhea_reaction_long_format_smiles_chebi['Inchikey14l'] = rhea_reaction_long_format_smiles_chebi['Inchikey'].str.split('-').str[0]

In [None]:
# Pair table labelled with full InChIKeys.
grouped = build_network_from_long_format_table(
    long_format_reaction_participant_table=rhea_reaction_long_format_smiles_chebi,
    idgroup='Inchikey',
)

In [None]:
# Preview the first rows of the InChIKey pair table.
grouped.head()

In [None]:
# Discard pairs in which either endpoint identifier is missing.
grouped = grouped.dropna(subset=['Inchikey1', 'Inchikey2'])

In [None]:
import networkx as nx

In [None]:
# One edge per InChIKey pair; the set of reaction MASTER_IDs rides along as an
# edge attribute.
participants_graph = nx.from_pandas_edgelist(
    grouped,
    source='Inchikey1',
    target='Inchikey2',
    edge_attr=['MASTER_ID'],
)

In [None]:
# Size of the full-InChIKey graph.
print('nodes:', participants_graph.number_of_nodes())
print('edges:', participants_graph.number_of_edges())

In [None]:
# Pair table labelled with the first InChIKey segment only.
grouped14l = build_network_from_long_format_table(
    long_format_reaction_participant_table=rhea_reaction_long_format_smiles_chebi,
    idgroup='Inchikey14l',
)

In [None]:
# Discard pairs in which either endpoint identifier is missing.
grouped14l = grouped14l.dropna(subset=['Inchikey14l1', 'Inchikey14l2'])

In [None]:
# Same graph construction as for full InChIKeys, keyed on the first segment.
participants_graph_14l = nx.from_pandas_edgelist(
    grouped14l,
    source='Inchikey14l1',
    target='Inchikey14l2',
    edge_attr=['MASTER_ID'],
)
print('nodes:', participants_graph_14l.number_of_nodes())
print('edges:', participants_graph_14l.number_of_edges())

In [None]:
def connected_in_graph(row, G=None, idtype='Inchikey'):
    """
    Tell whether the two identifiers of a row are joined by a path in a graph.

    :param row: mapping (e.g. DataFrame row) with 'from' and 'to' node ids
    :param G: networkx graph to search; when omitted there is nothing to
        search and None is returned
    :param idtype: unused; kept so existing calls that pass it keep working
    :return: True if a path exists, otherwise None (no path, or one of the
        nodes is not in the graph)
    """
    # The previous default was the nx.Graph class itself, which can never hold
    # a path; an explicit "no graph" default gives the same None result without
    # relying on a swallowed exception.
    if G is None:
        return None
    nodefrom = row['from']
    nodeto = row['to']
    try:
        if nx.has_path(G, nodefrom, nodeto):
            return True
    except (nx.NodeNotFound, nx.NetworkXNoPath):
        # Only the expected "not connected" conditions are treated as a
        # negative answer; any other error is a real bug and must surface.
        return None
    return None


In [None]:
import itertools
chebis = rdb.df_chebi_cmpname['chebiid'].to_list()
# The ordered-pair product has len(chebis)**2 entries and is only counted in
# this cell, so keep it lazy instead of materialising every tuple in memory.
chebis_combs = itertools.product(chebis, chebis)
len(chebis) ** 2

In [None]:
# Distinct InChIKeys and every ordered pair of them (self-pairs included).
inchikey_column = rdb.df_chebi_cmpname['Inchikey']
inchikeys = set(inchikey_column.tolist())
inchikeys_combs = list(itertools.product(inchikeys, repeat=2))
len(inchikeys_combs)

In [None]:
from tqdm import tqdm
# Register tqdm with pandas; later cells rely on the resulting
# `progress_apply` method.
tqdm.pandas()

In [None]:
# One row per ordered (from, to) InChIKey pair.
df_combs = pd.DataFrame(data=inchikeys_combs, columns=['from', 'to'])

In [None]:
# Two nodes of an undirected graph are joined by a path exactly when they lie
# in the same connected component, so label the components once instead of
# running a shortest-path search for every (from, to) row.
component_of = {
    node: label
    for label, component in enumerate(nx.connected_components(participants_graph))
    for node in component
}
from_component = df_combs['from'].map(component_of)
to_component = df_combs['to'].map(component_of)
# Keys that are not graph nodes map to NaN, and NaN never compares equal, so
# those pairs come out False (the row-wise helper gave None for them); the
# following cells only test `connected == True`.
df_combs['connected'] = from_component.eq(to_component)

In [None]:
def get_inchikey14l_pair(row):
    """
    Build an order-independent label from the first segments of two keys.

    :param row: mapping (e.g. DataFrame row) with 'from' and 'to' entries
    :return: the text before the first '-' of each entry, sorted and joined
        with ';'; False when an entry is not a string or its first segment
        is empty
    """
    endpoints = (row['from'], row['to'])
    # Missing keys (None / NaN) cannot be split; report them with the same
    # False sentinel as empty segments instead of raising AttributeError.
    if not all(isinstance(key, str) for key in endpoints):
        return False
    first_segments = sorted(key.split('-')[0] for key in endpoints)
    if not all(first_segments):
        return False
    return ';'.join(first_segments)

# Keep connected pairs whose endpoints are both present. The explicit copy
# yields a standalone frame, so adding 'pair' below does not assign into a
# filtered slice (SettingWithCopyWarning).
df_combs = df_combs.dropna(subset=['from', 'to'])
df_combs = df_combs[df_combs['connected'] == True].copy()
df_combs['pair'] = df_combs.progress_apply(get_inchikey14l_pair, axis=1)
# get_inchikey14l_pair returns False for pairs it cannot label.
df_combs = df_combs[df_combs['pair'] != False]
df_combs.to_csv('inchis_connected.tsv', sep='\t')
len(df_combs)

In [None]:
# Compounds without an InChIKey are removed from the compound table in place.
rdb.df_chebi_cmpname.dropna(subset=['Inchikey'], inplace=True)
# First segment of every key (text before the first '-').
rdb.df_chebi_cmpname['Inchikey14l'] = rdb.df_chebi_cmpname['Inchikey'].map(lambda key: key.split('-', 1)[0])
# Note: `inchikeys` is rebound here to the set of first segments.
inchikeys = set(rdb.df_chebi_cmpname['Inchikey14l'].tolist())
inchikeys14l_combs = list(itertools.product(inchikeys, repeat=2))
len(inchikeys14l_combs)

In [None]:
df_combs_14l = pd.DataFrame(inchikeys14l_combs, columns=['from', 'to'])
# Two nodes of an undirected graph are joined by a path exactly when they lie
# in the same connected component, so label the components once instead of
# running a shortest-path search for every (from, to) row.
component_of_14l = {
    node: label
    for label, component in enumerate(nx.connected_components(participants_graph_14l))
    for node in component
}
# Keys that are not graph nodes map to NaN, and NaN never compares equal, so
# those pairs come out False rather than None; only `== True` is tested below.
df_combs_14l['connected'] = df_combs_14l['from'].map(component_of_14l).eq(
    df_combs_14l['to'].map(component_of_14l)
)
# Explicit copy: 'pair' is added to a standalone frame, not to a filtered slice.
df_combs_14l = df_combs_14l[df_combs_14l['connected'] == True].copy()
df_combs_14l['pair'] = df_combs_14l.progress_apply(get_inchikey14l_pair, axis=1)
# get_inchikey14l_pair returns False for pairs it cannot label.
df_combs_14l = df_combs_14l[df_combs_14l['pair'] != False]
df_combs_14l.to_csv('inchikey14l_connected.tsv', sep='\t')
len(df_combs_14l)

In [None]:
# Compare the pair labels found via first-segment keys with those found via
# full InChIKeys: only in 14l, only in full, and shared.
pairs_full = set(df_combs['pair'].to_list())
pairs_14l = set(df_combs_14l['pair'].to_list())
print(len(pairs_14l - pairs_full))
print(len(pairs_full - pairs_14l))
print(len(pairs_full & pairs_14l))

In [None]:
# Pair labels connected in the first-segment graph but not in the full-key graph.
set(df_combs_14l['pair'].to_list()).difference(df_combs['pair'].to_list())

In [None]:
# Show the path between one specific pair of first-segment keys.
nx.shortest_path(
    participants_graph_14l,
    source='NJOIWWRMLFSDTM',
    target='YNPGYMZVNLIZLD',
)