In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from matplotlib import pyplot as plt
%matplotlib notebook
import seaborn as sns
from IPython.display import display
# %pprint

In [13]:
def clean_data(rels_list):
    """Return one DataFrame per generation, stripped of the preceding frame's rows.

    The first frame is passed through untouched (the same object, not a copy).
    Every later frame has the rows removed whose positions are listed in the
    index of the frame just before it, and its index is renumbered from 0.

    Presumably each rels file repeats the previous generation's rows at its
    top, so what remains are the rows new to that generation -- TODO confirm.

    Parameters
    ----------
    rels_list : list of pandas.DataFrame
        Frames in generation order; must contain at least one frame.

    Returns
    -------
    list of pandas.DataFrame
        Same length as ``rels_list``.
    """
    per_generation = [rels_list[0]]
    for previous, current in zip(rels_list, rels_list[1:]):
        # The previous frame's index labels are used as *positions* into the
        # current frame's index to find the labels to drop.
        seen_positions = list(previous.index)
        fresh_rows = current.drop(current.index[seen_positions]).reset_index(drop=True)
        per_generation.append(fresh_rows)
    return per_generation


def gen_rxns(gen):
    """Group a relationship table by reaction.

    Parameters
    ----------
    gen : pandas.DataFrame
        Table with at least the ``RNo`` and ``Reaction_Name`` columns.

    Returns
    -------
    pandas.core.groupby.DataFrameGroupBy
        One group per distinct ``(RNo, Reaction_Name)`` pair.
    """
    return gen.groupby(["RNo", "Reaction_Name"])

def number_of_reactions(rels_list):
    """Count the reactions in each generation.

    The frames are first passed through ``clean_data``; the count for a
    generation is the number of ``(RNo, Reaction_Name)`` groups that
    ``gen_rxns`` finds in its cleaned frame.

    Parameters
    ----------
    rels_list : list of pandas.DataFrame
        Relationship tables in generation order.

    Returns
    -------
    list of int
        One count per generation, in the same order as ``rels_list``.
    """
    return [len(gen_rxns(frame)) for frame in clean_data(rels_list)]


def load_data(reaction_name, n_gens=5):
    """Load the per-generation relationship tables for both runs of a network.

    Reads ``rels_0.csv`` .. ``rels_{n_gens-1}.csv`` from
    ``{reaction_name}/old/Neo4j_Imports/rels/`` and from
    ``{reaction_name}/new/Neo4j_Imports/rels/``. The files are read as
    tab-delimited with the column names supplied explicitly.

    Parameters
    ----------
    reaction_name : str
        Directory (relative or absolute) that holds the ``old`` and ``new``
        run folders.
    n_gens : int, optional
        Number of generation files to read per run. Defaults to 5, which is
        what this function read before the parameter existed.

    Returns
    -------
    list of list of pandas.DataFrame
        ``[old_frames, new_frames]``, each in generation order.

    Raises
    ------
    FileNotFoundError
        If any of the expected files is missing.
    """
    columns = ["RNo", "Compound", "RorP", "Reaction_Name"]

    def read_run(variant):
        # One frame per generation file of the given run ('old' or 'new').
        return [
            pd.read_csv(
                f'{reaction_name}/{variant}/Neo4j_Imports/rels/rels_{gen_no}.csv',
                delimiter='\t',
                names=columns,
            )
            for gen_no in range(n_gens)
        ]

    return [read_run('old'), read_run('new')]


def load_data_4gens(reaction_name):
    """Load four generations of relationship tables for both runs of a network.

    Reads ``rels_0.csv`` .. ``rels_3.csv`` from
    ``{reaction_name}/old/Neo4j_Imports/rels/`` and from
    ``{reaction_name}/new/Neo4j_Imports/rels/``. The files are read as
    tab-delimited with the column names supplied explicitly.

    Parameters
    ----------
    reaction_name : str
        Directory (relative or absolute) that holds the ``old`` and ``new``
        run folders.

    Returns
    -------
    list of list of pandas.DataFrame
        ``[old_frames, new_frames]``, four frames each, in generation order.
    """
    columns = ["RNo", "Compound", "RorP", "Reaction_Name"]
    runs = []
    # 'old' first, then 'new', matching the order callers index into.
    for variant in ('old', 'new'):
        frames = []
        for gen_no in range(4):
            path = f'{reaction_name}/{variant}/Neo4j_Imports/rels/rels_{gen_no}.csv'
            frames.append(pd.read_csv(path, delimiter='\t', names=columns))
        runs.append(frames)
    return runs

def load_data_3gens(reaction_name):
    """Load three generations of relationship tables for both runs of a network.

    Reads ``rels_0.csv`` .. ``rels_2.csv`` from
    ``{reaction_name}/old/Neo4j_Imports/rels/`` and from
    ``{reaction_name}/new/Neo4j_Imports/rels/``. The files are read as
    tab-delimited with the column names supplied explicitly.

    Parameters
    ----------
    reaction_name : str
        Directory (relative or absolute) that holds the ``old`` and ``new``
        run folders.

    Returns
    -------
    list of list of pandas.DataFrame
        ``[old_frames, new_frames]``, three frames each, in generation order.
    """
    columns = ["RNo", "Compound", "RorP", "Reaction_Name"]
    runs = []
    # 'old' first, then 'new', matching the order callers index into.
    for variant in ('old', 'new'):
        frames = []
        for gen_no in range(3):
            path = f'{reaction_name}/{variant}/Neo4j_Imports/rels/rels_{gen_no}.csv'
            frames.append(pd.read_csv(path, delimiter='\t', names=columns))
        runs.append(frames)
    return runs




def relsdf_list(gen):
    """Convert a relationship table into a list of reaction records.

    Rows are grouped by ``(RNo, Reaction_Name)``. Within each group, compounds
    whose ``RorP`` is -1 are collected as reactants and those whose ``RorP``
    is 1 as products.

    Parameters
    ----------
    gen : pandas.DataFrame
        Table with ``RNo``, ``Compound``, ``RorP`` and ``Reaction_Name`` columns.

    Returns
    -------
    list of list
        One ``[(RNo, Reaction_Name), reactants, products]`` entry per group,
        in the order the groupby yields them.
    """
    reactions = []
    for key, members in gen.groupby(["RNo", "Reaction_Name"]):
        role = members['RorP']
        consumed = members.loc[role == -1, 'Compound'].values.tolist()
        produced = members.loc[role == 1, 'Compound'].values.tolist()
        reactions.append([key, consumed, produced])
    return reactions

def rxns_counter(list_of_rels):
    """Count how many times each reaction rule was applied.

    Every ``Reaction_Name`` found anywhere in ``list_of_rels`` starts at zero.
    The frames are then cleaned with ``clean_data`` and converted with
    ``relsdf_list``; each resulting reaction record adds one to the tally of
    its rule name.

    Parameters
    ----------
    list_of_rels : list of pandas.DataFrame
        Relationship tables in generation order.

    Returns
    -------
    dict
        Rule name -> count, ordered by ascending count.
    """
    per_generation = [relsdf_list(frame) for frame in clean_data(list_of_rels)]

    # Union of all rule names, so rules with no counted reaction still appear.
    rule_names = set()
    for frame in list_of_rels:
        rule_names = rule_names | set(frame.Reaction_Name.values.tolist())

    tally = {name: 0 for name in list(rule_names)}
    for reactions in per_generation:
        # Each record is [(RNo, Reaction_Name), reactants, products].
        for (_rno, rule), _reactants, _products in reactions:
            tally[rule] = tally[rule] + 1
    return dict(sorted(tally.items(), key=lambda pair: pair[1]))



def plotter(reaction_name,rels_list):
    """Plot reactions per generation for both runs as a grouped bar chart.

    Prints the total reaction count of each run, then draws one pair of bars
    per generation: blue ('non-iron') for ``rels_list[0]`` and red
    ('iron-inclusive') for ``rels_list[1]``, each annotated with its height.

    Parameters
    ----------
    reaction_name : str
        Network name used in the figure title.
    rels_list : list of list of pandas.DataFrame
        ``[first_run_frames, second_run_frames]``, each in generation order.
    """
    non_iron_counts = number_of_reactions(rels_list[0])
    iron_counts = number_of_reactions(rels_list[1])
    # Totals over all generations, for a quick numeric comparison.
    print(sum(non_iron_counts))
    print(sum(iron_counts))

    barWidth = 0.4
    fig, ax = plt.subplots()
    positions = np.arange(len(non_iron_counts))

    # The two series sit half a bar-width either side of each tick.
    non_iron_bars = ax.bar(
        positions - barWidth / 2, non_iron_counts,
        color='blue', width=barWidth, edgecolor='white', label='non-iron',
    )
    iron_bars = ax.bar(
        positions + barWidth / 2, iron_counts,
        color='red', width=barWidth, edgecolor='white', label='iron-inclusive',
    )

    ax.set_xlabel('Generations', fontweight='bold')
    ax.set_ylabel('N', fontweight='bold')
    ax.set_title(f'{reaction_name} Network | Number of reactions vs Generation')

    # Write each bar's value just above its top edge.
    for container in (non_iron_bars, iron_bars):
        for bar in container:
            height = bar.get_height()
            ax.annotate(
                '{0:.0f}'.format(height),
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(5, 4),
                textcoords='offset points',
                ha='center',
                va='bottom',
            )

    ax.legend()
    plt.show()

    
    
def reactions_df(rels_list):
    """Build a per-rule comparison table of the two runs.

    Rule counts from ``rxns_counter`` for ``rels_list[0]`` and
    ``rels_list[1]`` are right-merged on the rule name, so every rule counted
    for ``rels_list[1]`` gets a row. Missing values are filled with '0' and
    ``non_iron`` is converted back to numeric.

    NOTE(review): where ``non_iron`` is 0 and ``iron_inclusive`` is positive,
    the division yields +/-inf, which is then replaced by 0 -- so such rules
    report ``percent_change`` 0. Confirm that this is intended.

    Parameters
    ----------
    rels_list : list of list of pandas.DataFrame
        ``[first_run_frames, second_run_frames]``.

    Returns
    -------
    pandas.DataFrame
        Columns ``reaction_name``, ``non_iron``, ``iron_inclusive`` and
        ``percent_change``.
    """
    first_run = pd.DataFrame(rxns_counter(rels_list[0]).items())
    second_run = pd.DataFrame(rxns_counter(rels_list[1]).items())

    merged = pd.merge(
        first_run,
        second_run,
        how='right',
        on=0,
        sort=False,
        suffixes=("_nonFe", "_withFe"),
    ).fillna('0')
    merged.columns = ['reaction_name', 'non_iron', 'iron_inclusive']
    merged["non_iron"] = pd.to_numeric(merged["non_iron"])

    difference = merged["iron_inclusive"] - merged["non_iron"]
    merged['percent_change'] = difference * 100 / merged["non_iron"]
    return merged.replace([np.inf, -np.inf], 0)

In [14]:
# Load the formose tables for both runs (rels_0 .. rels_4 each) and plot
# the number of reactions per generation.
formose_rels=load_data('formose')
plotter('Formose',formose_rels)

54525
81049


<IPython.core.display.Javascript object>

In [15]:
# Load the pyruvic tables for both runs (rels_0 .. rels_4 each) and plot
# the number of reactions per generation.
pyruvic_rels=load_data('pyruvic')
plotter('Pyruvic',pyruvic_rels)

17446
45367


<IPython.core.display.Javascript object>




In [16]:
# Load the glucose tables for both runs (rels_0 .. rels_3 each) and plot
# the number of reactions per generation.
glucose_rels=load_data_4gens('glucose')
plotter('Glucose',glucose_rels)

19271
23661


<IPython.core.display.Javascript object>

In [17]:
# Load the maillard tables for both runs (rels_0 .. rels_2 each) and plot
# the number of reactions per generation.
maillard_rels=load_data_3gens('maillard')
plotter('Maillard',maillard_rels)

8856
10460


<IPython.core.display.Javascript object>

In [7]:
# Per-rule comparison table (counts in each run plus percent change) for
# every network loaded above.
formose_rxns=reactions_df(formose_rels)
pyruvic_rxns=reactions_df(pyruvic_rels)
glucose_rxns=reactions_df(glucose_rels)
maillard_rxns=reactions_df(maillard_rels)
# Collected in a fixed order (formose, pyruvic, glucose, maillard) so that
# later cells can loop over all four tables.
list_rxns=[formose_rxns,pyruvic_rxns,glucose_rxns,maillard_rxns]


In [9]:
# For each network, show the ten rules with the largest percent change.
for x in list_rxns:
    display(
        x.sort_values(by='percent_change', ascending=False)
         .loc[:, ['reaction_name', 'non_iron', 'iron_inclusive']]
         .head(10)
    )

Unnamed: 0,reaction_name,non_iron,iron_inclusive
1,Beta-gamma Unsaturated Acid Decarboxylation,12.0,54
0,Alpha-Beta Unsaturated Acid Decarboxylation,5.0,17
6,Alpha-Keto Acid Decarboxylation,78.0,187
22,"Ring Closure 5 membered O, O",398.0,946
13,"Michael Addition 0,2, (reverse)",221.0,520
14,"Ring Closure 6 membered O, O",227.0,527
7,"Ring Closure 7 membered O, O",82.0,190
9,Beta Decarboxylation,106.0,222
19,Benzilic Acid Rearrangement (inverse),365.0,670
17,Benzilic Acid Rearrangement,326.0,592


Unnamed: 0,reaction_name,non_iron,iron_inclusive
9,"Ring Closure 7 membered O, O",40.0,194
17,Hemiacetal Formation for 7 membered rings,87.0,321
18,"Ring Closure 6 membered O, O",170.0,543
22,Hemiacetal Formation for 6 membered rings,297.0,925
23,"Michael Addition 0,2, (reverse)",318.0,979
29,Hemiacetal Formation for 5 membered rings,522.0,1563
36,"Michael Addition 0,2,",1902.0,5528
6,Cannizarro 1,54.0,154
0,"Hemiacetal Formation for 7 membered rings, inv...",5.0,14
31,Elimination + enol to keto,654.0,1763


Unnamed: 0,reaction_name,non_iron,iron_inclusive
0,Beta-gamma Unsaturated Acid Decarboxylation,2.0,6
1,Alpha-Beta Unsaturated Acid Decarboxylation,3.0,8
5,Alpha-Keto Acid Decarboxylation,11.0,19
8,"Ring Closure 7 membered O, O",29.0,49
15,Benzilic Acid Rearrangement (inverse),65.0,103
14,"Ring Closure 5 membered O, O",66.0,102
13,Cannizarro 1,68.0,102
9,"Ring Closure 6 membered O, O",44.0,64
17,Benzilic Acid Rearrangement,99.0,144
11,"Hemiacetal Formation for 6 membered rings, inv...",53.0,68


Unnamed: 0,reaction_name,non_iron,iron_inclusive
43,"Strecker Degradation Dicarbonyl, H, H, H, C",13.0,22
44,"Strecker Degradation Dicarbonyl, H, H, C, H",13.0,22
61,"Strecker Degradation Dicarbonyl, C, H, C, H",52.0,88
60,"Strecker Degradation Dicarbonyl, C, H, H, C",52.0,88
34,Ammonolysis of Esters,9.0,15
35,"Amide Formation Hydrolysis, C",9.0,15
21,"Ring Closure 5 membered, inverse N, O",3.0,5
20,"Ring Closure 7 membered, inverse N, O",3.0,5
19,"Ring Closure 6 membered, inverse N, O",3.0,5
40,Cannizarro 1,13.0,21


In [18]:
# Write each network's table to rxn1.csv, rxn2.csv, ... following the order
# of list_rxns.
for i, x in enumerate(list_rxns, start=1):
    x.to_csv(f'rxn{i}.csv')