## Reactions Dataset
### Date: February 19, 2025
### Author: Selin Kaplanoglu

**Goal:** We want to use Neo4j and query gene-gene interactions to get information on them

In [1]:
#imports
from neo4j import GraphDatabase
from itertools import combinations
import random
import logging
import time
import json
import requests
import os
from dotenv import load_dotenv

In [2]:
#seeting up neo4j connection and inititializing the driver
load_dotenv()
uri = os.getenv("URI")  
username = os.getenv("USERNAME")          
password = os.getenv("PASSWORD")  

driver = GraphDatabase.driver(uri, auth=(username, password))


In [None]:
# # Query 1: Retrieve pathways along with their reactions
# pathway_query = """
# MATCH (p:Pathway)
# OPTIONAL MATCH (p)-[:hasEvent*]->(r:ReactionLikeEvent)
# OPTIONAL MATCH (p)-[:summation]->(s)
# RETURN p.displayName AS pathway_name, collect(DISTINCT {id: r.stId, name: r.displayName}) AS reactions, s.text AS summary
# """

# # Query 2: Retrieve all participating molecules/components for a given reaction
# reaction_query = """
# MATCH (r:ReactionLikeEvent {stId: $reaction_id})
# OPTIONAL MATCH (r)-[:input]->(input:PhysicalEntity)
# OPTIONAL MATCH (r)-[:output]->(output:PhysicalEntity)
# OPTIONAL MATCH (r)-[:catalystActivity]->(catalyst:CatalystActivity)
# OPTIONAL MATCH (r)-[:regulatedBy|regulator]->(regulator:Regulation)
# RETURN 
#     r.displayName AS reaction_name, 
#     collect(DISTINCT {name: input.displayName}) AS inputs, 
#     collect(DISTINCT {name: output.displayName}) AS outputs, 
#     collect(DISTINCT {name: catalyst.displayName}) AS catalysts,
#     collect(DISTINCT {name: regulator.displayName}) AS regulators
# """

# # Query 3: Retrieve components for a given participant
# participant_query = """
# MATCH (p:Complex {stId: $participant_id})
# OPTIONAL MATCH (p)-[:hasComponent|hasMember|hasCandidate*]->(component:PhysicalEntity)
# RETURN p.displayName AS complex_name, collect(DISTINCT component.displayName) AS components
# """

# combined_results = []

# with driver.session() as session:
#     pathway_result = session.run(pathway_query)

#     for record in pathway_result:
#         pathway_name = record["pathway_name"]
#         reactions = record["reactions"]
#         summary = record["summary"] if record["summary"] else ""

#         pathway_data = {
#             "pathway_name": pathway_name,
#             "summary": summary.replace('\u00a0', ' '),  # Replace non-breaking spaces
#             "reactions": []
#         }

#         # Loop through each reaction for the pathway
#         for reaction in reactions:
#             reaction_name = reaction["name"]

#             reaction_result = session.run(reaction_query, {"reaction_id": reaction_id})

#             reaction_data = {
#                 "reaction_name": reaction_name,
#                 "inputs": [],
#                 "outputs": [],
#                 "catalysts": [],
#                 "regulators": []
#             }

#             participant_ids = set()  # Store unique participant IDs for later lookup

#             for rec in reaction_result:
#                 reaction_data["inputs"].extend(rec["inputs"])
#                 reaction_data["outputs"].extend(rec["outputs"])
#                 reaction_data["catalysts"].extend(rec["catalysts"])
#                 reaction_data["regulators"].extend(rec["regulators"])

#                 # Collect unique participant IDs
#                 for category in ["inputs", "outputs", "catalysts", "regulators"]:
#                     for participant in rec[category]:
#                         if participant["id"]:
#                             participant_ids.add(participant["id"])

#             # Fetch components for each unique participant ID
#             for participant_id in participant_ids:
#                 participant_result = session.run(participant_query, {"participant_id": participant_id})
#                 for p_rec in participant_result:
#                     complex_name = p_rec["complex_name"]
#                     components = p_rec["components"]

#                     # Store components in the relevant category
#                     for category in ["inputs", "outputs", "catalysts", "regulators"]:
#                         for participant in reaction_data[category]:
#                             if participant["id"] == participant_id:
#                                 participant["components"] = components

#             # Append reaction details to pathway
#             pathway_data["reactions"].append(reaction_data)

#         # Add pathway data to results
#         combined_results.append(pathway_data)


# # Save results to JSON
# with open('reactions_output.json', 'w') as outfile:
#     json.dump(combined_results, outfile, indent=4)

# print("Data successfully written to reactions_output.json")


In [4]:
import json

# Debug: Print query before execution
print("Executing Neo4j query...")

gene_query = """
MATCH (n)-[:referenceDatabase]->(rd:ReferenceDatabase) 
WHERE toLower(rd.displayName) = toLower("ENSEMBL")
WITH DISTINCT n  
MATCH (pe:PhysicalEntity)-[:referenceEntity|referenceSequence|crossReference|referenceGene*]->(n)
WITH DISTINCT pe 
MATCH (rle:ReactionLikeEvent)
    OPTIONAL MATCH (rle)-[:input]->(input:PhysicalEntity)
    OPTIONAL MATCH (rle)-[:output]->(output:PhysicalEntity)
    OPTIONAL MATCH (rle)-[:catalystActivity]->(catalyst:CatalystActivity)
    OPTIONAL MATCH (rle)-[:physicalEntity]->(physicalEntity:PhysicalEntity)
    OPTIONAL MATCH (rle)-[:entityFunctionalStatus]->(entityFunctionalStatus:EntityFunctionalStatus)
    OPTIONAL MATCH (rle)-[:diseaseEntity]->(diseaseEntity:Disease)
    OPTIONAL MATCH (rle)-[:regulatedBy|regulator]->(regulator:Regulation)
    OPTIONAL MATCH (rle)-[:hasComponent|hasMember|hasCandidate|repeatedUnit]->(pe)
WITH DISTINCT pe, rle, input, output, catalyst, physicalEntity, entityFunctionalStatus, diseaseEntity, regulator
MATCH (:Species{taxId:"9606"})<-[:species]-(p:Pathway)-[:hasEvent]->(rle) 
RETURN pe.displayName AS gene_name,
       p.displayName AS pathway_name, 
       collect(DISTINCT rle.displayName) AS reactions,
       collect(DISTINCT input.displayName) AS inputs, 
       collect(DISTINCT output.displayName) AS outputs, 
       collect(DISTINCT catalyst.displayName) AS catalysts,
       collect(DISTINCT physicalEntity.displayName) AS physical_entities,
       collect(DISTINCT entityFunctionalStatus.displayName) AS entity_functional_status,
       collect(DISTINCT diseaseEntity.displayName) AS diseases,
       collect(DISTINCT regulator.displayName) AS regulators
ORDER BY gene_name, pathway_name
"""

genes_results_list = []
with driver.session() as session:
    try:
        # Run the query
        genes_results = session.run(gene_query)

        # Debug: Print raw data from Neo4j
        print("\nRaw Neo4j Query Output:")
        for record in genes_results:
            print(record)  # See raw data for debugging

            gene_data = {
                "gene_name": record.get("gene_name", "Unknown"),  # Handle missing values
                "pathway_name": record.get("pathway_name", "Unknown"),
                "reactions": record.get("reactions", []),  # Ensure lists exist
                "inputs": record.get("inputs", []),
                "outputs": record.get("outputs", []),
                "catalysts": record.get("catalysts", []),
                "physical_entities": record.get("physical_entities", []),
                "entity_functional_status": record.get("entity_functional_status", []),
                "diseases": record.get("diseases", []),
                "regulators": record.get("regulators", [])
            }

            # Debug: Print processed data step-by-step
            print("\nProcessed Data for Gene:")
            print(json.dumps(gene_data, indent=4))  

            genes_results_list.append(gene_data)

    except Exception as e:
        print(f"Error executing query: {e}")  # Print error message

# Debug: Check final JSON before saving
print("\nFinal JSON Output:")
print(json.dumps(genes_results_list, indent=4))

# Save results to JSON
with open('genes_output2.json', 'w') as outfile:
    json.dump(genes_results_list, outfile, indent=4)

print("JSON file saved successfully: genes_output2.json")


Executing Neo4j query...

Raw Neo4j Query Output:


In [5]:
# Retrieve all the interacttions that exist within the graph database.
interaction_query = """
MATCH (n:Interaction)-[:interactor]->(p) 
RETURN 
    n.displayName AS interaction , n.score AS confidence_score,
    collect(DISTINCT{name:p.displayName, function: p.comment}) AS gene_information
"""
interactions_results_list = []
with driver.session() as session:
    results = session.run(interaction_query)
    for record in results:
        interactions_results_list.append({
            "interaction": record["interaction"],
            "confidence_score": record["confidence_score"],
            "gene_information": record["gene_information"]
        })
    
driver.close()

with open('interactions_output.json', 'w') as outfile:
    json.dump(interactions_results_list, outfile, indent=4)

print("Data successfully written to interactions_output.json")

Data successfully written to interactions_output.json
