## Reactions Dataset
### Date: February 19, 2025
### Author: Selin Kaplanoglu

**Goal:** We want to use Neo4j and query gene-gene interactions to get information on them

In [1]:
#imports
from neo4j import GraphDatabase
from itertools import combinations
import random
import logging
import time
import json
import requests
import os
from dotenv import load_dotenv

In [2]:
#seeting up neo4j connection and inititializing the driver
load_dotenv()
uri = os.getenv("URI")  
username = os.getenv("USERNAME")          
password = os.getenv("PASSWORD")  

driver = GraphDatabase.driver(uri, auth=(username, password))


In [3]:
import json
from neo4j import GraphDatabase

print("Executing Neo4j query...")

gene_query = """
MATCH (n)-[:referenceDatabase]->(rd:ReferenceDatabase) 
WHERE toLower(rd.displayName) = toLower("ENSEMBL")
WITH DISTINCT n  
MATCH (pe:PhysicalEntity)-[:referenceEntity|referenceSequence|crossReference|referenceGene*]->(n)
WITH DISTINCT pe 
MATCH (:Species{taxId:"9606"})<-[:species]-(p:Pathway)-[:hasEvent]->(rle) 
MATCH (rle:ReactionLikeEvent)
    OPTIONAL MATCH (rle)-[:input]->(input:PhysicalEntity)
    OPTIONAL MATCH (rle)-[:output]->(output:PhysicalEntity)
    OPTIONAL MATCH (rle)-[:catalystActivity]->(catalyst:CatalystActivity)
    OPTIONAL MATCH (rle)-[:physicalEntity]->(physicalEntity:PhysicalEntity)
    OPTIONAL MATCH (rle)-[:entityFunctionalStatus]->(entityFunctionalStatus:EntityFunctionalStatus)
    OPTIONAL MATCH (rle)-[:diseaseEntity]->(diseaseEntity:Disease)
    OPTIONAL MATCH (rle)-[:regulatedBy|regulator]->(regulator:Regulation)
    OPTIONAL MATCH (rle)-[:hasComponent|hasMember|hasCandidate|repeatedUnit]->(pe)
WITH DISTINCT p, pe, rle, input, output, catalyst, physicalEntity, entityFunctionalStatus, diseaseEntity, regulator
RETURN pe.displayName AS gene_name,
       p.displayName AS pathway_name, 
       collect(DISTINCT rle.displayName) AS reactions,
       collect(DISTINCT input.displayName) AS inputs, 
       collect(DISTINCT output.displayName) AS outputs, 
       collect(DISTINCT catalyst.displayName) AS catalysts,
       collect(DISTINCT physicalEntity.displayName) AS physical_entities,
       collect(DISTINCT entityFunctionalStatus.displayName) AS entity_functional_status,
       collect(DISTINCT diseaseEntity.displayName) AS diseases,
       collect(DISTINCT regulator.displayName) AS regulators
ORDER BY gene_name, pathway_name
"""

genes_results_dict = {}

with driver.session() as session:
    try:
        # Run the query
        genes_results = session.run(gene_query)

        # Debug: Print raw data from Neo4j
        print("\nRaw Neo4j Query Output:")
        for record in genes_results:
            print(record)  # See raw data for debugging
            
            gene_name = record.get("gene_name", "Unknown")

            pathway_data = {
                "pathway_name": record.get("pathway_name", "Unknown"),
                "reactions": record.get("reactions", []),
                "inputs": record.get("inputs", []),
                "outputs": record.get("outputs", []),
                "catalysts": record.get("catalysts", []),
                "physical_entities": record.get("physical_entities", []),
                "entity_functional_status": record.get("entity_functional_status", []),
                "diseases": record.get("diseases", []),
                "regulators": record.get("regulators", [])
            }

            if gene_name in genes_results_dict:
                genes_results_dict[gene_name]["pathways"].append(pathway_data)
            else:
                genes_results_dict[gene_name] = {
                    "gene_name": gene_name,
                    "pathways": [pathway_data]
                }

    except Exception as e:
        print(f"Error executing query: {e}")  # Print error message

# Convert dictionary to list for JSON output
genes_results_list = list(genes_results_dict.values())

# Debug: Check final JSON before saving
print("\nFinal JSON Output:")
print(json.dumps(genes_results_list, indent=4))

# Save results to JSON
with open('genes_output3.json', 'w') as outfile:
    json.dump(genes_results_list, outfile, indent=4)

print("JSON file saved successfully: genes_output3.json")


Executing Neo4j query...

Raw Neo4j Query Output:
<Record gene_name='((1,6)-alpha-glucosyl)poly((1,4)-alpha-glucosyl)GYG1 [cytosol]' pathway_name='2-LTR circle formation' reactions=['Association of  XRCC4:DNA ligase IV complex with viral DNA ends', '2-LTR formation due to circularization of viral DNA', 'Association of Ku heterodimer with viral DNA ends'] inputs=['Ku proteins bound to viral DNA [nucleoplasm]', 'XRCC4:LIG4 [nucleoplasm]', 'viral DNA:Ku proteins:XRCC4:DNA ligase IV complex [nucleoplasm]', "IN bound to sticky 3' ends of viral DNA in PIC [nucleoplasm]", 'XRCC5:XRCC6 [nucleoplasm]'] outputs=['viral DNA:Ku proteins:XRCC4:DNA ligase IV complex [nucleoplasm]', 'IN (Integrase) (P04585) protein [nucleoplasm]', 'XRCC4:LIG4 [nucleoplasm]', '2-LTR form of circular viral DNA [nucleoplasm]', 'XRCC5:XRCC6 [nucleoplasm]', 'Ku proteins bound to viral DNA [nucleoplasm]', 'viral PIC proteins [nucleoplasm]'] catalysts=['DNA ligase activity of viral DNA:Ku proteins:XRCC4:DNA ligase IV comple

In [None]:
# Retrieve all the interacttions that exist within the graph database.
interaction_query = """
MATCH (n:Interaction)-[:interactor]->(p) 
RETURN 
    n.displayName AS interaction , n.score AS confidence_score,
    collect(DISTINCT{name:p.displayName, function: p.comment}) AS gene_information
"""
interactions_results_list = []
with driver.session() as session:
    results = session.run(interaction_query)
    for record in results:
        interactions_results_list.append({
            "interaction": record["interaction"],
            "confidence_score": record["confidence_score"],
            "gene_information": record["gene_information"]
        })
    
driver.close()

with open('interactions_output.json', 'w') as outfile:
    json.dump(interactions_results_list, outfile, indent=4)

print("Data successfully written to interactions_output.json")