## Reactions Dataset
### Date: February 19, 2025
### Author: Selin Kaplanoglu

**Goal:** We want to use Neo4j and query gene-gene interactions to get information on them

In [1]:
#imports
from neo4j import GraphDatabase
from itertools import combinations
import random
import logging
import time
import json
import requests
import os
from dotenv import load_dotenv
import time

In [2]:
#seeting up neo4j connection and inititializing the driver
load_dotenv()
uri = os.getenv("URI")  
username = os.getenv("USERNAME")          
password = os.getenv("PASSWORD")  

driver = GraphDatabase.driver(uri, auth=(username, password))


In [3]:
BATCH_SIZE = 1000
LAST_ID_FILE = "last_id.txt"

if os.path.exists(LAST_ID_FILE):
    with open(LAST_ID_FILE, "r") as f:
        last_id = int(f.read().strip())
else:
    last_id = -1  # Start from scratch

while True:
    print(f"\n Executing batch with last_id > {last_id}...")

    batched_query = """
    MATCH (n)-[:referenceDatabase]->(rd:ReferenceDatabase)
    WHERE toLower(rd.displayName) = toLower("ENSEMBL")
    WITH DISTINCT n  
    MATCH (pe:PhysicalEntity)-[:referenceEntity|referenceSequence|crossReference|referenceGene*]->(n)
    WHERE id(pe) > $last_id
    WITH DISTINCT pe ORDER BY id(pe) LIMIT $batch_size

    MATCH (pe)<-[:input|output|catalystActivity|regulatedBy|regulator*0..]-(rle:ReactionLikeEvent)
        OPTIONAL MATCH (rle)-[:input]->(input:PhysicalEntity)
        OPTIONAL MATCH (rle)-[:output]->(output:PhysicalEntity)
        OPTIONAL MATCH (rle)-[:catalystActivity]->(catalyst:CatalystActivity)
        OPTIONAL MATCH (rle)-[:regulatedBy|regulator]->(regulator:Regulation)
        OPTIONAL MATCH (rle)-[:diseaseEntity]->(diseaseEntity:Disease)
    MATCH (:Species{taxId:"9606"})<-[:species]-(p:Pathway)-[:hasEvent]->(rle)
    WITH DISTINCT pe, rle, input, output, catalyst, regulator, diseaseEntity, p
    RETURN pe.displayName AS gene_name,
           id(pe) AS pe_id,
           p.displayName AS pathway_name, 
           collect(DISTINCT rle.displayName) AS reactions,
           collect(DISTINCT input.displayName) AS inputs, 
           collect(DISTINCT output.displayName) AS outputs, 
           collect(DISTINCT catalyst.displayName) AS catalysts,
           collect(DISTINCT regulator.displayName) AS regulators,
           collect(DISTINCT diseaseEntity.displayName) AS diseases
    ORDER BY gene_name, pathway_name
    """

    with driver.session() as session:
        try:
            results = session.run(batched_query, last_id=last_id, batch_size=BATCH_SIZE)
            batch_results = {}
            batch_count = 0

            for record in results:
                batch_count += 1
                gene_name = record.get("gene_name", "Unknown")
                pe_id = record.get("pe_id", last_id)

                pathway_data = {
                    "pathway_name": record.get("pathway_name", "Unknown"),
                    "reactions": record.get("reactions", []),
                    "inputs": record.get("inputs", []),
                    "outputs": record.get("outputs", []),
                    "catalysts": record.get("catalysts", []),
                    "regulators": record.get("regulators", []),
                    "diseases": record.get("diseases", [])
                }

                if gene_name in batch_results:
                    batch_results[gene_name]["pathways"].append(pathway_data)
                else:
                    batch_results[gene_name] = {
                        "gene_name": gene_name,
                        "pathways": [pathway_data]
                    }

                last_id = max(last_id, pe_id)

            print(f"Processed {batch_count} records up to pe_id {last_id}")

            if batch_count == 0:
                print("All data processed!")
                break

            # Save batch output to its own JSON file
            output_file = f"genes_batch_{last_id}.json"
            with open(output_file, "w") as outfile:
                json.dump(list(batch_results.values()), outfile, indent=4)
            print(f"Saved batch to {output_file}")

            # Update progress
            with open(LAST_ID_FILE, "w") as f:
                f.write(str(last_id))

        except Exception as e:
            print(f"Error in batch: {e}")
            break

    # Optional: delay between batches
    time.sleep(0.5)

print("Done!")

import glob

print("Merging all batch JSON files into one...")

all_data = []

for filename in sorted(glob.glob("genes_batch_*.json")):
    with open(filename, "r") as f:
        batch = json.load(f)
        all_data.extend(batch)

# Save merged output
with open("gene_pathways_output.json", "w") as f:
    json.dump(all_data, f, indent=4)

print("Merged into gene_pathways_output.json")

for filename in glob.glob("genes_batch_*.json"):
    os.remove(filename)
print("Cleaned up batch files.")


🚀 Executing batch with last_id > -1...
✅ Processed 33 records up to pe_id 2740
📝 Saved batch to genes_batch_2740.json

🚀 Executing batch with last_id > 2740...
✅ Processed 24 records up to pe_id 5734
📝 Saved batch to genes_batch_5734.json

🚀 Executing batch with last_id > 5734...
✅ Processed 53 records up to pe_id 9854
📝 Saved batch to genes_batch_9854.json

🚀 Executing batch with last_id > 9854...
✅ Processed 15 records up to pe_id 13733
📝 Saved batch to genes_batch_13733.json

🚀 Executing batch with last_id > 13733...
✅ Processed 22 records up to pe_id 17816
📝 Saved batch to genes_batch_17816.json

🚀 Executing batch with last_id > 17816...
✅ Processed 43 records up to pe_id 21535
📝 Saved batch to genes_batch_21535.json

🚀 Executing batch with last_id > 21535...
✅ Processed 67 records up to pe_id 25079
📝 Saved batch to genes_batch_25079.json

🚀 Executing batch with last_id > 25079...
✅ Processed 122 records up to pe_id 28295
📝 Saved batch to genes_batch_28295.json

🚀 Executing batch 