## Gene-Gene Pairwise Interactions Dataset
### Date: October 30, 2024
### Author: Selin Kaplanoglu

**Goal:** We want to create a dataset with gene-gene interactions of genes associated with biological process GO terms. These interactions can be different in nature, and have different points of evidence. 

In [1]:
from neo4j import GraphDatabase
from itertools import combinations
import random
import logging
import time
import json
import requests

In [3]:
# Neo4j connection details
uri = "bolt://localhost:7687"  # Replace with your Neo4j URI
username = "neo4j"            # Your Neo4j username
password = "ORNLneo4j1907"    # Your Neo4j password

# Initialize driver
driver = GraphDatabase.driver(uri, auth=(username, password))

# Logging setup
logging.basicConfig(
    filename='gene_pairwise.log',
    level=logging.INFO,
    format='%(asctime)s - %(message)s'
)

In [4]:
def query_genes_for_pathways(driver, input_file_path):
    """
    Query pathways for a list of genes and find common pathways between gene pairs.

    Parameters
    ----------
    driver: Neo4j database driver used to establish a session and run queries.
    input_file_path: List of gene names.

    Returns
    -------
    gene_names : list
        A list of unique gene names extracted from the input file.
        
    common_pathways_dict: dict
        A dictionary where keys are tuples of gene pairs (gene1, gene2) and values 
        are lists of dictionaries with common pathway IDs and names for those pairs.
    """
    logging.info("Starting to query genes for pathways...")
    start_time_total = time.time()

    with open(gene_file_path, "r") as f:
        parsed_data = json.load(f)  # Correct way to load JSON from a file
    
    # Extract genes
    gene_list = []
    for entry in parsed_data:
        gene_list.extend(entry["genes"].split(","))  # Splitting the gene string into a list
    gene_names = []
    for gene in gene_list:
        if gene not in gene_names:
            gene_names.append(gene)

    print("First 10 gene names:", gene_names[:10])  # Ensure list elements are properly extracted
    logging.info(f"Total genes loaded: {len(gene_names)}")
    gene_names = gene_names[:10]

    # Query for pathways associated with each gene
    query = query = """
    UNWIND $gene_names AS gene_name
    MATCH (n)-[:referenceDatabase]->(rd:ReferenceDatabase)
    WHERE toLower(rd.displayName) = "uniprot" AND 
      ANY(g IN n.geneName WHERE toLower(g) = toLower(gene_name))
    WITH DISTINCT gene_name, n
    MATCH (pe:PhysicalEntity)-[:referenceEntity|referenceSequence|crossReference|referenceGene*]->(n)
    WITH DISTINCT gene_name, pe
    MATCH (rle:ReactionLikeEvent)-[:input|output|catalystActivity|physicalEntity|entityFunctionalStatus|diseaseEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate|repeatedUnit*]->(pe)
    WITH DISTINCT gene_name, rle
    MATCH (:Species {taxId: "9606"})<-[:species]-(p:Pathway)-[:hasEvent]->(rle)
    RETURN DISTINCT gene_name AS gene, p.stId AS pathway_id, p.displayName AS pathway_name
    ORDER BY gene, pathway_id
    """
    with driver.session() as session:
        query_results = session.run(query, {"gene_names": gene_names})
        results_dict = {}
        for record in query_results:
            gene = record["gene"]
            pathway_info = {
                "pathway_id": record["pathway_id"],
                "pathway_name": record["pathway_name"]
            }
            if gene not in results_dict:
                results_dict[gene] = []
            results_dict[gene].append(pathway_info)

    common_pathways_dict = {}

    # Generate all possible gene pairs
    all_gene_pairs = list(combinations(gene_names, 2))
    logging.info(f"Using the first 10 genes to create {len(all_gene_pairs)} pairs for processing.")

    for gene1, gene2 in all_gene_pairs:
        pair_start_time = time.time()  # Start time for this pair
        pathways_gene1 = {(item["pathway_id"], item["pathway_name"]) for item in results_dict.get(gene1, [])}
        pathways_gene2 = {(item["pathway_id"], item["pathway_name"]) for item in results_dict.get(gene2, [])}
        
        # Find common pathways
        common_pathways = pathways_gene1.intersection(pathways_gene2)
        common_pathways_dict[f"{gene1} , {gene2}"] = [
            {"pathway_id": pathway[0], "pathway_name": pathway[1]} for pathway in common_pathways
        ]
          
          
        # Log time for this pair
        pair_end_time = time.time()
        elapsed_time_pair = pair_end_time - pair_start_time
        logging.info(f"Queried pathways for pair ({gene1}, {gene2}) in {elapsed_time_pair:.2f} seconds.")

    # Log total time
    end_time_total = time.time()
    total_elapsed_time = end_time_total - start_time_total
    logging.info(f"Total time for querying pathways: {total_elapsed_time:.2f} seconds.")

    return common_pathways_dict

In [8]:
gene_file_path = "/mnt/DGX01/Personal/slndir/geneset_datasets/go_terms_dict.json"
common_pathways_dict = query_genes_for_pathways(driver, gene_file_path)
print(common_pathways_dict)

First 10 gene names: ['VTN', 'ITGB3', 'ICAM1', 'ITGB1', 'ITGAV', 'ITGA5', 'ITGA2', 'ITGB6', 'FBN1', 'CCN3']
{'VTN , ITGB3': [{'pathway_id': 'R-HSA-216083', 'pathway_name': 'Integrin cell surface interactions'}, {'pathway_id': 'R-HSA-2129379', 'pathway_name': 'Molecules associated with elastic fibres'}, {'pathway_id': 'R-HSA-3000170', 'pathway_name': 'Syndecan interactions'}, {'pathway_id': 'R-HSA-3000178', 'pathway_name': 'ECM proteoglycans'}], 'VTN , ICAM1': [{'pathway_id': 'R-HSA-216083', 'pathway_name': 'Integrin cell surface interactions'}], 'VTN , ITGB1': [{'pathway_id': 'R-HSA-216083', 'pathway_name': 'Integrin cell surface interactions'}, {'pathway_id': 'R-HSA-2129379', 'pathway_name': 'Molecules associated with elastic fibres'}, {'pathway_id': 'R-HSA-3000170', 'pathway_name': 'Syndecan interactions'}, {'pathway_id': 'R-HSA-3000178', 'pathway_name': 'ECM proteoglycans'}], 'VTN , ITGAV': [{'pathway_id': 'R-HSA-216083', 'pathway_name': 'Integrin cell surface interactions'}, {'path

In [13]:
def query_string_interactions(common_pathways_dict):
    """
    Queries STRING API for protein-protein interactions for every pairwise combination of genes.

    Parameters
    ----------
    common_pathways_dict : dict
        A dictionary where keys are gene pairs and values are their shared pathways.

    Returns
    -------
    interactions_dict : dict
        A dictionary where keys are gene pairs and values are their interaction data from STRING.
    """
    string_api_url = "https://version-11-5.string-db.org/api"
    output_format = "json"
    method = "network"

    request_url = "/".join([string_api_url, output_format, method])

    interactions_dict = {}
    
    logging.info(f"Querying STRING for {len(common_pathways_dict)} gene pairs...")

    for gene_pair in common_pathways_dict.keys():
        try:
            gene1, gene2 = gene_pair.split(" , ")  # Correctly split gene pairs
            
            params = {
                "identifiers": f"{gene1}%0d{gene2}",  # Properly formatted query
                "species": 9606,  # Human species ID
            }

            response = requests.post(request_url, data=params)

            if response.status_code == 200:
                interactions = response.json()  # Convert response to JSON
                
                # Store interaction data in the dictionary
                interactions_dict[f"{gene1} - {gene2}"] = interactions
                logging.info(f"Successfully retrieved data for {gene1} - {gene2}.")
            else:
                logging.warning(f"Failed to retrieve data for {gene1} - {gene2}: HTTP {response.status_code}")

        except requests.exceptions.RequestException as e:
            logging.error(f"Error querying STRING API for {gene1} - {gene2}: {e}")

        time.sleep(1)  # To prevent API rate-limiting

    logging.info("Finished querying STRING API.")
    return interactions_dict

In [14]:
interactions_dict= query_string_interactions(common_pathways_dict)
print(interactions_dict)

{'VTN - ITGB3': [{'stringId_A': '9606.ENSP00000226218', 'stringId_B': '9606.ENSP00000452786', 'preferredName_A': 'VTN', 'preferredName_B': 'ITGB3', 'ncbiTaxonId': '9606', 'score': 0.996, 'nscore': 0, 'fscore': 0, 'pscore': 0, 'ascore': 0.063, 'escore': 0.235, 'dscore': 0.9, 'tscore': 0.951}, {'stringId_A': '9606.ENSP00000226218', 'stringId_B': '9606.ENSP00000452786', 'preferredName_A': 'VTN', 'preferredName_B': 'ITGB3', 'ncbiTaxonId': '9606', 'score': 0.996, 'nscore': 0, 'fscore': 0, 'pscore': 0, 'ascore': 0.063, 'escore': 0.235, 'dscore': 0.9, 'tscore': 0.951}], 'VTN - ICAM1': [{'stringId_A': '9606.ENSP00000226218', 'stringId_B': '9606.ENSP00000264832', 'preferredName_A': 'VTN', 'preferredName_B': 'ICAM1', 'ncbiTaxonId': '9606', 'score': 0.622, 'nscore': 0, 'fscore': 0, 'pscore': 0, 'ascore': 0, 'escore': 0, 'dscore': 0, 'tscore': 0.622}, {'stringId_A': '9606.ENSP00000226218', 'stringId_B': '9606.ENSP00000264832', 'preferredName_A': 'VTN', 'preferredName_B': 'ICAM1', 'ncbiTaxonId': '9

In [23]:
def find_pathway_reaction(driver, common_pathways_dict):
    """
    Function to find any reactions associated with the pathways in the common_pathways dictionary.

    Parameters
    ----------
    driver : Neo4j database driver used to establish a session and run queries.
    common_pathways_dict : dict
        A dictionary where keys are tuples of gene pairs (gene1, gene2) and values
        are lists of dictionaries with pathway information, including 'pathway_id'.

    Returns
    -------
    reactions_dict : list
        A list of dictionaries with reaction data.
    """
    logging.info("Starting to query pathways for reactions...")
    start_time_total = time.time()

    # Initialize reactions_dict
    reactions_dict = []

    # Iterate through each gene pair and their common pathways
    for gene_pair, pathways in common_pathways_dict.items():
        gene1, gene2 = gene_pair.split(" , ")  # Splitting the gene pair string

        for pathway in pathways:
            pathway_id = pathway['pathway_id']
            pathway_name = pathway['pathway_name']

            pathway_start_time = time.time()

            # Query for reactions
            query = """
            MATCH (p:Pathway {stId: $pathway_id})-[:hasEvent*]->(rle:ReactionLikeEvent)
            RETURN DISTINCT
                $gene1 + '-' + $gene2 AS gene_pair,
                p.displayName AS pathway_name,
                rle.displayName AS reaction_name
            ORDER BY pathway_name, reaction_name
            """
            with driver.session() as session:
                result = session.run(query, {
                    "pathway_id": pathway_id,
                    "gene1": gene1,
                    "gene2": gene2
                })

                # Store reactions within the pathway entry
                reactions = [{"reaction_name": record["reaction_name"]} for record in result]

                # Add reactions to the pathway and append it to the reactions_dict
                if reactions:
                    pathway["reactions"] = reactions
                    reactions_dict.append({
                        "gene_pair": f"{gene1} - {gene2}",
                        "pathway_name": pathway_name,
                        "pathway_id": pathway_id,
                        "reactions": reactions
                    })

            pathway_end_time = time.time()
            logging.info(f"Queried reactions for pathway '{pathway_name}' in pair ({gene1}, {gene2}) in {pathway_end_time - pathway_start_time:.2f} seconds.")

    logging.info(f"Total time for querying reactions: {time.time() - start_time_total:.2f} seconds.")
    return reactions_dict
