In [1]:
import pandas as pd

In [2]:
def load_kg_node_map(filepath):
    """Load the knowledge-graph node map from a tab-separated file.

    Parameters
    ----------
    filepath : str or file-like
        Location of the tab-delimited file, passed straight to
        ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, one row per line of the file, with the first
        line used as the column header.
    """
    node_map_table = pd.read_csv(filepath, sep='\t')
    return node_map_table

def load_kg_edgelist(filepath):
    """Load the knowledge-graph edge list from a tab-separated file.

    Parameters
    ----------
    filepath : str or file-like
        Location of the tab-delimited file, passed straight to
        ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, one row per line of the file, with the first
        line used as the column header.
    """
    edge_table = pd.read_csv(filepath, sep='\t')
    return edge_table

def map_phenotypes_to_genes(phenotypes, kg_node_map, kg_edgelist, relation_keyword='gene/protein'):
    """Map each phenotype ID to the names of nodes linked to it by matching edges.

    For every phenotype, its row is looked up in ``kg_node_map`` by
    ``node_id``. Edges of ``kg_edgelist`` whose ``full_relation`` contains
    ``relation_keyword`` and that touch the phenotype's ``node_idx`` on
    either end are selected, and the ``node_name`` of every *other*
    endpoint is collected.

    Parameters
    ----------
    phenotypes : iterable
        Phenotype identifiers, compared against ``kg_node_map['node_id']``.
    kg_node_map : pandas.DataFrame
        Must provide the columns ``node_id``, ``node_idx`` and ``node_name``.
    kg_edgelist : pandas.DataFrame
        Must provide the columns ``x_idx``, ``y_idx`` and ``full_relation``.
    relation_keyword : str, optional
        Literal substring an edge's ``full_relation`` must contain to be
        considered. Defaults to ``'gene/protein'``.

    Returns
    -------
    dict
        ``{phenotype: [node_name, ...]}``. Names appear in ``kg_node_map``
        row order. A phenotype that is absent from the node map, or that has
        no matching edges, maps to an empty list.
    """
    # The relation filter does not depend on the phenotype, so evaluate it once
    # instead of re-scanning every edge label on each loop iteration.
    # regex=False matches the keyword literally; na=False makes edges with a
    # missing relation label explicitly non-matching.
    relation_mask = kg_edgelist['full_relation'].str.contains(
        relation_keyword, regex=False, na=False
    )
    relation_edges = kg_edgelist[relation_mask]

    phenotype_gene_mapping = {}

    for phenotype in phenotypes:
        # Find the node_idx for the phenotype
        phenotype_node = kg_node_map[kg_node_map['node_id'] == phenotype]

        if phenotype_node.empty:
            phenotype_gene_mapping[phenotype] = []
            continue

        # If node_id is duplicated in the node map, the first match wins.
        phenotype_idx = phenotype_node['node_idx'].values[0]

        # Edges that touch the phenotype on either end
        touching = relation_edges[
            (relation_edges['x_idx'] == phenotype_idx)
            | (relation_edges['y_idx'] == phenotype_idx)
        ]

        # Collect both endpoints, then drop the phenotype itself so only the
        # opposite ends of the edges remain.
        endpoints = pd.concat([touching['x_idx'], touching['y_idx']])
        neighbour_indices = endpoints[endpoints != phenotype_idx].unique()

        # Map neighbour indices to node names
        gene_names = kg_node_map.loc[
            kg_node_map['node_idx'].isin(neighbour_indices), 'node_name'
        ].tolist()

        phenotype_gene_mapping[phenotype] = gene_names

    return phenotype_gene_mapping


In [None]:

# Read the knowledge-graph node map and edge list from their tab-separated files
kg_node_map = load_kg_node_map('./KG_node_map_test.txt')
kg_edgelist = load_kg_edgelist('KG_edgelist.txt')

# Example patient record; only "positive_phenotypes" is consumed below
patient_data = {
    "true_genes": ["ENSG00000173801"],
    "positive_phenotypes": [
        "HP:0000982",
        "HP:0004751",
        "HP:0010719",
        "HP:0000113",
        "HP:0001324",
        "HP:0001640",
        "HP:0003202",
        "HP:0000708",
        "HP:0011675",
        "HP:0001948",
        "HP:0000956",
        "HP:0001645",
        "HP:0200114",
        "HP:0005881",
        "HP:0025230",
        "HP:0003010",
        "HP:0000540",
    ],
    "all_candidate_genes": [
        "ENSG00000185339",
        "ENSG00000157423",
        "ENSG00000130638",
        "ENSG00000073756",
        "ENSG00000082701",
        "ENSG00000168000",
        "ENSG00000173801",
        "ENSG00000136732",
        "ENSG00000147655",
        "ENSG00000162426",
        "ENSG00000115361",
        "ENSG00000090932",
        "ENSG00000117425",
        "ENSG00000133107",
        "ENSG00000116679",
        "ENSG00000122641",
        "ENSG00000070614",
        "ENSG00000055118",
        "ENSG00000166685",
        "ENSG00000115760",
        "ENSG00000166603",
    ],
    "true_diseases": ["34217"],
}

positive_phenotypes = patient_data["positive_phenotypes"]

# Look up, for each positive phenotype, the names of its gene/protein neighbours
phenotype_gene_mapping = map_phenotypes_to_genes(
    positive_phenotypes,
    kg_node_map,
    kg_edgelist,
)

# Report one blank-line-separated entry per phenotype
for phenotype, genes in phenotype_gene_mapping.items():
    print(f"Phenotype: {phenotype}")
    print(f"Connected Genes: {', '.join(genes) if genes else 'No connected genes found'}")
    print()