In [None]:
# Connect the notebook to Google Drive (requires the Google Colab runtime).
# The Drive is mounted at /content/drive; force_remount=True is passed so that
# re-running this cell remounts instead of reusing an existing mount
# (NOTE(review): behaviour per the google.colab API - confirm).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Navigate to the shared drive folder. The path assumes the Drive is mounted
# at /content/drive; each path component containing a space is quoted.
%cd /content/drive/'My Drive'/'Cardiovascular Knowledge Graph'
# Sanity check: print the working directory and list its contents.
!pwd
!ls

/content/drive/My Drive/Cardiovascular Knowledge Graph
/content/drive/My Drive/Cardiovascular Knowledge Graph
cardiac_conduction.ipynb  data


In [None]:
# Move into the data sub-folder; later cells refer to the Reactome input
# files by bare file name, i.e. relative to this working directory.
%cd data

/content/drive/My Drive/Cardiovascular Knowledge Graph/data


In [3]:
import pandas as pd
import re
import json

def extract_main_pathway(file_name, pathway_name = 'Cardiac conduction'):
    """
    Collect the pathway IDs of every row that mentions ``pathway_name``.

    file_name : path to a tab-separated mapping file such as
        UniProt2Reactome_All_Levels.txt; it is read line by line.
    pathway_name : text to look for. A row is selected when this text occurs
        anywhere in the line (case-sensitive substring match).

    Returns a list holding the second tab-separated field of each selected
    row, in file order. Duplicates are kept (one entry per matching row).
    Selected rows that contain fewer than two tab characters have no complete
    second field and are skipped.
    """
    main_pathways = []

    # `with` closes the file handle even if reading raises part-way through.
    with open(file_name, 'r') as database:
        for line in database:
            if pathway_name not in line:
                continue
            # maxsplit=2 is enough: only the field between the first and the
            # second tab is needed.
            fields = line.split('\t', 2)
            if len(fields) < 3:
                continue
            main_pathways.append(fields[1])

    return main_pathways

def pathway_hierarchy(main_file, hierarchy_file, pathway_name = 'Cardiac conduction'):
    """
    Map each main pathway ID to the entries listed under it in the hierarchy file.

    main_file : mapping file handed to extract_main_pathway()
        (e.g. UniProt2Reactome_All_Levels.txt).
    hierarchy_file : tab-separated relation file (e.g. ReactomePathwaysRelation.txt).
        The text before the first tab of a line is treated as the parent ID and
        the rest of the line, without its trailing newline, as the child entry.
    pathway_name : forwarded to extract_main_pathway().

    Returns {main pathway ID : [child entries]}. Every main pathway ID is
    present as a key, with an empty list when it has no children. Keys follow
    the order in which the IDs first appear in ``main_file``; children follow
    the order of ``hierarchy_file``.

    A parent matches only when it is exactly equal to a main pathway ID, so an
    ID that merely contains another ID as a substring is not picked up.
    Lines without a tab are ignored.
    """
    # dict.fromkeys() de-duplicates while keeping first-seen order, which makes
    # the result reproducible between runs (iteration order of a set of
    # strings is not).
    unique_ids = dict.fromkeys(extract_main_pathway(main_file, pathway_name))
    hierarchy_dict = {pathway_id: [] for pathway_id in unique_ids}

    # Single pass over the relation file; `with` guarantees the handle is closed.
    with open(hierarchy_file, 'r') as database:
        for line in database:
            # rstrip('\n') instead of slicing off the last character, so a final
            # line without a trailing newline keeps its full child ID.
            parent, separator, child = line.rstrip('\n').partition('\t')
            if separator and parent in hierarchy_dict:
                hierarchy_dict[parent].append(child)

    return hierarchy_dict

def pathway_to_protein(main_file, hierarchy_file, pathway_name = 'Cardiac conduction'):
    """
    {sub-pathway ID : [protein accession IDs]}
    %%under development ; still incomplete

    main_file : tab-separated mapping file (e.g. UniProt2Reactome_All_Levels.txt);
        the first field of a row is taken as the protein accession and the
        second field as the pathway ID.
    hierarchy_file, pathway_name : forwarded to pathway_hierarchy().

    Every sub-pathway returned by pathway_hierarchy() appears as a key, with an
    empty list when no row of ``main_file`` refers to it. Proteins are listed
    in file order, one entry per matching row (duplicates are kept). Keys follow
    the order in which the sub-pathways first appear in the hierarchy result.

    A row is attributed to a sub-pathway only when its second field equals the
    sub-pathway ID exactly; rows without a second field are ignored.
    """
    hierarchy_dict = pathway_hierarchy(main_file, hierarchy_file, pathway_name)

    # Pre-create one bucket per distinct sub-pathway. setdefault() keeps the
    # first-seen order and collapses IDs listed under several parents.
    sub_pathway_protein = {}
    for children in hierarchy_dict.values():
        for sub_pathway in children:
            sub_pathway_protein.setdefault(sub_pathway, [])

    # One pass over the mapping file with a dictionary lookup per row, instead
    # of rescanning the whole file once for every sub-pathway.
    with open(main_file, 'r') as main_data_file:
        for line in main_data_file:
            fields = line.rstrip('\n').split('\t', 2)
            if len(fields) > 1 and fields[1] in sub_pathway_protein:
                sub_pathway_protein[fields[1]].append(fields[0])

    return sub_pathway_protein

def pathway_id_to_name(main_file, hierarchy_file, pathway_name = 'Cardiac conduction'):
    """
    {pathway ID : pathway name}

    main_file : tab-separated mapping file (e.g. UniProt2Reactome_All_Levels.txt);
        the second field of a row is taken as the pathway ID and the fourth
        field as the pathway name.
    hierarchy_file, pathway_name : forwarded to pathway_to_protein(), whose keys
        define which pathway IDs are looked up.

    Only IDs that occur (as an exact second-field match) in at least one row
    with four or more fields are present in the result. When an ID occurs on
    several rows, the name from the last such row is kept. Keys follow the
    order of the pathway_to_protein() result.
    """
    wanted_ids = pathway_to_protein(main_file, hierarchy_file, pathway_name)

    # One pass over the file; later rows overwrite earlier ones (last match wins).
    found = {}
    with open(main_file, 'r') as main_data_file:
        for line in main_data_file:
            fields = line.rstrip('\n').split('\t')
            if len(fields) > 3 and fields[1] in wanted_ids:
                found[fields[1]] = fields[3]

    # Re-key in the order of the requested IDs rather than file order.
    id_to_name = {pathway_id: found[pathway_id]
                  for pathway_id in wanted_ids if pathway_id in found}

    return id_to_name

def pathway_id_to_organism(main_file, hierarchy_file, pathway_name = 'Cardiac conduction'):
    """
    {pathway ID : organism name}

    main_file : tab-separated mapping file (e.g. UniProt2Reactome_All_Levels.txt);
        the second field of a row is taken as the pathway ID and everything
        after the fifth tab (without the trailing newline) as the organism.
    hierarchy_file, pathway_name : forwarded to pathway_id_to_name(), whose keys
        define which pathway IDs are looked up.

    Only IDs that occur (as an exact second-field match) in at least one row
    with six or more fields are present in the result. When an ID occurs on
    several rows, the organism from the last such row is kept. Keys follow the
    order of the pathway_id_to_name() result.
    """
    id_name_dict = pathway_id_to_name(main_file, hierarchy_file, pathway_name)

    # One pass over the file; later rows overwrite earlier ones (last match wins).
    found = {}
    with open(main_file, 'r') as main_data_file:
        for line in main_data_file:
            # rstrip('\n') rather than slicing off the last character, so a
            # final line without a trailing newline keeps the full organism name.
            # maxsplit=5 keeps any further tabs inside the last field.
            fields = line.rstrip('\n').split('\t', 5)
            if len(fields) > 5 and fields[1] in id_name_dict:
                found[fields[1]] = fields[5]

    # Re-key in the order of the requested IDs rather than file order.
    id_to_organism = {pathway_id: found[pathway_id]
                      for pathway_id in id_name_dict if pathway_id in found}

    return id_to_organism

def create_dataframe(main_file, hierarchy_file, pathway_name = 'Cardiac conduction'):
    """
    Flatten the pathway -> proteins mapping into a two-column table:
    Protein ID | Reactome Pathway ID
    %%under development

    All three arguments are forwarded unchanged to pathway_to_protein().
    The returned pandas DataFrame has one row per (protein, pathway) pair;
    pathways appear in the mapping's key order and proteins in the order of
    each pathway's list.
    """
    mapping = pathway_to_protein(main_file, hierarchy_file, pathway_name)

    # Expand {pathway: [proteins]} into a flat sequence of (protein, pathway) pairs.
    pairs = [
        (protein, pathway_id)
        for pathway_id, members in mapping.items()
        for protein in members
    ]

    columns = {
        'Protein ID' : [protein for protein, _ in pairs],
        'Reactome Pathway ID' : [pathway_id for _, pathway_id in pairs],
    }

    return pd.DataFrame(columns)


In [None]:
# Input files, given as bare names and therefore resolved against the current
# working directory (the data folder entered by the earlier %cd cell).
# Passed as `main_file` to the helper functions.
main_file_name = 'UniProt2Reactome_All_Levels.txt'
# Passed as `hierarchy_file` to the helper functions.
hierarchy_file = 'ReactomePathwaysRelation.txt'

In [None]:
# Build {sub-pathway ID : [protein accession IDs]} using the default
# pathway_name ('Cardiac conduction').
pathway_protein_dict = pathway_to_protein(main_file_name, hierarchy_file)

In [None]:
# Spot-check the proteins recorded for one pathway ID
# (raises KeyError if that ID is not a key of the mapping).
pathway_protein_dict['R-CEL-5576894']

['A0A4V0IK35', 'O16638', 'O45313', 'Q95XD1']

In [None]:
# Flatten the mapping into a Protein ID | Reactome Pathway ID table.
# create_dataframe() calls pathway_to_protein() itself, so both input files
# are read again here rather than reusing pathway_protein_dict.
protein_pathway_df = create_dataframe(main_file_name, hierarchy_file)
# Bare last expression so the notebook renders the frame.
protein_pathway_df

Unnamed: 0,Protein ID,Reactome Pathway ID
0,A0A4V0IK35,R-CEL-5576894
1,O16638,R-CEL-5576894
2,O45313,R-CEL-5576894
3,Q95XD1,R-CEL-5576894
4,M9PD84,R-DME-5576893
...,...,...
1154,A0A5G2QVE3,R-SSC-5576894
1155,A0A5G2R345,R-SSC-5576894
1156,F1S8T5,R-SSC-5576894
1157,F1SBP4,R-SSC-5576894


In [None]:
# Export the table to cardiac_conduction.json in the current working directory.
# orient='records' writes a JSON array with one object per row.
protein_pathway_df.to_json("cardiac_conduction.json", orient = 'records')