In [1]:
# Connect notebook to Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Navigate to the shared drive folder
%cd /content/drive/'My Drive'/'Cardiovascular Knowledge Graph'
!pwd
!ls

/content/drive/My Drive/Cardiovascular Knowledge Graph
/content/drive/My Drive/Cardiovascular Knowledge Graph
cardiac_conduction.ipynb  filter_organism.ipynb     output
data			  muscle_contraction.ipynb


In [11]:
%cd data

/content/drive/My Drive/Cardiovascular Knowledge Graph/data


In [12]:
!ls

ReactomePathwaysRelation.txt  UniProt2Reactome_All_Levels.txt


In [28]:
import pandas as pd
import re
import json

def hierarchy_database():
  """
  text file with pathway hierarchy
  text file name: ReactomePathwaysRelation.txt
  """
  hierarchy_file = 'ReactomePathwaysRelation.txt'
  hierarchy_data = open(hierarchy_file, 'r')
  database = []

  for line in hierarchy_data:
    database.append(line)

  return database

def uniprot_reactome_all_levels():
  """
  text file with all levels of pathways
  text file name: UniProt2Reactome_All_Levels.txt
  """
  file_name = 'UniProt2Reactome_All_Levels.txt'
  file_data = open(file_name, 'r')
  database = []

  for line in file_data:
    database.append(line)
  
  return database

def hierarchy_reverse(filtered_df):
  """
  gets pathways and converts to general cardiac conduction pathway
  """
  hierarchy_txt = hierarchy_database()
  pathway_ids = list(filtered_df['Reactome Pathway ID'])
  protein_ids = list(filtered_df['Protein ID'])

  cardiac_ids = []

  for pathway in pathway_ids:
    for line in hierarchy_txt:
      indice = None
      t = line.split('\t')
      if pathway in t[1]:
        indice = line.find('\t')
        cardiac_ids.append(line[0 : indice])
  
  muscle_pathways = []

  for cardiac in cardiac_ids:
    for line in hierarchy_txt:
      indice = None
      t = line.split('\t')
      if cardiac in t[1]:
        indice = line.find('\t')
        muscle_pathways.append(line[0 : indice])

  return muscle_pathways

def protein_to_pathway(filtered_df):
  muscle_pathway_ids = list(set(hierarchy_reverse(filtered_df)))
  uniprot_reactome_txt = uniprot_reactome_all_levels()

  proteins = [] #{protein accession : pathway ID}
  pathways = []

  for muscle in muscle_pathway_ids:
    for line in uniprot_reactome_txt:
      if muscle in line:
        indice = [m.start() for m in re.finditer('\t', line)]
        proteins.append(line[0 : indice[0]])
        pathways.append(muscle)
    
  d = {'Protein ID' : proteins,
       'Reactome Pathway ID' : pathways}

  return pd.DataFrame(data = d)

In [9]:
filtered_df = pd.read_csv("cardiac_conduction_filtered.csv")
filtered_df

Unnamed: 0.1,Unnamed: 0,Protein ID,Reactome Pathway ID
0,0,A0A0G2K548,R-RNO-5576890
1,1,A5HKJ1,R-RNO-5576890
2,2,O08962,R-RNO-5576890
3,3,P63161,R-RNO-5576890
4,4,Q71FD8,R-RNO-5576890
...,...,...,...
514,514,Q8BUW1,R-MMU-5576886
515,515,Q8JZN3,R-MMU-5576886
516,516,Q8R1P5,R-MMU-5576886
517,517,Q9JK62,R-MMU-5576886


In [20]:
set(hierarchy_reverse(filtered_df))

{'R-HSA-397014', 'R-MMU-397014', 'R-RNO-397014', 'R-SSC-397014'}

In [30]:
protein_muscle_df = protein_to_pathway(filtered_df)
protein_muscle_df

Unnamed: 0,Protein ID,Reactome Pathway ID
0,A0A096MK15,R-RNO-397014
1,A0A0G2K0D3,R-RNO-397014
2,A0A0G2K2B5,R-RNO-397014
3,A0A0G2K2W5,R-RNO-397014
4,A0A0G2K548,R-RNO-397014
...,...,...
673,Q9Y490,R-HSA-397014
674,Q9Y5Q5,R-HSA-397014
675,Q9Y5Y9,R-HSA-397014
676,Q9Y6H6,R-HSA-397014


In [31]:
protein_muscle_df.to_csv("muscle_contraction_filtered.csv")

In [32]:
protein_muscle_df.to_json("muscle_contraction_filtered.json", orient = 'records')