In [None]:
! pip install OmicsIntegrator --no-deps
! pip install pandas==1.3.5 axial pcst_fast


In [197]:
%matplotlib inline
import numpy as np
import pandas as pd
import networkx as nx
from matplotlib import pyplot as plt

import OmicsIntegrator as oi


In [198]:
interactome_file = "inputs/mippie_ppi_v1_0.tsv"
prize_file = "inputs/prize.txt"
proteins_file = "inputs/mippie_proteins_v1_0.tsv"
tf_receptor_classification = "inputs/tf_rec_annotations.tsv"

interactome = pd.read_csv(interactome_file, delimiter = '\t')
prize = pd.read_csv(prize_file)
proteins = pd.read_csv(proteins_file, delimiter = '\t')
del proteins['mgi']
tf_receptor_classification = pd.read_csv(tf_receptor_classification, delimiter = '\t')


interactome = interactome[['entrezA', 'entrezB', 'MIPPIE_score']].rename(columns={'entrezA': 'protein1_entrez', 'entrezB': 'protein2_entrez'})
interactome['cost'] = 1.5 - interactome['MIPPIE_score']
del interactome['MIPPIE_score']

interactome = interactome.merge(proteins, how = 'left', left_on = 'protein1_entrez' , right_on = 'entrez').rename(columns={'entrez': 'del1_entrez', 
                                                                                                                           'official_symbol': 'protein1', 
                                                                                                                           'uniprot_accession': 'protein1_accession'})

interactome = interactome.merge(proteins, how = 'left', left_on = 'protein2_entrez' , right_on = 'entrez').rename(columns={'entrez': 'del2_entrez', 
                                                                                                                           'official_symbol': 'protein2', 
                                                                                                                           'uniprot_accession': 'protein2_accession'})

interactome = interactome.merge(tf_receptor_classification, how = 'left', left_on = 'protein1_entrez' , right_on = 'entrez').rename(columns={'entrez': 'del1_entrez', 
                                                                                                                           'tfa': 'tfa1', 'rec': 'rec1'})
interactome = interactome.merge(tf_receptor_classification, how = 'left', left_on = 'protein2_entrez' , right_on = 'entrez').rename(columns={'entrez': 'del2_entrez', 
                                                                                                                           'tfa': 'tfa2', 'rec': 'rec2'})

interactome = interactome.drop(columns = ['del1_entrez','del2_entrez'])


interactome['protein1'] = interactome['protein1'].replace('CYTB', 'Mt-Cyb')
interactome['protein2'] = interactome['protein2'].replace('CYTB', 'Mt-Cyb')

interactome['protein1'] = interactome['protein1'].replace('2610301B20Rik','Cfap418')
interactome['protein2'] = interactome['protein2'].replace('2610301B20Rik','Cfap418')

# removing self edges
interactome = interactome[interactome['protein2']!= interactome['protein1']]


# removing 3 hubs with more than 1000 interactions (didnt apply the hub removal, since it make us lose 4 terminal proteins from interactome)
'''
interactome = interactome[interactome['protein1']!= 'Ywhae'] #3920 interactions
interactome = interactome[interactome['protein2']!= 'Ywhae']

interactome = interactome[interactome['protein1']!= 'Fancd2']# 1682 interactions
interactome = interactome[interactome['protein2']!= 'Fancd2']
interactome = interactome[interactome['protein1']!= 'Eed'] # 1193 interactions
interactome = interactome[interactome['protein2']!= 'Eed']
'''


interactome_file = 'interactome.tsv'

interactome = interactome[['protein1','protein2','cost']]
interactome.to_csv(interactome_file, sep = '\t',index = False)




# Network building

In [199]:
params = {
    "noise": 0.1, 
    "dummy_mode": "terminals", 
    "exclude_terminals": False, 
    "seed": 1
}

graph = oi.Graph(interactome_file, params)
graph.prepare_prizes(prize_file)


01:04:19 - OI2: INFO - Duplicated gene symbols in the prize file (we'll keep the max prize):
01:04:19 - OI2: INFO - []
01:04:19 - OI2: INFO - Members of the prize file not present in the interactome:
01:04:19 - OI2: INFO - ['Clk4', 'Custos', 'Cyp20a1', 'Dars2', 'LRWD1', 'Lin9', 'Lrrc45', 'Neurod6', 'Prxl2b', 'Rbms2', 'Recql4', 'Retreg3', 'Sox13', 'Suco', 'Surf2', 'Tamm41', 'Tceal3', 'Znf131', 'Znf148', 'Znf593']


# Grid search

In [200]:
Ws = list(np.arange(2,6,1))
Bs = list(np.arange(2,7,1))
Gs = list(np.arange(2,7,1))

# Or:

#Ws = [2,3]
#Bs = [4,5,6]
#Gs = [3,4,5]

params = {
    "noise": 0.1, 
    "dummy_mode": "terminals", 
    "exclude_terminals": False, 
    "seed": 1
}


results = graph.grid_search(prize_file, Ws, Bs, Gs)
membership_df = oi.summarize_grid_search(results, "membership")



prize = pd.read_csv(prize_file,sep="\t")
initial_nodes=list(prize.name)          
results_with_terminals = membership_df[membership_df.index.isin(initial_nodes)]
Initial_node_covers = results_with_terminals.sum().sort_values(ascending=False).to_frame(name="Covering_nodes")


out = set(Initial_node_covers[Initial_node_covers["Covering_nodes"]==max(Initial_node_covers["Covering_nodes"])].index)

Initial_node_covers.sort_index(axis=0, inplace=True)
membership_df.sort_index(axis=1, inplace=True)
Total_node = membership_df.sum().to_frame(name="Total_nodes")

membership_df.loc['Covering nodes']=(Initial_node_covers['Covering_nodes'])
membership_df.loc['Total_nodes']=(Total_node['Total_nodes'])
membership_df.to_csv('membership_df_with_node_numbers.csv')

01:04:27 - OI2: INFO - Duplicated gene symbols in the prize file (we'll keep the max prize):
01:04:27 - OI2: INFO - []
01:04:27 - OI2: INFO - Members of the prize file not present in the interactome:
01:04:27 - OI2: INFO - ['Clk4', 'Custos', 'Cyp20a1', 'Dars2', 'LRWD1', 'Lin9', 'Lrrc45', 'Neurod6', 'Prxl2b', 'Rbms2', 'Recql4', 'Retreg3', 'Sox13', 'Suco', 'Surf2', 'Tamm41', 'Tceal3', 'Znf131', 'Znf148', 'Znf593']
01:04:31 - OI2: INFO - Single PCSF run for W_2.00_B_6.00_G_6.00
01:04:31 - OI2: INFO - Single PCSF run for W_2.00_B_2.00_G_2.00
01:04:31 - OI2: INFO - Single PCSF run for W_2.00_B_4.00_G_4.00
01:04:31 - OI2: INFO - Single PCSF run for W_3.00_B_2.00_G_4.00
01:04:31 - OI2: INFO - Single PCSF run for W_2.00_B_2.00_G_5.00
01:04:31 - OI2: INFO - Single PCSF run for W_2.00_B_3.00_G_6.00
01:04:31 - OI2: INFO - Single PCSF run for W_3.00_B_3.00_G_2.00
01:04:31 - OI2: INFO - Single PCSF run for W_3.00_B_3.00_G_5.00
01:04:31 - OI2: INFO - Single PCSF run for W_2.00_B_3.00_G_3.00
01:04:31

In [208]:
# tuned parameters:

w = 2
b = 6
g = 2


# Among 66 DEGs 20 were not present in the interactome making 46 terminal nodes from total 85 nodes.
# So the resulting output network with these parameters have 46 terminal proteins and 39 steiners making a 85 node connected network with 6 louvain clusters.

In [209]:
graph = oi.Graph(interactome_file, {'w':w, 'b':b, 'g':g,})
graph.prepare_prizes(prize_file)
vertex_indices, edge_indices = graph.pcsf()

print(len(vertex_indices))

forest, augmented_forest = graph.output_forest_as_networkx(vertex_indices, edge_indices)


# the part specific for mouse subcellular localization data
annotation = pd.read_pickle('inputs/Subcellular location from Jensen Lab/subcellular_location.pickle')
annotation = annotation.reindex(augmented_forest.nodes())
nx.set_node_attributes(augmented_forest, annotation.apply(lambda x: x.dropna().to_dict(), axis=1).to_dict())



print(graph.pcsf_objective_value(augmented_forest))

# removing self loops
augmented_forest.remove_edges_from(nx.selfloop_edges(augmented_forest))
forest.remove_edges_from(nx.selfloop_edges(forest))

oi.get_networkx_graph_as_dataframe_of_edges(augmented_forest).to_csv('edges.txt', sep='\t', header=True, index=False)
oi.get_networkx_graph_as_dataframe_of_nodes(augmented_forest).to_csv('nodes.txt', sep='\t', header=True, index=True)
oi.output_networkx_graph_as_interactive_html(augmented_forest, output_dir='', filename='pcsf_results262.html')

01:09:49 - OI2: INFO - Duplicated gene symbols in the prize file (we'll keep the max prize):
01:09:49 - OI2: INFO - []
01:09:49 - OI2: INFO - Members of the prize file not present in the interactome:
01:09:49 - OI2: INFO - ['Clk4', 'Custos', 'Cyp20a1', 'Dars2', 'LRWD1', 'Lin9', 'Lrrc45', 'Neurod6', 'Prxl2b', 'Rbms2', 'Recql4', 'Retreg3', 'Sox13', 'Suco', 'Surf2', 'Tamm41', 'Tceal3', 'Znf131', 'Znf148', 'Znf593']


85
342.7509068248637


PosixPath('/Users/sina/Library/CloudStorage/OneDrive-KocUniversitesi/Nozlulab/Heterotopia/Netwrok analysis/pcsf_results262.html')

In [210]:
annotation[annotation['location'].isna()]

Unnamed: 0_level_0,location
GeneSymbol,Unnamed: 1_level_1
Adgrl3,
Mt-Cyb,
Nectin1,
