In [89]:
from collections import deque

import networkx as nx
import pandas as pd

In [261]:
def initialise_network(protein_network, essential_proteins, remove_essentials=False, threshold=700, 
                       largest_connected_component=False, node_prefix='4932.'):
    '''
    Given a protein protein interaction network, and a list of
    essential proteins, return a new pruned, unweighted network
    that meets some given specifications.

    The input graph is never modified, so the same loaded network
    can be passed to this function repeatedly with different options.

    Parameters
    ----------
    protein_network : networkx graph
        Weighted interaction network. The score of an edge is read from
        its 'weight' attribute; if an edge has no 'weight' key the first
        attribute value is used instead.

    essential_proteins : list
        Node names, spelled as they appear in ``protein_network``.
        Only consulted when ``remove_essentials`` is True; names that
        are not in the graph are ignored.

    remove_essentials : bool, default False
        If True, the essential proteins (and every edge touching them)
        are left out of the returned network.

    threshold : int, default 700
        Edges whose weight is strictly below this value are dropped.

    largest_connected_component : bool, default False
        If True, return only the largest connected component.

    node_prefix : str, default '4932.'
        Prefix stripped from the start of every string node name in the
        returned network. Pass '' or None to keep the names unchanged.

    Returns
    -------
    networkx graph
        New unweighted graph of the same class as ``protein_network``.
        Nodes that lose all their edges are kept as isolated nodes
        unless ``largest_connected_component`` is True.
    '''

    excluded = set(essential_proteins) if remove_essentials else set()

    ## start from a node-only copy so the caller's graph stays intact ##
    pruned = nx.create_empty_copy(protein_network)
    pruned.remove_nodes_from(excluded)  # silently skips names not in the graph

    ## keep (unweighted) only the edges that pass the threshold ##
    for u, v, data in protein_network.edges(data=True):
        if u in excluded or v in excluded:
            continue
        if 'weight' in data:
            weight = data['weight']
        else:
            # No explicit 'weight' key: fall back to the first attribute.
            weight = list(data.values())[0]
        if weight < threshold:
            continue
        pruned.add_edge(u, v)

    ## fix node names ##
    if node_prefix:
        # Only nodes whose name actually changes go into the mapping.
        mapping = {
            node: node.removeprefix(node_prefix)
            for node in pruned
            if isinstance(node, str) and node.startswith(node_prefix)
        }
        # pruned is a fresh local graph, so relabelling it in place is safe.
        nx.relabel_nodes(pruned, mapping, copy=False)

    ## (if it is needed) return the largest connected subgraph ##
    if largest_connected_component and pruned.number_of_nodes() > 0:
        largest = max(nx.connected_components(pruned), key=len)
        # .copy() yields an independent, editable graph instead of a frozen view.
        return pruned.subgraph(largest).copy()
    return pruned

## Compare Connected Components

In [264]:
protein_network = nx.read_weighted_edgelist("4932.protein.links.v12.0.txt", comments="#", nodetype=str)
df = pd.read_csv("EssentialProteins_YeastMine_cerevisiae.csv", header=None)
df.columns = ['primary identifier','secondary identifier','organism short name','symbol','description']
# Build the lookup once: a set gives constant-time membership tests instead of
# re-creating and scanning a list for every node in the network.
essential_ids = set(df['secondary identifier'])
# n[5:] skips the first five characters of the node name (assumed to be the '4932.' prefix).
essential_nodes = [n for n in protein_network.nodes() if n[5:] in essential_ids]
G = initialise_network(protein_network, essential_nodes, remove_essentials=False, largest_connected_component=False)

In [265]:
# Summary line (node and edge counts) for the network built without connected-component filtering.
print(G)

Graph with 6538 nodes and 104188 edges


In [219]:
protein_network = nx.read_weighted_edgelist("4932.protein.links.v12.0.txt", comments="#", nodetype=str)
df = pd.read_csv("EssentialProteins_YeastMine_cerevisiae.csv", header=None)
df.columns = ['primary identifier','secondary identifier','organism short name','symbol','description']
# Build the lookup once: a set gives constant-time membership tests instead of
# re-creating and scanning a list for every node in the network.
essential_ids = set(df['secondary identifier'])
# n[5:] skips the first five characters of the node name (assumed to be the '4932.' prefix).
essential_nodes = [n for n in protein_network.nodes() if n[5:] in essential_ids]
# Same pipeline as the previous build, but restricted to the largest connected component.
G = initialise_network(protein_network, essential_nodes, remove_essentials=False, largest_connected_component=True)
print(G)

Graph with 5716 nodes and 104138 edges


In [267]:
# Peek at one node label; initialise_network strips the '4932.' prefix from node names.
list(G.nodes)[0]

'Q0010'

# Methods on the Graph

In [271]:
def shortest_path(G, node1, node2):
    '''
    Given a connected undirected graph and two nodes, 
    return the shortest path between them

    The path is found with a breadth-first search, so "shortest" means
    the fewest edges; edge weights are not taken into account.

    Parameters
    ----------
    G : networkx graph
        Undirected connected networkx graph. Only ``node in G`` and
        iteration over ``G[node]`` (the neighbours of ``node``) are used.

    node1, node2 : networkx nodes
        Nodes of the graph G

    Returns
    -------
    path : list of nodes, or None
        Shortest path between node1 and node2, listed from node1 to
        node2 inclusive. ``[node1]`` when both nodes are the same.
        None if node2 cannot be reached from node1 (which can only
        happen when G is not connected).

    Raises
    ------
    ValueError
        If node1 or node2 is not a node of G.
    '''
    for node in (node1, node2):
        if node not in G:
            raise ValueError(f"node {node!r} is not in the graph")

    if node1 == node2:
        return [node1]

    # predecessor[x] is the node from which x was first reached; it doubles
    # as the "already visited" set. The start node has no predecessor.
    predecessor = {node1: None}
    frontier = deque([node1])
    while frontier:
        current = frontier.popleft()
        for neighbour in G[current]:
            if neighbour in predecessor:
                continue
            predecessor[neighbour] = current
            if neighbour == node2:
                # Walk the predecessor links back to node1, then reverse.
                path = [node2]
                while predecessor[path[-1]] is not None:
                    path.append(predecessor[path[-1]])
                path.reverse()
                return path
            frontier.append(neighbour)

    # Frontier exhausted without meeting node2: the nodes are disconnected.
    return None