# 6. Analysis of the Predicted HP-PPI Network

Network analysis of the predicted human–*Streptococcus pneumoniae* (strain D39) protein–protein interaction network, using the NetworkX library.

In [1]:
import os
import joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import networkx as nx
from networkx.algorithms import bipartite
from pyvis.network import Network

In [2]:
# Locate the project folders relative to the notebook's working directory.
# Inputs are read from, and results written to, the same 'analyses' folder
# that sits next to the directory the notebook is run from.
parent_dir = os.path.dirname(os.getcwd())
dir_in = os.path.join(parent_dir, 'analyses')
dir_out = dir_in

In [3]:
# Read the predicted interactions (tab-separated) into a DataFrame
f_in = os.path.join(dir_in, 'predicted_interactions.tsv')
df = pd.read_csv(f_in, sep='\t')

# De-duplicate the protein identifiers of each organism
pathogen_proteins = list(set(df['Pathogen_Protein']))
human_proteins = list(set(df['Human_Protein']))

# Report the size of the loaded network, then preview the first rows
network_size = (len(df), len(pathogen_proteins), len(human_proteins))
print('Loaded %i interactions involving %i STRP2 proteins and %i human proteins\n' % network_size)
df.head()

Loaded 5823 interactions involving 30 STRP2 proteins and 324 human proteins



Unnamed: 0,Pathogen_Protein,Human_Protein
0,Q04KG2,P24071
1,Q04KG2,P31994
2,Q04KG2,P30273
3,Q04KG2,Q92637
4,Q04KG2,P10909


In [4]:
# Save the unique proteins of each organism, one identifier per line
list_dir = os.path.join(dir_out, 'protein_lists')
# Create the output folder on a first run instead of failing with FileNotFoundError
os.makedirs(list_dir, exist_ok=True)

for organism in ['Pathogen', 'Human']:
    f_out = os.path.join(list_dir, '%s_protein_list' % organism)
    # Sort the de-duplicated identifiers so the file is identical on every run;
    # iterating a set of strings directly gives an arbitrary order
    protein_list = sorted(set(df['%s_Protein' % organism]))
    with open(f_out, 'w') as f:
        f.write('\n'.join(protein_list))

309

2283

## Create graph of protein interactions

In [5]:
# Rename proteins by gene names, using one mapping table per organism
for organism in ['STRP2', 'HUMAN']:
    f_in = os.path.join(dir_in, 'protein_lists', '%s_uniprot_mapping.tab' % organism)

    gene_mapping = pd.read_csv(f_in, sep='\t')[['Entry', 'Gene names']].values

    # Map each entry to the first of its space-separated gene names.
    # Entries whose gene-name field is empty are read as NaN (a float), on
    # which .split() would raise AttributeError; they are skipped so those
    # proteins simply keep their original identifier.
    map_dict = {
        uniprot: gene.split(' ')[0]
        for uniprot, gene in gene_mapping
        if isinstance(gene, str)
    }

    # Rebind instead of mutating in place so the replacement is explicit
    df = df.replace(map_dict)

df.head()

Unnamed: 0,Pathogen_Protein,Human_Protein
0,eno,FCAR
1,eno,FCGR2B
2,eno,FCER1G
3,eno,FCGR1B
4,eno,CLU


In [6]:
# Collect the two node sets from the (renamed) interaction table
pathogen_proteins = list(set(df['Pathogen_Protein']))
human_proteins = list(set(df['Human_Protein']))

# Build an undirected graph whose nodes carry a `bipartite` attribute:
# 0 marks pathogen proteins, 1 marks human proteins
B = nx.Graph()
for node_set, side in ((pathogen_proteins, 0), (human_proteins, 1)):
    B.add_nodes_from(node_set, bipartite=side)

# Every row of the interaction table becomes one edge of the graph
B.add_edges_from(df.values)

In [7]:
# Start the table of pathogen protein properties from the gene and protein
# names in the STRP2 mapping file.
# NOTE: `df` is rebound here; from this cell on it no longer holds the
# interaction table loaded earlier.
f_in = os.path.join(dir_in, 'protein_lists', 'STRP2_uniprot_mapping.tab')

df = pd.read_csv(f_in, sep='\t')[['Gene names', 'Protein names']]

# Keep only the first of the space-separated gene names
df['Gene names'] = [names.split(' ')[0] for names in df['Gene names']]

# Index the table by gene name
df = df.set_index('Gene names')
df.head()

Unnamed: 0_level_0,Protein names
Gene names,Unnamed: 1_level_1
proV,Choline transporter
prtA,Cell wall-associated serine protease PrtA
SPD_1617,Cell wall surface anchor family protein
pcpA,Choline binding protein PcpA
proWX,Choline transporter (Glycine betaine transport...


In [8]:
# Topological properties to compute, keyed by the column name each one gets
properties = {
    'Degree_centrality': bipartite.degree_centrality,
    'Betweenness_centrality': bipartite.betweenness_centrality
}

# Build one single-column frame per property
prop_frames = []
for prop, func in properties.items():
    props = func(B, nodes=pathogen_proteins)
    df_prop = pd.DataFrame.from_dict(props, orient='index', columns=[prop])

    # Keep only the rows that belong to pathogen proteins
    df_prop = df_prop.loc[df_prop.index.isin(pathogen_proteins)]
    prop_frames.append(df_prop)

# Attach the property columns to the pathogen protein table (aligned on the index)
df = pd.concat([df] + prop_frames, axis=1, sort=False)

In [9]:
# Name the index and rank the pathogen proteins from highest to lowest
# degree centrality
df = (
    df
    .rename_axis('Gene_name')
    .sort_values(by='Degree_centrality', ascending=False)
)
df

Unnamed: 0_level_0,Protein names,Degree_centrality,Betweenness_centrality
Gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bgaA,"Beta-galactosidase, putative",1.0,0.451853
SPD_0250,"Pullulanase, extracellular",0.775316,0.081955
prtA,Cell wall-associated serine protease PrtA,0.699367,0.02925
pcpA,Choline binding protein PcpA,0.693038,0.026479
pck,Choline kinase,0.68038,0.02239
SPD_0537,Matrixin family protein,0.670886,0.019983
eno,Enolase (EC 4.2.1.11) (2-phospho-D-glycerate h...,0.648734,0.015419
gatA,Glutamyl-tRNA(Gln) amidotransferase subunit A ...,0.639241,0.013786
nanB,Sialidase B (EC 3.2.1.18),0.636076,0.013316
nanA,Sialidase A (EC 3.2.1.18),0.636076,0.013316


In [10]:
# Write the centrality table to disk as a tab-separated file,
# with the index written as the first column
f_out = os.path.join(dir_out, 'network_centrality.tsv')
df.to_csv(path_or_buf=f_out, sep='\t', index=True)

In [11]:
# Load the KEGG pathway enrichment results for the human proteins
f_in = os.path.join(dir_in, 'KEGG_pathway_enrichment.txt')
df_kegg = pd.read_csv(f_in, sep='\t')

# Keep only the text after the first colon of each term
# (presumably an '<identifier>:<pathway name>' format -- confirm against the file).
# Splitting at most once keeps a pathway name that itself contains a colon
# intact, and [-1] leaves a term without any colon unchanged instead of
# raising IndexError.
df_kegg['Term'] = df_kegg['Term'].apply(lambda x: x.split(':', 1)[-1])

# Set Term column as index so pathways can be looked up by name
df_kegg = df_kegg.set_index('Term')

# Obtain only relevant pathways
pathways = ['Complement and coagulation cascades',
            'Phagosome',
            'Fc gamma R-mediated phagocytosis',
            'Bacterial invasion of epithelial cells']

In [12]:
# Draw one interactive network per selected pathway; human nodes take the
# colour at the pathway's position in this list
colors = ['red', 'orange', 'gold', 'khaki']

for i, term in enumerate(pathways):
    # Genes listed for this pathway in the enrichment table.
    # NOTE: this rebinds `human_proteins`, which previously held every
    # human protein of the network.
    human_proteins = df_kegg.loc[term, 'Genes'].split(', ')

    # Restrict the graph to all pathogen proteins plus this pathway's genes
    subgraph = B.subgraph(pathogen_proteins + human_proteins)

    # Set up the pyvis canvas
    net = Network()
    net.inherit_edge_colors_from(False)

    # Pathogen nodes are always blue; human nodes use the pathway colour
    pathogen_colors = ['blue'] * len(pathogen_proteins)
    human_colors = [colors[i]] * len(human_proteins)
    net.add_nodes(pathogen_proteins, color=pathogen_colors)
    net.add_nodes(human_proteins, color=human_colors)
    net.add_edges(subgraph.edges())

    # Layout settings
    net.force_atlas_2based(gravity=-200, spring_length=150, overlap=0.2)

    # Enable the settings panel and render to subgraph_<n>.html, numbered from 1
    net.show_buttons()
    net.show('subgraph_%i.html' % (i + 1))

<hr></hr>