In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import networkx as nx
import matplotlib.pyplot as plt
from matplotlib import cm
import sys
import numpy as np

basedir = '../'
sys.path.append(basedir)

from lda_for_fragments import Ms2Lda
import visualisation.networkx.community as community
import visualisation.networkx.lda_visualisation as vis_util
from visualisation.pylab.lda_for_fragments_viz import Ms2Lda_Viz

<h1>Community detection demo</h1>

Load the karate club graph that ships with networkx.

In [None]:
# Load the karate club example graph bundled with networkx and take a first
# look at it. The single-argument print(...) form is used so the cell runs
# unchanged on both Python 2 and Python 3.
G = nx.karate_club_graph()
print(G)
print("nodes = " + str(G.nodes()))
nx.draw(G)

Partition the graph into several 'communities' based on modularity (https://en.wikipedia.org/wiki/Modularity_(networks)). 

Using the algorithm from Blondel, Vincent D., et al. "Fast unfolding of communities in large networks." Journal of Statistical Mechanics: Theory and Experiment 2008.10 (2008): P10008. http://arxiv.org/pdf/0803.0476.pdf

In [None]:
def find_community(G):
    """Partition ``G`` into communities and report how many were found.

    The partitioning itself is delegated to ``community.best_partition``;
    this wrapper only adds progress output.

    Parameters
    ----------
    G : graph
        Passed straight through to ``community.best_partition``.

    Returns
    -------
    dict-like
        The object returned by ``community.best_partition``. Its values are
        the community labels, which are de-duplicated to count communities.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Finding community")
    partition = community.best_partition(G)
    n_communities = len(set(partition.values()))
    print("Number of communities found = " + str(n_communities))
    return partition

# Detect communities in the karate club graph and show the node -> community
# assignment (parenthesised print works on both Python 2 and Python 3).
partition = find_community(G)
print("Partition = " + str(partition))

Visualise the communities

In [None]:
def plot_community(G, partition):
    """Draw every community of ``partition`` as a separate plot.

    For each distinct community label the member nodes are printed, the
    induced subgraph of ``G`` is laid out with a spring layout, and its nodes
    and edges are drawn and shown.

    Parameters
    ----------
    G : networkx graph
        Graph whose nodes are the keys of ``partition``.
    partition : dict
        Maps node -> community label.
    """
    # Build the set of labels once; it drives both the loop and the colours.
    communities = set(partition.values())
    size = float(len(communities))
    for index, com in enumerate(communities, 1):
        list_nodes = [node for node, label in partition.items() if label == com]
        print("Community " + str(com) + " = " + str(list_nodes))
        H = G.subgraph(list_nodes)
        pos = nx.spring_layout(H)
        # The colour is a grey level given as a string: the k-th community
        # (1-based) out of N gets str(k / N).
        nx.draw_networkx_nodes(H, pos, nodelist=list_nodes, node_size=200,
                               node_color=str(index / size))
        nx.draw_networkx_edges(H, pos, alpha=0.5)
        plt.show()

In [None]:
# Draw the communities found in the karate club graph, one plot per community.
plot_community(G, partition)

<h1>Beer3pos Data</h1>

Now we do a similar analysis with our topic data. We want to check whether there are terms that the topics share in common. If such terms exist and make biological sense, we can learn this structure through some form of hierarchical modelling. Otherwise, it's a no-go.

First, we load our beer3pos LDA results and do some thresholding as usual.

In [None]:
# Resume the beer3pos project from disk. The path is relative to the
# notebook's working directory, not to `basedir`.
ms2lda = Ms2Lda.resume_from('results/beer3pos.project')

In [None]:
# Threshold the results with 0.05 for doc-topic and 0.01 for topic-word.
# NOTE(review): presumably entries below each threshold are dropped; confirm
# against Ms2Lda.do_thresholding.
ms2lda.do_thresholding(th_doc_topic=0.05, th_topic_word=0.01)

Then we create a network graph of the relationships between topics. Nodes in the graph are topics and terms. An edge is drawn from a topic to a term if the term is 'present' in the thresholded topic distribution.

In [None]:
# Build the visualisation helper from the model, the ms1/ms2 data and the
# docdf/topicdf attributes of the resumed project.
plotter = Ms2Lda_Viz(ms2lda.model, ms2lda.ms1, ms2lda.ms2, ms2lda.docdf, ms2lda.topicdf)

In [None]:
# Build the topic/term graph from the plotter's topicdf.
# NOTE: this rebinds `G`, which until now held the karate club graph; every
# cell below operates on the topic/term graph instead.
json, G = vis_util.get_json_from_topicdf(plotter.topicdf)

So we end up with a messy graph here. Next we run a community detection algorithm on the graph. The hope is that, within each community, the set of terms (fragment/loss words) that the topics share might carry some chemical meaning.

In [None]:
# Map every node id to the display name stored in its 'name' attribute, then
# detect communities on the topic/term graph.
node_names = {node_id: G.node[node_id]['name'] for node_id in G.nodes()}
partition = find_community(G)

Define a plotting function.

In [None]:
import matplotlib as mpl
def plot_community(G, partition, node_names, show_label=False):
    """Plot term sharing and the subgraph for every multi-topic community.

    This definition shadows the two-argument ``plot_community`` defined
    earlier in the notebook.

    A node is treated as a topic when its name contains 'topic'
    (case-insensitive); the second space-separated token of that name is
    parsed as the integer topic id. Every other node is treated as a term.

    Communities with a single node are skipped silently. Communities with
    fewer than two topic nodes are reported as "Nothing interesting" and
    skipped. For each remaining community the function:

    1. prints the topic words via the notebook-level ``ms2lda`` object,
    2. shows a bar chart of (term degree inside the community) divided by
       (number of topics in the community), and
    3. shows a spring-layout drawing of the community subgraph.

    Parameters
    ----------
    G : networkx graph
        Graph containing every node in ``partition``.
    partition : dict
        Maps node -> community label.
    node_names : dict
        Maps node -> display name.
    show_label : bool, optional
        When true, node names are drawn on the network plot.

    Notes
    -----
    Relies on the notebook-level names ``ms2lda``, ``mpl``, ``cm``, ``plt``,
    ``nx`` and ``np``.
    """
    # Colour mapper for the bar chart: ratio in [0, 1] -> 'hot' colormap.
    norm = mpl.colors.Normalize(vmin=0, vmax=1)
    m = cm.ScalarMappable(norm=norm, cmap=cm.hot)
    banner = "================================================================================================"

    # loop over all partitions
    for com in set(partition.values()):

        list_nodes = [node for node, label in partition.items() if label == com]
        if len(list_nodes) == 1:
            continue

        print(banner)
        print("Community " + str(com))
        print(banner)

        topic_nodes = []
        topic_labels = {}
        selected_topics = []
        other_nodes = []
        other_names = []
        other_labels = {}
        other_degrees = []

        H = G.subgraph(list_nodes)
        for n in list_nodes:
            name = node_names[n]
            if 'topic' in name.lower():
                topic_nodes.append(n)
                topic_labels[n] = name
                selected_topics.append(int(name.split(' ')[1]))
            else:
                other_nodes.append(n)
                other_names.append(name)
                other_labels[n] = name
                # Degree inside the community subgraph, not in the full graph.
                other_degrees.append(H.degree(n))

        # Fewer than two topics means nothing can be shared. This also covers
        # the zero-topic case, which would otherwise divide by zero below.
        if len(topic_nodes) < 2:
            print("Nothing interesting")
            print("")
            continue

        ms2lda.print_topic_words(selected_topics=selected_topics, with_probabilities=False)

        # --- Figure 1: fraction of the community's topics using each term ---
        n_topics = float(len(topic_nodes))
        positions = np.arange(len(other_names))
        width = 0.8
        fig, ax = plt.subplots(figsize=(16, 4))
        for x, degree in zip(positions, other_degrees):
            ratio = degree / n_topics
            # align='center' is explicit so each bar sits on its tick on any
            # matplotlib version.
            ax.bar(x, ratio, width=width, color=m.to_rgba(ratio), align='center')
        ax.set_xticks(positions)
        ax.set_xticklabels(other_names, rotation=90)
        ax.set_ylim((0, 1))
        ax.set_title('Term degrees / no. of topics')
        plt.show()
        # Each figure is created fresh and closed after showing, so successive
        # plots never overlay each other or accumulate across communities.
        plt.close(fig)

        # --- Figure 2: the community subgraph ---
        fig, ax = plt.subplots(figsize=(12, 12))
        pos = nx.spring_layout(H, scale=2)
        nx.draw_networkx_nodes(H, pos, nodelist=topic_nodes, node_size=500,
                               node_color='blue', alpha=0.5, ax=ax)
        nx.draw_networkx_nodes(H, pos, nodelist=other_nodes, node_size=500,
                               cmap=plt.get_cmap('hot'), node_color=other_degrees,
                               alpha=0.50, ax=ax)
        nx.draw_networkx_edges(H, pos, alpha=0.25, ax=ax)
        if show_label:
            nx.draw_networkx_labels(H, pos, font_size=14, font_weight="bold",
                                    font_color='black', labels=topic_labels, ax=ax)
            nx.draw_networkx_labels(H, pos, font_size=10, font_weight="normal",
                                    font_color='black', labels=other_labels, ax=ax)
        ax.set_title('Network of shared terms across topics')
        plt.show()
        plt.close(fig)

For each community, the first plot shows the ratio of each term's degree within that community to the number of topics in the community. If this ratio is 1, the term is used by all topics in the community. If the partitioning is sensible, we'd expect to see many terms with a ratio of 1 here.

The second plot shows the network relationship of topics, based on the terms.

In [None]:
# Plot the communities of the topic/term graph, with node names drawn on the
# network plots.
plot_community(G, partition, node_names, show_label=True)