In [0]:
# Analyses of the Hofree et al. original datasets

## Import all the preprocessed data in Matlab format
For the current analyses, all the required preprocessed data are already in the folder "data."

If you want to generate them from the original Hofree et al. dataset, you have to use the script "Matlab2Python.m" in the "tools" folder. The original dataset from Hofree et al. with their Matlab code is available on the UCSD's [Network Based Stratification](http://chianti.ucsd.edu/~mhofree/wordpress/?page_id=26) webpage.

In [0]:
from scipy.io import loadmat
dataFolder = 'data/'

# Somatic mutation profiles. Sample ids are cut to their first 12 characters
# so that they can be looked up in the phenotype table below.
somatic = loadmat(dataFolder + 'somatic_data_UCEC.mat')
samples_id = [entry[0][0][:12] for entry in somatic['sample_id']]

# Clinical phenotypes: a Matlab struct whose fields are addressed by position.
# NOTE(review): fields 0 / 10 / 17 are taken to be patient id / cancer type /
# grade from the variable names only; confirm against the Matlab struct.
phenotypes = loadmat(dataFolder + 'UCEC_clinical_phenotype.mat')
phenoFields = phenotypes['UCECppheno'][0][0]
patients = [cell[0][0] for cell in phenoFields[0]]
# Row of every sequenced sample in the phenotype table
# (list.index raises ValueError when a sample has no phenotype entry).
sampleRows = [patients.index(sample) for sample in samples_id]
tmp = [cell[0][0] for cell in phenoFields[10]]
cancer = [tmp[row] for row in sampleRows]
tmp = [cell[0][0] for cell in phenoFields[17]]
grade = [tmp[row] for row in sampleRows]

# Adjacency matrix of the network
network = loadmat(dataFolder + 'adj_mat.mat')
# Correspondence between matrix row numbers and Entrez ids
entrez_to_idmat = loadmat(dataFolder + 'entrez_to_idmat.mat')

## Check preprocessed data format

In [0]:
# List the variables stored in the somatic-mutation file,
# then display the number of entries of its gene id vector.
print somatic.keys()
len(somatic['gene_id_all'])

In [0]:
# Mutation matrix; later cells index its columns by gene and sum its rows per patient.
mutations=somatic['gene_indiv_mat']
mutations.shape

In [0]:
# List the variables stored in the network file and keep its adjacency matrix.
print network.keys()
net=network['adj_mat']
net.shape

In [0]:
# Variables available in the id-mapping file
entrez_to_idmat.keys()

In [0]:
# Number of entries in the first row of the 'keymat' variable
len(entrez_to_idmat['keymat'][0])

## Extract all the ids

In [0]:
# Unwrap the nested arrays produced by loadmat into flat Python lists.
# keys  : entries of 'keymat' (printed as "Ensembl ID" in the next cell)
# ids   : entries of 'entrezid' (printed as "Entrez ID" in the next cell)
# genes : gene ids of the mutation data, compared against `ids` further down
keys=[x[0] for x in entrez_to_idmat['keymat'][0]]
ids=[x[0][0] for x in entrez_to_idmat['entrezid'][0]]
genes = [x[0] for x in somatic['gene_id_all']]

In [0]:
# Show the first key/id pair together with a link to look the gene up by hand.
print "Ensembl ID:", keys[0]
print "Entrez ID:", ids[0]
print "Check on NCBI: http://www.ncbi.nlm.nih.gov/gene/%i" % ids[0]

## Extract indexes of the genes in the adjacency matrix

In [0]:
import numpy as np
# Map every gene of the mutation data onto its position in `ids`.
#   l      : for each gene, its position in `ids`, or NaN when it is absent
#   subnet : positions in `ids` of the genes that were found
#   good   : positions in `genes` of the genes that were found
#   bad    : positions in `genes` of the genes that were not found
l=[]
subnet=[]
good=[]
bad=[]
for j,g in enumerate(genes):
    try:
        i=ids.index(g)
    except ValueError:
        # list.index raises ValueError for a missing value; any other
        # exception is a real error and must not be silently swallowed.
        i=np.nan
        bad.append(j)
    else:
        subnet.append(i)
        good.append(j)
    l.append(i)

In [0]:
# Summary of the mapping: total genes, genes found in `ids`, genes not found.
print "All genes:",len(l)
print "Referenced in the PPI:",len(good)
print "On their own:",len(bad)

## Extract the submatrices of referenced genes

In [0]:
# Restrict the adjacency matrix to the rows and columns listed in `subnet`.
nnet=net[subnet][:,subnet]
nnet.shape

In [0]:
# Restrict the mutation matrix to the columns listed in `good`.
nmut=mutations[:,good]
nmut.shape

## Zero-padding of the adjacency matrix

In [0]:
# Pad the sub-network with all-zero rows/columns for the genes listed in `bad`,
# so that the adjacency matrix covers the genes in the order `good + bad`.
nConnected = nnet.shape[0]
paddedSize = nConnected + len(bad)
nnnet = np.matrix(np.zeros([paddedSize, paddedSize]))
nnnet[:nConnected, :nConnected] = nnet.todense()

# Reorder the mutation columns and the gene symbols the same way.
genesOrder = good + bad
nnmut = mutations[:, genesOrder]
symbols = somatic['gene_id_symbol'][genesOrder]

In [0]:
# Shapes of the padded adjacency matrix and of the reordered mutation matrix.
print "Network size:",nnnet.shape
print "Mutation size:",nnmut.shape

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
# Column sums of the padded adjacency matrix, flattened to a 1-D array.
# `degree` is reused by later cells to drop the genes whose column sum is zero.
degree=np.squeeze(np.array(nnnet.sum(axis=0)))
plt.figure(1,figsize=(16,10))
plt.plot(degree)
plt.ylabel("Degree (number of neighboors in the PPI)")
plt.xlabel("Genes (keys)")
plt.show()

## Filtering according to Hofree et al.

### Computing network influence score [warning: very long!]
For more details, see: Vandin, F., Upfal, E., & Raphael, B. J. (2011). Algorithms for Detecting Significantly Mutated Pathways in Cancer. Journal of Computational Biology, 18(3), 507–522. http://doi.org/10.1089/cmb.2010.0265

In [0]:
# Remove genes on their own: keep only rows/columns whose degree is positive.
nnnetFiltered=nnnet[degree>0,:][:,degree>0]

In [0]:
from numpy import linalg as LA

diffusionFactor=0.7
# Set to True to recompute the influence matrix (very long); otherwise the
# PPI-restricted influence saved by a previous run is loaded from disk.
computeInfluence=False

if computeInfluence:
    from scipy.io import savemat
    from scipy.sparse import lil_matrix
    try:
        # The widgets moved from IPython.html.widgets to the ipywidgets package.
        from ipywidgets import FloatProgress
    except ImportError:
        from IPython.html.widgets import FloatProgress
    from IPython.display import display

    nGenes=nnnetFiltered.shape[0]
    influence=np.zeros(nnnetFiltered.shape)
    f = FloatProgress(min=0, max=nGenes)
    display(f)

    # Column-normalised adjacency matrix. It does not depend on the gene being
    # diffused, so it is built once, and by broadcasting rather than through a
    # dense matrix product with a diagonal matrix.
    tmp=np.array(nnnetFiltered.sum(axis=0))
    tmp[tmp==0]=1
    A=np.multiply(nnnetFiltered,1./tmp)

    for i in range(nGenes):
        f.value = i
        # Unit "mutation" on gene i, diffused until two successive iterates
        # differ by at most 10e-6 (i.e. 1e-5) in norm.
        mutationProfile=np.zeros(nGenes)
        mutationProfile[i]=1
        X1=mutationProfile
        X2=diffusionFactor*X1*A+(1-diffusionFactor)*mutationProfile
        while LA.norm(X2-X1)>10e-6:
            X1=X2
            X2=diffusionFactor*X1*A+(1-diffusionFactor)*mutationProfile
        influence[i,:]=np.asarray(X2).ravel()

    #Save the raw influence distance matrix (heavy!)
    savemat(dataFolder+'influenceDistance.mat',{'influence':influence, 'diffusionFactor':diffusionFactor})
    #Save the sparse influence distance by merging with PPI:
    #element-wise max of influence and its transpose, kept only on existing edges
    PPI_influence=lil_matrix(np.multiply(np.max(np.dstack((influence, influence.T)),axis=2),np.array(nnnetFiltered)))
    savemat(dataFolder+'PPI_influence.mat',{'PPI_influence':PPI_influence, 'diffusionFactor':diffusionFactor}, do_compression=True)
else:
    influence_data = loadmat(dataFolder+'PPI_influence.mat')
    PPI_influence=influence_data['PPI_influence']
    # Use the diffusion factor the stored matrix was computed with
    diffusionFactor=influence_data['diffusionFactor'][0][0]

In [0]:
# Visual check of the influence matrix.
# The dense `influence` matrix only exists when it has just been computed
# (computeInfluence=True); the previous cell has then already built and saved
# PPI_influence, so that work is not repeated here. Otherwise show the
# PPI-restricted influence that was loaded from disk.
plt.figure(figsize=(18,18))
if computeInfluence:
    plt.imshow(influence)
else:
    plt.imshow(np.asarray(PPI_influence.todense()))
plt.show()

## Keeping only the connections with the best influencers
> "The degree to which local network topology versus global network topology constrains W is determined by the number of nearest neighbors. We experimented with neighbor counts ranging from 5 to 50 to include in the nearest network, and we observed only small changes in outcome (data not shown). For the work presented in this manuscript, the 11 most influential neighbors of each gene in the network as determined by network influence distance were used." — Hofree et al.

In [0]:
# Number of most influential neighbours kept for each gene
PPIneighboorsMax=11
influenceMat=PPI_influence.todense()
newnet=np.zeros(nnnetFiltered.shape)
for i in range(nnnetFiltered.shape[0]):
    # argsort is ascending: the last PPIneighboorsMax columns are the ones
    # with the largest influence values on row i
    bestInfluencers=np.argsort(influenceMat[i,:])[:,-PPIneighboorsMax:]
    # copy the adjacency values of row i for these columns only
    newnet[i,bestInfluencers]=np.squeeze(np.array(nnnetFiltered[i,bestInfluencers]))

# Symmetrise with an element-wise max: an entry is kept when either of the two rows selected it
newnet=np.max(np.dstack((newnet, newnet.T)),axis=2)

In [0]:
# Side-by-side view of the adjacency matrix before and after the selection
# of the best influencers.
plt.figure(1,figsize=(18,9))
plt.subplot(121)
plt.imshow(nnnetFiltered)
# set_cmap changes the default colormap and applies it to the current image
plt.set_cmap('Greys')
plt.title("Original adjacency")
plt.subplot(122)
plt.imshow(newnet)
plt.title("With only the "+str(PPIneighboorsMax)+" best influencers")
plt.show()

In [0]:
# Column sums of the reduced adjacency matrix
plt.figure(1,figsize=(16,10))
plt.plot(newnet.sum(axis=0))
plt.show()

In [0]:
# Rows of the reduced network that still have a positive sum
notAlone=newnet.sum(axis=1)>0
# Shapes of: the padded network, the degree-filtered network, the reduced network without empty rows
print nnnet.shape, nnnet[degree>0,:][:,degree>0].shape, newnet[notAlone,:][:,notAlone].shape

In [0]:
# Rebuild the degree-filtered network from the padded one
nnnetFiltered=nnnet[degree>0,:][:,degree>0]
# Boolean mask over all genes of the padded network: True = gene to remove.
# Genes with a zero degree are removed first ...
filteredGenes=degree==0
# ... then, among the remaining ones, those whose row of `newnet` sums to zero.
filteredGenes[filteredGenes==False]=newnet.sum(axis=1)==0

In [0]:
# Patients whose row of the mutation matrix sums to less than this are removed
mutationsMin=10
filteredPatients=nnmut.sum(axis=1)<mutationsMin
print "Removing %i genes filtered with the %i influencers criterion" % (filteredGenes.sum(), PPIneighboorsMax)
print "Removing %i patients with less than %i mutations" % (filteredPatients.sum(),mutationsMin)
# From here on nnnetFiltered is the reduced network without its empty rows/columns
notAlone=newnet.sum(axis=1)>0
nnnetFiltered=newnet[notAlone,:][:,notAlone]
# Drop the filtered patients (rows), then the filtered genes (columns)
nnmutFiltered=nnmut[filteredPatients==False,:]
nnmutFiltered=nnmutFiltered[:,filteredGenes==False]
print "New adjacency matrix:",nnnetFiltered.shape
print "New mutation profile matrix:",nnmutFiltered.shape

## Diffusion of the mutation profiles according to the PPI

In [0]:
import scipy.sparse as sp
def mutationProfileDiffusion(mutationProfile,PPIAdjacencyMatrix,diffusionFactor,tol=10e-6,maxIter=None):
    """Diffuse mutation profiles over a network by iterated propagation.

    Starting from X = mutationProfile, iterates
        X <- diffusionFactor * X.dot(A) + (1 - diffusionFactor) * mutationProfile
    where A is the adjacency matrix with self loops added and every column
    divided by its sum, until two successive iterates differ by at most
    `tol` in norm.

    Parameters
    ----------
    mutationProfile : dense or scipy.sparse matrix with one column per
        row/column of PPIAdjacencyMatrix.
    PPIAdjacencyMatrix : square dense or scipy.sparse matrix.
    diffusionFactor : float, weight of the propagated term.
    tol : float, stopping threshold on the norm of the last update.
        The default 10e-6 (i.e. 1e-5) is the value previously hard-coded.
    maxIter : int or None, optional upper bound on the number of propagation
        steps; None (default) iterates until `tol` is reached.

    Returns
    -------
    The last iterate X.
    """
    nGenes=PPIAdjacencyMatrix.shape[0]
    # Self loops guarantee that no column sum is zero for a non-negative matrix
    PPIAdjacencyMatrix=PPIAdjacencyMatrix+np.eye(nGenes)
    degrees=np.array(PPIAdjacencyMatrix.sum(axis=0))
    # Divide every column by its sum. Broadcasting gives the same values as a
    # product with a dense diagonal matrix without the O(n^3) multiplication.
    A=np.multiply(PPIAdjacencyMatrix,1./degrees)
    restart=(1-diffusionFactor)*mutationProfile
    X1=mutationProfile
    X2=diffusionFactor*X1.dot(A)+restart
    nIter=1
    while LA.norm(X2-X1)>tol and (maxIter is None or nIter<maxIter):
        X1=X2
        X2=diffusionFactor*X1.dot(A)+restart
        nIter+=1
    return X2

In [0]:
# Diffuse the filtered mutation profiles over the filtered network,
# using the diffusion factor set in the influence cell.
nnmutDiffused=mutationProfileDiffusion(sp.lil_matrix(nnmutFiltered),sp.lil_matrix(nnnetFiltered), diffusionFactor)
# Replace any NaN produced by the diffusion with zero
nnmutDiffused[np.isnan(nnmutDiffused)]=0

In [0]:
# First row of the mutation matrix before (top) and after (bottom) diffusion
plt.figure(1,figsize=(16,10))
plt.subplot(211)
plt.plot(np.squeeze(np.asarray(nnmutFiltered[0,:])))
plt.xlim([0,nnmutFiltered.shape[1]])
plt.title("Original mutation profile")
plt.subplot(212)
plt.plot(np.squeeze(np.asarray(nnmutDiffused[0,:])))
plt.xlim([0,nnmutFiltered.shape[1]])
plt.title("Diffused mutation profile")
plt.show()

In [0]:
# Whole mutation matrix before and after diffusion, then the distribution of
# the diffused values (the matrix is flattened into a single column first).
plt.figure(1,figsize=(16,5))
plt.subplot(311)
plt.imshow(nnmutFiltered)
plt.title("Original mutation profile")
plt.subplot(312)
plt.imshow(nnmutDiffused)
plt.title("Diffused mutation profile")
plt.subplot(313)
# NOTE(review): recent Matplotlib releases replaced `normed` by `density`;
# confirm which Matplotlib version this notebook targets.
plt.hist(np.array(np.squeeze(nnmutDiffused.reshape((1,-1)))).T, 50, normed=1, histtype='stepfilled')
plt.title("Weigths histogram after diffusion")
plt.show()

## Non-negative Matrix Factorization (NMF)

In [0]:
from sklearn.decomposition import ProjectedGradientNMF
# Reference decomposition with scikit-learn: 3 components, once on the raw and
# once on the diffused mutation profiles. For each fit, keep the components and
# the index of the strongest component of every row of the input.
model = ProjectedGradientNMF(n_components=3, init='nndsvdar', random_state=0)
model.fit(np.matrix(nnmutFiltered))
sklearnComp=model.components_
sklearnStrat=np.argmax(model.transform(np.matrix(nnmutFiltered)),axis=1)
model.fit(np.matrix(nnmutDiffused))
sklearnCompDiff=model.components_
sklearnStratDiff=np.argmax(model.transform(np.matrix(nnmutDiffused)),axis=1)

# Components scaled by their own maximum: raw, diffused, and their difference
plt.figure(1,figsize=(16,10))
plt.subplot(311)
plt.plot(sklearnComp.T/sklearnComp.max())
plt.ylabel("Weight")
plt.xlabel("Genes")
plt.title("NMF decomposition on raw mutation profiles")
plt.xlim([0,sklearnComp.shape[1]])
plt.subplot(312)
plt.plot(sklearnCompDiff.T/sklearnCompDiff.max())
plt.ylabel("Weight")
plt.xlabel("Genes")
plt.title("NMF decomposition on diffused mutation profiles")
plt.xlim([0,sklearnCompDiff.shape[1]])
plt.subplot(313)
plt.plot(sklearnCompDiff.T/sklearnCompDiff.max()-sklearnComp.T/sklearnComp.max())
plt.ylabel("Weight difference")
plt.xlabel("Genes")
plt.title("Difference")
plt.xlim([0,sklearnCompDiff.shape[1]])
# Labels are matched to the plotted lines by position, so they must be an
# ordered list: a set has no defined order and could mislabel the components.
plt.legend(['Component 1','Component 2','Component 3'])
plt.show()

## GNMF Implementation

In [0]:
## Reuse scikit-learn functions
from sklearn.utils import check_random_state
from sklearn.utils.extmath import randomized_svd, safe_sparse_dot

def check_non_negative(X, whom):
    """Raise ValueError when X holds a negative value.

    For a scipy.sparse matrix only its `data` attribute is inspected.
    `whom` names the caller and is inserted in the error message.
    """
    values = X
    if sp.issparse(X):
        values = X.data
    has_negative = (values < 0).any()
    if has_negative:
        raise ValueError("Negative values in data passed to %s" % whom)

def _sparseness(x):
    """Hoyer's measure of sparsity for a vector"""
    sqrt_n = np.sqrt(len(x))
    return (sqrt_n - LA.norm(x, 1) / LA.norm(x)) / (sqrt_n - 1)

def safe_vstack(Xs):
    """Stack matrices vertically, with scipy.sparse as soon as one of them is sparse."""
    for X in Xs:
        if sp.issparse(X):
            return sp.vstack(Xs)
    return np.vstack(Xs)

def NBS_init(X,n_components,init=None,random_state=None):
    """Build the initial factors (W, H) of a factorisation X ~= WH.

    Parameters
    ----------
    X : matrix of shape (n_samples, n_features).
    n_components : int, number of components.
    init : None | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'random'
        None selects 'nndsvd' when n_components < n_features and 'random'
        otherwise. The 'nndsvd*' values are forwarded to _initialize_nmf with
        the matching variant (None, 'a' or 'ar').
    random_state : seed or generator passed to check_random_state; used by
        'random' and forwarded to _initialize_nmf.

    Returns
    -------
    (W, H) with shapes (n_samples, n_components) and (n_components, n_features).

    Raises
    ------
    ValueError for an unknown `init` value.
    """
    n_samples, n_features = X.shape
    if init is None:
        init = 'nndsvd' if n_components < n_features else 'random'

    # Every init value listed in the error message below is now accepted
    nndsvd_variants = {'nndsvd': None, 'nndsvda': 'a', 'nndsvdar': 'ar'}
    if init in nndsvd_variants:
        W, H = _initialize_nmf(X, n_components,
                               variant=nndsvd_variants[init],
                               random_state=random_state)
    elif init == "random":
        # random_state used to be an undefined name here (NameError)
        rng = check_random_state(random_state)
        W = np.abs(rng.randn(n_samples, n_components))
        H = np.abs(rng.randn(n_components, n_features))
    else:
        raise ValueError(
            'Invalid init parameter: got %r instead of one of %r' %
            (init, (None, 'nndsvd', 'nndsvda', 'nndsvdar', 'random')))
    return W, H

def _initialize_nmf(X, n_components, variant=None, eps=1e-6,
                    random_state=None):
    """NNDSVD initialisation for NMF.

    Builds a non-negative starting point (W, H) for the rank-`n_components`
    approximation X ~= WH from a truncated SVD of X.

    Parameters
    ----------

    X : array, [n_samples, n_features]
        The non-negative data matrix to be decomposed.

    n_components : int
        The number of components desired in the approximation.

    variant : None | 'a' | 'ar'
        What to do with the zero entries of W and H:
        None: leave them at zero
        'a': replace them with the mean of X
        'ar': replace them with small random values,
              abs(mean(X) * standard normal / 100)
        Default: None

    eps: float
        Values below this threshold are set to zero in the output.

    random_state : numpy.RandomState | int, optional
        Generator (or seed) used to fill in the zeros when variant='ar'.

    Returns
    -------

    (W, H) :
        Initial guesses for solving X ~= WH; W has n_components columns.

    Raises
    ------

    ValueError
        When X has negative values or `variant` is not one of the above.

    Remarks
    -------

    This implements the algorithm described in
    C. Boutsidis, E. Gallopoulos: SVD based
    initialization: A head start for nonnegative
    matrix factorization - Pattern Recognition, 2008

    http://tinyurl.com/nndsvd
    """
    check_non_negative(X, "NMF initialization")
    if variant not in (None, 'a', 'ar'):
        raise ValueError("Invalid variant name")

    U, S, V = randomized_svd(X, n_components)
    W = np.zeros(U.shape)
    H = np.zeros(V.shape)

    # The first singular triplet can be taken as is, up to the sign.
    root_s0 = np.sqrt(S[0])
    W[:, 0] = root_s0 * np.abs(U[:, 0])
    H[0, :] = root_s0 * np.abs(V[0, :])

    for j in range(1, n_components):
        left, right = U[:, j], V[j, :]

        # Positive and negative parts of both singular vectors ...
        left_pos = np.maximum(left, 0)
        right_pos = np.maximum(right, 0)
        left_neg = np.abs(np.minimum(left, 0))
        right_neg = np.abs(np.minimum(right, 0))

        # ... and their norms
        left_pos_norm = LA.norm(left_pos)
        right_pos_norm = LA.norm(right_pos)
        left_neg_norm = LA.norm(left_neg)
        right_neg_norm = LA.norm(right_neg)

        pos_weight = left_pos_norm * right_pos_norm
        neg_weight = left_neg_norm * right_neg_norm

        # Keep whichever pair of parts carries the larger weight
        if pos_weight > neg_weight:
            u = left_pos / left_pos_norm
            v = right_pos / right_pos_norm
            sigma = pos_weight
        else:
            u = left_neg / left_neg_norm
            v = right_neg / right_neg_norm
            sigma = neg_weight

        lbd = np.sqrt(S[j] * sigma)
        W[:, j] = lbd * u
        H[j, :] = lbd * v

    # Truncate small values
    W[W < eps] = 0
    H[H < eps] = 0

    # Optional filling of the zero entries
    w_zero = W == 0
    h_zero = H == 0
    if variant == "a":
        fill = X.mean()
        W[w_zero] = fill
        H[h_zero] = fill
    elif variant == "ar":
        rng = check_random_state(random_state)
        fill = X.mean()
        W[w_zero] = abs(fill * rng.randn(int(w_zero.sum())) / 100)
        H[h_zero] = abs(fill * rng.randn(int(h_zero.sum())) / 100)

    return W, H

In [0]:
# Adapted version of the NMF function to integrate graph-regularization
#
# See:
# https://github.com/luispedro/milk/blob/master/milk/unsupervised/nnmf/lee_seung.py
# https://www.researchgate.net/profile/Zhigang_Luo/publication/258350768_Limited-memory_fast_gradient_descent_method_for_graph_regularized_nonnegative_matrix_factorization/links/0c9605282f7f611648000000.pdf
from sklearn.utils.validation import check_arrays
import warnings

def GNMF(X,L,lambd=0,n_components=None,tol=1e-4,max_iter=100,verbose=False):
    """Factorise X ~= WH with multiplicative updates and a graph penalty on H.

    Parameters
    ----------
    X : non-negative matrix of shape (n_samples, n_features).
    L : square (n_features, n_features) graph matrix. It is split into its
        positive part Lp and negative part Lm, which enter the update of H.
        NOTE(review): the name suggests a graph Laplacian, but the calls in
        this notebook pass an adjacency matrix; confirm which is intended.
    lambd : float, weight of the graph terms (0 disables them).
    n_components : int or None; a falsy value means n_features.
    tol : float, the iterations stop once the reconstruction error is below it.
    max_iter : int, maximum number of update rounds.
    verbose : bool, print the error at every iteration.

    Returns
    -------
    (W, H, reconstruction_err_) where W and H are squeezed plain arrays and
    reconstruction_err_ is the last computed norm of X - WH.

    Notes
    -----
    W and H start from squared normal draws of the global numpy generator:
    seed np.random beforehand to get reproducible results. Warnings are
    emitted when the tolerance or the iteration limit is reached, or when
    NaN values appear.
    """
    X = check_arrays(X)[0]
    check_non_negative(X, "NMF.fit")
    n_samples, n_features = X.shape

    if not n_components:
        n_components = n_features

    # Random non-negative start (NBS_init(X,n_components) is the alternative)
    W = np.random.normal(0,1,(n_samples,n_components))**2
    H = np.random.normal(0,1,(n_components,n_features))**2

    reconstruction_err_ = LA.norm(X - np.dot(W, H))
    # Small constant keeping ratios finite and factors strictly positive
    eps=1e-4
    Lp = (abs(L)+L)/2
    Lm = (abs(L)-L)/2

    for n_iter in range(1, max_iter + 1):
        if verbose:
            # Single-string form: same output as the former print statement
            print("Iteration = %s / %s — Error = %s / %s" % (n_iter, max_iter, reconstruction_err_, tol))

        # Multiplicative update of H, with the graph terms
        h1=lambd*np.dot(H,Lm)+np.dot(W.T,(X+eps)/(np.dot(W,H)+eps))
        h2=lambd*np.dot(H,Lp)+np.dot(W.T,np.ones(X.shape))
        H = np.multiply(H,(h1+eps)/(h2+eps))
        H[H<=0]=eps
        H[np.isnan(H)]=eps

        # Multiplicative update of W
        w1=np.dot((X+eps)/(np.dot(W,H)+eps),H.T)
        w2=np.dot(np.ones(X.shape),H.T)
        W = np.multiply(W,(w1+eps)/(w2+eps))
        # Clamp W with its own mask (the mask of H was used before, which has
        # another shape and was always empty right after the clamping of H)
        W[W<=0]=eps
        W[np.isnan(W)]=eps

        if not sp.issparse(X):
            # NOTE(review): the factors are perturbed when the error went
            # DOWN since the previous iteration; confirm this is intended.
            if reconstruction_err_ > LA.norm(X - np.dot(W, H)):
                H=(1-eps)*H+eps*np.random.normal(0,1,(n_components,n_features))**2
                W=(1-eps)*W+eps*np.random.normal(0,1,(n_samples,n_components))**2
            reconstruction_err_ = LA.norm(X - np.dot(W, H))
        else:
            # NOTE(review): the updates above add a scalar to X, which
            # scipy.sparse matrices presumably reject; this branch may be unreachable.
            norm2X = np.sum(X.data ** 2)  # Ok because X is CSR
            normWHT = np.trace(np.dot(np.dot(H.T, np.dot(W.T, W)), H))
            cross_prod = np.trace(np.dot((X * H.T).T, W))
            # np.sqrt: a bare `sqrt` is not defined in this notebook
            reconstruction_err_ = np.sqrt(norm2X + normWHT - 2. * cross_prod)

        if reconstruction_err_<tol:
            warnings.warn("Tolerance error reached during fit")
            break

        if np.isnan(W).any() or np.isnan(H).any():
            warnings.warn("NaN values at "+ str(n_iter)+" Error="+str(reconstruction_err_))
            break

        if n_iter == max_iter:
            warnings.warn("Iteration limit reached during fit")

    return np.squeeze(np.asarray(W)), np.squeeze(np.asarray(H)), reconstruction_err_

In [0]:
# Four 3-component decompositions with the GNMF function:
#   - graph weight 0 (plain NMF) on the raw, then on the diffused profiles
#   - graph weight diffusionFactor on the raw, then on the diffused profiles
# reconstruction_err_ and reconstruction_err_Diff are overwritten by the last two calls.
WNMF, stratipyCompGNMF, reconstruction_err_ = GNMF(np.matrix(nnmutFiltered),np.matrix(nnnetFiltered),0.,n_components=3,tol=1e-3)
WNMFDiff, stratipyCompGNMFDiff, reconstruction_err_Diff = GNMF(np.matrix(nnmutDiffused),np.matrix(nnnetFiltered),0.,n_components=3,tol=1e-3)
W, stratipyCompG, reconstruction_err_ = GNMF(np.matrix(nnmutFiltered),np.matrix(nnnetFiltered),diffusionFactor,n_components=3,tol=1e-3)
WDiff, stratipyCompGDiff, reconstruction_err_Diff = GNMF(np.matrix(nnmutDiffused),np.matrix(nnnetFiltered),diffusionFactor,n_components=3,tol=1e-3)

In [0]:
# One panel per decomposition. stratipyCompGNMF* were obtained with a graph
# weight of 0 (plain NMF), stratipyCompG* with the graph regularisation, so
# each result is shown under the matching title and scaled by its own maximum
# (they used to be swapped and scaled by the maximum of another result).
panels=[(411,stratipyCompGNMF,"NMF decomposition on raw mutation profiles"),
        (412,stratipyCompGNMFDiff,"NMF decomposition on diffused mutation profiles"),
        (413,stratipyCompG,"GNMF decomposition on raw mutation profiles"),
        (414,stratipyCompGDiff,"GNMF decomposition on diffused mutation profiles")]

plt.figure(1,figsize=(16,10))
for position,components,title in panels:
    plt.subplot(position)
    plt.plot(components.T/components.max())
    plt.ylabel("Weight")
    plt.xlabel("Genes")
    plt.title(title)
    plt.xlim([0,components.shape[1]])
# Labels are matched to the lines by position: use an ordered list, not a set
plt.legend(['Component 1','Component 2','Component 3'])
plt.show()

In [0]:
# For every gene (column): index of its strongest component, and the
# corresponding weight read back from that component.
Stratification = stratipyCompGDiff.argmax(axis=0)
Weights = stratipyCompGDiff[Stratification, np.arange(Stratification.size)]

# Distribution of these strongest weights
plt.figure(1, figsize=(16, 5))
plt.hist(Weights, bins=200)
plt.show()

In [0]:
# `symbols` covers every gene of the padded network, whereas Stratification and
# Weights only cover the genes kept in nnmutFiltered: restrict the symbols with
# the same mask before selecting, otherwise the names do not match the columns.
filteredSymbols=symbols[filteredGenes==False]
assert len(filteredSymbols)==len(Stratification), "symbols and components are misaligned"

# For each component: number, then symbols, of the genes assigned to it with a weight above 0.1
for comp in range(stratipyCompGDiff.shape[0]):
    selectedGenes=filteredSymbols[(Stratification==comp)&(Weights>0.1)]
    print("%s %s" % (comp+1,len(selectedGenes)))
    for g in selectedGenes:
        print(g[0][0])
    print('\n')

In [0]:
# WNMFDiff has one row per patient kept in nnmutFiltered, whereas `cancer` and
# `grade` have one entry per original sample: go through the indices of the
# kept patients so that each row is compared with its own phenotype.
# NOTE(review): assumes `cancer`/`grade` follow the row order of the mutation matrix.
keptPatients=np.flatnonzero(np.asarray(filteredPatients).ravel()==False)
patientComponent=np.argmax(WNMFDiff,axis=1)
assert len(keptPatients)==len(patientComponent), "patients and components are misaligned"

def _printComponentCounts(title,labels):
    """Print, for each value of `labels`, the number of kept patients per component."""
    print(title)
    keptLabels=[labels[i] for i in keptPatients]
    for c in sorted(set(labels)):
        print("- "+c.capitalize()+":")
        for p in range(WNMFDiff.shape[1]):
            count=sum(1 for k,comp in enumerate(patientComponent) if comp==p and keptLabels[k]==c)
            print("Component %s : %s" % (p,count))

_printComponentCounts("Type of Cancers:",cancer)
_printComponentCounts("\nGrade of Cancers:",grade)

## Network visualization

In [0]:
import pandas as pd
# Positions (in the original gene list) of the genes that survived both
# filters: positive degree first, then a non-empty row in the reduced network.
reordered = good + bad
tmp = [gene for position, gene in enumerate(reordered) if degree[position] > 0]
selectedGenes = [gene for position, gene in enumerate(tmp) if notAlone[position]]

# Gene identifiers
df0 = pd.DataFrame({
    'EntrezId': [g[0] for g in somatic['gene_id_all'][selectedGenes]],
    'Genes': [g[0][0] for g in somatic['gene_id_symbol'][selectedGenes]],
})
# For each decomposition: the three components, their sum, and the index of
# the strongest component of every gene.
# NOTE(review): the column names mix the spellings "Starti" and "Strati"; they
# are kept as they are because files already exported may rely on them.
df1 = pd.DataFrame({
    'StartiPyDiff_1': stratipyCompGDiff[0, :].T,
    'StartiPyDiff_2': stratipyCompGDiff[1, :].T,
    'StartiPyDiff_3': stratipyCompGDiff[2, :].T,
    'StratiPyDiff_W': stratipyCompGDiff.sum(axis=0).T,
    'StratiPyDiff_Comp': np.argmax(stratipyCompGDiff, axis=0).T,
})
df2 = pd.DataFrame({
    'StartiPy_1': stratipyCompG[0, :].T,
    'StartiPy_2': stratipyCompG[1, :].T,
    'StartiPy_3': stratipyCompG[2, :].T,
    'StratiPy_W': stratipyCompG.sum(axis=0).T,
    'StratiPy_Comp': np.argmax(stratipyCompG, axis=0).T,
})
df3 = pd.DataFrame({
    'NNF_1': sklearnComp[0, :].T,
    'NNF_2': sklearnComp[1, :].T,
    'NNF_3': sklearnComp[2, :].T,
    'NNF_W': sklearnComp.sum(axis=0).T,
    'NNF_Comp': np.argmax(sklearnComp, axis=0).T,
})
df4 = pd.DataFrame({
    'NNFDiff_1': sklearnCompDiff[0, :].T,
    'NNFDiff_2': sklearnCompDiff[1, :].T,
    'NNFDiff_3': sklearnCompDiff[2, :].T,
    'NNFDiff_W': sklearnCompDiff.sum(axis=0).T,
    'NNFDiff_Comp': np.argmax(sklearnCompDiff, axis=0).T,
})

# One row per gene, all tables side by side
allResults = pd.concat([df0, df1, df2, df3, df4], axis=1)
allResults.to_csv(dataFolder + 'StratificationResults.csv')

In [0]:
import networkx as nx
# Build a networkx graph from the filtered adjacency matrix and write its edge list to disk.
H=nx.from_numpy_matrix(np.matrix(nnnetFiltered))
nx.write_edgelist(H, dataFolder+"Hofree-edgelist.csv")

In [0]:
# Draw the network with a Graphviz "neato" layout; every node is coloured by
# the index of its strongest component in the GNMF result on diffused profiles.
# NOTE(review): graphviz_layout is not exposed at the top level of recent
# networkx releases; confirm the networkx version this notebook targets.
plt.figure(1,figsize=(16,10))
pos=nx.graphviz_layout(H,prog="neato")
node_color=np.argmax(stratipyCompGDiff, axis=0)
nx.draw(H,pos,with_labels=False,node_size=50,node_color=node_color,cmap = plt.cm.Pastel1)
# Leave a 5% margin beyond the largest node coordinates
cut = 1.05
xmax= cut*max(xx for xx,yy in pos.values())
ymax= cut*max(yy for xx,yy in pos.values())
plt.xlim(0,xmax)
plt.ylim(0,ymax)
plt.show()

## Check the effects of the parameters

In [0]:
# Reconstruction error on the DIFFUSED profiles for 1 to 20 components (rows of
# err) and graph weights 0.0 to 1.0 in steps of 0.1 (columns of err).
# Each fit is limited to 5 iterations.
err=np.zeros((20,11))
for ncomp in range(20):
    for smooth in range(11):
        # trailing comma: stay on the same line until the error is printed
        print "Ncomp=",ncomp+1," Smooth=",smooth/10.,
        WDiff2,stratipyCompGDiff2,error = GNMF(np.matrix(nnmutDiffused),np.matrix(nnnetFiltered),smooth/10.,n_components=ncomp+1,tol=1e-3,max_iter=5)
        err[ncomp,smooth]=error
        print " Error=",error

In [0]:
# Same sweep on the RAW (filtered, not diffused) profiles: rows of err2 are the
# numbers of components (1 to 20), columns the graph weights (0.0 to 1.0).
# Each fit is limited to 5 iterations.
err2=np.zeros((20,11))
for ncomp in range(20):
    for smooth in range(11):
        # trailing comma: stay on the same line until the error is printed
        print "Ncomp=",ncomp+1," Smooth=",smooth/10.,
        WDiff2,stratipyCompGDiff2,error = GNMF(np.matrix(nnmutFiltered),np.matrix(nnnetFiltered),smooth/10.,n_components=ncomp+1,tol=1e-3,max_iter=5)
        err2[ncomp,smooth]=error
        print " Error=",error

In [0]:
# Error maps of the sweep stored in `err`.
# Left: raw error values. Right: error minus the mean of its row, i.e. relative
# to the average error obtained with the same number of components.
plt.figure(figsize=(20,11))
plt.subplot(121)
plt.imshow(err, interpolation="nearest")
plt.gca().invert_yaxis()
plt.xticks(np.arange(11),np.arange(11)/10.)
plt.yticks(np.arange(20),np.arange(20)+1)
plt.ylabel("Number of Component(s)")
plt.xlabel("Smoothing factor")
plt.title("Absolute error")
plt.colorbar()
plt.subplot(122)
# (column of row means) x (row of ones) repeats each row mean over the 11 columns
plt.imshow(err-np.matrix(np.mean(err,axis=1)).T*np.matrix(np.ones(11)), interpolation="nearest")
plt.gca().invert_yaxis()
plt.xticks(np.arange(11),np.arange(11)/10.)
plt.yticks(np.arange(20),np.arange(20)+1)
plt.ylabel("Number of Component(s)")
plt.xlabel("Smoothing factor")
plt.title("Relative error by number of component")
plt.colorbar()
plt.show()

In [0]:
# Error maps of the sweep stored in `err2`.
# Left: raw error values. Right: error minus the mean of its row, i.e. relative
# to the average error obtained with the same number of components.
plt.figure(figsize=(20,10))
plt.subplot(121)
plt.imshow(err2, interpolation="nearest")
plt.gca().invert_yaxis()
plt.xticks(np.arange(11),np.arange(11)/10.)
plt.yticks(np.arange(20),np.arange(20)+1)
plt.ylabel("Number of Component(s)")
plt.xlabel("Smoothing factor")
plt.title("Absolute error")
plt.colorbar()
plt.subplot(122)
# (column of row means) x (row of ones) repeats each row mean over the 11 columns
plt.imshow(err2-np.matrix(np.mean(err2,axis=1)).T*np.matrix(np.ones(11)), interpolation="nearest")
plt.gca().invert_yaxis()
plt.xticks(np.arange(11),np.arange(11)/10.)
plt.yticks(np.arange(20),np.arange(20)+1)
plt.ylabel("Number of Component(s)")
plt.xlabel("Smoothing factor")
plt.title("Relative error by number of component")
plt.colorbar()
plt.show()

In [0]:
# Mean error of both sweeps, centred on their overall mean.
# Left: as a function of the number of components. Right: as a function of the
# smoothing factor. In each panel the first line comes from `err` (diffused
# profiles) and the second one from `err2` (raw filtered profiles).
plt.figure(figsize=(16,10))
plt.subplot(121)
plt.plot(np.vstack((err.mean(axis=1)-err.mean(),err2.mean(axis=1)-err2.mean())).T)
plt.ylabel("Average relative reconstruction error")
plt.xlabel("Number of Component(s)")
plt.subplot(122)
plt.plot(np.vstack((err.mean(axis=0)-err.mean(),err2.mean(axis=0)-err2.mean())).T)
plt.ylabel("Average relative reconstruction error")
plt.xlabel("Smoothing factor")
# Labels are matched to the lines by position: use an ordered list, not a set
plt.legend(["Diffused","Filtered"])
plt.show()

## Under construction ...