In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name to get a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Graph Embeddings

Graph embeddings are representations of a graph that bear an important resemblance to node-level representations in a particular context. Graph embeddings are used to capture different aspects of the representations arising from node/subgraph-level heuristics. These include relations between the different nodes in a graph, different representations amongst different subgraphs, and projections of the embedding vectors into a lower-dimensional space. Graph embeddings work on knowledge-graph data structures and capture representations based on different factors. These representations are encoded in a lower-dimensional space into which each input representation is mapped. The metrics can be based on edge weights, neighbourhood count, degrees, centralities, heat kernels and many other factors. The simplest idea for generating a graph embedding is to create a representation where 2 nodes ```x``` and ```y``` can be represented as ```grad(|x-y|w)```, where w is the weight of the edge connecting the nodes. There are different types of graph embeddings, such as node embeddings, structural embeddings, graph embeddings, community-based embeddings, geometric embeddings and others. In this notebook, we will be looking at some of the most popular node embeddings which can be used to create representations of a graph, along with the associated research papers. The following image shows how subgraph-level information is encoded in a lower-dimensional vector.

<img src="https://miro.medium.com/max/1212/1*qYGg0y_0hRdITlMIT5TadA.png">


In [None]:
# Install the third-party packages used by the cells below (no versions are pinned here).
!pip install networkx
!pip install gensim
!pip install torch
!pip install tensorflow

## Laplacian Eigenvalues

Solve the generalized eigenvalue problem:

$$Lv = \lambda D v$$

The Laplacian Eigenmap uses the smallest eigenvectors. But not the very smallest eigenvector, v1, which is constant (we can scale it to be a vector of 1s), and corresponds to an eigenvalue of zero. So if you want to reduce to two dimensions, use the second-smallest and third-smallest eigenvectors.

Let’s do a brief bit of rearranging:

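$$
\begin{aligned}
Lv &= \lambda D v \\
D^{-1} L v &= \lambda v \\
\left(D^{-1}D - D^{-1}W\right) v &= \lambda v \\
(I - P)\, v &= \lambda v \\
L_{rw}\, v &= \lambda v
\end{aligned}
$$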

So it turns out that the standard eigenvalue problem with $L_{rw}$ will produce the same results as the generalized eigenvalue problem with $L$ and $D$. A non-generalized eigenvalue problem is preferable to the generalized problem, at least in R, because generalized problems require installing the CRAN package geigen. You could even use the eigenvectors of $P$, although you have to bear in mind that the eigenvalues of $P$ differ from those of $L_{rw}$: since $L_{rw} = I - P$, an eigenvector with eigenvalue $\lambda$ for $L_{rw}$ has eigenvalue $1 - \lambda$ for $P$, so the order of the eigenvectors is reversed, i.e. you want those associated with the largest eigenvalues, ignoring the top eigenvector (which is the constant eigenvector). $P$ might even be preferable because it is ever-so-slightly less work to calculate than $L_{rw}$. We'll revisit the relationship between $L_{rw}$ and $P$ when we talk about diffusion maps.

### Output

Now that you have k eigenvectors, stack them columnwise to form an N x k matrix (let’s call it Y):

$$Y = [\, v_2 \mid v_3 \mid \dots \mid v_k \,]$$

where, as noted above, we are not using the uninformative smallest eigenvector, v1. The rows of that matrix are the coordinates of the graph vertices in the reduced dimension, i.e. the ith row of the 2D Laplacian Eigenmap representing vertex i would be:

$$y_i = (v_{i,2},\; v_{i,3})$$

### The Connection with Locally Linear Embedding

The Laplacian Eigenmap paper demonstrates a connection between LE and LLE, in that LLE is approximately computing the eigenvectors of $L^2$, which has the same eigenvectors as $L$ (and the square of the eigenvalues).

The paper on [Laplacian Eigen maps](https://papers.nips.cc/paper/2001/file/f106b7f99d2cb30c3db1c3cc0fde9ccb-Paper.pdf)
[Blog](http://www.dakotamurray.me/post/graph_laplacian_eigenmap/)

<img src="https://ww2.mathworks.cn/matlabcentral/mlc-downloads/downloads/submissions/36141/versions/2/screenshot.png">

In many cases the Eigen map can be used to represent node level representations in a social network:

<img src="http://www.dakotamurray.me/post_images/graph_laplacian_eigenmaps/karate_graph.png">

In [None]:
import networkx as nx
import scipy
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Load the Google QUEST training table and show which columns are available.
train_df = pd.read_csv('../input/google-quest-challenge/train.csv')
print(train_df.columns)

# Connect each of the first 500 question bodies to its category.
g = nx.from_pandas_edgelist(train_df[:500], source='question_body', target='category')
nx.draw(g)

edgelist = []  # kept from the original cell; nothing below fills it
ed_c = g.number_of_edges()

# Edge weight = (neighbour count of both endpoints) / (total number of edges).
for e in g.edges():
    node1, node2 = e[0], e[1]
    n_c = len(list(g.neighbors(node1))) + len(list(g.neighbors(node2)))
    normalized_count = n_c / ed_c
    g[node1][node2]['weight'] = normalized_count

nx.draw(g, pos=nx.spring_layout(g))
class laplacian_maps():
    """Spectral node embedding built from the normalized graph Laplacian.

    The graph is converted to an undirected graph on construction. Rows of the
    embedding follow the order of ``list(self.graph.nodes())``.
    """

    def __init__(self,graph,is_weighted,d):
        """Store the (undirected) graph and the target embedding dimension.

        Args:
            graph: NetworkX graph to embed.
            is_weighted: stored for API compatibility; no method reads it.
            d: number of embedding dimensions requested.
        """
        self.is_weighted=is_weighted
        self.graph=graph.to_undirected()
        self.d=d

    def create_embedding(self):
        """Compute the embedding and return it as an ``(n_nodes, d)`` array.

        When the graph has at most ``d`` nodes, only ``n_nodes - 1`` eigenvectors
        can be requested from the sparse solver; the result is then zero-padded
        on the column axis so the returned array still has ``d`` columns.
        """
        # Import the submodule explicitly: a bare `import scipy` is not
        # guaranteed to expose `scipy.sparse.linalg` on older SciPy releases.
        from scipy.sparse.linalg import eigsh

        laplace_matrix=nx.normalized_laplacian_matrix(self.graph)
        node_count=self.graph.number_of_nodes()
        # NOTE(review): which='LM' selects the largest-magnitude eigenvalues,
        # while the notebook text describes using the smallest non-trivial
        # eigenvectors. Left unchanged to keep results stable; confirm intent.
        if node_count<=self.d:
            # eigsh on a sparse matrix needs k < n, so ask for n-1 vectors.
            _,embedding_vectors=eigsh(laplace_matrix,k=node_count-1,which='LM',ncv=10*self.d,return_eigenvectors=True)
            # Pad columns only (one leading zero column, the rest trailing) so
            # that every node keeps its row and the width becomes exactly d.
            diff=self.d-embedding_vectors.shape[1]-1
            embedding_vectors=np.pad(embedding_vectors,((0,0),(1,diff)),'constant',constant_values=0)
        else:
            _,embedding_vectors=eigsh(laplace_matrix,k=self.d,which='LM',ncv=10*self.d,return_eigenvectors=True)
        self.embedding=embedding_vectors
        return embedding_vectors

    def plot_embedding(self,node_pos,node_colors=None, di_graph=None, labels=None):
        """Plot an embedding matrix in two dimensions.

        Args:
            node_pos: array of shape ``(num_nodes, embedding_dimension)``.
            node_colors: optional per-node colours forwarded to the drawing call.
            di_graph: optional NetworkX graph; when given, its nodes are drawn at
                the embedding coordinates instead of a plain scatter plot.
            labels: optional node -> text mapping, used only when ``di_graph`` is
                drawn without ``node_colors``.
        """
        node_num,embedding_dimension = node_pos.shape
        if(embedding_dimension > 3):
            print("Embedding dimension greater than 3, use tSNE to reduce it to 3")
            model = TSNE(n_components=3)
            node_pos = model.fit_transform(node_pos)

        if di_graph is None:
            plt.scatter(node_pos[:, 0], node_pos[:, 1], c=node_colors)
            return

        # Keep integer row indices as keys when the graph is labelled 0..n-1;
        # otherwise pair embedding rows with nodes in graph order so that
        # graphs with arbitrary node labels can be drawn as well.
        if all(i in di_graph for i in range(node_num)):
            keys = range(node_num)
        else:
            keys = list(di_graph.nodes())[:node_num]
        # NetworkX draws in 2-D, so only the first two coordinates are used.
        pos = {key: node_pos[row, :2] for row, key in enumerate(keys)}

        if node_colors is not None:
            # draw_networkx_nodes only accepts node-related options; the edge and
            # label options previously passed here are rejected by current NetworkX.
            nx.draw_networkx_nodes(di_graph, pos,
                                   node_color=node_colors,
                                   node_size=100, alpha=0.8)
        else:
            nx.draw_networkx(di_graph, pos, node_color=node_colors,
                             width=0.1, node_size=300, arrows=False,
                             alpha=0.8, font_size=12, labels=labels)

    def node_level_embedding(self,node,embed):
        """Compare one node's embedding with every other node's embedding.

        Args:
            node: integer row index of the reference node.
            embed: embedding array indexed by row.

        Returns:
            ``(reference_node, distances, others)`` where ``distances[i]`` is the
            Chebyshev distance between the reference row and the row of
            ``others[i]``; the reference node itself is excluded.
        """
        from scipy.spatial.distance import chebyshev

        vals=list(self.graph.nodes())
        embed_node=embed[node]
        distances=[]
        questions=[]
        for i,other in enumerate(vals):
            if i!=node:
                distances.append(chebyshev(embed_node,embed[i]))
                questions.append(other)
        return vals[node],distances,questions
        
# Embed the question/category graph and plot the result.
dimension = 500
laplace = laplacian_maps(g, True, dimension)
embeddings = laplace.create_embedding()
laplace.plot_embedding(embeddings)

# Distances from one reference node (row 24) to every other node.
node_num = 24
node, distances, questions = laplace.node_level_embedding(node_num, embeddings)
laplace_df = pd.DataFrame({
    'Question': [node] * len(distances),
    'Sample_Question': questions,
    'Chebyshev_Distance': distances,
})
laplace_df.head()

laplace_df.to_csv('../Laplacian_Embeddings.csv', index=False)

## A Driver for Plotting the Graphs

The below code sample is used for drawing the graphs created with any kind of node level representational embeddings.

In [None]:
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly
import plotly.graph_objs as go
init_notebook_mode(connected=True)
# Graph linking the reference question to every other node, with the distance as edge attribute.
laplacian_g=nx.from_pandas_edgelist(laplace_df,source='Question',target='Sample_Question',edge_attr='Chebyshev_Distance')
G=laplacian_g
def plotter(G,title):
    """Render ``G`` as an interactive Plotly network diagram.

    Nodes are placed with a spring layout and coloured by their number of
    neighbours; hovering a node shows its label and connection count.

    Args:
        G: NetworkX graph to draw. Each node receives a ``'pos'`` attribute
            holding its layout coordinates (side effect kept from the
            original implementation).
        title: figure title.
    """
    pos = nx.spring_layout(G, k=0.5, iterations=50)
    for n, p in pos.items():
        G.nodes[n]['pos'] = p

    # Collect coordinates in plain lists and build each trace once; extending
    # trace properties element by element copies the whole tuple every time.
    edge_x, edge_y = [], []
    for u, v in G.edges():
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        # A None entry breaks the line so edges are drawn as separate segments.
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]
    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(width=0.5,color='#888'),
        hoverinfo='none',
        mode='lines')

    node_x, node_y, node_degree, node_text = [], [], [], []
    for node, neighbours in G.adjacency():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_degree.append(len(neighbours))
        # str() keeps the hover text working for non-string node labels.
        node_text.append(str(node) +' # of connections: '+str(len(neighbours)))
    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        text=node_text,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            colorscale='RdBu',
            reversescale=True,
            color=node_degree,
            size=15,
            colorbar=dict(
                thickness=10,
                # Nested title form; the flat `titleside` attribute is deprecated.
                title=dict(text='Node Connections', side='right'),
                xanchor='left'
            ),
            line=dict(width=0)))

    fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                # Nested title form; the flat `titlefont` attribute is deprecated.
                title=dict(text=title, font=dict(size=16)),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="No. of connections",
                    showarrow=False,
                    xref="paper", yref="paper") ],
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

    iplot(fig)
plotter(G,'Laplace Eigenmap Distance')

## HOPE

An implementation of [HOPE](https://www.kdd.org/kdd2016/papers/files/rfp0184-ouA.pdf) from the KDD '16 paper "Asymmetric Transitivity Preserving Graph Embedding". The procedure uses sparse SVD on the neighbourhood overlap matrix. The left and right singular vectors, rescaled by the singular values, are concatenated and used as the node embeddings.
A diagram depicting the operation of HOPE is shown here:

<img src="https://d3i71xaburhd42.cloudfront.net/07627bf7eb649220ffbcdf6bf233e3a4a76e8590/2-Figure2-1.png">

In this case, we create an adjacency matrix and then convert it to a sparse matrix in triplet (COO) format (according to scipy). Then a decomposition method (ideally SVD) is applied to the sparse matrix, such that the matrix can be subdivided into normalized/rescaled left and right submatrices (subvectors). The final embedding vector is obtained by combining the left and right counterparts. HOPE is used for preserving the higher-order, degree-based adjacency-matrix similarities by decomposing through a standard reduction technique. A walkthrough of the same is present in the video:

[Video](https://www.semanticscholar.org/paper/Asymmetric-Transitivity-Preserving-Graph-Embedding-Ou-Cui/07627bf7eb649220ffbcdf6bf233e3a4a76e8590/video/4d967a6b)

In [None]:
class HOPE():
    """Node embedding obtained by sparse SVD of the squared adjacency matrix.

    Rows of the embedding follow the order of ``list(self.graph.nodes())``.
    """

    def __init__(self,graph,d):
        """Store the graph and the total embedding dimension ``d``."""
        self.graph=graph
        self.dimension=d

    def create_embeddings(self):
        """Return an ``(n_nodes, 2 * (d // 2))`` array of node embeddings.

        The left and right singular vectors are each scaled by the square root
        of the singular values and concatenated column-wise.
        """
        # Explicit submodule imports: a bare `import scipy` is not guaranteed
        # to expose them on older SciPy releases.
        from scipy.sparse import coo_matrix
        from scipy.sparse.linalg import svds

        # Use the graph's own node labels: `range(n)` only matches graphs whose
        # nodes happen to be the integers 0..n-1.
        node_order=list(self.graph.nodes())
        # svds requires a floating-point matrix; unweighted adjacency is integer.
        adj_matrix=nx.adjacency_matrix(self.graph,nodelist=node_order).astype(float)
        S_matrix=coo_matrix(adj_matrix.dot(adj_matrix))
        U,sigma,vt=svds(S_matrix,k=self.dimension//2)
        sigma_norm=np.diag(np.sqrt(sigma))
        self.left_embedding = np.dot(U, sigma_norm)
        self.right_embedding = np.dot(vt.T, sigma_norm)
        return np.concatenate([self.left_embedding, self.right_embedding], axis=1)

    def plot_embedding(self,node_pos,node_colors=None, di_graph=None, labels=None):
        """Plot an embedding matrix in two dimensions.

        Args:
            node_pos: array of shape ``(num_nodes, embedding_dimension)``.
            node_colors: optional per-node colours forwarded to the drawing call.
            di_graph: optional NetworkX graph; when given, its nodes are drawn at
                the embedding coordinates instead of a plain scatter plot.
            labels: optional node -> text mapping, used only when ``di_graph`` is
                drawn without ``node_colors``.
        """
        node_num,embedding_dimension = node_pos.shape
        if(embedding_dimension > 3):
            print("Embedding dimension greater than 3, use tSNE to reduce it to 3")
            model = TSNE(n_components=3)
            node_pos = model.fit_transform(node_pos)

        if di_graph is None:
            plt.scatter(node_pos[:, 0], node_pos[:, 1], c=node_colors)
            return

        # Keep integer row indices as keys when the graph is labelled 0..n-1;
        # otherwise pair embedding rows with nodes in graph order.
        if all(i in di_graph for i in range(node_num)):
            keys = range(node_num)
        else:
            keys = list(di_graph.nodes())[:node_num]
        # NetworkX draws in 2-D, so only the first two coordinates are used.
        pos = {key: node_pos[row, :2] for row, key in enumerate(keys)}

        if node_colors is not None:
            # draw_networkx_nodes only accepts node-related options; the edge and
            # label options previously passed here are rejected by current NetworkX.
            nx.draw_networkx_nodes(di_graph, pos,
                                   node_color=node_colors,
                                   node_size=100, alpha=0.8)
        else:
            nx.draw_networkx(di_graph, pos, node_color=node_colors,
                             width=0.1, node_size=300, arrows=False,
                             alpha=0.8, font_size=12, labels=labels)

    def node_level_embedding(self,node,embed):
        """Compare one node's embedding with every other node's embedding.

        Args:
            node: integer row index of the reference node.
            embed: embedding array indexed by row.

        Returns:
            ``(reference_node, distances, others)`` where ``distances[i]`` is the
            Chebyshev distance between the reference row and the row of
            ``others[i]``; the reference node itself is excluded.
        """
        from scipy.spatial.distance import chebyshev

        vals=list(self.graph.nodes())
        embed_node=embed[node]
        distances=[]
        questions=[]
        for i,other in enumerate(vals):
            if i!=node:
                distances.append(chebyshev(embed_node,embed[i]))
                questions.append(other)
        return vals[node],distances,questions
# Rebuild the question/category graph and embed it with HOPE.
g=nx.from_pandas_edgelist(train_df[:500],source='question_body',target='category')
nx.draw(g)
dimension=200
hope=HOPE(g,dimension)
hope_embeddings=hope.create_embeddings()
hope.plot_embedding(hope_embeddings)
node_num=24
# Use the HOPE instance itself (the original called the Laplacian object here,
# which made this cell depend on an unrelated earlier cell).
node,distances,questions=hope.node_level_embedding(node_num,hope_embeddings)
hope_df=pd.DataFrame(columns=['Question','Sample_Question','Chebyshev_Distance'])
hope_df['Question']=[node]*len(distances)
hope_df['Sample_Question']=questions
hope_df['Chebyshev_Distance']=distances
hope_df.head()
hope_df.to_csv('../HOPE_Embeddings.csv',index=False)
hope_g=nx.from_pandas_edgelist(hope_df,source='Question',target='Sample_Question',edge_attr='Chebyshev_Distance')
G=hope_g
plotter(G,'HOPE Embeddings')

## GraRep

An implementation of [GraRep](https://dl.acm.org/citation.cfm?id=2806512) from the CIKM '15 paper "GraRep: Learning Graph Representations with Global Structural Information". The procedure uses sparse truncated SVD to learn embeddings for the powers of the PMI matrix computed from powers of the normalized adjacency matrix.

<img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ0i5YeMIs2BkTNd4PVQleBaJMqC70Ag5SbsQ&usqp=CAU">

The abstract of the paper reads as follows:

```In this paper, we present GraRep, a novel model for learning vertex representations of weighted graphs. This model learns low dimensional vectors to represent vertices appearing in a graph and, unlike existing work, integrates global structural information of the graph into the learning process. We also formally analyze the connections between our work and several previous research efforts, including the DeepWalk model of Perozzi et al. as well as the skip-gram model with negative sampling of Mikolov et al. We conduct experiments on a language network, a social network as well as a citation network and show that our learned global representations can be effectively used as features in tasks such as clustering, classification and visualization. Empirical results demonstrate that our representation significantly outperforms other state-of-the-art methods in such tasks.```

GraRep uses multi-hop similarity by running on k nearest neighbours. In this implementation, we create the adjacency matrix and normalize the weights with the degree values ```(1/degree(node))```. Then we take the degree matrix created this way and multiply it with the adjacency matrix. The loss function followed here is similar to the one shown in the image below:

<img src="https://i.imgur.com/Ia2wvcZ.png">

Since we are concerned with reducing the effective distance between the vectors, we keep only the lower (negative) scores; the resulting matrix is better known as the Pointwise Mutual Information (PMI) matrix. The remaining part is the same as in HOPE: the matrix is decomposed with truncated SVD.
[Resource](http://snap.stanford.edu/proj/embeddings-www/files/nrltutorial-part1-embeddings.pdf)

In [None]:
import math
# from sklearn.decomposition import TruncatedSVD
class GraRep():
    """Multi-order node embedding from log-transformed powers of the
    degree-normalized adjacency matrix, each factorized with sparse SVD.

    Rows of every matrix follow the order of ``list(self.graph.nodes())``.
    """

    def __init__(self,graph,d,order):
        """Store the graph, the per-order embedding dimension and the order."""
        self.graph=graph
        self.dimension=d
        self.order=order
        self._embeddings=[]

    def create_matrix(self):
        """Return the row-normalized adjacency matrix ``D^-1 A`` (sparse)."""
        from scipy.sparse import coo_matrix

        # Use the graph's own node labels: `range(n)` only matches graphs whose
        # nodes happen to be the integers 0..n-1.
        node_order=list(self.graph.nodes())
        n=len(node_order)
        adj_matrix=nx.adjacency_matrix(self.graph,nodelist=node_order)
        # An isolated node has degree 0; give it a zero row instead of dividing by zero.
        degree_vals=np.array([(1.0/self.graph.degree[node]) if self.graph.degree[node] else 0.0
                              for node in node_order])
        idx=np.arange(n)
        #(data,(row,columns))
        D=coo_matrix((degree_vals,(idx,idx)),shape=(n,n))
        return D.dot(adj_matrix)

    def create_loss(self,adj_tilde=None):
        """Log-transform a transition-matrix power and keep its negative entries.

        Args:
            adj_tilde: sparse matrix to transform. When omitted, the second
                power of the normalized adjacency matrix is used, which is what
                this method computed before the parameter existed.

        Returns:
            COO matrix holding ``log(value) - log(n) - 1e-5`` for every stored
            entry whose score is negative.
        """
        from scipy.sparse import coo_matrix

        alpha=1e-5
        if adj_tilde is None:
            adj_matrix=self.create_matrix()
            adj_tilde=adj_matrix.dot(adj_matrix)
        adj_tilde=coo_matrix(adj_tilde)
        loss_scores=np.log(adj_tilde.data)-math.log(adj_tilde.shape[0])- alpha
        keep=loss_scores<0
        return coo_matrix((loss_scores[keep],(adj_tilde.row[keep],adj_tilde.col[keep])),shape=adj_tilde.shape)

    def single_embedding(self,S_matrix=None):
        """Factorize one target matrix and append the result to ``self._embeddings``.

        Args:
            S_matrix: sparse target matrix; defaults to ``self.create_loss()``.
        """
        from scipy.sparse.linalg import svds

        if S_matrix is None:
            S_matrix=self.create_loss()
        U,sigma,vt=svds(S_matrix,k=self.dimension//2)
        sigma_norm=np.diag(np.sqrt(sigma))
        self.left_embedding = np.dot(U, sigma_norm)
        self.right_embedding = np.dot(vt.T, sigma_norm)
        embedding=np.concatenate([self.left_embedding, self.right_embedding], axis=1)
        self._embeddings.append(embedding)

    def create_embeddings(self):
        """Return the column-wise concatenation of one embedding per order.

        Each step multiplies the running matrix by the normalized adjacency
        once more, so successive blocks describe successively higher powers.
        Previously every step rebuilt the same second power, which produced
        ``order`` copies of one embedding.
        """
        # Start from a clean list so repeated calls do not accumulate blocks.
        self._embeddings=[]
        adj_hat=self.create_matrix()
        adj_power=adj_hat
        for _ in range(max(self.order,1)):
            adj_power=adj_power.dot(adj_hat)
            self.single_embedding(self.create_loss(adj_power))
        return np.concatenate(self._embeddings,axis=1)

    def plot_embedding(self,node_pos,node_colors=None, di_graph=None, labels=None):
        """Plot an embedding matrix in two dimensions.

        Args:
            node_pos: array of shape ``(num_nodes, embedding_dimension)``.
            node_colors: optional per-node colours forwarded to the drawing call.
            di_graph: optional NetworkX graph; when given, its nodes are drawn at
                the embedding coordinates instead of a plain scatter plot.
            labels: optional node -> text mapping, used only when ``di_graph`` is
                drawn without ``node_colors``.
        """
        node_num,embedding_dimension = node_pos.shape
        if(embedding_dimension > 3):
            print("Embedding dimension greater than 3, use tSNE to reduce it to 3")
            model = TSNE(n_components=3)
            node_pos = model.fit_transform(node_pos)

        if di_graph is None:
            plt.scatter(node_pos[:, 0], node_pos[:, 1], c=node_colors)
            return

        # Keep integer row indices as keys when the graph is labelled 0..n-1;
        # otherwise pair embedding rows with nodes in graph order.
        if all(i in di_graph for i in range(node_num)):
            keys = range(node_num)
        else:
            keys = list(di_graph.nodes())[:node_num]
        # NetworkX draws in 2-D, so only the first two coordinates are used.
        pos = {key: node_pos[row, :2] for row, key in enumerate(keys)}

        if node_colors is not None:
            # draw_networkx_nodes only accepts node-related options; the edge and
            # label options previously passed here are rejected by current NetworkX.
            nx.draw_networkx_nodes(di_graph, pos,
                                   node_color=node_colors,
                                   node_size=100, alpha=0.8)
        else:
            nx.draw_networkx(di_graph, pos, node_color=node_colors,
                             width=0.1, node_size=300, arrows=False,
                             alpha=0.8, font_size=12, labels=labels)

    def node_level_embedding(self,node,embed):
        """Compare one node's embedding with every other node's embedding.

        Args:
            node: integer row index of the reference node.
            embed: embedding array indexed by row.

        Returns:
            ``(reference_node, distances, others)`` where ``distances[i]`` is the
            Chebyshev distance between the reference row and the row of
            ``others[i]``; the reference node itself is excluded.
        """
        from scipy.spatial.distance import chebyshev

        vals=list(self.graph.nodes())
        embed_node=embed[node]
        distances=[]
        questions=[]
        for i,other in enumerate(vals):
            if i!=node:
                distances.append(chebyshev(embed_node,embed[i]))
                questions.append(other)
        return vals[node],distances,questions
    
        

# Rebuild the question/category graph and embed it with GraRep.
g=nx.from_pandas_edgelist(train_df[:500],source='question_body',target='category')
nx.draw(g)
dimension=200
order=5
grarep=GraRep(g,dimension,order)
grarep_embeddings=grarep.create_embeddings()
print(grarep_embeddings.shape)
grarep.plot_embedding(grarep_embeddings)
node_num=24
node,distances,questions=grarep.node_level_embedding(node_num,grarep_embeddings)
grarep_df=pd.DataFrame(columns=['Question','Sample_Question','Chebyshev_Distance'])
grarep_df['Question']=[node]*len(distances)
grarep_df['Sample_Question']=questions
grarep_df['Chebyshev_Distance']=distances
grarep_df.head()
grarep_df.to_csv('../Grarep_Embeddings.csv',index=False)
# Plot the GraRep distances (the original passed the HOPE table here by mistake).
grarep_g=nx.from_pandas_edgelist(grarep_df,source='Question',target='Sample_Question',edge_attr='Chebyshev_Distance')
G=grarep_g
plotter(G,'Grarep Embeddings')
        
        
        

## NEMF

An implementation of [NetMF](https://keg.cs.tsinghua.edu.cn/jietang/publications/WSDM18-Qiu-et-al-NetMF-network-embedding.pdf) from the WSDM '18 paper "Network Embedding as Matrix Factorization: Unifying DeepWalk, LINE, PTE, and Node2Vec". The procedure uses sparse truncated SVD to learn embeddings for the pooled powers of the PMI matrix computed from powers of the normalized adjacency matrix.

In this case, we create the normalized adjacency matrix as in GraRep. We also create a pooled adjacency matrix by summing its powers, multiplying by the number of edges in the graph and then dividing by the product of the order and the number of negative samples. Then we decompose the resulting PMI matrix with SVD to learn the embedding vectors.
<img src="https://media.springernature.com/lw685/springer-static/image/art%3A10.1007%2Fs00439-020-02226-3/MediaObjects/439_2020_2226_Fig3_HTML.png">



In [None]:
class NEMF():
    """Node embedding from a sparse SVD of the log-transformed, pooled powers
    of the degree-normalized adjacency matrix.

    Rows of every matrix follow the order of ``list(self.graph.nodes())``.
    """

    def __init__(self,graph,d,order,negative_samples):
        """Store the graph, embedding dimension, pooling order and the
        negative-sample count used to scale the pooled matrix."""
        self.graph=graph
        self.dimension=d
        self.order=order
        self.negative_samples=negative_samples
        self._embeddings=[]

    def create_matrix(self):
        """Return ``(D^-1 A, D^-1)``: the row-normalized adjacency matrix and
        the diagonal inverse-degree matrix (both sparse)."""
        from scipy.sparse import coo_matrix

        # Use the graph's own node labels: `range(n)` only matches graphs whose
        # nodes happen to be the integers 0..n-1.
        node_order=list(self.graph.nodes())
        n=len(node_order)
        adj_matrix=nx.adjacency_matrix(self.graph,nodelist=node_order)
        # An isolated node has degree 0; give it a zero row instead of dividing by zero.
        degree_vals=np.array([(1.0/self.graph.degree[node]) if self.graph.degree[node] else 0.0
                              for node in node_order])
        idx=np.arange(n)
        #(data,(row,columns))
        D=coo_matrix((degree_vals,(idx,idx)),shape=(n,n))
        adj_matrix=D.dot(adj_matrix)
        return adj_matrix,D

    def create_loss(self):
        """Build the log-transformed pooled target matrix.

        The first ``order`` powers of the normalized adjacency matrix are
        summed, scaled by ``edges / (order * negative_samples)`` and multiplied
        by the inverse-degree matrix. Entries whose score
        ``log(value) - log(n) - 1e-5`` is negative are kept.
        """
        from scipy.sparse import coo_matrix

        adj_hat,D=self.create_matrix()
        alpha=1e-5
        adj_power=adj_hat
        a_pooled=adj_hat
        for _ in range(self.order-1):
            # Advance to the next power; the original re-added the second
            # power on every pass instead of successive powers.
            adj_power=adj_power.dot(adj_hat)
            a_pooled=a_pooled+adj_power
        a_pooled = (self.graph.number_of_edges()*a_pooled)/(self.order*self.negative_samples)
        a_pooled=coo_matrix(a_pooled.dot(D))
        loss_scores=np.log(a_pooled.data)-math.log(a_pooled.shape[0])- alpha
        keep=loss_scores<0
        return coo_matrix((loss_scores[keep],(a_pooled.row[keep],a_pooled.col[keep])),shape=a_pooled.shape)

    def single_embedding(self,S_matrix=None):
        """Factorize one target matrix and append the result to ``self._embeddings``.

        Args:
            S_matrix: sparse target matrix; defaults to ``self.create_loss()``.
        """
        from scipy.sparse.linalg import svds

        if S_matrix is None:
            S_matrix=self.create_loss()
        U,sigma,vt=svds(S_matrix,k=self.dimension//2)
        sigma_norm=np.diag(np.sqrt(sigma))
        self.left_embedding = np.dot(U, sigma_norm)
        self.right_embedding = np.dot(vt.T, sigma_norm)
        embedding=np.concatenate([self.left_embedding, self.right_embedding], axis=1)
        self._embeddings.append(embedding)

    def create_embeddings(self):
        """Return the concatenated embedding blocks.

        The pooled target matrix is built once and factorized ``order`` times.
        NOTE(review): every block factorizes the same matrix, so the blocks
        carry the same information; the repetition is kept only so the output
        width matches the previous behaviour. Confirm whether a single block
        is what is actually wanted.
        """
        # Start from a clean list so repeated calls do not accumulate blocks.
        self._embeddings=[]
        loss_matrix=self.create_loss()
        for _ in range(max(self.order,1)):
            self.single_embedding(loss_matrix)
        return np.concatenate(self._embeddings,axis=1)

    def plot_embedding(self,node_pos,node_colors=None, di_graph=None, labels=None):
        """Plot an embedding matrix in two dimensions.

        Args:
            node_pos: array of shape ``(num_nodes, embedding_dimension)``.
            node_colors: optional per-node colours forwarded to the drawing call.
            di_graph: optional NetworkX graph; when given, its nodes are drawn at
                the embedding coordinates instead of a plain scatter plot.
            labels: optional node -> text mapping, used only when ``di_graph`` is
                drawn without ``node_colors``.
        """
        node_num,embedding_dimension = node_pos.shape
        if(embedding_dimension > 3):
            print("Embedding dimension greater than 3, use tSNE to reduce it to 3")
            model = TSNE(n_components=3)
            node_pos = model.fit_transform(node_pos)

        if di_graph is None:
            plt.scatter(node_pos[:, 0], node_pos[:, 1], c=node_colors)
            return

        # Keep integer row indices as keys when the graph is labelled 0..n-1;
        # otherwise pair embedding rows with nodes in graph order.
        if all(i in di_graph for i in range(node_num)):
            keys = range(node_num)
        else:
            keys = list(di_graph.nodes())[:node_num]
        # NetworkX draws in 2-D, so only the first two coordinates are used.
        pos = {key: node_pos[row, :2] for row, key in enumerate(keys)}

        if node_colors is not None:
            # draw_networkx_nodes only accepts node-related options; the edge and
            # label options previously passed here are rejected by current NetworkX.
            nx.draw_networkx_nodes(di_graph, pos,
                                   node_color=node_colors,
                                   node_size=100, alpha=0.8)
        else:
            nx.draw_networkx(di_graph, pos, node_color=node_colors,
                             width=0.1, node_size=300, arrows=False,
                             alpha=0.8, font_size=12, labels=labels)

    def node_level_embedding(self,node,embed):
        """Compare one node's embedding with every other node's embedding.

        Args:
            node: integer row index of the reference node.
            embed: embedding array indexed by row.

        Returns:
            ``(reference_node, distances, others)`` where ``distances[i]`` is the
            Chebyshev distance between the reference row and the row of
            ``others[i]``; the reference node itself is excluded.
        """
        from scipy.spatial.distance import chebyshev

        vals=list(self.graph.nodes())
        embed_node=embed[node]
        distances=[]
        questions=[]
        for i,other in enumerate(vals):
            if i!=node:
                distances.append(chebyshev(embed_node,embed[i]))
                questions.append(other)
        return vals[node],distances,questions
# Rebuild the question/category graph and embed it with the NEMF class.
g=nx.from_pandas_edgelist(train_df[:500],source='question_body',target='category')
nx.draw(g)
dimension=200
order=5
negative_samples=1
nemt=NEMF(g,dimension,order,negative_samples)
nemt_embeddings=nemt.create_embeddings()
print(nemt_embeddings.shape)
nemt.plot_embedding(nemt_embeddings)
node_num=24
node,distances,questions=nemt.node_level_embedding(node_num,nemt_embeddings)
nemt_df=pd.DataFrame(columns=['Question','Sample_Question','Chebyshev_Distance'])
nemt_df['Question']=[node]*len(distances)
nemt_df['Sample_Question']=questions
nemt_df['Chebyshev_Distance']=distances
nemt_df.head()
nemt_df.to_csv('../NEMT_Embeddings.csv',index=False)
# Plot the distances computed in this cell (the original passed the HOPE table here by mistake).
nemt_g=nx.from_pandas_edgelist(nemt_df,source='Question',target='Sample_Question',edge_attr='Chebyshev_Distance')
G=nemt_g
plotter(G,'NEMT Embeddings')
        
        
        

## Walklets 

Walklets are based on randomized walks in the neighbourhood of a particular node to create embeddings. These operate in a manner similar to node2vec/DeepWalk but skip some internal nodes in the walk stage. This is widely used for capturing multi-hop representation patterns corresponding to a particular node embedding. The [paper](https://arxiv.org/abs/1605.02115) provides this idea.

The abstract reads as follows:

```We present Walklets, a novel approach for learning multiscale representations of vertices in a network. In contrast to previous works, these representations explicitly encode multiscale vertex relationships in a way that is analytically derivable. Walklets generates these multiscale relationships by subsampling short random walks on the vertices of a graph. By `skipping' over steps in each random walk, our method generates a corpus of vertex pairs which are reachable via paths of a fixed length. This corpus can then be used to learn a series of latent representations, each of which captures successively higher order relationships from the adjacency matrix. We demonstrate the efficacy of Walklets's latent representations on several multi-label network classification tasks for social networks such as BlogCatalog, DBLP, Flickr, and YouTube. Our results show that Walklets outperforms new methods based on neural matrix factorization. Specifically, we outperform DeepWalk by up to 10% and LINE by 58% Micro-F1 on challenging multi-label classification tasks. Finally, Walklets is an online algorithm, and can easily scale to graphs with millions of vertices and edges.```


<img src="https://d3i71xaburhd42.cloudfront.net/37cf46e45777e67676f80c9110bed675a9840590/7-Figure4-1.png">

In this implementation, we have created a Random Walker Template from the previous node2vec approach.
```python
for step in range(power+1):
                neighbors = [n for i, n in enumerate(walk[step:]) if i % power == 0]
                walklets.append(neighbors)
```
Here, for each starting offset `step`, every `power`-th node of the walk is kept, so the nodes in between are skipped. A Word2Vec model is then trained with negative skip-gram sampling to create the new embedding vectors for the walked paths.


In [None]:
import random
from gensim.models.word2vec import Word2Vec
class RandomWalkerTemplate:
    """Generates uniform random walks over a graph, stored as lists of strings."""

    def __init__(self, walk_length, walk_number):
        """Remember how long each walk is and how many walks start per node."""
        self.walk_length = walk_length
        self.walk_number = walk_number

    def do_walk(self, node):
        """Return one random walk starting at ``node``.

        At every step one neighbour of the current end of the walk is drawn at
        random; a step taken from a node without neighbours adds nothing.
        Reads ``self.graph``, which ``do_walks`` assigns.
        """
        path = [node]
        steps_left = self.walk_length-1
        while steps_left > 0:
            steps_left -= 1
            candidates = list(self.graph.neighbors(path[-1]))
            if candidates:
                path.extend(random.sample(candidates, 1))
        return [str(visited) for visited in path]

    def do_walks(self, graph):
        """Run ``walk_number`` walks from every node and store them in ``self.walks``."""
        self.walks = []
        self.graph = graph
        for start in self.graph.nodes():
            self.walks += [self.do_walk(start) for _ in range(self.walk_number)]
class Walklet():
    """Walklets-style node embedding.

    Random walks are sub-sampled at skip lengths ``1 .. window_size``; one
    Word2Vec model is trained per skip length and the per-scale vectors of
    every node are concatenated.

    Args:
        graph: graph exposing ``nodes()``, ``neighbors()`` and ``number_of_nodes()``.
        walk_number: number of random walks started from every node.
        walk_length: maximum length of each random walk.
        dimensions: vector size of each per-scale Word2Vec model.
        workers: number of Word2Vec worker threads.
        window_size: largest skip length; also the number of models trained.
        epochs: number of Word2Vec training epochs per model.
        learning_rate: initial Word2Vec learning rate (``alpha``).
        min_count: minimum token frequency kept in the Word2Vec vocabulary.
        seed: seed passed to Word2Vec.
    """

    def __init__(self,graph, walk_number=10, walk_length=80, dimensions=32, workers=4,
                 window_size=4, epochs=1, learning_rate=0.05, min_count=1, seed=42):
        self.graph=graph
        self.walk_number = walk_number
        self.walk_length = walk_length
        self.dimensions = dimensions
        self.workers = workers
        self.window_size = window_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.min_count = min_count
        self.seed = seed

    def _select_walklets(self, walks, power):
        """Sub-sample every walk with stride ``power``.

        For each walk and each starting offset ``0 .. power`` the nodes at
        positions ``offset, offset + power, offset + 2 * power, ...`` are kept.

        Returns:
            A list of node sequences (lists), ``power + 1`` per input walk.
        """
        walklets = []
        for walk in walks:
            for step in range(power + 1):
                walklets.append(list(walk[step::power]))
        return walklets

    def create_embeddings(self):
        """Train one Word2Vec model per skip length and concatenate the results.

        Returns:
            ``numpy.ndarray`` with one row per node, in ``graph.nodes()`` order,
            and ``window_size * dimensions`` columns.
        """
        walker = RandomWalkerTemplate(self.walk_length, self.walk_number)
        walker.do_walks(self.graph)
        # The walker stringifies node ids, so look vectors up by string key.
        node_keys = [str(node) for node in self.graph.nodes()]

        self._embedding = []
        for power in range(1, self.window_size + 1):
            walklets = self._select_walklets(walker.walks, power)
            # Passing the corpus to the constructor builds the vocabulary and
            # trains the model, so no extra build_vocab()/train() pass is
            # needed. The configured dimensions, epochs and seed are forwarded
            # instead of relying on library defaults / hard-coded values.
            # NOTE(review): keyword names follow the gensim 4 API
            # (vector_size / epochs) -- confirm against the installed version.
            model = Word2Vec(walklets,
                             hs=0,
                             alpha=self.learning_rate,
                             vector_size=self.dimensions,
                             epochs=self.epochs,
                             window=1,
                             min_count=self.min_count,
                             workers=self.workers,
                             seed=self.seed)
            embedding = np.array([model.wv[key] for key in node_keys])
            self._embedding.append(embedding)
        return np.concatenate(self._embedding, axis=1)

    def plot_embedding(self,node_pos,node_colors=None, di_graph=None, labels=None):
        """Plot an embedding, reducing it with t-SNE when it has more than 3 columns.

        Args:
            node_pos: 2-D array, one row per node.
            node_colors: optional colours forwarded to the plotting call.
            di_graph: optional graph; when given, the graph is drawn with the
                first two embedding coordinates as positions, keyed by row
                index. Otherwise a plain scatter plot of the first two
                coordinates is produced.
            labels: optional labels, used only when the whole graph is drawn.
        """
        node_num,embedding_dimension = node_pos.shape
        if(embedding_dimension > 3):
            print("Embedding dimension greater than 3, use tSNE to reduce it to 3")
            model = TSNE(n_components=3)
            node_pos = model.fit_transform(node_pos)

        if di_graph is None:
            plt.scatter(node_pos[:, 0], node_pos[:, 1], c=node_colors)
        else:
            # Only the first two coordinates are used, matching the scatter branch.
            pos = {i: node_pos[i, :2] for i in range(node_num)}
            if node_colors is not None:
                # draw_networkx_nodes draws nodes only; edge/label options such
                # as width, arrows, font_size and labels are not part of its
                # signature, so they are not passed here.
                nx.draw_networkx_nodes(di_graph, pos,
                                       node_color=node_colors,
                                       node_size=100,
                                       alpha=0.8)
            else:
                nx.draw_networkx(di_graph, pos, node_color=node_colors,
                                 width=0.1, node_size=300, arrows=False,
                                 alpha=0.8, font_size=12, labels=labels)

    def node_level_embedding(self,node,embed):
        """Chebyshev distances from one node's embedding to all the others.

        Args:
            node: positional index of the reference node in ``graph.nodes()``.
            embed: embedding matrix indexed by the same positions.

        Returns:
            ``(reference_node, distances, other_nodes)`` where ``distances[k]``
            is the distance between the reference node and ``other_nodes[k]``.
        """
        embed_node=embed[node]
        vals=list(self.graph.nodes())
        distances=[]
        questions=[]
        for i in range(self.graph.number_of_nodes()):
            if i!=node:
                distances.append(scipy.spatial.distance.chebyshev(embed_node,embed[i]))
                questions.append(vals[i])
        return vals[node],distances,questions
    
# Graph linking each of the first 500 question bodies to its category.
g=nx.from_pandas_edgelist(train_df[:500],source='question_body',target='category')
nx.draw(g)

walklet=Walklet(g,walk_number=10, walk_length=80, dimensions=32, workers=4,
                 window_size=4, epochs=1, learning_rate=0.05, min_count=1, seed=42)
walklet_embeddings=walklet.create_embeddings()
print(walklet_embeddings.shape)
walklet.plot_embedding(walklet_embeddings)

# Distances from one reference node (by position) to every other node.
node_num=24
node,distances,questions=walklet.node_level_embedding(node_num,walklet_embeddings)
walklet_df=pd.DataFrame({
    'Question': [node]*len(distances),
    'Sample_Question': questions,
    'Chebyshev_Distance': distances,
})
walklet_df.to_csv('../Walklet_Embeddings.csv',index=False)

# Build the distance graph from the Walklet results themselves; the previous
# version read `hope_df`, i.e. the frame of a different embedding section.
walklet_g=nx.from_pandas_edgelist(walklet_df,source='Question',target='Sample_Question',edge_attr='Chebyshev_Distance')
G=walklet_g
plotter(G,'Walklet Embeddings')

## Graph Level Embeddings

We have seen some of the popular node-based embeddings, which capture information related to neighbourhoods and degrees. In this section we look at one of the well-known algorithms for graph-level (subgraph) embeddings. It builds on the Weisfeiler-Lehman isomorphism test, which checks whether two graphs can be isomorphic by iteratively traversing the graphs and hashing the labels of the nodes. The features produced by this test are then used to create subgraph embeddings, in this case with the GL2Vec algorithm.

## Weisfeiler-Lehman Test

The WL Test produces for each graph a canonical form. If the canonical forms of two graphs are not equivalent, then the graphs are definitively not isomorphic. However, it is possible for two non-isomorphic graphs to share a canonical form, so this test alone cannot provide conclusive evidence that two graphs are isomorphic.

The Algorithm:

For iteration $i$ of the algorithm we will be assigning to each node $n$ a tuple $L_{i,n}$ containing the node’s old compressed label and a multiset of the node’s neighbors' compressed labels. A multiset is a set (a collection of elements where order is not important) where elements may appear multiple times.
At each iteration we will additionally be assigning to each node $n$ a new “compressed” label $C_{i,n}$ for that node’s set of labels. Any two nodes with the same $L_{i,n}$ will get the same compressed label.

### Finding the Correspondence Between Isomorphic Graphs

The core idea of the Weisfeiler-Lehman isomorphism test is to find for each node in each graph a signature based on the neighborhood around the node. These signatures can then be used to find the correspondence between nodes in the two graphs, which can be used to check for isomorphism.

In the algorithm described above, the “compressed labels” serve as the signatures. Since multiple nodes may have the same compressed label, there are multiple possible correspondences suggested by a Weisfeiler-Lehman labeling. The Weisfeiler-Lehman isomorphism test itself does not provide a way of narrowing down the possible correspondences further.

<img src="https://davidbieber.com/post/2019-05-10-weisfeiler-lehman-isomorphism-test/graph-isomorphism-000.png">

[Blog](https://davidbieber.com/post/2019-05-10-weisfeiler-lehman-isomorphism-test/)
[Journal](https://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf)

In [None]:
import hashlib
class Wesfeiler_Lehman_IsoMap():
    """Weisfeiler-Lehman relabelling of a graph, starting from node degrees.

    Every node starts with its degree as label. In each iteration the label of
    a node is replaced by the MD5 hash of its own label joined with the sorted
    labels of its neighbours. All labels a node receives (the initial degree
    plus one hash per iteration) are collected in ``extracted_features``.

    Args:
        graph: graph exposing ``nodes()``, ``neighbors(node)`` and ``degree(node)``.
        wl_iterations: number of relabelling iterations to run.
    """

    def __init__(self,graph,wl_iterations):
        self.wl_iterations = wl_iterations
        self.graph = graph
        self.features = {node: self.graph.degree(node) for node in self.graph.nodes()}
        # Seed the per-node history once, here. Resetting it inside every
        # iteration would discard all but the last two labels, and would leave
        # the attribute undefined when wl_iterations is 0.
        self.extracted_features = {node: [str(label)] for node, label in self.features.items()}
        self._do_recursions()

    def _do_a_recursion(self):
        """Run one relabelling step and return the new ``{node: label}`` mapping.

        The new label of every node is also appended to its history in
        ``extracted_features``.
        """
        new_features = {}
        for node in self.graph.nodes():
            neighbor_labels = sorted(str(self.features[nb]) for nb in self.graph.neighbors(node))
            # Own label first, then the neighbours' labels in sorted order so
            # that the signature does not depend on neighbour iteration order.
            signature = "_".join([str(self.features[node])] + neighbor_labels)
            # Hashing the compressed values
            new_features[node] = hashlib.md5(signature.encode()).hexdigest()
        self.extracted_features = {k: self.extracted_features[k] + [v] for k, v in new_features.items()}
        return new_features

    def _do_recursions(self):
        """Apply ``wl_iterations`` relabelling steps in sequence."""
        for _ in range(self.wl_iterations):
            self.features = self._do_a_recursion()

    def get_node_features(self):
        """Return ``{node: [label_0, label_1, ...]}`` with one label per iteration."""
        return self.extracted_features

    def get_graph_features(self):
        """Return the labels of all nodes flattened into a single list."""
        return [feature for node, features in self.extracted_features.items() for feature in features]
    
# Graph linking each of the first 500 question bodies to its category.
g = nx.from_pandas_edgelist(
    train_df[:500],
    source='question_body',
    target='category',
)
nx.draw(g)

# Run 20 Weisfeiler-Lehman relabelling iterations, then read the per-node
# label lists and the flattened graph-level label list.
wl_isomap = Wesfeiler_Lehman_IsoMap(graph=g, wl_iterations=20)
node_features = wl_isomap.get_node_features()
graph_features = wl_isomap.get_graph_features()

## GL2Vec

The GL2Vec algorithm creates the line graph of each graph in the graph dataset.
The procedure creates Weisfeiler-Lehman tree features for nodes in graphs. 
Using these features, a document (graph) - feature co-occurrence matrix is decomposed in order to generate representations for the graphs.
The procedure assumes that nodes have no string feature present and the WL-hashing defaults to the degree centrality. 

[Paper](https://link.springer.com/chapter/10.1007/978-3-030-36718-3_1)

<img src="https://media.springernature.com/original/springer-static/image/chp%3A10.1007%2F978-3-030-36718-3_1/MediaObjects/493010_1_En_1_Fig1_HTML.png">

This implementation uses a document-vectorization technique (Doc2Vec) to create a higher-order representation in the form of document vectors. The documents are created by performing the WL test on the nodes of each graph, so that every graph becomes one document. The PMI matrix at a subgraph level is then used to represent the embedding space.

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
class GL2Vec():
    """GL2Vec-style whole-graph embedding.

    Every input graph is converted to its line graph, described by
    Weisfeiler-Lehman features, and embedded as one Doc2Vec document.

    Args:
        graph: reference graph stored on the instance; used by
            ``node_level_embedding``.
        wl_iterations: number of Weisfeiler-Lehman iterations per graph.
        dimensions: size of the Doc2Vec document vectors.
        workers: number of Doc2Vec worker threads.
        down_sampling: Doc2Vec ``sample`` threshold.
        epochs: number of Doc2Vec training epochs.
        learning_rate: initial Doc2Vec learning rate (``alpha``).
        min_count: minimum feature frequency kept in the vocabulary.
        seed: seed passed to Doc2Vec.
    """

    def __init__(self,graph, wl_iterations=2, dimensions=128, workers=4, down_sampling=0.0001,
                 epochs=10, learning_rate=0.025, min_count=5, seed=42):

        self.wl_iterations = wl_iterations
        self.dimensions = dimensions
        self.workers = workers
        self.down_sampling = down_sampling
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.min_count = min_count
        self.seed = seed
        self.graph=graph
        self._embedding=[]

    def _create_line_graph(self, graph):
        """Return the line graph of ``graph`` with nodes relabelled ``0 .. n-1``.

        The line graph is computed from the ``graph`` argument (not from
        ``self.graph``), and the integer relabelling is derived from the line
        graph's own nodes and edges.
        """
        line = nx.line_graph(graph)
        node_mapper = {node: i for i, node in enumerate(line.nodes())}
        edges = [[node_mapper[edge[0]], node_mapper[edge[1]]] for edge in line.edges()]
        return nx.from_edgelist(edges)

    def create_embeddings(self, graphs):
        """Embed every graph in ``graphs`` as one Doc2Vec document vector.

        Returns:
            ``self._embedding``: a list to which an array with one row per
            input graph and ``dimensions`` columns is appended on every call
            (the result of the first call is element ``0``).
        """
        line_graphs = [self._create_line_graph(graph) for graph in graphs]
        wl_docs = [Wesfeiler_Lehman_IsoMap(graph, self.wl_iterations) for graph in line_graphs]
        docs = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(wl_docs)]

        # Passing the documents to the constructor builds the vocabulary and
        # trains the model, so no second build_vocab()/train() pass is needed;
        # the configured number of epochs is used instead of a hard-coded one.
        model = Doc2Vec(docs,
                        vector_size=self.dimensions,
                        window=0,
                        min_count=self.min_count,
                        dm=0,
                        sample=self.down_sampling,
                        workers=self.workers,
                        epochs=self.epochs,
                        alpha=self.learning_rate,
                        seed=self.seed)
        print('complete')

        # gensim 4 exposes document vectors as `dv`; fall back to the older
        # `docvecs` name when `dv` is not available.
        doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
        embedding=np.array([doc_vectors[str(i)] for i in range(len(docs))])
        self._embedding.append(embedding)
        return self._embedding

    def plot_embedding(self,node_pos,node_colors=None, di_graph=None, labels=None):
        """Plot an embedding, reducing it with t-SNE when it has more than 3 columns.

        Args:
            node_pos: 2-D array, one row per embedded item.
            node_colors: optional colours forwarded to the plotting call.
            di_graph: optional graph; when given, the graph is drawn with the
                first two embedding coordinates as positions, keyed by row
                index. Otherwise a plain scatter plot of the first two
                coordinates is produced.
            labels: optional labels, used only when the whole graph is drawn.
        """
        node_num,embedding_dimension = node_pos.shape
        if(embedding_dimension > 3):
            print("Embedding dimension greater than 3, use tSNE to reduce it to 3")
            model = TSNE(n_components=3)
            node_pos = model.fit_transform(node_pos)

        if di_graph is None:
            plt.scatter(node_pos[:, 0], node_pos[:, 1], c=node_colors)
        else:
            # Only the first two coordinates are used, matching the scatter branch.
            pos = {i: node_pos[i, :2] for i in range(node_num)}
            if node_colors is not None:
                # draw_networkx_nodes draws nodes only; edge/label options such
                # as width, arrows, font_size and labels are not part of its
                # signature, so they are not passed here.
                nx.draw_networkx_nodes(di_graph, pos,
                                       node_color=node_colors,
                                       node_size=100,
                                       alpha=0.8)
            else:
                nx.draw_networkx(di_graph, pos, node_color=node_colors,
                                 width=0.1, node_size=300, arrows=False,
                                 alpha=0.8, font_size=12, labels=labels)

    def node_level_embedding(self,node,embed):
        """Chebyshev distances from row ``node`` of ``embed`` to the other rows.

        Rows are matched by position with ``self.graph.nodes()``.
        NOTE(review): ``create_embeddings`` yields one row per input graph, not
        per node -- confirm that ``embed`` is node-aligned before using this.

        Returns:
            ``(reference_node, distances, other_nodes)``.
        """
        embed_node=embed[node]
        vals=list(self.graph.nodes())
        distances=[]
        questions=[]
        for i in range(self.graph.number_of_nodes()):
            if i!=node:
                distances.append(scipy.spatial.distance.chebyshev(embed_node,embed[i]))
                questions.append(vals[i])
        return vals[node],distances,questions
        

# Graph linking each of the first 500 question bodies to its category.
g = nx.from_pandas_edgelist(
    train_df[:500],
    source='question_body',
    target='category',
)
nx.draw(g)

# GL2Vec embeds whole graphs, so it is fed a list of graphs
# (here the same graph repeated 20 times).
list_graph = [g] * 20
gl2vec = GL2Vec(
    g,
    wl_iterations=2,
    dimensions=128,
    workers=4,
    down_sampling=0.0001,
    epochs=10,
    learning_rate=0.025,
    min_count=5,
    seed=42,
)
gl2vec_embeddings = gl2vec.create_embeddings(list_graph)
print(gl2vec_embeddings)
gl2vec.plot_embedding(gl2vec_embeddings[0])

# Disabled node-level distance analysis, kept for reference.
# node_num=24
# node,distances,questions=gl2vec.node_level_embedding(node_num,gl2vec_embeddings)
# gl2vec_df=pd.DataFrame(columns=['Question','Sample_Question','Chebyshev_Distance'])
# gl2vec_df['Question']=[node]*len(distances)
# gl2vec_df['Sample_Question']=questions
# gl2vec_df['Chebyshev_Distance']=distances
# gl2vec_df.head()
# gl2vec_df.to_csv('../Gl2Vec_Embeddings.csv',index=False)
# gl2vec_g=nx.from_pandas_edgelist(gl2vec_df,source='Question',target='Sample_Question',edge_attr='Chebyshev_Distance')
# G=gl2vec_g
# plotter(G,'Gl2Vec Embeddings')