In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
from scipy.sparse import csgraph

In [2]:
def cluster_size(path=None):
    """
    Return the number of clusters requested in the header line of the graph file

    The first line of the file is split on single spaces and its fifth
    field (index 4) is converted to an integer.

    Args:
        path(str, optional)  : File to inspect. When omitted, the module-level
                               `file_location` variable is used.

    Returns:
        size_of_cluster(int) : Size of the cluster

    Raises:
        IndexError : The header line has fewer than five fields
        ValueError : The fifth header field is not an integer
    """

    if path is None:
        path = file_location

    # Only the header is needed, so read a single line instead of parsing the
    # whole edge list. Reading the raw tokens also avoids pandas renaming
    # repeated header tokens (a second "2" becomes "2.1"), which made int()
    # fail whenever the cluster count matched an earlier header field.
    with open(path, encoding="utf-8") as handle:
        header_fields = handle.readline().rstrip("\r\n").split(" ")

    size_of_cluster = int(header_fields[4])

    return size_of_cluster

In [3]:
def read_data(path=None):
    """
    Read the space-separated edge list into memory

    The first line of the file is consumed as the header row, so it never
    appears as an edge. Only the first two fields of every following line
    are kept; any further header-induced columns are discarded.

    Args:
        path(str, optional) : File to read. When omitted, the module-level
                              `file_location` variable is used.

    Returns:
        data(pd.DataFrame) : The dataset which contains edge information between nodes,
                             with the columns `Node_1` and `Node_2`
    """
    if path is None:
        path = file_location

    raw = pd.read_csv(path, sep=" ")
    # Select the two node columns by position rather than renaming and dropping
    # exactly five columns, so the header may carry any number (>= 2) of fields.
    data = raw.iloc[:, :2].copy()
    data.columns = ['Node_1', 'Node_2']
    return data

In [4]:
def extract_nodes(node_number, data):
    """
    Collect the distinct node IDs that occur in one column of the edge list

    Args:
        node_number(str)   : Name of the node column to scan
        data(pd.DataFrame) : The dataset which contains edge information between nodes

    Returns:
        unique_set(set)    : Every distinct value found in that column
    """

    distinct_values = data[node_number].unique()
    return {node for node in distinct_values}

In [5]:
def merging_nodes_complete_square(nodeA, nodeB, nodeAdd, nodeBlank):

    """
    Identify nodes present in one column and absent in the other column

    Args:
        nodeA(set)     : The set of node column which should contain nodes absent in the other node column
        nodeB(set)     : The set of node column which has nodes absent when compared to nodeA
        nodeAdd(str)   : The name of column where the set difference of nodeA and nodeB is stored
        nodeBlank(str) : The name of column which should remain blank to indicate that the node was absent in nodeB

    Returns:
        df(pd.DataFrame)   : One row per node of nodeA that is missing from nodeB. The node is
                             stored in `nodeAdd` and `nodeBlank` holds an empty string. The
                             frame has both columns and zero rows when nothing is missing.
    """

    missing_nodes = list(nodeA.difference(nodeB))
    # Naming the column at construction time keeps the frame well-formed when
    # there are no missing nodes; assigning `columns` to a column-less empty
    # frame raises a length-mismatch ValueError.
    df = pd.DataFrame({nodeAdd: missing_nodes})
    df[nodeBlank] = ""
    return df

In [6]:
def preprocess_adjacency_matrix():

    """
    Pad the edge list so that both node columns mention the same set of nodes

    Every node that occurs in only one of the two columns is appended as an
    extra row in the other column, paired with an empty-string placeholder,
    so that a cross-tabulation of the two columns lists every node on both axes.

    Args:

    Returns:
        data(pd.DataFrame)   : Preprocessed data which will ensure creation of square adjacency matrix
    """

    edges = read_data()
    first_column_nodes = extract_nodes("Node_1", edges)
    second_column_nodes = extract_nodes("Node_2", edges)

    # Nodes seen only as a target: add them to Node_1 with a blank Node_2.
    only_in_second = merging_nodes_complete_square(
        second_column_nodes, first_column_nodes, "Node_1", "Node_2"
    )
    # Nodes seen only as a source: add them to Node_2 with a blank Node_1.
    only_in_first = merging_nodes_complete_square(
        first_column_nodes, second_column_nodes, "Node_2", "Node_1"
    )

    # Original edges first, then the two groups of padding rows in this order.
    padded_frames = [edges, only_in_second, only_in_first]
    return pd.concat(padded_frames, ignore_index=True, sort=False)

In [7]:
def create_adjacency_matrix(data=None):

    """
    Create adjacency matrix

    Cross-tabulates `Node_1` against `Node_2`, so each cell counts how often
    that (row node, column node) pair occurs in the edge list, then removes
    the empty-string placeholder row/column added during preprocessing.

    Args:
        data(pd.DataFrame, optional) : Edge list with `Node_1` and `Node_2` columns.
                                       When omitted, the result of
                                       `preprocess_adjacency_matrix()` is used.

    Returns:
        matrix(pd.DataFrame)   : Adjacency matrix
    """

    if data is None:
        data = preprocess_adjacency_matrix()

    matrix = pd.crosstab(data.Node_1, data.Node_2)
    # The '' placeholder exists on an axis only when some node was missing from
    # the opposite column; ignore its absence instead of raising KeyError.
    matrix = matrix.drop('', axis=0, errors='ignore')
    matrix = matrix.drop('', axis=1, errors='ignore')
    return matrix

In [8]:
def own_implementation(adjacency=None):

    """
    Self-implemented method to generate eigenvalues and eigenvectors

    Builds the Laplacian L = D - A, where D is the diagonal matrix of row
    sums of the adjacency matrix A, and returns its eigen decomposition
    ordered by ascending eigenvalue.

    Args:
        adjacency(array-like, optional) : Square adjacency matrix. When omitted,
                                          the module-level `matrix` variable is used.

    Returns:
        vals(numpy.ndarray) : Eigenvalues, sorted in ascending order
        vecs(numpy.ndarray) : Eigenvectors; column i belongs to vals[i]
    """

    if adjacency is None:
        adjacency = matrix
    adjacency = np.asarray(adjacency)

    degree = np.diag(adjacency.sum(axis=1))
    laplacian = degree - adjacency
    vals, vecs = np.linalg.eig(laplacian)

    # Compute the ordering once and apply it to both outputs so eigenvalues
    # and eigenvector columns stay paired.
    order = np.argsort(vals)
    return vals[order], vecs[:, order]

In [9]:
def pre_implementation(adjacency=None):

    """
    Scipy-implemented method to generate eigenvalues and eigenvectors

    Uses `scipy.sparse.csgraph.laplacian` to build the graph Laplacian and
    returns its eigen decomposition ordered by ascending eigenvalue.

    Args:
        adjacency(array-like, optional) : Square adjacency matrix. When omitted,
                                          the module-level `matrix` variable is used.

    Returns:
        vals(numpy.ndarray) : Eigenvalues, sorted in ascending order
        vecs(numpy.ndarray) : Eigenvectors; column i belongs to vals[i]
    """

    if adjacency is None:
        adjacency = matrix
    # Hand SciPy a plain ndarray rather than a DataFrame.
    # NOTE(review): some SciPy releases appear to read `.dtype` from dense input,
    # which a DataFrame does not provide - confirm against the installed version.
    adjacency = np.asarray(adjacency)

    # NOTE(review): csgraph.laplacian is called with its default degree setting,
    # while own_implementation sums rows; the two presumably only agree for a
    # symmetric adjacency matrix - verify for directed edge lists.
    laplacian_graph = csgraph.laplacian(adjacency)
    vals, vecs = np.linalg.eig(laplacian_graph)

    # Compute the ordering once and apply it to both outputs so eigenvalues
    # and eigenvector columns stay paired.
    order = np.argsort(vals)
    return vals[order], vecs[:, order]

In [10]:
def run_kmeans(implementation_type, matrix, random_state=None):

    """
    Run K-means implementation and match-up the cluster ID with the node number

    The eigenvectors returned by `implementation_type` are expected to be
    ordered by ascending eigenvalue; only the first k of them (k being the
    value returned by `cluster_size()`) form the embedding that is clustered.
    The number of nodes per cluster is printed.

    Args:
        implementation_type(function) : The function to use in order to perform eigen decomposition.
                                        It is called without arguments.
        matrix(pd.DataFrame)          : Adjacency matrix; its index supplies the node labels
        random_state(int, optional)   : Seed forwarded to KMeans for reproducible
                                        cluster assignments. Defaults to None.

    Returns:
        result(pd.DataFrame)          : Dataframe consisting of mapping between cluster ID and node number
    """

    # NOTE(review): the decomposition function receives no arguments, so `matrix`
    # is used here only for the node labels - make sure it is the same matrix
    # the decomposition was computed from.
    _, vecs = implementation_type()
    n_clusters = cluster_size()

    # Cluster on the k leading eigenvectors only. Fitting on every eigenvector
    # makes the preceding sort irrelevant and discards the low-dimensional
    # spectral embedding the method relies on.
    embedding = vecs[:, :n_clusters]

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(embedding)

    result = pd.DataFrame({'Cluster_ID': kmeans.labels_})
    result['Node'] = matrix.index

    # Report cluster sizes in ascending cluster-ID order.
    cluster_counts = result['Cluster_ID'].value_counts().sort_index()
    for clusterid, count in cluster_counts.items():
        print("Class-" + str(clusterid) + "  Count : " + str(count))
    return result

In [11]:
#file_location = 'C:/Users/Swati Choudhary/Studies/AMDM/graphs_processed/ca-GrQc.txt'
#matrix = create_adjacency_matrix()

In [12]:
#result = run_kmeans(own_implementation, matrix)
#del result

In [13]:
#result = run_kmeans(pre_implementation, matrix)
#del result

In [14]:
#del matrix
#file_location = 'C:/Users/Swati Choudhary/Studies/AMDM/graphs_processed/Oregon-1.txt'
#matrix = create_adjacency_matrix()

In [15]:
#result = run_kmeans(own_implementation, matrix)
#del result

In [16]:
#result = run_kmeans(pre_implementation, matrix)
#del result

In [17]:
#del matrix
#file_location = 'C:/Users/Swati Choudhary/Studies/AMDM/graphs_processed/roadNet-CA.txt'
#matrix = create_adjacency_matrix()

In [18]:
#result = run_kmeans(own_implementation, matrix)
#del result

In [19]:
#result = run_kmeans(pre_implementation, matrix)
#del result

In [20]:
#del matrix
# Point the helper functions at the soc-Epinions1 edge list; they read the
# module-level `file_location` rather than receiving the path from this cell.
file_location = 'C:/Users/Swati Choudhary/Studies/AMDM/graphs_processed/soc-Epinions1.txt'
# NOTE(review): the saved output of this cell is an IndexError, so `matrix` was
# not rebuilt in the recorded run - presumably the dense node-by-node table is
# too large for this graph; TODO confirm.
matrix = create_adjacency_matrix()

IndexError: index 1462814639 is out of bounds for axis 0 with size 1462807104

In [None]:
# Cluster with the hand-built Laplacian. `result` is deleted right away, so the
# per-cluster counts printed by run_kmeans are the only thing kept from this cell.
result = run_kmeans(own_implementation, matrix)
del result

In [None]:
# Cluster with the SciPy-built Laplacian. `result` is deleted right away, so the
# per-cluster counts printed by run_kmeans are the only thing kept from this cell.
result = run_kmeans(pre_implementation, matrix)
del result

In [None]:
# Drop the previous adjacency matrix before building the next one.
del matrix
# Switch the helper functions to the web-NotreDame edge list and rebuild the
# adjacency matrix from it.
file_location = 'C:/Users/Swati Choudhary/Studies/AMDM/graphs_processed/web-NotreDame.txt'
matrix = create_adjacency_matrix()

In [None]:
# Cluster with the hand-built Laplacian. `result` is deleted right away, so the
# per-cluster counts printed by run_kmeans are the only thing kept from this cell.
result = run_kmeans(own_implementation, matrix)
del result