### Analysis of the DataSet

In [1]:
import pandas as pd
import numpy as np
import networkx as nx

### Preprocessing

In [2]:
def getSize(fileName):
    dataFrame = pd.read_csv(fileName,delimiter=',')
    traderList = list(dataFrame['Seller'])
    traderList.extend(list(dataFrame['Buyer']))
    traderSet = set(traderList)
    traderSize = len(traderSet)
    return traderSize

### Undirected Graph generation

In [5]:
# total transaction amount between them is the weight
def getUndirectedGraph(fileName):
#     K represents k nearest neighbors
    dataFrame = pd.read_csv(fileName, delimiter =',')
    nodeSize = getSize(fileName)
    myGraph = nx.Graph()
#     Added nodes to the Graph
    myGraph.add_nodes_from(list(range(1, nodeSize+1)))
#    Adding edges with weights to the Graph
    buyer = []
    seller = []
    trader = []
    for ind in dataFrame.index:
        u = dataFrame['Seller'][ind]
        v = dataFrame['Buyer'][ind]
        seller.append(u)
        buyer.append(v)
    
    vSeller = np.zeros(nodeSize+1)
    vBuyer = np.zeros(nodeSize+1)

    for x in seller:
        vSeller[x] =1 
    for x in buyer:
        vBuyer[x] = 1 

    for ind in dataFrame.index:
        u = dataFrame['Seller'][ind]
        v = dataFrame['Buyer'][ind]
        w = dataFrame['Amt'][ind]
        if (vSeller[u]==1) and (vBuyer[u]==1) and (vSeller[v]==1) and (vBuyer[v]==1) and(u!=v):
            trader.append(u)
            trader.append(v)
            if u in list(myGraph.adj[v]):
                myGraph[u][v]['weight']+=w
            else :
                myGraph.add_edge(u, v, weight = w)
    return (myGraph,list(set(trader)))

(undirectedGraph,nodeSet) = getUndirectedGraph('dataset.csv')

### Directed Graph generation

In [0]:
def getDirectedGraph(fileName):
    dataFrame = pd.read_csv(fileName, delimiter =',')
    nodeSize = getSize(fileName)
    myGraph = nx.DiGraph()
#     Added nodes to the Graph
    myGraph.add_nodes_from(list(range(1, nodeSize+1)))
#    Adding edges with weights to the Graph
    cnt = 0 
    for ind in dataFrame.index:
        cnt = (cnt+1)
        u = dataFrame['Seller'][ind]
        v = dataFrame['Buyer'][ind]
        w = dataFrame['Amt'][ind]
        if u in list(myGraph.adj[v]):
            myGraph[v][u]['weight']+=w
        else :
            myGraph.add_edge(u, v, weight = w)
        if(cnt == 1000): break
    return myGraph

DirectedGraph = getDirectedGraph('dataset.csv')

### Shared nearest neighbour Clustering Algorithm

### K-Nearest Neighbour

In [128]:
%%time
def kNear(graph, K):
    Dict = {}
    for node in nodeSet:
        neighbors = np.array(sorted(graph[node].items(), key=lambda e: e[1]["weight"], reverse=True))
        if(neighbors.shape[0]<K):
            neighborList = [x[0] for x in neighbors]
        else : 
            neighbors = neighbors[:K]
            neighborList = [x[0] for x in neighbors]
        Dict[node]=neighborList
    return Dict

knnGraph = kNear(DirectedGraph, 10)

CPU times: user 184 ms, sys: 3.01 ms, total: 187 ms
Wall time: 182 ms


### Union find algorithm

In [0]:
nodeSize = getSize('dataset.csv')
clusterId = np.array(list(range(nodeSize+1)))
clusterSize = np.ones(nodeSize, dtype = int)
def parent(u):
    while clusterId[u]!=u:
        u = clusterId[u]
    return u
def common(u, v):
    neighborU = knnGraph[u]
    neighborV = knnGraph[v]
    cnt = 0 
    for x in neighborU:
        if x in neighborV:
            cnt = cnt+1
    return cnt
def union(u, v):
    x = parent(u)
    y = parent(v)
    global clusterId
    global clusterSize
    if clusterSize[x]>clusterSize[y]:
        clusterId[y] = x
        clusterSize[x]+=clusterSize[y]
    else :
        clusterId[x] = y 
        clusterSize[y]+=clusterSize[x]

### Shared Nearest Neighbor Algorithm

In [130]:
%%time
kt = 2
for u in nodeSet:
    for v in nodeSet:
        x = parent(u)
        y = parent(v)
        if x!= y :
            if u in knnGraph[u] and v in knnGraph[u]:
                if common(u,v)>=kt:
                    union(u,v)
# union find
# set - > all nodes, each node as it's own cluster
# for every (u,v) in G : 
# if v not in C(u) && if k(u) && k(v)>= kt && u->knn(V) && v->knn(u):
# remove c(u), c(v) 
# add c(u)union c(v) to set S

CPU times: user 41.9 s, sys: 7.89 ms, total: 41.9 s
Wall time: 42 s


### Final dictionary of Shared nearest neighbor

In [0]:
SNNClust = {}
vis = np.zeros(nodeSize+1, dtype = int)
for node in range(1, nodeSize+1):
    currId = parent(node)
    if vis[currId]==0:
         SNNClust[currId]=[node]
    else : 
        SNNClust[currId].append(node)
    vis[currId] = 1

In [137]:
Dict = {}
for x in SNNClust:
  if len(SNNClust[x])>1:
    Dict[x]=SNNClust[x]
print(len(Dict))

24


In [0]:
import csv

w = csv.writer(open("output.csv", "w"))
for key, val in Dict.items():
    w.writerow([key, val])

# import csv

# with open('output.csv', 'wb') as myfile:
#     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#     wr.writerow(Dicto)

In [165]:
clusterList = [727, 9099, 13838, 14917]
for u in clusterList:
  for v in clusterList:
    if u!=v:
      if u in undirectedGraph.adj[v]:
        print(u, v)

727 9099
727 13838
727 14917
9099 727
9099 13838
13838 727
13838 9099
13838 14917
14917 727
14917 13838


### Playing with NetworkX

In [0]:
import networkx as nx

In [0]:
# Creating a graph
G = nx.Graph()

In [0]:
# Add nodes in the graph
G.add_node(1)
G.add_nodes_from([2,3])

In [0]:
# Add edges in the graph with weight
G.add_edge(1, 2, weight=4.7 )
G.add_edge(1,3, weight = 2.3)

In [0]:
# Accessing and modifying weight of a graph
G[1][2]['weight']+=2

In [0]:
# Getting number of nodes and edges in the graph
print(G.number_of_nodes())
print(G.number_of_edges())

In [0]:
# Node list and Edge List
# Neighbor List
nodeList = list(G.nodes)
edgeList = list(G.edges)
n1List = list(G.adj[1])
n2List = list(G.adj[2])
n3List = list(G.adj[4])
print(nodeList)
print(edgeList)
print(n1List)
print(n2List)
print(n3List)

In [0]:
# Remove nodes and edges
G.remove_edge(1,2)

In [0]:
# Creation of diGraph
# edges will be doubled
H = nx.DiGraph()
H.add_node(4)
H.add_edge(1,2, weight=2.3)
H.add_edge(1,2, weight = 3.2)
H.add_edge(2, 1, weight=3.2)
H.add_edge(1, 3, weight = 4.5)
print(list(H.adj[4]))
for (u, v, wt) in H.edges.data('weight'):
    print('(%d, %d, %.3f)' % (u, v, wt))
print(list(H.edges))
S = H.to_undirected()
for (u, v, wt) in S.edges.data('weight'):
    print('(%d, %d, %.3f)' % (u, v, wt))
print(list(S.edges))

[]
(1, 2, 3.200)
(1, 3, 4.500)
(2, 1, 3.200)
[(1, 2), (1, 3), (2, 1)]
(1, 2, 3.200)
(1, 3, 4.500)
[(1, 2), (1, 3)]


In [0]:
# creationg of normal Graph
# edges weight will be added
S = nx.Graph()
S.add_edge(1,2, weight=2.3)
S.add_edge(1,4, weight = 2.4)
S.add_edge(2, 1, weight=3.2)
S.add_edge(1, 3, weight = 2.0)
S.add_edge(2,3,weight=2.8)
S[1][2]['weight']+=2.0
S[2][1]['weight']+=10.0
print(S[1][2]['weight'])
foo = np.array((sorted(S[1].items(), key=lambda e: e[1]["weight"], reverse=True)))[:K]
print(foo.shape)
print(foo)
for (x, y) in foo:
    print(x)

15.2
(3, 2)
[[2 {'weight': 15.2}]
 [4 {'weight': 2.4}]
 [3 {'weight': 2.0}]]
2
4
3


In [0]:
# Accessing edges and neighbors
print(G[1][3]['weight'])
print(G[1])