### Analysis of the DataSet

In [52]:
import pandas as pd
import numpy as np
import networkx as nx
import csv

### Preprocessing

In [53]:
def getSize(fileName):
    dataFrame = pd.read_csv(fileName,delimiter=',')
    traderList = list(dataFrame['Seller'])
    traderList.extend(list(dataFrame['Buyer']))
    traderSet = set(traderList)
    traderSize = len(traderSet)
    return traderSize

### Undirected Graph generation

In [54]:
# total transaction amount between them is the weight
def getUndirectedGraph(fileName):
#     K represents k nearest neighbors
    dataFrame = pd.read_csv(fileName, delimiter =',')
    nodeSize = getSize(fileName)
    myGraph = nx.Graph()
#     Added nodes to the Graph
    myGraph.add_nodes_from(list(range(1, nodeSize+1)))
#    Adding edges with weights to the Graph
    buyer = []
    seller = []
    trader = []
    for ind in dataFrame.index:
        u = dataFrame['Seller'][ind]
        v = dataFrame['Buyer'][ind]
        seller.append(u)
        buyer.append(v)
    
    vSeller = np.zeros(nodeSize+1)
    vBuyer = np.zeros(nodeSize+1)

    for x in seller:
        vSeller[x] =1 
    for x in buyer:
        vBuyer[x] = 1 

    for ind in dataFrame.index:
        u = dataFrame['Seller'][ind]
        v = dataFrame['Buyer'][ind]
        w = dataFrame['Amt'][ind]
        if (vSeller[u]==1) and (vBuyer[u]==1) and (vSeller[v]==1) and (vBuyer[v]==1) and(u!=v):
            trader.append(u)
            trader.append(v)
            if u in list(myGraph.adj[v]):
                myGraph[u][v]['weight']+=w
            else :
                myGraph.add_edge(u, v, weight = w)
    return (myGraph,list(set(trader)))

(undirectedGraph,nodeSet) = getUndirectedGraph('dataset.csv')

### Directed Graph generation

In [97]:
def getDirectedGraph(fileName):
    dataFrame = pd.read_csv(fileName, delimiter =',')
    nodeSize = getSize(fileName)
    myGraph = nx.DiGraph()
#     Added nodes to the Graph
    myGraph.add_nodes_from(list(range(1, nodeSize+1)))
#    Adding edges with weights to the Graph 
    for ind in dataFrame.index:
        u = dataFrame['Seller'][ind]
        v = dataFrame['Buyer'][ind]
        w = dataFrame['Amt'][ind]
        if u in list(myGraph.adj[v]):
            myGraph[v][u]['weight']+=w
        else :
            myGraph.add_edge(v, u, weight = w)
    return myGraph

DirectedGraph = getDirectedGraph('dataset.csv')

### Shared nearest neighbour Clustering Algorithm

### K-Nearest Neighbour

In [56]:
%%time
def kNear(graph, K):
    Dict = {}
    for node in nodeSet:
        neighbors = np.array(sorted(graph[node].items(), key=lambda e: e[1]["weight"], reverse=True))
        if(neighbors.shape[0]<K):
            neighborList = [x[0] for x in neighbors]
        else : 
            neighbors = neighbors[:K]
            neighborList = [x[0] for x in neighbors]
        Dict[node]=neighborList
    return Dict

knnGraph = kNear(undirectedGraph, 6)

CPU times: user 202 ms, sys: 5.59 ms, total: 208 ms
Wall time: 216 ms


### Union find algorithm

In [136]:
nodeSize = getSize('dataset.csv')
clusterId = np.array(list(range(nodeSize+1)))
clusterSize = np.ones(nodeSize, dtype = int)
def parent(u):
    while clusterId[u]!=u:
        u = clusterId[u]
    return u
def common(u, v):
    neighborU = knnGraph[u]
    neighborV = knnGraph[v]
    return len(list(set(neighborU)&set(neighborV)))
def union(u, v):
    x = parent(u)
    y = parent(v)
    global clusterId
    global clusterSize
    if clusterSize[x]>clusterSize[y]:
        clusterId[y] = x
        clusterSize[x]+=clusterSize[y]
    else :
        clusterId[x] = y 
        clusterSize[y]+=clusterSize[x]

### Shared Nearest Neighbor Algorithm

In [137]:
%%time
kt = 2
for u in nodeSet:
    for v in nodeSet:
        x = parent(u)
        y = parent(v)
        if x!= y :
            if (u in knnGraph[v]) and (v in knnGraph[u]):
                if common(u,v)>=kt:
                    union(u,v)
# union find
# set - > all nodes, each node as it's own cluster
# for every (u,v) in G : 
# if v not in C(u) && if k(u) && k(v)>= kt && u->knn(V) && v->knn(u):
# remove c(u), c(v) 
# add c(u)union c(v) to set S

CPU times: user 1min 4s, sys: 243 ms, total: 1min 4s
Wall time: 1min 5s


### Final dictionary of Shared nearest neighbor

In [138]:
SNNClust = {}
vis = np.zeros(nodeSize+1, dtype = int)
for node in range(1, nodeSize+1):
    currId = parent(node)
    if vis[currId]==0:
        SNNClust[currId]=[node]
    else:
        SNNClust[currId].append(node)
    vis[currId] = 1
print(len(SNNClust))

32371


In [140]:
Dict = {}
for x in SNNClust:
    if len(SNNClust[x])>2:
        Dict[x]=SNNClust[x]

In [141]:
w = csv.writer(open("Shared_Nearest_Neighbor.csv", "w"))
for key, val in Dict.items():
    w.writerow([key, val])

### Mutual Nearest Neighbour

In [152]:
knnGraph = kNear(DirectedGraph, 6)
DictofClusters = {}
for i in nodeSet:
    DictofClusters[i] = [i]

In [153]:
def mnvForNodes(u, v):
    listU = knnGraph[u]
    listV = knnGraph[v]
    rankOfVinU = 100
    rankOfUinV = 100
    
    for i in range(len(listU)):
        if listU[i]==v:
            rankOfVinU = i+1
            break
            
    for i in range(len(listV)):
        if listV[i]==u:
            rankOfUinV = i+1
            break

    return rankOfUinV+rankOfVinU

In [154]:
def mnvForClusters(c1, c2):
    cluster1 = DictofClusters[c1]
    cluster2 = DictofClusters[c2]
    s = 0
    for u in cluster1:
        for v in cluster2:
            val = mnvForNodes(u, v)
            s = s + val
    length = len(cluster1) * len(cluster2)
    return s/length

In [159]:
def mutual_NN_neighbour(numofClusters, maxMNV):
    global DictofClusters
    mnvSet = set([])
#     for all pairs of clusters, find the mnv and insert into the set
    for c1 in nodeSet:
        for c2 in nodeSet:
            if c1!=c2:
                currVal = mnvForClusters(c1, c2)
                mnvSet.add((currVal, c1, c2))
    totalClusters = len(DictofClusters)
    print("All pair combination calculations done")
#     keep iterating while you didn't get required number of clusters
    cnt = 0 
    while totalClusters>numofClusters:
        minCluster = min(mnvSet)
        minU = minCluster[1]
        minV = minCluster[2]
        minNodes = [minU, minV]
#         remove all pairs which are either of minU,minV
        for c1 in DictofClusters:
            for node in minNodes:
                val = mnvForClusters(c1, node)
                pair1 = (val, c1, node)
                pair2 = (val, node, c1)
                if pair1 in mnvSet:
                    mnvSet.remove(pair1)
                if pair2 in mnvSet:
                    mnvSet.remove(pair2)
#       remove two clusters and combine them into one
        for v in DictofClusters[minV]:
            DictofClusters[minU].append(v)
        del DictofClusters[minV]
#         insert all the mnvs of the new clusters with the clusters existing
        for c1 in DictofClusters:
            if minU!=c1:
                currVal = mnvForClusters(c1, minU)
                mnvSet.add((currVal, c1, minU))
        print("Iteration ", cnt+1, "completed")
        cnt+=1
        totalClusters -=1
#         remove all the elements computing with c1, c2
# do the manipulations in the dictionary
# calculate the clustered one with every other cluster
# and push it into the set

In [160]:
mutual_NN_neighbour(len(nodeSet)-10, 100)

All pair combination calculations done
Iteration  1 completed
Iteration  2 completed
Iteration  3 completed
Iteration  4 completed
Iteration  5 completed
Iteration  6 completed
Iteration  7 completed
Iteration  8 completed
Iteration  9 completed
Iteration  10 completed


In [161]:
Dict = {}
for x in DictofClusters:
    if len(DictofClusters[x])>1:
        print('hi')
        Dict[x] = DictofClusters[x]
w = csv.writer(open("Mutual_Nearest_Neighbor.csv", "w"))
for key, val in Dict.items():
    w.writerow([key, val])

In [146]:
mySet = {(1,2,3),(3,4,5),(0,5,6),(0,1,2),(0,3,4)}
mySet.add((-1,2,3))
x = min(mySet)
print(x)

(-1, 2, 3)


In [149]:
mySet = {}
mySet.add((1,2,3))

AttributeError: 'dict' object has no attribute 'add'