In [1]:
import os
from collections import defaultdict
import random
import sys
from munkres import Munkres
import numpy
import networkx as nx

In [17]:
from networkx.algorithms import community
import glob

In [18]:
from networkx.algorithms.community import greedy_modularity_communities

In [19]:
circles_files = glob.glob("twitter/*.circles")
edges_files = glob.glob("twitter/*.edges")
egofeat_files = glob.glob("twitter/*.egofeat")
feat_files = glob.glob("twitter/*.feat")
featnames_files = glob.glob("twitter/*.featnames")

In [4]:
def loss1(usersPerCircle, usersPerCircleP):
    psize = max(len(usersPerCircle),len(usersPerCircleP)) # Pad the matrix to be square
    mm = numpy.zeros((psize,psize))
    mm2 = numpy.zeros((psize,psize))
    for i in range(psize):
        for j in range(psize):
            circleP = set() # Match to an empty circle (delete all users)
            circle = set() # Match to an empty circle (add all users)
            if (i < len(usersPerCircleP)):
                circleP = usersPerCircleP[i]
            if (j < len(usersPerCircle)):
                circle = usersPerCircle[j]
            nedits = len(circle.union(circleP)) - len(circle.intersection(circleP)) # Compute the edit distance between the two circles
            mm[i][j] = nedits
            mm2[i][j] = nedits

    if psize == 0:
        return 0 # Edge case in case there are no circles
    else:
        m = Munkres()
        #print mm2 # Print the pairwise cost matrix
        indices = m.compute(mm) # Compute the optimal alignment between predicted and groundtruth circles
        editCost = 0
        for row, column in indices:
            editCost += mm2[row][column]
    return int(editCost)


In [5]:
def read_nodeadjlist(filename):
    G = nx.Graph()
    for line in open(filename):
        lst = line.split(' ')
        el, es = lst[0], lst[1][:-1]
        G.add_edge(int(el),int(es))
    return G

In [20]:
egoNodeList = []
for item in circles_files:
    twitter, circleFilename = item.split("\\")
    filename, abcd = circleFilename.split(".")
    egoNodeList.append(filename)

In [29]:
filterCircleList = []
for item in egoNodeList:
    circle_file= "twitter/"+item+".circles"
    num_lines = sum(1 for line in open(circle_file))
    filterCircleList.append(num_lines)

In [30]:
filterCircleList

[4,
 1,
 4,
 2,
 3,
 1,
 2,
 2,
 2,
 2,
 3,
 7,
 5,
 5,
 2,
 7,
 2,
 2,
 0,
 17,
 5,
 4,
 2,
 3,
 10,
 3,
 2,
 1,
 2,
 0,
 3,
 2,
 5,
 8,
 3,
 6,
 4,
 14,
 18,
 5,
 2,
 1,
 2,
 3,
 2,
 2,
 32,
 2,
 2,
 3,
 2,
 2,
 1,
 2,
 2,
 3,
 1,
 4,
 2,
 3,
 3,
 2,
 0,
 3,
 3,
 2,
 2,
 7,
 2,
 5,
 3,
 2,
 0,
 6,
 4,
 3,
 1,
 1,
 2,
 6,
 2,
 2,
 6,
 8,
 9,
 3,
 2,
 12,
 1,
 11,
 4,
 8,
 4,
 2,
 2,
 2,
 4,
 2,
 7,
 2,
 4,
 5,
 2,
 1,
 2,
 3,
 8,
 4,
 2,
 4,
 11,
 5,
 2,
 2,
 3,
 2,
 3,
 10,
 2,
 1,
 2,
 3,
 2,
 1,
 2,
 1,
 18,
 2,
 2,
 2,
 2,
 3,
 6,
 2,
 2,
 2,
 3,
 4,
 18,
 2,
 1,
 5,
 2,
 4,
 3,
 3,
 1,
 3,
 0,
 4,
 2,
 4,
 2,
 1,
 1,
 6,
 3,
 5,
 2,
 4,
 0,
 1,
 3,
 2,
 2,
 6,
 3,
 5,
 9,
 2,
 12,
 1,
 3,
 2,
 3,
 3,
 2,
 3,
 3,
 2,
 3,
 3,
 3,
 5,
 3,
 3,
 2,
 2,
 5,
 4,
 3,
 2,
 0,
 2,
 2,
 3,
 3,
 4,
 4,
 2,
 2,
 3,
 24,
 1,
 2,
 4,
 2,
 3,
 1,
 3,
 3,
 1,
 4,
 3,
 1,
 3,
 2,
 5,
 2,
 0,
 2,
 4,
 3,
 5,
 3,
 2,
 2,
 14,
 3,
 6,
 12,
 3,
 12,
 1,
 2,
 2,
 2,
 1,
 4,
 2,
 3,
 3,
 2,
 3,
 10,
 5,

In [25]:
from scipy import stats


In [31]:
scipy.stats.describe(filterCircleList)

DescribeResult(nobs=973, minmax=(0, 100), mean=4.177800616649537, variance=33.29654371740702, skewness=7.965901043742477, kurtosis=100.69109310613494)

In [7]:
G1 = read_nodeadjlist("twitter/78813.edges")

In [8]:
communities_generator = community.girvan_newman(G1)

In [9]:
first_iteration_comm = tuple(sorted(c) for c in next(communities_generator))

In [12]:
import itertools


In [39]:
op = []
for communities in itertools.islice(communities_generator, 5):
    op.append((sorted(c) for c in communities))

In [41]:
list(op[-1])

[[586,
  2038,
  2419,
  3839,
  11628,
  13055,
  13405,
  15023,
  113963,
  174853,
  229523,
  428333,
  621713,
  627363,
  643443,
  655613,
  656933,
  697163,
  758185,
  782329,
  789314,
  793219,
  806170,
  813491,
  813715,
  817386,
  821449,
  849131,
  949161,
  992031,
  1018211,
  1375251,
  1847381,
  2195241,
  2384071,
  3361871,
  3375371,
  3558801,
  3594701,
  3640341,
  4044361,
  4958131,
  5027041,
  5362182,
  5385852,
  5435752,
  5497452,
  5541662,
  5637652,
  5676102,
  5725652,
  5746402,
  5796972,
  5849202,
  5963912,
  6217542,
  6271152,
  6339822,
  6376372,
  6515122,
  6813682,
  7434252,
  7684302,
  7921352,
  7924912,
  8091052,
  8168192,
  9184682,
  9235972,
  9363302,
  9460662,
  9462782,
  9535182,
  10359172,
  10461992,
  10751252,
  10760422,
  11323282,
  11336782,
  11375732,
  12101862,
  12199652,
  12307282,
  13170222,
  13434092,
  13535762,
  13753352,
  13837292,
  13910012,
  14048987,
  14058661,
  14066988,
  14079172,


In [24]:
# pred_list = []
# for circle in predCircle:
#     for val in circle:
#         pred_list.append(int(val))


In [42]:
def read_circles(filename):
    final_lst = []
    for line in open(filename):
        lst = line.split('\t')
        el, es = lst[0], lst[1:]
        circle  =set()
        for e in es:
            circle.add(int(e))
        final_lst.append(circle)
    return final_lst

In [43]:
gt_circles =read_circles("twitter/78813.circles")

In [44]:
gt_circles

[{586,
  3839,
  113963,
  174853,
  229523,
  793219,
  813491,
  2384071,
  5676102,
  11336782,
  13535762,
  15948437,
  30313925,
  50393960},
 {3839,
  174853,
  621713,
  849131,
  992031,
  2384071,
  5497452,
  5637652,
  5676102,
  9460662,
  11336782,
  11362622,
  12199652,
  14371227,
  14405111,
  14964767,
  15948437,
  27478849,
  50393960}]

In [45]:
loss1(list(op[-1]), gt_circles)

33