In [30]:
import random

import numpy as np
import matplotlib.pyplot as plt

# read metis output file
with open(r'ig_communities_output.metis', 'r') as fp:
    file = fp.readlines()

# create 3-tupel for every community
# ( id of community | number of members | members of community as list )
communities = {k: [k + 1, len(file[k].replace('\n', '').split(' ')), file[k].replace('\n', '').split(' ')] for k in range(0, len(file))}

# choose 100 communities with size 4, 5, 6
chosen_communities = []
for i in range(0, 100):
    for j in range(0, len(communities)):
        if len(chosen_communities) < 100 and (3 < communities[j][1] < 7):
            chosen_communities.append(communities[j])

print(chosen_communities)

[[10, 5, ['1', '406864', '478569', '586011', '1039976']], [11, 6, ['5', '3623', '261889', '281222', '281372', '2307102']], [13, 4, ['5', '26726', '38154', '46903']], [14, 6, ['5', '33717', '182557', '182695', '423809', '2324062']], [15, 5, ['5', '42809', '167487', '716169', '3934503']], [16, 4, ['5', '47149', '635230', '1917724']], [17, 5, ['5', '66705', '529624', '2690720', '3943000']], [18, 5, ['5', '142374', '609079', '2129626', '2678181']], [19, 4, ['5', '156623', '1259819', '11108343']], [21, 4, ['5', '654648', '695594', '1097207']], [22, 4, ['5', '1001356', '1601637', '7186977']], [23, 5, ['6', '103811', '1403576', '1430352', '1430373']], [24, 5, ['1', '8', '219354', '2541712', '2541822']], [27, 4, ['8', '1750', '39310', '2234455']], [28, 4, ['8', '3604', '40545', '3956732']], [33, 4, ['8', '15764', '29793', '96261']], [37, 6, ['8', '52376', '54208', '84248', '1097449', '1619629']], [38, 6, ['8', '933122', '1301021', '3099962', '3224640', '3964123']], [50, 6, ['9', '99821', '5277

In [31]:
# translate metis ids to original ids
with open(r'output-user-id-mapping.csv', 'r') as fp:
    file = fp.readlines()

# create dict from mapping file ( metis_id | original_id )
id_mapping = dict()

for line in file:
    # skip first line
    if line.startswith("original_user_id"):
        continue
    split_line = line.split(",")
    id_mapping[int(split_line[1])] = int(split_line[0])

# print(id_mapping[12])

# now we can translate the ids in our chosen communities
for i in range(0, len(chosen_communities)):
    for j in range(0, len(chosen_communities[i][2])):
        chosen_communities[i][2][j] = id_mapping[int(chosen_communities[i][2][j])]

print(chosen_communities)

[[10, 5, [208053367, 48438829, 238385602, 1765947972, 2185977161]], [11, 6, [624860143, 1441235330, 1701949716, 254307695, 56049323, 288069627]], [13, 4, [624860143, 1555606461, 49291076, 647551715]], [14, 6, [624860143, 1130289603, 2962343, 1283583325, 1963549027, 242338761]], [15, 5, [624860143, 1632181513, 389637571, 962818512, 485149639]], [16, 4, [624860143, 184374940, 39165418, 213469633]], [17, 5, [624860143, 368816183, 17058814, 1592708942, 491972585]], [18, 5, [624860143, 1571338228, 855437917, 239178487, 10413769]], [19, 4, [624860143, 264385413, 267082543, 486678220]], [21, 4, [624860143, 303542503, 1594332823, 2011660414]], [22, 4, [624860143, 16181890, 1181616933, 534814314]], [23, 5, [1418179390, 11000952, 1437544322, 277817854, 509213742]], [24, 5, [208053367, 1521780637, 1319172046, 3167687, 1393614085]], [27, 4, [1521780637, 8742059, 42877457, 421064829]], [28, 4, [1521780637, 1716359490, 1540685520, 358133159]], [33, 4, [1521780637, 194158496, 293643548, 21659602]], [

In [32]:
import time
# search in input file and count the inner and outer edges for every community

# extend data structure such that we can remember the number of inner and outer edges
# ( number of members | size of overlap | members of community as list | number of inner edges | number of outer edges )
for i in range(0, len(chosen_communities)):
    chosen_communities[i].append(0)
    chosen_communities[i].append(0)

t = time.localtime()
current_time = time.strftime("%H:%M:%S", t)
print(current_time)

edges = set()

with open('graph_edges_total_45mil_new.csv') as fp:
    for line in fp:
        # skip first line
        if line.startswith("ig_source_id,"):
            continue
        # split line first and check if it is even in the current community
        split_line = line.split(",")
        source_member = int(split_line[0])
        target_member = int(split_line[1])
        for i in range(0, len(chosen_communities)):
            if not source_member in chosen_communities[i][2]:
                continue

            # if we have a relevant pair of members, then first check if we already took this edge into account
            # hence we ignore all multi edges
            if (source_member, target_member) in edges or (target_member, source_member) in edges:
                continue

            # now check if the target member is inside our community and increase the number of inner/outer edges accordingly
            if target_member in chosen_communities[i][2]:
                chosen_communities[i][3] += 1
            else:
                chosen_communities[i][4] += 1

            # add edges to the list of edges which we already took into account
            edges.add((source_member, target_member))
            edges.add((target_member, source_member))

print(chosen_communities)

t = time.localtime()
current_time = time.strftime("%H:%M:%S", t)
print(current_time)

11:39:24
[[10, 5, [208053367, 48438829, 238385602, 1765947972, 2185977161], 7, 1085], [11, 6, [624860143, 1441235330, 1701949716, 254307695, 56049323, 288069627], 9, 3788], [13, 4, [624860143, 1555606461, 49291076, 647551715], 5, 1538], [14, 6, [624860143, 1130289603, 2962343, 1283583325, 1963549027, 242338761], 9, 4623], [15, 5, [624860143, 1632181513, 389637571, 962818512, 485149639], 8, 2363], [16, 4, [624860143, 184374940, 39165418, 213469633], 5, 3479], [17, 5, [624860143, 368816183, 17058814, 1592708942, 491972585], 7, 2647], [18, 5, [624860143, 1571338228, 855437917, 239178487, 10413769], 7, 2211], [19, 4, [624860143, 264385413, 267082543, 486678220], 5, 2627], [21, 4, [624860143, 303542503, 1594332823, 2011660414], 5, 1614], [22, 4, [624860143, 16181890, 1181616933, 534814314], 5, 1337], [23, 5, [1418179390, 11000952, 1437544322, 277817854, 509213742], 7, 1633], [24, 5, [208053367, 1521780637, 1319172046, 3167687, 1393614085], 6, 3244], [27, 4, [1521780637, 8742059, 42877457, 4

In [33]:
# write proportions to file
fp = open("chosen_communities_edge_proportions_output.csv", "w")
fp.write("community_id,community_size,inner_edges,outer_edges\n")
for i in range(0, len(chosen_communities)):
    fp.write(str(chosen_communities[i][0]) + "," + str(chosen_communities[i][1]) + "," + str(chosen_communities[i][3])
             + "," + str(chosen_communities[i][4]) + "\n")
fp.close()