In [None]:
import networkx as nx
import numpy as np
import community as comms
import pickle
from networkx.algorithms.community import greedy_modularity_communities
from networkx.generators.community import stochastic_block_model
from scipy.sparse import csr_matrix

In [None]:
def create_graph_and_node_mappings_from_file(filepath, type='undirected'):
    """Read a whitespace-separated edge list and build a graph on integer node ids.

    Every non-blank line of ``filepath`` is split on whitespace; its first two
    tokens are taken as the two endpoint ids of one edge. Original ids are
    mapped to consecutive integers (starting at 0) in order of first
    appearance, and the graph is built on those integers.

    Parameters
    ----------
    filepath : str
        Path of the edge-list file.
    type : str
        ``'undirected'`` builds an ``nx.Graph``. ``'directed'`` builds an
        ``nx.DiGraph`` with edges pointing from the first token to the second.
        ``'reverse_directed'`` builds an ``nx.DiGraph`` with edges pointing
        from the second token to the first.

    Returns
    -------
    tuple
        ``(G, node_mappings, reverse_node_mappings)`` where ``node_mappings``
        maps original id -> integer id and ``reverse_node_mappings`` maps
        integer id -> original id.

    Raises
    ------
    ValueError
        If ``type`` is not one of the three supported values.
    """
    valid_types = ('undirected', 'directed', 'reverse_directed')
    # Reject unknown types up front; otherwise the edge direction would be
    # undefined when the first edge is added.
    if type not in valid_types:
        raise ValueError(
            'type must be one of %r, got %r' % (valid_types, type))

    G = nx.Graph() if type == 'undirected' else nx.DiGraph()
    node_mappings = {}
    reverse_node_mappings = {}

    def _integer_id(original_id):
        # Assign the next free integer the first time an id is seen.
        if original_id not in node_mappings:
            new_id = len(node_mappings)
            node_mappings[original_id] = new_id
            reverse_node_mappings[new_id] = original_id
        return node_mappings[original_id]

    with open(filepath, 'r') as f:
        for line in f:
            edge = line.split()
            if not edge:
                # Blank line (e.g. trailing newline at end of file).
                continue
            # Register the first token before the second so integer ids are
            # handed out in reading order.
            first = _integer_id(str(edge[0]))
            second = _integer_id(str(edge[1]))
            if type == 'reverse_directed':
                G.add_edge(second, first)
            else:
                G.add_edge(first, second)
    return G, node_mappings, reverse_node_mappings

def create_greedy_modularity_communities_dict(G, reverse_node_mappings):
    """Assign every node a greedy-modularity community id.

    Parameters
    ----------
    G : networkx graph
        Graph passed to ``greedy_modularity_communities``.
    reverse_node_mappings : dict
        Maps the node ids used in ``G`` to the ids used as keys of the result.

    Returns
    -------
    tuple
        ``(node_communities_gm, community_total)``: a dict mapping each
        (reverse-mapped) node id to the index of its community, numbered from
        0 in the order the communities are returned, and the number of
        communities found.
    """
    gm_communities = list(greedy_modularity_communities(G))
    node_communities_gm = {}
    for community_id, community in enumerate(gm_communities):
        for node in community:
            node_communities_gm[reverse_node_mappings[node]] = community_id
    # The ids run from 0 to len - 1, so the count is exactly the length
    # (the previous implementation reported one community too many).
    return node_communities_gm, len(gm_communities)

def create_louvain_communities_dict(G, reverse_node_mappings):
    """Assign every node a Louvain community id.

    Parameters
    ----------
    G : networkx graph
        Graph passed to ``comms.best_partition``.
    reverse_node_mappings : dict
        Maps the node ids used in ``G`` to the ids used as keys of the result.

    Returns
    -------
    tuple
        ``(node_communities_louvain_original_ids, distinct_count)``: a dict
        mapping each (reverse-mapped) node id to the community id reported by
        ``best_partition``, and the number of distinct community ids.
    """
    node_communities_louvain = comms.best_partition(G)
    node_communities_louvain_original_ids = {
        reverse_node_mappings[node]: community_id
        for node, community_id in node_communities_louvain.items()
    }
    # A set gives the distinct-id count in linear time; the earlier list
    # membership test was quadratic in the worst case.
    distinct_count = len(set(node_communities_louvain.values()))
    return node_communities_louvain_original_ids, distinct_count

def store_community_dict_as_file(community_dict, filepath):
    """Serialise ``community_dict`` to ``filepath`` with pickle protocol 2.

    The file is opened in binary write mode, so existing content is replaced.
    """
    serialised = pickle.dumps(community_dict, protocol=2)
    with open(filepath, 'wb') as out:
        out.write(serialised)

def _max_edge_count(source_size, target_size, same_community, directed, selfloops_present):
    """Return how many distinct edges can exist from one community to another."""
    if not same_community:
        # Every source node can connect to every target node once. This holds
        # for the directed case too: the (source, target) entry only counts
        # edges pointing source -> target, so the pair count is not doubled.
        return source_size * target_size
    count = source_size * (source_size - 1)
    if not directed:
        # An undirected pair {u, v} is a single edge.
        count = count / 2
    if selfloops_present:
        count += source_size
    return count


def calculate_edge_probabilities(G, communities, node_id_community_id_dict, reverse_node_mappings=None):
    """Estimate block-to-block edge probabilities of ``G`` for a given partition.

    For every ordered pair of communities that is connected by at least one
    edge, the probability is the number of existing edges divided by the
    maximum number of edges possible between the two communities. Pairs
    without any edge get probability 0. For undirected graphs each
    inter-community edge is counted for both ``(a, b)`` and ``(b, a)``, so the
    result is symmetric.

    Parameters
    ----------
    G : graph
        Object providing ``edges()`` (pairs of node ids) and ``is_directed()``.
    communities : dict
        Maps community id -> collection of member node ids (only its length
        is used). Community ids are used as row/column indices, so they must
        be integers in ``range(len(communities))``.
    node_id_community_id_dict : dict
        Maps node id -> community id.
    reverse_node_mappings : dict, optional
        If given, edge endpoints are translated through this mapping before
        being looked up in ``node_id_community_id_dict``.

    Returns
    -------
    list of list of float
        Square matrix (``len(communities)`` rows) of edge probabilities.
    """
    directed = G.is_directed()
    edge_counts = {}
    selfloops_present = False

    # Single pass over the edges: count edges per community pair and remember
    # whether the graph contains any self-loop.
    for edge in G.edges():
        source_id = edge[0]
        target_id = edge[1]
        if source_id == target_id:
            selfloops_present = True
        if reverse_node_mappings is not None:
            source_id = reverse_node_mappings[source_id]
            target_id = reverse_node_mappings[target_id]
        source_community_id = node_id_community_id_dict[source_id]
        target_community_id = node_id_community_id_dict[target_id]
        pair = (source_community_id, target_community_id)
        edge_counts[pair] = edge_counts.get(pair, 0) + 1
        if not directed and source_community_id != target_community_id:
            # Undirected edges count for both orderings of the pair.
            mirrored = (target_community_id, source_community_id)
            edge_counts[mirrored] = edge_counts.get(mirrored, 0) + 1

    rows = []
    cols = []
    data = []
    for key, existing_edge_count in edge_counts.items():
        source_community_id, target_community_id = key
        max_edge_count = _max_edge_count(
            len(communities[source_community_id]),
            len(communities[target_community_id]),
            source_community_id == target_community_id,
            directed,
            selfloops_present,
        )
        edge_probability = existing_edge_count / max_edge_count
        if edge_probability < 0 or edge_probability > 1:
            # Diagnostic output: this pair has an impossible probability.
            print(key)
        rows.append(source_community_id)
        cols.append(target_community_id)
        data.append(edge_probability)

    communities_count = len(communities)
    return csr_matrix((data, (rows, cols)), shape=(communities_count, communities_count), dtype=float).todense().tolist()

def has_selfloops(G):
    """Return True if ``G`` contains at least one self-loop edge.

    Uses the module-level ``nx.number_of_selfloops`` function because the
    ``Graph.number_of_selfloops`` method is no longer available in newer
    networkx releases.
    """
    return nx.number_of_selfloops(G) > 0

def get_block_sizes(communities):
    """Return the member count of each community, in the dict's iteration order.

    ``communities`` maps community id -> collection of member node ids.
    """
    return [len(members) for members in communities.values()]

def get_nodelist(communities, node_mappings=None):
    """Flatten the community members into one list, community by community.

    ``communities`` maps community id -> collection of member node ids. When
    ``node_mappings`` is given, every member id is translated through it;
    otherwise the ids are returned unchanged.
    """
    members = (
        node_id
        for community in communities.values()
        for node_id in community
    )
    if node_mappings is None:
        return list(members)
    return [node_mappings[node_id] for node_id in members]

def check_symmetric(a, rtol=1e-05, atol=1e-08):
    """Return whether ``a`` equals its transpose within the given tolerances."""
    transposed = a.T
    return np.allclose(a, transposed, rtol=rtol, atol=atol)

In [None]:
# Input and output locations (relative paths).
data_directory = '../data/raw/'
communities_directory = '../data/community_id_dicts/'

# Datasets to process; each name is also used as directory and file stem.
datasets = [
    'citeseer',
    'cora',
    'cora_full',
    'PubMed',
]
# graph_types = ['undirected']#, 'directed', 'reverse_directed']

# Per-dataset results, keyed by dataset name.
graphs = {}
node_mappings_dict = {}
reverse_node_mappings_dict = {}
community_count = {}

In [None]:
# For every dataset: build the graph, detect communities with both
# algorithms, record the community counts and pickle the assignments.
for dataset in datasets:
    filepath = data_directory + dataset + '/' + dataset + '.cites'
    # Pass the graph type explicitly. The bare name `type` is the Python
    # builtin, not one of the supported type strings.
    G, node_mappings, reverse_node_mappings = create_graph_and_node_mappings_from_file(filepath, 'undirected')
    graphs[dataset] = G
    node_mappings_dict[dataset] = node_mappings
    reverse_node_mappings_dict[dataset] = reverse_node_mappings
    node_communities_gm, distinct_gm_count = create_greedy_modularity_communities_dict(G, reverse_node_mappings)
    node_communities_louvain, distinct_louvain_count = create_louvain_communities_dict(G, reverse_node_mappings)
    community_count[dataset] = {'distinct_gm_count': distinct_gm_count, 'distinct_louvain_count': distinct_louvain_count}
    store_community_dict_as_file(node_communities_gm, communities_directory + dataset + '/' + dataset + '_greedy_modularity.pickle')
    store_community_dict_as_file(node_communities_louvain, communities_directory + dataset + '/' + dataset + '_louvain.pickle')

In [None]:
# Directory holding the raw datasets (same value as assigned in the
# configuration cell earlier in this notebook).
data_directory = '../data/raw/'

In [None]:
# create_and_store_graph_transformations(data_directory + 'cora_full' + '/' + 'cora_full' + '.cites')
# create_and_store_graph_transformations_reverse_directed(data_directory + 'cora_full' + '/' + 'cora_full' + '.cites')

In [None]:
# Show, per dataset, the number of communities recorded for each algorithm.
print(community_count)

In [None]:
# Reload the pickled node id -> community id assignments for every dataset.
gm_node_id_to_community_id = {}
for dataset in datasets:
    gm_pickle_path = communities_directory + dataset + '/' + dataset + '_greedy_modularity.pickle'
    with open(gm_pickle_path, 'rb') as pickle_file:
        gm_node_id_to_community_id[dataset] = pickle.load(pickle_file)

louvain_node_id_to_community_id = {}
for dataset in datasets:
    louvain_pickle_path = communities_directory + dataset + '/' + dataset + '_louvain.pickle'
    with open(louvain_pickle_path, 'rb') as pickle_file:
        louvain_node_id_to_community_id[dataset] = pickle.load(pickle_file)
        

In [None]:
# Invert node id -> community id into community id -> list of node ids
# (greedy-modularity assignments).
gm_community_id_to_node_id = {}

for dataset in datasets:
    members_by_community = {}
    for node_id, community_id in gm_node_id_to_community_id[dataset].items():
        members_by_community.setdefault(community_id, []).append(node_id)
    gm_community_id_to_node_id[dataset] = members_by_community


In [None]:
# Invert node id -> community id into community id -> list of node ids
# (Louvain assignments).
louvain_community_id_to_node_id = {}

for dataset in datasets:
    members_by_community = {}
    for node_id, community_id in louvain_node_id_to_community_id[dataset].items():
        members_by_community.setdefault(community_id, []).append(node_id)
    louvain_community_id_to_node_id[dataset] = members_by_community

In [None]:
# Per-dataset stochastic-block-model parameters for both partitions:
# edge-probability matrix, block sizes and node ordering.
gm_edge_probabilities, gm_block_sizes, gm_nodelists = {}, {}, {}
louvain_edge_probabilities, louvain_block_sizes, louvain_nodelists = {}, {}, {}

for dataset in datasets:
    G = graphs[dataset]
    node_mappings = node_mappings_dict[dataset]
    reverse_node_mappings = reverse_node_mappings_dict[dataset]

    # Greedy-modularity partition.
    gm_communities = gm_community_id_to_node_id[dataset]
    gm_edge_probabilities[dataset] = calculate_edge_probabilities(
        G, gm_communities, gm_node_id_to_community_id[dataset], reverse_node_mappings)
    gm_block_sizes[dataset] = get_block_sizes(gm_communities)
    gm_nodelists[dataset] = get_nodelist(gm_communities, node_mappings)

    # Louvain partition.
    louvain_communities = louvain_community_id_to_node_id[dataset]
    louvain_edge_probabilities[dataset] = calculate_edge_probabilities(
        G, louvain_communities, louvain_node_id_to_community_id[dataset], reverse_node_mappings)
    louvain_block_sizes[dataset] = get_block_sizes(louvain_communities)
    louvain_nodelists[dataset] = get_nodelist(louvain_communities, node_mappings)

In [None]:
# Generated stochastic-block-model graphs per dataset; each entry is later
# filled with a list of {'seed': ..., 'graph': ...} dicts.
gm_stochastic_block_model_graphs = {}
louvain_stochastic_block_model_graphs = {}

In [None]:
# Create stochastic block model graphs for both partitions (greedy
# modularity and Louvain), one graph per seed 0..9 and dataset.
for dataset in datasets:
    gm_stochastic_block_model_graphs[dataset] = []
    louvain_stochastic_block_model_graphs[dataset] = []
    # Self-loops are allowed in the generated graphs only if the original
    # graph has them.
    allow_selfloops = has_selfloops(graphs[dataset])
    for seed in range(10):
        G = stochastic_block_model(
            gm_block_sizes[dataset],
            gm_edge_probabilities[dataset],
            nodelist=gm_nodelists[dataset],
            seed=seed,
            directed=False,
            selfloops=allow_selfloops,
            sparse=True,
        )
        gm_stochastic_block_model_graphs[dataset].append({'seed': seed, 'graph': G})
        G = stochastic_block_model(
            louvain_block_sizes[dataset],
            louvain_edge_probabilities[dataset],
            nodelist=louvain_nodelists[dataset],
            seed=seed,
            directed=False,
            selfloops=allow_selfloops,
            sparse=True,
        )
        louvain_stochastic_block_model_graphs[dataset].append({'seed': seed, 'graph': G})


In [None]:
# Edge count of every greedy-modularity-based SBM graph, one blank line
# between datasets.
for dataset in datasets:
    for entry in gm_stochastic_block_model_graphs[dataset]:
        sbm_graph = entry['graph']
        print(sbm_graph.number_of_edges())
    print()

In [None]:
# Edge count of every Louvain-based SBM graph, one blank line between
# datasets.
for dataset in datasets:
    for entry in louvain_stochastic_block_model_graphs[dataset]:
        sbm_graph = entry['graph']
        print(sbm_graph.number_of_edges())
    print()

In [None]:
# Edge count of every original graph, for comparison with the SBM graphs.
for dataset, original_graph in graphs.items():
    print(original_graph.number_of_edges())

In [187]:
# Write every greedy-modularity-based SBM graph as an edge list using the
# original node ids, one "source target" pair per line.
for dataset in datasets:
    for entry in gm_stochastic_block_model_graphs[dataset]:
        # Open with 'w' so re-running this cell replaces the file instead of
        # appending a second copy of every edge.
        with open('../data/sbm_graphs/greedy_modularity_based/' + dataset + '/' + dataset + 'undirected_sbm_gm_seed_' + str(entry['seed']) + '.cites', 'w') as output_file:
            for edge in entry['graph'].edges():
                source_id = reverse_node_mappings_dict[dataset][edge[0]]
                target_id = reverse_node_mappings_dict[dataset][edge[1]]
                output_file.write(source_id + ' ' + target_id + '\n')

In [189]:
# Write every Louvain-based SBM graph as an edge list using the original
# node ids, one "source target" pair per line.
for dataset in datasets:
    for entry in louvain_stochastic_block_model_graphs[dataset]:
        # Open with 'w' so re-running this cell replaces the file instead of
        # appending a second copy of every edge.
        with open('../data/sbm_graphs/louvain_based/' + dataset + '/' + dataset + 'undirected_sbm_louvain_seed_' + str(entry['seed']) + '.cites', 'w') as output_file:
            for edge in entry['graph'].edges():
                source_id = reverse_node_mappings_dict[dataset][edge[0]]
                target_id = reverse_node_mappings_dict[dataset][edge[1]]
                output_file.write(source_id + ' ' + target_id + '\n')


In [None]:
# def create_and_store_graph_transformations(filepath):
#     node_mappings = {}
#     reverse_node_mappings = {}
#     with open(filepath, 'r') as f1:
#         with open('../data/raw/cora_full/cora_full_remapped.cites', 'w') as f2:
#             counter = 0
#             for line in f1:
#                 edge = line.strip().split()
#                 source_node = str(edge[0])
#                 if source_node not in node_mappings:
#                     node_mappings[source_node] = counter
#                     reverse_node_mappings[counter] = source_node
#                     counter += 1
#                 target_node = str(edge[1])
#                 if target_node not in node_mappings:
#                     node_mappings[target_node] = counter
#                     reverse_node_mappings[counter] = target_node
#                     counter += 1
#                 source_node = node_mappings[edge[0]]
#                 target_node = node_mappings[edge[1]]
#                 f2.write(str(source_node) + ' ' + str(target_node) + '\n')
#             store_community_dict_as_file(reverse_node_mappings, '../data/raw/cora_full/cora_full_remapped.pkl')
# 
# 
# def create_and_store_graph_transformations_reverse_directed(filepath):
#     node_mappings = {}
#     reverse_node_mappings = {}
#     with open(filepath, 'r') as f3:
#         with open('../data/raw/cora_full/cora_full_remapped_reverse-directed.cites', 'w') as f4:
#             counter = 0
#             for line in f3:
#                 edge = line.strip().split()
#                 source_node = str(edge[1])
#                 if source_node not in node_mappings:
#                     node_mappings[source_node] = counter
#                     reverse_node_mappings[counter] = source_node
#                     counter += 1
#                 target_node = str(edge[0])
#                 if target_node not in node_mappings:
#                     node_mappings[target_node] = counter
#                     reverse_node_mappings[counter] = target_node
#                     counter += 1
#                 source_node = node_mappings[edge[1]]
#                 target_node = node_mappings[edge[0]]
#                 f4.write(str(source_node) + ' ' + str(target_node) + '\n')
#             store_community_dict_as_file(reverse_node_mappings, '../data/raw/cora_full/cora_full_remapped_reverse-directed.pkl')
