This notebook takes a CSV file as input, detects distinct communities in it with the Louvain algorithm, and writes those communities to a JSON file.

In [None]:
import pandas as pd
import networkx as nx
import community as community_louvain  # Louvain method from the python-louvain package
import json

Load data

In [None]:
def load_data(csv_file):
    """Read ``csv_file`` with ``pandas.read_csv`` (default options) and return the DataFrame."""
    return pd.read_csv(csv_file)

Build graph

In [None]:
#Build the Graph with call times as edge attributes
def build_graph(df):
    """Build an undirected graph with one edge per (nodeNaam1, nodeNaam2) pair in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'nodeNaam1', 'nodeNaam2', 'begintijd' and
        'eindtijd'; every value must be convertible with ``int``.

    Returns
    -------
    networkx.Graph
        Graph whose nodes are the integer node names and whose edges carry the
        integer attributes ``begintijd`` and ``eindtijd``.

    Notes
    -----
    ``nx.Graph`` keeps a single edge per node pair, so when the same pair occurs
    in several rows the attributes of the last row are the ones that remain.
    """
    G = nx.Graph()
    if df.empty:
        # Nothing to add; also keeps an empty frame without the expected columns harmless.
        return G

    # Iterate column-wise instead of with iterrows(): iterrows() builds a Series
    # per row and upcasts it to one common dtype, so large integers can pass
    # through float64 and lose precision before int() is applied. Zipping the
    # columns keeps each value in its own column's dtype and is much faster.
    columns = (df['nodeNaam1'], df['nodeNaam2'], df['begintijd'], df['eindtijd'])
    for node1, node2, begintijd, eindtijd in zip(*columns):
        G.add_edge(
            int(node1),
            int(node2),
            begintijd=int(begintijd),
            eindtijd=int(eindtijd),
        )
    return G

Apply Louvain algorithm

In [None]:
#Detect Communities Using Louvain Method and Store Call Times
def detect_communities_with_louvain(G, random_state=None):
    """Partition ``G`` with the Louvain method and list each community's nodes and internal edges.

    Parameters
    ----------
    G : networkx.Graph
        Graph whose edges carry ``begintijd`` and ``eindtijd`` attributes. Node
        identifiers must be mutually orderable and convertible with ``int``.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``community_louvain.best_partition``. The Louvain method is
        randomized; pass a fixed seed to get the same partition on every run.
        ``None`` (the default) keeps the library's default behaviour.

    Returns
    -------
    list of dict
        One dict per community with the keys

        * ``'community_id'``: 1-based id, assigned in the order communities
          first appear in the partition;
        * ``'nodes'``: the member nodes;
        * ``'edges'``: dicts with ``node1``, ``node2``, ``begintijd`` and
          ``eindtijd`` for every edge whose two endpoints are both in this
          community, with ``node1 < node2`` (self-loops are therefore skipped).
    """
    # Louvain method to detect communities: maps node -> community label
    partition = community_louvain.best_partition(G, random_state=random_state)

    # Group the nodes per community label
    community_dict = {}
    for node, label in partition.items():
        if label not in community_dict:
            community_dict[label] = {
                'community_id': len(community_dict) + 1,  # ids start from 1
                'nodes': [],
                'edges': []
            }
        community_dict[label]['nodes'].append(node)

    # Position of every node in the partition's iteration order. Each community's
    # node list is in that same order, so sorting a node's neighbours by this rank
    # yields the edges in the order a nested scan over the node list would.
    rank = {node: position for position, node in enumerate(partition)}

    # Add edges for each community by walking each member's adjacency instead of
    # testing every node pair; this is linear in the number of edges rather than
    # quadratic in the community size.
    for label, community in community_dict.items():
        for node1 in community['nodes']:
            neighbours = G[node1]
            partners = sorted(
                (
                    other for other in neighbours
                    if node1 < other and partition.get(other) == label
                ),
                key=rank.__getitem__,
            )
            for node2 in partners:
                # Call times stored on the edge between the two nodes
                call_times = neighbours[node2]
                community['edges'].append({
                    'node1': int(node1),  # Ensure int conversion
                    'node2': int(node2),  # Ensure int conversion
                    'begintijd': int(call_times['begintijd']),  # Ensure int conversion
                    'eindtijd': int(call_times['eindtijd'])  # Ensure int conversion
                })

    return list(community_dict.values())

Store the communities in a JSON file

In [None]:
def store_communities_with_call_times(communities, output_file):
    """Serialize ``communities`` as JSON (4-space indent) into ``output_file``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(output_file, 'w') as json_handle:
        json.dump(communities, fp=json_handle, indent=4)

Call the functions to run the pipeline

In [None]:
# Full pipeline
def community_detection_with_louvain(csv_file, output_file):
    """Run the whole pipeline: CSV -> graph -> Louvain communities -> JSON file.

    ``csv_file`` is read with ``load_data``, turned into a graph with
    ``build_graph``, partitioned with ``detect_communities_with_louvain`` and the
    resulting communities are written to ``output_file`` with
    ``store_communities_with_call_times``. Nothing is returned.
    """
    # Load the data and build the graph with call times
    graph = build_graph(load_data(csv_file))

    # Detect the communities and store them together with their call times
    store_communities_with_call_times(
        detect_communities_with_louvain(graph),
        output_file,
    )

Example runs of the pipeline

In [None]:
# Input CSV (replace with the actual CSV file path) and the JSON file to write
csv_file_path, output_communities_file = '../data/full_data_5000.csv', '50K_20.json'
community_detection_with_louvain(
    csv_file=csv_file_path,
    output_file=output_communities_file,
)

In [None]:
# Input CSV (replace with the actual CSV file path) and the JSON file to write
csv_file_path, output_communities_file = '../data/full_data_1000.csv', '10K.json'
community_detection_with_louvain(
    csv_file=csv_file_path,
    output_file=output_communities_file,
)

In [None]:
# Input CSV (replace with the actual CSV file path) and the JSON file to write
csv_file_path, output_communities_file = '../data/10000.csv', 'Small_test.json'
community_detection_with_louvain(
    csv_file=csv_file_path,
    output_file=output_communities_file,
)

In [None]:
# Input CSV (replace with the actual CSV file path) and the JSON file to write
csv_file_path, output_communities_file = '../data/full_data_20000.csv', '20K.json'
community_detection_with_louvain(
    csv_file=csv_file_path,
    output_file=output_communities_file,
)

## Evaluation of the Louvain algorithm

In [None]:
#Needed to measure time passed
import time

### Size evaluations

Load data from datasets of different sizes

In [None]:
# nodes 3000, edges 20 (presumably the average per node, going by the dataset file name)
csv_file_path = '../data/evaluation_data/size-dataset/avgRela50_data_3000_20.csv'
output_communities_file = '3Knode_20edge.json'
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
community_detection_with_louvain(csv_file_path, output_communities_file)
t1 = time.perf_counter()
total = t1 - t0  # elapsed seconds for the whole pipeline, file I/O included
print(total)

In [None]:
# nodes 5000, edges 20 (presumably the average per node, going by the dataset file name)
csv_file_path = '../data/evaluation_data/size-dataset/avgRela50_data_5000_20.csv'
output_communities_file = '5Knode_20edge.json'
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
community_detection_with_louvain(csv_file_path, output_communities_file)
t1 = time.perf_counter()
total = t1 - t0  # elapsed seconds for the whole pipeline, file I/O included
print(total)

In [None]:
# nodes 10000, edges 20 (presumably the average per node, going by the dataset file name)
csv_file_path = '../data/evaluation_data/size-dataset/avgRela50_data_10000_20.csv'
output_communities_file = '10Knode_20edge.json'
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
community_detection_with_louvain(csv_file_path, output_communities_file)
t1 = time.perf_counter()
total = t1 - t0  # elapsed seconds for the whole pipeline, file I/O included
print(total)

In [None]:
# nodes 15000, edges 20 (presumably the average per node, going by the dataset file name)
csv_file_path = '../data/evaluation_data/size-dataset/avgRela50_data_15000_20.csv'
output_communities_file = '15Knode_20edge.json'
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
community_detection_with_louvain(csv_file_path, output_communities_file)
t1 = time.perf_counter()
total = t1 - t0  # elapsed seconds for the whole pipeline, file I/O included
print(total)

In [None]:
# nodes 20000, edges 20 (presumably the average per node, going by the dataset file name)
csv_file_path = '../data/evaluation_data/size-dataset/avgRela50_data_20000_20.csv'
output_communities_file = '20Knode_20edge.json'
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
community_detection_with_louvain(csv_file_path, output_communities_file)
t1 = time.perf_counter()
total = t1 - t0  # elapsed seconds for the whole pipeline, file I/O included
print(total)

In [None]:
# nodes 50000, edges 20 (presumably the average per node, going by the dataset file name)
csv_file_path = '../data/evaluation_data/size-dataset/avgRela50_data_50000_20.csv'
output_communities_file = '50Knode_20edge.json'
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
community_detection_with_louvain(csv_file_path, output_communities_file)
t1 = time.perf_counter()
total = t1 - t0  # elapsed seconds for the whole pipeline, file I/O included
print(total)

Plot the results in a graph

In [None]:
# graph to represent the speed of the algorithm for the various sizes of the data
# NOTE(review): these timings are hard-coded, presumably copied from an earlier
# run of the size-evaluation cells; re-running those cells does not update them.
time3k = 11.073784351348877
time5k = 25.070809841156006
time10k = 59.52233147621155
time15k = 85.41030240058899
time20k = 155.35351729393005
time50k = 341.9356949329376

import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 5))
x = [3, 5, 10, 15, 20, 50]  # node counts expressed in thousands
y = [time3k, time5k, time10k, time15k, time20k, time50k]
ax.plot(x, y, marker='o')
# The x values are thousands of nodes, so the axis label has to say so.
ax.set_xlabel('Number of nodes (thousands)')
ax.set_ylabel('Time in seconds')
ax.set_title('Time taken for community detection with Louvain method')
ax.grid()
plt.show()

### Complexity evaluation

Load data with different numbers of edges

In [None]:
# nodes 20000, edges 3 (presumably the average per node, going by the dataset file name)
csv_file_path = '../data/evaluation_data/relation-dataset/avgRela50_data_20000_3.csv'
# Output name states the 20,000 nodes of the input file rather than '10K'.
output_communities_file = '20Knode_3edge.json'
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
community_detection_with_louvain(csv_file_path, output_communities_file)
t1 = time.perf_counter()
total = t1 - t0  # elapsed seconds for the whole pipeline, file I/O included
print(total)

In [None]:
# nodes 20000, edges 5 (presumably the average per node, going by the dataset file name)
csv_file_path = '../data/evaluation_data/relation-dataset/avgRela50_data_20000_5.csv'
# Output name states the 20,000 nodes of the input file rather than '10K'.
output_communities_file = '20Knode_5edge.json'
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
community_detection_with_louvain(csv_file_path, output_communities_file)
t1 = time.perf_counter()
total = t1 - t0  # elapsed seconds for the whole pipeline, file I/O included
print(total)

In [None]:
# nodes 20000, edges 10 (presumably the average per node, going by the dataset file name)
csv_file_path = '../data/evaluation_data/relation-dataset/avgRela50_data_20000_10.csv'
# Output name states the 20,000 nodes of the input file rather than '10K'.
output_communities_file = '20Knode_10edge.json'
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
community_detection_with_louvain(csv_file_path, output_communities_file)
t1 = time.perf_counter()
total = t1 - t0  # elapsed seconds for the whole pipeline, file I/O included
print(total)

In [None]:
# nodes 20000, edges 15 (presumably the average per node, going by the dataset file name)
csv_file_path = '../data/evaluation_data/relation-dataset/avgRela50_data_20000_15.csv'
# Output name states the 20,000 nodes of the input file rather than '10K'.
output_communities_file = '20Knode_15edge.json'
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
community_detection_with_louvain(csv_file_path, output_communities_file)
t1 = time.perf_counter()
total = t1 - t0  # elapsed seconds for the whole pipeline, file I/O included
print(total)

In [None]:
# nodes 20000, edges 20 (presumably the average per node, going by the dataset file name)
csv_file_path = '../data/evaluation_data/relation-dataset/avgRela50_data_20000_20.csv'
# Output name states the 20,000 nodes of the input file. Writing this run to
# '10Knode_20edge.json' would overwrite the result of the 10,000-node size run.
# NOTE(review): '20Knode_20edge.json' is also written by the 20,000-node size
# run; confirm both inputs describe the same data or pick a distinct name.
output_communities_file = '20Knode_20edge.json'
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
community_detection_with_louvain(csv_file_path, output_communities_file)
t1 = time.perf_counter()
total = t1 - t0  # elapsed seconds for the whole pipeline, file I/O included
print(total)

Plot the results in a graph

In [None]:
# Line chart of the run time against the average number of edges per node.
# The timings (in seconds) are hard-coded values, one per evaluated edge count.
time3, time5, time10, time15, time20 = (
    21.12712025642395,
    44.96259093284607,
    49.382936000823975,
    53.61463212966919,
    64.18218731880188,
)

x = [3, 5, 10, 15, 20]
y = [time3, time5, time10, time15, time20]

plt.figure(figsize=(10, 5))
plt.plot(x, y, marker='o')
plt.title('Time taken for community detection with Louvain method')
plt.xlabel('Average number of relations per node')
plt.ylabel('Time in seconds')
plt.grid()
plt.show()