## spark setup & libraries

In [8]:
# spark setup
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.ml.feature import MinHashLSH, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, when, explode, lit, array_contains


# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

import warnings

# Never truncate wide frames: display every column when a pandas DataFrame
# is rendered (this option controls columns, not the number of rows).
pd.set_option("display.max_columns", None)

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

## create spark session + load data

In [2]:
# Build (or reuse, via getOrCreate) the Spark session used by the whole notebook.
# The session runs with master "local[*]", i.e. on this machine only.
spark = (
    SparkSession.builder
    .appName("DIS_project_5")
    .master("local[*]")
    .config("spark.driver.memory", "10G")
    # The key must be spelled exactly "spark.driver.maxResultSize"; a stray
    # escape sequence inside the string (e.g. "\r") yields a different key
    # and the limit is never applied.
    # NOTE(review): 40g is larger than the 10G driver memory configured
    # above -- confirm the intended limit.
    .config("spark.driver.maxResultSize", "40g")
    # "-1" turns off automatic broadcast joins (per the Spark SQL config docs).
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .getOrCreate()
)

# Low-level context, needed for the RDD API (sc.parallelize, ...).
sc = spark.sparkContext

# Last expression of the cell, so the notebook renders the session summary.
spark

your 131072x1 screen size is bogus. expect trouble
24/11/02 16:50:44 WARN Utils: Your hostname, MadioLaptop resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/11/02 16:50:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/02 16:50:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the community JSON export into driver memory and distribute it as an RDD.
# Judging by the sample printed below, each record is a dict with the keys
# 'community_id', 'nodes' (list of ints) and 'edges' (list of dicts holding
# 'node1', 'node2', 'begintijd', 'eindtijd').
# NOTE(review): begintijd/eindtijd look like YYYYMMDDhhmmss integers -- confirm.
json_file_path = "../Community Detection/10K.json"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(json_file_path, "r", encoding="utf-8") as f:
    community_data = json.load(f)

community_rdd = sc.parallelize(community_data)

# Quick sanity checks: a few sample records, the record count, the RDD type.
print(community_rdd.take(3))
print(community_rdd.count())
print(type(community_rdd))

                                                                                

[{'community_id': 1, 'nodes': [371, 340, 378, 667, 974, 368, 627, 64, 168, 257, 925, 767, 549, 890, 899, 226, 45, 71, 72, 381, 707, 973, 812, 839, 96, 433, 670, 468, 466, 876, 402, 646, 593, 366, 612], 'edges': [{'node1': 371, 'node2': 549, 'begintijd': 20240318013535, 'eindtijd': 20240318030219}, {'node1': 371, 'node2': 812, 'begintijd': 20240412020025, 'eindtijd': 20240412053609}, {'node1': 371, 'node2': 670, 'begintijd': 20240606074446, 'eindtijd': 20240606110610}, {'node1': 340, 'node2': 378, 'begintijd': 20240918161950, 'eindtijd': 20240918172041}, {'node1': 378, 'node2': 466, 'begintijd': 20240918071254, 'eindtijd': 20240918140257}, {'node1': 667, 'node2': 767, 'begintijd': 20241101133614, 'eindtijd': 20241101175233}, {'node1': 368, 'node2': 627, 'begintijd': 20240112043512, 'eindtijd': 20240112122716}, {'node1': 368, 'node2': 767, 'begintijd': 20240205225900, 'eindtijd': 20240206062846}, {'node1': 64, 'node2': 627, 'begintijd': 20240407064043, 'eindtijd': 20240407074414}, {'node

## group communities

In [16]:
# Turn the community RDD into a DataFrame (schema is inferred from the dicts)
# and preview it.
df_community = spark.createDataFrame(community_rdd)
df_community.show()

# Annotate every community with the length of its "nodes" and "edges" arrays.
# The chain is parenthesised rather than backslash-continued, so no dangling
# continuation character can silently swallow the following line.
df_with_sizes = (
    df_community
    .withColumn("node_size", F.size("nodes"))
    .withColumn("edge_size", F.size("edges"))
)

# How many communities exist for each node count.
node_size_groups = df_with_sizes.groupBy("node_size").count()
node_size_groups.show()

# How many communities exist for each edge count.
edge_size_groups = df_with_sizes.groupBy("edge_size").count()
edge_size_groups.show()

+------------+--------------------+--------------------+
|community_id|               edges|               nodes|
+------------+--------------------+--------------------+
|           1|[{eindtijd -> 202...|[371, 340, 378, 6...|
|           2|[{eindtijd -> 202...|[419, 134, 779, 4...|
|           3|[{eindtijd -> 202...|[51, 411, 958, 98...|
|           4|[{eindtijd -> 202...|[87, 337, 351, 43...|
|           5|[{eindtijd -> 202...|[132, 290, 999, 4...|
|           6|[{eindtijd -> 202...|[759, 386, 483, 2...|
|           7|[{eindtijd -> 202...|[138, 398, 109, 4...|
|           8|[{eindtijd -> 202...|[204, 651, 618, 8...|
|           9|[{eindtijd -> 202...|[250, 298, 299, 1...|
|          10|[{eindtijd -> 202...|[451, 306, 543, 1...|
|          11|[{eindtijd -> 202...|[4, 577, 406, 900...|
|          12|[{eindtijd -> 202...|[797, 984, 476, 1...|
|          13|[{eindtijd -> 202...|[777, 567, 674, 9...|
|          14|[{eindtijd -> 202...|[305, 322, 86, 38...|
|          15|[{eindtijd -> 202