## spark setup & libraries

In [49]:
# spark setup
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.ml.feature import MinHashLSH, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, when, explode, lit, array_contains
# from itertools import groupby
from pyspark.sql import Window
from pyspark.sql.functions import row_number, ceil, col, udf
import math

# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

# show all columns when displaying a pandas DataFrame (e.g. df.head())
pd.options.display.max_columns = None
# remove warnings
import warnings
warnings.filterwarnings('ignore')

## create spark session + load data

In [3]:
# Build (or reuse) the local Spark session used by the rest of the notebook.
spark = (
  SparkSession.builder
  .appName("DIS_project_5")
  .master("local[*]")
  .config("spark.driver.memory", "10G")
  # NOTE: config keys must not contain backslashes; in a normal string literal
  # "\r" is a carriage return, which would turn this into an unknown key.
  .config("spark.driver.maxResultSize", "40g")
  # -1 switches off automatic broadcast joins
  .config("spark.sql.autoBroadcastJoinThreshold", "-1")
  .getOrCreate()
)

# SparkContext handle, needed for the RDD API (sc.parallelize)
sc = spark.sparkContext

your 131072x1 screen size is bogus. expect trouble
24/11/07 09:55:59 WARN Utils: Your hostname, MadioLaptop resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/11/07 09:55:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/07 09:56:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the community list from disk and distribute it as an RDD.
json_file_path = "../Community Detection/10K.json"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(json_file_path, 'r', encoding='utf-8') as f:
    community_data = json.load(f)

community_rdd = sc.parallelize(community_data)

# quick sanity checks: a few records, the record count, and the object type
print(community_rdd.take(3))
print(community_rdd.count())
print(type(community_rdd))

                                                                                

[{'community_id': 1, 'nodes': [371, 340, 378, 667, 974, 368, 627, 64, 168, 257, 925, 767, 549, 890, 899, 226, 45, 71, 72, 381, 707, 973, 812, 839, 96, 433, 670, 468, 466, 876, 402, 646, 593, 366, 612], 'edges': [{'node1': 371, 'node2': 549, 'begintijd': 20240318013535, 'eindtijd': 20240318030219}, {'node1': 371, 'node2': 812, 'begintijd': 20240412020025, 'eindtijd': 20240412053609}, {'node1': 371, 'node2': 670, 'begintijd': 20240606074446, 'eindtijd': 20240606110610}, {'node1': 340, 'node2': 378, 'begintijd': 20240918161950, 'eindtijd': 20240918172041}, {'node1': 378, 'node2': 466, 'begintijd': 20240918071254, 'eindtijd': 20240918140257}, {'node1': 667, 'node2': 767, 'begintijd': 20241101133614, 'eindtijd': 20241101175233}, {'node1': 368, 'node2': 627, 'begintijd': 20240112043512, 'eindtijd': 20240112122716}, {'node1': 368, 'node2': 767, 'begintijd': 20240205225900, 'eindtijd': 20240206062846}, {'node1': 64, 'node2': 627, 'begintijd': 20240407064043, 'eindtijd': 20240407074414}, {'node

                                                                                

53
<class 'pyspark.rdd.RDD'>


## group communities

In [13]:
# Convert the community RDD into a DataFrame and inspect it.
df_community = spark.createDataFrame(community_rdd)
df_community.show()

# Annotate every community with the length of its node list and edge list.
df_with_sizes = (
    df_community
    .withColumn("node_size", F.size("nodes"))
    .withColumn("edge_size", F.size("edges"))
)
df_with_sizes.show()

# How many communities exist for each node count?
node_size_groups = df_with_sizes.groupBy("node_size").count()
node_size_groups.show()

# How many communities exist for each edge count?
edge_size_groups = df_with_sizes.groupBy("edge_size").count()
edge_size_groups.show()

                                                                                

+------------+--------------------+--------------------+
|community_id|               edges|               nodes|
+------------+--------------------+--------------------+
|           1|[{eindtijd -> 202...|[371, 340, 378, 6...|
|           2|[{eindtijd -> 202...|[419, 134, 779, 4...|
|           3|[{eindtijd -> 202...|[51, 411, 958, 98...|
|           4|[{eindtijd -> 202...|[87, 337, 351, 43...|
|           5|[{eindtijd -> 202...|[132, 290, 999, 4...|
|           6|[{eindtijd -> 202...|[759, 386, 483, 2...|
|           7|[{eindtijd -> 202...|[138, 398, 109, 4...|
|           8|[{eindtijd -> 202...|[204, 651, 618, 8...|
|           9|[{eindtijd -> 202...|[250, 298, 299, 1...|
|          10|[{eindtijd -> 202...|[451, 306, 543, 1...|
|          11|[{eindtijd -> 202...|[4, 577, 406, 900...|
|          12|[{eindtijd -> 202...|[797, 984, 476, 1...|
|          13|[{eindtijd -> 202...|[777, 567, 674, 9...|
|          14|[{eindtijd -> 202...|[305, 322, 86, 38...|
|          15|[{eindtijd -> 202

### make N groups by size

In [52]:
# group communities by number of nodes and edges
def group_by_size(df, num_groups, column_name):
  """Assign each row of ``df`` to a size-ordered group.

  Rows are ranked by ``column_name`` in ascending order and cut into
  consecutive chunks of ``ceil(row_count / num_groups)`` rows. The 1-based
  chunk index is stored in a new ``group`` column, so at most ``num_groups``
  distinct groups are produced.

  Args:
    df: Spark DataFrame that contains ``column_name``.
    num_groups: Desired number of groups; must be greater than zero.
    column_name: Name of the column used to rank the rows.

  Returns:
    ``df`` with one extra column, ``group``.

  Raises:
    ValueError: If ``num_groups`` is zero or negative.
  """
  if num_groups <= 0:
    raise ValueError("num_groups must be greater than zero, got {!r}".format(num_groups))

  # rows per group; clamped to 1 so an empty dataframe never produces a zero divisor
  group_size = max(1, math.ceil(df.count() / num_groups))

  # NOTE: a window without partitionBy moves all rows to a single partition
  # (Spark logs a warning about this). Rows that tie on column_name receive
  # their row numbers in an order Spark does not guarantee.
  ranking_window = Window.orderBy(column_name)
  df_with_row_number = df.withColumn("row_number", row_number().over(ranking_window))

  # row numbers 1..group_size -> group 1, the next group_size rows -> group 2, ...
  df_groups = df_with_row_number.withColumn("group", ceil(col("row_number") / group_size)).drop("row_number")

  return df_groups


# Example: cut the communities into three groups ordered by node count.
result = group_by_size(df=df_with_sizes, num_groups=3, column_name='node_size')
result.show(n=55)

24/11/07 11:33:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 11:33:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 11:33:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------------+--------------------+--------------------+---------+---------+-----+
|community_id|               edges|               nodes|node_size|edge_size|group|
+------------+--------------------+--------------------+---------+---------+-----+
|          16|[{eindtijd -> 202...|          [623, 936]|        2|        1|    1|
|          26|[{eindtijd -> 202...|           [24, 775]|        2|        1|    1|
|          30|[{eindtijd -> 202...|          [216, 710]|        2|        1|    1|
|          32|[{eindtijd -> 202...|          [740, 743]|        2|        1|    1|
|          33|[{eindtijd -> 202...|           [50, 749]|        2|        1|    1|
|          34|[{eindtijd -> 202...|          [193, 949]|        2|        1|    1|
|          35|[{eindtijd -> 202...|           [47, 620]|        2|        1|    1|
|          37|[{eindtijd -> 202...|          [150, 795]|        2|        1|    1|
|          38|[{eindtijd -> 202...|          [273, 976]|        2|        1|    1|
|   

In [61]:
# make unique pairs of communities in same group
def create_pairs_grouped_communities(df_groups):
  """Pair up communities that share the same ``group`` value.

  The dataframe is self-joined on ``group``; only combinations where the first
  community_id is smaller than the second are kept, so a pair is listed once
  and a community is never paired with itself.

  Args:
    df_groups: DataFrame with ``group``, ``community_id``, ``nodes`` and
      ``edges`` columns.

  Returns:
    DataFrame with columns ``pairs`` (array of both ids), ``group``,
    ``community_id_1/2``, ``nodes_1/2`` and ``edges_1/2``.
  """
  left = df_groups.alias("df1")
  right = df_groups.alias("df2")

  # join predicates: same group, and a strict id ordering to avoid duplicates
  same_group = F.col("df1.group") == F.col("df2.group")
  ordered_ids = F.col("df1.community_id") < F.col("df2.community_id")

  joined = left.join(right, same_group & ordered_ids)

  output_columns = [
    F.array(F.col("df1.community_id"), F.col("df2.community_id")).alias("pairs"),
    F.col("df1.group").alias("group"),
    F.col("df1.community_id").alias("community_id_1"),
    F.col("df2.community_id").alias("community_id_2"),
    F.col("df1.nodes").alias("nodes_1"),
    F.col("df2.nodes").alias("nodes_2"),
    F.col("df1.edges").alias("edges_1"),
    F.col("df2.edges").alias("edges_2"),
  ]
  return joined.select(*output_columns)

# Example: build the pairs for the node-size groups computed above.
result_group_communities = create_pairs_grouped_communities(df_groups=result)
result_group_communities.show()

24/11/07 12:37:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 12:37:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 12:37:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 12:37:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------+-----+--------------+--------------+----------+----------+--------------------+--------------------+
|   pairs|group|community_id_1|community_id_2|   nodes_1|   nodes_2|             edges_1|             edges_2|
+--------+-----+--------------+--------------+----------+----------+--------------------+--------------------+
|[16, 26]|    1|            16|            26|[623, 936]| [24, 775]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|[16, 30]|    1|            16|            30|[623, 936]|[216, 710]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|[16, 32]|    1|            16|            32|[623, 936]|[740, 743]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|[16, 33]|    1|            16|            33|[623, 936]| [50, 749]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|[16, 34]|    1|            16|            34|[623, 936]|[193, 949]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|[16, 35]|    1|            16|            35|[623, 936]| [47, 620]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|

24/11/07 12:37:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 12:37:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 12:37:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 12:37:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [62]:
# complete pipeline: turn an rdd of communities into a dataframe of pairs of communities of similar node or edge size
def rdd_community_to_dataframe_paired_community_by_column(rdd_communities, number_groups_of_communities_wanted, column_name):
  """Run the full pipeline: community RDD -> DataFrame of same-group pairs.

  Args:
    rdd_communities: RDD of community records with ``nodes`` and ``edges`` fields.
    number_groups_of_communities_wanted: Number of groups passed to ``group_by_size``.
    column_name: Size column to group on, e.g. 'node_size' or 'edge_size'.

  Returns:
    DataFrame in which every row is one pair of communities from the same group.
  """
  # RDD -> DataFrame, enriched with the node and edge counts
  communities = spark.createDataFrame(rdd_communities)
  communities_sized = (
    communities
    .withColumn("node_size", F.size("nodes"))
    .withColumn("edge_size", F.size("edges"))
  )

  # bucket the communities by the requested size column
  communities_grouped = group_by_size(communities_sized, number_groups_of_communities_wanted, column_name)

  # each output row shows which two communities were grouped together
  return create_pairs_grouped_communities(communities_grouped)


# Example: pair communities after splitting them into three node-size groups.
example_result = rdd_community_to_dataframe_paired_community_by_column(
    rdd_communities=community_rdd,
    number_groups_of_communities_wanted=3,
    column_name='node_size',
)
example_result.show()

24/11/07 12:39:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 12:39:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 12:39:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 12:39:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 12:39:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 12:39:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/07 1

+--------+-----+--------------+--------------+----------+----------+--------------------+--------------------+
|   pairs|group|community_id_1|community_id_2|   nodes_1|   nodes_2|             edges_1|             edges_2|
+--------+-----+--------------+--------------+----------+----------+--------------------+--------------------+
|[16, 26]|    1|            16|            26|[623, 936]| [24, 775]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|[16, 30]|    1|            16|            30|[623, 936]|[216, 710]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|[16, 32]|    1|            16|            32|[623, 936]|[740, 743]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|[16, 33]|    1|            16|            33|[623, 936]| [50, 749]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|[16, 34]|    1|            16|            34|[623, 936]|[193, 949]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|[16, 35]|    1|            16|            35|[623, 936]| [47, 620]|[{eindtijd -> 202...|[{eindtijd -> 202...|
|