This document provides a starting point for the implementation of Locality Sensitive Hashing (LSH) for further analysis. It is not yet ready to be used.

## Spark setup & Libraries

In [None]:
# spark setup
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import MinHashLSH, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, when, explode, lit, array_contains


# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

# show all columns when displaying a pandas DataFrame (e.g. with df.head())
pd.options.display.max_columns = None
# remove warnings
import warnings
warnings.filterwarnings('ignore')

## Create spark session

In [None]:
# Build (or reuse) the local Spark session used by the rest of the notebook.
# The maxResultSize key previously contained a "\r" escape ("spa\rk..."), which
# turned it into an unrelated key, so the setting was never applied.
# NOTE(review): maxResultSize (40g) is larger than the driver memory (10G) - confirm intended sizes.
spark = (
    SparkSession.builder
    .appName("DIS_project_5")
    .master("local[*]")
    .config("spark.driver.memory", "10G")
    .config("spark.driver.maxResultSize", "40g")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
# Display every setting of the active SparkContext configuration (shown as the cell's output).
sc.getConf().getAll()

## Load community data

In [None]:
# Read the community JSON file from disk. JSON is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform default.
json_file_path = "../Community Detection/10K.json"
with open(json_file_path, 'r', encoding='utf-8') as f:
    community_data = json.load(f)

# Hand the parsed records to Spark and print a few sanity checks.
community_rdd = sc.parallelize(community_data)
print(community_rdd.take(3))
print(community_rdd.count())
print(type(community_rdd))

## LSH

In [None]:
# Step 1: Collect unique nodes and edges across all communities
# The communities are pulled to the driver once and reused for both passes;
# every collect() call would otherwise launch its own Spark job.
collected_communities = community_rdd.collect()
all_nodes = sorted({node for community in collected_communities for node in community['nodes']})
all_edges = sorted({(edge['node1'], edge['node2']) for community in collected_communities for edge in community['edges']})

# Step 2: Convert RDD to DataFrame with binary columns for each unique node and edge
def create_binary_features(community):
    """Build one Row of 0/1 indicator columns for a single community.

    Args:
        community: dict with the keys 'community_id', 'nodes' (list of node ids)
            and 'edges' (list of dicts carrying at least 'node1' and 'node2').

    Returns:
        A Row with one 'node_<id>' column per entry of the global ``all_nodes``,
        one 'edge_<node1>_<node2>' column per entry of the global ``all_edges``,
        and the 'community_id' of the community.
    """
    # Sets give constant-time membership checks for the loops below.
    community_nodes = set(community['nodes'])
    # The raw edges are dicts, while all_edges holds (node1, node2) tuples.
    # Comparing a tuple against the dicts never matches, so the edges are
    # reduced to the same tuple form before testing membership.
    community_edges = {(edge['node1'], edge['node2']) for edge in community['edges']}

    features = {}
    # Binary columns for nodes
    for node in all_nodes:
        features[f'node_{node}'] = 1 if node in community_nodes else 0
    # Binary columns for edges
    for edge in all_edges:
        features[f'edge_{edge[0]}_{edge[1]}'] = 1 if edge in community_edges else 0
    # Include the community_id for reference
    features['community_id'] = community['community_id']
    return Row(**features)

# Map every community to its Row of binary indicators and build a DataFrame from those rows.
binary_features_rdd = community_rdd.map(create_binary_features)
data_df = spark.createDataFrame(binary_features_rdd)


# Step 3: gather all node and edge indicator columns into a single "features" vector column.
node_cols = [f"node_{node}" for node in all_nodes]
edge_cols = [f"edge_{src}_{dst}" for src, dst in all_edges]
feature_cols = node_cols + edge_cols
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
vector_df = assembler.transform(data_df)

# Quick look at the community ids of the first two rows.
preview_rows = vector_df.select(["community_id"]).take(2)
print(preview_rows)

In [None]:
# Step 4: initialise MinHashLSH and fit the model.
# A fixed seed keeps the generated hash functions reproducible between runs.
minhash = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5, seed=42)
model = minhash.fit(vector_df)

# Transform the data and show the hashes. show() prints the table itself and
# returns None, so it is not wrapped in print().
transformed_df = model.transform(vector_df)
transformed_df.show()


In [None]:
# test lsh model
# key_test_community = {'community_id': 1, 'nodes': [1, 2, 9, 7, 3, 8], 'edges': [{'node1': 1, 'node2': 2, 'begintijd': 20240921180004, 'eindtijd': 20240921225419}, {'node1': 1, 'node2': 9, 'begintijd': 20240318051804, 'eindtijd': 20240318115006}, {'node1': 1, 'node2': 7, 'begintijd': 20240112134348, 'eindtijd': 20240112143558}, {'node1': 1, 'node2': 3, 'begintijd': 20240928030750, 'eindtijd': 20240928065938}, {'node1': 2, 'node2': 9, 'begintijd': 20241204014916, 'eindtijd': 20241204073145}, {'node1': 2, 'node2': 7, 'begintijd': 20240417225406, 'eindtijd': 20240418030243}, {'node1': 2, 'node2': 3, 'begintijd': 20240726103240, 'eindtijd': 20240726171152}, {'node1': 2, 'node2': 8, 'begintijd': 20241022091933, 'eindtijd': 20241022141128}, {'node1': 7, 'node2': 9, 'begintijd': 20240326143919, 'eindtijd': 20240326184538}, {'node1': 7, 'node2': 8, 'begintijd': 20240225113935, 'eindtijd': 20240225140514}, {'node1': 3, 'node2': 9, 'begintijd': 20240710155403, 'eindtijd': 20240710164441}, {'node1': 3, 'node2': 7, 'begintijd': 20240625050041, 'eindtijd': 20240625050327}, {'node1': 8, 'node2': 9, 'begintijd': 20240218112750, 'eindtijd': 20240218145533}]}

# Community whose feature vector is used as the query key for the neighbour search.
target_community_id = 2

target_row = (
    transformed_df
    .filter(transformed_df.community_id == target_community_id)
    .select("features")
    .first()
)
# first() returns None when no row matches; fail with a clear message instead of a TypeError.
if target_row is None:
    raise ValueError(f"community_id {target_community_id} not found in transformed_df")
first_key_vector = target_row["features"]
print(first_key_vector)
# print(transformed_df.show(8))

# Remove the target community from the search set so it is not returned as its own neighbour.
no_target_transformed_df = transformed_df.filter(transformed_df.community_id != target_community_id)

number_of_neighbours = 5
# model.approxNearestNeighbors(no_target_transformed_df, first_key_vector, numNearestNeighbors=number_of_neighbours).show()
model.approxNearestNeighbors(
    no_target_transformed_df,
    first_key_vector,
    numNearestNeighbors=number_of_neighbours,
).select("community_id").show()

In [None]:
# nearest neighbours for multiple records at once
# print(model.approxSimilarityJoin(transformed_df, transformed_df, 0.6).show())
# show() prints the table and returns None, so it is neither wrapped in print()
# nor passed on as a dataset.
transformed_df.show(1)
# Join the full dataset against a one-row DataFrame (limit(1) keeps a DataFrame,
# unlike show(1)). The threshold of 100 is kept from the original call.
first_row_df = transformed_df.limit(1)
model.approxSimilarityJoin(transformed_df, first_row_df, 100).select("datasetA.community_id", "datasetB.community_id").show()

In [None]:
# find N nearest neighbors, by looping through each record in the data and finding the nearest neighbors for each record and saving them

## test method, alternative data

In [None]:
vec_df1 = vector_df.take(2)
# Label the output with the row's actual community_id instead of assuming the
# second row taken belongs to community 1.
second_row = vec_df1[1]
print(f"features in sparsevector of community {second_row.community_id}: \n{second_row.features}")
# sparse_vec_df1 = Vectors.sparse(vec_df1)
# print(sparse_vec_df1)

# Reduce every row to an (id, feature vector) pair.
formatted_data = vector_df.select("community_id", "features").rdd.map(lambda row: (row.community_id, row.features))
# Preview a few pairs only; collect() would pull every record to the driver and print all of them.
print(formatted_data.take(3))

# Row(nodes, edges, community_id=1, features=SparseVector(1722, {39: 1.0, 55: 1.0, 60: 1.0, 61: 1.0, 82: 1.0, 142: 1.0, 195: 1.0, 223: 1.0, 296: 1.0, 320: 1.0, 322: 1.0, 325: 1.0, 330: 1.0, 332: 1.0, 351: 1.0, 375: 1.0, 405: 1.0, 407: 1.0, 475: 1.0, 512: 1.0, 528: 1.0, 541: 1.0, 558: 1.0, 579: 1.0, 582: 1.0, 614: 1.0, 664: 1.0, 702: 1.0, 727: 1.0, 760: 1.0, 772: 1.0, 780: 1.0, 804: 1.0, 846: 1.0, 847: 1.0})), 

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

warnings.filterwarnings('ignore')


def _as_distance_table(joined_df):
    """Reduce an approxSimilarityJoin result to the columns idA, idB and JaccardDistance."""
    return joined_df.select(
        col("datasetA.id").alias("idA"),
        col("datasetB.id").alias("idB"),
        col("distCol").alias("JaccardDistance"),
    )


# Small hand-made dataset: three sparse vectors of size 6.
temp_data = [
    (0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0])),
    (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0])),
    (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0])),
]
# print(f"temp_data:\n {temp_data}")

# Second dataset: the (community_id, features) pairs prepared in the previous cell.
temp_data2 = formatted_data
# print(f"temp_data2: \n {temp_data2}")


# Create one DataFrame per dataset, both with the columns "id" and "features".
id_feature_columns = ["id", "features"]
temp_df = spark.createDataFrame(temp_data, id_feature_columns)
# print(f"temp_df:\n {temp_df.collect()}")

temp_df2 = spark.createDataFrame(temp_data2, id_feature_columns)
# print(f"temp_df2:\n {temp_df2.collect()}")


# Fit one model per dataset from the same seeded estimator.
# NOTE(review): this rebinds `model`, which the earlier LSH cell bound to the
# model fitted on vector_df - re-running that cell's follow-ups afterwards uses this one.
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
mh.setSeed(42)
model = mh.fit(temp_df)
# print(f"model:\n {model}")

model2 = mh.fit(temp_df2)
# print(f"model2:\n {model2}")


# Example approximate-neighbour query
# key = Vectors.sparse(6, [0, 1], [1.0, 1.0])
# model.approxNearestNeighbors(temp_df, key, 3).collect()


# Cross-reference all records of the small dataset with each other; self-pairs are dropped.
result = model.approxSimilarityJoin(temp_df, temp_df, threshold=float('inf')).filter("datasetA.id != datasetB.id")
results_df = _as_distance_table(result)
print(f"count of temp_df: {temp_df.count()}")
print(f"count of results_df: {results_df.count()}")
results_df.show()

# Same join for the community dataset.
# NOTE(review): unlike the join above, self-pairs are not filtered out here - confirm this is intended.
result2 = model2.approxSimilarityJoin(temp_df2, temp_df2, threshold=float('inf'))
results_df2 = _as_distance_table(result2)
print(f"count of temp_df2: {temp_df2.count()}")
print(f"count of results_df2: {results_df2.count()}")
results_df2.show()

# print results
# print(f"result type: {type(results_df)}")
# print(f"result2 type: {type(results_df2)}")

# print results
# print(f"result type: {type(results_df)}")
# print(f"result2 type: {type(results_df2)}")




In [None]:
print(results_df2.rdd.getNumPartitions())
# Redistribute the rows into 100 partitions directly, instead of collecting
# everything to the driver and parallelizing it again.
tempyNu = results_df2.rdd.repartition(100)
# tempyNu is already an RDD (it has no `.rdd` attribute), so the partition
# count is read from it directly.
print(tempyNu.getNumPartitions())