In [0]:
%spark.pyspark

from pyspark.sql import SparkSession

# Obtain the Spark session for this paragraph (single-threaded local master).
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Neo4jConnection")
    .getOrCreate()
)

# Cypher query: every relationship that has a Node1-labelled node on at least
# one end, returned as one row per relationship with its type, weight and book.
query = """
MATCH (source)-[relation]->(target)
WHERE (source:Node1 OR target:Node1)
RETURN source.name as Source, target.name as Target, type(relation) as Type, relation.weight as weight, relation.book as book
"""

# Read the query result through the Neo4j Spark connector.
# Parenthesised chaining replaces the backslash continuations: the original
# ended with `.load()\`, a dangling continuation that is a syntax error when
# it is the last line of the paragraph and otherwise glues onto the next line.
# NOTE(review): the Bolt credentials are hard-coded here; consider moving them
# to interpreter configuration or environment variables.
data = (
    spark.read.format("org.neo4j.spark.DataSource")
    .option("url", "bolt://neo4j:7687")
    .option("authentication.type", "basic")
    .option("authentication.basic.username", "neo4j")
    .option("authentication.basic.password", "bitnami1")
    .option("query", query)
    .option("partitions", "1")
    .load()
)

In [1]:
%spark.pyspark
# Sanity check of the data retrieved from Neo4j: print the first rows, then the schema
data.show()
data.printSchema()


In [2]:
%spark.pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, count, when
import pandas as pd
import networkx as nx

# Spark session for this paragraph
session_builder = SparkSession.builder.master("local[1]").appName("Neo4jConnection")
spark = session_builder.getOrCreate()

# Cypher query: relationships touching a Node1-labelled node on either end,
# one row per relationship with its type, weight and book
query = """
MATCH (source)-[relation]->(target)
WHERE (source:Node1 OR target:Node1)
RETURN source.name as Source, target.name as Target, type(relation) as Type, relation.weight as weight, relation.book as book
"""

# Neo4j connector options, applied in the order listed
neo4j_options = [
    ("url", "bolt://neo4j:7687"),
    ("authentication.type", "basic"),
    ("authentication.basic.username", "neo4j"),
    ("authentication.basic.password", "bitnami1"),
    ("query", query),
    ("partitions", "1"),
]

# Load the query result from Neo4j into a Spark DataFrame
neo4j_reader = spark.read.format("org.neo4j.spark.DataSource")
for option_name, option_value in neo4j_options:
    neo4j_reader = neo4j_reader.option(option_name, option_value)
data = neo4j_reader.load()
    

# Bucket the relationship weight into a label:
#   weight > 5        -> "High"
#   3 < weight <= 5   -> "Medium"
#   anything else     -> "Low"
weight_col = col("weight")
similarity_label = (
    when(weight_col > 5, lit("High"))
    .when((weight_col <= 5) & (weight_col > 3), lit("Medium"))
    .otherwise(lit("Low"))
)
data = data.withColumn("Similarity_Weight", similarity_label)

# Per-node edge counts: rows grouped by Source (counting Target) and by Target (counting Source)
degrees_source = data.groupBy("Source").agg(count("Target").alias("Source_Degree"))
degrees_target = data.groupBy("Target").agg(count("Source").alias("Target_Degree"))

# Attach both counts to every edge row
data = data.join(degrees_source, "Source", "left")
data = data.join(degrees_target, "Target", "left")


# Export Spark DataFrame to Pandas
pandas_data = data.toPandas()

# Edge table handed to NetworkX.
# The original call passed no edge attributes, so the graph had no 'weight' key
# and every weight='weight' computation below silently fell back to NetworkX's
# default of 1 per edge (i.e. all "weighted" measures were unweighted).
# Rows without a Source or Target are skipped because None cannot be a graph node.
edge_table = pandas_data[['Source', 'Target', 'weight']].dropna(subset=['Source', 'Target']).copy()
# Coerce weight to a number (it may arrive as text from the connector - TODO confirm)
# and give edges with a missing weight 1.0, the value NetworkX assumes when the
# attribute is absent.
edge_table['weight'] = pd.to_numeric(edge_table['weight'], errors='coerce').fillna(1.0)

# Build an undirected graph whose edges carry the relationship weight
graph = nx.from_pandas_edgelist(
    edge_table, 'Source', 'Target', edge_attr='weight', create_using=nx.Graph()
)

# Calculate different centrality measures
# NOTE(review): betweenness uses weighted shortest paths, so NetworkX treats the
# weight as a distance; confirm that is the intended reading of `weight`.
degree_centrality = nx.degree_centrality(graph)
weighted_degree = dict(graph.degree(weight='weight'))
eigenvector_centrality = nx.eigenvector_centrality_numpy(graph, weight='weight')
pagerank = nx.pagerank(graph, weight='weight')
betweenness_centrality = nx.betweenness_centrality(graph, weight='weight')

# Create Pandas DataFrames (Node, <measure>) for each centrality measure
degree_df = pd.DataFrame(list(degree_centrality.items()), columns=['Node', 'Degree_Centrality'])
weighted_degree_df = pd.DataFrame(list(weighted_degree.items()), columns=['Node', 'Weighted_Degree'])
eigenvector_df = pd.DataFrame(list(eigenvector_centrality.items()), columns=['Node', 'Eigenvector_Centrality'])
pagerank_df = pd.DataFrame(list(pagerank.items()), columns=['Node', 'PageRank'])
betweenness_df = pd.DataFrame(list(betweenness_centrality.items()), columns=['Node', 'Betweenness_Centrality'])

# Convert Pandas DataFrames to Spark DataFrames
degree_spark_df = spark.createDataFrame(degree_df)
weighted_degree_spark_df = spark.createDataFrame(weighted_degree_df)
eigenvector_spark_df = spark.createDataFrame(eigenvector_df)
pagerank_spark_df = spark.createDataFrame(pagerank_df)
betweenness_spark_df = spark.createDataFrame(betweenness_df)

# Attach each centrality measure to the edge rows, keyed on the Source node.
# Every lookup frame has a `Node` key column, which is dropped after the join.
centrality_frames = (
    degree_spark_df,
    weighted_degree_spark_df,
    eigenvector_spark_df,
    pagerank_spark_df,
    betweenness_spark_df,
)
for centrality_frame in centrality_frames:
    data = data.join(centrality_frame, data.Source == centrality_frame.Node, "left").drop("Node")

# Replace nulls with 0 (e.g. rows whose Source has no centrality entry)
data = data.fillna(0)

# Show the enriched DataFrame and its schema
data.show()
data.printSchema()


In [3]:
%spark.pyspark
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

# Spark session for this paragraph
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Predict_Degree_Centrality")
    .getOrCreate()
)

# `data` is expected to be the DataFrame prepared earlier, holding every
# feature column listed below as well as 'Degree_Centrality'.

# Columns used as predictors of Degree_Centrality
feature_columns = [
    'weight',
    'book',
    'Source_Degree',
    'Target_Degree',
    'Weighted_Degree',
    'Eigenvector_Centrality',
    'PageRank',
    'Betweenness_Centrality',
]

# Pipeline: pack the predictors into one vector column, then fit a linear regression
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
lr = LinearRegression(featuresCol='features', labelCol='Degree_Centrality')
pipeline = Pipeline(stages=[assembler, lr])

# 80% / 20% train/test split with a fixed seed
train_data, test_data = data.randomSplit([0.8, 0.2], seed=12345)

# Fit on the training rows and predict on the held-out rows
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

# Show actual value, prediction and the predictors side by side
displayed_columns = ['Degree_Centrality', 'prediction'] + feature_columns
predictions.select(*displayed_columns).show()

from pyspark.ml.evaluation import RegressionEvaluator

# One evaluator; the metric is switched per call through a param override
evaluator = RegressionEvaluator(labelCol="Degree_Centrality", predictionCol="prediction", metricName="mse")

# Mean squared error (the evaluator's configured metric)
mse = evaluator.evaluate(predictions)
print("Mean Squared Error (MSE) on test data = {:.4f}".format(mse))

# Root mean squared error
rmse_override = {evaluator.metricName: "rmse"}
rmse = evaluator.evaluate(predictions, rmse_override)
print("Root Mean Squared Error (RMSE) on test data = {:.4f}".format(rmse))

# Coefficient of determination
r2_override = {evaluator.metricName: "r2"}
r2 = evaluator.evaluate(predictions, r2_override)
print("R-squared (R²) on test data = {:.4f}".format(r2))




In [4]:
%spark.pyspark
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

# Predict PageRank with a decision-tree regressor.
# `data` is expected to be the prepared DataFrame holding the columns below.

# Predictor columns, assembled into a single 'features' vector column
feature_columns = ['Source_Degree', 'Target_Degree', 'Degree_Centrality', 'Weighted_Degree',
                   'Eigenvector_Centrality', 'Betweenness_Centrality', 'weight']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Decision-tree regressor with PageRank as the label
dt = DecisionTreeRegressor(featuresCol='features', labelCol='PageRank')

# Pipeline: feature assembly followed by the model
pipeline = Pipeline(stages=[assembler, dt])

# 80% / 20% train/test split with a fixed seed
train_data, test_data = data.randomSplit([0.8, 0.2], seed=12345)

# Train the model
model = pipeline.fit(train_data)

# Predict on the test rows
predictions = model.transform(test_data)

# Show actual value, prediction and the predictors
predictions.select('PageRank', 'prediction', *feature_columns).show()

# Evaluation metrics.
# The evaluator used to be created with metricName="rmse" and evaluated twice
# without an override, so the figure printed as "MSE" was really the RMSE.
# It is now created with "mse" and the other metrics are requested explicitly.
evaluator = RegressionEvaluator(labelCol="PageRank", predictionCol="prediction", metricName="mse")
mse = evaluator.evaluate(predictions)
print("Mean Squared Error (MSE) on test data = {:.4f}".format(mse))

rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print("Root Mean Squared Error (RMSE) on test data = {:.4f}".format(rmse))

r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
print("R-squared (R²) on test data = {:.4f}".format(r2))



In [5]:
%spark.pyspark
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

# Spark session for this paragraph
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Betweenness_Prediction_RF")
    .getOrCreate()
)

# `data` is expected to be the prepared DataFrame with every predictor below
# plus the target column 'Betweenness_Centrality'.

# Predictors (relationship weight included), packed into one 'features' vector column
feature_columns = [
    'Source_Degree',
    'Target_Degree',
    'Degree_Centrality',
    'Weighted_Degree',
    'Eigenvector_Centrality',
    'PageRank',
    'weight',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Random-forest regressor with Betweenness_Centrality as the label
rf = RandomForestRegressor(featuresCol='features', labelCol='Betweenness_Centrality')

# Pipeline: feature assembly, then the model
pipeline = Pipeline(stages=[assembler, rf])

# 80% / 20% train/test split with a fixed seed
train_data, test_data = data.randomSplit([0.8, 0.2], seed=12345)

# Fit on the training rows, predict on the test rows
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

# Show actual value, prediction and the predictors
shown_columns = ['Betweenness_Centrality', 'prediction'] + feature_columns
predictions.select(*shown_columns).show()

from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator configured for MSE; other metrics are requested through a param override
evaluator = RegressionEvaluator(
    labelCol="Betweenness_Centrality",
    predictionCol="prediction",
    metricName="mse",
)

# Mean squared error
mse = evaluator.evaluate(predictions)
print("Mean Squared Error (MSE) on test data = {:.4f}".format(mse))

# Root mean squared error
use_rmse = {evaluator.metricName: "rmse"}
rmse = evaluator.evaluate(predictions, use_rmse)
print("Root Mean Squared Error (RMSE) on test data = {:.4f}".format(rmse))

# Coefficient of determination
use_r2 = {evaluator.metricName: "r2"}
r2 = evaluator.evaluate(predictions, use_r2)
print("R-squared (R²) on test data = {:.4f}".format(r2))


In [6]:
%spark.pyspark
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

# Spark session for this paragraph
session_builder = SparkSession.builder.master("local[1]").appName("Predict_Weighted_Degree_RF")
spark = session_builder.getOrCreate()

# `data` is expected to be the prepared DataFrame with every predictor below
# plus the target column 'Weighted_Degree'.

# Predictors, packed into one 'features' vector column
feature_columns = [
    'Source_Degree',
    'Target_Degree',
    'Degree_Centrality',
    'weight',
    'Eigenvector_Centrality',
    'PageRank',
    'Betweenness_Centrality',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Random-forest regressor with Weighted_Degree as the label
rf = RandomForestRegressor(featuresCol='features', labelCol='Weighted_Degree')

# Pipeline: feature assembly, then the model
pipeline = Pipeline(stages=[assembler, rf])

# 80% / 20% train/test split with a fixed seed
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(split_weights, seed=12345)

# Fit on the training rows
model = pipeline.fit(train_data)

# Predict on the test rows
predictions = model.transform(test_data)

# Show actual value, prediction and the predictors
predictions.select(*(['Weighted_Degree', 'prediction'] + feature_columns)).show()


from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator configured for MSE; RMSE and R² are requested through a param override
evaluator = RegressionEvaluator(labelCol="Weighted_Degree", predictionCol="prediction", metricName="mse")

# Mean squared error
mse = evaluator.evaluate(predictions)
print("Mean Squared Error (MSE) on test data = {:.4f}".format(mse))

# Root mean squared error
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print("Root Mean Squared Error (RMSE) on test data = {:.4f}".format(rmse))

# Coefficient of determination
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
print("R-squared (R²) on test data = {:.4f}".format(r2))



In [7]:
%pyspark

from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# Spark session for this paragraph
spark = SparkSession.builder.master("local[1]").appName("Weight_Clustering").getOrCreate()

# Cast the "weight" column to a numeric (double) type
data = data.withColumn("weight", col("weight").cast("double"))

# Drop rows whose weight is null
data = data.dropna(subset=["weight"])

# Assemble the single feature into a vector column; invalid values are kept
# rather than raising (handleInvalid="keep")
assembler = VectorAssembler(inputCols=["weight"], outputCol="features", handleInvalid="keep")
assembled_df = assembler.transform(data)

# K-Means model with k=3 clusters
kmeans = KMeans(featuresCol="features", k=3)

# Fit the K-Means model
kmeans_model = kmeans.fit(assembled_df)

# Assign every row to a cluster
predictions = kmeans_model.transform(assembled_df)

# Show the original columns next to the assigned cluster
result = predictions.select("Source", "Target", "weight", "prediction")
result.show(100)

# Save the results as CSV. Spark writes a directory of that name containing
# the part file(s); coalesce(1) keeps it to a single part file.
# mode="overwrite" lets the paragraph be re-run: without it the write fails
# as soon as the output directory already exists.
result.coalesce(1).write.csv('/zeppelin/notebook/result4.csv', header=True, mode="overwrite")




In [8]:
%python
import glob
import os

import pandas as pd
import matplotlib.pyplot as plt

# Directory written by the K-Means paragraph (Spark saves a CSV as a directory
# of part files). The previous code read a hard-coded part file name inside
# 'result3.csv', which is not the directory the clustering paragraph writes
# and whose generated file name changes on every write.
RESULT_DIR = '/zeppelin/notebook/result4.csv'

# Locate the part file(s) by pattern instead of by their generated name
part_files = sorted(glob.glob(os.path.join(RESULT_DIR, 'part-*.csv')))
if not part_files:
    raise FileNotFoundError(
        "No part-*.csv file found in {}; run the K-Means paragraph first".format(RESULT_DIR)
    )

# Load the saved CSV output into a Pandas DataFrame
result_pandas = pd.concat((pd.read_csv(part_file) for part_file in part_files), ignore_index=True)

# Extract relevant columns for visualization
x = result_pandas['weight']
y = result_pandas['prediction']

# Scatter plot of weight against assigned cluster, coloured by cluster
plt.figure(figsize=(8, 6))
plt.scatter(x, y, c=y, cmap='viridis', edgecolors='k')
plt.title('K-Means Clustering')
plt.xlabel('Weight')
plt.ylabel('Cluster')
plt.colorbar(label='Cluster')
plt.grid(True)
plt.show()
# `z` is not defined in this paragraph; presumably the Zeppelin context object
# provided by the interpreter - verify.
z.show(plt)



