In [0]:
%pyspark 

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lower

# Obtain the SparkSession used by the rest of this cell. getOrCreate() hands back
# an already-running session when there is one instead of starting a second.
# NOTE(review): the extraClassPath value looks like a placeholder path -- confirm
# it points at the real Hadoop lib directory before relying on it.
session_builder = SparkSession.builder.appName("DataCleaning")
session_builder = session_builder.config(
    "spark.driver.extraClassPath",
    "path/to/your/hadoop/home/lib/*",
)
spark = session_builder.getOrCreate()

# Load the five per-book CSV files (book1.csv ... book5.csv) as DataFrames.
# header=True makes Spark take column names from the first line of each file;
# no schema inference is requested, so every column is read as a string.
book_paths = ["/zeppelin/notebook/book{}.csv".format(n) for n in range(1, 6)]
book1, book2, book3, book4, book5 = [
    spark.read.csv(path, header=True) for path in book_paths
]

# Stack the DataFrames vertically. unionByName matches columns by *name*, so a
# file whose header lists the same columns in a different order is still aligned
# correctly (plain union() matches by position and would silently mix columns),
# and a file with a missing/extra column fails loudly instead of corrupting data.
concatenated_df = book1
for book_df in (book2, book3, book4, book5):
    concatenated_df = concatenated_df.unionByName(book_df)

# Display the schema and the first rows of the combined DataFrame
concatenated_df.printSchema()
concatenated_df.show()

# ---- Data cleaning ----

# 1. Normalise the two name columns: strip surrounding whitespace and lowercase,
#    so the same name spelled with different casing/padding compares equal.
names_normalised_df = concatenated_df
for name_column in ("Source", "Target"):
    names_normalised_df = names_normalised_df.withColumn(
        name_column,
        lower(trim(col(name_column).cast("string"))),
    )

# 2. Keep only rows whose weight is non-negative. Rows where the comparison
#    evaluates to null (e.g. missing weight) are discarded by filter() as well.
non_negative_df = names_normalised_df.filter(col("weight") >= 0)

# 3. Require the key columns to be present. Dropping on this subset already
#    guarantees Source and Target are non-null, so no extra null filter is needed.
keyed_df = non_negative_df.na.drop(subset=["Source", "Target", "weight"])

# Preview the data before the final null sweep.
keyed_df.show(5)

# 4. Drop any remaining row that has a null in *any* column (no columns are
#    removed here -- only incomplete rows), then show the resulting schema.
concatenated_df = keyed_df.na.drop()
concatenated_df.printSchema()

# Persist the cleaned edges. Spark treats the target as a *directory*;
# coalesce(1) collapses the data into one partition so that directory holds a
# single part file. mode("overwrite") replaces any previous output.
result_path = "/zeppelin/notebook/resultUnion.csv"
single_partition_df = concatenated_df.coalesce(1)
single_partition_df.write.mode("overwrite").csv(result_path, header=True)

# Read the written output back and preview it as a sanity check.
cleaned_df = spark.read.csv(result_path, header=True)
cleaned_df.show(10)

In [1]:
import org.neo4j.driver._
import org.neo4j.driver.Values
import scala.io.Source

// Connection settings for the Neo4j Bolt endpoint. Each value can be supplied
// through an environment variable so credentials do not have to live in the
// notebook; the literals are only fallbacks that keep the existing setup working.
val uri = sys.env.getOrElse("NEO4J_URI", "bolt://neo4j:7687")
val user = sys.env.getOrElse("NEO4J_USER", "neo4j")
val password = sys.env.getOrElse("NEO4J_PASSWORD", "bitnami1")

// Open the driver and a session; both are closed at the end of this cell.
val driver = GraphDatabase.driver(uri, AuthTokens.basic(user, password))
val session = driver.session()

// Locate the CSV produced by the PySpark cell. Spark writes a *directory* that
// contains a part file with a generated name (part-00000-<id>-c000.csv) which
// changes whenever the output is rewritten, so the part file is discovered here
// rather than hard-coded. Note: scala.io.Source reads from the local filesystem.
val csvDir = new java.io.File("/zeppelin/notebook/resultUnion.csv")
val csvFile: String = Option(csvDir.listFiles()) // listFiles() is null if the directory is missing
  .getOrElse(Array.empty[java.io.File])
  .filter(f => f.isFile && f.getName.startsWith("part-") && f.getName.endsWith(".csv"))
  .sortBy(_.getName)
  .headOption
  .map(_.getPath)
  .getOrElse(throw new java.io.FileNotFoundException(
    s"No part-*.csv file found in ${csvDir.getPath}"))

// Read every line eagerly, then release the file handle even if reading fails.
val csvSource = Source.fromFile(csvFile)
val lines = try csvSource.getLines().toList finally csvSource.close()

// Extract header and data. An empty file yields an empty header and no rows.
// NOTE: fields are split on every comma, so quoted values that themselves
// contain commas are not supported by this simple parser.
val header = lines.headOption.map(_.split(",")).getOrElse(Array.empty[String])
val data = lines.drop(1).map(_.split(","))

// Insert every CSV row into Neo4j as (source)-[type {weight, book}]->(target).
// Columns used by position: 0 = source name, 1 = target name,
// 2 = relationship type, 3 = weight (integer), 4 = book (integer).
// The session and driver are always closed, even if an insert fails part-way.
try {
  data.foreach { row =>
    // Check if the row has enough elements
    if (row.length >= 5) {
      // Cypher cannot take a relationship type as a query parameter, so it is
      // embedded in the text -- wrapped in backticks with inner backticks doubled
      // so the value can never break out of the identifier.
      val relType = "`" + row(2).replace("`", "``") + "`"

      // All property values travel as query parameters ($$ renders a literal $),
      // so names containing quotes or other special characters are stored
      // verbatim and cannot alter the query.
      val cypherQuery =
        s"""
           |MERGE (source:Node1 {name: $$source})
           |MERGE (target:Node1 {name: $$target})
           |MERGE (source)-[r:$relType {weight: $$weight, book: $$book}]->(target)
         """.stripMargin

      // Integer.valueOf parses the text and throws NumberFormatException on
      // non-numeric input; boxed values are required by the Java varargs API.
      val params = Values.parameters(
        "source", row(0),
        "target", row(1),
        "weight", java.lang.Integer.valueOf(row(3)),
        "book", java.lang.Integer.valueOf(row(4))
      )

      // consume() forces the statement to finish so any server-side error
      // surfaces here, on the row that caused it.
      session.run(cypherQuery, params).consume()
    } else {
      println(s"Skipping row: ${row.mkString(", ")} - Insufficient columns")
    }
  }
} finally {
  // Close the session and driver when done; the driver is closed even if
  // closing the session throws.
  try session.close() finally driver.close()
}