In [19]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("notebook-test")
    .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow")
    .config("spark.executor.extraJavaOptions", "-Djava.security.manager=allow")
    .getOrCreate()
)
print("Spark version:", spark.version)
spark.range(5).show()


Spark version: 4.0.1
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



25/12/10 20:54:02 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [20]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("notebook-test")
    .getOrCreate()
)
print("Spark version:", spark.version)


Spark version: 4.0.1


In [21]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("big-data-project")
    .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow")
    .config("spark.executor.extraJavaOptions", "-Djava.security.manager=allow")
    .getOrCreate()
)


25/12/10 20:54:02 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [22]:
# Load the full Grocery_and_Gourmet_Food JSONL file

input_path = "../Grocery_and_Gourmet_Food.jsonl"
df = spark.read.json(input_path)
df.printSchema()
df.show(5, truncate=False)
df.count()

                                                                                

root
 |-- asin: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- images: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- attachment_type: string (nullable = true)
 |    |    |-- large_image_url: string (nullable = true)
 |    |    |-- medium_image_url: string (nullable = true)
 |    |    |-- small_image_url: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- text: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- verified_purchase: boolean (nullable = true)

+----------+------------+------+-----------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

14318520

In [23]:
# 2) Add row numbers so we can split into 3 parts

from pyspark.sql.window import Window
import pyspark.sql.functions as F
w = Window.orderBy(F.monotonically_increasing_id())
df_indexed = df.withColumn("row_num", F.row_number().over(w))
df.show ()

+----------+------------+--------------------+-----------+------+--------------------+-------------+--------------------+--------------------+-----------------+
|      asin|helpful_vote|              images|parent_asin|rating|                text|    timestamp|               title|             user_id|verified_purchase|
+----------+------------+--------------------+-----------+------+--------------------+-------------+--------------------+--------------------+-----------------+
|B00CM36GAQ|           0|                  []| B00CM36GAQ|   5.0|Excellent!! Yummy...|1587854482395|  Excellent!  Yummy!|AFKZENTNBQ7A7V7UX...|             true|
|B074J5WVYH|           0|                  []| B0759B7KLH|   5.0|Excellent!  The b...|1587854400380|   Delicious!!! Yum!|AFKZENTNBQ7A7V7UX...|             true|
|B079TRNVHX|           1|                  []| B079TRNVHX|   5.0|These are very ta...|1587853224527|Extremely Delicio...|AFKZENTNBQ7A7V7UX...|             true|
|B07194LN2Z|           0|         

In [24]:
# 3) Compute sizes for three roughly equal parts

total_rows = df_indexed.count()
third = total_rows // 3
third, total_rows

                                                                                

(4772840, 14318520)

In [25]:
# 4) Build three DataFrames (1/3 each)
part1 = df_indexed.filter(F.col("row_num") <= third)
part2 = df_indexed.filter(
    (F.col("row_num") > third) & (F.col("row_num") <= 2 * third)
)
part3 = df_indexed.filter(F.col("row_num") > 2 * third)

In [26]:
# 5) Write each part as CSV (Spark will create folders with CSV files inside)

output_cols = [
    "rating", "title", "text", "asin", "parent_asin",
    "user_id", "timestamp", "helpful_vote", "verified_purchase"
]
part1.select(output_cols).write.csv("../data/grocery_reviews_part1", header=True, mode="overwrite")
part2.select(output_cols).write.csv("../data/grocery_reviews_part2", header=True, mode="overwrite")
part3.select(output_cols).write.csv("../data/grocery_reviews_part3", header=True, mode="overwrite")


25/12/10 20:54:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/10 20:54:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/10 20:54:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/10 20:54:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/10 20:54:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/10 20:55:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/10 2