In [0]:
// Load every gzip-compressed ratings CSV under the project directory into one
// DataFrame. The first line of each file is treated as the header, and column
// types are inferred from the data rather than declared up front.
val ratingsDF = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/user/sc10670_nyu_edu/project/*.csv.gz")

In [1]:
// Print the column names and types that schema inference produced for the ratings data.
ratingsDF.printSchema()

In [2]:
// Render the raw ratings DataFrame through the notebook's `z` display helper.
z.show(ratingsDF)

In [3]:
// Tag every rating with the base name of the file it was read from.
//   1. regexp_extract keeps only the last path segment of input_file_name().
//   2. regexp_replace strips the trailing ".csv" or ".csv.gz" extension.
// The inputs are matched by "*.csv.gz", so removing only ".csv" would leave a
// stray ".gz" on the name (e.g. "All_Beauty.gz"). The pattern is anchored with
// `$` so that only a real file extension at the end of the name is removed.
val newratingsDF = ratingsDF
  .withColumn("source_file", regexp_replace(
    regexp_extract(input_file_name(), "([^/]+$)", 0),
    "\\.csv(\\.gz)?$", ""
  ))


In [4]:
// Print the schema again to confirm the added `source_file` column.
newratingsDF.printSchema()

In [5]:
// Render the ratings, now including `source_file`, through the notebook's `z` display helper.
z.show(newratingsDF)

In [6]:
// DDL-style schema string for the ASIN lookup file: two string columns.
// NOTE(review): the same value is declared again in the following cell, so this
// cell looks redundant — confirm before removing.
val asinSchema = "parent_asin STRING, category STRING"

In [7]:
// Explicit schema for the ASIN -> category lookup table (both columns are strings).
val asinSchema = "parent_asin STRING, category STRING"

// Read the lookup CSV with the declared schema instead of inferring one.
// NOTE(review): no "header" option is set here, so if asin.csv starts with a
// header line it would be loaded as a data row — confirm against the file.
val asinDF = spark.read
  .schema(asinSchema)
  .format("csv")
  .load("/user/sc10670_nyu_edu/project/asin.csv")


In [8]:
// Print the schema of the ASIN lookup table (as declared via `asinSchema`).
asinDF.printSchema()

In [9]:
// Render the ASIN lookup table through the notebook's `z` display helper.
z.show(asinDF)

In [10]:
// Report the row count of each input DataFrame, ratings first, then the ASIN table.
Seq("ratings" -> newratingsDF, "asin" -> asinDF).foreach { case (label, frame) =>
  println(s"Number of records in the $label dataframe: ${frame.count()}")
}

In [11]:
// Display the descriptive statistics that Dataset.summary() computes for the ratings columns.
z.show(newratingsDF.summary())

In [12]:
// Keep the rating rows where at least one column is null: build one `isNull`
// predicate per column and OR them together into a single filter condition.
val missingRatingsDF = newratingsDF.where(
  newratingsDF.columns
    .map(name => col(name).isNull)
    .reduceLeft(_ or _)
)
println(s"Number of records with null values in newratingsDF: ${missingRatingsDF.count()}")


In [13]:
// Keep the lookup rows that are missing either the key or the category.
val missingAsinDF = asinDF.where(
  col("parent_asin").isNull.or(col("category").isNull)
)
println(s"Number of records with null values in asinDF: ${missingAsinDF.count()}")


In [14]:
// Attach a category to each rating by looking up its parent_asin in the ASIN
// table. The left outer join keeps ratings without a match; their `category`
// comes back null.
// NOTE(review): if asinDF can hold several rows for one parent_asin, this join
// repeats the matching rating rows — confirm the key is unique.
val mergedDF = newratingsDF
  .join(asinDF, Seq("parent_asin"), "left_outer")

// Ratings that did not get a category from the lookup.
val nullData = mergedDF.where(col("category").isNull)

// Alternative kept for reference: fill the missing categories with a fixed label.
//val filledDF = mergedDF.na.fill(Map("category" -> "Unknown"))
//filledDF.show()

// Print a sample of the rows that have no category.
nullData.show()

In [15]:
// Fill missing categories from the source file name: keep `category` where it
// is present, otherwise use `source_file` with underscores turned into spaces.
val updatedCategoriesDF = mergedDF.withColumn(
  "category",
  coalesce(col("category"), regexp_replace(col("source_file"), "_", " "))
)


In [16]:
// Drop the columns that are no longer needed once the category is resolved.
val nonRedundantDF = updatedCategoriesDF
  .drop(Seq("user_id", "parent_asin", "source_file"): _*)

In [17]:
// Derive a calendar `date` column from `timestamp` and drop the raw value.
// The timestamp is divided by 1000 before from_unixtime, which works in seconds
// (presumably the source stores epoch milliseconds — confirm).
val cleanedDF = nonRedundantDF
  .withColumn("date", to_date(from_unixtime(col("timestamp") / 1000)))
  .drop("timestamp")


In [18]:
// Print a sample of the cleaned rows to check the result of the transformations.
cleanedDF.show()

In [19]:
// Destination of the cleaned dataset.
val outputPath = "/user/sc10670_nyu_edu/project/amazon-clean.parquet"

// Persist the cleaned data as Parquet, replacing any previous output at that path.
cleanedDF.write
  .format("parquet")
  .mode("overwrite")
  .save(outputPath)


In [20]:
// Number of rows with a non-null category, per category, largest first.
val catCountDF = cleanedDF
  .groupBy("category")
  .agg(count("category").as("category_count"))
  .orderBy(col("category_count").desc)

In [21]:
// Render the per-category counts through the notebook's `z` display helper.
z.show(catCountDF)

In [22]:
// Mean rating per category, highest average first.
val catRatingsDF = cleanedDF
  .groupBy("category")
  .agg(avg(col("rating")).alias("avg_rating"))
  .sort(col("avg_rating").desc)


In [23]:
// Render the per-category average ratings through the notebook's `z` display helper.
z.show(catRatingsDF)

In [24]:
// All cleaned rows ordered by date ascending, so the oldest entries come first.
val oldestDateDF = cleanedDF.sort(col("date").asc)

In [25]:
// Render the date-ordered rows through the notebook's `z` display helper.
z.show(oldestDateDF)