In [2]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row
import org.apache.hadoop.fs.{FileSystem, Path}

In [3]:
// Obtain the Spark session for this case study. The class is referenced by its
// fully qualified name, so this cell does not rely on SparkSession being imported.
val spark = {
  val sessionBuilder = org.apache.spark.sql.SparkSession.builder()
  sessionBuilder.appName("CaseStudy2 - User Rating History Partitioning").getOrCreate()
}

spark = org.apache.spark.sql.SparkSession@2e53ef54


org.apache.spark.sql.SparkSession@2e53ef54

In [4]:
// Location of the raw ratings CSV.
val ratingsDataPath = "gs://spark-tasks-bucket/day_16_17/rating.csv"

// Load the CSV: the first line is the header, and column types are inferred from the data.
val ratingsDF = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv(ratingsDataPath)

ratingsDataPath = gs://spark-tasks-bucket/day_16_17/rating.csv
ratingsDF = [userId: int, movieId: int ... 2 more fields]


[userId: int, movieId: int ... 2 more fields]

In [5]:
// Print the inferred schema as a tree to confirm the column names and types.
println(ratingsDF.schema.treeString)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [6]:
// Filter incomplete records: keep a row only when userId, movieId and rating are all present.
// Each of the three columns is checked exactly once, so a row with a null movieId is dropped too.
val updatedDF = ratingsDF.filter(
  col("userId").isNotNull && col("movieId").isNotNull && col("rating").isNotNull
)

updatedDF = [userId: int, movieId: int ... 2 more fields]


[userId: int, movieId: int ... 2 more fields]

In [7]:
// Duplicate check (currently disabled): no (movieId, userId) pair has more than one rating; if any did, they would need to be filtered out.

// val duplicateRatingsDF = updatedDF
//   .groupBy("movieId", "userId")
//   .count() // Count occurrences of each (movieId, userId) pair
//   .filter(col("count") > 1) // Keep only those with multiple ratings

// duplicateRatingsDF.show()

+-------+------+-----+
|movieId|userId|count|
+-------+------+-----+
+-------+------+-----+



duplicateRatingsDF = [movieId: int, userId: int ... 1 more field]


[movieId: int, userId: int ... 1 more field]

In [8]:
// Keep only the three columns used from here on; the timestamp column is dropped.
val ratingsDF = updatedDF.select(Seq("userId", "movieId", "rating").map(col): _*)

ratingsDF = [userId: int, movieId: int ... 1 more field]


[userId: int, movieId: int ... 1 more field]

In [11]:
// Preview the first 20 rows of the projected ratings.
ratingsDF.show(20)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      2|   3.5|
|     1|     29|   3.5|
|     1|     32|   3.5|
|     1|     47|   3.5|
|     1|     50|   3.5|
|     1|    112|   3.5|
|     1|    151|   4.0|
|     1|    223|   4.0|
|     1|    253|   4.0|
|     1|    260|   4.0|
|     1|    293|   4.0|
|     1|    296|   4.0|
|     1|    318|   4.0|
|     1|    337|   3.5|
|     1|    367|   3.5|
|     1|    541|   4.0|
|     1|    589|   3.5|
|     1|    593|   3.5|
|     1|    653|   3.0|
|     1|    919|   3.5|
+------+-------+------+
only showing top 20 rows



In [13]:
// Preview the distinct user ids present in the ratings (first 20 shown).
ratingsDF.select(col("userId")).dropDuplicates().show(20)

+------+
|userId|
+------+
| 81900|
| 82529|
| 82672|
| 83250|
| 83693|
| 85321|
| 85349|
| 87120|
| 87338|
| 87462|
| 87616|
| 87905|
| 90228|
| 90550|
| 91299|
| 92182|
| 92235|
| 93319|
| 94265|
| 95476|
+------+
only showing top 20 rows



In [15]:
// The full dataset has a lot of records, so work with only the first 100000 rows.
val trimmedDF = {
  val rowCap = 100000
  ratingsDF.limit(rowCap)
}

trimmedDF = [userId: int, movieId: int ... 1 more field]


[userId: int, movieId: int ... 1 more field]

In [16]:
// Destination directory on HDFS for the partitioned output.
val outputPath = "hdfs:///user/day_16_17/case_study_2"

// Collapse to a single partition, then write Parquet with one sub-directory per userId,
// replacing any output already at the destination.
trimmedDF
  .coalesce(1)
  .write
  .mode("overwrite")
  .partitionBy("userId")
  .parquet(outputPath)

outputPath = hdfs:///user/day_16_17/case_study_2


hdfs:///user/day_16_17/case_study_2

In [21]:
// Try reading the data back from HDFS, restricted to a single partition directory.
// The path is derived from outputPath, so it always points at the directory the write step used.
val path = s"$outputPath/userId=1"

// Setting basePath to the dataset root lets Spark treat "userId=1" as a partition directory
// and expose userId as a column even though only that sub-directory is read.
spark.read.option("basePath", outputPath).parquet(path)
    .select("userId").distinct().show()

// Only userId 1 comes back, which suggests the data is partitioned by userId as expected.

+------+
|userId|
+------+
|     1|
+------+



path = hdfs:///user/day_16_17/case_study_2/userId=1


hdfs:///user/day_16_17/case_study_2/userId=1