In [4]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Local import: only this cell needs the schema type constructors.
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType, TimestampType}

// Obtain (or reuse) the Spark session for this notebook; the job is submitted to YARN.
val spark = SparkSession.builder
  .appName("Time Based data partitioning for ratings")
  .master("yarn")
  .getOrCreate()

val sc = spark.sparkContext

// Location of the raw ratings CSV in Google Cloud Storage.
val ratingsFilePath = "gs://srinija/archive/rating.csv"

// Explicit schema for the ratings file. Declaring it up front avoids the extra
// pass over the whole CSV that `inferSchema` needs in order to guess column
// types, and keeps the column types stable from run to run. The types are the
// ones schema inference previously reported for this file.
val ratingsSchema = StructType(Seq(
  StructField("userId", IntegerType, nullable = true),
  StructField("movieId", IntegerType, nullable = true),
  StructField("rating", DoubleType, nullable = true),
  StructField("timestamp", TimestampType, nullable = true)
))

// Read the CSV (first line is a header row) using the declared schema.
val ratingsDF = spark.read
  .format("csv")
  .option("header", "true")
  .schema(ratingsSchema)
  .load(ratingsFilePath)


ratingsDF.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



spark = org.apache.spark.sql.SparkSession@7b55b22f
sc = org.apache.spark.SparkContext@575b04e1
ratingsFilePath = gs://srinija/archive/rating.csv
ratingsDF = [userId: int, movieId: int ... 2 more fields]


[userId: int, movieId: int ... 2 more fields]

In [5]:
// Preview the first 10 ratings without truncating long cell values.
ratingsDF.show(numRows = 10, truncate = false)

+------+-------+------+-------------------+
|userId|movieId|rating|timestamp          |
+------+-------+------+-------------------+
|1     |2      |3.5   |2005-04-02 23:53:47|
|1     |29     |3.5   |2005-04-02 23:31:16|
|1     |32     |3.5   |2005-04-02 23:33:39|
|1     |47     |3.5   |2005-04-02 23:32:07|
|1     |50     |3.5   |2005-04-02 23:29:40|
|1     |112    |3.5   |2004-09-10 03:09:00|
|1     |151    |4.0   |2004-09-10 03:08:54|
|1     |223    |4.0   |2005-04-02 23:46:13|
|1     |253    |4.0   |2005-04-02 23:35:40|
|1     |260    |4.0   |2005-04-02 23:33:46|
+------+-------+------+-------------------+
only showing top 10 rows



In [6]:
// Add a `year` column holding the calendar year of each rating's timestamp;
// it is used later as the partition key when the data is written out.
// NOTE(review): the year is presumably extracted in the Spark session time zone — confirm.
val ratingsWithYearDF =
  ratingsDF.withColumn("year", year(ratingsDF("timestamp")))


ratingsWithYearDF = [userId: int, movieId: int ... 3 more fields]


[userId: int, movieId: int ... 3 more fields]

In [7]:
// Preview the first 10 rows, now including the derived `year` column, untruncated.
ratingsWithYearDF.show(numRows = 10, truncate = false)

+------+-------+------+-------------------+----+
|userId|movieId|rating|timestamp          |year|
+------+-------+------+-------------------+----+
|1     |2      |3.5   |2005-04-02 23:53:47|2005|
|1     |29     |3.5   |2005-04-02 23:31:16|2005|
|1     |32     |3.5   |2005-04-02 23:33:39|2005|
|1     |47     |3.5   |2005-04-02 23:32:07|2005|
|1     |50     |3.5   |2005-04-02 23:29:40|2005|
|1     |112    |3.5   |2004-09-10 03:09:00|2004|
|1     |151    |4.0   |2004-09-10 03:08:54|2004|
|1     |223    |4.0   |2005-04-02 23:46:13|2005|
|1     |253    |4.0   |2005-04-02 23:35:40|2005|
|1     |260    |4.0   |2005-04-02 23:33:46|2005|
+------+-------+------+-------------------+----+
only showing top 10 rows



In [8]:
// HDFS directory that receives the year-partitioned ratings.
val OutputPath = "hdfs:///user/day_16_17/case_study_5/"

// Write the ratings as Parquet with one `year=<value>` sub-directory per
// distinct year. "overwrite" mode replaces whatever already exists at OutputPath.
ratingsWithYearDF.write
  .mode("overwrite")
  .partitionBy("year")
  .parquet(OutputPath)

OutputPath = hdfs:///user/day_16_17/case_study_5/


hdfs:///user/day_16_17/case_study_5/

In [10]:
// Directory whose immediate children (partition folders, _SUCCESS marker) are listed.
val hdfsPath = "hdfs:///user/day_16_17/case_study_5/"
val outputDir = new org.apache.hadoop.fs.Path(hdfsPath)

// Resolve the FileSystem from the path's own scheme rather than from the
// cluster default: FileSystem.get(conf) returns the *default* filesystem and
// fails with "Wrong FS" for an hdfs:// path when fs.defaultFS is something else.
val fs = outputDir.getFileSystem(spark.sparkContext.hadoopConfiguration)
val status = fs.listStatus(outputDir)

// Print the full path of every entry directly under the output directory.
status.foreach { fileStatus =>
  println(fileStatus.getPath)
}

/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/_SUCCESS
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=1995
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=1996
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=1997
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=1998
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=1999
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=2000
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=2001
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=2002
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=2003
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=2004
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=2005
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=2006
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=2007
/gateway/default/webhdfs/v1/user/day_16_17/case_study_5/year=2008
/gateway/de

lastException = null
hdfsPath = hdfs:///user/day_16_17/case_study_5/


fs: org.apache.hadoop.fs.FileSystem = DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_1119845070_45, ugi=root (auth:SIMPLE)]]
status: Array[org.apache.hadoop.fs.FileStatus] = Array(HdfsNamedFileStatus{path=hdfs://cluster-c1c6-m/user/day_16_17/case_study_5/_SUCCESS; isDirectory=false; length=0; replication=2; blocksize=134217728; modification_time=1732860177657; access_time=1732860177654; owner=root; group=hadoop; permission=rw-r--r--; isSymlink=false; hasAcl=false; isEncrypted=false; isErasureCoded=false}, HdfsLocatedFileStatus{path=hdfs://cluster-c1c6-m/user/day_16_17/case_study_5/year=1995; isDirectory=true; modification_time=1732860174240; access_time=0; owner=root; group=hadoop; permission=rwxr-xr-x; isSymlink=false; hasAcl=fal...


hdfs:///user/day_16_17/case_study_5/

In [11]:
// Shut down the Spark session now that the notebook's work is finished.
spark.stop()