In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Create (or reuse) the Spark session for this notebook, running on YARN.
val spark = SparkSession.builder
  .appName("Movie release dates metadata")
  .master("yarn")
  .getOrCreate()

// SparkContext backing the session.
val sc = spark.sparkContext

// NOTE: despite its name, this path points at the movies CSV (movie.csv),
// which is what gets loaded into moviesDF below. The name is kept so that
// any other cell referring to it keeps working.
val ratingsFilePath = "gs://srinija/archive/movie.csv"

// Load the movies with an explicit schema. Declaring the three columns up
// front avoids the extra pass over the file that inferSchema performs just to
// guess the types, and pins the column types regardless of file contents.
val moviesDF = spark.read
  .format("csv")
  .option("header", "true")
  .schema("movieId INT, title STRING, genres STRING")
  .load(ratingsFilePath)

// Print the resulting schema as a sanity check.
moviesDF.printSchema()



root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



spark = org.apache.spark.sql.SparkSession@75fc24ee
sc = org.apache.spark.SparkContext@6a4e3905
ratingsFilePath = gs://srinija/archive/movie.csv
moviesDF = [movieId: int, title: string ... 1 more field]


[movieId: int, title: string ... 1 more field]

In [2]:
// Preview the first 20 rows without truncating long cell values.
moviesDF.show(20, truncate = false)

+-------+-------------------------------------+-------------------------------------------+
|movieId|title                                |genres                                     |
+-------+-------------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                     |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                       |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)              |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)             |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)   |Comedy                                     |
|6      |Heat (1995)                          |Action|Crime|Thriller                      |
|7      |Sabrina (1995)                       |Comedy|Romance                             |
|8      |Tom and Huck (1995)                  |Adventure|Children               

In [3]:
// Build the (movieId, releaseYear) metadata table from the movie titles.
val metadataDF = {
  // Four digits wrapped in parentheses inside the title, e.g. "Heat (1995)",
  // cast to an integer.
  val parsedYear = regexp_extract(col("title"), "\\((\\d{4})\\)", 1).cast("int")

  // Rows where the parsed year is null fall back to 2002.
  // NOTE(review): 2002 looks like a placeholder year -- confirm it is the
  // intended default, since it makes imputed rows indistinguishable from
  // genuine 2002 releases.
  val yearOrDefault = when(parsedYear.isNotNull, parsedYear).otherwise(lit(2002))

  moviesDF.select(col("movieId"), yearOrDefault.as("releaseYear"))
}

metadataDF = [movieId: int, releaseYear: int]


[movieId: int, releaseYear: int]

In [5]:
// Persist the metadata table as JSON at the GCS location below, replacing any
// output already present there.
val metadataOutputPath = "gs://srinija/outputs/metadata.json"

metadataDF.write.mode("overwrite").json(metadataOutputPath)

println(s"Metadata saved to: $metadataOutputPath")


Metadata saved to: gs://srinija/outputs/metadata.json


metadataOutputPath = gs://srinija/outputs/metadata.json


gs://srinija/outputs/metadata.json

In [6]:
// Attach releaseYear to every movie row: left join on movieId so that all
// rows of moviesDF are kept.
val enrichedMoviesDF = {
  val joinKeys = Seq("movieId")
  moviesDF.join(metadataDF, joinKeys, "left")
}

enrichedMoviesDF = [movieId: int, title: string ... 2 more fields]


[movieId: int, title: string ... 2 more fields]

In [7]:
// Count the enriched rows whose releaseYear is null and report the total.
val missingReleaseYearCount = enrichedMoviesDF
  .where(isnull(col("releaseYear")))
  .count()
println(s"Number of movies without releaseYear: $missingReleaseYearCount")

Number of movies without releaseYear: 0


missingReleaseYearCount = 0


0

In [9]:
// Store the enriched movies as Parquet under the HDFS directory below,
// replacing any output already present there.
val enrichedOutputPath = "hdfs:///user/day_16_17/case_study_3/"

enrichedMoviesDF.write.mode("overwrite").parquet(enrichedOutputPath)

println(s"Enriched movies saved to: $enrichedOutputPath")

Enriched movies saved to: hdfs:///user/day_16_17/case_study_3/


enrichedOutputPath = hdfs:///user/day_16_17/case_study_3/


hdfs:///user/day_16_17/case_study_3/

In [10]:
// All outputs have been written; shut the Spark session down.
spark.stop()